open Core
open Bistro_utils
open Bistro.EDSL
open Bistro.Std
open File_formats

(** [parse_input_tree ~tree] is the workflow that runs
    [parse_input_tree.py] with [tree] as its [-t] argument and the
    workflow destination as its [-o] argument.

    NOTE(review): the script path is relative; confirm which directory it
    is resolved against when the command is executed. *)
let parse_input_tree ~tree : parsed_input_tree directory workflow =
  (* A docker environment was used here at some point and is currently
     disabled:
     let env = docker_image ~account:"carinerey" ~name:"ete3:3.0.0b35" () in
     and [cmd] was given [~env]. *)
  workflow ~descr:"utils.parse_input_tree" [
    cmd "python" [
      string "../etc/utils/bin/parse_input_tree.py";
      opt "-t" dep tree;
      opt "-o" ident dest;
    ]
  ]

(** [select_out parsed_tree t] selects, inside the [parsed_tree] directory,
    the file associated with the tag [t]. *)
let select_out parsed_tree t =
  let file_name =
    match t with
    | Nodes_H0 -> "tree.H0.node_ids"
    | Nodes_Ha -> "tree.Ha.node_ids"
    | Tree4detect -> "tree.only_convergent_tags.nhx"
    | Tree4simu -> "tree.only_node_ids.nhx"
    | Tree_diffsel -> "tree.diffsel"
  in
  parsed_tree / selector [ file_name ]

(** [ready_dataset_of_raw_dataset raw_dataset] builds a ready dataset from
    a raw one: the input tree is parsed with {!parse_input_tree} and the
    nucleotide sequences are passed to [Bppsuite.fna2faa]. *)
let ready_dataset_of_raw_dataset raw_dataset =
  let { input_tree = tree; fna } = raw_dataset in
  let parsed_tree = parse_input_tree ~tree in
  let faa = Bppsuite.fna2faa ~fna in
  { parsed_tree; fna; faa }

(** [repo_of_ready_dataset ready_dataset] lists the repo items exported
    under [ready_dataset/]: every file known to {!select_out} plus the
    [fna] and [faa] sequence files. *)
let repo_of_ready_dataset ready_dataset =
  (* Go through [select_out] so the file names are defined in one place. *)
  let pick tag = select_out ready_dataset.parsed_tree tag in
  Repo.[
    item ["ready_dataset/tree.H0.node_ids" ] (pick Nodes_H0) ;
    item ["ready_dataset/tree.Ha.node_ids" ] (pick Nodes_Ha) ;
    item ["ready_dataset/tree.only_convergent_tags.nhx" ] (pick Tree4detect) ;
    item ["ready_dataset/tree.only_node_ids.nhx" ] (pick Tree4simu) ;
    item ["ready_dataset/tree.diffsel" ] (pick Tree_diffsel) ;
    item ["ready_dataset/simulated_sequences.fna"] ready_dataset.fna ;
    item ["ready_dataset/simulated_sequences.faa"] ready_dataset.faa ;
  ]

(** [repo_of_raw_dataset raw_dataset] lists the repo items exported under
    [raw_dataset/]: the input tree and the [fna] sequence file. *)
let repo_of_raw_dataset raw_dataset =
  Repo.[
    item ["raw_dataset/input_tree.nhx"] raw_dataset.input_tree ;
    item ["raw_dataset/simulated_sequences.fna"] raw_dataset.fna ;
  ]

(** [derive_from_tree ~tree_dir ~tree ~preview] builds the repo for the
    tree file named [tree] inside [tree_dir].

    Sequences are generated with [Bppsuite.bppseqgen] on the [Tree4simu]
    output of the parsed tree, with 10 sites when [preview] is set and 100
    otherwise. The ready-dataset repo is returned when [preview] is set,
    the raw-dataset repo otherwise.

    NOTE(review): outside preview mode only the raw dataset is exported;
    confirm this is the intended behaviour. *)
let derive_from_tree ~tree_dir ~tree ~preview =
  let tree = input (Filename.concat tree_dir tree) in
  let nb_sites = if preview then 10 else 100 in
  let parsed_tree = parse_input_tree ~tree in
  let fna =
    Bppsuite.bppseqgen ~nb_sites ~tree:(select_out parsed_tree Tree4simu)
  in
  if preview then
    let faa = Bppsuite.fna2faa ~fna in
    repo_of_ready_dataset { parsed_tree; fna; faa }
  else
    repo_of_raw_dataset { input_tree = tree; fna }

(** [derive ~tree_dir ~trees ~preview] concatenates the repos of every
    file name in [trees], each one shifted under an id derived from the
    file name.

    The id is the file name without its extension. A name that has no
    extension is used unchanged as the id, instead of letting
    [Filename.chop_extension] raise [Invalid_argument]. *)
let derive ~tree_dir ~trees ~preview =
  let id_of_file file =
    match Filename.chop_extension file with
    | id -> id
    | exception Invalid_argument _ -> file
  in
  List.concat_map trees ~f:(fun tree ->
      Repo.shift (id_of_file tree) (derive_from_tree ~tree_dir ~tree ~preview))

(** [main ~outdir ?np ?mem ~tree_dir ~preview ()] derives a repo from every
    entry of [tree_dir] and builds it into [outdir].

    @param np value passed to [Repo.build] as [~np] (default 2).
    @param mem value passed to [Repo.build] as [`GB mem] (default 2). *)
let main ~outdir ?(np = 2) ?(mem = 2) ~tree_dir ~preview () =
  let trees = Sys.readdir tree_dir |> Array.to_list in
  let repo = derive ~tree_dir ~trees ~preview in
  Repo.build ~outdir ~np ~mem:(`GB mem) repo

(** Command-line interface: parses the flags and hands them to {!main}. *)
let command =
  let open Command.Let_syntax in
  Command.basic
    ~summary:"Run simulation pipeline"
    [%map_open
      let outdir =
        flag "--outdir" (required string) ~doc:"PATH Output directory"
      and preview =
        flag "--preview-mode" no_arg ~doc:" Preview mode"
      and np =
        flag "--np" (optional int) ~doc:"INT Number of available processors"
      and mem =
        flag "--mem" (optional int) ~doc:"INT Available memory (in GB)"
      and tree_dir =
        flag "--tree-dir" (required string) ~doc:"PATH Path to tree directory"
      in
      (* The trailing [()] of [main] is left unapplied so that the result
         is the [unit -> unit] body expected by [Command.basic]. *)
      main ~outdir ?np ?mem ~tree_dir ~preview
    ]