pipeline.ml 21 KB
Newer Older
Philippe Veber's avatar
Philippe Veber committed
1
open Core
Philippe Veber's avatar
Philippe Veber committed
2
open Bistro_utils
Philippe Veber's avatar
Philippe Veber committed
3
open Bistro
Carine Rey's avatar
Carine Rey committed
4
open Convergence_detection
5
open Convergence_hypothesis
6
open Profile
7

Carine Rey's avatar
Carine Rey committed
8
let parse_input_data ~seed ~calc_dnds indir =
Carine Rey's avatar
Carine Rey committed
9 10 11 12
let error_message = {|
I need a file "tree.nhx" containing the annotated tree and
a directory "Alignments" containing fasta alignments (in nt)
with the format "gene1.fna", "gene2.fna",... |} in
13 14
  let datasets = Array.to_list @@ Sys.readdir indir in
  List.map datasets ~f:(fun dataset_prefix ->
Carine Rey's avatar
Carine Rey committed
15
      printf "Real dataset:\n\tTree: %s\n" dataset_prefix;
LANORE Vincent's avatar
LANORE Vincent committed
16 17 18 19 20 21 22 23 24 25 26 27 28 29
      let files = Array.to_list @@ Sys.readdir (Filename.concat indir dataset_prefix ) in
      if List.length files = 2 then
        let h_file = List.hd files in
        let h_file = match h_file with
          | Some s -> s
          | None -> ""
        in
        let h_file_ext = Filename.split_extension h_file in
        let t_file = List.nth files 1 in
        let t_file = match t_file with
          | Some s -> s
          | None -> ""
        in
        let t_file_ext = Filename.split_extension t_file in
Carine Rey's avatar
Carine Rey committed
30 31 32 33 34 35 36
        let fna_dir, input_tree = match (h_file_ext, t_file_ext, h_file, t_file) with
          | _ ,  ( _ , Some "nhx"), "Alignments" , _ -> h_file, t_file
          | ( _ , Some "nhx"), _, _, "Alignments" -> t_file, h_file
          | _, _, _, "Alignments" -> failwith ({|Syntax error: Naming errors in |})
          | _,_,_,_ -> failwith ({|Syntax error: Naming errors in |} ^ (Filename.concat indir dataset_prefix ) ^ "
           1st file: " ^ (h_file) ^ "
           2nd file: " ^ (t_file) ^ error_message)
LANORE Vincent's avatar
LANORE Vincent committed
37 38
        in
        let tree_prefix = Filename.chop_extension input_tree in
Philippe Veber's avatar
Philippe Veber committed
39
        let input_tree = Workflow.input (Filename.concat indir (Filename.concat dataset_prefix input_tree)) in
Carine Rey's avatar
Carine Rey committed
40
        let fna_l = Array.to_list @@ Sys.readdir (Filename.concat indir (dataset_prefix ^ "/" ^ fna_dir)) in
41
        printf "        %i files detected in %s\n" (List.length fna_l) fna_dir;
Carine Rey's avatar
Carine Rey committed
42 43 44
        List.map fna_l ~f:(function fna ->
            let fna_prefix = Filename.chop_extension fna in
            let fna = Workflow.input (Filename.concat indir (Filename.concat (dataset_prefix ^ "/" ^ fna_dir) fna)) in
45
            let filtered_input_tree = Raw_dataset.filter_input_tree ~descr:fna_prefix ~tree:input_tree ~fna () in
Carine Rey's avatar
Carine Rey committed
46
            let fna_infos = None in
47
            let raw_dataset = Raw_dataset.{input_tree=filtered_input_tree; fna; fna_infos} in
Carine Rey's avatar
Carine Rey committed
48 49
            let dataset = {Dataset.model_prefix = fna_prefix;
                           is_real = true;
Carine Rey's avatar
Carine Rey committed
50
                           calc_dnds;
Carine Rey's avatar
Carine Rey committed
51 52 53 54 55 56
                           tree_prefix = tree_prefix;
                           dataset = Ready_dataset.of_raw ~descr:("real_data." ^ tree_prefix) raw_dataset;
                           seed;
                          } in
            dataset
        )
LANORE Vincent's avatar
LANORE Vincent committed
57
      else
Carine Rey's avatar
Carine Rey committed
58
        failwith ({|More than 2 files in |} ^ (Filename.concat indir dataset_prefix) ^ error_message)
59 60 61
    )
  |> List.concat

Carine Rey's avatar
Carine Rey committed
62
let calc_fixed_seed ~(str:string) (seed:int) : int =
63 64
  let str_hash = Hashtbl.hash str in
  Hashtbl.hash (str_hash + seed)
Carine Rey's avatar
Carine Rey committed
65

Philippe Veber's avatar
Philippe Veber committed
66
let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~seed =
67 68
  let model_prefix = Convergence_hypothesis.string_of_model model in
  let descr = "."^model_prefix^"."^tree_prefix in
Carine Rey's avatar
Carine Rey committed
69
  (* with several profiles or couples of profiles *)
70 71
  let profile_f = profile.profile_f in
  let profile_c = profile.profile_c in
Carine Rey's avatar
Carine Rey committed
72
  let seed = calc_fixed_seed ~str:descr seed in
73
  let run_fna = Bppsuite.Bppseqgen.multi_profiles ~descr ~input_tree ~hypothesis:model ~profile_f ~profile_c ~seed in
Philippe Veber's avatar
Philippe Veber committed
74 75
  let fna = Bppsuite.Bppseqgen.alignment run_fna in
  let fna_infos = Some (Bppsuite.Bppseqgen.info run_fna) in
76
  let faa = Bppsuite.fna2faa fna in
77
  let ready_dataset = { Ready_dataset.input_tree = input_tree ; tree_dataset ; fna; faa; fna_infos } in
Carine Rey's avatar
Carine Rey committed
78
  { Dataset.model_prefix; is_real= false; calc_dnds = false; tree_prefix; dataset = ready_dataset; seed }
Carine Rey's avatar
Carine Rey committed
79

Philippe Veber's avatar
Philippe Veber committed
80
let derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~add_indels ~seed =
Carine Rey's avatar
Carine Rey committed
81
  let tree_prefix = Filename.chop_extension tree in
Philippe Veber's avatar
Philippe Veber committed
82
  let input_tree = Workflow.input (Filename.concat tree_dir tree) in
83
  let tree_dataset = Tree_dataset.prepare ~descr:("simulated_data." ^ tree_prefix) input_tree in
Philippe Veber's avatar
Philippe Veber committed
84 85 86
  let dataset_H0_NeG5   = derive_from_model ~model:(H0 (Fixed 5.))   ~input_tree ~tree_dataset ~tree_prefix ~profile ~seed in
  let dataset_HaPCOC    = derive_from_model ~model:(HaPCOC (Fixed 1.))    ~input_tree ~tree_dataset ~tree_prefix ~profile ~seed in
  let dataset_HaPC_NeG5 = derive_from_model ~model:(HaPC (Fixed 5.)) ~input_tree ~tree_dataset ~tree_prefix ~profile ~seed in
87 88
  let indel_H0_NeG5 = Dataset.add_indels_to_dataset dataset_H0_NeG5 ~seed:(Random.int Int.max_value) in
  let indel_HaPC_NeG5 = Dataset.add_indels_to_dataset dataset_HaPC_NeG5 ~seed:(Random.int Int.max_value) in
89
  (* let dataset_basis_hyps = [dataset_H0_NeG5; dataset_HaPCOC; dataset_HaPC_NeG5] in *)
LANORE Vincent's avatar
LANORE Vincent committed
90
  let nes_list =
91
    [Fixed 4.] @ if preview then [(* Variable (8., 1.) *)] else
Carine Rey's avatar
Carine Rey committed
92
      [Fixed 1.; Fixed 8.; (* 4->8, 8->4, 8, 16, 1->8 and 16->1 are new ; it means 6*2*2 tasks on cyp/C4A *)
93
       Variable (1., 4.); Variable (4., 1.);
Carine Rey's avatar
Carine Rey committed
94
       Variable (4., 8.);
95
      ]
Carine Rey's avatar
Carine Rey committed
96
  in
LANORE Vincent's avatar
LANORE Vincent committed
97
  let models = h0_hapc_forall nes_list in
Carine Rey's avatar
Carine Rey committed
98
  let dataset_per_hypo = List.map models ~f:(fun model ->
Philippe Veber's avatar
Philippe Veber committed
99
      derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~seed
LANORE Vincent's avatar
LANORE Vincent committed
100
    ) in
Carine Rey's avatar
Carine Rey committed
101

Carine Rey's avatar
Carine Rey committed
102 103
  let _concat_H0HaPCOC = {Dataset.model_prefix="H0_NeG5+HaPCOC"; tree_prefix; is_real = false; calc_dnds = false; dataset = Ready_dataset.paste dataset_H0_NeG5.dataset dataset_HaPCOC.dataset; seed=(dataset_H0_NeG5.seed + dataset_HaPCOC.seed |> Hashtbl.hash) } in
  let concat_H0HaPC = {Dataset.model_prefix="H0_NeG5+HaPC_NeG5"; tree_prefix; is_real = false; calc_dnds = false; dataset = Ready_dataset.paste dataset_H0_NeG5.dataset dataset_HaPC_NeG5.dataset; seed=(dataset_H0_NeG5.seed + dataset_HaPC_NeG5.seed |> Hashtbl.hash) } in
104
  let dataset_concat_hypos = if use_concat then [concat_H0HaPC;] else [] in
105
  let dataset_with_indels = if add_indels then [indel_H0_NeG5 ; indel_HaPC_NeG5] else [] in
Carine Rey's avatar
Carine Rey committed
106
  List.concat [ (*dataset_basis_hyps;*) dataset_per_hypo ; dataset_concat_hypos; dataset_with_indels ]
107

Philippe Veber's avatar
Philippe Veber committed
108
let derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~add_indels ~seed =
LANORE Vincent's avatar
LANORE Vincent committed
109
  List.map trees ~f:(fun tree ->
Philippe Veber's avatar
Philippe Veber committed
110
      derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~add_indels ~seed)
LANORE Vincent's avatar
LANORE Vincent committed
111 112
  |> List.concat

Carine Rey's avatar
Carine Rey committed
113

Philippe Veber's avatar
Philippe Veber committed
114 115 116
let repo_of_detection_result res =
  let det_meth_prefix = Convergence_detection.meth_string_of_result res in
  Repo.[
117
    [ match res with
Philippe Veber's avatar
Philippe Veber committed
118 119
      | `Pcoc w -> item ["pcoc.results.tsv"] (Pcoc.results w)
      | `Pcoc_gamma w -> item ["pcoc_gamma.results.tsv"] (Pcoc.results w)
Carine Rey's avatar
Carine Rey committed
120
      | `Pcoc_C60 w -> item ["pcoc_C60.results.tsv"] (Pcoc.results w)
Philippe Veber's avatar
Philippe Veber committed
121
      | `Diffsel w -> item ["diffsel.results.tsv"] (Diffsel.selector w)
Carine Rey's avatar
Carine Rey committed
122 123
      | `Identical_LG w -> item ["Identical_LG.results.tsv"] (Identical.results w)
      | `Identical_WAG w -> item ["Identical_WAG.results.tsv"] (Identical.results w)
124 125
      | `Topological_LG w -> item ["Topological_LG.results.tsv"] (Topological.results w)
      | `Topological_WAG w -> item ["Topological_WAG.results.tsv"] (Topological.results w)
Philippe Veber's avatar
Philippe Veber committed
126
      | `Tdg09 w -> item ["Tdg09.results.tsv"] (Tdg09.results w)
127
      | `Multinomial w -> item ["Multinomial.results.tsv"] w
128
      | `Msd (w, e) -> item [sprintf "Msd.%f.results.tsv" e] (Msd.results w)
129 130
    ] ;
    [
Philippe Veber's avatar
Philippe Veber committed
131 132 133
      match res with
      | `Pcoc w -> item ["raw_results"]  w
      | `Pcoc_gamma w -> item ["raw_results"] w
Carine Rey's avatar
Carine Rey committed
134
      | `Pcoc_C60 w -> item ["raw_results"] w
Philippe Veber's avatar
Philippe Veber committed
135
      | `Diffsel w -> item ["raw_results"] w
Carine Rey's avatar
Carine Rey committed
136 137
      | `Identical_LG w -> item ["raw_results"] w
      | `Identical_WAG w -> item ["raw_results"] w
138 139 140
      | `Topological_LG w -> item ["raw_results"] w
      | `Topological_WAG w -> item ["raw_results"] w
      | `Tdg09 w -> item ["raw_results"] w
141
      | `Multinomial w -> item ["raw_results"] w
142
      | `Msd (w, _) -> item ["raw_results"] w
143
    ] ;
144
    match res with
Philippe Veber's avatar
Philippe Veber committed
145
    | `Diffsel w -> [item ["chain_convergence_checking.html"] (Workflow.select (Diffsel.check_conv w) ["out.html"])]
146
    | _ -> []
147
  ] |> List.concat
Philippe Veber's avatar
Philippe Veber committed
148 149
  |> Repo.shift det_meth_prefix

150
let repo_of_dataset_results_l ~dataset_results_l =
Carine Rey's avatar
Carine Rey committed
151
  List.map dataset_results_l ~f:(fun dataset_results ->
LANORE Vincent's avatar
LANORE Vincent committed
152 153 154
      let det_results_l = dataset_results.res_by_tools in
      let merged_results = dataset_results.merged_results in
      let plot_merge_results = dataset_results.plot_merged_results in
155 156
      let model_prefix = dataset_results.model_prefix in
      let tree_prefix = dataset_results.tree_prefix in
Carine Rey's avatar
Carine Rey committed
157
      let merged_results_item = Repo.item [tree_prefix ^"."^model_prefix^".merged_results.tsv"] merged_results in
158
      let plot_merged_results_item = Repo.item [tree_prefix ^"."^model_prefix^".plot_merged_results.svg"] plot_merge_results in
LANORE Vincent's avatar
LANORE Vincent committed
159 160 161
      let repo =
        merged_results_item ::
        plot_merged_results_item ::
Carine Rey's avatar
Carine Rey committed
162
        (List.map det_results_l ~f:repo_of_detection_result |> List.concat |> Repo.shift "Detection_tools" )
LANORE Vincent's avatar
LANORE Vincent committed
163 164 165 166
      in
      repo
      |> Repo.shift dataset_results.model_prefix
    )
Carine Rey's avatar
Carine Rey committed
167 168
  |> List.concat

Carine Rey's avatar
rm plot  
Carine Rey committed
169
let repo_of_real_dataset_results_l ~dataset_results_l ?(plot = false) () =
Carine Rey's avatar
Carine Rey committed
170 171 172 173 174 175 176
  List.map dataset_results_l ~f:(fun dataset_results ->
      let det_results_l = dataset_results.res_by_tools in
      let merged_results = dataset_results.merged_results in
      let plot_merge_results = dataset_results.plot_merged_results in
      let model_prefix = dataset_results.model_prefix in
      let tree_prefix = dataset_results.tree_prefix in
      let merged_results_item = Repo.item [tree_prefix ^"."^model_prefix^".merged_results.tsv"] merged_results in
Carine Rey's avatar
rm plot  
Carine Rey committed
177 178 179 180 181 182
      let plot_merged_results_item =
        if plot then
          [Repo.item [tree_prefix ^"."^model_prefix^".plot_merged_results.svg"] plot_merge_results]
        else
          []
        in
Carine Rey's avatar
Carine Rey committed
183
      List.concat [
Carine Rey's avatar
rm plot  
Carine Rey committed
184 185
       List.concat [
       [merged_results_item] ;
Carine Rey's avatar
Carine Rey committed
186 187 188 189 190 191 192 193 194 195 196
        plot_merged_results_item ;
       ]|> Repo.shift "Merged_Results" ;
       List.map det_results_l ~f:repo_of_detection_result
        |> List.concat
        |> Repo.shift model_prefix
        |> Repo.shift "Results_per_Detection_tool" ;
      ]
      |> Repo.shift dataset_results.tree_prefix
    )
  |> List.concat

197
let derive_from_det_meth ~det_meth ~(dataset : Dataset.t) ~preview =
Carine Rey's avatar
Carine Rey committed
198 199 200
  let model_prefix = dataset.model_prefix in
  let tree_prefix = dataset.tree_prefix in
  let descr = model_prefix ^"." ^ tree_prefix in
201 202
  let faa = dataset.dataset.faa in
  let fna = dataset.dataset.fna in
Carine Rey's avatar
Carine Rey committed
203
  let phy_n = Bppsuite.fna2phy ~fna in
204 205 206 207
  let tree_sc = Tree_dataset.prepare_sc_tree dataset.dataset.input_tree in
  let tree_id = Tree_dataset.prepare_tree_with_node_id dataset.dataset.input_tree in
  let diffsel_tree = Tree_dataset.prepare_diffsel_tree ~descr:tree_prefix dataset.dataset.input_tree in
  let tree_conv = Tree_dataset.prepare_topological_tree ~descr:tree_prefix dataset.dataset.input_tree in
208
  let w_every = if preview then 1 else 1 in
Carine Rey's avatar
Carine Rey committed
209
  let n_cycles = if preview then 2000 else 2000 in
210
  let seed = Hashtbl.hash dataset.seed in
Philippe Veber's avatar
Philippe Veber committed
211
  match det_meth with
Carine Rey's avatar
Carine Rey committed
212 213 214
  | `Pcoc -> `Pcoc (Pcoc.pcoc ~descr ~catx_est:10 ~max_gap_per_conv_leaf:25 ~max_gap_per_pos:25 ~plot_complete:false ~gamma:false ~faa ~tree:tree_sc ())
  | `Pcoc_gamma -> `Pcoc_gamma (Pcoc.pcoc ~descr ~catx_est:10 ~max_gap_per_conv_leaf:25 ~max_gap_per_pos:25 ~plot_complete: false ~gamma:true ~faa ~tree:tree_sc ())
  | `Pcoc_C60 -> `Pcoc_C60 (Pcoc.pcoc ~descr ~catx_est:60 ~plot_complete: false ~gamma:false ~faa ~tree:tree_sc ())
Philippe Veber's avatar
Philippe Veber committed
215
  | `Tdg09 -> `Tdg09 (Tdg09.tdg09 ~descr ~faa ~tree:tree_sc ())
216 217 218 219 220
  | `Diffsel -> `Diffsel (Diffsel.diffsel ~descr ~phy_n ~tree:diffsel_tree ~w_every ~n_cycles ~seed ())
  | `Identical_LG -> `Identical_LG (Identical.identical ~descr ~faa ~tree_id ~tree_sc ~prot_model:"LG08" ())
  | `Identical_WAG -> `Identical_WAG (Identical.identical ~descr ~faa ~tree_id ~tree_sc ~prot_model:"WAG01" ())
  | `Topological_LG -> `Topological_LG (Topological.topological ~descr ~faa ~tree:tree_id ~tree_conv ~prot_model:"LG08" ())
  | `Topological_WAG -> `Topological_WAG (Topological.topological ~descr ~faa ~tree:tree_id ~tree_conv ~prot_model:"WAG01" ())
Carine Rey's avatar
Carine Rey committed
221
  | `Msd e -> `Msd (Msd.msd ~descr ~e ~faa ~tree_sc, e)
Carine Rey's avatar
Carine Rey committed
222 223


224
let derive_from_dataset ~dataset ~preview ~use_diffsel ~use_c60=
Carine Rey's avatar
Carine Rey committed
225
  let det_meths = [
Carine Rey's avatar
Carine Rey committed
226
    ([
Carine Rey's avatar
Carine Rey committed
227
      `Identical_LG;
Carine Rey's avatar
Carine Rey committed
228
      (*`Topological_LG;
Carine Rey's avatar
Carine Rey committed
229
      `Multinomial;
Carine Rey's avatar
Carine Rey committed
230
      `Pcoc;*)
Carine Rey's avatar
Carine Rey committed
231
      `Pcoc_gamma;
Carine Rey's avatar
Carine Rey committed
232
    ] (*@ List.map [0.05] ~f:(fun x -> `Msd x)) *)) ;
233
    (* if preview then *)
LANORE Vincent's avatar
LANORE Vincent committed
234
    [`Tdg09;]
235
    (* else
LANORE Vincent's avatar
LANORE Vincent committed
236
       [ `Tdg09;
237 238 239
        `Pcoc_gamma;
        `Identical_WAG;
        `Topological_WAG;
LANORE Vincent's avatar
LANORE Vincent committed
240
       ] *)
LANORE Vincent's avatar
LANORE Vincent committed
241
    ;
242 243 244 245 246 247
    if use_diffsel then
      [`Diffsel]
    else
      []  ;
    if use_c60 then
      [`Pcoc_C60]
LANORE Vincent's avatar
LANORE Vincent committed
248
    else
249
      []  ;
LANORE Vincent's avatar
LANORE Vincent committed
250 251
  ]
    |> List.concat in
252
  let res_by_tools = List.map det_meths ~f:(fun det_meth ->
LANORE Vincent's avatar
LANORE Vincent committed
253
      derive_from_det_meth ~det_meth ~dataset ~preview
254
    ) in
Carine Rey's avatar
Carine Rey committed
255
  let fna_infos = dataset.dataset.fna_infos in
256
  let merged_results = merge_results ?fna_infos ~res_by_tools () in
Carine Rey's avatar
Carine Rey committed
257
  let tsv = merged_results in
258
  let faa = dataset.dataset.faa in
259
  let tree = Tree_dataset.prepare_sc_tree dataset.dataset.input_tree in
Carine Rey's avatar
Carine Rey committed
260
  let plot_all_sites = if dataset.is_real then false else true in
261
  let plot_merged_results = plot_merge_results ~plot_all_sites ~res_by_tools ~tsv ~faa ~tree () in
Carine Rey's avatar
Carine Rey committed
262 263
  let model_prefix = dataset.model_prefix in
  let tree_prefix = dataset.tree_prefix in
Carine Rey's avatar
Carine Rey committed
264
  {model_prefix; tree_prefix; dataset; res_by_tools ; merged_results ; plot_merged_results}
Carine Rey's avatar
Carine Rey committed
265

266
let derive_det ~dataset_l ~preview ~use_diffsel ~use_c60 =
Carine Rey's avatar
Carine Rey committed
267
  List.map dataset_l ~f:(fun dataset ->
268
      derive_from_dataset ~preview ~dataset ~use_diffsel ~use_c60)
Carine Rey's avatar
Carine Rey committed
269

Philippe Veber's avatar
Philippe Veber committed
270
let derive_profile ~preview ~use_diffsel ~use_c60 ~tree_dir ~profile ~use_concat ~add_indels ~only_simu ~seed () =
LANORE Vincent's avatar
LANORE Vincent committed
271
  let trees = Array.to_list @@ Sys.readdir tree_dir in
Carine Rey's avatar
Carine Rey committed
272
  let repo_and_post_analyses_per_tree_simu = List.map trees ~f:(fun tree -> (*to keep together all models per tree*)
273 274 275
      let trees = [tree] in
      let tree_prefix = Filename.chop_extension tree in
      let dataset_l =
Philippe Veber's avatar
Philippe Veber committed
276
        derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~add_indels ~seed in
Carine Rey's avatar
Carine Rey committed
277
      let dataset_results_l =
278
        if only_simu then
Carine Rey's avatar
Carine Rey committed
279 280
          []
        else
281
          derive_det ~dataset_l ~preview ~use_diffsel ~use_c60
Carine Rey's avatar
Carine Rey committed
282
      in
283
      let post_analyses_res = Post_analyses.post_analyses_res_of_dataset_results_l ~tree_prefix ~dataset_results_l in
284
      let repo_per_tree = [
Philippe Veber's avatar
Philippe Veber committed
285
        Dataset.repo dataset_l ;
Carine Rey's avatar
Carine Rey committed
286 287 288 289 290 291
        Repo.shift "Results_per_hypothesis" (repo_of_dataset_results_l ~dataset_results_l);
        Post_analyses.repo_of_post_analyses_res ~prefix:tree_prefix ~post_analyses_res;
      ]
        |> List.concat
        |> Repo.shift tree_prefix
        |> Repo.shift "Results_per_tree"
292
      in
293
      (repo_per_tree, post_analyses_res, dataset_results_l, dataset_l)
Carine Rey's avatar
Carine Rey committed
294
    )
295
  in
296
  let all_repo_per_tree_simu = List.concat_map repo_and_post_analyses_per_tree_simu ~f:(fun (r, _, _, _) -> r) in
297
  (* let all_post_analyses_per_tree = List.map repo_and_post_analyses_per_tree_simu ~f:(fun (_, p, _, _) -> p) in *)
298 299 300 301
  let all_dataset_results = List.concat_map repo_and_post_analyses_per_tree_simu ~f:(fun (_, _, dr, _) -> dr) in
  let simu_dataset_l = List.concat_map repo_and_post_analyses_per_tree_simu ~f:(fun (_, _, _, d) -> d) in
  let post_analyses_simu = Post_analyses.post_analyses_simu_of_simu_dataset_l ~simu_dataset_l in
  let repo_of_post_analyses_simu = Post_analyses.repo_of_post_analyses_simu ~post_analyses_simu in
302
  let profile_prefix = profile.profile_n in
303
  (* let repo_post_analyses_all_trees = Post_analyses.repo_post_analyses_all_trees_of_all_post_analyses_per_tree ~all_post_analyses_per_tree ~profile_prefix in *)
Carine Rey's avatar
Carine Rey committed
304
  let repo_post_analyses_all_trees = [] in
305 306 307 308 309
  let repo_post_analyses_all_trees = if only_simu then
      []
    else
      repo_post_analyses_all_trees
  in
310 311 312 313 314 315 316 317
  let repo =
    repo_of_post_analyses_simu @ all_repo_per_tree_simu @ repo_post_analyses_all_trees
    |> Repo.shift profile_prefix
  in
  object
    method repo = repo
    method dataset_results = all_dataset_results
  end
318

Carine Rey's avatar
Carine Rey committed
319
let validation_main ~outdir ?(indir = "") ?(ns = 0) ?(np = 2) ?(mem = 2) ~preview ~use_diffsel ~use_c60 ~calc_dnds ~tree_dir ~profile_fn ~use_concat ~add_indels ~only_simu ?(seed = Random.int Int.max_value) () =
Carine Rey's avatar
Carine Rey committed
320 321 322 323 324 325
  let time_logger = Time_logger.create () in
  let loggers = [
      Console_logger.create () ;
      Bistro_utils.Html_logger.create "report.html" ;
      time_logger#logger ;
  ] in
Carine Rey's avatar
Carine Rey committed
326
  printf "Global seed: %i\n" seed;
327
  Out_channel.write_all "global.seed" ~data:(string_of_int seed);
328
  (* simulated trees *)
Philippe Veber's avatar
Philippe Veber committed
329
  Random.init seed ;
330
  let nb_sites = if ns = 0 then (if preview then 20 else 50) else ns in
Carine Rey's avatar
Carine Rey committed
331
  let profile = Profile.profile_l_of_splitted_profile ~nb_cat:All ~nb_sites profile_fn ~seed:(Random.int Int.max_value) in
Philippe Veber's avatar
Philippe Veber committed
332
  let sim_repo_l = derive_profile ~preview ~use_diffsel ~use_c60 ~tree_dir ~profile ~use_concat ~add_indels ~only_simu ~seed () in
333
  (* real trees *)
Philippe Veber's avatar
Philippe Veber committed
334
  let indir_dataset_l = if String.is_empty indir then [] else parse_input_data ~seed ~calc_dnds indir in
335 336
  let dataset_l = indir_dataset_l in
  let dataset_results_l =
337
    if only_simu then
338 339
      []
    else
340
      derive_det ~dataset_l ~preview ~use_diffsel ~use_c60
341 342
  in
  let repo_real_trees = [
Philippe Veber's avatar
Philippe Veber committed
343
    Dataset.repo dataset_l ;
Carine Rey's avatar
rm plot  
Carine Rey committed
344
    repo_of_real_dataset_results_l ~dataset_results_l ();
345
  ] |> List.concat
Carine Rey's avatar
Carine Rey committed
346
  in
347
  let repo = (Repo.shift "Simulated_datasets" sim_repo_l#repo) @ (Repo.shift "Real_datasets" repo_real_trees) in
Philippe Veber's avatar
Philippe Veber committed
348
  Repo.build_main ~outdir ~np ~mem:(`GB mem) ~loggers repo ;
349
  time_logger#report sim_repo_l#dataset_results (Filename.concat outdir ("elapsed_time_" ^ (Unix.time () |> int_of_float |> string_of_int) ^ ".tsv"))
Philippe Veber's avatar
Philippe Veber committed
350 351

let validation_command =
Philippe Veber's avatar
Philippe Veber committed
352
  let () = Random.self_init() in
Philippe Veber's avatar
Philippe Veber committed
353 354 355 356 357 358 359
  let open Command.Let_syntax in
  Command.basic
    ~summary:"Run simulation pipeline"
    [%map_open
      let outdir =
        flag "--outdir" (required string) ~doc:"PATH Output directory"
      and indir =
360
        flag "--indir" (optional string) ~doc:"PATH Input directory"
Philippe Veber's avatar
Philippe Veber committed
361 362
      and preview =
        flag "--preview-mode" no_arg ~doc:" Preview mode"
363 364
      and use_diffsel =
        flag "--diffsel" no_arg ~doc:" use the diffsel method (very slow)."
365
      and use_c60 =
366
        flag "--c60" no_arg ~doc:" use the pcoc c60 method (slow)."
Carine Rey's avatar
Carine Rey committed
367 368
      and calc_dnds =
        flag "--dnds" no_arg ~doc:" calculate dn ds dnds trees (slow)."
Carine Rey's avatar
Carine Rey committed
369 370
      and only_simu =
        flag "--only-simu" no_arg ~doc:" mode only simulation"
371 372
      and use_concat =
        flag "--use-concat" no_arg ~doc:" Use concatenation H0+Ha_pcoc"
Carine Rey's avatar
Carine Rey committed
373 374
      and add_indels =
        flag "--add-indels" no_arg ~doc:" add indels in H*NeG5"
375
      and ns =
376
        flag "--ns" (optional int) ~doc:"INT Number of sites to simulate (WARNING: will be multiplied per 3)"
Philippe Veber's avatar
Philippe Veber committed
377 378 379 380 381 382 383 384
      and np =
        flag "--np" (optional int) ~doc:"INT Number of available processors"
      and mem =
        flag "--mem" (optional int) ~doc:"INT Available memory (in GB)"
      and tree_dir =
        flag "--tree-dir" (required string) ~doc:"PATH Path to tree directory"
      and profile_fn =
        flag "--profile-fn" (required string) ~doc:"PATH Path to profile file"
Philippe Veber's avatar
Philippe Veber committed
385 386
      and seed =
        flag "--seed" (optional int) ~doc:"INT Global seed"
Philippe Veber's avatar
Philippe Veber committed
387
      in
Carine Rey's avatar
Carine Rey committed
388
      validation_main ~outdir ?indir ?ns ?np ?mem ~preview ~use_diffsel ~use_c60 ~calc_dnds ~tree_dir ~profile_fn ~use_concat ~only_simu ~add_indels ?seed
Philippe Veber's avatar
Philippe Veber committed
389
    ]
LANORE Vincent's avatar
LANORE Vincent committed
390

Carine Rey's avatar
Carine Rey committed
391
let realdata_main ~outdir ~indir ~preview ~use_diffsel ~use_c60 ~calc_dnds ?(np = 2) ?(mem = 2) ?(seed = Random.int Int.max_value) () =
Carine Rey's avatar
Carine Rey committed
392 393 394 395 396
  let loggers = [
      Console_logger.create () ;
      Bistro_utils.Html_logger.create "report.html" ;
  ] in

LANORE Vincent's avatar
LANORE Vincent committed
397 398 399 400 401 402
  (* seed-related stuff *)
  printf "Global seed: %i\n" seed;
  Out_channel.write_all "global.seed" ~data:(string_of_int seed);
  Random.init seed ;

  (* real trees *)
Carine Rey's avatar
rm plot  
Carine Rey committed
403
  printf "Input parsing\n";
Carine Rey's avatar
Carine Rey committed
404
  let dataset_l = parse_input_data ~seed ~calc_dnds indir in
Carine Rey's avatar
rm plot  
Carine Rey committed
405
  printf "Convergent evolution detection\n";
LANORE Vincent's avatar
LANORE Vincent committed
406 407
  let dataset_results_l = derive_det ~dataset_l ~preview ~use_diffsel ~use_c60 in
  let repo_real_trees = [
Philippe Veber's avatar
Philippe Veber committed
408
    Dataset.repo dataset_l ;
Carine Rey's avatar
rm plot  
Carine Rey committed
409
    repo_of_real_dataset_results_l ~dataset_results_l ();
LANORE Vincent's avatar
LANORE Vincent committed
410 411 412
  ] |> List.concat (* list of repos *)
  in
  let repo = Repo.shift "Real_datasets" repo_real_trees in
Philippe Veber's avatar
Philippe Veber committed
413
  Repo.build_main ~outdir ~np ~mem:(`GB mem) ~loggers repo
LANORE Vincent's avatar
LANORE Vincent committed
414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430
  (* time_logger#report sim_repo_l#dataset_results (Filename.concat outdir ("elapsed_time_" ^ (Unix.time () |> int_of_float |> string_of_int) ^ ".tsv")) *)

let realdata_command =
  let open Command.Let_syntax in
  Command.basic
    ~summary:"Run simulation pipeline on real data only"
    [%map_open
      let outdir =
        flag "--outdir" (required string) ~doc:"PATH Output directory"
      and indir =
        flag "--indir" (required string) ~doc:"PATH Input directory"
      and preview =
        flag "--preview-mode" no_arg ~doc:" Preview mode"
      and use_diffsel =
        flag "--diffsel" no_arg ~doc:" use the diffsel method (very slow)."
      and use_c60 =
        flag "--c60" no_arg ~doc:" use the pcoc c60 method (slow)."
Carine Rey's avatar
Carine Rey committed
431 432
      and calc_dnds =
        flag "--dnds" no_arg ~doc:" calculate dn ds dnds trees (slow)."
LANORE Vincent's avatar
LANORE Vincent committed
433 434 435 436 437 438 439
      and np =
        flag "--np" (optional int) ~doc:"INT Number of available processors"
      and mem =
        flag "--mem" (optional int) ~doc:"INT Available memory (in GB)"
      and seed =
        flag "--seed" (optional int) ~doc:"INT Global seed"
      in
Carine Rey's avatar
Carine Rey committed
440
      realdata_main ~outdir ~indir ?np ?mem ~preview ~use_diffsel ~use_c60 ~calc_dnds ?seed
Philippe Veber's avatar
Philippe Veber committed
441
    ]