ready_dataset.ml 3.29 KB
Newer Older
Carine Rey's avatar
Carine Rey committed
1
open Core
2 3
open Bistro.Std
open Bistro_utils
Carine Rey's avatar
Carine Rey committed
4 5
open Bistro.EDSL
open Bistro_bioinfo.Std
6
open File_formats
7
open Bppsuite
Carine Rey's avatar
Carine Rey committed
8

9 10 11 12
(** A dataset ready to be used: the fields of a raw dataset plus the
    workflows that {!of_raw} derives from them. *)
type t = {
  input_tree : nhx workflow ;
  (** Input tree, copied as-is from the raw dataset. *)

  tree_dataset : [`tree_dataset] directory workflow ;
  (** Directory produced by [Tree_dataset.prepare] from [input_tree]. *)

  fna : nucleotide_fasta workflow ;
  (** Nucleotide sequences, copied as-is from the raw dataset. *)

  fna_infos : text_file workflow option ;
  (** Optional text file attached to [fna]; [None] when the raw dataset
      carries none. *)

  faa : aminoacid_fasta workflow ;
  (** Amino-acid sequences obtained from [fna] with [Bppsuite.fna2faa]. *)
}

17
(** [of_raw ?descr raw_dataset] turns a raw dataset into a ready one.

    The input tree, nucleotide sequences and optional sequence infos are
    carried over unchanged; the tree dataset is built with
    [Tree_dataset.prepare] and the amino-acid sequences with
    [Bppsuite.fna2faa].

    @param descr forwarded to [Tree_dataset.prepare]; defaults to [""]. *)
let of_raw ?(descr="") (raw_dataset : Raw_dataset.t) =
  let { Raw_dataset.input_tree ; fna ; fna_infos } = raw_dataset in
  let tree_dataset = Tree_dataset.prepare ~descr input_tree in
  let faa = Bppsuite.fna2faa ~fna in
  { input_tree ; tree_dataset ; fna ; fna_infos ; faa }
24

25

26
(** [repo rd] builds the flat list of repository items published for the
    ready dataset [rd].

    It always contains the input tree, the two trees recomputed by
    [Phyml.phyml_tree] (model [GTR] on the output of [Bppsuite.fna2phy],
    model [LG] on the output of [Bppsuite.faa2phy]), the files derived from
    [rd.tree_dataset], and the sequence files. The
    ["simulated_sequences.fna_infos"] item is added only when
    [rd.fna_infos] is [Some _]. *)
let repo rd =
  let phy_nt = Bppsuite.fna2phy rd.fna in
  let phy_aa = Bppsuite.faa2phy rd.faa in
  Repo.[
    [
      item ["input_tree.nhx"] rd.input_tree ;
      item ["recalculated_tree_nt.nw"] (Phyml.phyml_tree ~model:GTR ~tree:rd.input_tree phy_nt) ;
      item ["recalculated_tree_aa.nw"] (Phyml.phyml_tree ~model:LG ~tree:rd.input_tree phy_aa) ;
      item ["tree.H0.node_ids" ] (Tree_dataset.nodes rd.tree_dataset H0_NeG5) ;
      item ["tree.Ha.node_ids" ] (Tree_dataset.nodes rd.tree_dataset HaPCOC) ;
      item ["tree.only_convergent_tags.nhx" ] (Tree_dataset.tree rd.tree_dataset `Detection) ;
      item ["tree.only_node_ids.nhx" ] (Tree_dataset.tree rd.tree_dataset `Simulation) ;
      item ["tree.diffsel" ] (Tree_dataset.diffsel_tree rd.tree_dataset) ;
      item ["tree.convergent_topology" ] (Tree_dataset.topological_tree rd.tree_dataset) ;
      item ["simulated_sequences.fna"] rd.fna ;
      item ["simulated_sequences_nt.phy"] phy_nt ;
      item ["simulated_sequences_aa.phy"] phy_aa ;
      item ["simulated_sequences.faa"] rd.faa ;
    ] ;
    (* The match is parenthesised on purpose: a bare [match] inside a list
       literal lets its last arm absorb the following [;] as a sequence,
       so any group appended after it would silently become part of the
       [None] branch instead of a new list element. *)
    (match rd.fna_infos with
     | Some w -> [item ["simulated_sequences.fna_infos"] w]
     | None -> []) ;
  ]
  |> List.concat
51

Carine Rey's avatar
Carine Rey committed
52 53 54 55 56 57
(** [to_raw d] projects a ready dataset back onto a [Raw_dataset.t], keeping
    only the input tree, the nucleotide sequences and the optional sequence
    infos. *)
let to_raw { input_tree ; fna ; fna_infos ; _ } =
  (* [; _] makes it explicit that [tree_dataset] and [faa] are dropped on
     purpose, and avoids the missing-labels warning (warning 9). *)
  { Raw_dataset.input_tree ; fna ; fna_infos }

(** [paste_fna_infos ~fna_infos_l] is a workflow running [cat] on every file
    of [fna_infos_l], in list order, with standard output redirected to the
    workflow destination.

    With an empty [fna_infos_l] the command is [cat] without any file
    argument. *)
let paste_fna_infos ~(fna_infos_l: text_file workflow list) : text_file workflow =
  let inputs = List.map fna_infos_l ~f:dep in
  workflow ~descr:"cat" [
    cmd "cat" ~stdout:dest inputs ;
  ]
Carine Rey's avatar
Carine Rey committed
61 62 63 64 65

(** [paste d1 d2] combines two ready datasets into a new one.

    - The nucleotide sequences of both datasets are merged with
      [Bppsuite.paste_fna].
    - The sequence infos that are present are concatenated with
      [paste_fna_infos]; when neither dataset has any, the result has
      [fna_infos = None].
    - The input tree is the one of [d1]; the tree of [d2] is not used.

    The result is rebuilt through [of_raw], so the tree dataset and the
    amino-acid sequences are recomputed from the combined data. *)
let paste d1 d2 =
  let r_d1 = to_raw d1 in
  let r_d2 = to_raw d2 in
  let fna = Bppsuite.paste_fna [r_d1.fna ; r_d2.fna] in
  let fna_infos =
    match List.filter_opt [r_d1.fna_infos ; r_d2.fna_infos] with
    (* Nothing to concatenate: do not create a [cat] workflow without
       input files, just report the absence of infos. *)
    | [] -> None
    | fna_infos_l -> Some (paste_fna_infos ~fna_infos_l)
  in
  of_raw { Raw_dataset.input_tree = r_d1.input_tree ; fna ; fna_infos }
74 75

(** [add_indels_to_fna ~p ~seed fna] is a workflow that dumps the
    [Scripts.add_indels] script to a file and runs it with [python] in the
    [Env.env_py] environment.

    Arguments passed to the script:
    - [-p]: [p];
    - [-a]: the input file [fna];
    - [-o]: the workflow destination;
    - [-r]: [seed];
    - [-c]: bare flag (its meaning is defined by the script, not shown here). *)
let add_indels_to_fna ~(p:float) ~(seed:int) (fna:nucleotide_fasta workflow) : nucleotide_fasta workflow =
  let script = file_dump (string Scripts.add_indels) in
  let args = [
    script ;
    opt "-p" float p ;
    opt "-a" dep fna ;
    opt "-o" ident dest ;
    opt "-r" int seed ;
    string "-c" ;
  ] in
  workflow ~descr:"add_indels" [
    cmd "python" ~env:Env.env_py args ;
  ]

88
(** [add_indels_to_ready_dataset ~p ~seed d] returns a ready dataset whose
    nucleotide sequences are those of [d] processed by [add_indels_to_fna];
    the input tree and the sequence infos of [d] are kept unchanged.

    The result is rebuilt through [of_raw], so the derived fields are
    recomputed from the new sequences. *)
let add_indels_to_ready_dataset ~p ~seed d =
  let { Raw_dataset.input_tree ; fna ; fna_infos } = to_raw d in
  (* VL: one seed per fna. FIXME: should be computed from global seed and task info instead *)
  let fna_with_indels = add_indels_to_fna ~p ~seed fna in
  of_raw { Raw_dataset.input_tree ; fna = fna_with_indels ; fna_infos }