bppsuite.ml 8.33 KB
Newer Older
LANORE Vincent's avatar
LANORE Vincent committed
1
open Core
2 3 4
open Bistro.Std
open Bistro.EDSL
open Bistro_bioinfo.Std
Carine Rey's avatar
Carine Rey committed
5
open File_formats
LANORE Vincent's avatar
LANORE Vincent committed
6

7
let env = Env.env_bppsuite
LANORE Vincent's avatar
LANORE Vincent committed
8 9 10 11

let assign k v =
  seq ~sep:"=" [ string k ; v ]

Carine Rey's avatar
Carine Rey committed
12 13 14 15 16 17 18 19 20 21 22
let bash_script args code =
  let prelude =
    args
    |> List.map ~f:(fun (k, v) ->
        assign k v
      )
    |> seq ~sep:"\n"
  in
  seq ~sep:"\n" [ prelude ; string code ]

let conf_file_bppseqgen ~tree ~out ~nb_sites ~config =
23 24 25
  seq ~sep:"\n" (
    [
      assign "input.tree.file" (dep tree) ;
26
      assign "output.sequence.file" out ;
27 28 29 30
      assign "number_of_sites" (int nb_sites) ;
    ]
    @ config
  )
LANORE Vincent's avatar
LANORE Vincent committed
31

Philippe Veber's avatar
Philippe Veber committed
32 33 34 35 36 37 38 39 40
module Bppseqgen = struct
  let bpp_config_base = {|
alphabet=Codon(letter=DNA)
genetic_code = Standard
input.tree.format=Nhx
output.internal.sequences=no
nonhomogeneous = general
rate_distribution=Constant()
|}
Carine Rey's avatar
Carine Rey committed
41

Philippe Veber's avatar
Philippe Veber committed
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
  (* let bpp_config_H0_F= seq ~sep:"\n" [
      seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
      seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
     ]

     let bpp_config_HaPCOC_F = seq ~sep:"\n" [
      seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
      seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2)" ] ;
      seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
      seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
     ]
     let bpp_config_HaPC_F = seq ~sep:"\n" [
      seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
      seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
      seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
      seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
     ] *)

  let bpp_config_H0_F_Ne = seq ~sep:"\n" [
      seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
      seq [string "model2=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_C))" ] ;
      seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
Carine Rey's avatar
Carine Rey committed
64 65
    ]

Philippe Veber's avatar
Philippe Veber committed
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
  let bpp_config_HaPCOC_F_Ne = seq ~sep:"\n" [
      seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
      seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2, Ns=$(NE_T))" ] ;
      seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
      seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
    ]

  let bpp_config_HaPC_F_Ne = seq ~sep:"\n" [
      seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
      seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_T))" ] ;
      seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
      seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
    ]

  let bpp_config_of_model (m : Convergence_hypothesis.t) = match m with
    | H0 _ -> bpp_config_H0_F_Ne
    | HaPC _ -> bpp_config_HaPC_F_Ne
    | HaPCOC _ -> bpp_config_HaPCOC_F_Ne

  let bpp_config_F m = [
    string bpp_config_base ;
    bpp_config_of_model m ;
Carine Rey's avatar
Carine Rey committed
88
  ]
Philippe Veber's avatar
Philippe Veber committed
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112

  let conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~ne_c ~ne_a ~hypothesis ~nb_sites_per_profile =
    seq ~sep:"\n" (
      [
        assign "input.tree.file" (dep tree) ;
        assign "input.tree.scale" (int 3) ;
        assign "PROFILE_F" (dep profile_f) ;
        assign "number_of_sites" (int nb_sites_per_profile) ;
        assign "NE_1"            (float ne_a) ;
        assign "NE_C"            (float ne_c) ;
        assign "NE_T"            (float ne_c) ;
      ]
      @ bpp_config_F hypothesis
    )

  let bppseqgen_multi_profiles_script ~config  ~out ~profile_c ~seed =
    let vars = [
      "FINAL_OUT", ident out ;
      "PARAM", config ;
      "PROFILE_C", ident profile_c ;
      "RANDOM", int seed ;
    ]
    in
    bash_script vars {|
Carine Rey's avatar
Carine Rey committed
113

114 115
  i=0
  while read -r line
Carine Rey's avatar
Carine Rey committed
116
  do
117 118 119
    ((i++))
    COL_M1=`echo $line | cut -f 1 -d " "`
    COL_M2=`echo $line | cut -f 2 -d " "`
Carine Rey's avatar
Carine Rey committed
120
    echo "i: $i" "COL_M1: $COL_M1" "COL_M2: $COL_M2"
121 122 123
    SEED=${RANDOM}
    echo seed=${SEED}
    bppseqgen param=$PARAM i=$i COL_M1=$COL_M1 COL_M2=$COL_M2 output.sequence.file=out_int_"$i".fa seed=${SEED}
124 125 126
  done < "$PROFILE_C"

  cp $PROFILE_C $FINAL_OUT.info
Carine Rey's avatar
Carine Rey committed
127 128 129 130 131 132

  # horizontal concatenation of fasta
  catfasta2phyml.pl -f out_int_* > $FINAL_OUT

|}

Philippe Veber's avatar
Philippe Veber committed
133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
  let multi_profiles ?(descr="") ~profile_f ~profile_c ~tree ~tree_dataset ~hypothesis ~ne_c ~ne_a ~seed =
    let nb_sites_per_profile = 1 in
    (* let nb_combis = Pervasives.(nb_sites / nb_sites_per_profile) in *)
    let config_f = dest // "config.bpp" in
    (* let profile_c_ok = tmp // "profiles_c.tsv" in *)
    let profile_c_ok = dep profile_c in
    let out = dest // "seq.fa" in
    let nodes = Tree_dataset.nodes tree_dataset hypothesis in
    workflow ~descr:("bppsuite.bppseqgen" ^ descr) [
      docker env (
        and_list [
          mkdir_p dest;
          mkdir_p tmp;
          cd tmp;
          (*cmd "head" ~stdout:profile_c_ok [
            opt "-n" int nb_sites;
            dep profile_c
            ];*)
          cmd "cat" ~stdout:config_f [
            file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~hypothesis ~ne_c ~ne_a ~nb_sites_per_profile) ;
            file_dump (seq ~sep:"\n" [ string "\n" ]) ;
            dep nodes ;
          ];
          cmd "bash" [(file_dump (bppseqgen_multi_profiles_script ~config:config_f  ~out  ~profile_c:profile_c_ok ~seed))];
        ]
      )
    ]

  let alignment run_bppseqgen_multi_profiles : nucleotide_fasta workflow =
    run_bppseqgen_multi_profiles / selector ["seq.fa"]

  let info run_bppseqgen_multi_profiles : text_file workflow =
    run_bppseqgen_multi_profiles / selector ["seq.fa.info"]

end
Carine Rey's avatar
Carine Rey committed
168

Carine Rey's avatar
Carine Rey committed
169

Carine Rey's avatar
Carine Rey committed
170 171


Carine Rey's avatar
Carine Rey committed
172
let conf_file_bppseqman_fna2faa ~fna =
Carine Rey's avatar
Carine Rey committed
173 174 175 176 177
  seq ~sep:"\n" [
    assign "input.sequence.file" (dep fna) ;
    assign "output.sequence.file" dest ;
    string {|alphabet=Codon(letter=DNA)
             genetic_code = Standard
178 179
             input.sequence.remove_stop_codons = no
             input.sequence.sites_to_use = all
Carine Rey's avatar
Carine Rey committed
180 181 182 183 184 185 186 187
             input.alignment = true
             sequence.manip = Translate
           |}
  ]

let fna2faa ~(fna:nucleotide_fasta workflow) : aminoacid_fasta workflow =
  workflow ~descr:"bppsuite.fna2faa" [
    cmd "bppseqman" ~env [
Carine Rey's avatar
Carine Rey committed
188 189 190 191
      assign "param" (file_dump (conf_file_bppseqman_fna2faa ~fna)) ;
    ]
  ]

Carine Rey's avatar
Carine Rey committed
192
let conf_file_bppseqman_fna2phy ~fna =
Carine Rey's avatar
Carine Rey committed
193 194 195
  seq ~sep:"\n" [
    assign "input.sequence.file" (dep fna) ;
    assign "output.sequence.file" dest ;
196
    assign "output.sequence.format" (string "Phylip(order=interleaved, type=extended)") ;
Carine Rey's avatar
Carine Rey committed
197
    string {| input.alignment = true
198 199 200
              input.sequence.remove_stop_codons = no
              input.sequence.sites_to_use = all
              sequence.manip =
Carine Rey's avatar
Carine Rey committed
201 202
           |}
  ]
Carine Rey's avatar
Carine Rey committed
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221
let conf_file_bppseqman_faa2phy ~faa =
  seq ~sep:"\n" [
    assign "alphabet" (string "Protein") ;
    assign "input.sequence.file" (dep faa) ;
    assign "output.sequence.file" dest ;
    assign "output.sequence.format" (string "Phylip(order=interleaved, type=extended)") ;
    string {| input.alignment = true
              input.sequence.remove_stop_codons = no
              input.sequence.sites_to_use = all
              sequence.manip =
           |}
  ]

let fna2phy ~(fna: nucleotide_fasta workflow) : nucleotide_phylip workflow =
  workflow ~descr:"bppsuite.fna2phy_interleaved" [
    cmd "bppseqman" ~env [
      assign "param" (file_dump (conf_file_bppseqman_fna2phy ~fna)) ;
    ]
  ]
Carine Rey's avatar
Carine Rey committed
222

Carine Rey's avatar
Carine Rey committed
223 224
let faa2phy ~(faa: aminoacid_fasta workflow) : aminoacid_phylip workflow =
  workflow ~descr:"bppsuite.faa2phy_interleaved" [
Carine Rey's avatar
Carine Rey committed
225
    cmd "bppseqman" ~env [
Carine Rey's avatar
Carine Rey committed
226
      assign "param" (file_dump (conf_file_bppseqman_faa2phy ~faa)) ;
LANORE Vincent's avatar
LANORE Vincent committed
227 228
    ]
  ]
Carine Rey's avatar
Carine Rey committed
229 230 231 232 233


let paste_fna  ~(fna_l: nucleotide_fasta workflow list) : nucleotide_fasta workflow =
  workflow ~descr:"bppsuite.catfasta" [
    cmd "catfasta2phyml.pl" ~stdout:dest ~env (List.concat [
234 235 236
        [string "-f" ] ;
        List.map fna_l ~f:(fun fna -> dep fna) ;
      ])
Carine Rey's avatar
Carine Rey committed
237
  ]