Commit c1ba7517 authored by Carine Rey's avatar Carine Rey
Browse files

add bppseqgen multi profiles

parent 8dd45285
......@@ -5,31 +5,42 @@ open Bistro_bioinfo.Std
open File_formats
let env = docker_image ~account:"carinerey" ~name:"bppsuite" ~tag:"06182018" ()
let assign k v =
seq ~sep:"=" [ string k ; v ]
let bash_script args code =
let prelude =
args
|> List.map ~f:(fun (k, v) ->
assign k v
)
|> seq ~sep:"\n"
in
seq ~sep:"\n" [ prelude ; string code ]
let conf_file_bppseqgen ~tree ~out ~nb_sites ~config ~profile_f =
let conf_file_bppseqgen ~tree ~out ~nb_sites ~config =
seq ~sep:"\n" (
[
assign "input.tree.file" (dep tree) ;
assign "output.sequence.file" out ;
assign "number_of_sites" (int nb_sites) ;
assign "PROFILE_F" (dep profile_f);
]
@ config
)
let bppseqgen ?(descr="") ~profile_f ~nb_sites ~tree ~config : nucleotide_fasta workflow =
let bppseqgen ?(descr="") ~nb_sites ~tree ~config : nucleotide_fasta workflow =
let config_f = dest // "config.bpp" in
let out = dest // "seq.fa" in
workflow ~descr:("bppsuite.bppseqgen" ^ descr) [
docker env (
and_list [
mkdir_p dest;
cmd "cat" ~stdout:config_f [(file_dump (conf_file_bppseqgen ~tree ~out ~nb_sites ~config ~profile_f))];
cmd "cat" ~stdout:config_f [(file_dump (conf_file_bppseqgen ~tree ~out ~nb_sites ~config))];
cmd "bppseqgen" [
assign "param" config_f;
]
......@@ -38,6 +49,66 @@ let bppseqgen ?(descr="") ~profile_f ~nb_sites ~tree ~config : nucleotide_fasta
] / selector ["seq.fa"]
let conf_file_bppseqgen_multi_profiles ~tree ~config =
seq ~sep:"\n" (
[
assign "input.tree.file" (dep tree) ;
]
@ config
)
let bppseqgen_multi_profiles_script ~config ~nb_sites_per_profile ~nb_combis ~nb_cols ~out ~profile_f =
let vars = [
"FINAL_OUT", ident out ;
"PARAM", config ;
"PROFILE_F", dep profile_f ;
"NB_SITES", int nb_sites_per_profile ;
"NB_COMBI_PROFILES", int nb_combis ;
"NB_COLS", int nb_cols ;
]
in
bash_script vars {|
start_i=1
end_i=$NB_COMBI_PROFILES
for ((i=start_i; i<=end_i; i++))
do
echo "i: $i"
shuf -i1-$NB_COLS -n2 -o random_num
COL_M1=`head -n 1 random_num`
COL_M2=`tail -n 1 random_num`
bppseqgen param=$PARAM PROFILE_F=$PROFILE_F i=$i COL_M1=$COL_M1 COL_M2=$COL_M2 number_of_sites=$NB_SITES output.sequence.file=out_int_"$i".fa
done
# horizontal concatenation of fasta
catfasta2phyml.pl -f out_int_* > $FINAL_OUT
|}
let bppseqgen_multi_profiles ?(descr="") ~profile_f ~nb_sites ~tree ~config : nucleotide_fasta workflow =
let nb_sites_per_profile = if nb_sites > 100 then 2 else 1 in
let nb_combis = Pervasives.(nb_sites / nb_sites_per_profile) in
let nb_cols = 100 in (*TODO: automatiser le nb de cols du fichier profile_f*)
let config_f = dest // "config.bpp" in
let out = dest // "seq.fa" in
workflow ~descr:("bppsuite.bppseqgen" ^ descr) [
docker env (
and_list [
mkdir_p dest;
mkdir_p tmp;
cd tmp;
cmd "cat" ~stdout:config_f [(file_dump (conf_file_bppseqgen_multi_profiles ~tree ~config))];
cmd "bash" [(file_dump (bppseqgen_multi_profiles_script ~config:config_f ~nb_sites_per_profile ~nb_combis ~nb_cols ~out ~profile_f))];
]
)
] / selector ["seq.fa"]
let conf_file_bppseqman_fna2faa ~fna =
seq ~sep:"\n" [
assign "input.sequence.file" (dep fna) ;
......
......@@ -3,6 +3,13 @@ open Bistro_bioinfo.Std
open File_formats
val bppseqgen :
?descr : string ->
nb_sites:int ->
tree:nhx workflow ->
config:Bistro.Template.t list ->
nucleotide_fasta workflow
val bppseqgen_multi_profiles :
?descr : string ->
profile_f: _ workflow ->
nb_sites:int ->
......
......@@ -22,13 +22,16 @@ nonhomogeneous = general
rate_distribution=Constant()
|}
let bpp_config_H0 = {|nonhomogeneous.number_of_models = 1
let bpp_config_H0 = {|
model1=Codon_AAFit(model=K80, fitness=FromModel(model=LGL08_CAT_C1(nbCat=10)))
nonhomogeneous.root_freq=FromModel(model=$(model1))
|}
let bpp_config_Ha = {|nonhomogeneous.number_of_models = 2
model1=Codon_AAFit(model=K80, fitness=FromModel(model=LGL08_CAT_C1(nbCat=10)))
model2=Codon_AAFit(model=K80, fitness=FromModel(model=LGL08_CAT_C7(nbCat=10)))
let bpp_config_Ha = {|
model1=Codon_AAFit(model=K80, fitness=FromModel(model=LGL08_CAT_C2(nbCat=60)))
modelT=OneChange(model=Codon_AAFit(model=K80, fitness=FromModel(model=LGL08_CAT_C7(nbCat=10))),register=DnDs, numReg=2)
modelC=Codon_AAFit(model=K80, fitness=FromModel(model=LGL08_CAT_C7(nbCat=10)))
nonhomogeneous.root_freq=FromModel(model=$(model1))
|}
let bpp_config nodes hyp = [
......@@ -42,20 +45,15 @@ let bpp_config nodes hyp = [
]
let bpp_config_H0_F= seq ~sep:"\n" [
(* string {|nonhomogeneous.number_of_models = 1 |} ; Mv in insert nodes*)
assign "COL_M1" (int 100) ;
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1))))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_Ha_F = seq ~sep:"\n" [
(* string {|nonhomogeneous.number_of_models = 3 or 2 |} ; Mv in insert nodes*)
assign "COL_M1" (int 100) ;
assign "COL_M2" (int 98) ;
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2)" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=1)" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1))))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_F nodes hyp = [
......
......@@ -57,10 +57,16 @@ let derive_from_model ~model ~tree ~tree_dataset ~tree_prefix ~profile_f ~previe
let model_prefix = Convergence_hypothesis.string_of_model model in
let nb_sites = if preview then 20 else 1000 in
let nodes = Tree_dataset.nodes tree_dataset model in
let config = Convergence_hypothesis.bpp_config_F nodes model in
let tree = Tree_dataset.tree tree_dataset `Simulation in
let descr = "."^model_prefix^"."^tree_prefix in
let fna = Bppsuite.bppseqgen ~descr ~nb_sites ~tree ~config ~profile_f in
(* only 1 profile or 1 couple of profiles*)
(*let config = Convergence_hypothesis.bpp_config nodes model in
let fna = Bppsuite.bppseqgen ~descr ~nb_sites ~tree ~config in
*)
(* with several profiles or couples of profiles *)
let config_p = Convergence_hypothesis.bpp_config_F nodes model in
let fna = Bppsuite.bppseqgen_multi_profiles ~descr ~nb_sites ~tree ~config:config_p ~profile_f in
let faa = Bppsuite.fna2faa ~fna in
let ready_dataset = { input_tree = tree ; tree_dataset ; fna; faa} in
{ model_prefix; tree_prefix; ready_dataset }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment