Commit e96940c6 authored by Philippe Veber's avatar Philippe Veber
Browse files

refactored bppseqgen

parent 6fbc0cea
...@@ -57,8 +57,8 @@ clean: ...@@ -57,8 +57,8 @@ clean:
rm -rf example/_bistro rm -rf example/_bistro
rm -rf example/outdir rm -rf example/outdir
.PHONY: clean_test .PHONY: clean-test
clean_test: clean-test:
rm -rf example/_bistro rm -rf example/_bistro
rm -rf example/outdir_test rm -rf example/outdir_test
rm -rf example/report.log rm -rf example/report.log
......
...@@ -4,8 +4,6 @@ open Bistro.EDSL ...@@ -4,8 +4,6 @@ open Bistro.EDSL
open Bistro_bioinfo.Std open Bistro_bioinfo.Std
open File_formats open File_formats
type bppseqgen_multi_profiles
let env = Env.env_bppsuite let env = Env.env_bppsuite
let assign k v = let assign k v =
...@@ -31,44 +29,87 @@ let conf_file_bppseqgen ~tree ~out ~nb_sites ~config = ...@@ -31,44 +29,87 @@ let conf_file_bppseqgen ~tree ~out ~nb_sites ~config =
@ config @ config
) )
let bppseqgen ?(descr="") ~nb_sites ~tree ~config : nucleotide_fasta workflow = module Bppseqgen = struct
let config_f = dest // "config.bpp" in let bpp_config_base = {|
let out = dest // "seq.fa" in alphabet=Codon(letter=DNA)
workflow ~descr:("bppsuite.bppseqgen" ^ descr) [ genetic_code = Standard
docker env ( input.tree.format=Nhx
and_list [ output.internal.sequences=no
mkdir_p dest; nonhomogeneous = general
cmd "cat" ~stdout:config_f [(file_dump (conf_file_bppseqgen ~tree ~out ~nb_sites ~config))]; rate_distribution=Constant()
cmd "bppseqgen" [ |}
assign "param" config_f;
]
]
)
] / selector ["seq.fa"]
let conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~ne_c ~ne_a ~config ~nb_sites_per_profile = (* let bpp_config_H0_F= seq ~sep:"\n" [
seq ~sep:"\n" ( seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
[ seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
assign "input.tree.file" (dep tree) ; ]
assign "input.tree.scale" (int 3) ;
assign "PROFILE_F" (dep profile_f) ; let bpp_config_HaPCOC_F = seq ~sep:"\n" [
assign "number_of_sites" (int nb_sites_per_profile) ; seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
assign "NE_1" (float ne_a) ; seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2)" ] ;
assign "NE_C" (float ne_c) ; seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
assign "NE_T" (float ne_c) ; seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
] *)
let bpp_config_H0_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "model2=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
] ]
@ config
)
let bppseqgen_multi_profiles_script ~config ~out ~profile_c ~seed = let bpp_config_HaPCOC_F_Ne = seq ~sep:"\n" [
let vars = [ seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
"FINAL_OUT", ident out ; seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2, Ns=$(NE_T))" ] ;
"PARAM", config ; seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
"PROFILE_C", ident profile_c ; seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
"RANDOM", int seed ; ]
let bpp_config_HaPC_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_T))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_of_model (m : Convergence_hypothesis.t) = match m with
| H0 _ -> bpp_config_H0_F_Ne
| HaPC _ -> bpp_config_HaPC_F_Ne
| HaPCOC _ -> bpp_config_HaPCOC_F_Ne
let bpp_config_F m = [
string bpp_config_base ;
bpp_config_of_model m ;
] ]
in
bash_script vars {| let conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~ne_c ~ne_a ~hypothesis ~nb_sites_per_profile =
seq ~sep:"\n" (
[
assign "input.tree.file" (dep tree) ;
assign "input.tree.scale" (int 3) ;
assign "PROFILE_F" (dep profile_f) ;
assign "number_of_sites" (int nb_sites_per_profile) ;
assign "NE_1" (float ne_a) ;
assign "NE_C" (float ne_c) ;
assign "NE_T" (float ne_c) ;
]
@ bpp_config_F hypothesis
)
let bppseqgen_multi_profiles_script ~config ~out ~profile_c ~seed =
let vars = [
"FINAL_OUT", ident out ;
"PARAM", config ;
"PROFILE_C", ident profile_c ;
"RANDOM", int seed ;
]
in
bash_script vars {|
i=0 i=0
while read -r line while read -r line
...@@ -89,39 +130,44 @@ let bppseqgen_multi_profiles_script ~config ~out ~profile_c ~seed = ...@@ -89,39 +130,44 @@ let bppseqgen_multi_profiles_script ~config ~out ~profile_c ~seed =
|} |}
let multi_profiles ?(descr="") ~profile_f ~profile_c ~tree ~tree_dataset ~hypothesis ~ne_c ~ne_a ~seed =
let nb_sites_per_profile = 1 in
(* let nb_combis = Pervasives.(nb_sites / nb_sites_per_profile) in *)
let config_f = dest // "config.bpp" in
(* let profile_c_ok = tmp // "profiles_c.tsv" in *)
let profile_c_ok = dep profile_c in
let out = dest // "seq.fa" in
let nodes = Tree_dataset.nodes tree_dataset hypothesis in
workflow ~descr:("bppsuite.bppseqgen" ^ descr) [
docker env (
and_list [
mkdir_p dest;
mkdir_p tmp;
cd tmp;
(*cmd "head" ~stdout:profile_c_ok [
opt "-n" int nb_sites;
dep profile_c
];*)
cmd "cat" ~stdout:config_f [
file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~hypothesis ~ne_c ~ne_a ~nb_sites_per_profile) ;
file_dump (seq ~sep:"\n" [ string "\n" ]) ;
dep nodes ;
];
cmd "bash" [(file_dump (bppseqgen_multi_profiles_script ~config:config_f ~out ~profile_c:profile_c_ok ~seed))];
]
)
]
let alignment run_bppseqgen_multi_profiles : nucleotide_fasta workflow =
run_bppseqgen_multi_profiles / selector ["seq.fa"]
let info run_bppseqgen_multi_profiles : text_file workflow =
run_bppseqgen_multi_profiles / selector ["seq.fa.info"]
end
let bppseqgen_multi_profiles ?(descr="") ~profile_f ~profile_c ~nb_sites ~tree ~config ~ne_c ~ne_a ~nodes ~seed : bppseqgen_multi_profiles directory workflow =
let nb_sites_per_profile = 1 in
(* let nb_combis = Pervasives.(nb_sites / nb_sites_per_profile) in *)
let config_f = dest // "config.bpp" in
(* let profile_c_ok = tmp // "profiles_c.tsv" in *)
let profile_c_ok = dep profile_c in
let out = dest // "seq.fa" in
workflow ~descr:("bppsuite.bppseqgen" ^ descr) [
docker env (
and_list [
mkdir_p dest;
mkdir_p tmp;
cd tmp;
(*cmd "head" ~stdout:profile_c_ok [
opt "-n" int nb_sites;
dep profile_c
];*)
cmd "cat" ~stdout:config_f [
file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~config ~ne_c ~ne_a ~nb_sites_per_profile) ;
file_dump (seq ~sep:"\n" [ string "\n" ]) ;
dep nodes ;
];
cmd "bash" [(file_dump (bppseqgen_multi_profiles_script ~config:config_f ~out ~profile_c:profile_c_ok ~seed))];
]
)
]
let bppseqgen_multi_profiles_get_fa run_bppseqgen_multi_profiles : nucleotide_fasta workflow =
run_bppseqgen_multi_profiles / selector ["seq.fa"]
let bppseqgen_multi_profiles_get_info run_bppseqgen_multi_profiles : text_file workflow =
run_bppseqgen_multi_profiles / selector ["seq.fa.info"]
let conf_file_bppseqman_fna2faa ~fna = let conf_file_bppseqman_fna2faa ~fna =
seq ~sep:"\n" [ seq ~sep:"\n" [
......
...@@ -2,35 +2,27 @@ open Bistro.Std ...@@ -2,35 +2,27 @@ open Bistro.Std
open Bistro_bioinfo.Std open Bistro_bioinfo.Std
open File_formats open File_formats
type bppseqgen_multi_profiles module Bppseqgen : sig
val multi_profiles :
val bppseqgen : ?descr : string ->
?descr : string -> profile_f: text_file workflow ->
nb_sites:int -> profile_c: text_file workflow ->
tree:nhx workflow -> tree:nhx workflow ->
config:Bistro.Template.t list -> tree_dataset:[`tree_dataset] directory workflow ->
nucleotide_fasta workflow hypothesis:Convergence_hypothesis.t ->
ne_c: float ->
val bppseqgen_multi_profiles : ne_a: float ->
?descr : string -> seed:int ->
profile_f: text_file workflow -> [`bppseqgen] directory workflow
profile_c: text_file workflow ->
nb_sites:int -> val alignment :
tree:nhx workflow -> [`bppseqgen] directory workflow ->
config:Bistro.Template.t list -> nucleotide_fasta workflow
ne_c: float ->
ne_a: float -> val info :
nodes:text_file workflow -> [`bppseqgen] directory workflow ->
seed:int -> text_file workflow
bppseqgen_multi_profiles directory workflow end
val bppseqgen_multi_profiles_get_fa :
bppseqgen_multi_profiles directory workflow ->
nucleotide_fasta workflow
val bppseqgen_multi_profiles_get_info :
bppseqgen_multi_profiles directory workflow ->
text_file workflow
val fna2faa : val fna2faa :
fna:nucleotide_fasta workflow -> fna:nucleotide_fasta workflow ->
......
...@@ -37,60 +37,3 @@ let h0_hapc_forall nes_list = ...@@ -37,60 +37,3 @@ let h0_hapc_forall nes_list =
let assign k v = let assign k v =
seq ~sep:"=" [ string k ; v ] seq ~sep:"=" [ string k ; v ]
let bpp_config_base = {|
alphabet=Codon(letter=DNA)
genetic_code = Standard
input.tree.format=Nhx
output.internal.sequences=no
nonhomogeneous = general
rate_distribution=Constant()
|}
(* let bpp_config_H0_F= seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPCOC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2)" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
] *)
let bpp_config_H0_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "model2=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPCOC_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2, Ns=$(NE_T))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPC_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_T))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_of_model m = match m with
| H0 _ -> bpp_config_H0_F_Ne
| HaPC _ -> bpp_config_HaPC_F_Ne
| HaPCOC _ -> bpp_config_HaPCOC_F_Ne
let bpp_config_F m = [
string bpp_config_base ;
bpp_config_of_model m ;
]
...@@ -17,23 +17,3 @@ let workflow_of_template t = ...@@ -17,23 +17,3 @@ let workflow_of_template t =
] ]
let config_file () =
let open Bistro.EDSL in
let model = Convergence_hypothesis.(H0 (Fixed 1.)) in
let tree_dir = "/home/pveber/w/reviewphiltrans/example/trees_test/" in
let tree = "tree_small_bl.nhx" in
let input_tree = Bistro.EDSL.input (Filename.concat tree_dir tree) in
let tree_prefix = Filename.chop_extension tree in
let tree_dataset = Tree_dataset.prepare ~descr:("simulated_data." ^ tree_prefix) input_tree in
let nodes = Tree_dataset.nodes tree_dataset model in
workflow [
cmd "cat" ~stdout:dest [
file_dump (Bistro.EDSL.seq ~sep:"\n" (Convergence_hypothesis.bpp_config_F model @ [ string "\n" ])) ;
dep nodes ;
];
]
let main () =
less (config_file ())
...@@ -53,8 +53,6 @@ let calc_fixed_seed ~(str:string) (seed:int) : int = ...@@ -53,8 +53,6 @@ let calc_fixed_seed ~(str:string) (seed:int) : int =
let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns ~seed = let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns ~seed =
let model_prefix = Convergence_hypothesis.string_of_model model in let model_prefix = Convergence_hypothesis.string_of_model model in
let nb_sites = ns in
let nodes = Tree_dataset.nodes tree_dataset model in
let tree = Tree_dataset.tree tree_dataset `Simulation in let tree = Tree_dataset.tree tree_dataset `Simulation in
let descr = "."^model_prefix^"."^tree_prefix in let descr = "."^model_prefix^"."^tree_prefix in
(* only 1 profile or 1 couple of profiles*) (* only 1 profile or 1 couple of profiles*)
...@@ -62,7 +60,6 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr ...@@ -62,7 +60,6 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr
let fna = Bppsuite.bppseqgen ~descr ~nb_sites ~tree ~config in let fna = Bppsuite.bppseqgen ~descr ~nb_sites ~tree ~config in
*) *)
(* with several profiles or couples of profiles *) (* with several profiles or couples of profiles *)
let config_p = Convergence_hypothesis.bpp_config_F model in
let ne_g = neg_of_model model in let ne_g = neg_of_model model in
let ne_c = nec_of_model model in let ne_c = nec_of_model model in
let ne_a = ne_g in let ne_a = ne_g in
...@@ -70,9 +67,9 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr ...@@ -70,9 +67,9 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr
let profile_c = profile.profile_c in let profile_c = profile.profile_c in
(*let seed = Random.int Int.max_value in*) (*let seed = Random.int Int.max_value in*)
let seed = calc_fixed_seed ~str:descr seed in let seed = calc_fixed_seed ~str:descr seed in
let run_fna = Bppsuite.bppseqgen_multi_profiles ~descr ~nb_sites ~tree ~nodes ~config:config_p ~profile_f ~profile_c ~ne_c ~ne_a ~seed in let run_fna = Bppsuite.Bppseqgen.multi_profiles ~descr ~tree ~tree_dataset ~hypothesis:model ~profile_f ~profile_c ~ne_c ~ne_a ~seed in
let fna = Bppsuite.bppseqgen_multi_profiles_get_fa run_fna in let fna = Bppsuite.Bppseqgen.alignment run_fna in
let fna_infos = Some (Bppsuite.bppseqgen_multi_profiles_get_info run_fna) in let fna_infos = Some (Bppsuite.Bppseqgen.info run_fna) in
let faa = Bppsuite.fna2faa ~fna in let faa = Bppsuite.fna2faa ~fna in
let ready_dataset = { Ready_dataset.input_tree = input_tree ; tree_dataset ; fna; faa; fna_infos } in let ready_dataset = { Ready_dataset.input_tree = input_tree ; tree_dataset ; fna; faa; fna_infos } in
...@@ -411,4 +408,4 @@ let realdata_command = ...@@ -411,4 +408,4 @@ let realdata_command =
flag "--seed" (optional int) ~doc:"INT Global seed" flag "--seed" (optional int) ~doc:"INT Global seed"
in in
realdata_main ~outdir ~indir ?np ?mem ~preview ~use_diffsel ~use_c60 ?seed realdata_main ~outdir ~indir ?np ?mem ~preview ~use_diffsel ~use_c60 ?seed
] ]
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment