Commit e96940c6 authored by Philippe Veber's avatar Philippe Veber
Browse files

refactored bppseqgen

parent 6fbc0cea
...@@ -57,8 +57,8 @@ clean: ...@@ -57,8 +57,8 @@ clean:
rm -rf example/_bistro rm -rf example/_bistro
rm -rf example/outdir rm -rf example/outdir
.PHONY: clean_test .PHONY: clean-test
clean_test: clean-test:
rm -rf example/_bistro rm -rf example/_bistro
rm -rf example/outdir_test rm -rf example/outdir_test
rm -rf example/report.log rm -rf example/report.log
......
...@@ -4,8 +4,6 @@ open Bistro.EDSL ...@@ -4,8 +4,6 @@ open Bistro.EDSL
open Bistro_bioinfo.Std open Bistro_bioinfo.Std
open File_formats open File_formats
type bppseqgen_multi_profiles
let env = Env.env_bppsuite let env = Env.env_bppsuite
let assign k v = let assign k v =
...@@ -31,22 +29,65 @@ let conf_file_bppseqgen ~tree ~out ~nb_sites ~config = ...@@ -31,22 +29,65 @@ let conf_file_bppseqgen ~tree ~out ~nb_sites ~config =
@ config @ config
) )
let bppseqgen ?(descr="") ~nb_sites ~tree ~config : nucleotide_fasta workflow = module Bppseqgen = struct
let config_f = dest // "config.bpp" in let bpp_config_base = {|
let out = dest // "seq.fa" in alphabet=Codon(letter=DNA)
workflow ~descr:("bppsuite.bppseqgen" ^ descr) [ genetic_code = Standard
docker env ( input.tree.format=Nhx
and_list [ output.internal.sequences=no
mkdir_p dest; nonhomogeneous = general
cmd "cat" ~stdout:config_f [(file_dump (conf_file_bppseqgen ~tree ~out ~nb_sites ~config))]; rate_distribution=Constant()
cmd "bppseqgen" [ |}
assign "param" config_f;
(* let bpp_config_H0_F= seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPCOC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2)" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
] ]
let bpp_config_HaPC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
] *)
let bpp_config_H0_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "model2=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
] ]
)
] / selector ["seq.fa"]
let conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~ne_c ~ne_a ~config ~nb_sites_per_profile = let bpp_config_HaPCOC_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2, Ns=$(NE_T))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPC_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_T))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_of_model (m : Convergence_hypothesis.t) = match m with
| H0 _ -> bpp_config_H0_F_Ne
| HaPC _ -> bpp_config_HaPC_F_Ne
| HaPCOC _ -> bpp_config_HaPCOC_F_Ne
let bpp_config_F m = [
string bpp_config_base ;
bpp_config_of_model m ;
]
let conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~ne_c ~ne_a ~hypothesis ~nb_sites_per_profile =
seq ~sep:"\n" ( seq ~sep:"\n" (
[ [
assign "input.tree.file" (dep tree) ; assign "input.tree.file" (dep tree) ;
...@@ -57,10 +98,10 @@ let conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~ne_c ~ne_a ~config ~nb_ ...@@ -57,10 +98,10 @@ let conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~ne_c ~ne_a ~config ~nb_
assign "NE_C" (float ne_c) ; assign "NE_C" (float ne_c) ;
assign "NE_T" (float ne_c) ; assign "NE_T" (float ne_c) ;
] ]
@ config @ bpp_config_F hypothesis
) )
let bppseqgen_multi_profiles_script ~config ~out ~profile_c ~seed = let bppseqgen_multi_profiles_script ~config ~out ~profile_c ~seed =
let vars = [ let vars = [
"FINAL_OUT", ident out ; "FINAL_OUT", ident out ;
"PARAM", config ; "PARAM", config ;
...@@ -89,14 +130,14 @@ let bppseqgen_multi_profiles_script ~config ~out ~profile_c ~seed = ...@@ -89,14 +130,14 @@ let bppseqgen_multi_profiles_script ~config ~out ~profile_c ~seed =
|} |}
let multi_profiles ?(descr="") ~profile_f ~profile_c ~tree ~tree_dataset ~hypothesis ~ne_c ~ne_a ~seed =
let bppseqgen_multi_profiles ?(descr="") ~profile_f ~profile_c ~nb_sites ~tree ~config ~ne_c ~ne_a ~nodes ~seed : bppseqgen_multi_profiles directory workflow =
let nb_sites_per_profile = 1 in let nb_sites_per_profile = 1 in
(* let nb_combis = Pervasives.(nb_sites / nb_sites_per_profile) in *) (* let nb_combis = Pervasives.(nb_sites / nb_sites_per_profile) in *)
let config_f = dest // "config.bpp" in let config_f = dest // "config.bpp" in
(* let profile_c_ok = tmp // "profiles_c.tsv" in *) (* let profile_c_ok = tmp // "profiles_c.tsv" in *)
let profile_c_ok = dep profile_c in let profile_c_ok = dep profile_c in
let out = dest // "seq.fa" in let out = dest // "seq.fa" in
let nodes = Tree_dataset.nodes tree_dataset hypothesis in
workflow ~descr:("bppsuite.bppseqgen" ^ descr) [ workflow ~descr:("bppsuite.bppseqgen" ^ descr) [
docker env ( docker env (
and_list [ and_list [
...@@ -108,7 +149,7 @@ let bppseqgen_multi_profiles ?(descr="") ~profile_f ~profile_c ~nb_sites ~tree ~ ...@@ -108,7 +149,7 @@ let bppseqgen_multi_profiles ?(descr="") ~profile_f ~profile_c ~nb_sites ~tree ~
dep profile_c dep profile_c
];*) ];*)
cmd "cat" ~stdout:config_f [ cmd "cat" ~stdout:config_f [
file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~config ~ne_c ~ne_a ~nb_sites_per_profile) ; file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~hypothesis ~ne_c ~ne_a ~nb_sites_per_profile) ;
file_dump (seq ~sep:"\n" [ string "\n" ]) ; file_dump (seq ~sep:"\n" [ string "\n" ]) ;
dep nodes ; dep nodes ;
]; ];
...@@ -117,12 +158,17 @@ let bppseqgen_multi_profiles ?(descr="") ~profile_f ~profile_c ~nb_sites ~tree ~ ...@@ -117,12 +158,17 @@ let bppseqgen_multi_profiles ?(descr="") ~profile_f ~profile_c ~nb_sites ~tree ~
) )
] ]
let bppseqgen_multi_profiles_get_fa run_bppseqgen_multi_profiles : nucleotide_fasta workflow = let alignment run_bppseqgen_multi_profiles : nucleotide_fasta workflow =
run_bppseqgen_multi_profiles / selector ["seq.fa"] run_bppseqgen_multi_profiles / selector ["seq.fa"]
let bppseqgen_multi_profiles_get_info run_bppseqgen_multi_profiles : text_file workflow = let info run_bppseqgen_multi_profiles : text_file workflow =
run_bppseqgen_multi_profiles / selector ["seq.fa.info"] run_bppseqgen_multi_profiles / selector ["seq.fa.info"]
end
let conf_file_bppseqman_fna2faa ~fna = let conf_file_bppseqman_fna2faa ~fna =
seq ~sep:"\n" [ seq ~sep:"\n" [
assign "input.sequence.file" (dep fna) ; assign "input.sequence.file" (dep fna) ;
......
...@@ -2,35 +2,27 @@ open Bistro.Std ...@@ -2,35 +2,27 @@ open Bistro.Std
open Bistro_bioinfo.Std open Bistro_bioinfo.Std
open File_formats open File_formats
type bppseqgen_multi_profiles module Bppseqgen : sig
val multi_profiles :
val bppseqgen :
?descr : string ->
nb_sites:int ->
tree:nhx workflow ->
config:Bistro.Template.t list ->
nucleotide_fasta workflow
val bppseqgen_multi_profiles :
?descr : string -> ?descr : string ->
profile_f: text_file workflow -> profile_f: text_file workflow ->
profile_c: text_file workflow -> profile_c: text_file workflow ->
nb_sites:int ->
tree:nhx workflow -> tree:nhx workflow ->
config:Bistro.Template.t list -> tree_dataset:[`tree_dataset] directory workflow ->
hypothesis:Convergence_hypothesis.t ->
ne_c: float -> ne_c: float ->
ne_a: float -> ne_a: float ->
nodes:text_file workflow ->
seed:int -> seed:int ->
bppseqgen_multi_profiles directory workflow [`bppseqgen] directory workflow
val bppseqgen_multi_profiles_get_fa : val alignment :
bppseqgen_multi_profiles directory workflow -> [`bppseqgen] directory workflow ->
nucleotide_fasta workflow nucleotide_fasta workflow
val bppseqgen_multi_profiles_get_info : val info :
bppseqgen_multi_profiles directory workflow -> [`bppseqgen] directory workflow ->
text_file workflow text_file workflow
end
val fna2faa : val fna2faa :
fna:nucleotide_fasta workflow -> fna:nucleotide_fasta workflow ->
......
...@@ -37,60 +37,3 @@ let h0_hapc_forall nes_list = ...@@ -37,60 +37,3 @@ let h0_hapc_forall nes_list =
let assign k v = let assign k v =
seq ~sep:"=" [ string k ; v ] seq ~sep:"=" [ string k ; v ]
let bpp_config_base = {|
alphabet=Codon(letter=DNA)
genetic_code = Standard
input.tree.format=Nhx
output.internal.sequences=no
nonhomogeneous = general
rate_distribution=Constant()
|}
(* let bpp_config_H0_F= seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPCOC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2)" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
] *)
let bpp_config_H0_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "model2=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPCOC_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2, Ns=$(NE_T))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPC_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_T))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_of_model m = match m with
| H0 _ -> bpp_config_H0_F_Ne
| HaPC _ -> bpp_config_HaPC_F_Ne
| HaPCOC _ -> bpp_config_HaPCOC_F_Ne
let bpp_config_F m = [
string bpp_config_base ;
bpp_config_of_model m ;
]
...@@ -17,23 +17,3 @@ let workflow_of_template t = ...@@ -17,23 +17,3 @@ let workflow_of_template t =
] ]
let config_file () =
let open Bistro.EDSL in
let model = Convergence_hypothesis.(H0 (Fixed 1.)) in
let tree_dir = "/home/pveber/w/reviewphiltrans/example/trees_test/" in
let tree = "tree_small_bl.nhx" in
let input_tree = Bistro.EDSL.input (Filename.concat tree_dir tree) in
let tree_prefix = Filename.chop_extension tree in
let tree_dataset = Tree_dataset.prepare ~descr:("simulated_data." ^ tree_prefix) input_tree in
let nodes = Tree_dataset.nodes tree_dataset model in
workflow [
cmd "cat" ~stdout:dest [
file_dump (Bistro.EDSL.seq ~sep:"\n" (Convergence_hypothesis.bpp_config_F model @ [ string "\n" ])) ;
dep nodes ;
];
]
let main () =
less (config_file ())
...@@ -53,8 +53,6 @@ let calc_fixed_seed ~(str:string) (seed:int) : int = ...@@ -53,8 +53,6 @@ let calc_fixed_seed ~(str:string) (seed:int) : int =
let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns ~seed = let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns ~seed =
let model_prefix = Convergence_hypothesis.string_of_model model in let model_prefix = Convergence_hypothesis.string_of_model model in
let nb_sites = ns in
let nodes = Tree_dataset.nodes tree_dataset model in
let tree = Tree_dataset.tree tree_dataset `Simulation in let tree = Tree_dataset.tree tree_dataset `Simulation in
let descr = "."^model_prefix^"."^tree_prefix in let descr = "."^model_prefix^"."^tree_prefix in
(* only 1 profile or 1 couple of profiles*) (* only 1 profile or 1 couple of profiles*)
...@@ -62,7 +60,6 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr ...@@ -62,7 +60,6 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr
let fna = Bppsuite.bppseqgen ~descr ~nb_sites ~tree ~config in let fna = Bppsuite.bppseqgen ~descr ~nb_sites ~tree ~config in
*) *)
(* with several profiles or couples of profiles *) (* with several profiles or couples of profiles *)
let config_p = Convergence_hypothesis.bpp_config_F model in
let ne_g = neg_of_model model in let ne_g = neg_of_model model in
let ne_c = nec_of_model model in let ne_c = nec_of_model model in
let ne_a = ne_g in let ne_a = ne_g in
...@@ -70,9 +67,9 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr ...@@ -70,9 +67,9 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr
let profile_c = profile.profile_c in let profile_c = profile.profile_c in
(*let seed = Random.int Int.max_value in*) (*let seed = Random.int Int.max_value in*)
let seed = calc_fixed_seed ~str:descr seed in let seed = calc_fixed_seed ~str:descr seed in
let run_fna = Bppsuite.bppseqgen_multi_profiles ~descr ~nb_sites ~tree ~nodes ~config:config_p ~profile_f ~profile_c ~ne_c ~ne_a ~seed in let run_fna = Bppsuite.Bppseqgen.multi_profiles ~descr ~tree ~tree_dataset ~hypothesis:model ~profile_f ~profile_c ~ne_c ~ne_a ~seed in
let fna = Bppsuite.bppseqgen_multi_profiles_get_fa run_fna in let fna = Bppsuite.Bppseqgen.alignment run_fna in
let fna_infos = Some (Bppsuite.bppseqgen_multi_profiles_get_info run_fna) in let fna_infos = Some (Bppsuite.Bppseqgen.info run_fna) in
let faa = Bppsuite.fna2faa ~fna in let faa = Bppsuite.fna2faa ~fna in
let ready_dataset = { Ready_dataset.input_tree = input_tree ; tree_dataset ; fna; faa; fna_infos } in let ready_dataset = { Ready_dataset.input_tree = input_tree ; tree_dataset ; fna; faa; fna_infos } in
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment