Commit e96940c6 authored by Philippe Veber's avatar Philippe Veber
Browse files

refactored bppseqgen

parent 6fbc0cea
......@@ -57,8 +57,8 @@ clean:
rm -rf example/_bistro
rm -rf example/outdir
.PHONY: clean_test
clean_test:
.PHONY: clean-test
clean-test:
rm -rf example/_bistro
rm -rf example/outdir_test
rm -rf example/report.log
......
......@@ -4,8 +4,6 @@ open Bistro.EDSL
open Bistro_bioinfo.Std
open File_formats
type bppseqgen_multi_profiles
let env = Env.env_bppsuite
let assign k v =
......@@ -31,22 +29,65 @@ let conf_file_bppseqgen ~tree ~out ~nb_sites ~config =
@ config
)
let bppseqgen ?(descr="") ~nb_sites ~tree ~config : nucleotide_fasta workflow =
let config_f = dest // "config.bpp" in
let out = dest // "seq.fa" in
workflow ~descr:("bppsuite.bppseqgen" ^ descr) [
docker env (
and_list [
mkdir_p dest;
cmd "cat" ~stdout:config_f [(file_dump (conf_file_bppseqgen ~tree ~out ~nb_sites ~config))];
cmd "bppseqgen" [
assign "param" config_f;
module Bppseqgen = struct
let bpp_config_base = {|
alphabet=Codon(letter=DNA)
genetic_code = Standard
input.tree.format=Nhx
output.internal.sequences=no
nonhomogeneous = general
rate_distribution=Constant()
|}
(* let bpp_config_H0_F= seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPCOC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2)" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
] *)
let bpp_config_H0_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "model2=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
)
] / selector ["seq.fa"]
let conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~ne_c ~ne_a ~config ~nb_sites_per_profile =
let bpp_config_HaPCOC_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2, Ns=$(NE_T))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPC_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_T))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_of_model (m : Convergence_hypothesis.t) = match m with
| H0 _ -> bpp_config_H0_F_Ne
| HaPC _ -> bpp_config_HaPC_F_Ne
| HaPCOC _ -> bpp_config_HaPCOC_F_Ne
let bpp_config_F m = [
string bpp_config_base ;
bpp_config_of_model m ;
]
let conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~ne_c ~ne_a ~hypothesis ~nb_sites_per_profile =
seq ~sep:"\n" (
[
assign "input.tree.file" (dep tree) ;
......@@ -57,10 +98,10 @@ let conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~ne_c ~ne_a ~config ~nb_
assign "NE_C" (float ne_c) ;
assign "NE_T" (float ne_c) ;
]
@ config
@ bpp_config_F hypothesis
)
let bppseqgen_multi_profiles_script ~config ~out ~profile_c ~seed =
let bppseqgen_multi_profiles_script ~config ~out ~profile_c ~seed =
let vars = [
"FINAL_OUT", ident out ;
"PARAM", config ;
......@@ -89,14 +130,14 @@ let bppseqgen_multi_profiles_script ~config ~out ~profile_c ~seed =
|}
let bppseqgen_multi_profiles ?(descr="") ~profile_f ~profile_c ~nb_sites ~tree ~config ~ne_c ~ne_a ~nodes ~seed : bppseqgen_multi_profiles directory workflow =
let multi_profiles ?(descr="") ~profile_f ~profile_c ~tree ~tree_dataset ~hypothesis ~ne_c ~ne_a ~seed =
let nb_sites_per_profile = 1 in
(* let nb_combis = Pervasives.(nb_sites / nb_sites_per_profile) in *)
let config_f = dest // "config.bpp" in
(* let profile_c_ok = tmp // "profiles_c.tsv" in *)
let profile_c_ok = dep profile_c in
let out = dest // "seq.fa" in
let nodes = Tree_dataset.nodes tree_dataset hypothesis in
workflow ~descr:("bppsuite.bppseqgen" ^ descr) [
docker env (
and_list [
......@@ -108,7 +149,7 @@ let bppseqgen_multi_profiles ?(descr="") ~profile_f ~profile_c ~nb_sites ~tree ~
dep profile_c
];*)
cmd "cat" ~stdout:config_f [
file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~config ~ne_c ~ne_a ~nb_sites_per_profile) ;
file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~hypothesis ~ne_c ~ne_a ~nb_sites_per_profile) ;
file_dump (seq ~sep:"\n" [ string "\n" ]) ;
dep nodes ;
];
......@@ -117,12 +158,17 @@ let bppseqgen_multi_profiles ?(descr="") ~profile_f ~profile_c ~nb_sites ~tree ~
)
]
let bppseqgen_multi_profiles_get_fa run_bppseqgen_multi_profiles : nucleotide_fasta workflow =
let alignment run_bppseqgen_multi_profiles : nucleotide_fasta workflow =
run_bppseqgen_multi_profiles / selector ["seq.fa"]
let bppseqgen_multi_profiles_get_info run_bppseqgen_multi_profiles : text_file workflow =
let info run_bppseqgen_multi_profiles : text_file workflow =
run_bppseqgen_multi_profiles / selector ["seq.fa.info"]
end
let conf_file_bppseqman_fna2faa ~fna =
seq ~sep:"\n" [
assign "input.sequence.file" (dep fna) ;
......
......@@ -2,35 +2,27 @@ open Bistro.Std
open Bistro_bioinfo.Std
open File_formats
type bppseqgen_multi_profiles
val bppseqgen :
?descr : string ->
nb_sites:int ->
tree:nhx workflow ->
config:Bistro.Template.t list ->
nucleotide_fasta workflow
val bppseqgen_multi_profiles :
module Bppseqgen : sig
val multi_profiles :
?descr : string ->
profile_f: text_file workflow ->
profile_c: text_file workflow ->
nb_sites:int ->
tree:nhx workflow ->
config:Bistro.Template.t list ->
tree_dataset:[`tree_dataset] directory workflow ->
hypothesis:Convergence_hypothesis.t ->
ne_c: float ->
ne_a: float ->
nodes:text_file workflow ->
seed:int ->
bppseqgen_multi_profiles directory workflow
[`bppseqgen] directory workflow
val bppseqgen_multi_profiles_get_fa :
bppseqgen_multi_profiles directory workflow ->
val alignment :
[`bppseqgen] directory workflow ->
nucleotide_fasta workflow
val bppseqgen_multi_profiles_get_info :
bppseqgen_multi_profiles directory workflow ->
val info :
[`bppseqgen] directory workflow ->
text_file workflow
end
val fna2faa :
fna:nucleotide_fasta workflow ->
......
......@@ -37,60 +37,3 @@ let h0_hapc_forall nes_list =
let assign k v =
seq ~sep:"=" [ string k ; v ]
let bpp_config_base = {|
alphabet=Codon(letter=DNA)
genetic_code = Standard
input.tree.format=Nhx
output.internal.sequences=no
nonhomogeneous = general
rate_distribution=Constant()
|}
(* let bpp_config_H0_F= seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPCOC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2)" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
] *)
let bpp_config_H0_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "model2=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPCOC_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2, Ns=$(NE_T))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPC_F_Ne = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_T))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_of_model m = match m with
| H0 _ -> bpp_config_H0_F_Ne
| HaPC _ -> bpp_config_HaPC_F_Ne
| HaPCOC _ -> bpp_config_HaPCOC_F_Ne
let bpp_config_F m = [
string bpp_config_base ;
bpp_config_of_model m ;
]
......@@ -17,23 +17,3 @@ let workflow_of_template t =
]
let config_file () =
let open Bistro.EDSL in
let model = Convergence_hypothesis.(H0 (Fixed 1.)) in
let tree_dir = "/home/pveber/w/reviewphiltrans/example/trees_test/" in
let tree = "tree_small_bl.nhx" in
let input_tree = Bistro.EDSL.input (Filename.concat tree_dir tree) in
let tree_prefix = Filename.chop_extension tree in
let tree_dataset = Tree_dataset.prepare ~descr:("simulated_data." ^ tree_prefix) input_tree in
let nodes = Tree_dataset.nodes tree_dataset model in
workflow [
cmd "cat" ~stdout:dest [
file_dump (Bistro.EDSL.seq ~sep:"\n" (Convergence_hypothesis.bpp_config_F model @ [ string "\n" ])) ;
dep nodes ;
];
]
let main () =
less (config_file ())
......@@ -53,8 +53,6 @@ let calc_fixed_seed ~(str:string) (seed:int) : int =
let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~preview ~ns ~seed =
let model_prefix = Convergence_hypothesis.string_of_model model in
let nb_sites = ns in
let nodes = Tree_dataset.nodes tree_dataset model in
let tree = Tree_dataset.tree tree_dataset `Simulation in
let descr = "."^model_prefix^"."^tree_prefix in
(* only 1 profile or 1 couple of profiles*)
......@@ -62,7 +60,6 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr
let fna = Bppsuite.bppseqgen ~descr ~nb_sites ~tree ~config in
*)
(* with several profiles or couples of profiles *)
let config_p = Convergence_hypothesis.bpp_config_F model in
let ne_g = neg_of_model model in
let ne_c = nec_of_model model in
let ne_a = ne_g in
......@@ -70,9 +67,9 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr
let profile_c = profile.profile_c in
(*let seed = Random.int Int.max_value in*)
let seed = calc_fixed_seed ~str:descr seed in
let run_fna = Bppsuite.bppseqgen_multi_profiles ~descr ~nb_sites ~tree ~nodes ~config:config_p ~profile_f ~profile_c ~ne_c ~ne_a ~seed in
let fna = Bppsuite.bppseqgen_multi_profiles_get_fa run_fna in
let fna_infos = Some (Bppsuite.bppseqgen_multi_profiles_get_info run_fna) in
let run_fna = Bppsuite.Bppseqgen.multi_profiles ~descr ~tree ~tree_dataset ~hypothesis:model ~profile_f ~profile_c ~ne_c ~ne_a ~seed in
let fna = Bppsuite.Bppseqgen.alignment run_fna in
let fna_infos = Some (Bppsuite.Bppseqgen.info run_fna) in
let faa = Bppsuite.fna2faa ~fna in
let ready_dataset = { Ready_dataset.input_tree = input_tree ; tree_dataset ; fna; faa; fna_infos } in
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment