Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit 961383a1 authored by Carine Rey's avatar Carine Rey
Browse files

add empirical aa freqencies in simulations

parent 2d3aa01b
open Bistro.EDSL
open Bistro.Std
open Core
open Bistro.Std
open Bistro.EDSL
open Bistro_bioinfo.Std
open File_formats
......@@ -10,22 +11,30 @@ let assign k v =
seq ~sep:"=" [ string k ; v ]
let conf_file_bppseqgen ~tree ~nb_sites ~config =
let conf_file_bppseqgen ~tree ~out ~nb_sites ~config =
seq ~sep:"\n" (
[
assign "input.tree.file" (dep tree) ;
assign "output.sequence.file" dest ;
assign "output.sequence.file" out ;
assign "number_of_sites" (int nb_sites) ;
]
@ config
)
let bppseqgen ~nb_sites ~tree ~config : nucleotide_fasta workflow =
workflow ~descr:"bppsuite.bppseqgen" [
cmd "bppseqgen" ~env [
assign "param" (file_dump (conf_file_bppseqgen ~tree ~nb_sites ~config)) ;
]
]
let bppseqgen ?(descr="") ?profile_f ~nb_sites ~tree ~config : nucleotide_fasta workflow =
let config_f = dest // "config.bpp" in
let out = dest // "seq.fa" in
workflow ~descr:("bppsuite.bppseqgen" ^ descr) [
docker env (
and_list [
mkdir_p dest;
cmd "cat" ~stdout:config_f [(file_dump (conf_file_bppseqgen ~tree ~out ~nb_sites ~config))];
cmd "bppseqgen" ~env [
assign "param" config_f;
]
]
)
] / selector ["seq.fa"]
let conf_file_bppseqman_fna2faa ~fna =
......
......@@ -3,6 +3,8 @@ open Bistro_bioinfo.Std
open File_formats
val bppseqgen :
?descr : string ->
?profile_f: _ workflow ->
nb_sites:int ->
tree:nhx workflow ->
config:Bistro.Template.t list ->
......
......@@ -11,7 +11,7 @@ type det_out =
| Pcoc_out
| Diffsel_out
let pcoc ?gamma ~(faa:aminoacid_fasta workflow) ~(tree:_ workflow) : (*`pcoc*) det_out directory workflow =
let pcoc ?plot_complete ?gamma ~(faa:aminoacid_fasta workflow) ~(tree:_ workflow) : (*`pcoc*) det_out directory workflow =
let env = docker_image ~account:"carinerey" ~name:"pcoc" ~tag:"06212018" () in
workflow ~descr:"convergence_detection.pcoc" [
cmd "pcoc_det.py" ~env [
......@@ -20,6 +20,7 @@ let pcoc ?gamma ~(faa:aminoacid_fasta workflow) ~(tree:_ workflow) : (*`pcoc*)
opt "-aa" dep faa ;
opt "-o" ident dest ;
option ( flag string "--gamma" ) gamma;
option ( flag string "--plot --plot_complete_ali" ) plot_complete;
]
]
......
......@@ -12,6 +12,7 @@ type det_out =
| Diffsel_out
val pcoc :
?plot_complete : bool ->
?gamma : bool ->
faa : aminoacid_fasta workflow ->
tree : _ workflow ->
......
......@@ -10,6 +10,9 @@ let string_of_model m = match m with
| H0 -> "H0"
| Ha -> "Ha"
let assign k v =
seq ~sep:"=" [ string k ; v ]
let bpp_config_base = {|
alphabet=Codon(letter=DNA)
genetic_code = Standard
......@@ -38,3 +41,30 @@ let bpp_config nodes hyp = [
| Ha -> bpp_config_Ha
) ;
]
let bpp_config_H0_F ~profile_f = seq ~sep:"\n" [
assign "COL_M1" (int 2) ;
(* string {|nonhomogeneous.number_of_models = 1 |} ; Mv in insert nodes*)
seq [string "model1=Codon_AAFit(model=K80, fitness=FromModel(model=JTT92+F(frequencies=Empirical(file=" ; dep profile_f ; string ", col=$(COL_M1)))))" ] ;
]
let bpp_config_Ha_F ~profile_f = seq ~sep:"\n" [
(* string {|nonhomogeneous.number_of_models = 3 or 2 |} ; Mv in insert nodes*)
assign "COL_M1" (int 2) ;
assign "COL_M2" (int 85) ;
assign "modelT" (string "model2") ;
assign "modelC" (string "model3") ;
seq [string "model1=Codon_AAFit(model=K80, fitness=FromModel(model=JTT92+F(frequencies=Empirical(file=" ; dep profile_f ; string ", col=$(COL_M1)))))" ] ;
seq [string "$(modelT)=OneChange(model=Codon_AAFit(model=K80, fitness=FromModel(model=JTT92+F(frequencies=Empirical(file=" ; dep profile_f ; string ", col=$(COL_M2))))))" ] ;
seq [string "$(modelC)=Codon_AAFit(model=K80, fitness=FromModel(model=JTT92+F(frequencies=Empirical(file=" ; dep profile_f ; string ", col=$(COL_M2)))))" ] ;
]
let bpp_config_F nodes hyp profile_f = [
string bpp_config_base ;
insert nodes ;
match hyp with
| H0 -> bpp_config_H0_F ~profile_f
| Ha -> bpp_config_Ha_F ~profile_f
;
]
......@@ -53,29 +53,31 @@ let repo_of_dataset_l ~preview dataset_l =
)
|> List.concat
let derive_from_model ~model ~tree ~tree_dataset ~tree_prefix ~preview =
let derive_from_model ~model ~tree ~tree_dataset ~tree_prefix ~profile_f ~preview =
let model_prefix = Convergence_hypothesis.string_of_model model in
let nb_sites = if preview then 10 else 100 in
let nodes = Tree_dataset.nodes tree_dataset model in
let config = Convergence_hypothesis.bpp_config nodes model in
let config = Convergence_hypothesis.bpp_config_F nodes model profile_f in
let tree = Tree_dataset.tree tree_dataset `Simulation in
let fna = Bppsuite.bppseqgen ~nb_sites ~tree ~config in
let descr = "."^model_prefix^"."^tree_prefix in
let fna = Bppsuite.bppseqgen ~descr ~nb_sites ~tree ~config ~profile_f in
let faa = Bppsuite.fna2faa ~fna in
let ready_dataset = { input_tree = tree ; tree_dataset ; fna; faa} in
let model_prefix = Convergence_hypothesis.string_of_model model in
{ model_prefix; tree_prefix; ready_dataset }
let derive_from_tree ~tree_dir ~tree ~preview =
let derive_from_tree ~tree_dir ~tree ~profile_f ~preview =
let tree_prefix = Filename.chop_extension tree in
let tree = input (Filename.concat tree_dir tree) in
let tree_dataset = Tree_dataset.prepare tree in
let models = Convergence_hypothesis.[H0; Ha] in
List.map models ~f:(fun model ->
derive_from_model ~model ~tree ~tree_dataset ~tree_prefix ~preview
derive_from_model ~model ~tree ~tree_dataset ~tree_prefix ~profile_f ~preview
)
let derive_sim ~tree_dir ~trees ~preview =
let derive_sim ~tree_dir ~trees ~profile_fn ~preview =
let profile_f = input profile_fn in
List.map trees ~f:(fun tree ->
derive_from_tree ~tree_dir ~tree ~preview)
derive_from_tree ~tree_dir ~tree ~profile_f ~preview)
|> List.concat
......@@ -104,8 +106,8 @@ let derive_from_det_meth ~det_meth ~dataset ~preview =
let w_every = if preview then 1 else 10 in
let n_cycles = if preview then 100 else 1000 in
let det_result = match det_meth with
| Pcoc -> Convergence_detection.pcoc ~gamma:false ~faa ~tree:pcoc_tree
| Pcoc_gamma -> Convergence_detection.pcoc ~gamma:true ~faa ~tree:pcoc_tree
| Pcoc -> Convergence_detection.pcoc ~plot_complete:true ~gamma:false ~faa ~tree:pcoc_tree
| Pcoc_gamma -> Convergence_detection.pcoc ~plot_complete: true ~gamma:true ~faa ~tree:pcoc_tree
| Diffsel -> Convergence_detection.diffsel ~phy_n ~tree:diffsel_tree ~w_every ~n_cycles
in
{det_meth; det_result; dataset}
......@@ -119,22 +121,22 @@ let derive_from_dataset ~dataset ~preview=
derive_from_det_meth ~det_meth ~dataset ~preview
)
let derive_det ~dataset_l ~preview=
let derive_det ~dataset_l ~profile_fn ~preview=
List.map dataset_l ~f:(fun dataset ->
derive_from_dataset ~preview ~dataset)
|> List.concat
let main ~outdir ?(np = 2) ?(mem = 2) ~tree_dir ~preview () =
let main ~outdir ?(np = 2) ?(mem = 2) ~tree_dir ~profile_fn ~preview () =
let logger =
Logger.tee [
Console_logger.create () ;
Dot_output.create "dag.dot" (*dot -Tpdf example/dag.dot -o dag.pdf*)
] in
let trees = Array.to_list @@ Sys.readdir tree_dir in
let dataset_l = derive_sim ~tree_dir ~trees ~preview in
let det_results_l = derive_det ~dataset_l ~preview in
let dataset_l = derive_sim ~tree_dir ~trees ~profile_fn ~preview in
let det_results_l = derive_det ~dataset_l ~profile_fn ~preview in
let repo = [
repo_of_dataset_l dataset_l ~preview ;
repo_of_dataset_l dataset_l ~preview ;
repo_of_det_results_l det_results_l;
]
|> List.concat
......@@ -156,6 +158,8 @@ let command =
flag "--mem" (optional int) ~doc:"INT Available memory (in GB)"
and tree_dir =
flag "--tree-dir" (required string) ~doc:"PATH Path to tree directory"
and profile_fn =
flag "--profile-fn" (required string) ~doc:"PATH Path to profile file"
in
main ~outdir ?np ?mem ~tree_dir ~preview
main ~outdir ?np ?mem ~tree_dir ~profile_fn ~preview
]
......@@ -107,9 +107,15 @@ if t:
features.remove("ND")
if not "Condition" in features:
logger.error("\"Condition\" must be in the detected tags. \"Condition:1\" must identify branches under the convergent model under Ha")
logger.error("\"Condition\" must be in the detected tags. \"Condition:1\" must identify nodes under the convergent model under Ha")
sys.exit(1)
if not "Transition" in features:
logger.error("\"Transition\" must be in the detected tags. \"Transition:1\" must identify nodes where there are transitions")
sys.exit(1)
if "Transition" in features:
logger.info("\"Transition\" is in the detected tags. \"Transition:1\" will be use to indicate OneChange model on this node")
# Check existing bl
branch_lengths = []
nodeId = 0
......@@ -140,18 +146,57 @@ t.write(format=1, features=["ND"], outfile = "%s/tree.only_node_ids.nhx" %(OutDi
#### -> 1 line: all nodes ids
all_node_ids = range(nodeId-1)
with open("%s/tree.H0.node_ids" %(OutDirName), "w") as output_H0_node_ids:
output_H0_node_ids.write("nonhomogeneous.number_of_models = 1")
output_H0_node_ids.write("\n")
output_H0_node_ids.write("model1.nodes_id="+",".join(map(str, all_node_ids)))
### tree.Ha.node_ids: alternative hypothesis
#### -> 2 lines: 1) node ids under the ancestral model
#### 2) node ids under the convergent model
conv_node_ids = [n.ND for n in t.search_nodes(Condition = "1")]
trans_node_ids = [n.ND for n in t.search_nodes(Transition = "1")]
conv_node_ids = [i for i in conv_node_ids if i not in trans_node_ids]
not_conv_node_ids = [i for i in all_node_ids if i not in conv_node_ids+trans_node_ids]
logger.info(conv_node_ids)
logger.info(trans_node_ids)
logger.info(not_conv_node_ids)
if trans_node_ids and conv_node_ids:
nonhomogeneous_number_of_models = "nonhomogeneous.number_of_models = 3"
models = "\n".join(["modelT = model2",
"modelC = model3"])
node_ids = "\n".join(["model1.nodes_id="+",".join(map(str, not_conv_node_ids)),
"model2.nodes_id="+",".join(map(str, trans_node_ids)),
"model3.nodes_id="+",".join(map(str, conv_node_ids))
])
elif trans_node_ids and not conv_node_ids:
nonhomogeneous_number_of_models = "nonhomogeneous.number_of_models = 2"
models = "\n".join(["modelT = model2",
"modelC = model3"])
node_ids = "\n".join(["model1.nodes_id="+",".join(map(str, not_conv_node_ids)),
"model2.nodes_id="+",".join(map(str, trans_node_ids))
])
elif not trans_node_ids and conv_node_ids:
nonhomogeneous_number_of_models = "nonhomogeneous.number_of_models = 2"
models = "\n".join(["modelT = model3",
"modelC = model2"])
node_ids = "\n".join(["model1.nodes_id="+",".join(map(str, not_conv_node_ids)),
"model2.nodes_id="+",".join(map(str, conv_node_ids))
])
else:
logger.error("No convergent nodes")
sys.exit(2)
with open("%s/tree.Ha.node_ids" %(OutDirName), "w") as output_Ha_node_ids:
conv_node_ids = [n.ND for n in t.search_nodes(Condition = "1")]
not_conv_node_ids = [i for i in all_node_ids if i not in conv_node_ids]
output_Ha_node_ids.write("model2.nodes_id="+",".join(map(str, conv_node_ids)))
output_Ha_node_ids.write(nonhomogeneous_number_of_models)
output_Ha_node_ids.write("\n")
output_Ha_node_ids.write(models)
output_Ha_node_ids.write("\n")
output_Ha_node_ids.write("model1.nodes_id="+",".join(map(str, not_conv_node_ids)))
output_Ha_node_ids.write(node_ids)
######### output trees for detection
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment