Docker-in-Docker (DinD) capabilities of public runners have been deactivated.

Commit cbb474fb authored by Philippe Veber
Browse files

Merge branch 'orthomam'

parents 7d33bbfc fb326a34
module Top = Bistro_utils.Toplevel_eval.Make(struct let np = 8 let mem = 10 end)
module Top = Bistro_utils.Toplevel_eval.Make(struct let np = 8 let mem = 10 end)()
open Top
open Reviewphiltrans
(* [njplot t] runs the external [njplot] command on the path of the Newick
   file [t] (as given by [path t]) and returns the status reported by
   [Sys.command].

   The path is passed through [Filename.quote] because [Sys.command] hands
   the whole string to the shell: an unquoted path containing spaces or
   shell metacharacters would be split or interpreted. *)
let njplot (t : Biotope.Formats.newick Bistro.file) =
  let quoted_path = Filename.quote (path t) in
  Sys.command (Printf.sprintf "njplot %s" quoted_path)
(*
open Pipeline2
......
......@@ -14,8 +14,8 @@ let main ~n_h0 ~n_ha ~seed:i () =
}
in
let w = benchmark sim in
(* print_endline (Debug.path (multinomial sim)) ; *)
print_endline (Debug.path w)
Bistro_engine.Scheduler.simple_eval_exn ~np:4 ~mem:(`GB 4) (Bistro.Workflow.path w)
|> print_endline
let command =
let open Command.Let_syntax in
......
......@@ -13,3 +13,11 @@
(libraries reviewphiltrans)
(preprocess
(pps ppx_jane)))
; Executable whose main module is orthomam_app, exposed under the public
; name orthomam_convergence. It links against the reviewphiltrans library
; and is preprocessed with ppx_jane.
(executable
 (name orthomam_app)
 (public_name orthomam_convergence)
 (modules orthomam_app)
 (libraries reviewphiltrans)
 (preprocess (pps ppx_jane)))
open Reviewphiltrans
module Top = Bistro_utils.Toplevel_eval.Make(struct let np = 3 let mem = 10 end)()
(* Entry point.

   Builds the site-ranking workflow over the Orthomam database located at
   "/disk/data/omm" (convergent species: [species_with_echolocation],
   method: [`tdg09]), evaluates it with [Top.eval], keeps the first 10
   candidate sites and prints the alignment id of each one that has one.

   A [Failure] raised along the way used to be discarded silently, leaving
   the program with a success status and no output. It is now reported on
   stderr and turned into a non-zero exit status so that callers and scripts
   can detect the failure. *)
let () =
  try
    Reviewphiltrans_toolbox.Orthomam_db.make "/disk/data/omm"
    |> Orthomam.(
        site_ranking
          ~convergent_species:species_with_echolocation
          ~meth:`tdg09
      )
    |> Top.eval
    |> Core.(Fn.flip List.take 10)
    |> List.iter Reviewphiltrans_toolbox.Candidate_site.(fun x -> Option.iter print_endline x.alignment_id)
  with
  | Failure msg ->
    (* Keep catching only [Failure]; any other exception still propagates. *)
    prerr_endline ("orthomam_convergence: " ^ msg) ;
    exit 1
......@@ -127,19 +127,17 @@ rate_distribution=Constant()
let ne_g = Convergence_hypothesis.neg_of_model hypothesis in
let ne_c = Convergence_hypothesis.nec_of_model hypothesis in
let ne_a = ne_g in
Workflow.shell ~descr:("bppsuite.bppseqgen" ^ descr) [
within_container img (
and_list [
mkdir_p dest;
mkdir_p tmp;
cd tmp;
cmd "cat" ~stdout:config_f [
file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~hypothesis ~ne_c ~ne_a ~nb_sites_per_profile) ;
dep nodes ;
];
cmd "bash" [ file_dump (bppseqgen_multi_profiles_script ~config:config_f ~out ~profile_c:profile_c_ok ~seed)];
]
)
Workflow.shell ~descr:("bppsuite.bppseqgen" ^ descr) ~img [
and_list [
mkdir_p dest;
mkdir_p tmp;
cd tmp;
cmd "cat" ~stdout:config_f [
file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~hypothesis ~ne_c ~ne_a ~nb_sites_per_profile) ;
dep nodes ;
];
cmd "bash" [ file_dump (bppseqgen_multi_profiles_script ~config:config_f ~out ~profile_c:profile_c_ok ~seed)];
]
]
let alignment run_bppseqgen_multi_profiles : nucleotide_fasta file =
......@@ -159,7 +157,7 @@ let conf_file_bppseqman_fna2faa ~fna =
assign "output.sequence.file" dest ;
string {|alphabet=Codon(letter=DNA)
genetic_code = Standard
input.sequence.remove_stop_codons = no
input.sequence.remove_stop_codons = yes
input.sequence.sites_to_use = all
input.alignment = true
sequence.manip = Translate
......@@ -167,8 +165,8 @@ let conf_file_bppseqman_fna2faa ~fna =
]
let fna2faa (fna : nucleotide_fasta file) : aminoacid_fasta file =
Workflow.shell ~descr:"bppsuite.fna2faa" [
cmd "bppseqman" ~img [
Workflow.shell ~descr:"bppsuite.fna2faa" ~img [
cmd "bppseqman" [
assign "param" (file_dump (conf_file_bppseqman_fna2faa ~fna)) ;
]
]
......@@ -198,23 +196,23 @@ let conf_file_bppseqman_faa2phy ~faa =
]
let fna2phy ~(fna: nucleotide_fasta file) : nucleotide_phylip file =
Workflow.shell ~descr:"bppsuite.fna2phy_interleaved" [
cmd "bppseqman" ~img [
Workflow.shell ~descr:"bppsuite.fna2phy_interleaved" ~img [
cmd "bppseqman" [
assign "param" (file_dump (conf_file_bppseqman_fna2phy ~fna)) ;
]
]
let faa2phy ~(faa: aminoacid_fasta file) : aminoacid_phylip file =
Workflow.shell ~descr:"bppsuite.faa2phy_interleaved" [
cmd "bppseqman" ~img [
Workflow.shell ~descr:"bppsuite.faa2phy_interleaved" ~img [
cmd "bppseqman" [
assign "param" (file_dump (conf_file_bppseqman_faa2phy ~faa)) ;
]
]
let paste_fna ~(fna_l: nucleotide_fasta file list) : nucleotide_fasta file =
Workflow.shell ~descr:"bppsuite.catfasta" [
cmd "catfasta2phyml.pl" ~stdout:dest ~img (List.concat [
Workflow.shell ~descr:"bppsuite.catfasta" ~img [
cmd "catfasta2phyml.pl" ~stdout:dest (List.concat [
[string "-f" ] ;
List.map fna_l ~f:(fun fna -> dep fna) ;
])
......
......@@ -69,8 +69,8 @@ let merge_results ?fna_infos ~(res_by_tools : result list) () : text file =
seq ~sep:" " [opt; dep w]
)
in
Workflow.shell ~descr:"convergence_detection.merge_results" [
cmd "python" ~img:Env.env_py [
Workflow.shell ~descr:"convergence_detection.merge_results" ~img:Env.env_py [
cmd "python" [
file_dump (string Scripts.merge_det_results) ;
opt "-o" ident dest ;
seq ~sep:" " command ;
......@@ -79,8 +79,8 @@ let merge_results ?fna_infos ~(res_by_tools : result list) () : text file =
]
let merge_result_tables ?fna_infos ?oracle ?multinomial ?tdg09 ?identical ?topological ?pcoc ?pcoc_v2 ?pcoc_pcp ?diffsel ?diffseldsparse () : text file =
Workflow.shell ~descr:"convergence_detection.merge_results" [
cmd "python" ~img:Env.env_py [
Workflow.shell ~descr:"convergence_detection.merge_results" ~img:Env.env_py [
cmd "python" [
file_dump (string Scripts.merge_det_results) ;
opt "-o" ident dest ;
option (opt "--multinomial" dep) multinomial ;
......@@ -141,50 +141,46 @@ let plot_merge_results ?t_choices ~plot_all_sites ~(res_by_tools:result list) ~t
in
let out = dest // "results.svg" in
let inner =
Workflow.shell ~descr:"convergence_detection.plot_results" [
within_container img (
and_list [
mkdir_p dest ;
cmd "python" [
Utils.script_dump Scripts.[ diffsel_script_utils ; plot_data ; plot_convergent_sites ] ;
opt "-msa" dep faa ;
opt "-tsv" dep tsv ;
opt "-tree" dep tree ;
opt "-out" ident out ;
opt "-meth" ident meths ;
option (opt "-t" ident) meths_t ;
option (opt "--t_tsv" dep) t_choices ;
flag string "--all_sites" plot_all_sites ;
]
]
)
]
in
Workflow.select inner ["results.svg"]
let plot_convergent_sites ?(plot_all_sites = true) ~alignment ~detection_results ~tree () =
Workflow.shell ~descr:"plot_convergent_sites.py" [
within_container Env.env_pcoc (
Workflow.shell ~descr:"convergence_detection.plot_results" ~img [
and_list [
mkdir_p dest ;
cmd "python" [
Utils.script_dump Scripts.[ diffsel_script_utils ; plot_data ; plot_convergent_sites ] ;
opt "-tsv" dep detection_results ;
opt "-msa" dep alignment ;
opt "-msa" dep faa ;
opt "-tsv" dep tsv ;
opt "-tree" dep tree ;
opt "-out" ident (dest // "plot.svg") ;
opt "-out" ident out ;
opt "-meth" ident meths ;
option (opt "-t" ident) meths_t ;
option (opt "--t_tsv" dep) t_choices ;
flag string "--all_sites" plot_all_sites ;
]
]
)
]
in
Workflow.select inner ["results.svg"]
(* [plot_convergent_sites ?plot_all_sites ~alignment ~detection_results ~tree ()]
   runs the dumped python plotting scripts inside the [Env.env_pcoc] image and
   selects the "plot.svg" file from the resulting directory.

   - [alignment] is passed as [-msa], [detection_results] as [-tsv] and
     [tree] as [-tree];
   - [plot_all_sites] (default [true]) controls the [--all_sites] flag. *)
let plot_convergent_sites ?(plot_all_sites = true) ~alignment ~detection_results ~tree () =
  let svg_name = "plot.svg" in
  let python_call =
    cmd "python" [
      Utils.script_dump Scripts.[ diffsel_script_utils ; plot_data ; plot_convergent_sites ] ;
      opt "-tsv" dep detection_results ;
      opt "-msa" dep alignment ;
      opt "-tree" dep tree ;
      opt "-out" ident (dest // svg_name) ;
      flag string "--all_sites" plot_all_sites ;
    ]
  in
  let plot_dir =
    Workflow.shell ~descr:"plot_convergent_sites.py" ~img:Env.env_pcoc [
      (* The destination directory must exist before python writes into it. *)
      and_list [ mkdir_p dest ; python_call ]
    ]
  in
  Workflow.select plot_dir [ svg_name ]
let recall_precision_curve table =
let img = [ docker_image ~account:"pveber" ~name:"r_basics" ~tag:"20190710" () ] in
Workflow.shell ~descr:"recall_precision_curve" [
cmd "Rscript" ~img [
Workflow.shell ~descr:"recall_precision_curve" ~img [
cmd "Rscript" [
file_dump (string Scripts.recall_precision_curve) ;
dep table ;
dest ;
......@@ -193,8 +189,10 @@ let recall_precision_curve table =
let%workflow recall_precision_auc_table table =
let module RT = Reviewphiltrans_toolbox.Result_table in
let { RT.labels ; scores_per_meth } = RT.of_file [%path table] in
Array.map scores_per_meth ~f:(fun (meth, scores) ->
let { RT.oracle ; scores_per_meth } = RT.of_file [%path table] in
let labels = Option.value_exn oracle in
List.map scores_per_meth ~f:(fun (meth, scores) ->
let scores = Array.filter_opt scores in
let _, auc = Biocaml_unix.Bin_pred.recall_precision_curve ~labels ~scores in
meth, auc
)
......
......@@ -76,4 +76,4 @@ val oracle :
val recall_precision_auc_table :
text file ->
(string * float) array workflow
(string * float) list workflow
open Bistro
open File_formats
module type Dataset = sig
module type Query = sig
type t
val tree : t -> nhx file
val tree :
branch_length_unit:[`Nucleotide | `Amino_acid | `Codon] ->
t -> nhx file
val nucleotide_alignment : t -> nucleotide_fasta file
end
module Make(D : Dataset) = struct
open D
(** Analyses that can be requested for a query. Each function builds the
    workflow producing the corresponding file. *)
module type S = sig
  (** Type of the queries the analyses are run on. *)
  type query

  (** Amino-acid alignment derived from the query's nucleotide alignment. *)
  val amino_acid_alignment : query -> aminoacid_fasta file

  (** Tree computed from the query's nucleotide alignment. *)
  val gene_tree : query -> nw file

  (** [dn_tsv] output of the dN/dS tree computation. *)
  val dn_tree : query -> text file

  (** [ds_tsv] output of the dN/dS tree computation. *)
  val ds_tree : query -> text file

  (** [dnds_tsv] output of the dN/dS tree computation. *)
  val dnds_tree : query -> text file

  (** Results of the "identical" method. *)
  val identical : query -> text file

  (** Results of the "topological" method. *)
  val topological : query -> text file

  (** Output of the multinomial method and of its variants. *)
  val multinomial : query -> text file

  val multinomial_simulation_lrt : query -> text file

  val multinomial_simulation_sparse : query -> text file

  val multinomial_asymptotic_lrt : query -> text file

  val multinomial_asymptotic_sparse : query -> text file

  (** Results of the tdg09 method. *)
  val tdg09 : query -> text file

  (** Same as {!tdg09}, wrapped in [Workflow.trywith] with a mock result
      table as the alternative workflow. *)
  val failsafe_tdg09 : query -> text file

  (** Results of PCOC. [gamma] defaults to [true] and [ncat] to [10]. *)
  val pcoc : ?gamma:bool -> ?ncat:int -> query -> text file

  (** Results of PCOC v2. [gamma] defaults to [true] and [aa_profiles] to
      [`C10]. *)
  val pcoc_v2 :
    ?gamma:bool ->
    ?aa_profiles:Pcoc.aa_profiles ->
    query ->
    text file

  (** Output of diffsel. *)
  val diffsel : query -> text file

  (** Output of diffseldsparse.
      NOTE(review): the meaning and defaults of [pi], [shiftprob] and [eps]
      are not visible from this signature; confirm against the
      implementation. *)
  val diffseldsparse :
    ?pi:float ->
    ?shiftprob:float * float ->
    ?eps:float ->
    query ->
    text file
end
module Make(Q : Query) = struct
open Q
let amino_acid_alignment d =
Bppsuite.fna2faa (nucleotide_alignment d)
Utils.amino_acid_fasta_of_nucleotide_fasta (nucleotide_alignment d)
let phylip_nucleotide_alignment d =
Bppsuite.fna2phy ~fna:(nucleotide_alignment d)
......@@ -20,53 +53,73 @@ module Make(D : Dataset) = struct
let gene_tree d =
Tree_dataset.raxmlng_fna ~fna:(nucleotide_alignment d) ()
(* [tree_with_no_single_child ~branch_length_unit d] reads the tree of query
   [d], applies [Convergence_tree.remove_nodes_with_single_child] to it
   through [Newick.map_inner_tree], and writes the result as a Newick file
   at the workflow destination. *)
let%pworkflow tree_with_no_single_child ~branch_length_unit d : newick file =
  let module N = Phylogenetics.Newick in
  let input_path = [%path tree ~branch_length_unit d] in
  let pruned =
    N.map_inner_tree
      (N.from_file input_path)
      ~f:Reviewphiltrans_toolbox.Convergence_tree.remove_nodes_with_single_child
  in
  N.to_file pruned [%dest]
let identical d =
let tree_sc = Tree_dataset.prepare_sc_tree (tree d) in
let tree_id = Tree_dataset.prepare_tree_with_node_id (tree d) in
let tree_sc = Tree_dataset.prepare_sc_tree (tree ~branch_length_unit:`Amino_acid d) in
let tree_id = Tree_dataset.prepare_tree_with_node_id (tree ~branch_length_unit:`Amino_acid d) in
Identical.identical ~tree_id ~tree_sc ~prot_model:"LG08" ~faa:(amino_acid_alignment d) ()
|> Identical.results
let topological d =
let faa = amino_acid_alignment d in
let tree_conv = Tree_dataset.prepare_topological_tree (tree d) in
let tree = Tree_dataset.prepare_tree_with_node_id (tree d) in
let tree_conv = Tree_dataset.prepare_topological_tree (tree ~branch_length_unit:`Amino_acid d) in
let tree = Tree_dataset.prepare_tree_with_node_id (tree ~branch_length_unit:`Amino_acid d) in
Topological.topological ~faa ~tree ~tree_conv ~prot_model:"LG08" ()
|> Topological.results
let multinomial d =
Multinomial.multinomial
~tree_sc:(tree d)
~tree_sc:(tree ~branch_length_unit:`Amino_acid d)
~faa:(amino_acid_alignment d)
()
let multinomial_asymptotic_lrt d =
Multinomial.multinomial_asymptotic_lrt
~tree_sc:(tree d)
~tree_sc:(tree ~branch_length_unit:`Amino_acid d)
~faa:(amino_acid_alignment d)
let multinomial_asymptotic_sparse d =
Multinomial.multinomial_asymptotic_sparse
~tree_sc:(tree d)
~tree_sc:(tree ~branch_length_unit:`Amino_acid d)
~faa:(amino_acid_alignment d)
let multinomial_simulation_lrt d =
Multinomial.multinomial_simulation_lrt
~tree_sc:(tree d)
~tree_sc:(tree ~branch_length_unit:`Amino_acid d)
~faa:(amino_acid_alignment d)
let multinomial_simulation_sparse d =
Multinomial.multinomial_simulation_sparse
~tree_sc:(tree d)
~tree_sc:(tree ~branch_length_unit:`Amino_acid d)
~faa:(amino_acid_alignment d)
let tdg09 d =
Tamuri.tdg09
~tree:(tree d)
~tree:(tree_with_no_single_child ~branch_length_unit:`Amino_acid d)
~faa:(amino_acid_alignment d)
()
|> Tamuri.results
(* [mock_tdg09 d] writes a placeholder tdg09 result table for query [d]:
   a header line followed by one tab-separated row per position of the first
   sequence of the amino-acid alignment, numbered from 1, with both scores
   set to 0.0 and the last column set to NA.

   Fails with [Failure] when no item can be read from the fasta file. *)
let%pworkflow mock_tdg09 d =
  match Biotk.Fasta.from_file [%path amino_acid_alignment d] with
  | Ok (_, first_item :: _) ->
    let open Core_kernel in
    let header = "Sites\tTdg09_1MinusFDR\tTdg09_1MinusLRT\tTdg09_prob_post" in
    let nb_sites = String.length first_item.sequence in
    (* Sites are 1-based in the output table. *)
    let rows = List.init nb_sites ~f:(fun i -> sprintf "%d\t0.0\t0.0\tNA" (i + 1)) in
    Out_channel.write_lines [%dest] (header :: rows)
  | _ -> failwith "couldn't read an item in fasta"
(* [failsafe_tdg09 d] combines the real tdg09 workflow and the mock table
   for [d] with [Workflow.trywith], in that argument order. *)
let failsafe_tdg09 d =
  let real_run = tdg09 d in
  let fallback = mock_tdg09 d in
  Workflow.trywith real_run fallback
let diffseltree d =
Tree_dataset.prepare_diffsel_tree (tree d)
Tree_dataset.prepare_diffsel_tree (tree ~branch_length_unit:`Amino_acid d)
let diffsel d =
Diffsel.diffsel
......@@ -90,21 +143,21 @@ module Make(D : Dataset) = struct
let pcoc ?(gamma = true) ?(ncat = 10) d =
let faa = amino_acid_alignment d in
let tree = tree d in
let tree = tree ~branch_length_unit:`Amino_acid d in
Pcoc.pcoc ~catx_est:ncat ~plot_complete:false ~gamma ~faa ~tree ()
|> Pcoc.results
let pcoc_v2 ?(gamma = true) ?(aa_profiles = `C10) d =
let faa = amino_acid_alignment d in
let tree = tree d in
let tree = tree ~branch_length_unit:`Amino_acid d in
Pcoc.pcoc_v2 ~aa_profiles ~gamma ~faa ~tree ()
|> Pcoc.results
let dn_ds_dnds_trees d =
Testnh.dn_ds_trees_real_data ~fna:(nucleotide_alignment d) ~tree:(tree d) ()
Testnh.dn_ds_trees_real_data ~fna:(nucleotide_alignment d) ~tree:(tree ~branch_length_unit:`Nucleotide d) ()
(* Accessors for the three outputs of [dn_ds_dnds_trees d]: each one calls
   [dn_ds_dnds_trees] on the query and projects a single field of the
   resulting record. *)
let dn_tree d = (dn_ds_dnds_trees d).dn_tsv
let ds_tree d = (dn_ds_dnds_trees d).ds_tsv
let dnds_tree d = (dn_ds_dnds_trees d).dnds_tsv
end
open Bistro
open File_formats
(** Input expected by {!Make}: a type of queries together with the tree and
    the nucleotide alignment attached to each query. *)
module type Query = sig
  (** A query. *)
  type t

  (** [tree ~branch_length_unit q] is the NHX tree of [q].
      NOTE(review): [branch_length_unit] presumably selects the unit in
      which branch lengths are expressed; confirm against the
      implementations of this signature. *)
  val tree :
    branch_length_unit:[`Nucleotide | `Amino_acid | `Codon] ->
    t ->
    nhx file

  (** [nucleotide_alignment q] is the nucleotide alignment of [q], in fasta
      format. *)
  val nucleotide_alignment : t -> nucleotide_fasta file
end
(** Analyses that can be requested for a query. Each function builds the
    workflow producing the corresponding file. *)
module type S = sig
  (** Type of the queries the analyses are run on. *)
  type query

  (** Amino-acid alignment of the query. *)
  val amino_acid_alignment : query -> aminoacid_fasta file

  (** Gene tree of the query. *)
  val gene_tree : query -> nw file

  (** dN, dS and dN/dS outputs for the query. *)
  val dn_tree : query -> text file

  val ds_tree : query -> text file

  val dnds_tree : query -> text file

  (** Results of the "identical" method. *)
  val identical : query -> text file

  (** Results of the "topological" method. *)
  val topological : query -> text file

  (** Output of the multinomial method and of its variants. *)
  val multinomial : query -> text file

  val multinomial_simulation_lrt : query -> text file

  val multinomial_simulation_sparse : query -> text file

  val multinomial_asymptotic_lrt : query -> text file

  val multinomial_asymptotic_sparse : query -> text file

  (** Results of the tdg09 method. *)
  val tdg09 : query -> text file

  (** Variant of {!tdg09} meant to be fail-safe.
      NOTE(review): the fallback behaviour is defined by the implementation,
      not by this signature; confirm there. *)
  val failsafe_tdg09 : query -> text file

  (** Results of PCOC, with optional [gamma] and [ncat] settings. *)
  val pcoc : ?gamma:bool -> ?ncat:int -> query -> text file

  (** Results of PCOC v2, with optional [gamma] and [aa_profiles] settings. *)
  val pcoc_v2 :
    ?gamma:bool ->
    ?aa_profiles:Pcoc.aa_profiles ->
    query ->
    text file

  (** Output of diffsel. *)
  val diffsel : query -> text file

  (** Output of diffseldsparse.
      NOTE(review): the meaning and defaults of [pi], [shiftprob] and [eps]
      are not visible from this signature; confirm against the
      implementation. *)
  val diffseldsparse :
    ?pi:float ->
    ?shiftprob:float * float ->
    ?eps:float ->
    query ->
    text file
end
(** [Make (Q)] provides the analyses of {!S} for queries of type [Q.t]. *)
module Make (Q : Query) :
  S with type query := Q.t
......@@ -61,27 +61,25 @@ let diffsel ~(phy_n:nucleotide_phylip file) ~(tree: _ file) ~(w_every:int) ~(n_c
let n_cycles = if (n_cycles > 200) then 20 else n_cycles in
let script_r = tmp // "DiffselMCMCConvergenceAnalysis.Rmd" in
(*_build/diffsel -t data/samhd1.tree -d data/samhd1.ali -ncond 3 -x 1 10000 myrun*)
Workflow.shell ~descr:("convergence_detection.run_diffsel." ^ descr) [
within_container env (
and_list [
mkdir_p dest;
cd tmp;
cmd "cp" [ file_dump (string Scripts.diffselMCMCConvergenceAnalysis) ; script_r] ;
cmd "cp" [dep phy_n; dest_ali]; (* required dep to link the file in the env *)
cmd "cp" [dep tree; dest_tree]; (* required dep to link the file in the env *)
cmd "cp" [dep phy_n; tmp_ali]; (* required dep to link the file in the env *)
cmd "cp" [dep tree; tmp_tree]; (* required dep to link the file in the env *)
cmd "/diffsel/_build/diffsel" [
opt "-t" ident tmp_tree;
opt "-d" ident tmp_ali ;
opt "-ncond" int 2 ;
opt "-x" seq [ int w_every; string " "; int n_cycles];
option (opt "-seed" int) seed ;
ident chainname_tmp ;
];
cmd "bash" [(file_dump (diffsel_add_iterations_script ~chainname ~ali:tmp_ali ~tree:tmp_tree ~seed))];
]
)
Workflow.shell ~descr:("convergence_detection.run_diffsel." ^ descr) ~img:env [
and_list [
mkdir_p dest;
cd tmp;
cmd "cp" [ file_dump (string Scripts.diffselMCMCConvergenceAnalysis) ; script_r] ;
cmd "cp" [dep phy_n; dest_ali]; (* required dep to link the file in the env *)
cmd "cp" [dep tree; dest_tree]; (* required dep to link the file in the env *)
cmd "cp" [dep phy_n; tmp_ali]; (* required dep to link the file in the env *)
cmd "cp" [dep tree; tmp_tree]; (* required dep to link the file in the env *)
cmd "/diffsel/_build/diffsel" [
opt "-t" ident tmp_tree;
opt "-d" ident tmp_ali ;
opt "-ncond" int 2 ;
opt "-x" seq [ int w_every; string " "; int n_cycles];
option (opt "-seed" int) seed ;
ident chainname_tmp ;
];
cmd "bash" [(file_dump (diffsel_add_iterations_script ~chainname ~ali:tmp_ali ~tree:tmp_tree ~seed))];
]
]
let check_conv run_diffsel : [`diffsel_check_conv] directory =
......@@ -90,24 +88,22 @@ let check_conv run_diffsel : [`diffsel_check_conv] directory =
let trace = Workflow.select run_diffsel ["myrun.trace"] in
let out = dest // "out.html" in
let nb_new_iterations = dest // "new_iterations.txt" in
Workflow.shell ~descr:"convergence_detection.DiffselMCMCConvergenceAnalysis" [
within_container env (
and_list [
mkdir_p tmp ;
mkdir_p dest ;
cd tmp ;
cmd "cp" [ file_dump (string Scripts.diffselMCMCConvergenceAnalysis) ; script] ;
cmd "Rscript" [
string "-e" ;
string {|"rmarkdown::render(\"DiffselMCMCConvergenceAnalysis.Rmd\",|} ;
string {|params=list(set_trace1=\"|} ;
dep trace ;
string {|\"))"|};
] ;
cmd "cp" [string "DiffselMCMCConvergenceAnalysis.html" ; ident out] ;
cmd "cp" [string "new_iterations.txt" ; ident nb_new_iterations]
]
)
Workflow.shell ~descr:"convergence_detection.DiffselMCMCConvergenceAnalysis" ~img:env [
and_list [
mkdir_p tmp ;
mkdir_p dest ;
cd tmp ;
cmd "cp" [ file_dump (string Scripts.diffselMCMCConvergenceAnalysis) ; script] ;
cmd "Rscript" [
string "-e" ;
string {|"rmarkdown::render(\"DiffselMCMCConvergenceAnalysis.Rmd\",|} ;
string {|params=list(set_trace1=\"|} ;
dep trace ;
string {|\"))"|};
] ;
cmd "cp" [string "DiffselMCMCConvergenceAnalysis.html" ; ident out] ;
cmd "cp" [string "new_iterations.txt" ; ident nb_new_iterations]
]
]
let selector run_diffsel : text file =
......@@ -120,25 +116,23 @@ let selector run_diffsel : text file =
let dep_ali = (dep run_diffsel) // "myrun.ali" in
let chainname = (dep run_diffsel) // "myrun" in
let out = dest in
Workflow.shell ~descr:"convergence_detection.parse_diffsel" [
within_container env (
and_list [
mkdir_p tmp ;
cd tmp ;
Workflow.shell ~descr:"convergence_detection.parse_diffsel" ~img:env [
and_list [
mkdir_p tmp ;
cd tmp ;
cmd "cp" [dep_ali; tmp_ali]; (* required dep to link the file in the env *)
cmd "cp" [dep_tree; tmp_tree]; (* required dep to link the file in the env *)
cmd "cp" [dep_ali; tmp_ali]; (* required dep to link the file in the env *)
cmd "cp" [dep_tree; tmp_tree]; (* required dep to link the file in the env *)
(*python diffsel_analyze_result.py [-r /path/to/readdiffsel] [-o output_file] chainname *)
cmd "cp" [ file_dump (string Scripts.diffsel_script_utils) ; package] ;
cmd "cp" [ file_dump (string Scripts.diffsel_analyze_result) ; script] ;
(*python diffsel_analyze_result.py [-r /path/to/readdiffsel] [-o output_file] chainname *)
cmd "cp" [ file_dump (string Scripts.diffsel_script_utils) ; package] ;
cmd "cp" [ file_dump (string Scripts.diffsel_analyze_result) ; script] ;
cmd "python" [
string "diffsel_analyze_result.py" ;
opt "-r" string "/diffsel/_build/readdiffsel" ;
opt "-o" ident out ;
ident chainname ;
]
cmd "python" [
string "diffsel_analyze_result.py" ;
opt "-r" string "/diffsel/_build/readdiffsel" ;
opt "-o" ident out ;
ident chainname ;
]
)
]
]
......@@ -53,30 +53,28 @@ let diffseldsparse
let chainname = dest // "myrun" in