Commit 1ece636f authored by Philippe Veber's avatar Philippe Veber
Browse files

Merge branch 'simulation-dataset-refactoring'

parents a2508335 5eba3f41
open Core
open Codepi
let main ~n_h0 ~n_ha ~seed:i () =
let open Simulation_dataset in
let sim =
bppseqgen_mixed
~tree:(NHX (Bistro.Workflow.input "example/trees_analyses/cyp_coding.Chrysithr_root.nhx"))
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~n_h0 ~n_ha ~ne_s:4. ~seed:i ()
in
let w = benchmark sim in
Bistro_engine.Scheduler.simple_eval_exn ~np:4 ~mem:(`GB 4) (Bistro.Workflow.path w)
|> print_endline
let command =
let open Command.Let_syntax in
Command.basic
~summary:"Run diffseldsparse bench"
[%map_open
let n_ha =
flag "--n-ha" (required int) ~doc:"INT Number of Ha sites"
and n_h0 =
flag "--n-h0" (required int) ~doc:"INT Number of H0 sites"
and seed =
flag "--seed" (required int) ~doc:"INT Global seed"
in
main ~n_ha ~n_h0 ~seed
]
let () = Command.run command
......@@ -6,14 +6,6 @@
(preprocess
(pps ppx_jane)))
(executable
(name diffseldsparse_benchmark)
(public_name diffseldsparse_benchmark)
(modules diffseldsparse_benchmark)
(libraries codepi)
(preprocess
(pps ppx_jane)))
(executable
(name orthomam_app)
(public_name orthomam_convergence)
......
......@@ -3,6 +3,8 @@ open Bistro
open Codepi
open Codepi.File_formats
module Pipeline = Simulation_pipeline.Mutsel
type dataset = {
label : string ;
tree : nhx file ;
......@@ -49,18 +51,17 @@ let orthomam_echolocation = {
}
type detection_method = {
result : Simulation_dataset.t -> text file ;
col : int ;
result : Pipeline.query -> cpt file ;
label : string ;
requires_rooted_tree : bool ;
}
let meth ?(col = 1) ?(requires_rooted_tree = false) result label =
{ result ; col ; label ; requires_rooted_tree }
let meth ?(requires_rooted_tree = false) result label =
{ result ; label ; requires_rooted_tree }
let methods = Simulation_dataset.[
let methods = Pipeline.[
meth tdg09 "tdg09" ;
meth pcoc ~col:3 "pcoc" ;
meth pcoc "pcoc" ;
(* meth pcoc_v2 ~col:3 "pcoc v2" ; *)
meth (gemma ~lmm_test:`Score ~relatedness_mode:`Standardized) "gemma" ;
meth inhouse_lmm "LMM" ;
......@@ -68,65 +69,27 @@ let methods = Simulation_dataset.[
meth topological "topological" ~requires_rooted_tree:true ;
]
let benchmark { tree = t ; rooted ; ne_s ; branch_scale ; _ } =
let open Simulation_dataset in
let sim =
convdet_simulation ~seed:42 ~tree:(NHX t) ~branch_scale ~ne_s
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~n_h0:900 ~n_ha:100 ()
in
let results, labels =
List.filter_map methods ~f:(fun m ->
if not m.requires_rooted_tree || rooted then
Some ((m.result sim, m.col), m.label)
else None
)
|> List.unzip
in
Utils.average_precision_plot ~oracle:(oracle sim) ~labels ~results
let benchmark_rds ?(seed = 42) { tree = t ; rooted ; ne_s ; branch_scale ; _ } =
let open Simulation_dataset in
let param =
Convdet_simulation_param.make ~seed ~tree:(NHX t) ~branch_scale ~ne_s
let q =
Pipeline.query ~seed ~tree:(NHX t) ~branch_scale ~ne_s
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~n_h0:900 ~n_ha:100 ()
in
let sim = convdet_simulation_of_param param in
let simulation = Convdet_simulation_param.simulation param in
let results, labels =
let simulation = Pipeline.simulation q in
let results =
List.filter_map methods ~f:(fun m ->
if not m.requires_rooted_tree || rooted then
Some ((m.result sim, m.col), m.label)
Some (m.result q)
else None
)
|> List.unzip
in
benchmark_statistics simulation ~labels ~results
let report =
let module H = Bistro_utils.Html_report in
H.make ~title:"Codepi benchmark" [
H.section "Besnard 2009 dataset" ;
H.pdf (benchmark besnard2009) ;
H.section "Rubisco dataset" ;
H.pdf (benchmark rubisco) ;
H.section "Rodent dataset" ;
H.pdf (benchmark oneline_rodent) ;
H.section "Orthomam/echolocation dataset" ;
H.pdf (benchmark orthomam_echolocation) ;
]
|> H.render
Pipeline.benchmark_statistics simulation ~results
let () =
let open Bistro_utils.Repo in
let datasets = [ besnard2009 ; rubisco ; oneline_rodent ; orthomam_echolocation ] in
let repo =
item ["report.html"] report
:: List.map datasets ~f:(fun d ->
List.map datasets ~f:(fun d ->
item [d.label ^ ".rds"] (benchmark_rds d)
)
in
......
open Core
open Bistro.Shell_dsl
open Bistro
open File_formats
type result = [
| `Pcoc of [`pcoc] directory
......@@ -12,7 +13,7 @@ type result = [
| `Topological_LG of [`topological] directory
| `Topological_WAG of [`topological] directory
| `Tdg09 of [`tdg09] directory
| `Multinomial of text file
| `Multinomial of cpt file
| `Msd of [`msd] directory * float
]
......@@ -34,11 +35,11 @@ type dataset_res = {
tree_prefix : string ;
dataset : Dataset.t ;
res_by_tools: result list ;
merged_results : text file ;
merged_results : cpt file ;
plot_merged_results : svg file ;
}
let merge_results ?fna_infos ~(res_by_tools : result list) () : text file =
let merge_results ?fna_infos ~(res_by_tools : result list) () : cpt file =
let command = List.map res_by_tools ~f:(fun res ->
let w = match res with
| `Pcoc d -> Pcoc.results d
......@@ -78,7 +79,7 @@ let merge_results ?fna_infos ~(res_by_tools : result list) () : text file =
] ;
]
let merge_result_tables ?fna_infos ?oracle ?multinomial ?tdg09 ?identical ?topological ?pcoc ?pcoc_v2 ?pcoc_pcp ?diffsel ?diffseldsparse () : text file =
let merge_result_tables ?fna_infos ?oracle ?multinomial ?tdg09 ?identical ?topological ?pcoc ?pcoc_v2 ?pcoc_pcp ?diffsel ?diffseldsparse () : cpt file =
Workflow.shell ~descr:"convergence_detection.merge_results" ~img:Env.env_py [
cmd "python" [
file_dump (string Scripts.merge_det_results) ;
......
......@@ -11,7 +11,7 @@ type result = [
| `Topological_LG of [`topological] directory
| `Topological_WAG of [`topological] directory
| `Tdg09 of [`tdg09] directory
| `Multinomial of text file
| `Multinomial of cpt file
| `Msd of [`msd] directory * float
]
......@@ -22,7 +22,7 @@ type dataset_res = {
tree_prefix : string ;
dataset : Dataset.t ;
res_by_tools: result list ;
merged_results : text file ;
merged_results : cpt file ;
plot_merged_results : svg file
}
......@@ -30,22 +30,22 @@ val merge_results :
?fna_infos:text file ->
res_by_tools : result list ->
unit ->
text file
cpt file
val merge_result_tables :
?fna_infos:text file ->
?oracle:text file ->
?multinomial:text file ->
?tdg09:text file ->
?identical:text file ->
?topological:text file ->
?pcoc:text file ->
?pcoc_v2:text file ->
?pcoc_pcp:text file ->
?diffsel:text file ->
?diffseldsparse:text file ->
?oracle:cpt file ->
?multinomial:cpt file ->
?tdg09:cpt file ->
?identical:cpt file ->
?topological:cpt file ->
?pcoc:cpt file ->
?pcoc_v2:cpt file ->
?pcoc_pcp:cpt file ->
?diffsel:cpt file ->
?diffseldsparse:cpt file ->
unit ->
text file
cpt file
val plot_merge_results :
? t_choices : text file ->
......@@ -53,27 +53,27 @@ val plot_merge_results :
res_by_tools : result list ->
tree:nhx file ->
faa:aminoacid_fasta file ->
tsv:text file ->
tsv:cpt file ->
unit ->
svg file
val plot_convergent_sites :
?plot_all_sites:bool ->
alignment:aminoacid_fasta file ->
detection_results:text file ->
detection_results:cpt file ->
tree:nhx file ->
unit ->
svg file
val recall_precision_curve :
text file ->
cpt file ->
svg file
val oracle :
n_h0:int ->
n_ha:int ->
text file
cpt file
val recall_precision_auc_table :
text file ->
cpt file ->
(string * float) list workflow
......@@ -15,9 +15,11 @@ end
module type S = sig
type query
include Query with type t := query
val amino_acid_alignment : query -> aminoacid_fasta file
val gene_tree : query -> nw file
val gene_tree : query -> newick file
val dn_tree : query -> text file
......@@ -25,52 +27,50 @@ module type S = sig
val dnds_tree : query -> text file
val identical : query -> text file
val topological : query -> text file
val identical : query -> cpt file
val multinomial : query -> text file
val topological : query -> cpt file
val multinomial_simulation_lrt : query -> text file
val multinomial_simulation_lrt : query -> cpt file
val multinomial_simulation_sparse : query -> text file
val multinomial_simulation_sparse : query -> cpt file
val multinomial_asymptotic_lrt : query -> text file
val multinomial_asymptotic_lrt : query -> cpt file
val multinomial_asymptotic_sparse : query -> text file
val multinomial_asymptotic_sparse : query -> cpt file
val tdg09 : query -> text file
val tdg09 : query -> cpt file
val failsafe_tdg09 : query -> text file
val failsafe_tdg09 : query -> cpt file
val pcoc : ?gamma:bool -> ?ncat:int -> query -> text file
val pcoc : ?gamma:bool -> ?ncat:int -> query -> cpt file
val pcoc_v2 :
?gamma:bool -> ?aa_profiles:Pcoc.aa_profiles -> query -> text file
?gamma:bool -> ?aa_profiles:Pcoc.aa_profiles -> query -> cpt file
val gemma :
query ->
lmm_test:[ `All | `LRT | `Score | `Wald ] ->
relatedness_mode:[ `Centered | `Standardized ] ->
text file
cpt file
val inhouse_lmm : query -> text file
val inhouse_lmm : query -> cpt file
val diffsel : query -> text file
val diffsel : query -> cpt file
val diffseldsparse :
?pi:float ->
?shiftprob:float * float ->
?eps:float ->
query ->
text file
cpt file
val view_site :
query -> convergent_species:string list -> site_pos:int -> pdf file
end
module Make (Q : Query) = struct
open Q
include Q
let amino_acid_alignment d =
Utils.amino_acid_fasta_of_nucleotide_fasta (nucleotide_alignment d)
......@@ -119,11 +119,6 @@ module Make (Q : Query) = struct
Topological.topological ~faa ~tree ~tree_conv ~prot_model:"LG08" ()
|> Topological.results
let multinomial d =
Multinomial.multinomial
~tree_sc:(tree ~branch_length_unit:`Amino_acid d)
~faa:(amino_acid_alignment d) ()
let multinomial_asymptotic_lrt d =
Multinomial.multinomial_asymptotic_lrt
~tree_sc:(tree ~branch_length_unit:`Amino_acid d)
......
......@@ -15,9 +15,11 @@ end
module type S = sig
type query
include Query with type t := query
val amino_acid_alignment : query -> aminoacid_fasta file
val gene_tree : query -> nw file
val gene_tree : query -> newick file
val dn_tree : query -> text file
......@@ -25,45 +27,43 @@ module type S = sig
val dnds_tree : query -> text file
val identical : query -> text file
val topological : query -> text file
val identical : query -> cpt file
val multinomial : query -> text file
val topological : query -> cpt file
val multinomial_simulation_lrt : query -> text file
val multinomial_simulation_lrt : query -> cpt file
val multinomial_simulation_sparse : query -> text file
val multinomial_simulation_sparse : query -> cpt file
val multinomial_asymptotic_lrt : query -> text file
val multinomial_asymptotic_lrt : query -> cpt file
val multinomial_asymptotic_sparse : query -> text file
val multinomial_asymptotic_sparse : query -> cpt file
val tdg09 : query -> text file
val tdg09 : query -> cpt file
val failsafe_tdg09 : query -> text file
val failsafe_tdg09 : query -> cpt file
val pcoc : ?gamma:bool -> ?ncat:int -> query -> text file
val pcoc : ?gamma:bool -> ?ncat:int -> query -> cpt file
val pcoc_v2 :
?gamma:bool -> ?aa_profiles:Pcoc.aa_profiles -> query -> text file
?gamma:bool -> ?aa_profiles:Pcoc.aa_profiles -> query -> cpt file
val gemma :
query ->
lmm_test:[ `All | `LRT | `Score | `Wald ] ->
relatedness_mode:[ `Centered | `Standardized ] ->
text file
cpt file
val inhouse_lmm : query -> text file
val inhouse_lmm : query -> cpt file
val diffsel : query -> text file
val diffsel : query -> cpt file
val diffseldsparse :
?pi:float ->
?shiftprob:float * float ->
?eps:float ->
query ->
text file
cpt file
val view_site :
query -> convergent_species:string list -> site_pos:int -> pdf file
......
......@@ -106,7 +106,7 @@ let check_conv run_diffsel : [`diffsel_check_conv] directory =
]
]
let selector run_diffsel : text file =
let selector run_diffsel : cpt file =
let env = Env.env_diffsel in
let package = tmp // "diffsel_script_utils.py" in
let script = tmp // "diffsel_analyze_result.py" in
......
......@@ -13,7 +13,7 @@ val diffsel :
val selector :
[`diffsel] directory ->
text file
cpt file
val check_conv :
[`diffsel] directory ->
......
......@@ -117,7 +117,7 @@ let readdiffseldsparse run =
]
]
let posterior_probabilities run_diffseldsparse : text file =
let posterior_probabilities run_diffseldsparse : cpt file =
let tmp_tree = tmp // "myrun.tree" in
let tmp_ali = tmp // "myrun.ali" in
let dep_tree = (dep run_diffseldsparse) // "myrun.tree" in
......
......@@ -14,7 +14,7 @@ val diffseldsparse :
val posterior_probabilities :
[`diffseldsparse] directory ->
text file
cpt file
val readdiffseldsparse :
[`diffseldsparse] directory ->
......@@ -26,4 +26,4 @@ val check_conv :
val results :
[`readdiffseldsparse] directory ->
text file
cpt file
......@@ -5,24 +5,19 @@ class type bimbam = object
method format : [`bimbam]
end
class type nhx = object
inherit text
method format : [`nhx]
end
class type newick = object
inherit text
method format : [`newick]
end
class type phylip = object
inherit text
method format : [`phylip]
class type nhx = object
inherit newick
method newick_variant : [`nhx]
end
class type nw = object
class type phylip = object
inherit text
method format : [`nw]
method format : [`phylip]
end
class type diffsel_tree = object
......@@ -46,10 +41,23 @@ class type aminoacid_fasta = object
end
class type nucleotide_phylip = object
inherit text
method format : [`Nucleotide]
inherit phylip
method alphabet : [`Nucleotide]
end
class type aminoacid_phylip = object
inherit text
method format : [`Aminoacid]
inherit phylip
method alphabet : [`Aminoacid]
end
class type rds = object
inherit binary_file
method format : [`rds]
end
(** Convergence Prediction Table *)
class type cpt = object
inherit tsv
method header : [`Yes]
method fields : [`Site_then_scores]
end
......@@ -33,4 +33,4 @@ val univariate_lmm :
univariate_lmm_output file
val result_table_of_output :
aminoacid_fasta file -> univariate_lmm_output file -> text file
aminoacid_fasta file -> univariate_lmm_output file -> cpt file
......@@ -105,5 +105,5 @@ let identical ?(descr="") ~(tree_id:_ file) ~(tree_sc:_ file) ~(faa:aminoacid_fa
]
]
let results run_identical : text file =
let results run_identical : cpt file =
Workflow.select run_identical ["out1.tsv"]
......@@ -4,4 +4,4 @@ open File_formats
val test :
aminoacid_fasta file ->
nhx file ->
text file
cpt file
......@@ -42,7 +42,7 @@ let msd ?(descr="") ~e ~(faa : aminoacid_fasta file) ~(tree_sc : _ file) : [`msd
];
]
let results run_msd : text file =
let results run_msd : cpt file =
Workflow.shell ~descr:"convergence_detection.parse_msd" ~img [
cmd "python" [
file_dump (string Scripts.parse_output_msd) ;
......
open Core
open Bistro
open Bistro.Shell_dsl
open File_formats
let multinomial ?(descr="") ~(tree_sc:_ file) ~(faa:aminoacid_fasta file) () : text file =
let img = Env.env_py in
Workflow.shell ~descr:("calc_multinomial."^descr) ~img [
cmd "python" [
file_dump (string Scripts.calc_multinomial) ;
opt "-t" dep tree_sc;
opt "-a" dep faa;
opt "-o" ident dest ;
]
]
let%pworkflow multinomial_ocaml_implementation ~meth ~(tree_sc:_ file) ~(faa:aminoacid_fasta file) (* : text file *) =
let%pworkflow multinomial_ocaml_implementation ~meth ~(tree_sc:_ file) ~(faa:aminoacid_fasta file) (* : cpt file *) =
let open Phylogenetics in
let open Phylogenetics_convergence in
let module MT = Multinomial_test in
......@@ -79,7 +67,7 @@ let%pworkflow multinomial_ocaml_implementation ~meth ~(tree_sc:_ file) ~(faa:ami
sprintf "%d\t%f\t%f" (i + 1) (1. -. pval) lr
)
in
let header = "Sites\t1MinusLRT\tLikelihoodRatio" in
let header = "Sites\tMultinomial_1mp\tMultinomial_LikelihoodRatio" in
Out_channel.write_lines [%dest] (header :: res_lines)
let multinomial_asymptotic_lrt ~tree_sc ~faa = multinomial_ocaml_implementation ~meth:`Asymptotic_LRT ~tree_sc ~faa
......
......@@ -3,7 +3,7 @@ open Bistro
open File_formats
open Codepitk
let ensembl_tree : nhx file =