Commit 1ece636f authored by Philippe Veber's avatar Philippe Veber
Browse files

Merge branch 'simulation-dataset-refactoring'

parents a2508335 5eba3f41
open Core
open Codepi
let main ~n_h0 ~n_ha ~seed:i () =
let open Simulation_dataset in
let sim =
bppseqgen_mixed
~tree:(NHX (Bistro.Workflow.input "example/trees_analyses/cyp_coding.Chrysithr_root.nhx"))
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~n_h0 ~n_ha ~ne_s:4. ~seed:i ()
in
let w = benchmark sim in
Bistro_engine.Scheduler.simple_eval_exn ~np:4 ~mem:(`GB 4) (Bistro.Workflow.path w)
|> print_endline
let command =
let open Command.Let_syntax in
Command.basic
~summary:"Run diffseldsparse bench"
[%map_open
let n_ha =
flag "--n-ha" (required int) ~doc:"INT Number of Ha sites"
and n_h0 =
flag "--n-h0" (required int) ~doc:"INT Number of H0 sites"
and seed =
flag "--seed" (required int) ~doc:"INT Global seed"
in
main ~n_ha ~n_h0 ~seed
]
let () = Command.run command
...@@ -6,14 +6,6 @@ ...@@ -6,14 +6,6 @@
(preprocess (preprocess
(pps ppx_jane))) (pps ppx_jane)))
(executable
(name diffseldsparse_benchmark)
(public_name diffseldsparse_benchmark)
(modules diffseldsparse_benchmark)
(libraries codepi)
(preprocess
(pps ppx_jane)))
(executable (executable
(name orthomam_app) (name orthomam_app)
(public_name orthomam_convergence) (public_name orthomam_convergence)
......
...@@ -3,6 +3,8 @@ open Bistro ...@@ -3,6 +3,8 @@ open Bistro
open Codepi open Codepi
open Codepi.File_formats open Codepi.File_formats
module Pipeline = Simulation_pipeline.Mutsel
type dataset = { type dataset = {
label : string ; label : string ;
tree : nhx file ; tree : nhx file ;
...@@ -49,18 +51,17 @@ let orthomam_echolocation = { ...@@ -49,18 +51,17 @@ let orthomam_echolocation = {
} }
type detection_method = { type detection_method = {
result : Simulation_dataset.t -> text file ; result : Pipeline.query -> cpt file ;
col : int ;
label : string ; label : string ;
requires_rooted_tree : bool ; requires_rooted_tree : bool ;
} }
let meth ?(col = 1) ?(requires_rooted_tree = false) result label = let meth ?(requires_rooted_tree = false) result label =
{ result ; col ; label ; requires_rooted_tree } { result ; label ; requires_rooted_tree }
let methods = Simulation_dataset.[ let methods = Pipeline.[
meth tdg09 "tdg09" ; meth tdg09 "tdg09" ;
meth pcoc ~col:3 "pcoc" ; meth pcoc "pcoc" ;
(* meth pcoc_v2 ~col:3 "pcoc v2" ; *) (* meth pcoc_v2 ~col:3 "pcoc v2" ; *)
meth (gemma ~lmm_test:`Score ~relatedness_mode:`Standardized) "gemma" ; meth (gemma ~lmm_test:`Score ~relatedness_mode:`Standardized) "gemma" ;
meth inhouse_lmm "LMM" ; meth inhouse_lmm "LMM" ;
...@@ -68,65 +69,27 @@ let methods = Simulation_dataset.[ ...@@ -68,65 +69,27 @@ let methods = Simulation_dataset.[
meth topological "topological" ~requires_rooted_tree:true ; meth topological "topological" ~requires_rooted_tree:true ;
] ]
let benchmark { tree = t ; rooted ; ne_s ; branch_scale ; _ } =
let open Simulation_dataset in
let sim =
convdet_simulation ~seed:42 ~tree:(NHX t) ~branch_scale ~ne_s
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~n_h0:900 ~n_ha:100 ()
in
let results, labels =
List.filter_map methods ~f:(fun m ->
if not m.requires_rooted_tree || rooted then
Some ((m.result sim, m.col), m.label)
else None
)
|> List.unzip
in
Utils.average_precision_plot ~oracle:(oracle sim) ~labels ~results
let benchmark_rds ?(seed = 42) { tree = t ; rooted ; ne_s ; branch_scale ; _ } = let benchmark_rds ?(seed = 42) { tree = t ; rooted ; ne_s ; branch_scale ; _ } =
let open Simulation_dataset in let q =
let param = Pipeline.query ~seed ~tree:(NHX t) ~branch_scale ~ne_s
Convdet_simulation_param.make ~seed ~tree:(NHX t) ~branch_scale ~ne_s
~profiles:"example/aa_fitness/263SelectedProfiles.tsv" ~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~n_h0:900 ~n_ha:100 () ~n_h0:900 ~n_ha:100 ()
in in
let sim = convdet_simulation_of_param param in let simulation = Pipeline.simulation q in
let simulation = Convdet_simulation_param.simulation param in let results =
let results, labels =
List.filter_map methods ~f:(fun m -> List.filter_map methods ~f:(fun m ->
if not m.requires_rooted_tree || rooted then if not m.requires_rooted_tree || rooted then
Some ((m.result sim, m.col), m.label) Some (m.result q)
else None else None
) )
|> List.unzip
in in
benchmark_statistics simulation ~labels ~results Pipeline.benchmark_statistics simulation ~results
let report =
let module H = Bistro_utils.Html_report in
H.make ~title:"Codepi benchmark" [
H.section "Besnard 2009 dataset" ;
H.pdf (benchmark besnard2009) ;
H.section "Rubisco dataset" ;
H.pdf (benchmark rubisco) ;
H.section "Rodent dataset" ;
H.pdf (benchmark oneline_rodent) ;
H.section "Orthomam/echolocation dataset" ;
H.pdf (benchmark orthomam_echolocation) ;
]
|> H.render
let () = let () =
let open Bistro_utils.Repo in let open Bistro_utils.Repo in
let datasets = [ besnard2009 ; rubisco ; oneline_rodent ; orthomam_echolocation ] in let datasets = [ besnard2009 ; rubisco ; oneline_rodent ; orthomam_echolocation ] in
let repo = let repo =
item ["report.html"] report List.map datasets ~f:(fun d ->
:: List.map datasets ~f:(fun d ->
item [d.label ^ ".rds"] (benchmark_rds d) item [d.label ^ ".rds"] (benchmark_rds d)
) )
in in
......
open Core open Core
open Bistro.Shell_dsl open Bistro.Shell_dsl
open Bistro open Bistro
open File_formats
type result = [ type result = [
| `Pcoc of [`pcoc] directory | `Pcoc of [`pcoc] directory
...@@ -12,7 +13,7 @@ type result = [ ...@@ -12,7 +13,7 @@ type result = [
| `Topological_LG of [`topological] directory | `Topological_LG of [`topological] directory
| `Topological_WAG of [`topological] directory | `Topological_WAG of [`topological] directory
| `Tdg09 of [`tdg09] directory | `Tdg09 of [`tdg09] directory
| `Multinomial of text file | `Multinomial of cpt file
| `Msd of [`msd] directory * float | `Msd of [`msd] directory * float
] ]
...@@ -34,11 +35,11 @@ type dataset_res = { ...@@ -34,11 +35,11 @@ type dataset_res = {
tree_prefix : string ; tree_prefix : string ;
dataset : Dataset.t ; dataset : Dataset.t ;
res_by_tools: result list ; res_by_tools: result list ;
merged_results : text file ; merged_results : cpt file ;
plot_merged_results : svg file ; plot_merged_results : svg file ;
} }
let merge_results ?fna_infos ~(res_by_tools : result list) () : text file = let merge_results ?fna_infos ~(res_by_tools : result list) () : cpt file =
let command = List.map res_by_tools ~f:(fun res -> let command = List.map res_by_tools ~f:(fun res ->
let w = match res with let w = match res with
| `Pcoc d -> Pcoc.results d | `Pcoc d -> Pcoc.results d
...@@ -78,7 +79,7 @@ let merge_results ?fna_infos ~(res_by_tools : result list) () : text file = ...@@ -78,7 +79,7 @@ let merge_results ?fna_infos ~(res_by_tools : result list) () : text file =
] ; ] ;
] ]
let merge_result_tables ?fna_infos ?oracle ?multinomial ?tdg09 ?identical ?topological ?pcoc ?pcoc_v2 ?pcoc_pcp ?diffsel ?diffseldsparse () : text file = let merge_result_tables ?fna_infos ?oracle ?multinomial ?tdg09 ?identical ?topological ?pcoc ?pcoc_v2 ?pcoc_pcp ?diffsel ?diffseldsparse () : cpt file =
Workflow.shell ~descr:"convergence_detection.merge_results" ~img:Env.env_py [ Workflow.shell ~descr:"convergence_detection.merge_results" ~img:Env.env_py [
cmd "python" [ cmd "python" [
file_dump (string Scripts.merge_det_results) ; file_dump (string Scripts.merge_det_results) ;
......
...@@ -11,7 +11,7 @@ type result = [ ...@@ -11,7 +11,7 @@ type result = [
| `Topological_LG of [`topological] directory | `Topological_LG of [`topological] directory
| `Topological_WAG of [`topological] directory | `Topological_WAG of [`topological] directory
| `Tdg09 of [`tdg09] directory | `Tdg09 of [`tdg09] directory
| `Multinomial of text file | `Multinomial of cpt file
| `Msd of [`msd] directory * float | `Msd of [`msd] directory * float
] ]
...@@ -22,7 +22,7 @@ type dataset_res = { ...@@ -22,7 +22,7 @@ type dataset_res = {
tree_prefix : string ; tree_prefix : string ;
dataset : Dataset.t ; dataset : Dataset.t ;
res_by_tools: result list ; res_by_tools: result list ;
merged_results : text file ; merged_results : cpt file ;
plot_merged_results : svg file plot_merged_results : svg file
} }
...@@ -30,22 +30,22 @@ val merge_results : ...@@ -30,22 +30,22 @@ val merge_results :
?fna_infos:text file -> ?fna_infos:text file ->
res_by_tools : result list -> res_by_tools : result list ->
unit -> unit ->
text file cpt file
val merge_result_tables : val merge_result_tables :
?fna_infos:text file -> ?fna_infos:text file ->
?oracle:text file -> ?oracle:cpt file ->
?multinomial:text file -> ?multinomial:cpt file ->
?tdg09:text file -> ?tdg09:cpt file ->
?identical:text file -> ?identical:cpt file ->
?topological:text file -> ?topological:cpt file ->
?pcoc:text file -> ?pcoc:cpt file ->
?pcoc_v2:text file -> ?pcoc_v2:cpt file ->
?pcoc_pcp:text file -> ?pcoc_pcp:cpt file ->
?diffsel:text file -> ?diffsel:cpt file ->
?diffseldsparse:text file -> ?diffseldsparse:cpt file ->
unit -> unit ->
text file cpt file
val plot_merge_results : val plot_merge_results :
? t_choices : text file -> ? t_choices : text file ->
...@@ -53,27 +53,27 @@ val plot_merge_results : ...@@ -53,27 +53,27 @@ val plot_merge_results :
res_by_tools : result list -> res_by_tools : result list ->
tree:nhx file -> tree:nhx file ->
faa:aminoacid_fasta file -> faa:aminoacid_fasta file ->
tsv:text file -> tsv:cpt file ->
unit -> unit ->
svg file svg file
val plot_convergent_sites : val plot_convergent_sites :
?plot_all_sites:bool -> ?plot_all_sites:bool ->
alignment:aminoacid_fasta file -> alignment:aminoacid_fasta file ->
detection_results:text file -> detection_results:cpt file ->
tree:nhx file -> tree:nhx file ->
unit -> unit ->
svg file svg file
val recall_precision_curve : val recall_precision_curve :
text file -> cpt file ->
svg file svg file
val oracle : val oracle :
n_h0:int -> n_h0:int ->
n_ha:int -> n_ha:int ->
text file cpt file
val recall_precision_auc_table : val recall_precision_auc_table :
text file -> cpt file ->
(string * float) list workflow (string * float) list workflow
...@@ -15,9 +15,11 @@ end ...@@ -15,9 +15,11 @@ end
module type S = sig module type S = sig
type query type query
include Query with type t := query
val amino_acid_alignment : query -> aminoacid_fasta file val amino_acid_alignment : query -> aminoacid_fasta file
val gene_tree : query -> nw file val gene_tree : query -> newick file
val dn_tree : query -> text file val dn_tree : query -> text file
...@@ -25,52 +27,50 @@ module type S = sig ...@@ -25,52 +27,50 @@ module type S = sig
val dnds_tree : query -> text file val dnds_tree : query -> text file
val identical : query -> text file val identical : query -> cpt file
val topological : query -> text file
val multinomial : query -> text file val topological : query -> cpt file
val multinomial_simulation_lrt : query -> text file val multinomial_simulation_lrt : query -> cpt file
val multinomial_simulation_sparse : query -> text file val multinomial_simulation_sparse : query -> cpt file
val multinomial_asymptotic_lrt : query -> text file val multinomial_asymptotic_lrt : query -> cpt file
val multinomial_asymptotic_sparse : query -> text file val multinomial_asymptotic_sparse : query -> cpt file
val tdg09 : query -> text file val tdg09 : query -> cpt file
val failsafe_tdg09 : query -> text file val failsafe_tdg09 : query -> cpt file
val pcoc : ?gamma:bool -> ?ncat:int -> query -> text file val pcoc : ?gamma:bool -> ?ncat:int -> query -> cpt file
val pcoc_v2 : val pcoc_v2 :
?gamma:bool -> ?aa_profiles:Pcoc.aa_profiles -> query -> text file ?gamma:bool -> ?aa_profiles:Pcoc.aa_profiles -> query -> cpt file
val gemma : val gemma :
query -> query ->
lmm_test:[ `All | `LRT | `Score | `Wald ] -> lmm_test:[ `All | `LRT | `Score | `Wald ] ->
relatedness_mode:[ `Centered | `Standardized ] -> relatedness_mode:[ `Centered | `Standardized ] ->
text file cpt file
val inhouse_lmm : query -> text file val inhouse_lmm : query -> cpt file
val diffsel : query -> text file val diffsel : query -> cpt file
val diffseldsparse : val diffseldsparse :
?pi:float -> ?pi:float ->
?shiftprob:float * float -> ?shiftprob:float * float ->
?eps:float -> ?eps:float ->
query -> query ->
text file cpt file
val view_site : val view_site :
query -> convergent_species:string list -> site_pos:int -> pdf file query -> convergent_species:string list -> site_pos:int -> pdf file
end end
module Make (Q : Query) = struct module Make (Q : Query) = struct
open Q include Q
let amino_acid_alignment d = let amino_acid_alignment d =
Utils.amino_acid_fasta_of_nucleotide_fasta (nucleotide_alignment d) Utils.amino_acid_fasta_of_nucleotide_fasta (nucleotide_alignment d)
...@@ -119,11 +119,6 @@ module Make (Q : Query) = struct ...@@ -119,11 +119,6 @@ module Make (Q : Query) = struct
Topological.topological ~faa ~tree ~tree_conv ~prot_model:"LG08" () Topological.topological ~faa ~tree ~tree_conv ~prot_model:"LG08" ()
|> Topological.results |> Topological.results
let multinomial d =
Multinomial.multinomial
~tree_sc:(tree ~branch_length_unit:`Amino_acid d)
~faa:(amino_acid_alignment d) ()
let multinomial_asymptotic_lrt d = let multinomial_asymptotic_lrt d =
Multinomial.multinomial_asymptotic_lrt Multinomial.multinomial_asymptotic_lrt
~tree_sc:(tree ~branch_length_unit:`Amino_acid d) ~tree_sc:(tree ~branch_length_unit:`Amino_acid d)
......
...@@ -15,9 +15,11 @@ end ...@@ -15,9 +15,11 @@ end
module type S = sig module type S = sig
type query type query
include Query with type t := query
val amino_acid_alignment : query -> aminoacid_fasta file val amino_acid_alignment : query -> aminoacid_fasta file
val gene_tree : query -> nw file val gene_tree : query -> newick file
val dn_tree : query -> text file val dn_tree : query -> text file
...@@ -25,45 +27,43 @@ module type S = sig ...@@ -25,45 +27,43 @@ module type S = sig
val dnds_tree : query -> text file val dnds_tree : query -> text file
val identical : query -> text file val identical : query -> cpt file
val topological : query -> text file
val multinomial : query -> text file val topological : query -> cpt file
val multinomial_simulation_lrt : query -> text file val multinomial_simulation_lrt : query -> cpt file
val multinomial_simulation_sparse : query -> text file val multinomial_simulation_sparse : query -> cpt file
val multinomial_asymptotic_lrt : query -> text file val multinomial_asymptotic_lrt : query -> cpt file
val multinomial_asymptotic_sparse : query -> text file val multinomial_asymptotic_sparse : query -> cpt file
val tdg09 : query -> text file val tdg09 : query -> cpt file
val failsafe_tdg09 : query -> text file val failsafe_tdg09 : query -> cpt file
val pcoc : ?gamma:bool -> ?ncat:int -> query -> text file val pcoc : ?gamma:bool -> ?ncat:int -> query -> cpt file
val pcoc_v2 : val pcoc_v2 :
?gamma:bool -> ?aa_profiles:Pcoc.aa_profiles -> query -> text file ?gamma:bool -> ?aa_profiles:Pcoc.aa_profiles -> query -> cpt file
val gemma : val gemma :
query -> query ->
lmm_test:[ `All | `LRT | `Score | `Wald ] -> lmm_test:[ `All | `LRT | `Score | `Wald ] ->
relatedness_mode:[ `Centered | `Standardized ] -> relatedness_mode:[ `Centered | `Standardized ] ->
text file cpt file
val inhouse_lmm : query -> text file val inhouse_lmm : query -> cpt file
val diffsel : query -> text file val diffsel : query -> cpt file
val diffseldsparse : val diffseldsparse :
?pi:float -> ?pi:float ->
?shiftprob:float * float -> ?shiftprob:float * float ->
?eps:float -> ?eps:float ->
query -> query ->
text file cpt file
val view_site : val view_site :
query -> convergent_species:string list -> site_pos:int -> pdf file query -> convergent_species:string list -> site_pos:int -> pdf file
......
...@@ -106,7 +106,7 @@ let check_conv run_diffsel : [`diffsel_check_conv] directory = ...@@ -106,7 +106,7 @@ let check_conv run_diffsel : [`diffsel_check_conv] directory =
] ]
] ]
let selector run_diffsel : text file = let selector run_diffsel : cpt file =
let env = Env.env_diffsel in let env = Env.env_diffsel in
let package = tmp // "diffsel_script_utils.py" in let package = tmp // "diffsel_script_utils.py" in
let script = tmp // "diffsel_analyze_result.py" in let script = tmp // "diffsel_analyze_result.py" in
......
...@@ -13,7 +13,7 @@ val diffsel : ...@@ -13,7 +13,7 @@ val diffsel :