Commit 2079ac34 authored by Philippe Veber's avatar Philippe Veber
Browse files

lmm benchmark

parent eb0e6316
......@@ -5,7 +5,7 @@ let main ~n_h0 ~n_ha ~seed:i () =
let open Simulation_dataset in
let sim =
bppseqgen_mixed_simulation
~tree:(NHX "example/trees_analyses/cyp_coding.Chrysithr_root.nhx")
~tree:(NHX (Bistro.Workflow.input "example/trees_analyses/cyp_coding.Chrysithr_root.nhx"))
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~n_h0 ~n_ha ~ne_s:4. ~seed:i ()
in
......
open Core_kernel
open Bistro
open Codepi
open Codepi.File_formats
let benchmark d =
let open Simulation_dataset in
Utils.recall_precision_curve
~oracle:(oracle d)
~labels:[| "topological"; "multinomial";"gemma_score";"inhouse"; "pcoc"; (* "pcocv2" ; *)"tdg09" |]
~results:[
topological d, 1 ;
multinomial_asymptotic_lrt d, 1 ;
gemma ~lmm_test:`Score ~relatedness_mode:`Standardized d, 1 ;
inhouse_lmm d, 1 ;
pcoc d, 3 ;
(* pcoc_v2 d ; *)
tdg09 d, 1 ;
]
type dataset = {
tree : nhx file ;
rooted : bool ;
branch_scale : float ;
ne_s : float * float ;
}
let () =
let besnard2009 = {
tree = Bistro.Workflow.input "data/besnard2009/besnard2009.nhx" ;
rooted = true ;
branch_scale = 3. ;
ne_s = 2., 2. ;
}
let oneline_rodent = {
tree = Bistro.Workflow.input "data/online_rodent/online_rodent.nhx" ;
rooted = true ;
branch_scale = 3. ;
ne_s = 2., 2. ;
}
let rubisco = {
tree = Rubisco_dataset.(Path "data/rubisco" |> Query.tree ~branch_length_unit:`Amino_acid) ;
rooted = false ;
branch_scale = 3. ;
ne_s = 2., 2. ;
}
let orthomam_echolocation = {
tree = (
Orthomam.tree_of_db
~branch_length_unit:`Amino_acid
~convergent_species:Orthomam.species_with_echolocation
(Codepitk.Orthomam_db.make "omm")
) ;
rooted = false ;
branch_scale = 3. ;
ne_s = 4., 4. ;
}
type detection_method = {
result : Simulation_dataset.t -> text file ;
col : int ;
label : string ;
requires_rooted_tree : bool ;
}
let meth ?(col = 1) ?(requires_rooted_tree = false) result label =
{ result ; col ; label ; requires_rooted_tree }
let methods = Simulation_dataset.[
meth tdg09 "tdg09" ;
meth pcoc ~col:3 "pcoc" ;
meth pcoc_v2 ~col:3 "pcoc v2" ;
meth (gemma ~lmm_test:`Score ~relatedness_mode:`Standardized) "gemma" ;
meth inhouse_lmm "LMM" ;
meth multinomial_asymptotic_lrt "multinomial" ;
meth topological "topological" ~requires_rooted_tree:true ;
]
let benchmark { tree = t ; rooted ; ne_s ; branch_scale } =
let open Simulation_dataset in
let sim =
let open Simulation_dataset in
convdet_simulation
~tree:(NHX "data/besnard2009/besnard2009.nhx")
convdet_simulation ~tree:(NHX t) ~branch_scale ~ne_s
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~n_h0:900 ~n_ha:100 ~branch_scale:3. ~ne_s:(2., 2.) ()
~n_h0:900 ~n_ha:100 ()
in
let results, labels =
List.filter_map methods ~f:(fun m ->
if not m.requires_rooted_tree || rooted then
Some ((m.result sim, m.col), m.label)
else None
)
|> List.unzip
in
let w = benchmark sim in
Utils.recall_precision_curve ~oracle:(oracle sim) ~labels ~results
let report =
let module H = Bistro_utils.Html_report in
H.make ~title:"Codepi benchmark" [
H.section "Besnard 2009 dataset" ;
H.pdf (benchmark besnard2009) ;
H.section "Rubisco dataset" ;
H.pdf (benchmark rubisco) ;
H.section "Rodent dataset" ;
H.pdf (benchmark oneline_rodent) ;
H.section "Orthomam/echolocation dataset" ;
H.pdf (benchmark orthomam_echolocation) ;
]
|> H.render
let () =
let w = report in
let loggers = [ Bistro_utils.Console_logger.create () ] in
Bistro_engine.Scheduler.simple_eval_exn ~loggers ~np:4 ~mem:(`GB 4) (Bistro.Workflow.path w)
|> print_endline
open Base
open Printf
open Bistro
open File_formats
let calc_fixed_seed ~(str:string) (seed:int) : int =
let str_hash = Hashtbl.hash str in
Hashtbl.hash (str_hash + seed)
type tree =
| NHX of string
| NHX of nhx file
| Pair_tree of {
npairs : int ;
branch_length1 : float ;
......@@ -72,25 +73,12 @@ let convdet_simulation ?(branch_scale = 1.) ?(ne_s = 1., 1.) ?(gBGC = 0., 0.) ?(
seed : int ;
}
let prefix_of_tree = function
| NHX path ->
Caml.Filename.chop_extension path
| Pair_tree { branch_length1 ; branch_length2 ; npairs } ->
sprintf "pair_tree(bl1=%g,bl2=%g,np=%d)" branch_length1 branch_length2 npairs
let tree_prefix = function
| Bppseqgen_simulation { tree ; _ }
| Bppseqgen_mixed { tree ; _ }
| Convdet_simulation { tree ; _ } ->
prefix_of_tree tree
let tree ~branch_length_unit:_ = function
| Bppseqgen_simulation { tree ; _ }
| Bppseqgen_mixed { tree ; _ }
| Convdet_simulation { tree ; _ } ->
match tree with
| NHX path ->
Workflow.input path
| NHX w -> w
| Pair_tree { branch_length1 ;
branch_length2 ;
npairs } ->
......@@ -110,7 +98,7 @@ let profile ~nb_sites ~profiles ~seed =
let bppseqgen sim ~hypothesis ~nb_sites ~profiles =
let model_prefix = Convergence_hypothesis.string_of_model hypothesis in
let descr = sprintf ".%s.%s" model_prefix (tree_prefix sim) in
let descr = sprintf ".%s" model_prefix in
let profile = profile ~nb_sites ~profiles ~seed:(seed sim) in
let profile_f = profile.profile_f in
let profile_c = profile.profile_c in
......@@ -162,7 +150,7 @@ let oracle d =
let multinomial_benchmark d =
Utils.recall_precision_curve
~oracle:(oracle d)
~labels:[|"LRT";"LRTsim";"sparse";"sparse_sim"|]
~labels:["LRT";"LRTsim";"sparse";"sparse_sim"]
~results:[
multinomial_asymptotic_lrt d, 1 ;
multinomial_simulation_lrt d, 1 ;
......@@ -197,7 +185,7 @@ let benchmark ?mode d =
let benchmark2 d =
Utils.recall_precision_curve
~oracle:(oracle d)
~labels:[|"identical";"topological";"multinomial";"pcoc";"tdg09"|]
~labels:["identical";"topological";"multinomial";"pcoc";"tdg09"]
~results:[
identical d, 1 ;
topological d, 1 ;
......@@ -228,7 +216,7 @@ let explore_params ~(f: param_t -> _) =
let simu_of_param ?n_h0:(n_h0=50) (p: param_t) =
let bf, gbgc = p in
convdet_simulation
~tree:(NHX "example/trees_analyses/C4AmaranthaceaePolyroot.nhx")
~tree:(NHX (Workflow.input "example/trees_analyses/C4AmaranthaceaePolyroot.nhx"))
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~branch_scale:bf
~gBGC:(match gbgc with
......
open Bistro
open File_formats
type tree =
| NHX of string
| NHX of nhx file
| Pair_tree of {
npairs : int;
branch_length1 : float;
......
......@@ -35,7 +35,7 @@ let%pworkflow fasta_cappend fa1 fa2 =
let%pworkflow recall_precision_curve ~labels ~oracle ~results : pdf file =
let open Codepitk in
let n = Array.length labels in
let n = List.length labels in
assert (n = List.length results) ;
let colors = Array.sub [|"blue";"green";"red";"tan1";"royalblue4";"darkorchid";"yellow"|] ~pos:0 ~len:n in
let labels = [%param labels] in
......@@ -73,7 +73,7 @@ let%pworkflow recall_precision_curve ~labels ~oracle ~results : pdf file =
OCamlR_grDevices.pdf [%dest] ;
OCamlR_graphics.plot ~xlab:"Recall" ~ylab:"Precision" ~xlim:(0., 1.) ~ylim:(0., 1.) ~x:[||] ~y:[||] () ;
List.iteri result_curves ~f:(fun i (x, y) -> OCamlR_graphics.lines ~col:colors.(i) ~x ~y ()) ;
OCamlR_graphics.legend ~pch:[|19|] ~col:colors `topright labels ;
OCamlR_graphics.legend ~pch:[|19|] ~col:colors `topright (Array.of_list labels) ;
OCamlR_grDevices.dev_off ()
let%pworkflow [@version 2] newick_of_nhx (t : nhx file) : newick file =
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment