Commit 74b927db authored by Philippe Veber's avatar Philippe Veber
Browse files

generate R-readable summary out of benchmark

parent 4434509f
...@@ -4,6 +4,7 @@ open Codepi ...@@ -4,6 +4,7 @@ open Codepi
open Codepi.File_formats open Codepi.File_formats
type dataset = { type dataset = {
label : string ;
tree : nhx file ; tree : nhx file ;
rooted : bool ; rooted : bool ;
branch_scale : float ; branch_scale : float ;
...@@ -11,27 +12,31 @@ type dataset = { ...@@ -11,27 +12,31 @@ type dataset = {
} }
let besnard2009 = { let besnard2009 = {
label = "besnard2009" ;
tree = Bistro.Workflow.input "data/besnard2009/besnard2009.nhx" ; tree = Bistro.Workflow.input "data/besnard2009/besnard2009.nhx" ;
rooted = true ; rooted = true ;
branch_scale = 3. ; branch_scale = 10. ;
ne_s = 4., 4. ; ne_s = 4., 4. ;
} }
let oneline_rodent = { let oneline_rodent = {
label = "online_rodent" ;
tree = Bistro.Workflow.input "data/online_rodent/online_rodent.nhx" ; tree = Bistro.Workflow.input "data/online_rodent/online_rodent.nhx" ;
rooted = true ; rooted = true ;
branch_scale = 3. ; branch_scale = 10. ;
ne_s = 4., 4. ; ne_s = 4., 4. ;
} }
let rubisco = { let rubisco = {
label = "rubisco" ;
tree = Rubisco_dataset.(Path "data/rubisco" |> Query.tree ~branch_length_unit:`Amino_acid) ; tree = Rubisco_dataset.(Path "data/rubisco" |> Query.tree ~branch_length_unit:`Amino_acid) ;
rooted = false ; rooted = false ;
branch_scale = 3. ; branch_scale = 10. ;
ne_s = 4., 4. ; ne_s = 4., 4. ;
} }
let orthomam_echolocation = { let orthomam_echolocation = {
label = "orthomam_echolocation" ;
tree = ( tree = (
Orthomam.tree_of_db Orthomam.tree_of_db
~branch_length_unit:`Amino_acid ~branch_length_unit:`Amino_acid
...@@ -39,7 +44,7 @@ let orthomam_echolocation = { ...@@ -39,7 +44,7 @@ let orthomam_echolocation = {
(Codepitk.Orthomam_db.make "omm") (Codepitk.Orthomam_db.make "omm")
) ; ) ;
rooted = false ; rooted = false ;
branch_scale = 3. ; branch_scale = 10. ;
ne_s = 4., 4. ; ne_s = 4., 4. ;
} }
...@@ -63,10 +68,10 @@ let methods = Simulation_dataset.[ ...@@ -63,10 +68,10 @@ let methods = Simulation_dataset.[
meth topological "topological" ~requires_rooted_tree:true ; meth topological "topological" ~requires_rooted_tree:true ;
] ]
let benchmark { tree = t ; rooted ; ne_s ; branch_scale } = let benchmark { tree = t ; rooted ; ne_s ; branch_scale ; _ } =
let open Simulation_dataset in let open Simulation_dataset in
let sim = let sim =
convdet_simulation ~seed:3 ~tree:(NHX t) ~branch_scale ~ne_s convdet_simulation ~seed:42 ~tree:(NHX t) ~branch_scale ~ne_s
~profiles:"example/aa_fitness/263SelectedProfiles.tsv" ~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~n_h0:900 ~n_ha:100 () ~n_h0:900 ~n_ha:100 ()
in in
...@@ -80,6 +85,25 @@ let benchmark { tree = t ; rooted ; ne_s ; branch_scale } = ...@@ -80,6 +85,25 @@ let benchmark { tree = t ; rooted ; ne_s ; branch_scale } =
in in
Utils.average_precision_plot ~oracle:(oracle sim) ~labels ~results Utils.average_precision_plot ~oracle:(oracle sim) ~labels ~results
let benchmark_rds ?(seed = 42) { tree = t ; rooted ; ne_s ; branch_scale ; _ } =
let open Simulation_dataset in
let param =
Convdet_simulation_param.make ~seed ~tree:(NHX t) ~branch_scale ~ne_s
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~n_h0:900 ~n_ha:100 ()
in
let sim = convdet_simulation_of_param param in
let simulation = Convdet_simulation_param.simulation param in
let results, labels =
List.filter_map methods ~f:(fun m ->
if not m.requires_rooted_tree || rooted then
Some ((m.result sim, m.col), m.label)
else None
)
|> List.unzip
in
benchmark_statistics simulation ~labels ~results
let report = let report =
let module H = Bistro_utils.Html_report in let module H = Bistro_utils.Html_report in
H.make ~title:"Codepi benchmark" [ H.make ~title:"Codepi benchmark" [
...@@ -98,7 +122,13 @@ let report = ...@@ -98,7 +122,13 @@ let report =
|> H.render |> H.render
let () = let () =
let w = report in let open Bistro_utils.Repo in
let datasets = [ besnard2009 ; rubisco ; oneline_rodent ; orthomam_echolocation ] in
let repo =
item ["report.html"] report
:: List.map datasets ~f:(fun d ->
item [d.label ^ ".rds"] (benchmark_rds d)
)
in
let loggers = [ Bistro_utils.Console_logger.create () ] in let loggers = [ Bistro_utils.Console_logger.create () ] in
Bistro_engine.Scheduler.simple_eval_exn ~loggers ~np:4 ~mem:(`GB 4) (Bistro.Workflow.path w) build_main ~loggers ~np:4 ~mem:(`GB 4) repo ~outdir:"res"
|> print_endline
...@@ -15,6 +15,42 @@ type tree = ...@@ -15,6 +15,42 @@ type tree =
branch_length2 : float ; branch_length2 : float ;
} }
let tree_workflow = function
| NHX w -> w
| Pair_tree { branch_length1 ;
branch_length2 ;
npairs } ->
Simulator.pair_tree ~branch_length1 ~branch_length2 ~npairs
module Convdet_simulation_param = struct
type t = {
tree : tree ;
branch_scale : float ;
profiles : string ;
n_h0 : int ;
n_ha : int ;
ne_s : float * float ;
gBGC : float * float ;
seed : int ;
}
let make ?(branch_scale = 1.) ?(ne_s = 1., 1.) ?(gBGC = 0., 0.) ?(seed = 0) ~tree ~profiles ~n_h0 ~n_ha () = {
tree ;
profiles ;
n_h0 ;
n_ha ;
ne_s ;
gBGC ;
branch_scale ;
seed : int ;
}
let simulation { n_h0 ; n_ha ; profiles ; ne_s ; gBGC ; branch_scale ; seed ; tree ; _ } =
let tree = tree_workflow tree in
let fitness_profiles = Workflow.input profiles in
Simulator.simulation ~branch_scale ~n_ha ~n_h0 ~ne_s ~gBGC ~tree ~seed ~fitness_profiles ()
end
type t = type t =
| Bppseqgen_simulation of { | Bppseqgen_simulation of {
hypothesis : Convergence_hypothesis.t ; hypothesis : Convergence_hypothesis.t ;
...@@ -31,16 +67,7 @@ type t = ...@@ -31,16 +67,7 @@ type t =
n_ha : int ; n_ha : int ;
ne_s : float ; ne_s : float ;
} }
| Convdet_simulation of { | Convdet_simulation of Convdet_simulation_param.t
tree : tree ;
branch_scale : float ;
profiles : string ;
n_h0 : int ;
n_ha : int ;
ne_s : float * float ;
gBGC : float * float ;
seed : int ;
}
let bppseqgen_mixed_simulation ?(ne_s = 1.) ?(seed = 0) ~tree ~profiles ~n_h0 ~n_ha () = let bppseqgen_mixed_simulation ?(ne_s = 1.) ?(seed = 0) ~tree ~profiles ~n_h0 ~n_ha () =
Bppseqgen_mixed { Bppseqgen_mixed {
...@@ -61,28 +88,16 @@ let bppseqgen_simulation ~hyp ~tree ~profiles ~nb_sites ~seed = ...@@ -61,28 +88,16 @@ let bppseqgen_simulation ~hyp ~tree ~profiles ~nb_sites ~seed =
seed ; seed ;
} }
let convdet_simulation ?(branch_scale = 1.) ?(ne_s = 1., 1.) ?(gBGC = 0., 0.) ?(seed = 0) ~tree ~profiles ~n_h0 ~n_ha () = let convdet_simulation ?branch_scale ?ne_s ?gBGC ?seed ~tree ~profiles ~n_h0 ~n_ha () =
Convdet_simulation { Convdet_simulation (Convdet_simulation_param.make ?branch_scale ?ne_s ?gBGC ?seed ~tree ~profiles ~n_h0 ~n_ha ())
tree ;
profiles ; let convdet_simulation_of_param p = Convdet_simulation p
n_h0 ;
n_ha ;
ne_s ;
gBGC ;
branch_scale ;
seed : int ;
}
let tree ~branch_length_unit:_ = function let tree ~branch_length_unit:_ = function
| Bppseqgen_simulation { tree ; _ } | Bppseqgen_simulation { tree ; _ }
| Bppseqgen_mixed { tree ; _ } | Bppseqgen_mixed { tree ; _ }
| Convdet_simulation { tree ; _ } -> | Convdet_simulation { tree ; _ } ->
match tree with tree_workflow tree
| NHX w -> w
| Pair_tree { branch_length1 ;
branch_length2 ;
npairs } ->
Simulator.pair_tree ~branch_length1 ~branch_length2 ~npairs
let seed = function let seed = function
| Bppseqgen_mixed s -> s.seed | Bppseqgen_mixed s -> s.seed
...@@ -115,10 +130,8 @@ let rec nucleotide_alignment = function ...@@ -115,10 +130,8 @@ let rec nucleotide_alignment = function
let h0 = nucleotide_alignment (Bppseqgen_simulation { hypothesis = H0 (Fixed ne_s) ; profiles ; seed ; nb_sites = n_h0 ; tree }) in let h0 = nucleotide_alignment (Bppseqgen_simulation { hypothesis = H0 (Fixed ne_s) ; profiles ; seed ; nb_sites = n_h0 ; tree }) in
let ha = nucleotide_alignment (Bppseqgen_simulation { hypothesis = HaPC (Fixed ne_s) ; profiles ; seed ; nb_sites = n_ha ; tree }) in let ha = nucleotide_alignment (Bppseqgen_simulation { hypothesis = HaPC (Fixed ne_s) ; profiles ; seed ; nb_sites = n_ha ; tree }) in
Utils.fasta_cappend h0 ha Utils.fasta_cappend h0 ha
| Convdet_simulation { n_h0 ; n_ha ; profiles ; ne_s ; gBGC ; branch_scale ; seed ; _ } as sim -> | Convdet_simulation param ->
let tree = tree ~branch_length_unit:`Nucleotide sim in Convdet_simulation_param.simulation param
let fitness_profiles = Workflow.input profiles in
Simulator.simulation ~branch_scale ~n_ha ~n_h0 ~ne_s ~gBGC ~tree ~seed ~fitness_profiles ()
|> Simulator.alignment_of_simulation |> Simulator.alignment_of_simulation
include Detection_pipeline.Make(struct include Detection_pipeline.Make(struct
...@@ -194,6 +207,125 @@ let benchmark2 d = ...@@ -194,6 +207,125 @@ let benchmark2 d =
tdg09 d, 1 ; tdg09 d, 1 ;
] ]
let%pworkflow benchmark_statistics simulation ~labels ~results =
let open Phylogenetics in
let open Codepitk in
let open OCamlR_base in
let open Codepitk.Simulator.Site_independent_mutsel in
let load_results fn col =
let open Codepitk in
let df = Dataframe.from_file fn |> Rresult.R.failwith_error_msg in
match Dataframe.get_col df col with
| Some (Floats xs) -> Array.map xs ~f:Option.some
| Some (Float_opts xs) -> xs
| Some (Ints xs) -> Array.map xs ~f:(fun i -> Some (Float.of_int i))
| Some (Int_opts xs) -> Array.map xs ~f:(Option.map ~f:Float.of_int)
| _ -> failwith "expected a numeric column at pos 1"
in
let sim : simulation = [%eval simulation] in
let result_paths = [%eval Bistro.Workflow.path_list (List.map ~f:fst results)] in
let results = List.map2_exn result_paths (List.map ~f:snd results) ~f:load_results in
let labels = [%param labels] in
let n_h0 = Array.length sim.h0_params in
let n_ha = Array.length sim.ha_params in
let nsites = n_h0 + n_ha in
let columns = List.map2_exn labels results ~f:(fun l r ->
l, `Numeric (Numeric.of_array_opt r)
) in
let amino_acid_vector_of_codon_vector xs =
let module Codon = Codon.Universal_genetic_code.NS in
Amino_acid.Vector.init (fun aa ->
List.fold Codon.all ~init:0. ~f:(fun acc c ->
if Amino_acid.equal aa (Codon.aa_of_codon c) then
acc +. xs.Codon.%(c)
else acc
)
)
in
let collect_profiles sel =
Array.append sim.h0_params sim.ha_params
|> Array.map ~f:(fun p ->
sel p
|> Mutsel.stationary_distribution
|> amino_acid_vector_of_codon_vector
|> Amino_acid.Vector.to_array
)
|> Numeric.Matrix.of_arrays
in
let ancestral_profiles = collect_profiles fst in
let convergent_profiles = collect_profiles snd in
let counts seqs i =
let t =
Amino_acid.Table.init (fun aa ->
let aa = Amino_acid.to_char aa in
List.count seqs ~f:(fun s -> Char.equal (s : Dna.t :> string).[i] aa)
)
in
(t :> int array)
in
let collect_counts cond =
let species = Convergence_tree.leaves sim.tree in
let seqs =
List.map2_exn sim.sequences species ~f:(fun s (_, cond_s) ->
if Poly.equal cond cond_s then Some s else None
)
|> List.filter_opt
in
Array.init nsites ~f:(counts seqs)
|> Integer.Matrix.of_arrays
in
let ancestral_counts = collect_counts `Ancestral in
let convergent_counts = collect_counts `Convergent in
let make_classification_data x y =
Prc.Classification_data (
List.init (Array.length x) ~f:(fun i ->
match x.(i), y.(i) with
| Some x_i, Some y_i -> Some (x_i, y_i)
| None, _ | _, None -> None
)
|> List.filter_opt
)
in
let estimates, lower_bounds, upper_bounds =
let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in
List.map results ~f:(fun scores ->
let Prc.Classification_data xs as data = make_classification_data scores oracle in
let n = List.count xs ~f:snd in
let theta_hat = Prc.auc_trapezoidal_lt data in
let lb, ub = Prc.logit_confidence_interval ~alpha:0.05 ~theta_hat ~n in
theta_hat, lb, ub
)
|> List.unzip3
in
let open OCamlR_base in
let auc_estimates = Dataframe.create [
"method", `Character (Character.of_list labels) ;
"estimate", `Numeric (Numeric.of_list estimates) ;
"lower_bound", `Numeric (Numeric.of_list lower_bounds) ;
"upper_bound", `Numeric (Numeric.of_list upper_bounds) ;
]
in
let oracle =
Array.(
append
(map sim.h0_profiles ~f:(Fn.const false))
(map sim.ha_profiles ~f:(Fn.const true))
)
|> Logical.of_array
in
let results = Dataframe.create columns in
let data = List_.create [
Some "results", Dataframe.to_sexp results ;
Some "oracle", Logical.to_sexp oracle ;
Some "ancestral_profiles", Numeric.Matrix.to_sexp ancestral_profiles ;
Some "convergent_profiles", Numeric.Matrix.to_sexp convergent_profiles ;
Some "ancestral_counts", Integer.Matrix.to_sexp ancestral_counts ;
Some "convergent_counts", Integer.Matrix.to_sexp convergent_counts ;
Some "auc_estimates", Dataframe.to_sexp auc_estimates ;
]
in
saveRDS ~file:[%dest] (List_.to_sexp data)
(* param exploration for SMBE paper *) (* param exploration for SMBE paper *)
(* type branch_scale_t = float *) (* type branch_scale_t = float *)
let branch_scale_range = [ 1.; 3.; 6.; 9. ] let branch_scale_range = [ 1.; 3.; 6.; 9. ]
......
...@@ -29,6 +29,24 @@ val bppseqgen_simulation : ...@@ -29,6 +29,24 @@ val bppseqgen_simulation :
seed:int -> seed:int ->
t t
module Convdet_simulation_param : sig
type t
val make :
?branch_scale:float ->
?ne_s:float * float ->
?gBGC:float * float ->
?seed:int ->
tree:tree ->
profiles:string ->
n_h0:int ->
n_ha:int ->
unit ->
t
val simulation :
t ->
Codepitk.Simulator.Site_independent_mutsel.simulation workflow
end
val convdet_simulation : val convdet_simulation :
?branch_scale:float -> ?branch_scale:float ->
?ne_s:float * float -> ?ne_s:float * float ->
...@@ -41,6 +59,8 @@ val convdet_simulation : ...@@ -41,6 +59,8 @@ val convdet_simulation :
unit -> unit ->
t t
val convdet_simulation_of_param : Convdet_simulation_param.t -> t
include Detection_pipeline.Query with type t := t include Detection_pipeline.Query with type t := t
include Detection_pipeline.S with type query := t include Detection_pipeline.S with type query := t
...@@ -55,6 +75,12 @@ val benchmark : ?mode:[ `fast | `full ] -> t -> svg file ...@@ -55,6 +75,12 @@ val benchmark : ?mode:[ `fast | `full ] -> t -> svg file
val benchmark2 : t -> pdf file val benchmark2 : t -> pdf file
val benchmark_statistics :
Codepitk.Simulator.Site_independent_mutsel.simulation workflow ->
labels:string list ->
results:(text file * int) list ->
binary_file file
(** stuff for gbgc SBME paper *) (** stuff for gbgc SBME paper *)
type gBGC_t = type gBGC_t =
| Global of float | Global of float
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment