Commit 74b927db authored by Philippe Veber's avatar Philippe Veber
Browse files

generate R-readable summary out of benchmark

parent 4434509f
......@@ -4,6 +4,7 @@ open Codepi
open Codepi.File_formats
type dataset = {
label : string ;
tree : nhx file ;
rooted : bool ;
branch_scale : float ;
......@@ -11,27 +12,31 @@ type dataset = {
}
let besnard2009 = {
label = "besnard2009" ;
tree = Bistro.Workflow.input "data/besnard2009/besnard2009.nhx" ;
rooted = true ;
branch_scale = 3. ;
branch_scale = 10. ;
ne_s = 4., 4. ;
}
let oneline_rodent = {
label = "online_rodent" ;
tree = Bistro.Workflow.input "data/online_rodent/online_rodent.nhx" ;
rooted = true ;
branch_scale = 3. ;
branch_scale = 10. ;
ne_s = 4., 4. ;
}
let rubisco = {
label = "rubisco" ;
tree = Rubisco_dataset.(Path "data/rubisco" |> Query.tree ~branch_length_unit:`Amino_acid) ;
rooted = false ;
branch_scale = 3. ;
branch_scale = 10. ;
ne_s = 4., 4. ;
}
let orthomam_echolocation = {
label = "orthomam_echolocation" ;
tree = (
Orthomam.tree_of_db
~branch_length_unit:`Amino_acid
......@@ -39,7 +44,7 @@ let orthomam_echolocation = {
(Codepitk.Orthomam_db.make "omm")
) ;
rooted = false ;
branch_scale = 3. ;
branch_scale = 10. ;
ne_s = 4., 4. ;
}
......@@ -63,10 +68,10 @@ let methods = Simulation_dataset.[
meth topological "topological" ~requires_rooted_tree:true ;
]
let benchmark { tree = t ; rooted ; ne_s ; branch_scale } =
let benchmark { tree = t ; rooted ; ne_s ; branch_scale ; _ } =
let open Simulation_dataset in
let sim =
convdet_simulation ~seed:3 ~tree:(NHX t) ~branch_scale ~ne_s
convdet_simulation ~seed:42 ~tree:(NHX t) ~branch_scale ~ne_s
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~n_h0:900 ~n_ha:100 ()
in
......@@ -80,6 +85,25 @@ let benchmark { tree = t ; rooted ; ne_s ; branch_scale } =
in
Utils.average_precision_plot ~oracle:(oracle sim) ~labels ~results
let benchmark_rds ?(seed = 42) { tree = t ; rooted ; ne_s ; branch_scale ; _ } =
let open Simulation_dataset in
let param =
Convdet_simulation_param.make ~seed ~tree:(NHX t) ~branch_scale ~ne_s
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~n_h0:900 ~n_ha:100 ()
in
let sim = convdet_simulation_of_param param in
let simulation = Convdet_simulation_param.simulation param in
let results, labels =
List.filter_map methods ~f:(fun m ->
if not m.requires_rooted_tree || rooted then
Some ((m.result sim, m.col), m.label)
else None
)
|> List.unzip
in
benchmark_statistics simulation ~labels ~results
let report =
let module H = Bistro_utils.Html_report in
H.make ~title:"Codepi benchmark" [
......@@ -98,7 +122,13 @@ let report =
|> H.render
let () =
let w = report in
let open Bistro_utils.Repo in
let datasets = [ besnard2009 ; rubisco ; oneline_rodent ; orthomam_echolocation ] in
let repo =
item ["report.html"] report
:: List.map datasets ~f:(fun d ->
item [d.label ^ ".rds"] (benchmark_rds d)
)
in
let loggers = [ Bistro_utils.Console_logger.create () ] in
Bistro_engine.Scheduler.simple_eval_exn ~loggers ~np:4 ~mem:(`GB 4) (Bistro.Workflow.path w)
|> print_endline
build_main ~loggers ~np:4 ~mem:(`GB 4) repo ~outdir:"res"
......@@ -15,6 +15,42 @@ type tree =
branch_length2 : float ;
}
let tree_workflow = function
| NHX w -> w
| Pair_tree { branch_length1 ;
branch_length2 ;
npairs } ->
Simulator.pair_tree ~branch_length1 ~branch_length2 ~npairs
module Convdet_simulation_param = struct
type t = {
tree : tree ;
branch_scale : float ;
profiles : string ;
n_h0 : int ;
n_ha : int ;
ne_s : float * float ;
gBGC : float * float ;
seed : int ;
}
let make ?(branch_scale = 1.) ?(ne_s = 1., 1.) ?(gBGC = 0., 0.) ?(seed = 0) ~tree ~profiles ~n_h0 ~n_ha () = {
tree ;
profiles ;
n_h0 ;
n_ha ;
ne_s ;
gBGC ;
branch_scale ;
seed : int ;
}
let simulation { n_h0 ; n_ha ; profiles ; ne_s ; gBGC ; branch_scale ; seed ; tree ; _ } =
let tree = tree_workflow tree in
let fitness_profiles = Workflow.input profiles in
Simulator.simulation ~branch_scale ~n_ha ~n_h0 ~ne_s ~gBGC ~tree ~seed ~fitness_profiles ()
end
type t =
| Bppseqgen_simulation of {
hypothesis : Convergence_hypothesis.t ;
......@@ -31,16 +67,7 @@ type t =
n_ha : int ;
ne_s : float ;
}
| Convdet_simulation of {
tree : tree ;
branch_scale : float ;
profiles : string ;
n_h0 : int ;
n_ha : int ;
ne_s : float * float ;
gBGC : float * float ;
seed : int ;
}
| Convdet_simulation of Convdet_simulation_param.t
let bppseqgen_mixed_simulation ?(ne_s = 1.) ?(seed = 0) ~tree ~profiles ~n_h0 ~n_ha () =
Bppseqgen_mixed {
......@@ -61,28 +88,16 @@ let bppseqgen_simulation ~hyp ~tree ~profiles ~nb_sites ~seed =
seed ;
}
let convdet_simulation ?(branch_scale = 1.) ?(ne_s = 1., 1.) ?(gBGC = 0., 0.) ?(seed = 0) ~tree ~profiles ~n_h0 ~n_ha () =
Convdet_simulation {
tree ;
profiles ;
n_h0 ;
n_ha ;
ne_s ;
gBGC ;
branch_scale ;
seed : int ;
}
let convdet_simulation ?branch_scale ?ne_s ?gBGC ?seed ~tree ~profiles ~n_h0 ~n_ha () =
Convdet_simulation (Convdet_simulation_param.make ?branch_scale ?ne_s ?gBGC ?seed ~tree ~profiles ~n_h0 ~n_ha ())
let convdet_simulation_of_param p = Convdet_simulation p
let tree ~branch_length_unit:_ = function
| Bppseqgen_simulation { tree ; _ }
| Bppseqgen_mixed { tree ; _ }
| Convdet_simulation { tree ; _ } ->
match tree with
| NHX w -> w
| Pair_tree { branch_length1 ;
branch_length2 ;
npairs } ->
Simulator.pair_tree ~branch_length1 ~branch_length2 ~npairs
tree_workflow tree
let seed = function
| Bppseqgen_mixed s -> s.seed
......@@ -115,10 +130,8 @@ let rec nucleotide_alignment = function
let h0 = nucleotide_alignment (Bppseqgen_simulation { hypothesis = H0 (Fixed ne_s) ; profiles ; seed ; nb_sites = n_h0 ; tree }) in
let ha = nucleotide_alignment (Bppseqgen_simulation { hypothesis = HaPC (Fixed ne_s) ; profiles ; seed ; nb_sites = n_ha ; tree }) in
Utils.fasta_cappend h0 ha
| Convdet_simulation { n_h0 ; n_ha ; profiles ; ne_s ; gBGC ; branch_scale ; seed ; _ } as sim ->
let tree = tree ~branch_length_unit:`Nucleotide sim in
let fitness_profiles = Workflow.input profiles in
Simulator.simulation ~branch_scale ~n_ha ~n_h0 ~ne_s ~gBGC ~tree ~seed ~fitness_profiles ()
| Convdet_simulation param ->
Convdet_simulation_param.simulation param
|> Simulator.alignment_of_simulation
include Detection_pipeline.Make(struct
......@@ -194,6 +207,125 @@ let benchmark2 d =
tdg09 d, 1 ;
]
let%pworkflow benchmark_statistics simulation ~labels ~results =
let open Phylogenetics in
let open Codepitk in
let open OCamlR_base in
let open Codepitk.Simulator.Site_independent_mutsel in
let load_results fn col =
let open Codepitk in
let df = Dataframe.from_file fn |> Rresult.R.failwith_error_msg in
match Dataframe.get_col df col with
| Some (Floats xs) -> Array.map xs ~f:Option.some
| Some (Float_opts xs) -> xs
| Some (Ints xs) -> Array.map xs ~f:(fun i -> Some (Float.of_int i))
| Some (Int_opts xs) -> Array.map xs ~f:(Option.map ~f:Float.of_int)
| _ -> failwith "expected a numeric column at pos 1"
in
let sim : simulation = [%eval simulation] in
let result_paths = [%eval Bistro.Workflow.path_list (List.map ~f:fst results)] in
let results = List.map2_exn result_paths (List.map ~f:snd results) ~f:load_results in
let labels = [%param labels] in
let n_h0 = Array.length sim.h0_params in
let n_ha = Array.length sim.ha_params in
let nsites = n_h0 + n_ha in
let columns = List.map2_exn labels results ~f:(fun l r ->
l, `Numeric (Numeric.of_array_opt r)
) in
let amino_acid_vector_of_codon_vector xs =
let module Codon = Codon.Universal_genetic_code.NS in
Amino_acid.Vector.init (fun aa ->
List.fold Codon.all ~init:0. ~f:(fun acc c ->
if Amino_acid.equal aa (Codon.aa_of_codon c) then
acc +. xs.Codon.%(c)
else acc
)
)
in
let collect_profiles sel =
Array.append sim.h0_params sim.ha_params
|> Array.map ~f:(fun p ->
sel p
|> Mutsel.stationary_distribution
|> amino_acid_vector_of_codon_vector
|> Amino_acid.Vector.to_array
)
|> Numeric.Matrix.of_arrays
in
let ancestral_profiles = collect_profiles fst in
let convergent_profiles = collect_profiles snd in
let counts seqs i =
let t =
Amino_acid.Table.init (fun aa ->
let aa = Amino_acid.to_char aa in
List.count seqs ~f:(fun s -> Char.equal (s : Dna.t :> string).[i] aa)
)
in
(t :> int array)
in
let collect_counts cond =
let species = Convergence_tree.leaves sim.tree in
let seqs =
List.map2_exn sim.sequences species ~f:(fun s (_, cond_s) ->
if Poly.equal cond cond_s then Some s else None
)
|> List.filter_opt
in
Array.init nsites ~f:(counts seqs)
|> Integer.Matrix.of_arrays
in
let ancestral_counts = collect_counts `Ancestral in
let convergent_counts = collect_counts `Convergent in
let make_classification_data x y =
Prc.Classification_data (
List.init (Array.length x) ~f:(fun i ->
match x.(i), y.(i) with
| Some x_i, Some y_i -> Some (x_i, y_i)
| None, _ | _, None -> None
)
|> List.filter_opt
)
in
let estimates, lower_bounds, upper_bounds =
let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in
List.map results ~f:(fun scores ->
let Prc.Classification_data xs as data = make_classification_data scores oracle in
let n = List.count xs ~f:snd in
let theta_hat = Prc.auc_trapezoidal_lt data in
let lb, ub = Prc.logit_confidence_interval ~alpha:0.05 ~theta_hat ~n in
theta_hat, lb, ub
)
|> List.unzip3
in
let open OCamlR_base in
let auc_estimates = Dataframe.create [
"method", `Character (Character.of_list labels) ;
"estimate", `Numeric (Numeric.of_list estimates) ;
"lower_bound", `Numeric (Numeric.of_list lower_bounds) ;
"upper_bound", `Numeric (Numeric.of_list upper_bounds) ;
]
in
let oracle =
Array.(
append
(map sim.h0_profiles ~f:(Fn.const false))
(map sim.ha_profiles ~f:(Fn.const true))
)
|> Logical.of_array
in
let results = Dataframe.create columns in
let data = List_.create [
Some "results", Dataframe.to_sexp results ;
Some "oracle", Logical.to_sexp oracle ;
Some "ancestral_profiles", Numeric.Matrix.to_sexp ancestral_profiles ;
Some "convergent_profiles", Numeric.Matrix.to_sexp convergent_profiles ;
Some "ancestral_counts", Integer.Matrix.to_sexp ancestral_counts ;
Some "convergent_counts", Integer.Matrix.to_sexp convergent_counts ;
Some "auc_estimates", Dataframe.to_sexp auc_estimates ;
]
in
saveRDS ~file:[%dest] (List_.to_sexp data)
(* param exploration for SMBE paper *)
(* type branch_scale_t = float *)
let branch_scale_range = [ 1.; 3.; 6.; 9. ]
......
......@@ -29,6 +29,24 @@ val bppseqgen_simulation :
seed:int ->
t
module Convdet_simulation_param : sig
type t
val make :
?branch_scale:float ->
?ne_s:float * float ->
?gBGC:float * float ->
?seed:int ->
tree:tree ->
profiles:string ->
n_h0:int ->
n_ha:int ->
unit ->
t
val simulation :
t ->
Codepitk.Simulator.Site_independent_mutsel.simulation workflow
end
val convdet_simulation :
?branch_scale:float ->
?ne_s:float * float ->
......@@ -41,6 +59,8 @@ val convdet_simulation :
unit ->
t
val convdet_simulation_of_param : Convdet_simulation_param.t -> t
include Detection_pipeline.Query with type t := t
include Detection_pipeline.S with type query := t
......@@ -55,6 +75,12 @@ val benchmark : ?mode:[ `fast | `full ] -> t -> svg file
val benchmark2 : t -> pdf file
val benchmark_statistics :
Codepitk.Simulator.Site_independent_mutsel.simulation workflow ->
labels:string list ->
results:(text file * int) list ->
binary_file file
(** stuff for gbgc SBME paper *)
type gBGC_t =
| Global of float
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment