Commit 54d6b1a3 authored by Philippe Veber's avatar Philippe Veber
Browse files

Simulation_pipeline.Mutsel.benchmark: use all columns in cpt files

parent ce9c40f6
...@@ -52,17 +52,16 @@ let orthomam_echolocation = { ...@@ -52,17 +52,16 @@ let orthomam_echolocation = {
type detection_method = { type detection_method = {
result : Pipeline.query -> cpt file ; result : Pipeline.query -> cpt file ;
col : int ;
label : string ; label : string ;
requires_rooted_tree : bool ; requires_rooted_tree : bool ;
} }
let meth ?(col = 1) ?(requires_rooted_tree = false) result label = let meth ?(requires_rooted_tree = false) result label =
{ result ; col ; label ; requires_rooted_tree } { result ; label ; requires_rooted_tree }
let methods = Pipeline.[ let methods = Pipeline.[
meth tdg09 "tdg09" ; meth tdg09 "tdg09" ;
meth pcoc ~col:3 "pcoc" ; meth pcoc "pcoc" ;
(* meth pcoc_v2 ~col:3 "pcoc v2" ; *) (* meth pcoc_v2 ~col:3 "pcoc v2" ; *)
meth (gemma ~lmm_test:`Score ~relatedness_mode:`Standardized) "gemma" ; meth (gemma ~lmm_test:`Score ~relatedness_mode:`Standardized) "gemma" ;
meth inhouse_lmm "LMM" ; meth inhouse_lmm "LMM" ;
...@@ -77,15 +76,14 @@ let benchmark_rds ?(seed = 42) { tree = t ; rooted ; ne_s ; branch_scale ; _ } = ...@@ -77,15 +76,14 @@ let benchmark_rds ?(seed = 42) { tree = t ; rooted ; ne_s ; branch_scale ; _ } =
~n_h0:900 ~n_ha:100 () ~n_h0:900 ~n_ha:100 ()
in in
let simulation = Pipeline.simulation q in let simulation = Pipeline.simulation q in
let results, labels = let results =
List.filter_map methods ~f:(fun m -> List.filter_map methods ~f:(fun m ->
if not m.requires_rooted_tree || rooted then if not m.requires_rooted_tree || rooted then
Some ((m.result q, m.col), m.label) Some (m.result q)
else None else None
) )
|> List.unzip
in in
Pipeline.benchmark_statistics simulation ~labels ~results Pipeline.benchmark_statistics simulation ~results
let () = let () =
let open Bistro_utils.Repo in let open Bistro_utils.Repo in
......
...@@ -96,30 +96,25 @@ module Mutsel = struct ...@@ -96,30 +96,25 @@ module Mutsel = struct
} }
let%pworkflow benchmark_statistics simulation ~labels ~results = let%pworkflow benchmark_statistics simulation ~results =
let open Phylogenetics in let open Phylogenetics in
let open Codepitk in let open Codepitk in
let open OCamlR_base in let open OCamlR_base in
let open Codepitk.Simulator.Site_independent_mutsel in let open Codepitk.Simulator.Site_independent_mutsel in
let module Codon = Codon.Universal_genetic_code.NS in let module Codon = Codon.Universal_genetic_code.NS in
let load_results fn col =
let open Codepitk in
let df = Dataframe.from_file fn |> Rresult.R.failwith_error_msg in
match Dataframe.get_col df col with
| Some (Floats xs) -> Array.map xs ~f:Option.some
| Some (Float_opts xs) -> xs
| Some (Ints xs) -> Array.map xs ~f:(fun i -> Some (Float.of_int i))
| Some (Int_opts xs) -> Array.map xs ~f:(Option.map ~f:Float.of_int)
| _ -> failwith "expected a numeric column at pos 1"
in
let sim : simulation = [%eval simulation] in let sim : simulation = [%eval simulation] in
let result_paths = [%eval Bistro.Workflow.path_list (List.map ~f:fst results)] in let result_paths = [%eval Bistro.Workflow.path_list results] in
let results = List.map2_exn result_paths (List.map ~f:snd results) ~f:load_results in let results =
let labels = [%param labels] in List.map result_paths ~f:Cpt.of_file
|> Result.all
|> Rresult.R.failwith_error_msg
|> List.concat_map ~f:Cpt.columns
in
let labels = List.map results ~f:fst in
let n_h0 = Array.length sim.h0_params in let n_h0 = Array.length sim.h0_params in
let n_ha = Array.length sim.ha_params in let n_ha = Array.length sim.ha_params in
let nsites = n_h0 + n_ha in let nsites = n_h0 + n_ha in
let columns = List.map2_exn labels results ~f:(fun l r -> let columns = List.map results ~f:(fun (l, r) ->
l, `Numeric (Numeric.of_array_opt r) l, `Numeric (Numeric.of_array_opt r)
) in ) in
let amino_acid_vector_of_codon_vector xs = let amino_acid_vector_of_codon_vector xs =
...@@ -183,7 +178,7 @@ module Mutsel = struct ...@@ -183,7 +178,7 @@ module Mutsel = struct
in in
let estimates, lower_bounds, upper_bounds = let estimates, lower_bounds, upper_bounds =
let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in
List.map results ~f:(fun scores -> List.map results ~f:(fun (_, scores) ->
let Prc.Classification_data xs as data = make_classification_data scores oracle in let Prc.Classification_data xs as data = make_classification_data scores oracle in
let n = List.count xs ~f:snd in let n = List.count xs ~f:snd in
let theta_hat = Prc.auc_trapezoidal_lt data in let theta_hat = Prc.auc_trapezoidal_lt data in
...@@ -314,134 +309,6 @@ module Mutsel = struct ...@@ -314,134 +309,6 @@ module Mutsel = struct
(* let v = g.gc_stat.gc_variance_among_sequences in (* let v = g.gc_stat.gc_variance_among_sequences in
Float.(v >= 8.388e-05 && v <= 5.262e-02) *) Float.(v >= 8.388e-05 && v <= 5.262e-02) *)
(* let%workflow benchmark_statistics q methods =
* let open Phylogenetics in
* let open Codepitk in
* let open OCamlR_base in
* let open Codepitk.Simulator.Site_independent_mutsel in
* let module Codon = Codon.Universal_genetic_code.NS in
* let results_paths = [%eval Workflow.path_list (List.map methods ~f:(fun f -> f q))] in
* let sim : simulation = [%eval simulation q] in
* let load_results fn =
* let open Codepitk in
* let df = Dataframe.from_file fn |> Rresult.R.failwith_error_msg in
* List.init (Dataframe.ncols df - 1) ~f:(fun i ->
* match Dataframe.get_col df (i + 1) with
* | Some (Floats xs) -> Array.map xs ~f:Option.some
* | Some (Float_opts xs) -> xs
* | Some (Ints xs) -> Array.map xs ~f:(fun i -> Some (Float.of_int i))
* | Some (Int_opts xs) -> Array.map xs ~f:(Option.map ~f:Float.of_int)
* | Some _ -> failwith "expected a numeric column at pos 1"
* | None -> assert false
* )
* in
* let results = List.map2_exn result_paths (List.map ~f:snd results) ~f:load_results in
* let labels = [%param labels] in
* let n_h0 = Array.length sim.h0_params in
* let n_ha = Array.length sim.ha_params in
* let nsites = n_h0 + n_ha in
* let columns = List.map2_exn labels results ~f:(fun l r ->
* l, `Numeric (Numeric.of_array_opt r)
* ) in
* let amino_acid_vector_of_codon_vector xs =
* Amino_acid.Vector.init (fun aa ->
* List.fold Codon.all ~init:0. ~f:(fun acc c ->
* if Amino_acid.equal aa (Codon.aa_of_codon c) then
* acc +. xs.Codon.%(c)
* else acc
* )
* )
* in
* let collect_profiles sel =
* Array.append sim.h0_params sim.ha_params
* |> Array.map ~f:(fun p ->
* sel p
* |> Mutsel.stationary_distribution
* |> amino_acid_vector_of_codon_vector
* |> Amino_acid.Vector.to_array
* )
* |> Numeric.Matrix.of_arrays
* in
* let ancestral_profiles = collect_profiles fst in
* let convergent_profiles = collect_profiles snd in
* let counts seqs i =
* let t =
* Amino_acid.Table.init (fun aa ->
* let aa = Amino_acid.to_char aa in
* List.count seqs ~f:(fun s ->
* let codon_str = String.sub (s : Dna.t :> string) ~pos:(i * 3) ~len:3 in
* let codon = match Codon.of_string codon_str with
* | Some c -> c
* | None -> assert false
* in
* Char.equal (Amino_acid.to_char (Codon.aa_of_codon codon)) aa)
* )
* in
* (t :> int array)
* in
* let collect_counts cond =
* let species = Convergence_tree.leaves sim.tree in
* let seqs =
* List.map2_exn sim.sequences species ~f:(fun s (_, cond_s) ->
* if Poly.equal cond cond_s then Some s else None
* )
* |> List.filter_opt
* in
* Array.init nsites ~f:(counts seqs)
* |> Integer.Matrix.of_arrays
* in
* let ancestral_counts = collect_counts `Ancestral in
* let convergent_counts = collect_counts `Convergent in
* let make_classification_data x y =
* Prc.Classification_data (
* List.init (Array.length x) ~f:(fun i ->
* match x.(i), y.(i) with
* | Some x_i, Some y_i -> Some (x_i, y_i)
* | None, _ | _, None -> None
* )
* |> List.filter_opt
* )
* in
* let estimates, lower_bounds, upper_bounds =
* let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in
* List.map results ~f:(fun scores ->
* let Prc.Classification_data xs as data = make_classification_data scores oracle in
* let n = List.count xs ~f:snd in
* let theta_hat = Prc.auc_trapezoidal_lt data in
* let lb, ub = Prc.logit_confidence_interval ~alpha:0.05 ~theta_hat ~n in
* theta_hat, lb, ub
* )
* |> List.unzip3
* in
* let open OCamlR_base in
* let auc_estimates = Dataframe.create [
* "method", `Character (Character.of_list labels) ;
* "estimate", `Numeric (Numeric.of_list estimates) ;
* "lower_bound", `Numeric (Numeric.of_list lower_bounds) ;
* "upper_bound", `Numeric (Numeric.of_list upper_bounds) ;
* ]
* in
* let oracle =
* Array.(
* append
* (map sim.h0_profiles ~f:(Fn.const false))
* (map sim.ha_profiles ~f:(Fn.const true))
* )
* |> Logical.of_array
* in
* let results = Dataframe.create columns in
* let data = List_.create [
* Some "results", Dataframe.to_sexp results ;
* Some "oracle", Logical.to_sexp oracle ;
* Some "ancestral_profiles", Numeric.Matrix.to_sexp ancestral_profiles ;
* Some "convergent_profiles", Numeric.Matrix.to_sexp convergent_profiles ;
* Some "ancestral_counts", Integer.Matrix.to_sexp ancestral_counts ;
* Some "convergent_counts", Integer.Matrix.to_sexp convergent_counts ;
* Some "auc_estimates", Dataframe.to_sexp auc_estimates ;
* ]
* in
* saveRDS ~file:[%dest] (List_.to_sexp data) *)
end end
module Bppseqgen = struct module Bppseqgen = struct
......
...@@ -49,8 +49,7 @@ module Mutsel : sig ...@@ -49,8 +49,7 @@ module Mutsel : sig
val benchmark_statistics : val benchmark_statistics :
Codepitk.Simulator.Site_independent_mutsel.simulation workflow -> Codepitk.Simulator.Site_independent_mutsel.simulation workflow ->
labels:string list -> results:cpt file list ->
results:(cpt file * int) list ->
binary_file file binary_file file
(* val benchmark : t -> (t -> cpt file) list -> benchmark workflow (* val benchmark : t -> (t -> cpt file) list -> benchmark workflow
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment