Commit 8d3102fe authored by Philippe Veber's avatar Philippe Veber
Browse files

Simulation_pipeline.Mutsel: separate benchmark of its rds output

parent 54d6b1a3
...@@ -79,6 +79,8 @@ end ...@@ -79,6 +79,8 @@ end
module Mutsel = struct module Mutsel = struct
open Phylogenetics
type query = Mutsel_query.t type query = Mutsel_query.t
let query = Mutsel_query.make let query = Mutsel_query.make
let simulation = Mutsel_query.simulation let simulation = Mutsel_query.simulation
...@@ -87,36 +89,34 @@ module Mutsel = struct ...@@ -87,36 +89,34 @@ module Mutsel = struct
type benchmark = { type benchmark = {
n_h0 : int ;
n_ha : int ;
method_labels : string list ; method_labels : string list ;
method_outputs : float option array list ; method_outputs : float option array list ;
average_precision : (float * (float * float)) list ; average_precision : (float * (float * float)) list ;
site_model : [`Convergent | `Non_convergent] array ; profiles : (float array * float array) array ;
ancestral_counts : int Phylogenetics.Amino_acid.table array ; ancestral_counts : int Amino_acid.table array ;
convergent_counts : int Phylogenetics.Amino_acid.table array ; convergent_counts : int Amino_acid.table array ;
} }
let%pworkflow benchmark_statistics simulation ~results = let%workflow benchmark q methods =
let open Phylogenetics in let open Phylogenetics in
let open Codepitk in let open Codepitk in
let open OCamlR_base in
let open Codepitk.Simulator.Site_independent_mutsel in let open Codepitk.Simulator.Site_independent_mutsel in
let module Codon = Codon.Universal_genetic_code.NS in let module Codon = Codon.Universal_genetic_code.NS in
let sim : simulation = [%eval simulation] in let sim : simulation = [%eval simulation q] in
let result_paths = [%eval Bistro.Workflow.path_list results] in let result_paths = [%eval Bistro.Workflow.path_list (List.map methods ~f:(fun f -> f q))] in
let results = let results =
List.map result_paths ~f:Cpt.of_file List.map result_paths ~f:Cpt.of_file
|> Result.all |> Result.all
|> Rresult.R.failwith_error_msg |> Rresult.R.failwith_error_msg
|> List.concat_map ~f:Cpt.columns |> List.concat_map ~f:Cpt.columns
in in
let labels = List.map results ~f:fst in let method_labels, method_outputs = List.unzip results in
let n_h0 = Array.length sim.h0_params in let n_h0 = Array.length sim.h0_params in
let n_ha = Array.length sim.ha_params in let n_ha = Array.length sim.ha_params in
let nsites = n_h0 + n_ha in let nsites = n_h0 + n_ha in
let columns = List.map results ~f:(fun (l, r) ->
l, `Numeric (Numeric.of_array_opt r)
) in
let amino_acid_vector_of_codon_vector xs = let amino_acid_vector_of_codon_vector xs =
Amino_acid.Vector.init (fun aa -> Amino_acid.Vector.init (fun aa ->
List.fold Codon.all ~init:0. ~f:(fun acc c -> List.fold Codon.all ~init:0. ~f:(fun acc c ->
...@@ -126,32 +126,28 @@ module Mutsel = struct ...@@ -126,32 +126,28 @@ module Mutsel = struct
) )
) )
in in
let collect_profiles sel = let compute_profile p =
Mutsel.stationary_distribution p
|> amino_acid_vector_of_codon_vector
|> Amino_acid.Vector.to_array
in
let profiles =
Array.append sim.h0_params sim.ha_params Array.append sim.h0_params sim.ha_params
|> Array.map ~f:(fun p -> |> Array.map ~f:(fun (p1, p2) ->
sel p compute_profile p1, compute_profile p2
|> Mutsel.stationary_distribution
|> amino_acid_vector_of_codon_vector
|> Amino_acid.Vector.to_array
) )
|> Numeric.Matrix.of_arrays
in in
let ancestral_profiles = collect_profiles fst in
let convergent_profiles = collect_profiles snd in
let counts seqs i = let counts seqs i =
let t = Amino_acid.Table.init (fun aa ->
Amino_acid.Table.init (fun aa -> let aa = Amino_acid.to_char aa in
let aa = Amino_acid.to_char aa in List.count seqs ~f:(fun s ->
List.count seqs ~f:(fun s -> let codon_str = String.sub (s : Dna.t :> string) ~pos:(i * 3) ~len:3 in
let codon_str = String.sub (s : Dna.t :> string) ~pos:(i * 3) ~len:3 in let codon = match Codon.of_string codon_str with
let codon = match Codon.of_string codon_str with | Some c -> c
| Some c -> c | None -> assert false
| None -> assert false in
in Char.equal (Amino_acid.to_char (Codon.aa_of_codon codon)) aa)
Char.equal (Amino_acid.to_char (Codon.aa_of_codon codon)) aa) )
)
in
(t :> int array)
in in
let collect_counts cond = let collect_counts cond =
let species = Convergence_tree.leaves sim.tree in let species = Convergence_tree.leaves sim.tree in
...@@ -162,7 +158,6 @@ module Mutsel = struct ...@@ -162,7 +158,6 @@ module Mutsel = struct
|> List.filter_opt |> List.filter_opt
in in
Array.init nsites ~f:(counts seqs) Array.init nsites ~f:(counts seqs)
|> Integer.Matrix.of_arrays
in in
let ancestral_counts = collect_counts `Ancestral in let ancestral_counts = collect_counts `Ancestral in
let convergent_counts = collect_counts `Convergent in let convergent_counts = collect_counts `Convergent in
...@@ -176,33 +171,56 @@ module Mutsel = struct ...@@ -176,33 +171,56 @@ module Mutsel = struct
|> List.filter_opt |> List.filter_opt
) )
in in
let estimates, lower_bounds, upper_bounds = let average_precision =
let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in
List.map results ~f:(fun (_, scores) -> List.map results ~f:(fun (_, scores) ->
let Prc.Classification_data xs as data = make_classification_data scores oracle in let Prc.Classification_data xs as data = make_classification_data scores oracle in
let n = List.count xs ~f:snd in let n = List.count xs ~f:snd in
let theta_hat = Prc.auc_trapezoidal_lt data in let theta_hat = Prc.auc_trapezoidal_lt data in
let lb, ub = Prc.logit_confidence_interval ~alpha:0.05 ~theta_hat ~n in let lb, ub = Prc.logit_confidence_interval ~alpha:0.05 ~theta_hat ~n in
theta_hat, lb, ub theta_hat, (lb, ub)
) )
|> List.unzip3
in in
{
method_labels ; method_outputs ;
ancestral_counts ; convergent_counts ;
average_precision ; profiles ; n_h0 ; n_ha
}
let%pworkflow rds_of_benchmark b =
let { average_precision ; method_labels ;
n_h0 ; n_ha ; method_outputs ; profiles ;
ancestral_counts ; convergent_counts } = [%eval b] in
let open OCamlR_base in let open OCamlR_base in
let auc_estimates = Dataframe.create [ let collect_profiles sel =
"method", `Character (Character.of_list labels) ; Array.map profiles ~f:sel
|> Numeric.Matrix.of_arrays
in
let ancestral_profiles = collect_profiles fst in
let convergent_profiles = collect_profiles snd in
let collect_counts c =
Array.map c ~f:(fun c -> (c : int Amino_acid.table :> int array))
|> Integer.Matrix.of_arrays
in
let ancestral_counts = collect_counts ancestral_counts in
let convergent_counts = collect_counts convergent_counts in
let auc_estimates =
let estimates, bounds = List.unzip average_precision in
let lower_bounds, upper_bounds = List.unzip bounds in
Dataframe.create [
"method", `Character (Character.of_list method_labels) ;
"estimate", `Numeric (Numeric.of_list estimates) ; "estimate", `Numeric (Numeric.of_list estimates) ;
"lower_bound", `Numeric (Numeric.of_list lower_bounds) ; "lower_bound", `Numeric (Numeric.of_list lower_bounds) ;
"upper_bound", `Numeric (Numeric.of_list upper_bounds) ; "upper_bound", `Numeric (Numeric.of_list upper_bounds) ;
] ]
in in
let oracle = let oracle =
Array.( Array.init (n_h0 + n_ha) ~f:(fun i -> if i < n_h0 then false else true)
append
(map sim.h0_profiles ~f:(Fn.const false))
(map sim.ha_profiles ~f:(Fn.const true))
)
|> Logical.of_array |> Logical.of_array
in in
let columns = List.map2_exn method_labels method_outputs ~f:(fun l r ->
l, `Numeric (Numeric.of_array_opt r)
) in
let results = Dataframe.create columns in let results = Dataframe.create columns in
let data = List_.create [ let data = List_.create [
Some "results", Dataframe.to_sexp results ; Some "results", Dataframe.to_sexp results ;
......
...@@ -38,23 +38,20 @@ module Mutsel : sig ...@@ -38,23 +38,20 @@ module Mutsel : sig
type benchmark = { type benchmark = {
n_h0 : int ;
n_ha : int ;
method_labels : string list ; method_labels : string list ;
method_outputs : float option array list ; method_outputs : float option array list ;
average_precision : (float * (float * float)) list ; average_precision : (float * (float * float)) list ;
site_model : [`Convergent | `Non_convergent] array ; profiles : (float array * float array) array ;
ancestral_counts : int Phylogenetics.Amino_acid.table array ; ancestral_counts : int Phylogenetics.Amino_acid.table array ;
convergent_counts : int Phylogenetics.Amino_acid.table array ; convergent_counts : int Phylogenetics.Amino_acid.table array ;
} }
val benchmark_statistics : val benchmark : query -> (query -> cpt file) list -> benchmark workflow
Codepitk.Simulator.Site_independent_mutsel.simulation workflow ->
results:cpt file list ->
binary_file file
(* val benchmark : t -> (t -> cpt file) list -> benchmark workflow val rds_of_benchmark : benchmark workflow -> rds file
*
* val rds_of_benchmark : benchmark workflow -> rds file *)
(** stuff for gbgc SBME paper *) (** stuff for gbgc SBME paper *)
type gBGC_t = type gBGC_t =
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment