Commit 8d3102fe authored by Philippe Veber's avatar Philippe Veber
Browse files

Simulation_pipeline.Mutsel: separate benchmark of its rds output

parent 54d6b1a3
......@@ -79,6 +79,8 @@ end
module Mutsel = struct
open Phylogenetics
type query = Mutsel_query.t
let query = Mutsel_query.make
let simulation = Mutsel_query.simulation
......@@ -87,36 +89,34 @@ module Mutsel = struct
type benchmark = {
n_h0 : int ;
n_ha : int ;
method_labels : string list ;
method_outputs : float option array list ;
average_precision : (float * (float * float)) list ;
site_model : [`Convergent | `Non_convergent] array ;
ancestral_counts : int Phylogenetics.Amino_acid.table array ;
convergent_counts : int Phylogenetics.Amino_acid.table array ;
profiles : (float array * float array) array ;
ancestral_counts : int Amino_acid.table array ;
convergent_counts : int Amino_acid.table array ;
}
let%pworkflow benchmark_statistics simulation ~results =
let%workflow benchmark q methods =
let open Phylogenetics in
let open Codepitk in
let open OCamlR_base in
let open Codepitk.Simulator.Site_independent_mutsel in
let module Codon = Codon.Universal_genetic_code.NS in
let sim : simulation = [%eval simulation] in
let result_paths = [%eval Bistro.Workflow.path_list results] in
let sim : simulation = [%eval simulation q] in
let result_paths = [%eval Bistro.Workflow.path_list (List.map methods ~f:(fun f -> f q))] in
let results =
List.map result_paths ~f:Cpt.of_file
|> Result.all
|> Rresult.R.failwith_error_msg
|> List.concat_map ~f:Cpt.columns
in
let labels = List.map results ~f:fst in
let method_labels, method_outputs = List.unzip results in
let n_h0 = Array.length sim.h0_params in
let n_ha = Array.length sim.ha_params in
let nsites = n_h0 + n_ha in
let columns = List.map results ~f:(fun (l, r) ->
l, `Numeric (Numeric.of_array_opt r)
) in
let amino_acid_vector_of_codon_vector xs =
Amino_acid.Vector.init (fun aa ->
List.fold Codon.all ~init:0. ~f:(fun acc c ->
......@@ -126,32 +126,28 @@ module Mutsel = struct
)
)
in
let collect_profiles sel =
let compute_profile p =
Mutsel.stationary_distribution p
|> amino_acid_vector_of_codon_vector
|> Amino_acid.Vector.to_array
in
let profiles =
Array.append sim.h0_params sim.ha_params
|> Array.map ~f:(fun p ->
sel p
|> Mutsel.stationary_distribution
|> amino_acid_vector_of_codon_vector
|> Amino_acid.Vector.to_array
|> Array.map ~f:(fun (p1, p2) ->
compute_profile p1, compute_profile p2
)
|> Numeric.Matrix.of_arrays
in
let ancestral_profiles = collect_profiles fst in
let convergent_profiles = collect_profiles snd in
let counts seqs i =
let t =
Amino_acid.Table.init (fun aa ->
let aa = Amino_acid.to_char aa in
List.count seqs ~f:(fun s ->
let codon_str = String.sub (s : Dna.t :> string) ~pos:(i * 3) ~len:3 in
let codon = match Codon.of_string codon_str with
| Some c -> c
| None -> assert false
in
Char.equal (Amino_acid.to_char (Codon.aa_of_codon codon)) aa)
)
in
(t :> int array)
Amino_acid.Table.init (fun aa ->
let aa = Amino_acid.to_char aa in
List.count seqs ~f:(fun s ->
let codon_str = String.sub (s : Dna.t :> string) ~pos:(i * 3) ~len:3 in
let codon = match Codon.of_string codon_str with
| Some c -> c
| None -> assert false
in
Char.equal (Amino_acid.to_char (Codon.aa_of_codon codon)) aa)
)
in
let collect_counts cond =
let species = Convergence_tree.leaves sim.tree in
......@@ -162,7 +158,6 @@ module Mutsel = struct
|> List.filter_opt
in
Array.init nsites ~f:(counts seqs)
|> Integer.Matrix.of_arrays
in
let ancestral_counts = collect_counts `Ancestral in
let convergent_counts = collect_counts `Convergent in
......@@ -176,33 +171,56 @@ module Mutsel = struct
|> List.filter_opt
)
in
let estimates, lower_bounds, upper_bounds =
let average_precision =
let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in
List.map results ~f:(fun (_, scores) ->
let Prc.Classification_data xs as data = make_classification_data scores oracle in
let n = List.count xs ~f:snd in
let theta_hat = Prc.auc_trapezoidal_lt data in
let lb, ub = Prc.logit_confidence_interval ~alpha:0.05 ~theta_hat ~n in
theta_hat, lb, ub
theta_hat, (lb, ub)
)
|> List.unzip3
in
{
method_labels ; method_outputs ;
ancestral_counts ; convergent_counts ;
average_precision ; profiles ; n_h0 ; n_ha
}
let%pworkflow rds_of_benchmark b =
let { average_precision ; method_labels ;
n_h0 ; n_ha ; method_outputs ; profiles ;
ancestral_counts ; convergent_counts } = [%eval b] in
let open OCamlR_base in
let auc_estimates = Dataframe.create [
"method", `Character (Character.of_list labels) ;
let collect_profiles sel =
Array.map profiles ~f:sel
|> Numeric.Matrix.of_arrays
in
let ancestral_profiles = collect_profiles fst in
let convergent_profiles = collect_profiles snd in
let collect_counts c =
Array.map c ~f:(fun c -> (c : int Amino_acid.table :> int array))
|> Integer.Matrix.of_arrays
in
let ancestral_counts = collect_counts ancestral_counts in
let convergent_counts = collect_counts convergent_counts in
let auc_estimates =
let estimates, bounds = List.unzip average_precision in
let lower_bounds, upper_bounds = List.unzip bounds in
Dataframe.create [
"method", `Character (Character.of_list method_labels) ;
"estimate", `Numeric (Numeric.of_list estimates) ;
"lower_bound", `Numeric (Numeric.of_list lower_bounds) ;
"upper_bound", `Numeric (Numeric.of_list upper_bounds) ;
]
in
let oracle =
Array.(
append
(map sim.h0_profiles ~f:(Fn.const false))
(map sim.ha_profiles ~f:(Fn.const true))
)
Array.init (n_h0 + n_ha) ~f:(fun i -> if i < n_h0 then false else true)
|> Logical.of_array
in
let columns = List.map2_exn method_labels method_outputs ~f:(fun l r ->
l, `Numeric (Numeric.of_array_opt r)
) in
let results = Dataframe.create columns in
let data = List_.create [
Some "results", Dataframe.to_sexp results ;
......
......@@ -38,23 +38,20 @@ module Mutsel : sig
type benchmark = {
n_h0 : int ;
n_ha : int ;
method_labels : string list ;
method_outputs : float option array list ;
average_precision : (float * (float * float)) list ;
site_model : [`Convergent | `Non_convergent] array ;
profiles : (float array * float array) array ;
ancestral_counts : int Phylogenetics.Amino_acid.table array ;
convergent_counts : int Phylogenetics.Amino_acid.table array ;
}
val benchmark_statistics :
Codepitk.Simulator.Site_independent_mutsel.simulation workflow ->
results:cpt file list ->
binary_file file
val benchmark : query -> (query -> cpt file) list -> benchmark workflow
(* val benchmark : t -> (t -> cpt file) list -> benchmark workflow
*
* val rds_of_benchmark : benchmark workflow -> rds file *)
val rds_of_benchmark : benchmark workflow -> rds file
(** stuff for gbgc SBME paper *)
type gBGC_t =
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment