Commit bbdd8e37 authored by Philippe Veber's avatar Philippe Veber
Browse files

Simulation_dataset: separate in-house simulation and bppseqgen's

parent 265897c0
......@@ -15,28 +15,13 @@ type tree =
branch_length2 : float ;
}
type 's benchmark = {
method_labels : string list ;
method_outputs : float option array list ;
average_precision : (float * (float * float)) list ;
site_model : 's array ;
ancestral_counts : int Phylogenetics.Amino_acid.table array ;
convergent_counts : int Phylogenetics.Amino_acid.table array ;
}
module type S = sig
type t
type site_model
include Detection_pipeline.Query with type t := t
include Detection_pipeline.S with type query := t
val alignment_plot : t -> svg file
type query
val benchmark : t -> text file list -> site_model benchmark workflow
include Detection_pipeline.Query with type t := query
include Detection_pipeline.S with type query := query
val rds_of_benchmark : site_model benchmark workflow -> rds file
val alignment_plot : query -> svg file
end
let tree_workflow = function
......@@ -46,7 +31,18 @@ let tree_workflow = function
npairs } ->
Simulator.pair_tree ~branch_length1 ~branch_length2 ~npairs
module Mutsel_param = struct
module Make(Q : Detection_pipeline.Query) = struct
include Detection_pipeline.Make(Q)
let alignment_plot d =
Convergence_detection.plot_convergent_sites
~tree:(Q.tree ~branch_length_unit:`Amino_acid d)
~alignment:(amino_acid_alignment d)
~detection_results:(multinomial_asymptotic_lrt d)
()
end
module Mutsel_query = struct
type t = {
tree : tree ;
branch_scale : float ;
......@@ -73,379 +69,524 @@ module Mutsel_param = struct
let tree = tree_workflow tree in
let fitness_profiles = Workflow.input profiles in
Simulator.simulation ~branch_scale ~n_ha ~n_h0 ~ne_s ~gBGC ~tree ~seed ~fitness_profiles ()
end
type t =
| Bppseqgen of {
hypothesis : Convergence_hypothesis.t ;
tree : tree ;
profiles : string ;
nb_sites : int ;
seed : int ;
}
| Bppseqgen_mixed of {
tree : tree ;
profiles : string ;
seed : int ;
n_h0 : int ;
n_ha : int ;
ne_s : float ;
}
| Mutsel of Mutsel_param.t
let nucleotide_alignment p =
simulation p
|> Simulator.alignment_of_simulation
let bppseqgen_mixed ?(ne_s = 1.) ?(seed = 0) ~tree ~profiles ~n_h0 ~n_ha () =
Bppseqgen_mixed {
tree ;
profiles ;
seed ;
n_ha ;
n_h0 ;
ne_s ;
}
let tree ~branch_length_unit:_ { tree ; _ } = tree_workflow tree
end
let bppseqgen ~hyp ~tree ~profiles ~nb_sites ~seed =
Bppseqgen {
hypothesis = hyp ;
tree ;
profiles ;
nb_sites ;
seed ;
}
let mutsel ?branch_scale ?ne_s ?gBGC ?seed ~tree ~profiles ~n_h0 ~n_ha () =
Mutsel (Mutsel_param.make ?branch_scale ?ne_s ?gBGC ?seed ~tree ~profiles ~n_h0 ~n_ha ())
let of_mutsel_param p = Mutsel p
let tree ~branch_length_unit:_ = function
| Bppseqgen { tree ; _ }
| Bppseqgen_mixed { tree ; _ }
| Mutsel { tree ; _ } ->
tree_workflow tree
let seed = function
| Bppseqgen_mixed s -> s.seed
| Bppseqgen s -> s.seed
| Mutsel s -> s.seed
let profile ~nb_sites ~profiles ~seed =
Profile.profile_l_of_splitted_profile
~nb_cat:All
~nb_sites
profiles
~seed:(calc_fixed_seed ~str:profiles seed)
let bppseqgen_simulation sim ~hypothesis ~nb_sites ~profiles =
let model_prefix = Convergence_hypothesis.string_of_model hypothesis in
let descr = sprintf ".%s" model_prefix in
let profile = profile ~nb_sites ~profiles ~seed:(seed sim) in
let profile_f = profile.profile_f in
let profile_c = profile.profile_c in
Bppsuite.Bppseqgen.multi_profiles
~descr
~input_tree:(tree ~branch_length_unit:`Nucleotide sim)
~hypothesis ~profile_f ~profile_c ~seed:(seed sim)
let rec nucleotide_alignment = function
| Bppseqgen { hypothesis ; nb_sites ; profiles ; _ } as sim ->
bppseqgen_simulation sim ~hypothesis ~nb_sites ~profiles
|> Bppsuite.Bppseqgen.alignment
| Bppseqgen_mixed { profiles ; seed ; n_h0 ; n_ha ; ne_s ; tree } ->
let h0 = nucleotide_alignment (Bppseqgen { hypothesis = H0 (Fixed ne_s) ; profiles ; seed ; nb_sites = n_h0 ; tree }) in
let ha = nucleotide_alignment (Bppseqgen { hypothesis = HaPC (Fixed ne_s) ; profiles ; seed ; nb_sites = n_ha ; tree }) in
Utils.fasta_cappend h0 ha
| Mutsel param ->
Mutsel_param.simulation param
|> Simulator.alignment_of_simulation
module Mutsel = struct
type query = Mutsel_query.t
let query = Mutsel_query.make
let simulation = Mutsel_query.simulation
include Detection_pipeline.Make(struct
type nonrec t = t
let tree = tree
let nucleotide_alignment = nucleotide_alignment
end)
let alignment_plot d =
Convergence_detection.plot_convergent_sites
~tree:(tree ~branch_length_unit:`Amino_acid d)
~alignment:(amino_acid_alignment d)
~detection_results:(multinomial_asymptotic_lrt d)
()
let oracle d =
let n_h0, n_ha =
match d with
| Bppseqgen { nb_sites ; hypothesis ; _ } -> (
match hypothesis with
| H0 _ -> nb_sites, 0
| HaPC _ | HaPCOC _ -> 0, nb_sites
)
| Bppseqgen_mixed { n_h0 ; n_ha ; _ }
| Mutsel { n_h0 ; n_ha ; _ } -> n_h0, n_ha
in
Convergence_detection.oracle ~n_h0 ~n_ha
let multinomial_benchmark d =
Utils.recall_precision_curve
~oracle:(oracle d)
~labels:["LRT";"LRTsim";"sparse";"sparse_sim"]
~results:[
multinomial_asymptotic_lrt d, 1 ;
multinomial_simulation_lrt d, 1 ;
multinomial_asymptotic_sparse d, 1 ;
multinomial_simulation_sparse d, 1 ;
]
include Make(Mutsel_query)
let result_table ?(mode = `fast) d =
Convergence_detection.merge_result_tables
~multinomial:(multinomial_asymptotic_lrt d)
~tdg09:(tdg09 d)
~identical:(identical d)
~topological:(topological d)
~pcoc:(
match mode with
| `fast -> pcoc ~gamma:false ~ncat:10 d
| `full -> pcoc d
)
?diffsel:(
match mode with
| `fast -> None
| `full -> Some (diffsel d)
)
~oracle:(oracle d)
()
let benchmark ?mode d =
result_table ?mode d
|> Convergence_detection.recall_precision_curve
let benchmark2 d =
Utils.recall_precision_curve
~oracle:(oracle d)
~labels:["identical";"topological";"multinomial";"pcoc";"tdg09"]
~results:[
identical d, 1 ;
topological d, 1 ;
multinomial_asymptotic_lrt d, 1 ;
pcoc d, 3 ;
tdg09 d, 1 ;
]
type benchmark = {
method_labels : string list ;
method_outputs : float option array list ;
average_precision : (float * (float * float)) list ;
site_model : [`Convergent | `Non_convergent] array ;
ancestral_counts : int Phylogenetics.Amino_acid.table array ;
convergent_counts : int Phylogenetics.Amino_acid.table array ;
}
let%pworkflow benchmark_statistics simulation ~labels ~results =
let open Phylogenetics in
let open Codepitk in
let open OCamlR_base in
let open Codepitk.Simulator.Site_independent_mutsel in
let module Codon = Codon.Universal_genetic_code.NS in
let load_results fn col =
let%pworkflow benchmark_statistics simulation ~labels ~results =
let open Phylogenetics in
let open Codepitk in
let df = Dataframe.from_file fn |> Rresult.R.failwith_error_msg in
match Dataframe.get_col df col with
| Some (Floats xs) -> Array.map xs ~f:Option.some
| Some (Float_opts xs) -> xs
| Some (Ints xs) -> Array.map xs ~f:(fun i -> Some (Float.of_int i))
| Some (Int_opts xs) -> Array.map xs ~f:(Option.map ~f:Float.of_int)
| _ -> failwith "expected a numeric column at pos 1"
in
let sim : simulation = [%eval simulation] in
let result_paths = [%eval Bistro.Workflow.path_list (List.map ~f:fst results)] in
let results = List.map2_exn result_paths (List.map ~f:snd results) ~f:load_results in
let labels = [%param labels] in
let n_h0 = Array.length sim.h0_params in
let n_ha = Array.length sim.ha_params in
let nsites = n_h0 + n_ha in
let columns = List.map2_exn labels results ~f:(fun l r ->
l, `Numeric (Numeric.of_array_opt r)
) in
let amino_acid_vector_of_codon_vector xs =
Amino_acid.Vector.init (fun aa ->
List.fold Codon.all ~init:0. ~f:(fun acc c ->
if Amino_acid.equal aa (Codon.aa_of_codon c) then
acc +. xs.Codon.%(c)
else acc
)
)
in
let collect_profiles sel =
Array.append sim.h0_params sim.ha_params
|> Array.map ~f:(fun p ->
sel p
|> Mutsel.stationary_distribution
|> amino_acid_vector_of_codon_vector
|> Amino_acid.Vector.to_array
)
|> Numeric.Matrix.of_arrays
in
let ancestral_profiles = collect_profiles fst in
let convergent_profiles = collect_profiles snd in
let counts seqs i =
let t =
Amino_acid.Table.init (fun aa ->
let aa = Amino_acid.to_char aa in
List.count seqs ~f:(fun s ->
let codon_str = String.sub (s : Dna.t :> string) ~pos:(i * 3) ~len:3 in
let codon = match Codon.of_string codon_str with
| Some c -> c
| None -> assert false
in
Char.equal (Amino_acid.to_char (Codon.aa_of_codon codon)) aa)
let open OCamlR_base in
let open Codepitk.Simulator.Site_independent_mutsel in
let module Codon = Codon.Universal_genetic_code.NS in
let load_results fn col =
let open Codepitk in
let df = Dataframe.from_file fn |> Rresult.R.failwith_error_msg in
match Dataframe.get_col df col with
| Some (Floats xs) -> Array.map xs ~f:Option.some
| Some (Float_opts xs) -> xs
| Some (Ints xs) -> Array.map xs ~f:(fun i -> Some (Float.of_int i))
| Some (Int_opts xs) -> Array.map xs ~f:(Option.map ~f:Float.of_int)
| _ -> failwith "expected a numeric column at pos 1"
in
let sim : simulation = [%eval simulation] in
let result_paths = [%eval Bistro.Workflow.path_list (List.map ~f:fst results)] in
let results = List.map2_exn result_paths (List.map ~f:snd results) ~f:load_results in
let labels = [%param labels] in
let n_h0 = Array.length sim.h0_params in
let n_ha = Array.length sim.ha_params in
let nsites = n_h0 + n_ha in
let columns = List.map2_exn labels results ~f:(fun l r ->
l, `Numeric (Numeric.of_array_opt r)
) in
let amino_acid_vector_of_codon_vector xs =
Amino_acid.Vector.init (fun aa ->
List.fold Codon.all ~init:0. ~f:(fun acc c ->
if Amino_acid.equal aa (Codon.aa_of_codon c) then
acc +. xs.Codon.%(c)
else acc
)
)
in
(t :> int array)
in
let collect_counts cond =
let species = Convergence_tree.leaves sim.tree in
let seqs =
List.map2_exn sim.sequences species ~f:(fun s (_, cond_s) ->
if Poly.equal cond cond_s then Some s else None
let collect_profiles sel =
Array.append sim.h0_params sim.ha_params
|> Array.map ~f:(fun p ->
sel p
|> Mutsel.stationary_distribution
|> amino_acid_vector_of_codon_vector
|> Amino_acid.Vector.to_array
)
|> List.filter_opt
|> Numeric.Matrix.of_arrays
in
Array.init nsites ~f:(counts seqs)
|> Integer.Matrix.of_arrays
in
let ancestral_counts = collect_counts `Ancestral in
let convergent_counts = collect_counts `Convergent in
let make_classification_data x y =
Prc.Classification_data (
List.init (Array.length x) ~f:(fun i ->
match x.(i), y.(i) with
| Some x_i, Some y_i -> Some (x_i, y_i)
| None, _ | _, None -> None
let ancestral_profiles = collect_profiles fst in
let convergent_profiles = collect_profiles snd in
let counts seqs i =
let t =
Amino_acid.Table.init (fun aa ->
let aa = Amino_acid.to_char aa in
List.count seqs ~f:(fun s ->
let codon_str = String.sub (s : Dna.t :> string) ~pos:(i * 3) ~len:3 in
let codon = match Codon.of_string codon_str with
| Some c -> c
| None -> assert false
in
Char.equal (Amino_acid.to_char (Codon.aa_of_codon codon)) aa)
)
in
(t :> int array)
in
let collect_counts cond =
let species = Convergence_tree.leaves sim.tree in
let seqs =
List.map2_exn sim.sequences species ~f:(fun s (_, cond_s) ->
if Poly.equal cond cond_s then Some s else None
)
|> List.filter_opt
in
Array.init nsites ~f:(counts seqs)
|> Integer.Matrix.of_arrays
in
let ancestral_counts = collect_counts `Ancestral in
let convergent_counts = collect_counts `Convergent in
let make_classification_data x y =
Prc.Classification_data (
List.init (Array.length x) ~f:(fun i ->
match x.(i), y.(i) with
| Some x_i, Some y_i -> Some (x_i, y_i)
| None, _ | _, None -> None
)
|> List.filter_opt
)
in
let estimates, lower_bounds, upper_bounds =
let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in
List.map results ~f:(fun scores ->
let Prc.Classification_data xs as data = make_classification_data scores oracle in
let n = List.count xs ~f:snd in
let theta_hat = Prc.auc_trapezoidal_lt data in
let lb, ub = Prc.logit_confidence_interval ~alpha:0.05 ~theta_hat ~n in
theta_hat, lb, ub
)
|> List.filter_opt
)
in
let estimates, lower_bounds, upper_bounds =
let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in
List.map results ~f:(fun scores ->
let Prc.Classification_data xs as data = make_classification_data scores oracle in
let n = List.count xs ~f:snd in
let theta_hat = Prc.auc_trapezoidal_lt data in
let lb, ub = Prc.logit_confidence_interval ~alpha:0.05 ~theta_hat ~n in
theta_hat, lb, ub
|> List.unzip3
in
let open OCamlR_base in
let auc_estimates = Dataframe.create [
"method", `Character (Character.of_list labels) ;
"estimate", `Numeric (Numeric.of_list estimates) ;
"lower_bound", `Numeric (Numeric.of_list lower_bounds) ;
"upper_bound", `Numeric (Numeric.of_list upper_bounds) ;
]
in
let oracle =
Array.(
append
(map sim.h0_profiles ~f:(Fn.const false))
(map sim.ha_profiles ~f:(Fn.const true))
)
|> List.unzip3
in
let open OCamlR_base in
let auc_estimates = Dataframe.create [
"method", `Character (Character.of_list labels) ;
"estimate", `Numeric (Numeric.of_list estimates) ;
"lower_bound", `Numeric (Numeric.of_list lower_bounds) ;
"upper_bound", `Numeric (Numeric.of_list upper_bounds) ;
]
in
let oracle =
Array.(
append
(map sim.h0_profiles ~f:(Fn.const false))
(map sim.ha_profiles ~f:(Fn.const true))
)
|> Logical.of_array
in
let results = Dataframe.create columns in
let data = List_.create [
Some "results", Dataframe.to_sexp results ;
Some "oracle", Logical.to_sexp oracle ;
Some "ancestral_profiles", Numeric.Matrix.to_sexp ancestral_profiles ;
Some "convergent_profiles", Numeric.Matrix.to_sexp convergent_profiles ;
Some "ancestral_counts", Integer.Matrix.to_sexp ancestral_counts ;
Some "convergent_counts", Integer.Matrix.to_sexp convergent_counts ;
Some "auc_estimates", Dataframe.to_sexp auc_estimates ;
|> Logical.of_array
in
let results = Dataframe.create columns in
let data = List_.create [
Some "results", Dataframe.to_sexp results ;
Some "oracle", Logical.to_sexp oracle ;
Some "ancestral_profiles", Numeric.Matrix.to_sexp ancestral_profiles ;
Some "convergent_profiles", Numeric.Matrix.to_sexp convergent_profiles ;
Some "ancestral_counts", Integer.Matrix.to_sexp ancestral_counts ;
Some "convergent_counts", Integer.Matrix.to_sexp convergent_counts ;
Some "auc_estimates", Dataframe.to_sexp auc_estimates ;
]
in
saveRDS ~file:[%dest] (List_.to_sexp data)
(* param exploration for SMBE paper *)
(* type branch_scale_t = float *)
let branch_scale_range = [ 1.; 3.; 6.; 9. ]
type gBGC_t = Global of float | Convergent of float * float
let gBGC_range =
let range = [ 0.; 2.; 4.; 8.; 16.; 32.; 64.; ] in
List.concat [
(* List.map ~f:(fun x -> Global x) range ; *)
List.map ~f:(fun x -> Convergent (0., x)) range ;
]
in
saveRDS ~file:[%dest] (List_.to_sexp data)
(* param exploration for SMBE paper *)
(* type branch_scale_t = float *)
let branch_scale_range = [ 1.; 3.; 6.; 9. ]
type gBGC_t = Global of float | Convergent of float * float
let gBGC_range =
let range = [ 0.; 2.; 4.; 8.; 16.; 32.; 64.; ] in
List.concat [
(* List.map ~f:(fun x -> Global x) range ; *)
List.map ~f:(fun x -> Convergent (0., x)) range ;
]
type param_t = float * gBGC_t
let explore_params ~(f: param_t -> _) =
List.map branch_scale_range ~f:(fun (bf:float) ->
List.map gBGC_range ~f:(fun (gbgc:gBGC_t) -> ((bf, gbgc), f (bf, gbgc)))
) |> List.concat
let simu_of_param ?n_h0:(n_h0=50) (p: param_t) =
let bf, gbgc = p in
mutsel
~tree:(NHX (Workflow.input "example/trees_analyses/C4AmaranthaceaePolyroot.nhx"))
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~branch_scale:bf
~gBGC:(match gbgc with
| Convergent (a,c) -> (a, c)
| Global g -> (g, g))
~ne_s:(4., 4.)
~n_ha:0
~n_h0
()
let filter_results ~(f: _ -> bool) (results: (param_t * _) list) =
List.filter results ~f:(fun (_, x) -> f x)
type record_t = {
gc_means_ancestral: ([`first | `second | `third] * float) list ;
gc_means_convergent: ([`first | `second | `third] * float) list
}
let%workflow record_of_simu s =
let tree = [%path tree ~branch_length_unit:`Nucleotide s] in
let nucleotide_alignment = [%path nucleotide_alignment s] in
let gc_mean_from_simu ~pos =
Alistats.nucleotide_fasta_gc_ac ~pos tree nucleotide_alignment
in let (m1_a, m1_c), (m2_a, m2_c), (m3_a, m3_c) =
gc_mean_from_simu ~pos:`first,
gc_mean_from_simu ~pos:`second,
gc_mean_from_simu ~pos:`third
in {
gc_means_ancestral = [(`first, m1_a.gc_mean) ; (`second, m2_a.gc_mean) ; (`third, m3_a.gc_mean)] ;
gc_means_convergent = [(`first, m1_c.gc_mean) ; (`second, m2_c.gc_mean) ; (`third, m3_c.gc_mean)]
type param_t = float * gBGC_t
let explore_params ~(f: param_t -> _) =
List.map branch_scale_range ~f:(fun (bf:float) ->
List.map gBGC_range ~f:(fun (gbgc:gBGC_t) -> ((bf, gbgc), f (bf, gbgc)))
) |> List.concat
let simu_of_param ?n_h0:(n_h0=50) (p: param_t) =
let bf, gbgc = p in
Mutsel_query.make
~tree:(NHX (Workflow.input "example/trees_analyses/C4AmaranthaceaePolyroot.nhx"))
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~branch_scale:bf
~gBGC:(match gbgc with
| Convergent (a,c) -> (a, c)
| Global g -> (g, g))
~ne_s:(4., 4.)
~n_ha:0
~n_h0
()
let filter_results ~(f: _ -> bool) (results: (param_t * _) list) =
List.filter results ~f:(fun (_, x) -> f x)
type record_t = {
gc_means_ancestral: ([`first | `second | `third] * float) list ;
gc_means_convergent: ([`first | `second | `third] * float) list
}
let expected_gc = [
(`first, (0.3326, 0.5157, 0.5589, 0.6080, 0.8621)) ;
(`second, (0.2102, 0.3784, 0.4160, 0.4626, 0.7499)) ;
(`third, (0.2242, 0.4852, 0.6274, 0.7358, 0.9575))
]
let quartile (min_, fq_, mean_, tq_, max_) x =
match Float.( x < min_, x < fq_, x < mean_, x < tq_, x < max_) with
| true, _, _, _, _ -> `below_min
| false, true, _, _, _ -> `first
| _, false, true, _, _ -> `second
| _, _, false, true, _ -> `third
| _, _, _, false, true -> `fourth
| _, _, _, _, false -> `over_max
let adjacent q1 q2 =
match q1, q2 with
| `first, `first | `second, `second
| `third, `third | `fourth, `fourth
| `first, `second | `second, `first
| `second, `third | `third, `second
| `third, `fourth | `fourth, `third -> true