Commit bbdd8e37 authored by Philippe Veber's avatar Philippe Veber
Browse files

Simulation_dataset: separate in-house simulation and bppseqgen's

parent 265897c0
...@@ -15,28 +15,13 @@ type tree = ...@@ -15,28 +15,13 @@ type tree =
branch_length2 : float ; branch_length2 : float ;
} }
type 's benchmark = {
method_labels : string list ;
method_outputs : float option array list ;
average_precision : (float * (float * float)) list ;
site_model : 's array ;
ancestral_counts : int Phylogenetics.Amino_acid.table array ;
convergent_counts : int Phylogenetics.Amino_acid.table array ;
}
module type S = sig module type S = sig
type t type query
type site_model
include Detection_pipeline.Query with type t := t
include Detection_pipeline.S with type query := t
val alignment_plot : t -> svg file
val benchmark : t -> text file list -> site_model benchmark workflow include Detection_pipeline.Query with type t := query
include Detection_pipeline.S with type query := query
val rds_of_benchmark : site_model benchmark workflow -> rds file val alignment_plot : query -> svg file
end end
let tree_workflow = function let tree_workflow = function
...@@ -46,7 +31,18 @@ let tree_workflow = function ...@@ -46,7 +31,18 @@ let tree_workflow = function
npairs } -> npairs } ->
Simulator.pair_tree ~branch_length1 ~branch_length2 ~npairs Simulator.pair_tree ~branch_length1 ~branch_length2 ~npairs
module Mutsel_param = struct module Make(Q : Detection_pipeline.Query) = struct
include Detection_pipeline.Make(Q)
let alignment_plot d =
Convergence_detection.plot_convergent_sites
~tree:(Q.tree ~branch_length_unit:`Amino_acid d)
~alignment:(amino_acid_alignment d)
~detection_results:(multinomial_asymptotic_lrt d)
()
end
module Mutsel_query = struct
type t = { type t = {
tree : tree ; tree : tree ;
branch_scale : float ; branch_scale : float ;
...@@ -73,379 +69,524 @@ module Mutsel_param = struct ...@@ -73,379 +69,524 @@ module Mutsel_param = struct
let tree = tree_workflow tree in let tree = tree_workflow tree in
let fitness_profiles = Workflow.input profiles in let fitness_profiles = Workflow.input profiles in
Simulator.simulation ~branch_scale ~n_ha ~n_h0 ~ne_s ~gBGC ~tree ~seed ~fitness_profiles () Simulator.simulation ~branch_scale ~n_ha ~n_h0 ~ne_s ~gBGC ~tree ~seed ~fitness_profiles ()
end
type t = let nucleotide_alignment p =
| Bppseqgen of { simulation p
hypothesis : Convergence_hypothesis.t ; |> Simulator.alignment_of_simulation
tree : tree ;
profiles : string ;
nb_sites : int ;
seed : int ;
}
| Bppseqgen_mixed of {
tree : tree ;
profiles : string ;
seed : int ;
n_h0 : int ;
n_ha : int ;
ne_s : float ;
}
| Mutsel of Mutsel_param.t
let bppseqgen_mixed ?(ne_s = 1.) ?(seed = 0) ~tree ~profiles ~n_h0 ~n_ha () = let tree ~branch_length_unit:_ { tree ; _ } = tree_workflow tree
Bppseqgen_mixed { end
tree ;
profiles ;
seed ;
n_ha ;
n_h0 ;
ne_s ;
}
let bppseqgen ~hyp ~tree ~profiles ~nb_sites ~seed =
Bppseqgen {
hypothesis = hyp ;
tree ;
profiles ;
nb_sites ;
seed ;
}
let mutsel ?branch_scale ?ne_s ?gBGC ?seed ~tree ~profiles ~n_h0 ~n_ha () = module Mutsel = struct
Mutsel (Mutsel_param.make ?branch_scale ?ne_s ?gBGC ?seed ~tree ~profiles ~n_h0 ~n_ha ()) type query = Mutsel_query.t
let query = Mutsel_query.make
let of_mutsel_param p = Mutsel p let simulation = Mutsel_query.simulation
let tree ~branch_length_unit:_ = function
| Bppseqgen { tree ; _ }
| Bppseqgen_mixed { tree ; _ }
| Mutsel { tree ; _ } ->
tree_workflow tree
let seed = function
| Bppseqgen_mixed s -> s.seed
| Bppseqgen s -> s.seed
| Mutsel s -> s.seed
let profile ~nb_sites ~profiles ~seed =
Profile.profile_l_of_splitted_profile
~nb_cat:All
~nb_sites
profiles
~seed:(calc_fixed_seed ~str:profiles seed)
let bppseqgen_simulation sim ~hypothesis ~nb_sites ~profiles =
let model_prefix = Convergence_hypothesis.string_of_model hypothesis in
let descr = sprintf ".%s" model_prefix in
let profile = profile ~nb_sites ~profiles ~seed:(seed sim) in
let profile_f = profile.profile_f in
let profile_c = profile.profile_c in
Bppsuite.Bppseqgen.multi_profiles
~descr
~input_tree:(tree ~branch_length_unit:`Nucleotide sim)
~hypothesis ~profile_f ~profile_c ~seed:(seed sim)
let rec nucleotide_alignment = function
| Bppseqgen { hypothesis ; nb_sites ; profiles ; _ } as sim ->
bppseqgen_simulation sim ~hypothesis ~nb_sites ~profiles
|> Bppsuite.Bppseqgen.alignment
| Bppseqgen_mixed { profiles ; seed ; n_h0 ; n_ha ; ne_s ; tree } ->
let h0 = nucleotide_alignment (Bppseqgen { hypothesis = H0 (Fixed ne_s) ; profiles ; seed ; nb_sites = n_h0 ; tree }) in
let ha = nucleotide_alignment (Bppseqgen { hypothesis = HaPC (Fixed ne_s) ; profiles ; seed ; nb_sites = n_ha ; tree }) in
Utils.fasta_cappend h0 ha
| Mutsel param ->
Mutsel_param.simulation param
|> Simulator.alignment_of_simulation
include Detection_pipeline.Make(struct include Make(Mutsel_query)
type nonrec t = t
let tree = tree
let nucleotide_alignment = nucleotide_alignment
end)
let alignment_plot d =
Convergence_detection.plot_convergent_sites
~tree:(tree ~branch_length_unit:`Amino_acid d)
~alignment:(amino_acid_alignment d)
~detection_results:(multinomial_asymptotic_lrt d)
()
let oracle d =
let n_h0, n_ha =
match d with
| Bppseqgen { nb_sites ; hypothesis ; _ } -> (
match hypothesis with
| H0 _ -> nb_sites, 0
| HaPC _ | HaPCOC _ -> 0, nb_sites
)
| Bppseqgen_mixed { n_h0 ; n_ha ; _ }
| Mutsel { n_h0 ; n_ha ; _ } -> n_h0, n_ha
in
Convergence_detection.oracle ~n_h0 ~n_ha
let multinomial_benchmark d =
Utils.recall_precision_curve
~oracle:(oracle d)
~labels:["LRT";"LRTsim";"sparse";"sparse_sim"]
~results:[
multinomial_asymptotic_lrt d, 1 ;
multinomial_simulation_lrt d, 1 ;
multinomial_asymptotic_sparse d, 1 ;
multinomial_simulation_sparse d, 1 ;
]
let result_table ?(mode = `fast) d = type benchmark = {
Convergence_detection.merge_result_tables method_labels : string list ;
~multinomial:(multinomial_asymptotic_lrt d) method_outputs : float option array list ;
~tdg09:(tdg09 d) average_precision : (float * (float * float)) list ;
~identical:(identical d) site_model : [`Convergent | `Non_convergent] array ;
~topological:(topological d) ancestral_counts : int Phylogenetics.Amino_acid.table array ;
~pcoc:( convergent_counts : int Phylogenetics.Amino_acid.table array ;
match mode with }
| `fast -> pcoc ~gamma:false ~ncat:10 d
| `full -> pcoc d
)
?diffsel:(
match mode with
| `fast -> None
| `full -> Some (diffsel d)
)
~oracle:(oracle d)
()
let benchmark ?mode d =
result_table ?mode d
|> Convergence_detection.recall_precision_curve
let benchmark2 d =
Utils.recall_precision_curve
~oracle:(oracle d)
~labels:["identical";"topological";"multinomial";"pcoc";"tdg09"]
~results:[
identical d, 1 ;
topological d, 1 ;
multinomial_asymptotic_lrt d, 1 ;
pcoc d, 3 ;
tdg09 d, 1 ;
]
let%pworkflow benchmark_statistics simulation ~labels ~results =
let open Phylogenetics in let%pworkflow benchmark_statistics simulation ~labels ~results =
let open Codepitk in let open Phylogenetics in
let open OCamlR_base in
let open Codepitk.Simulator.Site_independent_mutsel in
let module Codon = Codon.Universal_genetic_code.NS in
let load_results fn col =
let open Codepitk in let open Codepitk in
let df = Dataframe.from_file fn |> Rresult.R.failwith_error_msg in let open OCamlR_base in
match Dataframe.get_col df col with let open Codepitk.Simulator.Site_independent_mutsel in
| Some (Floats xs) -> Array.map xs ~f:Option.some let module Codon = Codon.Universal_genetic_code.NS in
| Some (Float_opts xs) -> xs let load_results fn col =
| Some (Ints xs) -> Array.map xs ~f:(fun i -> Some (Float.of_int i)) let open Codepitk in
| Some (Int_opts xs) -> Array.map xs ~f:(Option.map ~f:Float.of_int) let df = Dataframe.from_file fn |> Rresult.R.failwith_error_msg in
| _ -> failwith "expected a numeric column at pos 1" match Dataframe.get_col df col with
in | Some (Floats xs) -> Array.map xs ~f:Option.some
let sim : simulation = [%eval simulation] in | Some (Float_opts xs) -> xs
let result_paths = [%eval Bistro.Workflow.path_list (List.map ~f:fst results)] in | Some (Ints xs) -> Array.map xs ~f:(fun i -> Some (Float.of_int i))
let results = List.map2_exn result_paths (List.map ~f:snd results) ~f:load_results in | Some (Int_opts xs) -> Array.map xs ~f:(Option.map ~f:Float.of_int)
let labels = [%param labels] in | _ -> failwith "expected a numeric column at pos 1"
let n_h0 = Array.length sim.h0_params in in
let n_ha = Array.length sim.ha_params in let sim : simulation = [%eval simulation] in
let nsites = n_h0 + n_ha in let result_paths = [%eval Bistro.Workflow.path_list (List.map ~f:fst results)] in
let columns = List.map2_exn labels results ~f:(fun l r -> let results = List.map2_exn result_paths (List.map ~f:snd results) ~f:load_results in
l, `Numeric (Numeric.of_array_opt r) let labels = [%param labels] in
) in let n_h0 = Array.length sim.h0_params in
let amino_acid_vector_of_codon_vector xs = let n_ha = Array.length sim.ha_params in
Amino_acid.Vector.init (fun aa -> let nsites = n_h0 + n_ha in
List.fold Codon.all ~init:0. ~f:(fun acc c -> let columns = List.map2_exn labels results ~f:(fun l r ->
if Amino_acid.equal aa (Codon.aa_of_codon c) then l, `Numeric (Numeric.of_array_opt r)
acc +. xs.Codon.%(c) ) in
else acc let amino_acid_vector_of_codon_vector xs =
) Amino_acid.Vector.init (fun aa ->
) List.fold Codon.all ~init:0. ~f:(fun acc c ->
in if Amino_acid.equal aa (Codon.aa_of_codon c) then
let collect_profiles sel = acc +. xs.Codon.%(c)
Array.append sim.h0_params sim.ha_params else acc
|> Array.map ~f:(fun p -> )
sel p
|> Mutsel.stationary_distribution
|> amino_acid_vector_of_codon_vector
|> Amino_acid.Vector.to_array
)
|> Numeric.Matrix.of_arrays
in
let ancestral_profiles = collect_profiles fst in
let convergent_profiles = collect_profiles snd in
let counts seqs i =
let t =
Amino_acid.Table.init (fun aa ->
let aa = Amino_acid.to_char aa in
List.count seqs ~f:(fun s ->
let codon_str = String.sub (s : Dna.t :> string) ~pos:(i * 3) ~len:3 in
let codon = match Codon.of_string codon_str with
| Some c -> c
| None -> assert false
in
Char.equal (Amino_acid.to_char (Codon.aa_of_codon codon)) aa)
) )
in in
(t :> int array) let collect_profiles sel =
in Array.append sim.h0_params sim.ha_params
let collect_counts cond = |> Array.map ~f:(fun p ->
let species = Convergence_tree.leaves sim.tree in sel p
let seqs = |> Mutsel.stationary_distribution
List.map2_exn sim.sequences species ~f:(fun s (_, cond_s) -> |> amino_acid_vector_of_codon_vector
if Poly.equal cond cond_s then Some s else None |> Amino_acid.Vector.to_array
) )
|> List.filter_opt |> Numeric.Matrix.of_arrays
in in
Array.init nsites ~f:(counts seqs) let ancestral_profiles = collect_profiles fst in
|> Integer.Matrix.of_arrays let convergent_profiles = collect_profiles snd in
in let counts seqs i =
let ancestral_counts = collect_counts `Ancestral in let t =
let convergent_counts = collect_counts `Convergent in Amino_acid.Table.init (fun aa ->
let make_classification_data x y = let aa = Amino_acid.to_char aa in
Prc.Classification_data ( List.count seqs ~f:(fun s ->
List.init (Array.length x) ~f:(fun i -> let codon_str = String.sub (s : Dna.t :> string) ~pos:(i * 3) ~len:3 in
match x.(i), y.(i) with let codon = match Codon.of_string codon_str with
| Some x_i, Some y_i -> Some (x_i, y_i) | Some c -> c
| None, _ | _, None -> None | None -> assert false
in
Char.equal (Amino_acid.to_char (Codon.aa_of_codon codon)) aa)
)
in
(t :> int array)
in
let collect_counts cond =
let species = Convergence_tree.leaves sim.tree in
let seqs =
List.map2_exn sim.sequences species ~f:(fun s (_, cond_s) ->
if Poly.equal cond cond_s then Some s else None
)
|> List.filter_opt
in
Array.init nsites ~f:(counts seqs)
|> Integer.Matrix.of_arrays
in
let ancestral_counts = collect_counts `Ancestral in
let convergent_counts = collect_counts `Convergent in
let make_classification_data x y =
Prc.Classification_data (
List.init (Array.length x) ~f:(fun i ->
match x.(i), y.(i) with
| Some x_i, Some y_i -> Some (x_i, y_i)
| None, _ | _, None -> None
)
|> List.filter_opt
)
in
let estimates, lower_bounds, upper_bounds =
let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in
List.map results ~f:(fun scores ->
let Prc.Classification_data xs as data = make_classification_data scores oracle in
let n = List.count xs ~f:snd in
let theta_hat = Prc.auc_trapezoidal_lt data in
let lb, ub = Prc.logit_confidence_interval ~alpha:0.05 ~theta_hat ~n in
theta_hat, lb, ub
) )
|> List.filter_opt |> List.unzip3
) in
in let open OCamlR_base in
let estimates, lower_bounds, upper_bounds = let auc_estimates = Dataframe.create [
let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in "method", `Character (Character.of_list labels) ;
List.map results ~f:(fun scores -> "estimate", `Numeric (Numeric.of_list estimates) ;
let Prc.Classification_data xs as data = make_classification_data scores oracle in "lower_bound", `Numeric (Numeric.of_list lower_bounds) ;
let n = List.count xs ~f:snd in "upper_bound", `Numeric (Numeric.of_list upper_bounds) ;
let theta_hat = Prc.auc_trapezoidal_lt data in ]
let lb, ub = Prc.logit_confidence_interval ~alpha:0.05 ~theta_hat ~n in in
theta_hat, lb, ub let oracle =
Array.(
append
(map sim.h0_profiles ~f:(Fn.const false))
(map sim.ha_profiles ~f:(Fn.const true))
) )
|> List.unzip3 |> Logical.of_array
in in
let open OCamlR_base in let results = Dataframe.create columns in
let auc_estimates = Dataframe.create [ let data = List_.create [
"method", `Character (Character.of_list labels) ; Some "results", Dataframe.to_sexp results ;
"estimate", `Numeric (Numeric.of_list estimates) ; Some "oracle", Logical.to_sexp oracle ;
"lower_bound", `Numeric (Numeric.of_list lower_bounds) ; Some "ancestral_profiles", Numeric.Matrix.to_sexp ancestral_profiles ;
"upper_bound", `Numeric (Numeric.of_list upper_bounds) ; Some "convergent_profiles", Numeric.Matrix.to_sexp convergent_profiles ;
] Some "ancestral_counts", Integer.Matrix.to_sexp ancestral_counts ;
in Some "convergent_counts", Integer.Matrix.to_sexp convergent_counts ;
let oracle = Some "auc_estimates", Dataframe.to_sexp auc_estimates ;
Array.( ]
append in
(map sim.h0_profiles ~f:(Fn.const false)) saveRDS ~file:[%dest] (List_.to_sexp data)
(map sim.ha_profiles ~f:(Fn.const true))
) (* param exploration for SMBE paper *)
|> Logical.of_array (* type branch_scale_t = float *)
in let branch_scale_range = [ 1.; 3.; 6.; 9. ]
let results = Dataframe.create columns in
let data = List_.create [ type gBGC_t = Global of float | Convergent of float * float
Some "results", Dataframe.to_sexp results ; let gBGC_range =
Some "oracle", Logical.to_sexp oracle ; let range = [ 0.; 2.; 4.; 8.; 16.; 32.; 64.; ] in
Some "ancestral_profiles", Numeric.Matrix.to_sexp ancestral_profiles ; List.concat [
Some "convergent_profiles", Numeric.Matrix.to_sexp convergent_profiles ; (* List.map ~f:(fun x -> Global x) range ; *)
Some "ancestral_counts", Integer.Matrix.to_sexp ancestral_counts ; List.map ~f:(fun x -> Convergent (0., x)) range ;
Some "convergent_counts", Integer.Matrix.to_sexp convergent_counts ;
Some "auc_estimates", Dataframe.to_sexp auc_estimates ;
] ]
in
saveRDS ~file:[%dest] (List_.to_sexp data)
(* param exploration for SMBE paper *)
(* type branch_scale_t = float *)
let branch_scale_range = [ 1.; 3.; 6.; 9. ]
type gBGC_t = Global of float | Convergent of float * float
let gBGC_range =
let range = [ 0.; 2.; 4.; 8.; 16.; 32.; 64.; ] in
List.concat [
(* List.map ~f:(fun x -> Global x) range ; *)
List.map ~f:(fun x -> Convergent (0., x)) range ;
]
type param_t = float * gBGC_t type param_t = float * gBGC_t
let explore_params ~(f: param_t -> _) = let explore_params ~(f: param_t -> _) =
List.map branch_scale_range ~f:(fun (bf:float) -> List.map branch_scale_range ~f:(fun (bf:float) ->
List.map gBGC_range ~f:(fun (gbgc:gBGC_t) -> ((bf, gbgc), f (bf, gbgc))) List.map gBGC_range ~f:(fun (gbgc:gBGC_t) -> ((bf, gbgc), f (bf, gbgc)))
) |> List.concat ) |> List.concat
let simu_of_param ?n_h0:(n_h0=50) (p: param_t) = let simu_of_param ?n_h0:(n_h0=50) (p: param_t) =
let bf, gbgc = p in let bf, gbgc = p in
mutsel Mutsel_query.make
~tree:(NHX (Workflow.input "example/trees_analyses/C4AmaranthaceaePolyroot.nhx")) ~tree:(NHX (Workflow.input "example/trees_analyses/C4AmaranthaceaePolyroot.nhx"))
~profiles:"example/aa_fitness/263SelectedProfiles.tsv" ~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~branch_scale:bf ~branch_scale:bf
~gBGC:(match gbgc with ~gBGC:(match gbgc with
| Convergent (a,c) -> (a, c) | Convergent (a,c) -> (a, c)
| Global g -> (g, g)) | Global g -> (g, g))
~ne_s:(4., 4.) ~ne_s:(4., 4.)
~n_ha:0 ~n_ha:0
~n_h0 ~n_h0
() ()
let filter_results ~(f: _ -> bool) (results: (param_t * _) list) = let filter_results ~(f: _ -> bool) (results: (param_t * _) list) =
List.filter results ~f:(fun (_, x) -> f x) List.filter results ~f:(fun (_, x) -> f x)
type record_t = { type record_t = {
gc_means_ancestral: ([`first | `second | `third] * float) list ; gc_means_ancestral: ([`first | `second | `third] * float) list ;
gc_means_convergent: ([`first | `second | `third] * float) list gc_means_convergent: ([`first | `second | `third] * float) list
}
let%workflow record_of_simu s =
let tree = [%path tree ~branch_length_unit:`Nucleotide s] in
let nucleotide_alignment = [%path nucleotide_alignment s] in
let gc_mean_from_simu ~pos =
Alistats.nucleotide_fasta_gc_ac ~pos tree nucleotide_alignment
in let (m1_a, m1_c), (m2_a, m2_c), (m3_a, m3_c) =
gc_mean_from_simu ~pos:`first,
gc_mean_from_simu ~pos:`second,
gc_mean_from_simu ~pos:`third
in {
gc_means_ancestral = [(`first, m1_a.gc_mean) ; (`second, m2_a.gc_mean) ; (`third, m3_a.gc_mean)] ;
gc_means_convergent = [(`first, m1_c.gc_mean) ; (`second, m2_c.gc_mean) ; (`third, m3_c.gc_mean)]
} }
let expected_gc = [ let%workflow record_of_simu s =
(`first, (0.3326, 0.5157, 0.5589, 0.6080, 0.8621)) ; let tree = [%path tree ~branch_length_unit:`Nucleotide s] in
(`second, (0.2102, 0.3784, 0.4160, 0.4626, 0.7499)) ; let nucleotide_alignment = [%path nucleotide_alignment s] in
(`third, (0.2242, 0.4852, 0.6274, 0.7358, 0.9575)) let gc_mean_from_simu ~pos =
] Alistats.nucleotide_fasta_gc_ac ~pos tree nucleotide_alignment
in let (m1_a, m1_c), (m2_a, m2_c), (m3_a, m3_c) =
let quartile (min_, fq_, mean_, tq_, max_) x = gc_mean_from_simu ~pos:`first,
match Float.( x < min_, x < fq_, x < mean_, x < tq_, x < max_) with gc_mean_from_simu ~pos:`second,
| true, _, _, _, _ -> `below_min gc_mean_from_simu ~pos:`third
| false, true, _, _, _ -> `first in {
| _, false, true, _, _ -> `second gc_means_ancestral = [(`first, m1_a.gc_mean) ; (`second, m2_a.gc_mean) ; (`third, m3_a.gc_mean)] ;
| _, _, false, true, _ -> `third gc_means_convergent = [(`first, m1_c.gc_mean) ; (`second, m2_c.gc_mean) ; (`third, m3_c.gc_mean)]