Commit 9fee54e9 authored by Louis Duchemin's avatar Louis Duchemin
Browse files

Refactoring Mutsel simulator

parent f02d2187
......@@ -22,15 +22,18 @@ let simulation ?branch_scale ?seed ~n_h0 ~n_ha ~ne_s:(ne_s0, ne_s1) ~gBGC:(gBGC0
|> to_fitness
|> Array.map ~f: Amino_acid.Vector.of_array_exn
) in
Codepitk.Mutsel_simulator.Site_independent.simulation
?branch_scale ?seed ~n_h0 ~n_ha ~gBGC:(gBGC0, gBGC1)
let tree = Option.value_map branch_scale ~default:tree
~f:(fun scale -> Codepitk.Mutsel_simulator.tree_rescale_branch_length tree ~scale)
in
Codepitk.Mutsel_simulator.Site_independent.make
?seed ~n_h0 ~n_ha ~gBGC:(gBGC0, gBGC1)
~ne_s:(ne_s0, ne_s1) ~tree ~fitness_profiles ()
in
Workflow.plugin ~descr:"simulator.simulation" f
let alignment_of_simulation sim =
let f = fun%workflow dest ->
let sim : Codepitk.Mutsel_simulator.Site_independent.simulation =
let sim : Codepitk.Mutsel_simulator.Site_independent.t =
[%eval sim]
in
let species_name = Phylogenetics.Tree.leaves sim.tree in
......@@ -45,7 +48,7 @@ let alignment_of_simulation sim =
let fitness_histogram sim =
let f = fun%workflow dest ->
let open Phylogenetics in
let sim : Codepitk.Mutsel_simulator.Site_independent.simulation =
let sim : Codepitk.Mutsel_simulator.Site_independent.t =
[%eval sim]
in
let params = Array.append sim.h0_params sim.ha_params in
......
......@@ -12,12 +12,12 @@ val simulation :
tree:nhx file ->
fitness_profiles:#text file ->
unit ->
Mutsel_simulator.Site_independent.simulation workflow
Mutsel_simulator.Site_independent.t workflow
val alignment_of_simulation :
Mutsel_simulator.Site_independent.simulation workflow ->
Mutsel_simulator.Site_independent.t workflow ->
nucleotide_fasta file
val fitness_histogram :
Mutsel_simulator.Site_independent.simulation workflow ->
Mutsel_simulator.Site_independent.t workflow ->
pdf file
......@@ -127,7 +127,7 @@ module Mutsel = struct
let open Codepitk in
let open Codepitk.Mutsel_simulator.Site_independent in
let module Codon = Codon.Universal_genetic_code.NS in
let sim : simulation = [%eval simulation q] in
let sim : t = [%eval simulation q] in
let result_paths = [%eval Bistro.Workflow.path_list (List.map methods ~f:(fun f -> f q))] in
let results =
List.map result_paths ~f:Cpt.of_file
......
......@@ -39,7 +39,7 @@ module Mutsel : sig
val simulation :
query ->
Codepitk.Mutsel_simulator.Site_independent.simulation workflow
Codepitk.Mutsel_simulator.Site_independent.t workflow
type benchmark = {
......
......@@ -8,61 +8,37 @@ let tree_rescale_branch_length t ~scale =
{ bi with length = bi.length *. scale }
)
module Site_independent = struct
include Simulator.Make(Mutsel.NSCodon)(Mutsel)(Branch_info)
let rescale_fitness beta = Amino_acid.Vector.map ~f:(( *. ) beta)
let alignment tree ~root param =
List.init (Array.length root) ~f:(fun i ->
site_gillespie_direct tree ~root:root.(i) ~param:(param i)
|> Tree.leaves
|> List.map ~f:Codon.Universal_genetic_code.NS.to_string
)
|> List.transpose_exn
|> List.map ~f:String.concat
|> List.map ~f:Dna.of_string_unsafe
type simulation = {
sequences : Dna.t list ;
fitness_profiles : Amino_acid.vector array ;
tree : Convergence_tree.t ;
h0_profiles : int array ;
ha_profiles : (int * int) array ;
h0_params : (Mutsel.param * Mutsel.param) array ;
ha_params : (Mutsel.param * Mutsel.param) array ;
base_param : Mutsel.param ;
gBGC : float * float ;
ne_s : float * float ;
}
let most_probable_aa (pref : Amino_acid.vector) =
let most_probable_aa (pref : Amino_acid.vector) =
let pref = Amino_acid.Vector.to_array pref in
let arr = Array.mapi pref ~f:(fun i x -> x, i) in
match Array.max_elt ~compare:Poly.compare arr with
| None -> assert false
| Some (_, i) -> i
let simulation ?branch_scale ?seed ~n_h0 ~n_ha ~ne_s:(ne_s0, ne_s1) ~gBGC:(gBGC0, gBGC1) ~tree ~fitness_profiles () =
let () = Option.iter ~f:Random.init seed in
let rescale_fitness beta = Amino_acid.Vector.map ~f:(( *. ) beta) in
let base_param =
let p = Mutsel.random_param ~alpha_nucleotide:10. ~alpha_fitness:0.1 in
{ p with omega = 1. }
in
let random_profile () = Random.int (Array.length fitness_profiles) in
let most_probable_aa = Array.map fitness_profiles ~f:most_probable_aa in
let h0_profiles = Array.init n_h0 ~f:(fun _ -> random_profile ()) in
let ha_profiles = Array.init n_ha ~f:(fun _ ->
let make_profiles n_h0 n_ha fitness_profiles =
let random_profile () = Random.int (Array.length fitness_profiles)
and most_probable_aa = Array.map fitness_profiles ~f:most_probable_aa in
let h0_profiles = Array.init n_h0 ~f:(fun _ -> random_profile ())
and ha_profiles = Array.init n_ha ~f:(fun _ ->
let rec loop () =
let p = random_profile () in
let q = random_profile () in
let p = random_profile ()
and q = random_profile () in
if p = q || most_probable_aa.(p) = most_probable_aa.(q) then loop ()
else (p, q)
in
loop ()
)
in
) in
h0_profiles, ha_profiles
let make_params ~gBGC:(gBGC0, gBGC1) ~ne_s:(ne_s0, ne_s1) ~fitness_profiles ~h0_profiles ~ha_profiles =
let base_param = Mutsel.random_param ~alpha_nucleotide:10. ~alpha_fitness:0.1 in
let make_param ~gBGC ~ne_s p =
{ base_param with gBGC ; scaled_fitness = rescale_fitness ne_s fitness_profiles.(p) }
{
base_param with
gBGC; scaled_fitness = rescale_fitness ne_s fitness_profiles.(p)
}
in
let make_param_pair p q =
make_param ~gBGC:gBGC0 ~ne_s:ne_s0 p,
......@@ -71,12 +47,15 @@ module Site_independent = struct
let h0_params = Array.map h0_profiles ~f:(fun profile ->
make_param_pair profile profile
)
in
let ha_params = Array.map ha_profiles ~f:(fun (p, q) ->
and ha_params = Array.map ha_profiles ~f:(fun (p,q) ->
make_param_pair p q
)
in
let params_of_cond i cond =
(base_param, h0_params, ha_params)
let params_of_cond h0_params ha_params =
let n_h0 = Array.length h0_params in
fun i cond ->
let p, q =
if i < n_h0 then h0_params.(i)
else ha_params.(i - n_h0)
......@@ -84,10 +63,41 @@ module Site_independent = struct
match cond with
| `Ancestral -> p
| `Convergent -> q
in
let params i bi =
params_of_cond i (Branch_info.condition bi)
in
module Site_independent = struct
module Sim = Simulator.Make(Mutsel.NSCodon)(Mutsel)(Branch_info)
type t = {
sequences : Dna.t list ;
fitness_profiles : Amino_acid.vector array ;
tree : Convergence_tree.t ;
h0_profiles : int array ;
ha_profiles : (int * int) array ;
h0_params : (Mutsel.param * Mutsel.param) array ;
ha_params : (Mutsel.param * Mutsel.param) array ;
base_param : Mutsel.param ;
gBGC : float * float ;
ne_s : float * float ;
}
let alignment tree ~root param =
List.init (Array.length root) ~f:(fun i ->
Sim.site_gillespie_direct tree ~root:root.(i) ~param:(param i)
|> Tree.leaves
|> List.map ~f:Codon.Universal_genetic_code.NS.to_string
)
|> List.transpose_exn
|> List.map ~f:String.concat
|> List.map ~f:Dna.of_string_unsafe
let make ?seed ~n_h0 ~n_ha ~ne_s ~gBGC ~tree ~fitness_profiles () =
let () = Option.iter ~f:Random.init seed in
let h0_profiles, ha_profiles = make_profiles n_h0 n_ha fitness_profiles in
let base_param, h0_params, ha_params =
make_params ~gBGC ~ne_s ~fitness_profiles ~h0_profiles ~ha_profiles in
let params_of_cond = params_of_cond h0_params ha_params in
let params i bi = params_of_cond i (Branch_info.condition bi) in
let root_condition = `Ancestral in
let root_dists = Array.init (n_h0 + n_ha) ~f:(fun i ->
Mutsel.stationary_distribution (params_of_cond i root_condition)
......@@ -95,14 +105,58 @@ module Site_independent = struct
|> Mutsel.NSCodon.Table.of_array_exn
)
in
let root = hmm0 ~len:(n_h0 + n_ha) ~dist:(Array.get root_dists) in
let rescaled_tree =
Option.value_map branch_scale ~default:tree ~f:(fun scale ->
tree_rescale_branch_length tree ~scale
let root = Sim.hmm0 ~len:(n_h0 + n_ha) ~dist:(Array.get root_dists) in
let sequences = alignment tree ~root params in
{ sequences ; fitness_profiles ; h0_profiles ; ha_profiles ;
gBGC ; ne_s ; h0_params ; ha_params ; base_param ; tree}
end
module With_CpG_hypermutability = struct
module Sim = Mutsel_simulator_cpg.Make(Branch_info)
type t = {
sequences : Dna.t list ;
fitness_profiles : Amino_acid.vector array ;
tree : Convergence_tree.t ;
h0_profiles : int array ;
ha_profiles : (int * int) array ;
gBGC : float * float ;
ne_s : float * float ;
rate_CpG : float ;
}
let make ~rng ~n_h0 ~n_ha ~ne_s ~gBGC ~rate_CpG ~(tree:Convergence_tree.t) ~fitness_profiles () =
let h0_profiles, ha_profiles = make_profiles n_h0 n_ha fitness_profiles in
let _, h0_params, ha_params =
make_params ~gBGC ~ne_s ~fitness_profiles ~h0_profiles ~ha_profiles in
let params_of_cond = params_of_cond h0_params ha_params in
let param_factory state pos branch =
let module EM = Mutsel_simulator_cpg.Evolution_model in
let left_codon, right_codon = EM.flanking_codons state pos in
let context_CpG = EM.context_CpG_of_codons ?left_codon ?right_codon () in
let mutsel = params_of_cond pos (Branch_info.condition branch) in
EM.{ mutsel ; rate_CpG ; context_CpG }
in
(* FIXME : this is stationary dist for Mutsel without CpG *)
let root_dists = Array.init (n_h0 + n_ha) ~f:(fun i ->
Mutsel.stationary_distribution (params_of_cond i `Ancestral)
|> Mutsel.NSCodon.Vector.to_array
|> Mutsel.NSCodon.Table.of_array_exn
)
in
let root = Array.map root_dists ~f:(fun codon_dist ->
let dist = Gsl.Randist.discrete_preproc (codon_dist :> float array) in
Gsl.Randist.discrete rng dist
|> Mutsel.NSCodon.of_int_exn
)
in
let sequences = alignment rescaled_tree ~root params in
let sequences =
Sim.sequence_gillespie_direct rng tree ~root ~param:param_factory
|> Tree.leaves
|> List.map ~f:Dna.of_codons
in
{ sequences ; fitness_profiles ; h0_profiles ; ha_profiles ;
gBGC = gBGC0, gBGC1 ; ne_s = ne_s0, ne_s1 ;
h0_params ; ha_params ; base_param ; tree}
gBGC ; ne_s ; tree ; rate_CpG}
end
\ No newline at end of file
open Phylogenetics
val tree_rescale_branch_length : Convergence_tree.t -> scale:float -> Convergence_tree.t
module Site_independent : sig
type simulation = {
type t = {
sequences : Dna.t list ;
fitness_profiles : Amino_acid.vector array ;
tree : Convergence_tree.t ;
......@@ -14,8 +17,7 @@ module Site_independent : sig
ne_s : float * float ;
}
val simulation :
?branch_scale:float ->
val make :
?seed:int ->
n_h0:int ->
n_ha:int ->
......@@ -24,5 +26,31 @@ module Site_independent : sig
tree:Convergence_tree.t ->
fitness_profiles:Amino_acid.vector array ->
unit ->
simulation
t
end
module With_CpG_hypermutability : sig
type t = {
sequences : Dna.t list ;
fitness_profiles : Amino_acid.vector array ;
tree : Convergence_tree.t ;
h0_profiles : int array ;
ha_profiles : (int * int) array ;
gBGC : float * float ;
ne_s : float * float ;
rate_CpG : float ;
}
val make :
rng:Gsl.Rng.t ->
n_h0:int ->
n_ha:int ->
ne_s:float * float ->
gBGC:float * float ->
rate_CpG:float ->
tree:Convergence_tree.t ->
fitness_profiles:Amino_acid.vector array ->
unit ->
t
end
......@@ -11,15 +11,12 @@ module Evolution_model = struct
}
type param = {
nucleotide_rates : Rate_matrix.Nucleotide.t ;
nucleotide_stat_dist : Nucleotide.vector ;
omega : float ; (* dN/dS *)
scaled_fitness : Amino_acid.vector ;
gBGC : float ;
mutsel: Mutsel.param;
rate_CpG: float;
context_CpG : context_CpG;
}
let inspect_codon codon =
let x, y, z = NSCodon.nucleotides codon in
Nucleotide.(inspect x, inspect y, inspect z)
......@@ -38,7 +35,7 @@ module Evolution_model = struct
Amino_acid.Vector.init (fun _ -> 1. /. float Amino_acid.card)
|> fitness_of_profile
let make_param ~gBGC ~scaled_fitness ~omega ~nucleotide_rates ~nucleotide_stat_dist ~rate_CpG state pos _branch =
let flanking_codons state pos =
let n = Array.length state in
let left_codon =
if 0 <= pos - 1 && pos - 1 < n
......@@ -47,29 +44,31 @@ module Evolution_model = struct
and right_codon =
if pos + 1 < n then Some state.(pos + 1) else None
in
left_codon, right_codon
let make_param ~mutsel ~rate_CpG state pos =
let left_codon, right_codon = flanking_codons state pos in
let context_CpG = context_CpG_of_codons ?left_codon ?right_codon () in
{
nucleotide_rates;
nucleotide_stat_dist;
omega;
scaled_fitness;
gBGC;
rate_CpG;
context_CpG;
}
{ mutsel; rate_CpG; context_CpG }
let flat_param ~rate_CpG =
let pi = Nucleotide.flat_profile () in
let rho = Linear_algebra.Lacaml.Vector.init 6 ~f:(fun _ -> 1. /. 6.) in
let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in
make_param ~gBGC:0. ~scaled_fitness:(flat_fitness ()) ~omega:1. ~nucleotide_rates ~nucleotide_stat_dist:pi ~rate_CpG
let mutsel = Mutsel.{
gBGC = 0.;
omega = 1. ;
scaled_fitness = flat_fitness ();
nucleotide_rates ;
nucleotide_stat_dist = pi
}
in
make_param ~mutsel ~rate_CpG
let random_param ~alpha_nuc ~alpha_fitness ~rate_CpG =
let pi = Nucleotide.random_profile alpha_nuc in
let rho = Utils.random_profile 6 in
let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in
let scaled_fitness = Amino_acid.random_profile alpha_fitness |> fitness_of_profile in
make_param ~gBGC:0. ~scaled_fitness ~omega:1. ~nucleotide_rates ~nucleotide_stat_dist:pi ~rate_CpG
let random_param ~alpha_nucleotide ~alpha_fitness ~rate_CpG =
let mutsel = Mutsel.random_param ~alpha_nucleotide ~alpha_fitness in
make_param ~mutsel ~rate_CpG
let rate_increase_CpG (pos, x_a, x_b) context codon =
let open Nucleotide in
......@@ -89,7 +88,7 @@ module Evolution_model = struct
else if delta < - 50. then 0.
else delta / (1. - exp (- delta))
let rate_matrix { nucleotide_rates ; omega ; scaled_fitness = _F_ ; gBGC ; rate_CpG ; context_CpG; _ } =
let rate_matrix { mutsel = { nucleotide_rates ; omega ; scaled_fitness = _F_ ; gBGC ;_ } ; rate_CpG ; context_CpG } =
let nuc_rates = (nucleotide_rates :> Nucleotide.matrix) in
Mutsel.NSCodon_rate_matrix.make (fun p q ->
match NSCodon.neighbours p q with
......@@ -114,7 +113,7 @@ module Evolution_model = struct
| None -> 0.
)
let rate_vector { nucleotide_rates ; omega ; scaled_fitness = _F_ ; gBGC ; rate_CpG ; context_CpG; _ } p =
let rate_vector { mutsel = { nucleotide_rates ; omega ; scaled_fitness = _F_ ; gBGC ; _ } ; rate_CpG ; context_CpG } p =
let nuc_rates = (nucleotide_rates :> Nucleotide.matrix) in
NSCodon.Table.init (fun q ->
if NSCodon.equal p q then 0.
......@@ -225,8 +224,8 @@ let demo seq_length ~rate_CpG ~branch_length =
let codon_seed = Owl.Stats.uniform_int_rvs ~a:0 ~b:(NSCodon.card - 1) in
NSCodon.of_int_exn codon_seed
) in
let param seq i b = Evolution_model.flat_param ~rate_CpG seq i b in
let tree = Tree.(node () (List1.singleton (branch branch_length (leaf ())))) in
let param seq i _branch = Evolution_model.flat_param ~rate_CpG seq i in
let sim = M.sequence_gillespie_direct rng tree ~root:root_sequence ~param in
let simulated_sequence = Tree.leaves sim |> List.hd_exn in
let counts = Evolution_model.dinucleotide_ratios simulated_sequence in
......
......@@ -9,11 +9,7 @@ module Evolution_model : sig
}
type param = {
nucleotide_rates : Rate_matrix.Nucleotide.t ;
nucleotide_stat_dist : Nucleotide.vector ;
omega : float ; (* dN/dS *)
scaled_fitness : Amino_acid.vector ;
gBGC : float ;
mutsel: Mutsel.param;
rate_CpG: float;
context_CpG: context_CpG;
}
......@@ -22,18 +18,20 @@ module Evolution_model : sig
val stationary_distribution: param -> NSCodon.vector
val flanking_codons: NSCodon.t array -> int -> NSCodon.t option * NSCodon.t option
val context_CpG_of_codons: ?left_codon:NSCodon.t -> ?right_codon:NSCodon.t -> unit -> context_CpG
val make_param:
gBGC:float ->
scaled_fitness:Amino_acid.vector ->
omega:float ->
nucleotide_rates:Nucleotide.matrix ->
nucleotide_stat_dist:Nucleotide.vector ->
mutsel:Mutsel.param ->
rate_CpG:float ->
NSCodon.t array -> int -> 'a -> param
NSCodon.t array -> int -> param
val flat_param: rate_CpG:float -> NSCodon.t array -> int -> 'a -> param
val flat_param: rate_CpG:float -> NSCodon.t array -> int -> param
val random_param: alpha_nuc:float -> alpha_fitness:float -> rate_CpG:float -> NSCodon.t array -> int -> 'a -> param
val random_param:
alpha_nucleotide:float -> alpha_fitness:float -> rate_CpG:float ->
NSCodon.t array -> int -> param
val dinucleotide_ratios: NSCodon.t array -> float array array
end
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment