Commit 8f248d0d authored by Philippe Veber's avatar Philippe Veber
Browse files

tk/Mutsel_simulator_cpg: initial speed assessment

using (debugged) implementation in phylogenetics, perform simulation
for 500 to 2000 sites. Quadratic complexity is expected here, to
observe it I use the log transform from

	t = K n^2

to
	log t = 2 log n + log K

Running times are only nearly quadratic:

> df <- data.frame(n = c(500,1000,1300,2000), t = c(5.15,18.48,28.63,68.07)) ; fit <- lm(log2(t) ~ log2(n), data = df) ; summary(fit)

Call:
lm(formula = log2(t) ~ log2(n), data = df)

Residuals:
        1         2         3         4
 0.015095  0.007796 -0.061122  0.038231

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) -14.24278    0.36390  -39.14 0.000652 ***
log2(n)       1.85062    0.03608   51.30 0.000380 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.05237 on 2 degrees of freedom
Multiple R-squared:  0.9992,	Adjusted R-squared:  0.9989
F-statistic:  2631 on 1 and 2 DF,  p-value: 0.0003798

Using a quadratic fit is nevertheless not so bad:

> df <- data.frame(n = c(500,1000,1300,2000), t = c(5.15,18.48,28.63,68.07)) ; fit <- lm(t ~ I(n^2), data = df) ; summary(fit)

Call:
lm(formula = t ~ I(n^2), data = df)

Residuals:
      1       2       3       4
-0.1138  0.6815 -0.7004  0.1327

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.086e+00  5.581e-01   1.945 0.191208
I(n^2)      1.671e-05  2.501e-07  66.822 0.000224 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.702 on 2 degrees of freedom
Multiple R-squared:  0.9996,	Adjusted R-squared:  0.9993
F-statistic:  4465 on 1 and 2 DF,  p-value: 0.0002239
parent 1e38b720
......@@ -58,16 +58,16 @@ module Evolution_model = struct
context_CpG;
}
let flat_param ~rate_CpG =
let flat_param ~rate_CpG =
let pi = Nucleotide.flat_profile () in
let rho = Linear_algebra.Lacaml.Vector.init 6 ~f:(fun _ -> 1. /. 6.) in
let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in
make_param ~gBGC:0. ~scaled_fitness:(flat_fitness ()) ~omega:1. ~nucleotide_rates ~nucleotide_stat_dist:pi ~rate_CpG
let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in
make_param ~gBGC:0. ~scaled_fitness:(flat_fitness ()) ~omega:1. ~nucleotide_rates ~nucleotide_stat_dist:pi ~rate_CpG
let random_param ~alpha_nuc ~alpha_fitness ~rate_CpG =
let pi = Nucleotide.random_profile alpha_nuc in
let rho = Utils.random_profile 6 in
let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in
let random_param ~alpha_nuc ~alpha_fitness ~rate_CpG =
let pi = Nucleotide.random_profile alpha_nuc in
let rho = Utils.random_profile 6 in
let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in
let scaled_fitness = Amino_acid.random_profile alpha_fitness |> fitness_of_profile in
make_param ~gBGC:0. ~scaled_fitness ~omega:1. ~nucleotide_rates ~nucleotide_stat_dist:pi ~rate_CpG
......@@ -118,7 +118,7 @@ module Evolution_model = struct
let nucleotides_of_codons codons =
Array.concat_map codons ~f:(fun codon ->
let x, y, z = NSCodon.nucleotides codon in
let x, y, z = NSCodon.nucleotides codon in
[|x;y;z|]
)
......@@ -127,15 +127,15 @@ module Evolution_model = struct
Array.count seq ~f:(Nucleotide.equal nuc)
)
let dinucleotide_counts seq =
let observed = Array.init Nucleotide.card ~f:(fun _ ->
let dinucleotide_counts seq =
let observed = Array.init Nucleotide.card ~f:(fun _ ->
Array.init Nucleotide.card ~f:(fun _ -> 0)
) in
Array.iteri seq ~f:(fun pos nuc ->
) in
Array.iteri seq ~f:(fun pos nuc ->
if (pos + 1) < Array.length seq then
let next = seq.(pos+1) in
Nucleotide.(
observed.(to_int nuc).(to_int next) <-
observed.(to_int nuc).(to_int next) <-
observed.(to_int nuc).(to_int next) + 1
));
observed
......@@ -144,54 +144,97 @@ module Evolution_model = struct
let nucleotide_sequence = nucleotides_of_codons codon_sequence in
let single_counts = nucleotide_counts nucleotide_sequence in
let dinuc_counts = dinucleotide_counts nucleotide_sequence in
Array.mapi dinuc_counts ~f:(fun fst_nuc counts ->
Array.mapi counts ~f:(fun snd_nuc observed ->
let expected =
Array.mapi dinuc_counts ~f:(fun fst_nuc counts ->
Array.mapi counts ~f:(fun snd_nuc observed ->
let expected =
float_of_int Nucleotide.Table.(
get single_counts (Nucleotide.of_int_exn fst_nuc) *
get single_counts (Nucleotide.of_int_exn fst_nuc) *
get single_counts (Nucleotide.of_int_exn snd_nuc)
) /.
) /.
float_of_int (Array.length nucleotide_sequence)
in
float_of_int observed /. expected)
)
end
module Make(BI: Simulator.Branch_info) = Simulator.Make(NSCodon)(Evolution_model)(BI)
end
module Make(BI: Simulator.Branch_info) = struct
include Simulator.Make(NSCodon)(Evolution_model)(BI)
let rng = Gsl.Rng.(make (default ()))
let symbol_sample v =
Gsl.Randist.(discrete rng (discrete_preproc v))
|> NSCodon.of_int_exn
let memo f =
let table = Caml.Hashtbl.create 253 in
fun x ->
match Caml.Hashtbl.find table x with
| y -> y
| exception Caml.Not_found ->
let y = f x in
Caml.Hashtbl.add table x y ;
y
let sequence_gillespie_direct tree ~root ~param =
let codon_rates = memo Evolution_model.rate_matrix in
Tree.propagate tree ~init:root ~node:Fn.const ~leaf:Fn.const ~branch:(fun seq b ->
let rec loop state remaining_time =
let rate_matrices = Array.mapi state ~f:(fun i _ -> codon_rates (param state i b)) in
let rates =
Array.mapi rate_matrices ~f:(fun i mat ->
NSCodon.Table.init (fun m ->
if NSCodon.equal m state.(i) then 0. else (mat :> NSCodon.matrix).NSCodon.%{state.(i), m}
)
) in
let pos_rates = Array.map rates ~f:(fun r -> Owl.Stats.sum (r :> float array)) in
let total_rate = Array.reduce_exn pos_rates ~f:( +. ) in
let tau = Owl.Stats.exponential_rvs ~lambda:total_rate in
if Float.(tau > remaining_time) then state
else
let pos = Owl.Stats.categorical_rvs pos_rates in
let next_letter = symbol_sample (rates.(pos) :> float array) in
let next_state =
let t = Array.copy state in
t.(pos) <- next_letter ;
t
in
loop next_state Float.(remaining_time - tau)
in
loop seq (BI.length b)
)
let demo seq_length ~rate_CpG ~branch_length =
let demo seq_length ~rate_CpG ~branch_length =
let module Branch_info = struct
type t = Newick.branch_info
let length branch = Option.value_exn Newick.(branch.length)
end
end
in
let module M = Make(Branch_info) in
let root_sequence = Array.init seq_length ~f:(fun _ ->
let root_sequence = Array.init seq_length ~f:(fun _ ->
let codon_seed = Owl.Stats.uniform_int_rvs ~a:0 ~b:(NSCodon.card - 1) in
match NSCodon.of_int codon_seed with
| Some codon -> codon
| None -> sprintf "Cannot create codon from integer : %d" codon_seed |> failwith
NSCodon.of_int_exn codon_seed
) in
let param = Evolution_model.random_param ~alpha_nuc:0.5 ~alpha_fitness:0.5 ~rate_CpG in
let tree =
let param seq i b = Evolution_model.flat_param ~rate_CpG seq i b in
let tree =
sprintf "(leaf:%f);" branch_length
|> Newick.from_string_exn
|> Newick.with_inner_tree ~f:(fun tree ->
M.sequence_gillespie_direct tree ~root:root_sequence ~param
)
)
in
let simulated_sequence = Tree.leaves tree |> List.hd_exn in
let simulated_sequence = Tree.leaves tree |> List.hd_exn in
let counts = Evolution_model.dinucleotide_ratios simulated_sequence in
Array.init Nucleotide.card ~f:(fun i ->
Array.init Nucleotide.card ~f:(fun i ->
let nuc = Nucleotide.of_int_exn i |> Nucleotide.to_char |> Char.to_string in
"\t" ^ nuc
)
|> String.concat_array ~sep:"\t"
)
|> String.concat_array ~sep:"\t"
|> print_endline ;
Array.iteri counts ~f:(fun i nuc_ratios ->
Array.iteri counts ~f:(fun i nuc_ratios ->
let nuc = Nucleotide.of_int_exn i |> Nucleotide.to_char |> Char.to_string in
print_string nuc;
Array.iteri nuc_ratios ~f:(fun _ ratio ->
Array.iteri nuc_ratios ~f:(fun _ ratio ->
print_string ("\t" ^ string_of_float ratio)
);
Out_channel.newline stdout;
......
......@@ -42,4 +42,4 @@ module Make(BI: Simulator.Branch_info) : sig
include module type of Simulator.Make(NSCodon)(Evolution_model)(BI)
end
val demo : int -> rate_CpG:float -> branch_length:float -> unit
\ No newline at end of file
val demo : int -> rate_CpG:float -> branch_length:float -> unit
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment