### tk/Mutsel_simulator_cpg: initial speed assessment

```using (debugged) implementation in phylogenetics, perform simulation
for 500 to 2000 sites. Quadratic complexity is expected here, to
observe it I use the log transform from

t = K n^2

to
log t = 2 log n + log K

Running times are only nearly quadratic:

> df <- data.frame(n = c(500,1000,1300,2000), t = c(5.15,18.48,28.63,68.07)) ; fit <- lm(log2(t) ~ log2(n), data = df) ; summary(fit)

Call:
lm(formula = log2(t) ~ log2(n), data = df)

Residuals:
1         2         3         4
0.015095  0.007796 -0.061122  0.038231

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -14.24278    0.36390  -39.14 0.000652 ***
log2(n)       1.85062    0.03608   51.30 0.000380 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.05237 on 2 degrees of freedom
Multiple R-squared:  0.9992,	Adjusted R-squared:  0.9989
F-statistic:  2631 on 1 and 2 DF,  p-value: 0.0003798

> df <- data.frame(n = c(500,1000,1300,2000), t = c(5.15,18.48,28.63,68.07)) ; fit <- lm(t ~ I(n^2), data = df) ; summary(fit)

Call:
lm(formula = t ~ I(n^2), data = df)

Residuals:
1       2       3       4
-0.1138  0.6815 -0.7004  0.1327

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.086e+00  5.581e-01   1.945 0.191208
I(n^2)      1.671e-05  2.501e-07  66.822 0.000224 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.702 on 2 degrees of freedom
Multiple R-squared:  0.9996,	Adjusted R-squared:  0.9993
F-statistic:  4465 on 1 and 2 DF,  p-value: 0.0002239```
parent 1e38b720
 ... ... @@ -58,16 +58,16 @@ module Evolution_model = struct context_CpG; } let flat_param ~rate_CpG = let flat_param ~rate_CpG = let pi = Nucleotide.flat_profile () in let rho = Linear_algebra.Lacaml.Vector.init 6 ~f:(fun _ -> 1. /. 6.) in let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in make_param ~gBGC:0. ~scaled_fitness:(flat_fitness ()) ~omega:1. ~nucleotide_rates ~nucleotide_stat_dist:pi ~rate_CpG let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in make_param ~gBGC:0. ~scaled_fitness:(flat_fitness ()) ~omega:1. ~nucleotide_rates ~nucleotide_stat_dist:pi ~rate_CpG let random_param ~alpha_nuc ~alpha_fitness ~rate_CpG = let pi = Nucleotide.random_profile alpha_nuc in let rho = Utils.random_profile 6 in let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in let random_param ~alpha_nuc ~alpha_fitness ~rate_CpG = let pi = Nucleotide.random_profile alpha_nuc in let rho = Utils.random_profile 6 in let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in let scaled_fitness = Amino_acid.random_profile alpha_fitness |> fitness_of_profile in make_param ~gBGC:0. ~scaled_fitness ~omega:1. ~nucleotide_rates ~nucleotide_stat_dist:pi ~rate_CpG ... ... @@ -118,7 +118,7 @@ module Evolution_model = struct let nucleotides_of_codons codons = Array.concat_map codons ~f:(fun codon -> let x, y, z = NSCodon.nucleotides codon in let x, y, z = NSCodon.nucleotides codon in [|x;y;z|] ) ... ... @@ -127,15 +127,15 @@ module Evolution_model = struct Array.count seq ~f:(Nucleotide.equal nuc) ) let dinucleotide_counts seq = let observed = Array.init Nucleotide.card ~f:(fun _ -> let dinucleotide_counts seq = let observed = Array.init Nucleotide.card ~f:(fun _ -> Array.init Nucleotide.card ~f:(fun _ -> 0) ) in Array.iteri seq ~f:(fun pos nuc -> ) in Array.iteri seq ~f:(fun pos nuc -> if (pos + 1) < Array.length seq then let next = seq.(pos+1) in Nucleotide.( observed.(to_int nuc).(to_int next) <- observed.(to_int nuc).(to_int next) <- observed.(to_int nuc).(to_int next) + 1 )); observed ... ... @@ -144,54 +144,97 @@ module Evolution_model = struct let nucleotide_sequence = nucleotides_of_codons codon_sequence in let single_counts = nucleotide_counts nucleotide_sequence in let dinuc_counts = dinucleotide_counts nucleotide_sequence in Array.mapi dinuc_counts ~f:(fun fst_nuc counts -> Array.mapi counts ~f:(fun snd_nuc observed -> let expected = Array.mapi dinuc_counts ~f:(fun fst_nuc counts -> Array.mapi counts ~f:(fun snd_nuc observed -> let expected = float_of_int Nucleotide.Table.( get single_counts (Nucleotide.of_int_exn fst_nuc) * get single_counts (Nucleotide.of_int_exn fst_nuc) * get single_counts (Nucleotide.of_int_exn snd_nuc) ) /. ) /. float_of_int (Array.length nucleotide_sequence) in float_of_int observed /. expected) ) end module Make(BI: Simulator.Branch_info) = Simulator.Make(NSCodon)(Evolution_model)(BI) end module Make(BI: Simulator.Branch_info) = struct include Simulator.Make(NSCodon)(Evolution_model)(BI) let rng = Gsl.Rng.(make (default ())) let symbol_sample v = Gsl.Randist.(discrete rng (discrete_preproc v)) |> NSCodon.of_int_exn let memo f = let table = Caml.Hashtbl.create 253 in fun x -> match Caml.Hashtbl.find table x with | y -> y | exception Caml.Not_found -> let y = f x in Caml.Hashtbl.add table x y ; y let sequence_gillespie_direct tree ~root ~param = let codon_rates = memo Evolution_model.rate_matrix in Tree.propagate tree ~init:root ~node:Fn.const ~leaf:Fn.const ~branch:(fun seq b -> let rec loop state remaining_time = let rate_matrices = Array.mapi state ~f:(fun i _ -> codon_rates (param state i b)) in let rates = Array.mapi rate_matrices ~f:(fun i mat -> NSCodon.Table.init (fun m -> if NSCodon.equal m state.(i) then 0. else (mat :> NSCodon.matrix).NSCodon.%{state.(i), m} ) ) in let pos_rates = Array.map rates ~f:(fun r -> Owl.Stats.sum (r :> float array)) in let total_rate = Array.reduce_exn pos_rates ~f:( +. ) in let tau = Owl.Stats.exponential_rvs ~lambda:total_rate in if Float.(tau > remaining_time) then state else let pos = Owl.Stats.categorical_rvs pos_rates in let next_letter = symbol_sample (rates.(pos) :> float array) in let next_state = let t = Array.copy state in t.(pos) <- next_letter ; t in loop next_state Float.(remaining_time - tau) in loop seq (BI.length b) ) let demo seq_length ~rate_CpG ~branch_length = let demo seq_length ~rate_CpG ~branch_length = let module Branch_info = struct type t = Newick.branch_info let length branch = Option.value_exn Newick.(branch.length) end end in let module M = Make(Branch_info) in let root_sequence = Array.init seq_length ~f:(fun _ -> let root_sequence = Array.init seq_length ~f:(fun _ -> let codon_seed = Owl.Stats.uniform_int_rvs ~a:0 ~b:(NSCodon.card - 1) in match NSCodon.of_int codon_seed with | Some codon -> codon | None -> sprintf "Cannot create codon from integer : %d" codon_seed |> failwith NSCodon.of_int_exn codon_seed ) in let param = Evolution_model.random_param ~alpha_nuc:0.5 ~alpha_fitness:0.5 ~rate_CpG in let tree = let param seq i b = Evolution_model.flat_param ~rate_CpG seq i b in let tree = sprintf "(leaf:%f);" branch_length |> Newick.from_string_exn |> Newick.with_inner_tree ~f:(fun tree -> M.sequence_gillespie_direct tree ~root:root_sequence ~param ) ) in let simulated_sequence = Tree.leaves tree |> List.hd_exn in let simulated_sequence = Tree.leaves tree |> List.hd_exn in let counts = Evolution_model.dinucleotide_ratios simulated_sequence in Array.init Nucleotide.card ~f:(fun i -> Array.init Nucleotide.card ~f:(fun i -> let nuc = Nucleotide.of_int_exn i |> Nucleotide.to_char |> Char.to_string in "\t" ^ nuc ) |> String.concat_array ~sep:"\t" ) |> String.concat_array ~sep:"\t" |> print_endline ; Array.iteri counts ~f:(fun i nuc_ratios -> Array.iteri counts ~f:(fun i nuc_ratios -> let nuc = Nucleotide.of_int_exn i |> Nucleotide.to_char |> Char.to_string in print_string nuc; Array.iteri nuc_ratios ~f:(fun _ ratio -> Array.iteri nuc_ratios ~f:(fun _ ratio -> print_string ("\t" ^ string_of_float ratio) ); Out_channel.newline stdout; ... ...
 ... ... @@ -42,4 +42,4 @@ module Make(BI: Simulator.Branch_info) : sig include module type of Simulator.Make(NSCodon)(Evolution_model)(BI) end val demo : int -> rate_CpG:float -> branch_length:float -> unit \ No newline at end of file val demo : int -> rate_CpG:float -> branch_length:float -> unit
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!