open Core_kernel open Phylogenetics open Phylogenetics.Linear_algebra.Lacaml module Evolution_model = struct type param = { stationary_distribution : Amino_acid.vector ; exchangeability_matrix : Amino_acid.matrix ; scale : float ; } let param_of_wag (wag : Wag.t) scale = { scale ; stationary_distribution = wag.freqs ; exchangeability_matrix = wag.rate_matrix ; } let stationary_distribution p = p.stationary_distribution let rate_matrix p = Rate_matrix.Amino_acid.make (fun i j -> p.scale *. p.exchangeability_matrix.Amino_acid.%{i, j} *. p.stationary_distribution.Amino_acid.%(j) ) let transition_probability_matrix p = let module V = Amino_acid.Vector in let module M = Amino_acid.Matrix in let m = rate_matrix p in let sqrt_pi = V.map p.stationary_distribution ~f:Float.sqrt in let diag_pi = M.diagm sqrt_pi in let diag_pi_inv = V.map sqrt_pi ~f:(fun v -> 1. /. v) |> M.diagm in let m' = M.(dot diag_pi @@ dot m diag_pi_inv) in let (d_vec, step_transform_matrix) = M.diagonalize m' in let transform_matrix = M.dot diag_pi_inv step_transform_matrix in let rev_transform_matrix = M.dot (M.transpose step_transform_matrix) diag_pi in fun t -> let exp_matrix = Amino_acid.Vector.(exp (scal_mul t d_vec)) |> Amino_acid.Matrix.diagm in Amino_acid.Matrix.(dot transform_matrix @@ dot exp_matrix rev_transform_matrix) let test_diagonal_matrix_exponential () = let stationary_distribution = Amino_acid.random_profile 0.5 in let exchangeability_matrix = Rate_matrix.Amino_acid.make (fun aa_i aa_j-> let i, j = Amino_acid.(to_int aa_i, to_int aa_j) in if i <= j then float_of_int i *. 0.01 +. float_of_int j *. 0.001 else float_of_int j *. 0.01 +. float_of_int i *. 0.001 ) in let p = { stationary_distribution; exchangeability_matrix; scale=1.; } in let t = 100. in let diag_exp_matrix = transition_probability_matrix p t in let m = rate_matrix p in let exp_matrix = Amino_acid.Matrix.(expm (scal_mul t m)) in Amino_acid.Matrix.robust_equal ~tol:1e-10 diag_exp_matrix exp_matrix let%test "Matrix exponential through diagonalisation matches naive implementation" = test_diagonal_matrix_exponential () end let choose_aa p = Amino_acid.Table.of_vector p |> Amino_acid.Table.choose module CTMC = Phylo_ctmc.Make(Amino_acid) let tol = 0.001 type likelihood_ratio_test = { full_log_likelihood : float ; reduced_log_likelihood : float ; _D_ : float ; df : float ; pvalue : float ; } let lrt ~full_log_likelihood ~reduced_log_likelihood ~df = let _D_ = 2. *. (full_log_likelihood -. reduced_log_likelihood) in let pvalue = 1. -. Owl.Stats.chi2_cdf ~df _D_ in { full_log_likelihood ; reduced_log_likelihood ; _D_ ; df ; pvalue } module type S = sig type branch_info type leaf_info type site type simulation = (Amino_acid.t, Amino_acid.t, branch_info) Tree.t module Model1 : sig type param = float val maximum_log_likelihood : ?debug:bool -> exchangeability_matrix:Rate_matrix.Amino_acid.t -> stationary_distribution:Amino_acid.vector -> (_, leaf_info, branch_info) Tree.t -> site -> float * param val simulate_site : exchangeability_matrix:Amino_acid.matrix -> stationary_distribution:Amino_acid.vector -> (_, _, branch_info) Tree.t -> param:param -> simulation end module Model2 : sig type param = { scale : float ; stationary_distribution : Amino_acid.vector ; } val maximum_log_likelihood : ?debug:bool -> ?mode:[< `dense | `sparse > `sparse ] -> exchangeability_matrix:Rate_matrix.Amino_acid.t -> (_, leaf_info, branch_info) Tree.t -> site -> float * param val lrt : ?mode:[< `dense | `sparse > `sparse ] -> Wag.t -> (_, leaf_info, branch_info) Tree.t -> site -> Model1.param * param * likelihood_ratio_test end module Model3 : sig type param = { scale : float ; stationary_distribution0 : Amino_acid.vector ; stationary_distribution1 : Amino_acid.vector ; } val maximum_log_likelihood : ?debug:bool -> ?mode:[< `dense | `sparse > `sparse ] -> exchangeability_matrix:Rate_matrix.Amino_acid.t -> (_, leaf_info, branch_info) Tree.t -> site -> float * param val lrt : ?mode:[< `dense | `sparse > `sparse ] -> Wag.t -> (_, leaf_info, branch_info) Tree.t -> site -> Model2.param * param * likelihood_ratio_test val simulate_site : exchangeability_matrix:Rate_matrix.Amino_acid.t -> scale:float -> stationary_distribution0:Amino_acid.vector -> stationary_distribution1:Amino_acid.vector -> (_, leaf_info, branch_info) Tree.t -> simulation end end module type Leaf_info = sig type t type species val species : t -> species val condition : t -> [`Ancestral | `Convergent] end module type Branch_info = sig type t val length : t -> float val condition : t -> [`Ancestral | `Convergent] end module type Site = sig type t type species val get_aa : t -> species -> Amino_acid.t option end module Make(Branch_info : Branch_info)(Leaf_info : Leaf_info)(Site : Site with type species = Leaf_info.species) = struct module Simulator = Simulator.Make(Amino_acid)(Evolution_model)(Branch_info) type simulation = (Amino_acid.t, Amino_acid.t, Branch_info.t) Tree.t let aa_of_leaf_info site li = Site.get_aa site (Leaf_info.species li) module Model1 = struct type param = float let log_likelihood ~exchangeability_matrix ~stationary_distribution ~scale tree site = let pi = (stationary_distribution : Amino_acid.vector :> vec) in let p = { Evolution_model.scale = 10. ** scale ; exchangeability_matrix ; stationary_distribution } in let transition_matrix = let f = Evolution_model.transition_probability_matrix p in fun b -> (f (Branch_info.length b) :> mat) in let leaf_state = aa_of_leaf_info site in CTMC.pruning_with_missing_values tree ~transition_matrix ~leaf_state ~root_frequencies:pi let clip f param = if Float.(param.(0) > 3.) then Float.infinity else f param let decode_vec p = p.(0) let inner_maximum_log_likelihood ?debug ~exchangeability_matrix ~stationary_distribution tree site = let f vec = let scale = decode_vec vec in -. log_likelihood ~exchangeability_matrix ~stationary_distribution ~scale tree site in let sample () = [| Owl.Stats.uniform_rvs ~a:(-4.) ~b:1. |] in let ll, p_star = Nelder_mead.minimize ?debug ~tol ~maxit:10_000 ~f:(clip f) ~sample () in -. ll, p_star let maximum_log_likelihood ?debug ~exchangeability_matrix ~stationary_distribution tree site = let ll, vec = inner_maximum_log_likelihood ?debug ~exchangeability_matrix ~stationary_distribution tree site in ll, decode_vec vec let simulate_site ~exchangeability_matrix ~stationary_distribution tree ~param:scale = let root = choose_aa stationary_distribution in let p = { Evolution_model.stationary_distribution ; scale ; exchangeability_matrix ; } in Simulator.site_gillespie_first_reaction tree ~root ~param:(Fn.const p) end module Model2 = struct type param = { scale : float ; stationary_distribution : Amino_acid.vector ; } let log_likelihood ~exchangeability_matrix ~param:{ stationary_distribution ; scale } tree site = let p = { Evolution_model.scale ; exchangeability_matrix ; stationary_distribution } in let transition_matrix = let f = Evolution_model.transition_probability_matrix p in fun b -> (f (Branch_info.length b) :> mat) in let leaf_state = aa_of_leaf_info site in CTMC.pruning_with_missing_values tree ~transition_matrix ~leaf_state ~root_frequencies:(stationary_distribution :> Vector.t) let counts xs = Amino_acid.Table.init (fun aa -> List.count xs ~f:(Amino_acid.equal aa)) type vec_schema = { nz : int ; (* number of non-zero AA in count table *) idx : int array ; (* indices of non-zero AA *) } let sparse_param_schema counts = let k = (counts : int Amino_acid.table :> _ array) in let idx, nz = Array.foldi k ~init:([], 0) ~f:(fun i ((assoc, nz) as acc) k_i -> if k_i = 0 then acc else i :: assoc, nz + 1 ) in let idx = Array.of_list idx in { nz ; idx } let dense_param_schema counts = let nz = Array.length (counts : int Amino_acid.table :> _ array) in let idx = Array.init nz ~f:Fn.id in { nz ; idx } let profile_guess schema counts = let counts = (counts : int Amino_acid.table :> _ array) in let total_counts = Array.fold counts ~init:0. ~f:(fun acc x -> 1. +. acc +. float x) in Array.map schema.idx ~f:(fun idx -> Float.log (float (1 + counts.(idx)) /. total_counts)) let initial_param schema counts = Array.append [| 0. |] (profile_guess schema counts) let extract_frequencies ~offset schema param = let r = Array.create ~len:Amino_acid.card 0. in Array.iteri schema.idx ~f:(fun sparse_idx full_idx -> r.(full_idx) <- Float.exp param.(sparse_idx + offset) ) ; let s = Owl.Stats.sum r in Amino_acid.Vector.init (fun aa -> r.((aa :> int)) /. s) let param_schema ?(mode = `sparse) counts = match mode with | `sparse -> sparse_param_schema counts | `dense -> dense_param_schema counts let nelder_mead_init theta0 = let c = ref (-1) in fun _ -> incr c ; if !c = 0 then theta0 else Array.init (Array.length theta0) ~f:(fun i -> theta0.(i) +. if i = !c - 1 then -. 1. else 0. ) let decode_vec schema param = let stationary_distribution = extract_frequencies ~offset:1 schema param in let scale = 10. ** param.(0) in { scale ; stationary_distribution } let inner_maximum_log_likelihood ?debug ?mode ~exchangeability_matrix tree site = let counts = Tree.leaves tree |> List.filter_map ~f:(aa_of_leaf_info site) |> counts in let schema = param_schema ?mode counts in let theta0 = initial_param schema counts in let sample = nelder_mead_init theta0 in let f p = let param = decode_vec schema p in -. log_likelihood ~exchangeability_matrix ~param tree site in let ll, p_star = Nelder_mead.minimize ~tol ?debug ~maxit:10_000 ~f:(Model1.clip f) ~sample () in -. ll, schema, p_star let maximum_log_likelihood ?debug ?mode ~exchangeability_matrix tree site = let ll, schema, vec = inner_maximum_log_likelihood ?debug ?mode ~exchangeability_matrix tree site in ll, decode_vec schema vec let lrt ?mode (wag : Wag.t) tree site = let exchangeability_matrix = wag.rate_matrix in let stationary_distribution = wag.freqs in let reduced_log_likelihood, p1 = Model1.maximum_log_likelihood ~exchangeability_matrix ~stationary_distribution tree site in let full_log_likelihood, schema, p2 = inner_maximum_log_likelihood ?mode ~exchangeability_matrix tree site in let df = float (Array.length p2 - 1 - 1) in let lrt = lrt ~full_log_likelihood ~reduced_log_likelihood ~df in p1, decode_vec schema p2, lrt end module Model3 = struct type param = { scale : float ; stationary_distribution0 : Amino_acid.vector ; stationary_distribution1 : Amino_acid.vector ; } let evolution_model_param exchangeability_matrix param cond = let f stationary_distribution = { Evolution_model.scale = param.scale ; exchangeability_matrix ; stationary_distribution } in match cond with | `Ancestral -> f param.stationary_distribution0 | `Convergent -> f param.stationary_distribution1 | _ -> assert false let log_likelihood ~exchangeability_matrix ~param tree site = let f cond = evolution_model_param exchangeability_matrix param cond |> Evolution_model.transition_probability_matrix in let transition_matrix = let f0 = f `Ancestral in (* pre-computation *) let f1 = f `Convergent in fun b -> let bl = Branch_info.length b in match Branch_info.condition b with | `Ancestral -> (f0 bl :> mat) | `Convergent -> (f1 bl :> mat) in let root_frequencies = (param.stationary_distribution0 :> Vector.t) in let leaf_state = aa_of_leaf_info site in CTMC.pruning_with_missing_values tree ~transition_matrix ~leaf_state ~root_frequencies let tuple_map (x, y) ~f = (f x, f y) let counts tree site = Tree.leaves tree |> List.partition_tf ~f:(fun l -> match Leaf_info.condition l with | `Ancestral -> true | `Convergent -> false ) |> tuple_map ~f:(List.filter_map ~f:(aa_of_leaf_info site)) |> tuple_map ~f:Model2.counts let initial_param schema tree site = let k0, k1 = counts tree site in Array.concat [ [| 0. |] ; Model2.profile_guess schema k0 ; Model2.profile_guess schema k1 ; ] let extract_frequencies schema param = Model2.extract_frequencies ~offset:1 schema param, Model2.extract_frequencies ~offset:(1 + schema.nz) schema param let decode_vec schema vec = let stationary_distribution0, stationary_distribution1 = extract_frequencies schema vec in let scale = 10. ** vec.(0) in { scale ; stationary_distribution0 ; stationary_distribution1 } let inner_maximum_log_likelihood ?debug ?mode ?model2_opt ~exchangeability_matrix tree site = let schema = Tree.leaves tree |> List.filter_map ~f:(aa_of_leaf_info site) |> Model2.counts |> Model2.param_schema ?mode in let theta0 = match model2_opt with | None -> initial_param schema tree site | Some param -> Array.(append param (sub param ~pos:1 ~len:(length param - 1))) in let f vec = let param = decode_vec schema vec in -. log_likelihood ~exchangeability_matrix ~param tree site in let sample = Model2.nelder_mead_init theta0 in let ll, p_star = Nelder_mead.minimize ~tol ?debug ~maxit:10_000 ~f:(Model1.clip f) ~sample () in -. ll, schema, p_star let maximum_log_likelihood ?debug ?mode ~exchangeability_matrix tree site = let ll, schema, vec = inner_maximum_log_likelihood ?debug ?mode ~exchangeability_matrix tree site in ll, decode_vec schema vec let lrt ?mode (wag : Wag.t) tree site = let exchangeability_matrix = wag.rate_matrix in let reduced_log_likelihood, schema2, p2 = Model2.inner_maximum_log_likelihood ?mode ~exchangeability_matrix tree site in let full_log_likelihood, schema3, p3 = inner_maximum_log_likelihood ?mode ~model2_opt:p2 ~exchangeability_matrix tree site in let df = float (Array.length p3 - Array.length p2 - 1) in let lrt = lrt ~full_log_likelihood ~reduced_log_likelihood ~df in Model2.decode_vec schema2 p2, decode_vec schema3 p3, lrt let simulate_site ~exchangeability_matrix ~scale ~stationary_distribution0 ~stationary_distribution1 tree = let param b = evolution_model_param exchangeability_matrix { scale ; stationary_distribution0 ; stationary_distribution1 } (Branch_info.condition b) in let root = choose_aa stationary_distribution0 in Simulator.site_gillespie_first_reaction tree ~root ~param end end module Pack = struct type leaf_info = int * Convergence_tree.condition module Leaf_info = struct type species = int type t = leaf_info let condition = snd let species = fst end module Site = struct type t = Amino_acid.t array type species = int let get_aa x i = Some (Array.get x i) let of_simulation s = Tree.leaves s |> Array.of_list end include Make(Convergence_tree.Branch_info)(Leaf_info)(Site) let simulate_profile alpha = Owl.Stats.dirichlet_rvs ~alpha:(Array.create ~len:Amino_acid.card alpha) |> Amino_acid.Vector.of_array_exn let pair_tree = let leaf_info i cond = (i, cond) in let node_info = () in Convergence_tree.pair_tree ~leaf_info ~node_info let convergence_tree t = let leaves = Convergence_tree.leaves t |> List.mapi ~f:(fun i (label, cond) -> label, (i, cond)) in Tree.map t ~node:Fn.id ~branch:Fn.id ~leaf:( List.Assoc.find_exn ~equal:String.equal leaves ) end module Implementation_check = struct open Pack let likelihood_plot_demo (wag : Wag.t) = let tree = pair_tree ~branch_length1:1. ~branch_length2:1. ~npairs:100 in let root = choose_aa wag.freqs in let true_scale = 1. in let p = Evolution_model.param_of_wag wag true_scale in let site = Simulator.site_gillespie_first_reaction tree ~root ~param:(Fn.const p) |> Site.of_simulation in let ll, scale_hat = Model1.maximum_log_likelihood ~exchangeability_matrix:wag.rate_matrix ~stationary_distribution:wag.freqs tree site in let f scale = Model1.log_likelihood ~exchangeability_matrix:wag.rate_matrix ~stationary_distribution:wag.freqs ~scale tree site in let x = Array.init 100 ~f:(fun i -> let i = float i in let a = -1. and b = 2. in a +. (b -. a) *. i /. 100. ) in let y = Array.map x ~f in printf "LL = %g, scale_hat = %g" ll scale_hat ; OCamlR_graphics.plot ~x ~y () let lrt_1_vs_2_null_simulation ?(seed = 31415926535897931) ?mode ?(nb_simulations = 1_000) (wag : Wag.t) = Owl_stats_prng.init seed ; let tree = pair_tree ~branch_length1:1. ~branch_length2:1. ~npairs:30 in let true_scale = 1. in let f _ = let simulation = Model1.simulate_site ~exchangeability_matrix:wag.rate_matrix ~stationary_distribution:wag.freqs ~param:true_scale tree in let site = Site.of_simulation simulation in let p1, p2, lrt = Model2.lrt ?mode wag tree site in simulation, p1, p2, lrt in Array.init nb_simulations ~f let lrt_2_vs_3_null_simulation ?(seed = 31415926535897931) ?mode ?(alpha = 0.1) ?(nb_simulations = 1_000) (wag : Wag.t) = Owl_stats_prng.init seed ; let tree = pair_tree ~branch_length1:1. ~branch_length2:1. ~npairs:30 in let true_scale = 1. in let f _ = let stationary_distribution = simulate_profile alpha in let simulation = Model1.simulate_site ~exchangeability_matrix:wag.rate_matrix ~stationary_distribution ~param:true_scale tree in let site = Site.of_simulation simulation in let p2, p3, lrt = Model3.lrt ?mode wag tree site in simulation, p2, p3, lrt in Array.init nb_simulations ~f let render_pvalue_histogram ~title results dest = OCamlR_grDevices.pdf dest ; ignore ( let values = Array.map results ~f:(fun (_,_,_,lrt) -> lrt.pvalue) in OCamlR_graphics.hist ~main:title ~xlab:"p" ~breaks:(`n 20) values :> OCamlR_graphics.hist) ; OCamlR_grDevices.dev_off () let render_stat_histogram ~title ~df results dest = OCamlR_grDevices.pdf dest ; ignore ( let values = Array.map results ~f:(fun (_,_,_,lrt) -> lrt._D_) in OCamlR_graphics.hist ~main:title ~xlab:"D" ~freq:false ~breaks:(`n 20) values :> OCamlR_graphics.hist) ; let x = Array.init 1_000 ~f:(fun i -> float i /. 10.) in let y = Array.map x ~f:(Gsl.Randist.chisq_pdf ~nu:df) in OCamlR_graphics.lines ~x ~y () ; OCamlR_grDevices.dev_off () end