mutsel_simulator_cpg.ml 9.54 KB
Newer Older
1 2 3
open Core_kernel
open Phylogenetics

4
module NSCodon = Codon.Universal_genetic_code.NS
5
module NSCodon_rate_matrix = Rate_matrix.Make(NSCodon)
6

7
module Evolution_model = struct
8 9 10 11
  type context_CpG = {
    lh: Nucleotide.repr option;
    rh: Nucleotide.repr option;
  }
12

13 14 15 16 17 18 19 20
  type param = {
    nucleotide_rates : Rate_matrix.Nucleotide.t ;
    nucleotide_stat_dist : Nucleotide.vector ;
    omega : float ; (* dN/dS *)
    scaled_fitness : Amino_acid.vector ;
    gBGC : float ;
    rate_CpG: float;
    context_CpG : context_CpG;
21 22
  }

Philippe Veber's avatar
Philippe Veber committed
23 24
  let inspect_codon codon =
    let x, y, z = NSCodon.nucleotides codon in
25
    Nucleotide.(inspect x, inspect y, inspect z)
26

27
  let context_CpG_of_codons ?left_codon ?right_codon () =
28 29
    let open Option.Monad_infix in
    {
30 31
      lh = left_codon  >>| inspect_codon >>| trd3 ;
      rh = right_codon >>| inspect_codon >>| fst3 ;
32
    }
33

34 35 36 37 38 39 40
  let fitness_of_profile ?(beta = 1.) =
    Amino_acid.Vector.map ~f:(fun x -> beta *. Float.log x)

  let flat_fitness () =
    Amino_acid.Vector.init (fun _ -> 1. /. float Amino_acid.card)
    |> fitness_of_profile

41
  let make_param ~gBGC ~scaled_fitness ~omega ~nucleotide_rates ~nucleotide_stat_dist ~rate_CpG state pos _branch =
42 43 44 45 46 47 48
    let n = Array.length state in
    let left_codon =
      if 0 <= pos - 1 && pos - 1 < n
      then Some state.(pos - 1)
      else None
    and right_codon =
      if pos + 1 < n then Some state.(pos + 1) else None
49
    in
50
    let context_CpG = context_CpG_of_codons ?left_codon ?right_codon () in
51 52
    {
      nucleotide_rates;
53 54 55 56 57
      nucleotide_stat_dist;
      omega;
      scaled_fitness;
      gBGC;
      rate_CpG;
58 59 60
      context_CpG;
    }

61
  let flat_param ~rate_CpG =
62 63
    let pi = Nucleotide.flat_profile () in
    let rho = Linear_algebra.Lacaml.Vector.init 6 ~f:(fun _ -> 1. /. 6.) in
64 65
    let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in
    make_param ~gBGC:0. ~scaled_fitness:(flat_fitness ()) ~omega:1. ~nucleotide_rates ~nucleotide_stat_dist:pi ~rate_CpG
66

67 68 69 70
  let random_param ~alpha_nuc ~alpha_fitness ~rate_CpG =
    let pi = Nucleotide.random_profile alpha_nuc in
    let rho = Utils.random_profile 6 in
    let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in
71
    let scaled_fitness = Amino_acid.random_profile alpha_fitness |> fitness_of_profile in
Louis Duchemin's avatar
Louis Duchemin committed
72
    make_param ~gBGC:0. ~scaled_fitness ~omega:1. ~nucleotide_rates ~nucleotide_stat_dist:pi ~rate_CpG
73

74 75 76 77 78 79 80 81 82
  let rate_increase_CpG (pos, x_a, x_b) context codon =
    let open Nucleotide in
    match (pos, inspect x_a, inspect x_b), context, codon with
    | (0, C, T), _, (_, G, _)
    | (1, C, T), _, (_, _, G)
    | (2, C, T), { rh = Some G ; _ }, _
    | (0, G, A), { lh = Some C ; _ }, _
    | (1, G, A), _, (C, _, _)
    | (2, G, A), _, (_,C,_) -> true
83 84 85 86 87 88 89 90 91 92 93 94 95 96
    | _ -> false

  let fixation_probability delta =
    let open Float in
    if abs delta < 1e-30 then 1. + delta / 2.
    else if delta > 50. then delta
    else if delta < - 50. then 0.
    else delta / (1. - exp (- delta))

  let rate_matrix { nucleotide_rates ; omega ; scaled_fitness = _F_ ; gBGC ; rate_CpG ; context_CpG; _ } =
    let nuc_rates = (nucleotide_rates :> Nucleotide.matrix) in
    Mutsel.NSCodon_rate_matrix.make (fun p q ->
        match NSCodon.neighbours p q with
        | Some (pos, x_a, x_b) ->
97
          let rate_CpG = if rate_increase_CpG (pos, x_a, x_b) context_CpG (inspect_codon p) then rate_CpG else 1. in
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
          let _B_ = match Nucleotide.(inspect x_a, inspect x_b) with
            | (A | T), (C | G) -> gBGC
            | (C | G), (A | T) -> -. gBGC
            | _ -> 0.
          in
          let selection_coefficient =
            _B_ +.
            if NSCodon.synonym p q then 0.
            else
              let aa_p = NSCodon.aa_of_codon p in
              let aa_q = NSCodon.aa_of_codon q in
              _F_.Amino_acid.%(aa_q) -. _F_.Amino_acid.%(aa_p)
          in
          let p_fix = fixation_probability selection_coefficient in
          let q_ab = nuc_rates.Nucleotide.%{x_a, x_b} in
          q_ab *. omega *. p_fix *. rate_CpG
        | None -> 0.
      )

117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
  let rate_vector { nucleotide_rates ; omega ; scaled_fitness = _F_ ; gBGC ; rate_CpG ; context_CpG; _ } p =
    let nuc_rates = (nucleotide_rates :> Nucleotide.matrix) in
    NSCodon.Table.init (fun q ->
        if NSCodon.equal p q then 0.
        else
          match NSCodon.neighbours p q with
          | Some (pos, x_a, x_b) ->
            let rate_CpG = if rate_increase_CpG (pos, x_a, x_b) context_CpG (inspect_codon p) then rate_CpG else 1. in
            let _B_ = match Nucleotide.(inspect x_a, inspect x_b) with
              | (A | T), (C | G) -> gBGC
              | (C | G), (A | T) -> -. gBGC
              | _ -> 0.
            in
            let selection_coefficient =
              _B_ +.
              if NSCodon.synonym p q then 0.
              else
                let aa_p = NSCodon.aa_of_codon p in
                let aa_q = NSCodon.aa_of_codon q in
                _F_.Amino_acid.%(aa_q) -. _F_.Amino_acid.%(aa_p)
            in
            let p_fix = fixation_probability selection_coefficient in
            let q_ab = nuc_rates.Nucleotide.%{x_a, x_b} in
            q_ab *. omega *. p_fix *. rate_CpG
          | None -> 0.
      )

144
  let stationary_distribution p = NSCodon_rate_matrix.stationary_distribution (rate_matrix p)
145

146 147
  let nucleotides_of_codons codons =
    Array.concat_map codons ~f:(fun codon ->
148
        let x, y, z = NSCodon.nucleotides codon in
149
        [|x;y;z|]
150 151 152 153 154 155 156
      )

  let nucleotide_counts seq =
    Nucleotide.Table.init (fun nuc ->
        Array.count seq ~f:(Nucleotide.equal nuc)
      )

157 158
  let dinucleotide_counts seq =
    let observed = Array.init Nucleotide.card ~f:(fun _ ->
159
        Array.init Nucleotide.card ~f:(fun _ -> 0)
160 161
      ) in
    Array.iteri seq ~f:(fun pos nuc ->
162 163 164
        if (pos + 1) < Array.length seq then
          let next = seq.(pos+1) in
          Nucleotide.(
165
            observed.(to_int nuc).(to_int next) <-
166 167 168 169
              observed.(to_int nuc).(to_int next) + 1
          ));
    observed

170 171 172
  let dinucleotide_ratios codon_sequence =
    let nucleotide_sequence = nucleotides_of_codons codon_sequence in
    let single_counts = nucleotide_counts nucleotide_sequence in
173
    let dinuc_counts = dinucleotide_counts nucleotide_sequence in
174 175 176
    Array.mapi dinuc_counts ~f:(fun fst_nuc counts ->
        Array.mapi counts ~f:(fun snd_nuc observed ->
            let expected =
177
              float_of_int Nucleotide.Table.(
178
                  get single_counts (Nucleotide.of_int_exn fst_nuc) *
179
                  get single_counts (Nucleotide.of_int_exn snd_nuc)
180
                ) /.
181
              float_of_int (Array.length nucleotide_sequence)
182
            in
183
            float_of_int observed /. expected)
184
      )
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
end

module Make(BI: Simulator.Branch_info) = struct
  include Simulator.Make(NSCodon)(Evolution_model)(BI)

  let rng = Gsl.Rng.(make (default ()))

  let symbol_sample v =
    Gsl.Randist.(discrete rng (discrete_preproc v))
    |> NSCodon.of_int_exn

  let memo f =
    let table = Caml.Hashtbl.create 253 in
    fun x ->
      match Caml.Hashtbl.find table x with
      | y -> y
      | exception Caml.Not_found ->
        let y = f x in
        Caml.Hashtbl.add table x y ;
        y

  let sequence_gillespie_direct tree ~root ~param =
207
    let codon_rates = memo (fun (p, s) -> Evolution_model.rate_vector p s) in
208 209
    Tree.propagate tree ~init:root ~node:Fn.const ~leaf:Fn.const ~branch:(fun seq b ->
        let rec loop state remaining_time =
210
          let n = Array.length state in
211
          let rates =
212 213
            Array.init n ~f:(fun i ->
                codon_rates (param state i b, state.(i))
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
              ) in
          let pos_rates = Array.map rates ~f:(fun r -> Owl.Stats.sum (r :> float array)) in
          let total_rate = Array.reduce_exn pos_rates ~f:( +. ) in
          let tau = Owl.Stats.exponential_rvs ~lambda:total_rate in
          if Float.(tau > remaining_time) then state
          else
            let pos = Owl.Stats.categorical_rvs pos_rates in
            let next_letter = symbol_sample (rates.(pos) :> float array) in
            let next_state =
              let t = Array.copy state in
              t.(pos) <- next_letter ;
              t
            in
            loop next_state Float.(remaining_time - tau)
        in
        loop seq (BI.length b)
      )
Louis Duchemin's avatar
Louis Duchemin committed
231

232
let demo seq_length ~rate_CpG ~branch_length =
Louis Duchemin's avatar
Louis Duchemin committed
233 234 235
  let module Branch_info = struct
    type t = Newick.branch_info
    let length branch = Option.value_exn Newick.(branch.length)
236
  end
Louis Duchemin's avatar
Louis Duchemin committed
237 238
  in
  let module M = Make(Branch_info) in
239
  let root_sequence = Array.init seq_length ~f:(fun _ ->
Louis Duchemin's avatar
Louis Duchemin committed
240
      let codon_seed = Owl.Stats.uniform_int_rvs ~a:0 ~b:(NSCodon.card - 1) in
241
      NSCodon.of_int_exn codon_seed
Louis Duchemin's avatar
Louis Duchemin committed
242
    ) in
243 244
  let param seq i b = Evolution_model.flat_param ~rate_CpG seq i b in
  let tree =
Louis Duchemin's avatar
Louis Duchemin committed
245
    sprintf "(leaf:%f);" branch_length
Philippe Veber's avatar
Philippe Veber committed
246 247
    |> Newick.from_string_exn
    |> Newick.with_inner_tree ~f:(fun tree ->
Louis Duchemin's avatar
Louis Duchemin committed
248
        M.sequence_gillespie_direct tree ~root:root_sequence ~param
249
      )
Louis Duchemin's avatar
Louis Duchemin committed
250
  in
251
  let simulated_sequence = Tree.leaves tree |> List.hd_exn in
Louis Duchemin's avatar
Louis Duchemin committed
252
  let counts = Evolution_model.dinucleotide_ratios simulated_sequence in
253
  Array.init Nucleotide.card ~f:(fun i ->
Louis Duchemin's avatar
Louis Duchemin committed
254 255
      let nuc = Nucleotide.of_int_exn i |> Nucleotide.to_char |> Char.to_string in
      "\t" ^ nuc
256 257
    )
  |> String.concat_array ~sep:"\t"
Louis Duchemin's avatar
Louis Duchemin committed
258
  |> print_endline ;
259
  Array.iteri counts ~f:(fun i nuc_ratios ->
Louis Duchemin's avatar
Louis Duchemin committed
260 261
      let nuc = Nucleotide.of_int_exn i |> Nucleotide.to_char |> Char.to_string in
      print_string nuc;
262
      Array.iteri nuc_ratios ~f:(fun _ ratio ->
Louis Duchemin's avatar
Louis Duchemin committed
263 264 265 266
          print_string ("\t" ^ string_of_float ratio)
        );
      Out_channel.newline stdout;
    )