mutsel_simulator_cpg.ml 7.09 KB
Newer Older
1 2 3
open Core_kernel
open Phylogenetics

4
module NSCodon = Codon.Universal_genetic_code.NS
5
module NSCodon_rate_matrix = Rate_matrix.Make(NSCodon)
6

7
module Evolution_model = struct
8 9 10 11
  type context_CpG = {
    lh: Nucleotide.repr option;
    rh: Nucleotide.repr option;
  }
12

13 14 15 16 17 18 19 20
  type param = {
    nucleotide_rates : Rate_matrix.Nucleotide.t ;
    nucleotide_stat_dist : Nucleotide.vector ;
    omega : float ; (* dN/dS *)
    scaled_fitness : Amino_acid.vector ;
    gBGC : float ;
    rate_CpG: float;
    context_CpG : context_CpG;
21 22
  }

Philippe Veber's avatar
Philippe Veber committed
23 24
  let inspect_codon codon =
    let x, y, z = NSCodon.nucleotides codon in
25
    Nucleotide.(inspect x, inspect y, inspect z)
26

27
  let context_CpG_of_codons ?left_codon ?right_codon () =
28 29
    let open Option.Monad_infix in
    {
30 31
      lh = left_codon  >>| inspect_codon >>| trd3 ;
      rh = right_codon >>| inspect_codon >>| fst3 ;
32
    }
33

34 35 36 37 38 39 40
  let fitness_of_profile ?(beta = 1.) =
    Amino_acid.Vector.map ~f:(fun x -> beta *. Float.log x)

  let flat_fitness () =
    Amino_acid.Vector.init (fun _ -> 1. /. float Amino_acid.card)
    |> fitness_of_profile

41
  let make_param ~gBGC ~scaled_fitness ~omega ~nucleotide_rates ~nucleotide_stat_dist ~rate_CpG state pos _branch =
42 43 44 45 46 47 48
    let n = Array.length state in
    let left_codon =
      if 0 <= pos - 1 && pos - 1 < n
      then Some state.(pos - 1)
      else None
    and right_codon =
      if pos + 1 < n then Some state.(pos + 1) else None
49
    in
50
    let context_CpG = context_CpG_of_codons ?left_codon ?right_codon () in
51 52
    {
      nucleotide_rates;
53 54 55 56 57
      nucleotide_stat_dist;
      omega;
      scaled_fitness;
      gBGC;
      rate_CpG;
58 59 60
      context_CpG;
    }

Louis Duchemin's avatar
Louis Duchemin committed
61
  let flat_param ~rate_CpG = 
62 63 64
    let pi = Nucleotide.flat_profile () in
    let rho = Linear_algebra.Lacaml.Vector.init 6 ~f:(fun _ -> 1. /. 6.) in
    let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in 
Louis Duchemin's avatar
Louis Duchemin committed
65
    make_param ~gBGC:0. ~scaled_fitness:(flat_fitness ()) ~omega:1. ~nucleotide_rates ~nucleotide_stat_dist:pi ~rate_CpG 
66

Louis Duchemin's avatar
Louis Duchemin committed
67
  let random_param ~alpha_nuc ~alpha_fitness ~rate_CpG = 
68 69 70 71
    let pi = Nucleotide.random_profile alpha_nuc in 
    let rho = Utils.random_profile 6 in 
    let nucleotide_rates = Rate_matrix.Nucleotide.gtr ~equilibrium_frequencies:pi ~transition_rates:rho in 
    let scaled_fitness = Amino_acid.random_profile alpha_fitness |> fitness_of_profile in
Louis Duchemin's avatar
Louis Duchemin committed
72
    make_param ~gBGC:0. ~scaled_fitness ~omega:1. ~nucleotide_rates ~nucleotide_stat_dist:pi ~rate_CpG
73

74 75 76 77 78 79 80 81 82
  let rate_increase_CpG (pos, x_a, x_b) context codon =
    let open Nucleotide in
    match (pos, inspect x_a, inspect x_b), context, codon with
    | (0, C, T), _, (_, G, _)
    | (1, C, T), _, (_, _, G)
    | (2, C, T), { rh = Some G ; _ }, _
    | (0, G, A), { lh = Some C ; _ }, _
    | (1, G, A), _, (C, _, _)
    | (2, G, A), _, (_,C,_) -> true
83 84 85 86 87 88 89 90 91 92 93 94 95 96
    | _ -> false

  let fixation_probability delta =
    let open Float in
    if abs delta < 1e-30 then 1. + delta / 2.
    else if delta > 50. then delta
    else if delta < - 50. then 0.
    else delta / (1. - exp (- delta))

  let rate_matrix { nucleotide_rates ; omega ; scaled_fitness = _F_ ; gBGC ; rate_CpG ; context_CpG; _ } =
    let nuc_rates = (nucleotide_rates :> Nucleotide.matrix) in
    Mutsel.NSCodon_rate_matrix.make (fun p q ->
        match NSCodon.neighbours p q with
        | Some (pos, x_a, x_b) ->
97
          let rate_CpG = if rate_increase_CpG (pos, x_a, x_b) context_CpG (inspect_codon p) then rate_CpG else 1. in
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
          let _B_ = match Nucleotide.(inspect x_a, inspect x_b) with
            | (A | T), (C | G) -> gBGC
            | (C | G), (A | T) -> -. gBGC
            | _ -> 0.
          in
          let selection_coefficient =
            _B_ +.
            if NSCodon.synonym p q then 0.
            else
              let aa_p = NSCodon.aa_of_codon p in
              let aa_q = NSCodon.aa_of_codon q in
              _F_.Amino_acid.%(aa_q) -. _F_.Amino_acid.%(aa_p)
          in
          let p_fix = fixation_probability selection_coefficient in
          let q_ab = nuc_rates.Nucleotide.%{x_a, x_b} in
          q_ab *. omega *. p_fix *. rate_CpG
        | None -> 0.
      )

117
  let stationary_distribution p = NSCodon_rate_matrix.stationary_distribution (rate_matrix p)
118

119 120
  let nucleotides_of_codons codons =
    Array.concat_map codons ~f:(fun codon ->
121 122
        let x, y, z = NSCodon.nucleotides codon in 
        [|x;y;z|]
123 124 125 126 127 128 129
      )

  let nucleotide_counts seq =
    Nucleotide.Table.init (fun nuc ->
        Array.count seq ~f:(Nucleotide.equal nuc)
      )

130 131 132 133 134 135 136 137 138 139 140 141 142
  let dinucleotide_counts seq = 
    let observed = Array.init Nucleotide.card ~f:(fun _ -> 
        Array.init Nucleotide.card ~f:(fun _ -> 0)
      ) in 
    Array.iteri seq ~f:(fun pos nuc -> 
        if (pos + 1) < Array.length seq then
          let next = seq.(pos+1) in
          Nucleotide.(
            observed.(to_int nuc).(to_int next) <- 
              observed.(to_int nuc).(to_int next) + 1
          ));
    observed

143 144 145
  let dinucleotide_ratios codon_sequence =
    let nucleotide_sequence = nucleotides_of_codons codon_sequence in
    let single_counts = nucleotide_counts nucleotide_sequence in
146 147 148 149 150 151 152 153 154
    let dinuc_counts = dinucleotide_counts nucleotide_sequence in
    Array.mapi dinuc_counts ~f:(fun fst_nuc counts -> 
        Array.mapi counts ~f:(fun snd_nuc observed -> 
            let expected = 
              float_of_int Nucleotide.Table.(
                  get single_counts (Nucleotide.of_int_exn fst_nuc) * 
                  get single_counts (Nucleotide.of_int_exn snd_nuc)
                ) /. 
              float_of_int (Array.length nucleotide_sequence)
155
            in
156
            float_of_int observed /. expected)
157
      )
158
end 
159

160
module Make(BI: Simulator.Branch_info) = Simulator.Make(NSCodon)(Evolution_model)(BI)
Louis Duchemin's avatar
Louis Duchemin committed
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198

let demo seq_length ~rate_CpG ~branch_length = 
  let module Branch_info = struct
    type t = Newick.branch_info
    let length branch = Option.value_exn Newick.(branch.length)
  end  
  in
  let module M = Make(Branch_info) in
  let root_sequence = Array.init seq_length ~f:(fun _ -> 
      let codon_seed = Owl.Stats.uniform_int_rvs ~a:0 ~b:(NSCodon.card - 1) in
      match NSCodon.of_int codon_seed with 
      | Some codon -> codon
      | None -> sprintf "Cannot create codon from integer : %d" codon_seed |> failwith
    ) in
  let param = Evolution_model.random_param ~alpha_nuc:0.5 ~alpha_fitness:0.5 ~rate_CpG in
  let tree = 
    sprintf "(leaf:%f);" branch_length
    |>Newick.from_string 
    |>Newick.with_inner_tree ~f:(fun tree ->
        M.sequence_gillespie_direct tree ~root:root_sequence ~param
      ) 
  in
  let simulated_sequence = Tree.leaves tree |> List.hd_exn in 
  let counts = Evolution_model.dinucleotide_ratios simulated_sequence in
  Array.init Nucleotide.card ~f:(fun i -> 
      let nuc = Nucleotide.of_int_exn i |> Nucleotide.to_char |> Char.to_string in
      "\t" ^ nuc
    ) 
  |> String.concat_array ~sep:"\t" 
  |> print_endline ;
  Array.iteri counts ~f:(fun i nuc_ratios -> 
      let nuc = Nucleotide.of_int_exn i |> Nucleotide.to_char |> Char.to_string in
      print_string nuc;
      Array.iteri nuc_ratios ~f:(fun _ ratio -> 
          print_string ("\t" ^ string_of_float ratio)
        );
      Out_channel.newline stdout;
    )