convergence_tree.ml 6.85 KB
Newer Older
1 2 3
open Core_kernel
open Phylogenetics

4 5 6 7 8 9 10 11
type condition = [`Ancestral | `Convergent]

type branch_info = {
  condition : condition ;
  length : float ;
}

type t = (unit, string, branch_info) Tree.t
12 13 14

module Tags = struct
  let condition_label = "Condition"
15

16 17
  let transition_label = "Transition"

18
  let condition tags =
19 20
    List.Assoc.find tags condition_label ~equal:String.equal

21 22 23 24
  let string_of_condition = function
    | `Ancestral -> "0"
    | `Convergent -> "1"

25 26 27 28
  let set_condition tags c =
    List.Assoc.(
      add
        (remove tags condition_label ~equal:String.equal)
29
        condition_label c ~equal:String.equal)
30 31 32 33 34 35 36 37 38

  (* let other_tags tags =
   *   List.filter tags ~f:(fun (key, _) -> String.(key <> condition_label && key <> transition_label)) *)

  let unset_transition tags =
    List.Assoc.remove tags transition_label ~equal:String.equal

  let set_transition tags c =
    List.Assoc.(
39
      add (unset_transition tags) transition_label c ~equal:String.equal)
40 41 42 43 44
end

let condition_of_branch_info (bi : Newick.branch_info) =
  Tags.condition bi.tags

45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
let of_newick_tree t =
  let open Phylogenetics.Newick in
  try
    let node _ = () in
    let leaf (l : Newick.node_info) = match l.name with
      | Some n -> n
      | None -> failwith "missing leaf name"
    in
    let branch b =
      let length = match b.length with
        | None -> failwith "missing branch length"
        | Some bl -> bl
      in
      let condition =
        match
          List.Assoc.find ~equal:String.equal b.tags "Condition"
        with
        | Some s -> (
            match s with
            | "0" -> `Ancestral
            | "1" -> `Convergent
            | _ -> failwithf "Invalid condition: %s" s () )
        | None -> failwith "Missing Condition tag"
      in
      { length; condition }
    in
    Tree.map t ~node ~leaf ~branch
    |> Result.return
  with Failure msg -> Result.fail (`Msg msg)
74

75 76 77 78 79 80 81 82 83 84
let to_newick_tree t =
  let node () = { Newick.name = None } in
  let leaf l = { Newick.name = Some l } in
  let branch b = {
    Newick.length = Some b.length ;
    tags = []
  }
  in
  Tree.map t ~node ~leaf ~branch

85 86 87
let from_file fn =
  let module BI = Phylogenetics_convergence.Simulator.Branch_info in
  Newick.from_file fn
88
  |> Newick.with_inner_tree ~f:of_newick_tree
89 90 91 92 93

let leaves tree =
  let rec node condition t acc =
    match t with
    | Tree.Node n -> List1.fold_right n.branches ~init:acc ~f:branch
94
    | Leaf species -> (species, condition) :: acc
95
  and branch (Tree.Branch b) acc =
96
    node b.data.condition b.tip acc
97 98 99
  in
  node `Ancestral tree []

100 101 102 103 104 105 106 107
let%test "leaves computed in correct order" =
  match from_file "../../data/besnard2009/besnard2009.nhx" with
  | Ok t ->
    List.equal String.equal
      (List.take (leaves t) 4 |> List.map ~f:fst)
      [ "Chrysithr" ; "Ele.bald" ; "Ele.bal2" ; "Ele.bal4" ]
  | Error (`Msg msg) -> failwith msg

108
let rec transfer_condition_to_branches t =
109
  let category : _ Tree.t -> condition = function
110 111 112 113 114 115
    | Leaf (_, c) -> c
    | Node n -> snd n.data
  in
  match t with
  | Tree.Leaf (l, _) -> Tree.leaf l
  | Node n ->
116 117 118
      List1.map n.branches ~f:(fun (Branch b) ->
          let cat_child = category b.tip in
          let tags =
119
            Tags.set_condition b.data.Newick.tags (Tags.string_of_condition cat_child)
120 121 122 123 124
          in
          Tree.branch
            { b.data with Newick.tags }
            (transfer_condition_to_branches b.tip))
      |> Tree.node (fst n.data)
125

126
let reset_transitions (tree : Newick.tree) =
127
  let rec aux mother_condition tree =
128
    match (tree : Newick.tree) with
129 130
    | Leaf _ as l -> l
    | Node n ->
131 132 133 134 135 136 137 138 139 140 141 142
        let branches =
          List1.map n.branches ~f:(fun (Branch b) ->
              let tags, c_b =
                match Tags.condition b.data.tags with
                | None -> failwith "tree tagged with condition expected"
                | Some c_b ->
                    let tags =
                      if String.(c_b <> mother_condition) then
                        Tags.set_transition b.data.tags c_b
                      else Tags.unset_transition b.data.tags
                    in
                    (tags, c_b)
143
              in
144 145 146 147
              let data = { b.data with tags } in
              Tree.branch data (aux c_b b.tip))
        in
        Node { n with branches }
148 149 150 151
  in
  match tree with
  | Leaf _ as l -> l
  | Node n ->
152 153 154 155 156 157 158
      let branches =
        List1.map n.branches ~f:(fun (Branch b) ->
            match Tags.condition b.data.tags with
            | None -> failwith "tree tagged with condition expected"
            | Some c_b -> Tree.branch b.data (aux c_b b.tip))
      in
      Node { n with branches }
159 160 161 162 163

let length_on_each_condition branches =
  let module A = Biocaml_unix.Accu in
  let acc = A.create ~bin:Fn.id ~zero:0. ~add:( +. ) () in
  List.iter branches ~f:(fun bi ->
164
      match (condition_of_branch_info bi, bi.Newick.length) with
165
      | Some c, Some l -> A.add acc c l
166
      | _ -> ()) ;
167 168
  A.to_alist acc

169
let remove_nodes_with_single_child (tree : Newick.tree) =
170 171
  Tree.simplify_node_with_single_child tree
    ~merge_branch_data:(fun branches ->
172 173 174
      let condition_stats = length_on_each_condition branches in
      let major_condition =
        List.max_elt condition_stats ~compare:(fun (_, l) (_, l') ->
175
            Float.compare l l')
176
      in
177 178
      let tags =
        match major_condition with
179 180 181
        | None -> []
        | Some (c, _) -> Tags.set_condition [] c
      in
182 183 184 185 186
      let length =
        List.fold branches ~init:0. ~f:(fun acc bi ->
            acc +. Option.value_exn bi.length)
      in
      { Newick.tags; length = Some length })
187 188
  |> reset_transitions

189 190
let infer_binary_condition_on_branches ?(gain_relative_cost = 2.) t
    ~convergent_leaves =
191 192
  let category (ni : Newick.node_info) =
    Option.map ni.name ~f:(fun l ->
193
        if String.Set.mem convergent_leaves l then 1 else 0)
194 195
  in
  let cost x y =
196
    match (x, y) with
197 198
    | 0, 1 -> gain_relative_cost
    | 1, 0 -> 1.
199
    | 0, 0 | 1, 1 -> 0.
200 201
    | _ -> assert false
  in
202 203 204 205 206
  let convert_node = function
    | x, 0 -> x, `Ancestral
    | x, 1 -> x, `Convergent
    | _ -> assert false
  in
207
  Fitch.fitch ~cost ~n:2 ~category t
208
  |> Tree.map ~node:convert_node ~leaf:convert_node ~branch:Fn.id
209
  |> transfer_condition_to_branches |> reset_transitions
210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234


let alignment_counts_map tree alignment f =
  let leaves =
    leaves tree
    |> List.map ~f:(fun (n, cond) ->
        match Alignment.find_sequence alignment n with
        | None -> failwithf "Could not find %s in alignment" n ()
        | Some seq -> seq, cond
      )
  in
  let seqs0, seqs1 = List.partition_map leaves ~f:Either.(function
      | (aa, `Ancestral) -> First aa
      | (aa, `Convergent) -> Second aa
    )
  in
  let counts seqs i =
    Amino_acid.Table.init (fun aa ->
        let aa = Amino_acid.to_char aa in
        List.count seqs ~f:(fun s -> Char.equal s.[i] aa)
      )
  in
  let site i = f (counts seqs0 i) (counts seqs1 i) in
  let n = Alignment.ncols alignment in
  List.init n ~f:site