orthomam.ml 10.7 KB
Newer Older
1 2 3 4 5 6
open Core_kernel
open Bistro
open File_formats
open Reviewphiltrans_toolbox

let ensembl_tree : nhx file =
7
  Bistro_unix.wget "ftp://ftp.ensembl.org/pub/release-99/compara/species_trees/vertebrates_species-tree_Ensembl.nh"
8

9
type alignment = Orthomam_db.t * string
10

11
let alignments_of_db db =
12
  List.map (Orthomam_db.list_alignments db) ~f:(fun ali -> db, ali)
13 14 15 16 17 18 19 20 21 22 23

let%pworkflow fasta_of_phylip ali =
  let ali =
    [%path ali]
    |> Phylogenetics.Phylip.read_exn ~strict:false
  in
  List.map ali.items ~f:(fun it ->
      Biotk.Fasta.{ description = it.name ; sequence = it.sequence }
    )
  |> Biotk.Fasta.to_file [%dest]

24
type query = {
25 26
  db : Orthomam_db.t ;
  alignment : string ;
27 28 29
  convergent_species : string list workflow ;
}

30 31
let query (db, alignment) ~convergent_species =
  { db ; alignment ; convergent_species }
32

33 34 35 36 37 38 39 40 41 42 43 44
let%pworkflow remove_unobserved_sequences_from_alignment phylip : phylip file =
  let phylip = [%path phylip] in
  let module Phylip = Phylogenetics.Phylip in
  let data = Phylip.read_exn ~strict:false phylip in
  let threshold = data.sequence_length * 90 / 100 in
  let items = List.filter data.items ~f:(fun it ->
      String.count it.sequence ~f:(Char.equal '?') < threshold
    )
  in
  let filtered_data = Phylip.make_exn items in
  Phylip.write ~strict:false filtered_data [%dest]

45 46 47 48 49 50 51 52 53
let rec transfer_tag_to_branches t =
  let open Phylogenetics in
  let category : _ Tree.t -> int = function
    | Leaf (_, c) -> c
    | Node n -> snd n.data
  in
  match t with
  | Tree.Leaf (l, _) -> Tree.leaf l
  | Node n ->
54
    let cat_n = snd n.data in
55
    List1.map n.branches ~f:(fun (Branch b) ->
56 57 58 59 60 61 62 63 64 65 66
        let cat_child = category b.tip in
        let tags = List.concat [
            [ "Condition", Int.to_string cat_child ] ;
            (
              if cat_n <> cat_child then
                [ "Transition", Int.to_string cat_child ]
              else []
            ) ;
            b.data.Newick.tags
          ]
        in
67 68 69 70 71 72
        Tree.branch
          { b.data with Newick.tags }
          (transfer_tag_to_branches b.tip)
      )
    |> Tree.node (fst n.data)

73
let tag_tree t tagged_leaves =
74 75 76 77 78 79 80 81 82
  let open Phylogenetics in
  let category (ni : Newick.node_info) =
    Option.map ni.name ~f:(fun l ->
        if String.Set.mem tagged_leaves l then 1 else 0
      )
  in
  Fitch.fitch ~n:2 ~category t
  |> transfer_tag_to_branches

83 84 85 86 87 88
let%pworkflow clip_tree_on_alignment tree ali =
  let open Phylogenetics in
  let tree = Newick.from_file [%path tree] in
  let ali = Phylip.read_exn ~strict:false [%path ali] in
  let ali_species = List.map ali.items ~f:(fun it -> it.name) in
  let clipped_tree =
Philippe Veber's avatar
Philippe Veber committed
89
    Newick.map_inner_tree tree ~f:(fun tree ->
90 91 92 93 94 95 96 97 98 99
        match
          Tree.leafset_generated_subtree tree
            (fun bi -> bi.Newick.name) ali_species
        with
        | None -> failwith "Tree has no leaf in alignment"
        | Some filtered_tree -> filtered_tree
      )
  in
  Newick.to_file clipped_tree [%dest]

100 101 102
let omm_tree_of_db db =
  Workflow.input (Orthomam_db.tree db)

103
let annotate_convergent_species_in_tree tree species =
104
  Workflow.path_plugin ~version:3 ~descr:"tree_of_convergent_species" (
105 106
    let open Phylogenetics in
    let%pdeps species = species
107
    and omm_tree = path tree in
108

109
    let ensembl_tree = Newick.from_file omm_tree in
110
    let tagged_tree =
Philippe Veber's avatar
Philippe Veber committed
111
      Newick.map_inner_tree ensembl_tree ~f:(fun t ->
112 113 114
          tag_tree t (String.Set.of_list species)
        )
    in
115 116 117
    Newick.to_file tagged_tree [%dest]
  )

118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
let%pworkflow compare_tree_branch_lengths t1 t2 =
  let open Phylogenetics in
  let t1 = Newick.from_file [%path t1] in
  let t2 = Newick.from_file [%path t2] in
  let rec reorder_branches (branches : _ Tree.branch List1.t) =
    List1.map branches ~f:(fun (Branch b) ->
        let tip, label = reorder_tree_aux b.tip in
        Tree.branch b.data tip, label
      )
  and reorder_tree_aux (t : Newick.tree) =
    match t with
    | Leaf l -> t, Option.value_exn l.name
    | Node n ->
      let branches_and_labels = reorder_branches n.branches in
      let branches, labels =
        branches_and_labels
        |> List1.sort ~compare:(fun (_, x) (_, y) ->
            String.compare x y
          )
        |> List1.unzip
      in
      let label = List1.hd labels in
      Tree.node n.data branches, label
  and reorder_tree t = fst (reorder_tree_aux t)
  in
  let rec collect_branch_lengths (t1 : Newick.tree) (t2 : Newick.tree) =
    match (t1, t2) with
    | Leaf _, Leaf _ -> []
    | Node n1, Node n2 ->
      List1.map2_exn n1.branches n2.branches ~f:(fun (Branch b1) (Branch b2) ->
          let acc = collect_branch_lengths b1.tip b2.tip in
          match b1.data.length, b2.data.length with
          | Some l1, Some l2 -> (l1, l2) :: acc
          | _ -> acc
        )
      |> List1.to_list
      |> List.concat
    | Leaf _, Node _
    | Node _, Leaf _ -> invalid_arg "trees with different topologies"
  in
  let t1 = Newick.with_inner_tree ~f:reorder_tree t1 in
  let t2 = Newick.with_inner_tree ~f:reorder_tree t2 in
  let l1, l2 =
    collect_branch_lengths t1 t2
    |> List.unzip
  in
  OCamlR_grDevices.pdf [%dest] ;
  OCamlR_graphics.plot
    ~main:"Comparison of branch lengths"
    ~xlab:"Branch lengths in tree 1"
    ~ylab:"Branch lengths in tree 2"
    ~x:(Array.of_list l1) ~y:(Array.of_list l2) () ;
  OCamlR_grDevices.dev_off ()
171 172 173 174

let%pworkflow number_of_missing_sequences_repartition db =
  Orthomam_db.missing_sequences_cdf [%param db] [%dest]

175
let%pworkflow concatenate ?seed ?site_sampling ~max_missing_sequences db =
176 177
  let db = [%param db] in
  let max_missing_sequences = [%param max_missing_sequences] in
178 179
  let seed = [%param seed] in
  let site_sampling = [%param site_sampling] in
180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
  let open Phylogenetics in
  let all_alignments = Orthomam_db.list_alignments db in
  let missing_sequence seq =
    String.count seq ~f:Char.(fun c -> c = '?' || c = '-') = String.length seq
  in
  let compare_items (x : Phylip.item) (y : Phylip.item) =
    String.compare x.name y.name
  in
  let merge_items (xs : Phylip.item list) =
    match xs with
    | [] -> assert false
    | h :: t ->
      assert (List.for_all t ~f:(fun x -> String.equal x.name h.name)) ;
      { Phylip.name = h.name ;
        sequence =
          List.map xs ~f:(fun x -> x.sequence)
          |> String.concat }
  in
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
  let sample_sites (alignments : Phylip.t list) n =
    let seed = match seed with
      | None -> Gsl.Rng.default_seed ()
      | Some i -> Nativeint.of_int i
    in
    let alignments = Array.of_list alignments in
    let sizes =
      Array.map alignments ~f:(fun al -> float al.sequence_length)
      |> Gsl.Randist.discrete_preproc
    in
    let rng = Gsl.Rng.(make (default ())) in
    Gsl.Rng.set rng seed ;
    let nsequences = alignments.(0).number_of_sequences in
    let sequences = Array.init nsequences ~f:(fun _ ->
        Bytes.create n
      )
    in
    for i = 0 to n - 1 do
      let al = alignments.(Gsl.Randist.discrete rng sizes) in
      let p = Gsl.Rng.uniform_int rng al.sequence_length in
      List.iteri al.items ~f:(fun j it ->
          Bytes.set sequences.(j) i it.sequence.[p]
        )
    done ;
    List.mapi alignments.(0).items ~f:(fun i it ->
        { Phylip.sequence = Bytes.to_string sequences.(i) ; name = it.name }
      )
  in
  let filtered_alignments =
    List.filter_map all_alignments ~f:(fun ali_fn ->
        let ali = Phylip.read_exn ~strict:false ali_fn in
        let nmissing = List.count ali.items ~f:(fun it -> missing_sequence it.sequence) in
        if nmissing <= max_missing_sequences then
231 232 233
          if List.for_all ali.items ~f:(fun it -> Result.is_ok (Orthomam_db.check_orf it.sequence))
          then Some ali
          else None
234 235 236 237 238
        else None
      )
  in
  let merged_items (alignments : Phylip.t list) =
    List.map alignments ~f:(fun ali ->
239 240 241 242 243
        List.sort ali.items ~compare:compare_items
      )
    |> List.transpose_exn
    |> List.map ~f:merge_items
  in
244 245 246 247 248 249
  let items =
    match site_sampling with
    | None -> merged_items filtered_alignments
    | Some n -> sample_sites filtered_alignments n
  in
  Phylip.write ~strict:false (Phylip.make_exn items) [%dest]
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286

let%pworkflow concatenate_calibration_figure ~nsites ~trees =
  let open Phylogenetics in
  let nsites = [%param nsites] in
  let tree_paths = [%eval
    List.map trees ~f:Workflow.path_list
    |> Workflow.list
  ]
  in
  let tree_lengths =
    List.map tree_paths ~f:(
      List.map ~f:(fun tree_path ->
          Newick.from_file tree_path
          |> Newick.with_inner_tree ~f:(
            Tree.pre ~init:0.
              ~node:(fun acc _ -> acc) ~leaf:(fun acc _ -> acc)
              ~branch:(fun acc bi ->
                  match bi.Newick.length with
                  | None -> failwith "expected tree with branch length"
                  | Some l -> acc +. l
                )
          )
        )
    )
  in
  let x =
    List.map2_exn nsites trees ~f:(fun n trees ->
        List.init (List.length trees) ~f:(fun _ -> float n)
      )
    |> List.concat
    |> Array.of_list
  in
  let y =
    List.concat tree_lengths
    |> Array.of_list
  in
  OCamlR_grDevices.pdf [%dest] ;
287
  OCamlR_graphics.plot ~ylab:"Total branch length" ~xlab:"Concatenate size" ~log:`XY ~x ~y () ;
288 289 290
  OCamlR_grDevices.dev_off ()

let concatenate_calibration ~seed db =
291 292
  let nrep = 5 in
  let nsites = [ 50 ; 100 ; 200 ; 500 ; 1000 ; 2000 ; 5000 ; 10000 ; 20000 ; 50000 ] in
293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316
  let seeds =
    let rng = Random.State.make [| seed |] in
    List.map nsites ~f:(fun _ ->
        List.init nrep ~f:(fun _ ->
            Random.State.int rng (1 lsl 30 - 1)
          )
      )
  in
  let concatenates =
    List.map2_exn nsites seeds ~f:(fun n seeds ->
        List.map seeds ~f:(fun seed ->
            concatenate ~seed ~site_sampling:n ~max_missing_sequences:0 db
          )
      )
  in
  let omm_tree = omm_tree_of_db db in
  let trees =
    List.map concatenates ~f:(
      List.map ~f:(fun ali ->
          Biotope.Iqtree.iqtree ~o:"Ornithorhynchus_anatinus" ~te:omm_tree (`phylip ali)
        )
    )
  in
  concatenate_calibration_figure ~nsites ~trees
317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351

let estimated_nucleotide_tree db =
  concatenate ~seed:1234 ~site_sampling:10_000 ~max_missing_sequences:0 db

module Q = struct
  type t = query

  let alignment q : phylip file =
    Workflow.input q.alignment
    |> remove_unobserved_sequences_from_alignment

  let tree ~branch_length_unit q =
    let omm_tree_with_branch_lengths = match branch_length_unit with
      | `Nucleotide -> estimated_nucleotide_tree q.db
      | `Amino_acid -> assert false
      | `Codon -> assert false
    in
    clip_tree_on_alignment
      (annotate_convergent_species_in_tree omm_tree_with_branch_lengths q.convergent_species)
      (alignment q)

  let nucleotide_alignment q =
    fasta_of_phylip (alignment q)
end

include Q

include Detection_pipeline.Make(Q)

let species_with_echolocation = [
  "Rhinolophus ferrumequinum" ;
  "Tursiops_truncatus" ;
  "Myotis_lucifugus" ;
  "Physeter_catodon" ;
]