orthomam.ml 16.1 KB
Newer Older
1 2 3 4 5 6
open Core_kernel
open Bistro
open File_formats
open Reviewphiltrans_toolbox

let ensembl_tree : nhx file =
7
  Bistro_unix.wget "ftp://ftp.ensembl.org/pub/release-99/compara/species_trees/vertebrates_species-tree_Ensembl.nh"
8

9
type alignment = Orthomam_db.t * string
10

11
let alignments_of_db db =
12
  List.map (Orthomam_db.list_alignments db) ~f:(fun ali -> db, ali)
13 14 15 16 17 18 19 20 21 22 23

let%pworkflow fasta_of_phylip ali =
  let ali =
    [%path ali]
    |> Phylogenetics.Phylip.read_exn ~strict:false
  in
  List.map ali.items ~f:(fun it ->
      Biotk.Fasta.{ description = it.name ; sequence = it.sequence }
    )
  |> Biotk.Fasta.to_file [%dest]

24
type query = {
25 26
  db : Orthomam_db.t ;
  alignment : string ;
27 28 29
  convergent_species : string list workflow ;
}

30 31
let query (db, alignment) ~convergent_species =
  { db ; alignment ; convergent_species }
32

33 34 35 36 37 38 39 40 41 42 43 44
let%pworkflow remove_unobserved_sequences_from_alignment phylip : phylip file =
  let phylip = [%path phylip] in
  let module Phylip = Phylogenetics.Phylip in
  let data = Phylip.read_exn ~strict:false phylip in
  let threshold = data.sequence_length * 90 / 100 in
  let items = List.filter data.items ~f:(fun it ->
      String.count it.sequence ~f:(Char.equal '?') < threshold
    )
  in
  let filtered_data = Phylip.make_exn items in
  Phylip.write ~strict:false filtered_data [%dest]

45 46 47 48 49 50 51 52 53
let rec transfer_tag_to_branches t =
  let open Phylogenetics in
  let category : _ Tree.t -> int = function
    | Leaf (_, c) -> c
    | Node n -> snd n.data
  in
  match t with
  | Tree.Leaf (l, _) -> Tree.leaf l
  | Node n ->
54
    let cat_n = snd n.data in
55
    List1.map n.branches ~f:(fun (Branch b) ->
56 57 58 59 60 61 62 63 64 65 66
        let cat_child = category b.tip in
        let tags = List.concat [
            [ "Condition", Int.to_string cat_child ] ;
            (
              if cat_n <> cat_child then
                [ "Transition", Int.to_string cat_child ]
              else []
            ) ;
            b.data.Newick.tags
          ]
        in
67 68 69 70 71 72
        Tree.branch
          { b.data with Newick.tags }
          (transfer_tag_to_branches b.tip)
      )
    |> Tree.node (fst n.data)

73
let tag_tree t tagged_leaves =
74 75 76 77 78 79
  let open Phylogenetics in
  let category (ni : Newick.node_info) =
    Option.map ni.name ~f:(fun l ->
        if String.Set.mem tagged_leaves l then 1 else 0
      )
  in
80 81 82 83 84 85 86 87 88
  let cost x y =
    match x, y with
    | 0, 1 -> 2.
    | 1, 0 -> 1.
    | 0, 0
    | 1, 1 -> 0.
    | _ -> assert false
  in
  Fitch.fitch ~cost ~n:2 ~category t
89 90
  |> transfer_tag_to_branches

91 92 93 94 95 96
let%pworkflow clip_tree_on_alignment tree ali =
  let open Phylogenetics in
  let tree = Newick.from_file [%path tree] in
  let ali = Phylip.read_exn ~strict:false [%path ali] in
  let ali_species = List.map ali.items ~f:(fun it -> it.name) in
  let clipped_tree =
Philippe Veber's avatar
Philippe Veber committed
97
    Newick.map_inner_tree tree ~f:(fun tree ->
98 99 100 101 102 103 104 105 106 107
        match
          Tree.leafset_generated_subtree tree
            (fun bi -> bi.Newick.name) ali_species
        with
        | None -> failwith "Tree has no leaf in alignment"
        | Some filtered_tree -> filtered_tree
      )
  in
  Newick.to_file clipped_tree [%dest]

108 109 110
let omm_tree_of_db db =
  Workflow.input (Orthomam_db.tree db)

111
let annotate_convergent_species_in_tree (tree : newick file) species : newick file =
112
  Workflow.path_plugin ~version:3 ~descr:"tree_of_convergent_species" (
113 114
    let open Phylogenetics in
    let%pdeps species = species
115
    and omm_tree = path tree in
116

117
    let ensembl_tree = Newick.from_file omm_tree in
118
    let tagged_tree =
Philippe Veber's avatar
Philippe Veber committed
119
      Newick.map_inner_tree ensembl_tree ~f:(fun t ->
120 121 122
          tag_tree t (String.Set.of_list species)
        )
    in
123 124 125
    Newick.to_file tagged_tree [%dest]
  )

126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
let%pworkflow compare_tree_branch_lengths t1 t2 =
  let open Phylogenetics in
  let t1 = Newick.from_file [%path t1] in
  let t2 = Newick.from_file [%path t2] in
  let rec reorder_branches (branches : _ Tree.branch List1.t) =
    List1.map branches ~f:(fun (Branch b) ->
        let tip, label = reorder_tree_aux b.tip in
        Tree.branch b.data tip, label
      )
  and reorder_tree_aux (t : Newick.tree) =
    match t with
    | Leaf l -> t, Option.value_exn l.name
    | Node n ->
      let branches_and_labels = reorder_branches n.branches in
      let branches, labels =
        branches_and_labels
        |> List1.sort ~compare:(fun (_, x) (_, y) ->
            String.compare x y
          )
        |> List1.unzip
      in
      let label = List1.hd labels in
      Tree.node n.data branches, label
  and reorder_tree t = fst (reorder_tree_aux t)
  in
  let rec collect_branch_lengths (t1 : Newick.tree) (t2 : Newick.tree) =
    match (t1, t2) with
    | Leaf _, Leaf _ -> []
    | Node n1, Node n2 ->
      List1.map2_exn n1.branches n2.branches ~f:(fun (Branch b1) (Branch b2) ->
          let acc = collect_branch_lengths b1.tip b2.tip in
          match b1.data.length, b2.data.length with
          | Some l1, Some l2 -> (l1, l2) :: acc
          | _ -> acc
        )
      |> List1.to_list
      |> List.concat
    | Leaf _, Node _
    | Node _, Leaf _ -> invalid_arg "trees with different topologies"
  in
  let t1 = Newick.with_inner_tree ~f:reorder_tree t1 in
  let t2 = Newick.with_inner_tree ~f:reorder_tree t2 in
  let l1, l2 =
    collect_branch_lengths t1 t2
    |> List.unzip
  in
  OCamlR_grDevices.pdf [%dest] ;
  OCamlR_graphics.plot
    ~main:"Comparison of branch lengths"
    ~xlab:"Branch lengths in tree 1"
    ~ylab:"Branch lengths in tree 2"
    ~x:(Array.of_list l1) ~y:(Array.of_list l2) () ;
  OCamlR_grDevices.dev_off ()
179

180
let%pworkflow [@version 6] number_of_missing_sequences_repartition db =
181 182
  Orthomam_db.missing_sequences_cdf [%param db] [%dest]

183
let%pworkflow [@mem Workflow.int 4096] concatenate ?(nmissing = 0) ?seed db n : phylip file =
184
  let db = [%param db] in
185
  let seed = [%param seed] in
186
  let n = [%param n] in
187 188 189 190
  let open Phylogenetics in
  let compare_items (x : Phylip.item) (y : Phylip.item) =
    String.compare x.name y.name
  in
191 192 193 194 195 196 197 198
  let rec at_most_n_failures xs ~n ~f =
    if n <= 0 then List.for_all xs ~f
    else
      match xs with
      | [] -> true
      | h :: t ->
        let n = if f h then n else n - 1 in
        at_most_n_failures t ~n ~f
199
  in
200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
  let aa_at_pos seq i =
    let seq = String.sub seq ~pos:(3 * i) ~len:3 in
    let open Phylogenetics in
    Codon.of_string seq
    |> Option.bind ~f:Codon.Universal_genetic_code.aa_of_codon
  in
  let full_sites_of_alignment (alignment : Phylip.t) =
    if alignment.sequence_length mod 3 = 0 then
      Reviewphiltrans_toolbox.Utils.int_fold 0 (alignment.sequence_length / 3) ~init:[] ~f:(fun acc j ->
          let column_is_quasi_full =
            at_most_n_failures alignment.items ~n:nmissing ~f:(fun it ->
                Option.is_some (aa_at_pos it.sequence j)
              )
          in
          if column_is_quasi_full then (alignment, j) :: acc else acc
        )
    else []
  in
  let all_full_sites alignments =
    Array.concat_map (Array.of_list alignments) ~f:(fun al ->
        Array.of_list (full_sites_of_alignment al)
      )
  in
  let build_alignment_from_sites (sites : (Phylip.t * int) array) =
    assert (Array.length sites > n) ;
    let al0, _ = sites.(0) in
    let nb_sequences = al0.number_of_sequences in
    let sequences = Array.init nb_sequences ~f:(fun _ ->
228
        Bytes.create (3 * n)
229 230 231
      )
    in
    for i = 0 to n - 1 do
232
      let al, p = sites.(i) in
233
      List.iteri al.items ~f:(fun j it ->
234 235
          match aa_at_pos it.sequence p with
          | Some _ ->
236 237 238
            Bytes.set sequences.(j) (3 * i) it.sequence.[3 * p] ;
            Bytes.set sequences.(j) (3 * i + 1) it.sequence.[3 * p + 1] ;
            Bytes.set sequences.(j) (3 * i + 2) it.sequence.[3 * p + 2]
239 240 241 242
          | None ->
            Bytes.set sequences.(j) (3 * i) 'N' ;
            Bytes.set sequences.(j) (3 * i + 1) 'N' ;
            Bytes.set sequences.(j) (3 * i + 2) 'N' ;
243 244
        )
    done ;
245
    List.mapi al0.items ~f:(fun i it ->
246 247 248
        { Phylip.sequence = Bytes.to_string sequences.(i) ; name = it.name }
      )
  in
249 250 251
  let all_alignments =
    Orthomam_db.list_alignments db
    |> List.filter_map ~f:(fun ali_fn ->
252
        let ali = Phylip.read_exn ~strict:false ali_fn in
253 254 255 256
        if ali.sequence_length mod 3 = 0 then
          Phylip.make_exn (List.sort ali.items ~compare:compare_items)
          |> Option.some
        else None
257 258
      )
  in
259 260 261 262 263
  let all_full_sites = all_full_sites all_alignments in
  if Array.length all_full_sites < n then failwith "not enough sites" ;
  let seed = match seed with
    | None -> Gsl.Rng.default_seed ()
    | Some i -> Nativeint.of_int i
264
  in
265 266 267 268 269
  let rng = Gsl.Rng.(make (default ())) in
  Gsl.Rng.set rng seed ;
  Gsl.Randist.shuffle rng all_full_sites ;
  let items = build_alignment_from_sites all_full_sites in
  printf "nfull sites = %d\n%!" (Array.length all_full_sites) ;
270
  Phylip.write ~strict:false (Phylip.make_exn items) [%dest]
271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307

let%pworkflow concatenate_calibration_figure ~nsites ~trees =
  let open Phylogenetics in
  let nsites = [%param nsites] in
  let tree_paths = [%eval
    List.map trees ~f:Workflow.path_list
    |> Workflow.list
  ]
  in
  let tree_lengths =
    List.map tree_paths ~f:(
      List.map ~f:(fun tree_path ->
          Newick.from_file tree_path
          |> Newick.with_inner_tree ~f:(
            Tree.pre ~init:0.
              ~node:(fun acc _ -> acc) ~leaf:(fun acc _ -> acc)
              ~branch:(fun acc bi ->
                  match bi.Newick.length with
                  | None -> failwith "expected tree with branch length"
                  | Some l -> acc +. l
                )
          )
        )
    )
  in
  let x =
    List.map2_exn nsites trees ~f:(fun n trees ->
        List.init (List.length trees) ~f:(fun _ -> float n)
      )
    |> List.concat
    |> Array.of_list
  in
  let y =
    List.concat tree_lengths
    |> Array.of_list
  in
  OCamlR_grDevices.pdf [%dest] ;
308
  OCamlR_graphics.plot ~ylab:"Total branch length" ~xlab:"Concatenate size" ~log:`XY ~x ~y () ;
309 310
  OCamlR_grDevices.dev_off ()

311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
let%pworkflow phylip_aa_of_nuc (ali : phylip file) : phylip file =
  let open Phylogenetics in
  let has_some_gaps s = String.exists s ~f:(function '-' | '?' -> true | _ -> false) in
  let convert_seq seq =
    let fail subseq =
      failwithf "Sequence %s cannot be converted to aminoacid sequence because it contains %s" seq subseq ()
    in
    let n = String.length seq in
    String.init (n / 3) ~f:(fun i ->
        let subseq = String.sub seq ~pos:(i * 3) ~len:3 in
        match Codon.of_string subseq with
        | Some codon -> (
            match Codon.Universal_genetic_code.aa_of_codon codon with
            | Some aa -> Amino_acid.to_char aa
            | None -> 'N'
326
          )
327 328
        | None ->
          if has_some_gaps subseq then '-'
329
          else if String.exists subseq ~f:(Char.( = ) 'N') then 'N'
330
          else fail subseq
331 332
      )
  in
333 334 335
  let input_ali = Phylip.read_exn ~strict:false [%path ali] in
  let items = List.map input_ali.items ~f:(fun it ->
      { it with Phylip.sequence = convert_seq it.sequence }
336 337
    )
  in
338
  Phylip.write ~strict:false (Phylip.make_exn items) [%dest]
339

340 341 342 343 344 345 346 347 348 349 350 351 352 353
let%pworkflow iqtree_nexus_partition_file_of_alignment ali : Biotope.Formats.nexus file =
  let maybe_nb_columns =
    let open Option in
    In_channel.with_file [%path ali] ~f:In_channel.input_line >>= fun ali_first_line ->
    String.lsplit2 ali_first_line ~on:'\t' >>= fun (_, n) ->
    return (Int.of_string n)
  in
  let nb_columns = Option.value_exn ~message:"Could not parse phylip alignment first line" maybe_nb_columns in
  Out_channel.with_file [%dest] ~f:(fun oc ->
      fprintf oc
        {|#nexus begin sets; charset part1 = 1-%d\3; charset part2 = 2-%d\3; charset part3 = 3-%d\3; charpartition mine = GTR+R:part1, GTR+R:part2, GTR+R:part3; end;|}
        nb_columns nb_columns nb_columns
    )

354
let branch_length_estimation_concatenate db =
355
  concatenate ~nmissing:2 ~seed:42 db 20_000
356 357 358

let nucleotide_tree_estimatimation db ali =
  let open Biotope in
359
  let spp = iqtree_nexus_partition_file_of_alignment ali in
360 361 362 363 364 365
  Iqtree.iqtree ~st:`DNA ~te:(omm_tree_of_db db) ~spp (`phylip ali)
  |> Iqtree.treefile

let estimated_nucleotide_tree db =
  let ali = branch_length_estimation_concatenate db in
  nucleotide_tree_estimatimation db ali
366 367

let estimated_amino_acid_tree db =
368 369 370 371 372 373 374 375
  let open Biotope in
  let ali =
    branch_length_estimation_concatenate db
    |> phylip_aa_of_nuc
  in
  let model = Iqtree.model_spec ~freq_type:`F ~rate_type:(`R None) `LG in
  Iqtree.iqtree ~te:(omm_tree_of_db db) ~m:model (`phylip ali)
  |> Iqtree.treefile
376 377

let estimated_codon_tree db =
378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397
  let open Biotope in
  let ali = branch_length_estimation_concatenate db in
  let model = Iqtree.model_spec ~freq_type:`F `ECMK07 in
  Iqtree.iqtree ~te:(omm_tree_of_db db) ~m:model ~st:(`CODON 1) (`phylip ali)
  |> Iqtree.treefile

let concatenate_calibration ~seed db =
  let nrep = 5 in
  let nsites = [ 50 ; 100 ; 200 ; 500 ; 1000 ; 2000 ; 5000 ; 10000 ; 20000 ; 50000 ] in
  let seeds =
    let rng = Random.State.make [| seed |] in
    List.map nsites ~f:(fun _ ->
        List.init nrep ~f:(fun _ ->
            Random.State.int rng (1 lsl 30 - 1)
          )
      )
  in
  let concatenates =
    List.map2_exn nsites seeds ~f:(fun n seeds ->
        List.map seeds ~f:(fun seed ->
398
            concatenate ~nmissing:2 ~seed db n
399 400 401 402 403 404 405 406 407
          )
      )
  in
  let trees =
    List.map concatenates ~f:(
      List.map ~f:(nucleotide_tree_estimatimation db)
    )
  in
  concatenate_calibration_figure ~nsites ~trees
408 409 410 411 412 413 414 415 416 417 418

module Q = struct
  type t = query

  let alignment q : phylip file =
    Workflow.input q.alignment
    |> remove_unobserved_sequences_from_alignment

  let tree ~branch_length_unit q =
    let omm_tree_with_branch_lengths = match branch_length_unit with
      | `Nucleotide -> estimated_nucleotide_tree q.db
419 420
      | `Amino_acid -> estimated_amino_acid_tree q.db
      | `Codon -> estimated_codon_tree q.db
421 422 423 424 425 426 427 428 429 430 431 432 433 434
    in
    clip_tree_on_alignment
      (annotate_convergent_species_in_tree omm_tree_with_branch_lengths q.convergent_species)
      (alignment q)

  let nucleotide_alignment q =
    fasta_of_phylip (alignment q)
end

include Q

include Detection_pipeline.Make(Q)

let species_with_echolocation = [
435
  "Orcinus_orca" ;
436
  "Myotis_lucifugus" ;
437 438 439 440 441
  "Eptesicus_fuscus" ;
  "Tursiops_truncatus" ;
  "Miniopterus_natalensis" ;
  (* "Pteropus_vampyrus" ; unsure, i'd say no *)
  (* "Pteropus alecto" ; unsure, i'd say no *)
442
  "Rhinolophus_sinicus" ;
443 444
  "Lipotes_vexillifer" ;
  "Hipposideros_armiger" ;
445
  "Physeter_catodon" ;
446 447 448 449 450 451
  "Rousettus_aegyptiacus" ; (* unsure, i'd say yes *)
  "Myotis_brandtii" ;
  "Balaenoptera_acutorostrata_scammoni" ;
  "Delphinapterus_leucas" ;
  "Myotis_davidii" ;
  (* "Rhinolophus ferrumequinum" ; in ensembl but not in orthomam *)
452
]
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497

let%pworkflow convergence_species_tree_pdf ~convergent_species db =
  let convergent_species = [%param convergent_species] in
  let tree_path = [%path estimated_amino_acid_tree db] in
  let open Phylogenetics in
  let convergent_species = String.Set.of_list convergent_species in
  let render_tree (tree_or_branch : Newick.t) =
    let module P = Biotk.Phylo_tree_draw in
    let rec of_tree : Newick.tree -> P.tree = function
      | Leaf ni ->
        let col, text =
          match ni.name with
          | Some name ->
            let col =
              if String.Set.mem convergent_species name
              then Gg.Color.red else Gg.Color.black
            in
            col, String.tr ~target:'_' ~replacement:' ' name
          | None -> Gg.Color.black, ""
        in
        P.leaf ~col ~style:`oblique text
      | Node n ->
        Phylogenetics.List1.map n.branches ~f:of_branch
        |> Phylogenetics.List1.to_list
        |> P.node
    and of_branch : Newick.branch -> P.branch = fun (Branch b) ->
      let len = Option.value_exn b.data.length in
      let col =
        match List.Assoc.find b.data.tags "Condition" ~equal:String.equal with
        | Some "1" -> Gg.Color.red
        | _ -> Gg.Color.black
      in
      P.branch ~col len (of_tree b.tip)
    in
    match tree_or_branch with
    | Tree t -> P.draw_tree (of_tree t)
    | Branch b -> P.draw_branch (of_branch b)
  in
  let tree_or_branch =
    Newick.from_file tree_path
    |> Newick.map_inner_tree ~f:(fun t -> tag_tree t convergent_species)
  in
  render_tree tree_or_branch
  |> Biotk_croquis.Croquis.Layout.simple
  |> Fn.flip Biotk_croquis.Croquis.Layout.render_pdf [%dest]