orthomam.ml 3.39 KB
Newer Older
1 2 3 4 5 6
open Core_kernel
open Bistro
open File_formats
open Reviewphiltrans_toolbox

let ensembl_tree : nhx file =
7
  Bistro_unix.wget "ftp://ftp.ensembl.org/pub/release-99/compara/species_trees/vertebrates_species-tree_Ensembl.nh"
8

9
type alignment = Orthomam_db.t * string
10

11
let alignments_of_db db =
12
  List.map (Orthomam_db.list_alignments db) ~f:(fun ali -> db, ali)
13 14 15 16 17 18 19 20 21 22 23

let%pworkflow fasta_of_phylip ali =
  let ali =
    [%path ali]
    |> Phylogenetics.Phylip.read_exn ~strict:false
  in
  List.map ali.items ~f:(fun it ->
      Biotk.Fasta.{ description = it.name ; sequence = it.sequence }
    )
  |> Biotk.Fasta.to_file [%dest]

24
type query = {
25 26
  db : Orthomam_db.t ;
  alignment : string ;
27 28 29
  convergent_species : string list workflow ;
}

30 31
let query (db, alignment) ~convergent_species =
  { db ; alignment ; convergent_species }
32 33 34 35 36 37 38 39 40 41

let rec transfer_tag_to_branches t =
  let open Phylogenetics in
  let category : _ Tree.t -> int = function
    | Leaf (_, c) -> c
    | Node n -> snd n.data
  in
  match t with
  | Tree.Leaf (l, _) -> Tree.leaf l
  | Node n ->
42
    let cat_n = snd n.data in
43
    List1.map n.branches ~f:(fun (Branch b) ->
44 45 46 47 48 49 50 51 52 53 54
        let cat_child = category b.tip in
        let tags = List.concat [
            [ "Condition", Int.to_string cat_child ] ;
            (
              if cat_n <> cat_child then
                [ "Transition", Int.to_string cat_child ]
              else []
            ) ;
            b.data.Newick.tags
          ]
        in
55 56 57 58 59 60
        Tree.branch
          { b.data with Newick.tags }
          (transfer_tag_to_branches b.tip)
      )
    |> Tree.node (fst n.data)

61
let tag_tree t tagged_leaves =
62 63 64 65 66 67 68 69 70
  let open Phylogenetics in
  let category (ni : Newick.node_info) =
    Option.map ni.name ~f:(fun l ->
        if String.Set.mem tagged_leaves l then 1 else 0
      )
  in
  Fitch.fitch ~n:2 ~category t
  |> transfer_tag_to_branches

71 72 73 74 75 76
let%pworkflow clip_tree_on_alignment tree ali =
  let open Phylogenetics in
  let tree = Newick.from_file [%path tree] in
  let ali = Phylip.read_exn ~strict:false [%path ali] in
  let ali_species = List.map ali.items ~f:(fun it -> it.name) in
  let clipped_tree =
Philippe Veber's avatar
Philippe Veber committed
77
    Newick.map_inner_tree tree ~f:(fun tree ->
78 79 80 81 82 83 84 85 86 87
        match
          Tree.leafset_generated_subtree tree
            (fun bi -> bi.Newick.name) ali_species
        with
        | None -> failwith "Tree has no leaf in alignment"
        | Some filtered_tree -> filtered_tree
      )
  in
  Newick.to_file clipped_tree [%dest]

88 89 90 91
let omm_tree q =
  Workflow.input (Orthomam_db.tree q.db)

let tree_of_convergent_species tree species =
92
  Workflow.path_plugin ~version:3 ~descr:"tree_of_convergent_species" (
93 94
    let open Phylogenetics in
    let%pdeps species = species
95
    and omm_tree = path tree in
96

97
    let ensembl_tree = Newick.from_file omm_tree in
98
    let tagged_tree =
Philippe Veber's avatar
Philippe Veber committed
99
      Newick.map_inner_tree ensembl_tree ~f:(fun t ->
100 101 102
          tag_tree t (String.Set.of_list species)
        )
    in
103 104 105
    Newick.to_file tagged_tree [%dest]
  )

106
module Q = struct
107
  type t = query
108 109 110 111 112 113

  let alignment q : phylip file =
    Workflow.input q.alignment

  let tree q =
    clip_tree_on_alignment
114
      (tree_of_convergent_species (omm_tree q) q.convergent_species)
115 116
      (alignment q)

117
  let nucleotide_alignment q =
118
    fasta_of_phylip (alignment q)
119 120
end

121 122
include Q

123
include Detection_pipeline.Make(Q)
124 125 126 127 128 129 130

let species_with_echolocation = [
  "Rhinolophus ferrumequinum" ;
  "Tursiops_truncatus" ;
  "Myotis_lucifugus" ;
  "Physeter_catodon" ;
]