orthomam.ml 8.27 KB
Newer Older
1 2 3 4 5 6
open Core_kernel
open Bistro
open File_formats
open Reviewphiltrans_toolbox

let ensembl_tree : nhx file =
7
  Bistro_unix.wget "ftp://ftp.ensembl.org/pub/release-99/compara/species_trees/vertebrates_species-tree_Ensembl.nh"
8

9
type alignment = Orthomam_db.t * string
10

11
let alignments_of_db db =
12
  List.map (Orthomam_db.list_alignments db) ~f:(fun ali -> db, ali)
13 14 15 16 17 18 19 20 21 22 23

let%pworkflow fasta_of_phylip ali =
  let ali =
    [%path ali]
    |> Phylogenetics.Phylip.read_exn ~strict:false
  in
  List.map ali.items ~f:(fun it ->
      Biotk.Fasta.{ description = it.name ; sequence = it.sequence }
    )
  |> Biotk.Fasta.to_file [%dest]

24
type query = {
25 26
  db : Orthomam_db.t ;
  alignment : string ;
27 28 29
  convergent_species : string list workflow ;
}

30 31
let query (db, alignment) ~convergent_species =
  { db ; alignment ; convergent_species }
32

33 34 35 36 37 38 39 40 41 42 43 44
let%pworkflow remove_unobserved_sequences_from_alignment phylip : phylip file =
  let phylip = [%path phylip] in
  let module Phylip = Phylogenetics.Phylip in
  let data = Phylip.read_exn ~strict:false phylip in
  let threshold = data.sequence_length * 90 / 100 in
  let items = List.filter data.items ~f:(fun it ->
      String.count it.sequence ~f:(Char.equal '?') < threshold
    )
  in
  let filtered_data = Phylip.make_exn items in
  Phylip.write ~strict:false filtered_data [%dest]

45 46 47 48 49 50 51 52 53
let rec transfer_tag_to_branches t =
  let open Phylogenetics in
  let category : _ Tree.t -> int = function
    | Leaf (_, c) -> c
    | Node n -> snd n.data
  in
  match t with
  | Tree.Leaf (l, _) -> Tree.leaf l
  | Node n ->
54
    let cat_n = snd n.data in
55
    List1.map n.branches ~f:(fun (Branch b) ->
56 57 58 59 60 61 62 63 64 65 66
        let cat_child = category b.tip in
        let tags = List.concat [
            [ "Condition", Int.to_string cat_child ] ;
            (
              if cat_n <> cat_child then
                [ "Transition", Int.to_string cat_child ]
              else []
            ) ;
            b.data.Newick.tags
          ]
        in
67 68 69 70 71 72
        Tree.branch
          { b.data with Newick.tags }
          (transfer_tag_to_branches b.tip)
      )
    |> Tree.node (fst n.data)

73
let tag_tree t tagged_leaves =
74 75 76 77 78 79 80 81 82
  let open Phylogenetics in
  let category (ni : Newick.node_info) =
    Option.map ni.name ~f:(fun l ->
        if String.Set.mem tagged_leaves l then 1 else 0
      )
  in
  Fitch.fitch ~n:2 ~category t
  |> transfer_tag_to_branches

83 84 85 86 87 88
let%pworkflow clip_tree_on_alignment tree ali =
  let open Phylogenetics in
  let tree = Newick.from_file [%path tree] in
  let ali = Phylip.read_exn ~strict:false [%path ali] in
  let ali_species = List.map ali.items ~f:(fun it -> it.name) in
  let clipped_tree =
Philippe Veber's avatar
Philippe Veber committed
89
    Newick.map_inner_tree tree ~f:(fun tree ->
90 91 92 93 94 95 96 97 98 99
        match
          Tree.leafset_generated_subtree tree
            (fun bi -> bi.Newick.name) ali_species
        with
        | None -> failwith "Tree has no leaf in alignment"
        | Some filtered_tree -> filtered_tree
      )
  in
  Newick.to_file clipped_tree [%dest]

100 101 102 103
let omm_tree q =
  Workflow.input (Orthomam_db.tree q.db)

let tree_of_convergent_species tree species =
104
  Workflow.path_plugin ~version:3 ~descr:"tree_of_convergent_species" (
105 106
    let open Phylogenetics in
    let%pdeps species = species
107
    and omm_tree = path tree in
108

109
    let ensembl_tree = Newick.from_file omm_tree in
110
    let tagged_tree =
Philippe Veber's avatar
Philippe Veber committed
111
      Newick.map_inner_tree ensembl_tree ~f:(fun t ->
112 113 114
          tag_tree t (String.Set.of_list species)
        )
    in
115 116 117
    Newick.to_file tagged_tree [%dest]
  )

118
module Q = struct
119
  type t = query
120 121 122

  let alignment q : phylip file =
    Workflow.input q.alignment
123
    |> remove_unobserved_sequences_from_alignment
124 125 126

  let tree q =
    clip_tree_on_alignment
127
      (tree_of_convergent_species (omm_tree q) q.convergent_species)
128 129
      (alignment q)

130
  let nucleotide_alignment q =
131
    fasta_of_phylip (alignment q)
132 133
end

134 135
include Q

136
include Detection_pipeline.Make(Q)
137 138 139 140 141 142 143

let species_with_echolocation = [
  "Rhinolophus ferrumequinum" ;
  "Tursiops_truncatus" ;
  "Myotis_lucifugus" ;
  "Physeter_catodon" ;
]
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197

let%pworkflow compare_tree_branch_lengths t1 t2 =
  let open Phylogenetics in
  let t1 = Newick.from_file [%path t1] in
  let t2 = Newick.from_file [%path t2] in
  let rec reorder_branches (branches : _ Tree.branch List1.t) =
    List1.map branches ~f:(fun (Branch b) ->
        let tip, label = reorder_tree_aux b.tip in
        Tree.branch b.data tip, label
      )
  and reorder_tree_aux (t : Newick.tree) =
    match t with
    | Leaf l -> t, Option.value_exn l.name
    | Node n ->
      let branches_and_labels = reorder_branches n.branches in
      let branches, labels =
        branches_and_labels
        |> List1.sort ~compare:(fun (_, x) (_, y) ->
            String.compare x y
          )
        |> List1.unzip
      in
      let label = List1.hd labels in
      Tree.node n.data branches, label
  and reorder_tree t = fst (reorder_tree_aux t)
  in
  let rec collect_branch_lengths (t1 : Newick.tree) (t2 : Newick.tree) =
    match (t1, t2) with
    | Leaf _, Leaf _ -> []
    | Node n1, Node n2 ->
      List1.map2_exn n1.branches n2.branches ~f:(fun (Branch b1) (Branch b2) ->
          let acc = collect_branch_lengths b1.tip b2.tip in
          match b1.data.length, b2.data.length with
          | Some l1, Some l2 -> (l1, l2) :: acc
          | _ -> acc
        )
      |> List1.to_list
      |> List.concat
    | Leaf _, Node _
    | Node _, Leaf _ -> invalid_arg "trees with different topologies"
  in
  let t1 = Newick.with_inner_tree ~f:reorder_tree t1 in
  let t2 = Newick.with_inner_tree ~f:reorder_tree t2 in
  let l1, l2 =
    collect_branch_lengths t1 t2
    |> List.unzip
  in
  OCamlR_grDevices.pdf [%dest] ;
  OCamlR_graphics.plot
    ~main:"Comparison of branch lengths"
    ~xlab:"Branch lengths in tree 1"
    ~ylab:"Branch lengths in tree 2"
    ~x:(Array.of_list l1) ~y:(Array.of_list l2) () ;
  OCamlR_grDevices.dev_off ()
198 199 200 201

let%pworkflow number_of_missing_sequences_repartition db =
  Orthomam_db.missing_sequences_cdf [%param db] [%dest]

202
let%pworkflow concatenate ?seed ?site_sampling ~max_missing_sequences db =
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
  let db = [%param db] in
  let max_missing_sequences = [%param max_missing_sequences] in
  let open Phylogenetics in
  let all_alignments = Orthomam_db.list_alignments db in
  let missing_sequence seq =
    String.count seq ~f:Char.(fun c -> c = '?' || c = '-') = String.length seq
  in
  let compare_items (x : Phylip.item) (y : Phylip.item) =
    String.compare x.name y.name
  in
  let merge_items (xs : Phylip.item list) =
    match xs with
    | [] -> assert false
    | h :: t ->
      assert (List.for_all t ~f:(fun x -> String.equal x.name h.name)) ;
      { Phylip.name = h.name ;
        sequence =
          List.map xs ~f:(fun x -> x.sequence)
          |> String.concat }
  in
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
  let sample_sites (alignments : Phylip.t list) n =
    let seed = match seed with
      | None -> Gsl.Rng.default_seed ()
      | Some i -> Nativeint.of_int i
    in
    let alignments = Array.of_list alignments in
    let sizes =
      Array.map alignments ~f:(fun al -> float al.sequence_length)
      |> Gsl.Randist.discrete_preproc
    in
    let rng = Gsl.Rng.(make (default ())) in
    Gsl.Rng.set rng seed ;
    let nsequences = alignments.(0).number_of_sequences in
    let sequences = Array.init nsequences ~f:(fun _ ->
        Bytes.create n
      )
    in
    for i = 0 to n - 1 do
      let al = alignments.(Gsl.Randist.discrete rng sizes) in
      let p = Gsl.Rng.uniform_int rng al.sequence_length in
      List.iteri al.items ~f:(fun j it ->
          Bytes.set sequences.(j) i it.sequence.[p]
        )
    done ;
    List.mapi alignments.(0).items ~f:(fun i it ->
        { Phylip.sequence = Bytes.to_string sequences.(i) ; name = it.name }
      )
  in
  let filtered_alignments =
    List.filter_map all_alignments ~f:(fun ali_fn ->
        let ali = Phylip.read_exn ~strict:false ali_fn in
        let nmissing = List.count ali.items ~f:(fun it -> missing_sequence it.sequence) in
        if nmissing <= max_missing_sequences then
          Some ali
        else None
      )
  in
  let merged_items (alignments : Phylip.t list) =
    List.map alignments ~f:(fun ali ->
262 263 264 265 266
        List.sort ali.items ~compare:compare_items
      )
    |> List.transpose_exn
    |> List.map ~f:merge_items
  in
267 268 269 270 271 272
  let items =
    match site_sampling with
    | None -> merged_items filtered_alignments
    | Some n -> sample_sites filtered_alignments n
  in
  Phylip.write ~strict:false (Phylip.make_exn items) [%dest]