orthomam.ml 10.3 KB
Newer Older
1 2 3 4 5 6
open Core_kernel
open Bistro
open File_formats
open Reviewphiltrans_toolbox

let ensembl_tree : nhx file =
7
  Bistro_unix.wget "ftp://ftp.ensembl.org/pub/release-99/compara/species_trees/vertebrates_species-tree_Ensembl.nh"
8

9
type alignment = Orthomam_db.t * string
10

11
let alignments_of_db db =
12
  List.map (Orthomam_db.list_alignments db) ~f:(fun ali -> db, ali)
13 14 15 16 17 18 19 20 21 22 23

let%pworkflow fasta_of_phylip ali =
  let ali =
    [%path ali]
    |> Phylogenetics.Phylip.read_exn ~strict:false
  in
  List.map ali.items ~f:(fun it ->
      Biotk.Fasta.{ description = it.name ; sequence = it.sequence }
    )
  |> Biotk.Fasta.to_file [%dest]

24
type query = {
25 26
  db : Orthomam_db.t ;
  alignment : string ;
27 28 29
  convergent_species : string list workflow ;
}

30 31
let query (db, alignment) ~convergent_species =
  { db ; alignment ; convergent_species }
32

33 34 35 36 37 38 39 40 41 42 43 44
let%pworkflow remove_unobserved_sequences_from_alignment phylip : phylip file =
  let phylip = [%path phylip] in
  let module Phylip = Phylogenetics.Phylip in
  let data = Phylip.read_exn ~strict:false phylip in
  let threshold = data.sequence_length * 90 / 100 in
  let items = List.filter data.items ~f:(fun it ->
      String.count it.sequence ~f:(Char.equal '?') < threshold
    )
  in
  let filtered_data = Phylip.make_exn items in
  Phylip.write ~strict:false filtered_data [%dest]

45 46 47 48 49 50 51 52 53
let rec transfer_tag_to_branches t =
  let open Phylogenetics in
  let category : _ Tree.t -> int = function
    | Leaf (_, c) -> c
    | Node n -> snd n.data
  in
  match t with
  | Tree.Leaf (l, _) -> Tree.leaf l
  | Node n ->
54
    let cat_n = snd n.data in
55
    List1.map n.branches ~f:(fun (Branch b) ->
56 57 58 59 60 61 62 63 64 65 66
        let cat_child = category b.tip in
        let tags = List.concat [
            [ "Condition", Int.to_string cat_child ] ;
            (
              if cat_n <> cat_child then
                [ "Transition", Int.to_string cat_child ]
              else []
            ) ;
            b.data.Newick.tags
          ]
        in
67 68 69 70 71 72
        Tree.branch
          { b.data with Newick.tags }
          (transfer_tag_to_branches b.tip)
      )
    |> Tree.node (fst n.data)

73
let tag_tree t tagged_leaves =
74 75 76 77 78 79 80 81 82
  let open Phylogenetics in
  let category (ni : Newick.node_info) =
    Option.map ni.name ~f:(fun l ->
        if String.Set.mem tagged_leaves l then 1 else 0
      )
  in
  Fitch.fitch ~n:2 ~category t
  |> transfer_tag_to_branches

83 84 85 86 87 88
let%pworkflow clip_tree_on_alignment tree ali =
  let open Phylogenetics in
  let tree = Newick.from_file [%path tree] in
  let ali = Phylip.read_exn ~strict:false [%path ali] in
  let ali_species = List.map ali.items ~f:(fun it -> it.name) in
  let clipped_tree =
Philippe Veber's avatar
Philippe Veber committed
89
    Newick.map_inner_tree tree ~f:(fun tree ->
90 91 92 93 94 95 96 97 98 99
        match
          Tree.leafset_generated_subtree tree
            (fun bi -> bi.Newick.name) ali_species
        with
        | None -> failwith "Tree has no leaf in alignment"
        | Some filtered_tree -> filtered_tree
      )
  in
  Newick.to_file clipped_tree [%dest]

100 101 102 103
let omm_tree_of_db db =
  Workflow.input (Orthomam_db.tree db)

let omm_tree q = omm_tree_of_db q.db
104 105

let tree_of_convergent_species tree species =
106
  Workflow.path_plugin ~version:3 ~descr:"tree_of_convergent_species" (
107 108
    let open Phylogenetics in
    let%pdeps species = species
109
    and omm_tree = path tree in
110

111
    let ensembl_tree = Newick.from_file omm_tree in
112
    let tagged_tree =
Philippe Veber's avatar
Philippe Veber committed
113
      Newick.map_inner_tree ensembl_tree ~f:(fun t ->
114 115 116
          tag_tree t (String.Set.of_list species)
        )
    in
117 118 119
    Newick.to_file tagged_tree [%dest]
  )

120
module Q = struct
121
  type t = query
122 123 124

  let alignment q : phylip file =
    Workflow.input q.alignment
125
    |> remove_unobserved_sequences_from_alignment
126 127 128

  let tree q =
    clip_tree_on_alignment
129
      (tree_of_convergent_species (omm_tree q) q.convergent_species)
130 131
      (alignment q)

132
  let nucleotide_alignment q =
133
    fasta_of_phylip (alignment q)
134 135
end

136 137
include Q

138
include Detection_pipeline.Make(Q)
139 140 141 142 143 144 145

let species_with_echolocation = [
  "Rhinolophus ferrumequinum" ;
  "Tursiops_truncatus" ;
  "Myotis_lucifugus" ;
  "Physeter_catodon" ;
]
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199

let%pworkflow compare_tree_branch_lengths t1 t2 =
  let open Phylogenetics in
  let t1 = Newick.from_file [%path t1] in
  let t2 = Newick.from_file [%path t2] in
  let rec reorder_branches (branches : _ Tree.branch List1.t) =
    List1.map branches ~f:(fun (Branch b) ->
        let tip, label = reorder_tree_aux b.tip in
        Tree.branch b.data tip, label
      )
  and reorder_tree_aux (t : Newick.tree) =
    match t with
    | Leaf l -> t, Option.value_exn l.name
    | Node n ->
      let branches_and_labels = reorder_branches n.branches in
      let branches, labels =
        branches_and_labels
        |> List1.sort ~compare:(fun (_, x) (_, y) ->
            String.compare x y
          )
        |> List1.unzip
      in
      let label = List1.hd labels in
      Tree.node n.data branches, label
  and reorder_tree t = fst (reorder_tree_aux t)
  in
  let rec collect_branch_lengths (t1 : Newick.tree) (t2 : Newick.tree) =
    match (t1, t2) with
    | Leaf _, Leaf _ -> []
    | Node n1, Node n2 ->
      List1.map2_exn n1.branches n2.branches ~f:(fun (Branch b1) (Branch b2) ->
          let acc = collect_branch_lengths b1.tip b2.tip in
          match b1.data.length, b2.data.length with
          | Some l1, Some l2 -> (l1, l2) :: acc
          | _ -> acc
        )
      |> List1.to_list
      |> List.concat
    | Leaf _, Node _
    | Node _, Leaf _ -> invalid_arg "trees with different topologies"
  in
  let t1 = Newick.with_inner_tree ~f:reorder_tree t1 in
  let t2 = Newick.with_inner_tree ~f:reorder_tree t2 in
  let l1, l2 =
    collect_branch_lengths t1 t2
    |> List.unzip
  in
  OCamlR_grDevices.pdf [%dest] ;
  OCamlR_graphics.plot
    ~main:"Comparison of branch lengths"
    ~xlab:"Branch lengths in tree 1"
    ~ylab:"Branch lengths in tree 2"
    ~x:(Array.of_list l1) ~y:(Array.of_list l2) () ;
  OCamlR_grDevices.dev_off ()
200 201 202 203

let%pworkflow number_of_missing_sequences_repartition db =
  Orthomam_db.missing_sequences_cdf [%param db] [%dest]

204
let%pworkflow concatenate ?seed ?site_sampling ~max_missing_sequences db =
205 206
  let db = [%param db] in
  let max_missing_sequences = [%param max_missing_sequences] in
207 208
  let seed = [%param seed] in
  let site_sampling = [%param site_sampling] in
209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226
  let open Phylogenetics in
  let all_alignments = Orthomam_db.list_alignments db in
  let missing_sequence seq =
    String.count seq ~f:Char.(fun c -> c = '?' || c = '-') = String.length seq
  in
  let compare_items (x : Phylip.item) (y : Phylip.item) =
    String.compare x.name y.name
  in
  let merge_items (xs : Phylip.item list) =
    match xs with
    | [] -> assert false
    | h :: t ->
      assert (List.for_all t ~f:(fun x -> String.equal x.name h.name)) ;
      { Phylip.name = h.name ;
        sequence =
          List.map xs ~f:(fun x -> x.sequence)
          |> String.concat }
  in
227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265
  let sample_sites (alignments : Phylip.t list) n =
    let seed = match seed with
      | None -> Gsl.Rng.default_seed ()
      | Some i -> Nativeint.of_int i
    in
    let alignments = Array.of_list alignments in
    let sizes =
      Array.map alignments ~f:(fun al -> float al.sequence_length)
      |> Gsl.Randist.discrete_preproc
    in
    let rng = Gsl.Rng.(make (default ())) in
    Gsl.Rng.set rng seed ;
    let nsequences = alignments.(0).number_of_sequences in
    let sequences = Array.init nsequences ~f:(fun _ ->
        Bytes.create n
      )
    in
    for i = 0 to n - 1 do
      let al = alignments.(Gsl.Randist.discrete rng sizes) in
      let p = Gsl.Rng.uniform_int rng al.sequence_length in
      List.iteri al.items ~f:(fun j it ->
          Bytes.set sequences.(j) i it.sequence.[p]
        )
    done ;
    List.mapi alignments.(0).items ~f:(fun i it ->
        { Phylip.sequence = Bytes.to_string sequences.(i) ; name = it.name }
      )
  in
  let filtered_alignments =
    List.filter_map all_alignments ~f:(fun ali_fn ->
        let ali = Phylip.read_exn ~strict:false ali_fn in
        let nmissing = List.count ali.items ~f:(fun it -> missing_sequence it.sequence) in
        if nmissing <= max_missing_sequences then
          Some ali
        else None
      )
  in
  let merged_items (alignments : Phylip.t list) =
    List.map alignments ~f:(fun ali ->
266 267 268 269 270
        List.sort ali.items ~compare:compare_items
      )
    |> List.transpose_exn
    |> List.map ~f:merge_items
  in
271 272 273 274 275 276
  let items =
    match site_sampling with
    | None -> merged_items filtered_alignments
    | Some n -> sample_sites filtered_alignments n
  in
  Phylip.write ~strict:false (Phylip.make_exn items) [%dest]
277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313

let%pworkflow concatenate_calibration_figure ~nsites ~trees =
  let open Phylogenetics in
  let nsites = [%param nsites] in
  let tree_paths = [%eval
    List.map trees ~f:Workflow.path_list
    |> Workflow.list
  ]
  in
  let tree_lengths =
    List.map tree_paths ~f:(
      List.map ~f:(fun tree_path ->
          Newick.from_file tree_path
          |> Newick.with_inner_tree ~f:(
            Tree.pre ~init:0.
              ~node:(fun acc _ -> acc) ~leaf:(fun acc _ -> acc)
              ~branch:(fun acc bi ->
                  match bi.Newick.length with
                  | None -> failwith "expected tree with branch length"
                  | Some l -> acc +. l
                )
          )
        )
    )
  in
  let x =
    List.map2_exn nsites trees ~f:(fun n trees ->
        List.init (List.length trees) ~f:(fun _ -> float n)
      )
    |> List.concat
    |> Array.of_list
  in
  let y =
    List.concat tree_lengths
    |> Array.of_list
  in
  OCamlR_grDevices.pdf [%dest] ;
314
  OCamlR_graphics.plot ~ylab:"Total branch length" ~xlab:"Concatenate size" ~log:`XY ~x ~y () ;
315 316 317 318
  OCamlR_grDevices.dev_off ()

let concatenate_calibration ~seed db =
  let nrep = 4 in
319
  let nsites = [ 30 ; 50 ; 100 ; 300 ; 500 ; 1000 ; 3000 ; 5000 ; 10000 ; 50000 ] in
320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343
  let seeds =
    let rng = Random.State.make [| seed |] in
    List.map nsites ~f:(fun _ ->
        List.init nrep ~f:(fun _ ->
            Random.State.int rng (1 lsl 30 - 1)
          )
      )
  in
  let concatenates =
    List.map2_exn nsites seeds ~f:(fun n seeds ->
        List.map seeds ~f:(fun seed ->
            concatenate ~seed ~site_sampling:n ~max_missing_sequences:0 db
          )
      )
  in
  let omm_tree = omm_tree_of_db db in
  let trees =
    List.map concatenates ~f:(
      List.map ~f:(fun ali ->
          Biotope.Iqtree.iqtree ~o:"Ornithorhynchus_anatinus" ~te:omm_tree (`phylip ali)
        )
    )
  in
  concatenate_calibration_figure ~nsites ~trees