open Core_kernel open Bistro open File_formats open Reviewphiltrans_toolbox let ensembl_tree : nhx file = Bistro_unix.wget "ftp://ftp.ensembl.org/pub/release-99/compara/species_trees/vertebrates_species-tree_Ensembl.nh" type alignment = Orthomam_db.t * string let alignments_of_db db = List.map (Orthomam_db.list_alignments db) ~f:(fun ali -> db, ali) let%pworkflow fasta_of_phylip ali = let ali = [%path ali] |> Phylogenetics.Phylip.read_exn ~strict:false in List.map ali.items ~f:(fun it -> Biotk.Fasta.{ description = it.name ; sequence = it.sequence } ) |> Biotk.Fasta.to_file [%dest] type query = { db : Orthomam_db.t ; alignment : string ; convergent_species : string list workflow ; } let query (db, alignment) ~convergent_species = { db ; alignment ; convergent_species } let%pworkflow remove_unobserved_sequences_from_alignment phylip : phylip file = let phylip = [%path phylip] in let module Phylip = Phylogenetics.Phylip in let data = Phylip.read_exn ~strict:false phylip in let threshold = data.sequence_length * 90 / 100 in let items = List.filter data.items ~f:(fun it -> String.count it.sequence ~f:(Char.equal '?') < threshold ) in let filtered_data = Phylip.make_exn items in Phylip.write ~strict:false filtered_data [%dest] let rec transfer_tag_to_branches t = let open Phylogenetics in let category : _ Tree.t -> int = function | Leaf (_, c) -> c | Node n -> snd n.data in match t with | Tree.Leaf (l, _) -> Tree.leaf l | Node n -> let cat_n = snd n.data in List1.map n.branches ~f:(fun (Branch b) -> let cat_child = category b.tip in let tags = List.concat [ [ "Condition", Int.to_string cat_child ] ; ( if cat_n <> cat_child then [ "Transition", Int.to_string cat_child ] else [] ) ; b.data.Newick.tags ] in Tree.branch { b.data with Newick.tags } (transfer_tag_to_branches b.tip) ) |> Tree.node (fst n.data) let tag_tree t tagged_leaves = let open Phylogenetics in let category (ni : Newick.node_info) = Option.map ni.name ~f:(fun l -> if String.Set.mem tagged_leaves l then 1 else 0 ) in Fitch.fitch ~n:2 ~category t |> transfer_tag_to_branches let%pworkflow clip_tree_on_alignment tree ali = let open Phylogenetics in let tree = Newick.from_file [%path tree] in let ali = Phylip.read_exn ~strict:false [%path ali] in let ali_species = List.map ali.items ~f:(fun it -> it.name) in let clipped_tree = Newick.map_inner_tree tree ~f:(fun tree -> match Tree.leafset_generated_subtree tree (fun bi -> bi.Newick.name) ali_species with | None -> failwith "Tree has no leaf in alignment" | Some filtered_tree -> filtered_tree ) in Newick.to_file clipped_tree [%dest] let omm_tree_of_db db = Workflow.input (Orthomam_db.tree db) let annotate_convergent_species_in_tree tree species = Workflow.path_plugin ~version:3 ~descr:"tree_of_convergent_species" ( let open Phylogenetics in let%pdeps species = species and omm_tree = path tree in let ensembl_tree = Newick.from_file omm_tree in let tagged_tree = Newick.map_inner_tree ensembl_tree ~f:(fun t -> tag_tree t (String.Set.of_list species) ) in Newick.to_file tagged_tree [%dest] ) let%pworkflow compare_tree_branch_lengths t1 t2 = let open Phylogenetics in let t1 = Newick.from_file [%path t1] in let t2 = Newick.from_file [%path t2] in let rec reorder_branches (branches : _ Tree.branch List1.t) = List1.map branches ~f:(fun (Branch b) -> let tip, label = reorder_tree_aux b.tip in Tree.branch b.data tip, label ) and reorder_tree_aux (t : Newick.tree) = match t with | Leaf l -> t, Option.value_exn l.name | Node n -> let branches_and_labels = reorder_branches n.branches in let branches, labels = branches_and_labels |> List1.sort ~compare:(fun (_, x) (_, y) -> String.compare x y ) |> List1.unzip in let label = List1.hd labels in Tree.node n.data branches, label and reorder_tree t = fst (reorder_tree_aux t) in let rec collect_branch_lengths (t1 : Newick.tree) (t2 : Newick.tree) = match (t1, t2) with | Leaf _, Leaf _ -> [] | Node n1, Node n2 -> List1.map2_exn n1.branches n2.branches ~f:(fun (Branch b1) (Branch b2) -> let acc = collect_branch_lengths b1.tip b2.tip in match b1.data.length, b2.data.length with | Some l1, Some l2 -> (l1, l2) :: acc | _ -> acc ) |> List1.to_list |> List.concat | Leaf _, Node _ | Node _, Leaf _ -> invalid_arg "trees with different topologies" in let t1 = Newick.with_inner_tree ~f:reorder_tree t1 in let t2 = Newick.with_inner_tree ~f:reorder_tree t2 in let l1, l2 = collect_branch_lengths t1 t2 |> List.unzip in OCamlR_grDevices.pdf [%dest] ; OCamlR_graphics.plot ~main:"Comparison of branch lengths" ~xlab:"Branch lengths in tree 1" ~ylab:"Branch lengths in tree 2" ~x:(Array.of_list l1) ~y:(Array.of_list l2) () ; OCamlR_grDevices.dev_off () let%pworkflow number_of_missing_sequences_repartition db = Orthomam_db.missing_sequences_cdf [%param db] [%dest] let%pworkflow concatenate ?seed ?site_sampling ~max_missing_sequences db = let db = [%param db] in let max_missing_sequences = [%param max_missing_sequences] in let seed = [%param seed] in let site_sampling = [%param site_sampling] in let open Phylogenetics in let all_alignments = Orthomam_db.list_alignments db in let missing_sequence seq = String.count seq ~f:Char.(fun c -> c = '?' || c = '-') = String.length seq in let compare_items (x : Phylip.item) (y : Phylip.item) = String.compare x.name y.name in let merge_items (xs : Phylip.item list) = match xs with | [] -> assert false | h :: t -> assert (List.for_all t ~f:(fun x -> String.equal x.name h.name)) ; { Phylip.name = h.name ; sequence = List.map xs ~f:(fun x -> x.sequence) |> String.concat } in let sample_sites (alignments : Phylip.t list) n = let seed = match seed with | None -> Gsl.Rng.default_seed () | Some i -> Nativeint.of_int i in let alignments = Array.of_list alignments in let sizes = Array.map alignments ~f:(fun al -> float al.sequence_length) |> Gsl.Randist.discrete_preproc in let rng = Gsl.Rng.(make (default ())) in Gsl.Rng.set rng seed ; let nsequences = alignments.(0).number_of_sequences in let sequences = Array.init nsequences ~f:(fun _ -> Bytes.create n ) in for i = 0 to n - 1 do let al = alignments.(Gsl.Randist.discrete rng sizes) in let p = Gsl.Rng.uniform_int rng al.sequence_length in List.iteri al.items ~f:(fun j it -> Bytes.set sequences.(j) i it.sequence.[p] ) done ; List.mapi alignments.(0).items ~f:(fun i it -> { Phylip.sequence = Bytes.to_string sequences.(i) ; name = it.name } ) in let filtered_alignments = List.filter_map all_alignments ~f:(fun ali_fn -> let ali = Phylip.read_exn ~strict:false ali_fn in let nmissing = List.count ali.items ~f:(fun it -> missing_sequence it.sequence) in if nmissing <= max_missing_sequences then if List.for_all ali.items ~f:(fun it -> Result.is_ok (Orthomam_db.check_orf it.sequence)) then Some ali else None else None ) in let merged_items (alignments : Phylip.t list) = List.map alignments ~f:(fun ali -> List.sort ali.items ~compare:compare_items ) |> List.transpose_exn |> List.map ~f:merge_items in let items = match site_sampling with | None -> merged_items filtered_alignments | Some n -> sample_sites filtered_alignments n in Phylip.write ~strict:false (Phylip.make_exn items) [%dest] let%pworkflow concatenate_calibration_figure ~nsites ~trees = let open Phylogenetics in let nsites = [%param nsites] in let tree_paths = [%eval List.map trees ~f:Workflow.path_list |> Workflow.list ] in let tree_lengths = List.map tree_paths ~f:( List.map ~f:(fun tree_path -> Newick.from_file tree_path |> Newick.with_inner_tree ~f:( Tree.pre ~init:0. ~node:(fun acc _ -> acc) ~leaf:(fun acc _ -> acc) ~branch:(fun acc bi -> match bi.Newick.length with | None -> failwith "expected tree with branch length" | Some l -> acc +. l ) ) ) ) in let x = List.map2_exn nsites trees ~f:(fun n trees -> List.init (List.length trees) ~f:(fun _ -> float n) ) |> List.concat |> Array.of_list in let y = List.concat tree_lengths |> Array.of_list in OCamlR_grDevices.pdf [%dest] ; OCamlR_graphics.plot ~ylab:"Total branch length" ~xlab:"Concatenate size" ~log:`XY ~x ~y () ; OCamlR_grDevices.dev_off () let concatenate_calibration ~seed db = let nrep = 5 in let nsites = [ 50 ; 100 ; 200 ; 500 ; 1000 ; 2000 ; 5000 ; 10000 ; 20000 ; 50000 ] in let seeds = let rng = Random.State.make [| seed |] in List.map nsites ~f:(fun _ -> List.init nrep ~f:(fun _ -> Random.State.int rng (1 lsl 30 - 1) ) ) in let concatenates = List.map2_exn nsites seeds ~f:(fun n seeds -> List.map seeds ~f:(fun seed -> concatenate ~seed ~site_sampling:n ~max_missing_sequences:0 db ) ) in let omm_tree = omm_tree_of_db db in let trees = List.map concatenates ~f:( List.map ~f:(fun ali -> Biotope.Iqtree.iqtree ~o:"Ornithorhynchus_anatinus" ~te:omm_tree (`phylip ali) ) ) in concatenate_calibration_figure ~nsites ~trees let estimated_nucleotide_tree db = concatenate ~seed:1234 ~site_sampling:10_000 ~max_missing_sequences:0 db module Q = struct type t = query let alignment q : phylip file = Workflow.input q.alignment |> remove_unobserved_sequences_from_alignment let tree ~branch_length_unit q = let omm_tree_with_branch_lengths = match branch_length_unit with | `Nucleotide -> estimated_nucleotide_tree q.db | `Amino_acid -> assert false | `Codon -> assert false in clip_tree_on_alignment (annotate_convergent_species_in_tree omm_tree_with_branch_lengths q.convergent_species) (alignment q) let nucleotide_alignment q = fasta_of_phylip (alignment q) end include Q include Detection_pipeline.Make(Q) let species_with_echolocation = [ "Rhinolophus ferrumequinum" ; "Tursiops_truncatus" ; "Myotis_lucifugus" ; "Physeter_catodon" ; ]