From f30d0759e1d9e65564a9a99303d41389a297d48e Mon Sep 17 00:00:00 2001 From: Philippe Veber Date: Thu, 17 Dec 2020 00:47:36 +0100 Subject: [PATCH] Dataset: clip tree with respect to each alignment --- lib/dataset.ml | 48 +++++++++++++++++++++++++++++++++++++++++++++++- lib/orthomam.ml | 22 ++-------------------- 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/lib/dataset.ml b/lib/dataset.ml index 9603760..ba171e3 100644 --- a/lib/dataset.ml +++ b/lib/dataset.ml @@ -60,13 +60,59 @@ module New_API = struct convergent_species : string list workflow ; } + let clip_tree_on_alignment (tree : nhx file) (ali : nucleotide_fasta file) = + let f = fun%workflow dest -> + let open Phylogenetics in + let tree = Newick.from_file [%path tree] in + let _, ali = + Biotk.Fasta.from_file [%path ali] + |> Result.ok_or_failwith + in + let ali_species = List.map ali ~f:(fun it -> it.description) in + let clipped_tree = + Newick.map_inner_tree tree ~f:(fun tree -> + match + Tree.leafset_generated_subtree tree + (fun bi -> bi.Newick.name) ali_species + with + | None -> failwith "Tree has no leaf in alignment" + | Some filtered_tree -> filtered_tree + ) + in + Newick.to_file clipped_tree dest + in + Workflow.path_plugin ~descr:"orthomam.clip_tree_on_alignment" f + + let annotate_convergent_species_in_tree (tree : newick file) species : nhx file = + let f = fun%workflow dest -> + let open Phylogenetics in + let species = [%eval species] + and omm_tree = [%path tree] in + + let ensembl_tree = Newick.from_file omm_tree in + let tagged_tree = + Newick.map_inner_tree ensembl_tree ~f:(fun t -> + Codepitk.Convergence_tree.infer_binary_condition_on_branches + ~convergent_leaves:(String.Set.of_list species) + t + ) + in + Newick.to_file tagged_tree dest + in + Workflow.path_plugin ~version:3 ~descr:"dataset.annotate_convergent_species_in_tree" f + + let make ~tree ~nucleotide_alignments ~convergent_species = + let tree = annotate_convergent_species_in_tree tree convergent_species in + { tree ; nucleotide_alignments ; convergent_species } + module Query = struct type dataset = t type t = dataset * nucleotide_fasta file let nucleotide_alignment = snd - let tree ~branch_length_unit:_ (d, _) = d.tree + let tree ~branch_length_unit:_ (d, fa) = + clip_tree_on_alignment d.tree fa end let queries d = diff --git a/lib/orthomam.ml b/lib/orthomam.ml index b43dc4d..c1b982a 100644 --- a/lib/orthomam.ml +++ b/lib/orthomam.ml @@ -78,24 +78,6 @@ let clip_tree_on_alignment tree ali = let omm_tree_of_db db = Workflow.input (Orthomam_db.tree db) -let annotate_convergent_species_in_tree (tree : newick file) species : nhx file = - let f = fun%workflow dest -> - let open Phylogenetics in - let species = [%eval species] - and omm_tree = [%path tree] in - - let ensembl_tree = Newick.from_file omm_tree in - let tagged_tree = - Newick.map_inner_tree ensembl_tree ~f:(fun t -> - Codepitk.Convergence_tree.infer_binary_condition_on_branches - ~convergent_leaves:(String.Set.of_list species) - t - ) - in - Newick.to_file tagged_tree dest - in - Workflow.path_plugin ~version:3 ~descr:"tree_of_convergent_species" f - let compare_tree_branch_lengths t1 t2 = let f = fun%workflow dest -> let open Phylogenetics in @@ -393,7 +375,7 @@ module Q = struct let tree ~branch_length_unit q = clip_tree_on_alignment - (annotate_convergent_species_in_tree + (Dataset.New_API.annotate_convergent_species_in_tree (omm_tree_with_branch_lengths ~branch_length_unit q.db) q.convergent_species) (alignment q) @@ -405,7 +387,7 @@ end include Q let tree_of_db db ~branch_length_unit ~convergent_species = - annotate_convergent_species_in_tree + Dataset.New_API.annotate_convergent_species_in_tree (omm_tree_with_branch_lengths ~branch_length_unit db) (Workflow.data convergent_species) -- GitLab