dataset.ml 3.92 KB
Newer Older
1 2 3 4 5 6
open Core
open Bistro_utils

type t = {
  model_prefix: string ;
  tree_prefix : string ;
Carine Rey's avatar
Carine Rey committed
7
  is_real: bool ;
Carine Rey's avatar
Carine Rey committed
8
  calc_dnds: bool ;
9
  dataset : Ready_dataset.t ;
Carine Rey's avatar
Carine Rey committed
10
  seed : int ;
11 12
}

Carine Rey's avatar
Carine Rey committed
13 14 15



Philippe Veber's avatar
Philippe Veber committed
16
let repo dataset_l =
17 18 19
  List.map dataset_l ~f:(fun dataset ->
      let model_prefix = dataset.model_prefix in
      let tree_prefix = dataset.tree_prefix in
Carine Rey's avatar
Carine Rey committed
20
      if dataset.is_real then
Carine Rey's avatar
Carine Rey committed
21
          let repo_realdata = Ready_dataset.repo_realdata ~ali_prefix:model_prefix ~tree_prefix dataset.dataset ~calc_dnds:dataset.calc_dnds in
Carine Rey's avatar
Carine Rey committed
22 23 24 25 26 27 28 29 30 31 32 33 34
          repo_realdata
          |> Repo.shift "Dataset"
          |> Repo.shift tree_prefix
      else
          let repo_ready_data = Ready_dataset.repo dataset.dataset in
          let repo_raw_data = Raw_dataset.repo ~prefix:model_prefix (Ready_dataset.to_raw dataset.dataset) in
          List.concat [
            Repo.shift "minimal" (Repo.shift (tree_prefix ^ "_" ^ model_prefix) repo_raw_data);
            Repo.shift "debug" repo_ready_data;
          ]
          |> Repo.shift "dataset"
          |> Repo.shift model_prefix
          |> Repo.shift "Results_per_hypothesis"
35 36
    )
  |> List.concat
Carine Rey's avatar
Carine Rey committed
37

Carine Rey's avatar
Carine Rey committed
38 39 40 41




42
let add_indels_to_dataset d ~seed =
43 44 45 46 47 48 49
  let p =  0.33 in
  let model_prefix = sprintf "%s_0.33_i" d.model_prefix in
  let tree_prefix  = d.tree_prefix in
  let is_real      = d.is_real in
  let new_seed     = Hashtbl.hash seed in
  let indel_seed   = Hashtbl.hash new_seed in
  let dataset      = Ready_dataset.add_indels_to_ready_dataset ~p ~seed:indel_seed d.dataset in
Carine Rey's avatar
Carine Rey committed
50 51
  let calc_dnds    = d.calc_dnds in
  {model_prefix; tree_prefix; is_real; calc_dnds; dataset; seed = new_seed}
Philippe Veber's avatar
Philippe Veber committed
52 53 54 55 56 57 58

open Bistro
open File_formats

module New_API = struct
  type t = {
    tree : nhx file ;
Philippe Veber's avatar
Philippe Veber committed
59
    nucleotide_alignments : (string * nucleotide_fasta file) list ;
Philippe Veber's avatar
Philippe Veber committed
60 61 62
    convergent_species : string list workflow ;
  }

63 64 65
  let clip_tree_on_alignment (tree : nhx file) (ali : nucleotide_fasta file) =
    let f = fun%workflow dest ->
      let open Phylogenetics in
Philippe Veber's avatar
Philippe Veber committed
66
      let tree = Newick.from_file_exn [%path tree] in
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
      let _, ali =
        Biotk.Fasta.from_file [%path ali]
        |> Result.ok_or_failwith
      in
      let ali_species = List.map ali ~f:(fun it -> it.description) in
      let clipped_tree =
        Newick.map_inner_tree tree ~f:(fun tree ->
            match
              Tree.leafset_generated_subtree tree
                (fun bi -> bi.Newick.name) ali_species
            with
            | None -> failwith "Tree has no leaf in alignment"
            | Some filtered_tree -> filtered_tree
          )
      in
      Newick.to_file clipped_tree dest
    in
    Workflow.path_plugin ~descr:"orthomam.clip_tree_on_alignment" f

  let annotate_convergent_species_in_tree (tree : newick file) species : nhx file =
    let f = fun%workflow dest ->
      let open Phylogenetics in
      let species = [%eval species]
      and omm_tree = [%path tree] in

Philippe Veber's avatar
Philippe Veber committed
92
      let ensembl_tree = Newick.from_file_exn omm_tree in
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
      let tagged_tree =
        Newick.map_inner_tree ensembl_tree ~f:(fun t ->
            Codepitk.Convergence_tree.infer_binary_condition_on_branches
              ~convergent_leaves:(String.Set.of_list species)
              t
          )
      in
      Newick.to_file tagged_tree dest
    in
    Workflow.path_plugin ~version:3 ~descr:"dataset.annotate_convergent_species_in_tree" f

  let make ~tree ~nucleotide_alignments ~convergent_species =
    let tree = annotate_convergent_species_in_tree tree convergent_species in
    { tree ; nucleotide_alignments ; convergent_species }

Philippe Veber's avatar
Philippe Veber committed
108 109
  module Query = struct
    type dataset = t
Philippe Veber's avatar
Philippe Veber committed
110 111 112 113 114
    type t = {
      dataset : dataset ;
      alignment_descr : string ;
      alignment : nucleotide_fasta file ;
    }
Philippe Veber's avatar
Philippe Veber committed
115

Philippe Veber's avatar
Philippe Veber committed
116
    let nucleotide_alignment q = q.alignment
Philippe Veber's avatar
Philippe Veber committed
117

Philippe Veber's avatar
Philippe Veber committed
118 119
    let tree ~branch_length_unit:_ q =
      clip_tree_on_alignment q.dataset.tree q.alignment
Philippe Veber's avatar
Philippe Veber committed
120 121
  end

Philippe Veber's avatar
Philippe Veber committed
122 123 124 125
  let queries dataset =
    List.map dataset.nucleotide_alignments ~f:(fun (alignment_descr, alignment) ->
        { Query.dataset ; alignment_descr ; alignment }
      )
Philippe Veber's avatar
Philippe Veber committed
126
end