Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit 1805222a authored by Philippe Veber's avatar Philippe Veber
Browse files

orthomam: site_ranking workflow (whole database run)

parent 0cbda477
(library
(name reviewphiltrans)
(libraries core biotk biotope bistro.utils gsl ocaml-r.graphics ocaml-r.grDevices phylogenetics.convergence reviewphiltrans_toolbox)
(libraries core biotk biotope bistro.utils containers gsl ocaml-r.graphics ocaml-r.grDevices phylogenetics.convergence reviewphiltrans_toolbox)
(preprocess
(pps ppx_jane ppx_csv_conv bistro.ppx ppx_here)))
......
......@@ -6,10 +6,10 @@ open Reviewphiltrans_toolbox
let ensembl_tree : nhx file =
Bistro_unix.wget "ftp://ftp.ensembl.org/pub/release-99/compara/species_trees/vertebrates_species-tree_Ensembl.nh"
type alignment = Orthomam_db.t * string
type alignment = Alignment of Orthomam_db.t * string
let alignments_of_db db =
List.map (Orthomam_db.list_alignments db) ~f:(fun ali -> db, ali)
List.map (Orthomam_db.list_alignments db) ~f:(fun ali -> Alignment (db, ali))
let%pworkflow fasta_of_phylip ali =
let ali =
......@@ -27,9 +27,14 @@ type query = {
convergent_species : string list workflow ;
}
let query (db, alignment) ~convergent_species =
let query (Alignment (db, alignment)) ~convergent_species =
{ db ; alignment ; convergent_species }
let family_id q =
Filename.basename q.alignment
|> Filename.chop_suffix "_NT.fasta.ali"
|> String.chop_prefix_exn ~prefix:"all"
let%pworkflow remove_unobserved_sequences_from_alignment phylip : phylip file =
let phylip = [%path phylip] in
let module Phylip = Phylogenetics.Phylip in
......@@ -495,3 +500,103 @@ let%pworkflow convergence_species_tree_pdf ~convergent_species db =
render_tree tree_or_branch
|> Biotk_croquis.Croquis.Layout.simple
|> Fn.flip Biotk_croquis.Croquis.Layout.render_pdf [%dest]
module Candidate_site = struct
module T = struct
open Phylogenetics
type t = {
alignment_id : string ;
pos : int ;
contents : (unit, string * char * bool, float) Tree.t ;
score : float ;
}
let leq s1 s2 = Float.(s1.score <= s2.score)
end
include T
module Heap = CCHeap.Make(T)
end
let%workflow ranking_of_results ~alignment_ids ~convergent_species (alignments : aminoacid_fasta file list) (trees : newick file list) result_files ~column_label =
let ranking_size = 1_000 in
let column_label = [%param column_label] in
let alignment_ids = Array.of_list [%param alignment_ids] in
let trees = [%eval Workflow.path_list trees] in
let alignments = [%eval Workflow.path_list alignments] in
let result_files = [%eval Workflow.path_list result_files] in
let convergent_species = String.Set.of_list [%param convergent_species] in
let module Result_table = Reviewphiltrans_toolbox.Result_table in
let lazy_load xs ~f = List.map xs ~f:(fun x -> lazy (f x)) |> Array.of_list in
let results = lazy_load result_files ~f:Result_table.of_file in
let trees = lazy_load trees ~f:Phylogenetics.Newick.from_file in
let alignments = lazy_load alignments ~f:Biotk.Fasta.from_file in
let module S = struct
type t = {
alignment : int ;
pos : int ;
score : float ;
}
let leq s1 s2 = Float.(s1.score <= s2.score)
end in
let module H = CCHeap.Make(S) in
let add_result_line acc ~alignment ~pos ~score =
let candidate = { S.alignment ; pos ; score } in
let acc = H.add acc candidate in
if H.size acc > ranking_size then H.take_exn acc |> fst
else acc
in
let add_result_file alignment acc (rf : Result_table.t Lazy.t) =
let col = List.Assoc.find_exn (Lazy.force rf).scores_per_meth column_label ~equal:String.equal in
Array.foldi col ~init:acc ~f:(fun pos acc score ->
add_result_line acc ~alignment ~pos ~score
)
in
let top_sites =
Array.foldi results ~init:H.empty ~f:add_result_file
|> H.to_list_sorted
|> List.rev
in
let annotate_site (s : S.t) =
let id = alignment_ids.(s.alignment) in
let tree = Lazy.force trees.(s.alignment) in
let alignment =
match Lazy.force alignments.(s.alignment) with
| Ok (_, fa) -> fa
| Error _ -> assert false
in
let leaf_values = List.map alignment ~f:(fun it ->
it.description, it.sequence.[s.pos]
)
in
let contents =
let open Phylogenetics in
Newick.with_inner_tree tree ~f:(fun t ->
Tree.map t
~node:(fun _ -> ())
~branch:(fun bi -> Option.value_exn bi.length)
~leaf:(fun ni ->
let name = Option.value_exn ni.name in
let aa = List.Assoc.find_exn leaf_values name ~equal:String.equal in
name, aa, String.Set.mem convergent_species name
)
)
in
{ Candidate_site.alignment_id = id ; contents ; pos = s.pos ; score = s.score }
in
List.map top_sites ~f:annotate_site
let site_ranking ~meth ~convergent_species db =
let detection_method, column_label = match meth with
| `multinomial_asymptotic_lrt ->
multinomial_asymptotic_lrt, "1MinusLRT"
in
let queries =
alignments_of_db db
|> List.map ~f:(query ~convergent_species:(Workflow.data convergent_species))
in
let alignment_ids = List.map queries ~f:family_id in
let aa_alignments = List.map queries ~f:amino_acid_alignment in
let result_files = List.map queries ~f:detection_method in
let trees = List.map queries ~f:(tree ~branch_length_unit:`Amino_acid) in
ranking_of_results ~alignment_ids ~convergent_species aa_alignments trees result_files ~column_label
......@@ -13,6 +13,8 @@ val query :
convergent_species:string list workflow ->
query
val family_id : query -> string
val alignment : query -> phylip file
val tree :
......@@ -71,3 +73,20 @@ val convergence_species_tree_pdf :
convergent_species:string list ->
Orthomam_db.t ->
pdf file
module Candidate_site : sig
open Phylogenetics
type t = {
alignment_id : string ;
pos : int ;
contents : (unit, string * char * bool, float) Tree.t ;
score : float ;
}
end
val site_ranking :
meth:[`multinomial_asymptotic_lrt] ->
convergent_species:string list ->
Orthomam_db.t ->
Candidate_site.t list workflow
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment