Gitlab is now running v13.9.0 - More info -> here <-

Commit 12048c65 authored by Philippe Veber's avatar Philippe Veber

Merge branch 'new-cli'

parents 4d9215f4 6a7e6b9a
......@@ -2,7 +2,6 @@
_build
_bistro
outdir*
run*
reviewphiltrans.install
**/.merlin
example/outdir
......
open Core
open Codepi
open Bistro_utils
let first_nhx_in_dir dir =
Sys.readdir dir
|> Array.find_exn ~f:(String.is_suffix ~suffix:".nhx")
let sw b x = if b then Some x else None
let realdata_main ~use_diffsel
~use_pcoc
~use_pcoc_c60
~use_pcoc_gamma
~use_pcoc_v2
~use_pcoc_pcp
~use_tdg09
~use_topological
~use_identical
~no_use_multinomial
~calc_dnds
~calc_gene_trees
~indir ~outdir ~np ~mem () =
let loggers = [
Console_logger.create () ;
] in
let mem = Option.map mem ~f:(fun i -> `GB i) in
let rd =
Real_dataset.make
~alignment_dir_path:(Filename.concat indir "Alignments")
~tree_path:(Filename.concat indir (first_nhx_in_dir indir))
in
let use_multinomial = not no_use_multinomial in
let meths = List.filter_opt [
sw use_diffsel `Diffsel ;
sw use_pcoc `Pcoc ;
sw use_pcoc_c60 `Pcoc_C60 ;
sw use_pcoc_gamma `Pcoc_gamma ;
sw use_pcoc_v2 `PCOC_v2 ;
sw use_pcoc_pcp `PCOC_pcp ;
sw use_tdg09 `Tdg09 ;
sw use_topological `Topological ;
sw use_identical `Identical ;
sw use_multinomial `Multinomial ;
]
in
let pal = List.filter_opt [
sw calc_dnds `DnDs;
sw calc_gene_trees `GeneTree;
]
in
List.concat [
Repo.shift "Merged_results" (Real_dataset.repo meths rd) ;
Repo.shift "PreParsed_Dataset" (Real_dataset.repo_parsed_rd pal rd);
]
|> Bistro_utils.Repo.build_main ~outdir ~loggers ?np ?mem
let realdata_command =
let open Command.Let_syntax in
Command.basic
~summary:"Run pipeline on real data"
[%map_open
let outdir =
flag "--outdir" (required string) ~doc:"PATH Output directory"
and indir =
flag "--indir" (required string) ~doc:"PATH Input directory"
and use_diffsel =
flag "--diffsel" no_arg ~doc:" use the diffsel method (very slow)."
and use_pcoc =
flag "--pcoc" no_arg ~doc:" use the pcoc method (slow)."
and use_pcoc_c60 =
flag "--pcoc-c60" no_arg ~doc:" use the pcoc method with c60 profils (very_slow)."
and use_pcoc_gamma =
flag "--pcoc-gamma" no_arg ~doc:" use the pcoc method with the gamma option (very_slow)."
and use_pcoc_v2 =
flag "--pcoc-v2" no_arg ~doc:" use the pcoc v2 method with the C10 profiles (slow)."
and use_pcoc_pcp =
flag "--pcoc-pcp" no_arg ~doc:" use the pcoc v2 method with the physico-chemical profiles (slow)."
and use_tdg09 =
flag "--tdg09" no_arg ~doc:" use the tdg09 method (slow)."
and use_topological =
flag "--topological" no_arg ~doc:" use the topological method (fast)."
and use_identical =
flag "--identical" no_arg ~doc:" use the identical method (fast)."
and no_use_multinomial =
flag "--no-multinomial" no_arg ~doc:" not use the multinomial method (very fast so by default)."
and calc_dnds =
flag "--dnds" no_arg ~doc:" calculate dn ds dnds trees (slow)."
and calc_gene_trees =
flag "--gt" no_arg ~doc:" calculate gene trees (slow)."
and np =
flag "--np" (optional int) ~doc:"INT Number of available processors"
and mem =
flag "--mem" (optional int) ~doc:"INT Available memory (in GB)"
in
realdata_main ~use_diffsel
~use_pcoc
~use_pcoc_c60
~use_pcoc_gamma
~use_pcoc_v2
~use_pcoc_pcp
~use_tdg09
~use_topological
~use_identical
~no_use_multinomial
~calc_dnds
~calc_gene_trees
~indir ~outdir ~np ~mem
]
let () =
Command.group ~summary:"Reviewphiltrans" [
Command.group ~summary:"codepi" [
"validation", Pipeline.validation_command ;
"realdata", realdata_command ;
"run", Run.command ;
"alistats", Alistats.command ;
]
|> Command.run
......@@ -49,3 +49,78 @@ let add_indels_to_dataset d ~seed =
let dataset = Ready_dataset.add_indels_to_ready_dataset ~p ~seed:indel_seed d.dataset in
let calc_dnds = d.calc_dnds in
{model_prefix; tree_prefix; is_real; calc_dnds; dataset; seed = new_seed}
open Bistro
open File_formats
module New_API = struct
type t = {
tree : nhx file ;
nucleotide_alignments : (string * nucleotide_fasta file) list ;
convergent_species : string list workflow ;
}
let clip_tree_on_alignment (tree : nhx file) (ali : nucleotide_fasta file) =
let f = fun%workflow dest ->
let open Phylogenetics in
let tree = Newick.from_file [%path tree] in
let _, ali =
Biotk.Fasta.from_file [%path ali]
|> Result.ok_or_failwith
in
let ali_species = List.map ali ~f:(fun it -> it.description) in
let clipped_tree =
Newick.map_inner_tree tree ~f:(fun tree ->
match
Tree.leafset_generated_subtree tree
(fun bi -> bi.Newick.name) ali_species
with
| None -> failwith "Tree has no leaf in alignment"
| Some filtered_tree -> filtered_tree
)
in
Newick.to_file clipped_tree dest
in
Workflow.path_plugin ~descr:"orthomam.clip_tree_on_alignment" f
let annotate_convergent_species_in_tree (tree : newick file) species : nhx file =
let f = fun%workflow dest ->
let open Phylogenetics in
let species = [%eval species]
and omm_tree = [%path tree] in
let ensembl_tree = Newick.from_file omm_tree in
let tagged_tree =
Newick.map_inner_tree ensembl_tree ~f:(fun t ->
Codepitk.Convergence_tree.infer_binary_condition_on_branches
~convergent_leaves:(String.Set.of_list species)
t
)
in
Newick.to_file tagged_tree dest
in
Workflow.path_plugin ~version:3 ~descr:"dataset.annotate_convergent_species_in_tree" f
let make ~tree ~nucleotide_alignments ~convergent_species =
let tree = annotate_convergent_species_in_tree tree convergent_species in
{ tree ; nucleotide_alignments ; convergent_species }
module Query = struct
type dataset = t
type t = {
dataset : dataset ;
alignment_descr : string ;
alignment : nucleotide_fasta file ;
}
let nucleotide_alignment q = q.alignment
let tree ~branch_length_unit:_ q =
clip_tree_on_alignment q.dataset.tree q.alignment
end
let queries dataset =
List.map dataset.nucleotide_alignments ~f:(fun (alignment_descr, alignment) ->
{ Query.dataset ; alignment_descr ; alignment }
)
end
......@@ -69,6 +69,17 @@ module type S = sig
val view_site :
query -> convergent_species:string list -> site_pos:int -> pdf file
val ranking :
?length:int ->
query_descr:(query -> string option) ->
meth:(query -> cpt file) ->
column_label:string ->
convergent_species:string list workflow ->
query list ->
Codepitk.Candidate_site.t list workflow
val alignment_plot : query -> svg file
end
module Make (Q : Query) = struct
......@@ -240,9 +251,103 @@ module Make (Q : Query) = struct
Biotk.Fasta.from_file alignment_path
|> Base.Result.ok_or_failwith |> snd
in
CS.make ~condition tree alignment site_pos
|> CS.draw |> Biotk_croquis.Croquis.Layout.simple
|> Base.Fn.flip Biotk_croquis.Croquis.Layout.render_pdf dest
let layout =
CS.make ~condition tree alignment site_pos
|> CS.draw
|> Biotk_croquis.Croquis.Layout.simple
in
Biotk_croquis.Croquis.Layout.render `pdf layout (`File dest)
in
Workflow.path_plugin ~descr:"detection_pipeline.view_site" f
let ranking ?(length = 100) ~query_descr ~meth ~column_label ~convergent_species queries =
let open Core_kernel in
let open Codepitk in
let query_ids = Array.of_list (List.map queries ~f:query_descr) in
let trees = List.map queries ~f:(tree ~branch_length_unit:`Amino_acid) in
let alignments = List.map queries ~f:amino_acid_alignment in
let result_files = List.map queries ~f:meth in
let f = fun%workflow () ->
let ranking_length = [%param length] in
let column_label = [%param column_label] in
let query_ids = [%param query_ids] in
let trees = [%eval Workflow.path_list trees] in
let alignments = [%eval Workflow.path_list alignments] in
let result_files = [%eval Workflow.path_list result_files] in
let convergent_species = String.Set.of_list [%eval convergent_species] in
let module Result_table = Codepitk.Result_table in
let lazy_load xs ~f = List.map xs ~f:(fun x -> lazy (f x)) |> Array.of_list in
let results = lazy_load result_files ~f:Result_table.of_file in
let trees = lazy_load trees ~f:Phylogenetics.Newick.from_file in
let alignments = lazy_load alignments ~f:Biotk.Fasta.from_file in
let module S = struct
type t = {
alignment : int ;
pos : int ;
score : float ;
}
let leq s1 s2 = Float.(s1.score <= s2.score)
end in
let module H = CCHeap.Make(S) in
let add_result_line acc ~alignment ~pos ~score =
match score with
| None -> acc
| Some score ->
let candidate = { S.alignment ; pos ; score } in
let acc = H.add acc candidate in
if H.size acc > ranking_length then H.take_exn acc |> fst
else acc
in
let add_result_file alignment acc (rf : Result_table.t Lazy.t) =
let col = List.Assoc.find_exn (Lazy.force rf).scores_per_meth column_label ~equal:String.equal in
Array.foldi col ~init:acc ~f:(fun pos acc score ->
add_result_line acc ~alignment ~pos ~score
)
in
let top_sites =
Array.foldi results ~init:H.empty ~f:add_result_file
|> H.to_list_sorted
|> List.rev
in
let annotate_site (s : S.t) =
let id = query_ids.(s.alignment) in
let tree = Lazy.force trees.(s.alignment) in
let alignment =
match Lazy.force alignments.(s.alignment) with
| Ok (_, fa) -> fa
| Error _ -> assert false
in
let leaf_values = List.map alignment ~f:(fun it ->
it.description, it.sequence.[s.pos]
)
in
let contents =
let open Phylogenetics in
Newick.with_inner_tree tree ~f:(fun t ->
Tree.map t
~node:(fun _ -> ())
~branch:(fun bi -> Option.value_exn bi.length)
~leaf:(fun ni ->
let species = Option.value_exn ni.name in
let state = List.Assoc.find_exn leaf_values species ~equal:String.equal in
let condition =
if String.Set.mem convergent_species species then `Convergent
else `Ancestral
in
{ Codepitk.Candidate_site.species ; state ; condition }
)
)
in
{ Candidate_site.alignment_id = id ; contents ; pos = Some s.pos ; score = Some s.score }
in
List.map top_sites ~f:annotate_site
in
Workflow.plugin ~descr:"detection_pipeline.ranking" f
let alignment_plot d =
Convergence_detection.plot_convergent_sites
~tree:(Q.tree ~branch_length_unit:`Amino_acid d)
~alignment:(amino_acid_alignment d)
~detection_results:(multinomial_asymptotic_lrt d)
()
end
......@@ -69,6 +69,17 @@ module type S = sig
val view_site :
query -> convergent_species:string list -> site_pos:int -> pdf file
val ranking :
?length:int ->
query_descr:(query -> string option) ->
meth:(query -> cpt file) ->
column_label:string ->
convergent_species:string list workflow ->
query list ->
Codepitk.Candidate_site.t list workflow
val alignment_plot : query -> svg file
end
module Make (Q : Query) : S with type query := Q.t
......@@ -78,24 +78,6 @@ let clip_tree_on_alignment tree ali =
let omm_tree_of_db db =
Workflow.input (Orthomam_db.tree db)
let annotate_convergent_species_in_tree (tree : newick file) species : nhx file =
let f = fun%workflow dest ->
let open Phylogenetics in
let species = [%eval species]
and omm_tree = [%path tree] in
let ensembl_tree = Newick.from_file omm_tree in
let tagged_tree =
Newick.map_inner_tree ensembl_tree ~f:(fun t ->
Codepitk.Convergence_tree.infer_binary_condition_on_branches
~convergent_leaves:(String.Set.of_list species)
t
)
in
Newick.to_file tagged_tree dest
in
Workflow.path_plugin ~version:3 ~descr:"tree_of_convergent_species" f
let compare_tree_branch_lengths t1 t2 =
let f = fun%workflow dest ->
let open Phylogenetics in
......@@ -393,7 +375,7 @@ module Q = struct
let tree ~branch_length_unit q =
clip_tree_on_alignment
(annotate_convergent_species_in_tree
(Dataset.New_API.annotate_convergent_species_in_tree
(omm_tree_with_branch_lengths ~branch_length_unit q.db)
q.convergent_species)
(alignment q)
......@@ -405,7 +387,7 @@ end
include Q
let tree_of_db db ~branch_length_unit ~convergent_species =
annotate_convergent_species_in_tree
Dataset.New_API.annotate_convergent_species_in_tree
(omm_tree_with_branch_lengths ~branch_length_unit db)
(Workflow.data convergent_species)
......@@ -484,9 +466,11 @@ let convergence_species_tree_pdf ~convergent_species db =
Codepitk.Convergence_tree.infer_binary_condition_on_branches
t ~convergent_leaves:convergent_species)
in
render_tree tree_or_branch
|> Biotk_croquis.Croquis.Layout.simple
|> Fn.flip Biotk_croquis.Croquis.Layout.render_pdf dest
let layout =
render_tree tree_or_branch
|> Biotk_croquis.Croquis.Layout.simple
in
Biotk_croquis.Croquis.Layout.render `pdf layout (`File dest)
in
Workflow.path_plugin ~descr:"orthomam.convergence_species_tree_pdf" f
......@@ -607,7 +591,7 @@ let draw_site q pos =
|> Candidate_site.draw
|> Croquis.Layout.simple
in
Croquis.Layout.render_pdf picture dest
Croquis.Layout.render `pdf picture (`File dest)
| Error msg -> failwith msg
in
Workflow.path_plugin ~descr:"orthomam." f
open Core
open Bistro
open Bistro_utils
open File_formats
module Dataset = Dataset.New_API
module Pipeline = Detection_pipeline.Make(Dataset.Query)
type detection_method = Detection_method of {
id : string ;
name : string ;
cpt_column_label : string ;
f : Dataset.Query.t -> cpt file ;
}
let multinomial_method = Detection_method {
id = "multinomial" ;
name = "Multinomial" ;
cpt_column_label = "Multinomial_1mp" ;
f = Pipeline.multinomial_asymptotic_lrt ;
}
let tdg09_method = Detection_method {
id = "tdg09" ;
name = "TDG09" ;
cpt_column_label = "Tdg09_1MinusFDR" ;
f = Pipeline.tdg09 ;
}
let pcoc_method = Detection_method {
id = "pcoc" ;
name = "PCOC" ;
cpt_column_label = "fdsf" ;
f = Pipeline.pcoc ;
}
let known_methods = [
"multinomial", multinomial_method ;
"tdg09", tdg09_method ;
"pcoc", pcoc_method ;
]
let parse_methods methods =
List.map methods ~f:(fun m ->
match List.Assoc.find known_methods ~equal:String.equal m with
| Some dm -> Ok dm
| None ->
Rresult.R.error_msgf "Unknown method %s, known methods are %s"
m (List.map known_methods ~f:fst |> String.concat ~sep:",")
)
|> Result.all
let candidate_site_report dataset (Detection_method meth) =
let all_queries = Dataset.queries dataset in
let sites =
Pipeline.ranking
~query_descr:(fun q -> Some q.Dataset.Query.alignment_descr)
~meth:meth.f
~column_label:meth.cpt_column_label
~convergent_species:dataset.Dataset.convergent_species
all_queries
in
let f = fun%workflow dest ->
let module N = Codepitk.Note in
let module DF = Codepitk.Dataframe in
let module CS = Codepitk.Candidate_site in
let sites = Array.of_list [%eval sites] in
let note_to_html note path =
N.to_html note path
|> Rresult.R.failwith_error_msg
in
let site_page_path i = sprintf "%s_%04d.html" meth.id i in
let site_page i s =
let path = Filename.concat dest (site_page_path i) in
let title = sprintf "%s candidate site #%d" meth.name i in
let alignment_id = Option.value ~default:"NA" s.CS.alignment_id in
let position = Option.value_map ~f:Int.to_string ~default:"NA" s.CS.pos in
let score = Option.value_map ~f:Float.to_string ~default:"NA" s.CS.score in
let summary = {%string|
- **Alignment**: %{alignment_id}
- **Position**: %{position}
- **Score**: %{score}
|}
in
let contents = N.make ~title N.[
text summary ;
croquis ~width:20. (CS.draw s) ;
]
in
note_to_html contents path ;
if i = 0 then Out_channel.with_file "delme.bin" ~f:(fun oc -> Marshal.to_channel oc s [])
in
let module H = struct
open Tyxml.Html
let table ~labels rows =
let open Tyxml.Html in
let thead = thead [tr (List.map labels ~f:(fun x -> td [txt x]))] in
let rows =
List.map rows ~f:(fun cells ->
tr (List.map cells ~f:(fun x -> td [x]))
)
in
table ~thead rows
let link ~href contents = a ~a:[a_href href] [txt contents]
let opt_cell x ~f = match x with
| None -> txt "NA"
| Some y -> f y
let opt_txt x ~f = opt_cell x ~f:(fun x -> txt (f x))
end
in
let index =
let df =
let labels = ["Alignment ID" ; "Position" ; "Score" ; "Infos"] in
let rows = List.init (Array.length sites) ~f:(fun i -> [
H.opt_cell sites.(i).CS.alignment_id ~f:(fun id ->
let href = sprintf "../alignments/%s.svg" id in
H.link ~href id
) ;
H.opt_txt sites.(i).CS.pos ~f:Int.to_string ;
H.opt_txt sites.(i).CS.score ~f:Float.to_string ;
H.link ~href:(site_page_path i) "Details"
])
in
H.table ~labels rows
in
let title = sprintf "Candidate sites for %s method" meth.name in
N.make ~title N.[
table df ;
]
in
let path fn = Filename.concat dest fn in
Unix.mkdir_p dest ;
note_to_html index (path "index.html") ;
Array.iteri sites ~f:site_page
in
Workflow.path_plugin ~descr:"codepi.run.candidate_site_report" ~version:5 f
type t = {
tree_file : string ;
alignment_dir : string ;
convergent_species_file : string ;
detection_methods : detection_method list ;
}
let convergent_species_workflow run =
[%workflow
In_channel.read_lines
[%path Workflow.input run.convergent_species_file]]
let dataset run =
let tree = Workflow.input run.tree_file in
let nucleotide_alignments =
Sys.readdir run.alignment_dir
|> Array.to_list
|> List.map ~f:(fun ali ->
ali,
Workflow.input (Filename.concat run.alignment_dir ali)
)
in
let convergent_species = convergent_species_workflow run in
Dataset.make ~tree ~nucleotide_alignments ~convergent_species
let repo run =
let d = dataset run in
let foreach_detection_method (Detection_method meth as dm) =
let report = candidate_site_report d dm in
Repo.item [meth.id ^ "_report"] report
in
let foreach_query (q : Dataset.Query.t) =
Repo.item
["alignments" ; sprintf "%s.svg" q.alignment_descr ]
(Pipeline.alignment_plot q)
in
List.concat [
List.map run.detection_methods ~f:foreach_detection_method ;
List.map (Dataset.queries d) ~f:foreach_query ;
]
let main
~tree_file ~alignment_dir ~convergent_species_file
~outdir ~np ~mem
~methods () =
try
let loggers = [
Console_logger.create () ;
] in