Commit 69e06469 authored by Philippe Veber's avatar Philippe Veber
Browse files

update wrt bistro

parent 47ecba26
......@@ -38,7 +38,7 @@ eval "$(opam config env)"
```
opam pin add -y bistro --dev-repo
opam pin add -y biotk https://github.com/pveber/biotk.git
opam pin add -y biotope https://github.com/pveber/biotope.git
opam pin add -y bistro-bio https://github.com/pveber/bistro.git
opam pin add -y phylogenetics https://github.com/biocaml/phylogenetics.git
opam install -y ppx_csv_conv ocamlify utop
```
......
......@@ -15,7 +15,8 @@ depends: [
"dune" {>= "1.11"}
"biocaml"
"biotk"
"biotope"
"bistro"
"bistro-bio"
"ocaml-r"
"ocamlify"
"phylogenetics"
......
......@@ -24,7 +24,8 @@ or to benchmark various tools.
(depends
biocaml
biotk
biotope
bistro
bistro-bio
ocaml-r
ocamlify
phylogenetics))
open Core
open Phylogenetics
open Bistro
open File_formats
let ok_exn err = function
| Ok x -> x
......@@ -83,21 +84,24 @@ let command =
main ~alignment
]
let%pworkflow histogram (fa : #fasta file) =
let al = ok_exn Alignment.show_parsing_error @@ Alignment.from_fasta [%path fa] in
let float_array_of_int_list x =
Array.of_list x
|> Array.map ~f:float
let histogram (fa : #fasta file) =
let f = fun%workflow dest ->
let al = ok_exn Alignment.show_parsing_error @@ Alignment.from_fasta [%path fa] in
let float_array_of_int_list x =
Array.of_list x
|> Array.map ~f:float
in
let values, counts =
Alignment.number_of_residues_per_column_stats al
|> List.unzip
in
OCamlR_grDevices.pdf dest ;
OCamlR_graphics.plot
~plot_type:`Histogram
~xlab:"Number of different residues in column"
~ylab:"Counts"
~x:(float_array_of_int_list values)
~y:(float_array_of_int_list counts) () ;
OCamlR_grDevices.dev_off ()
in
let values, counts =
Alignment.number_of_residues_per_column_stats al
|> List.unzip
in
OCamlR_grDevices.pdf [%dest] ;
OCamlR_graphics.plot
~plot_type:`Histogram
~xlab:"Number of different residues in column"
~ylab:"Counts"
~x:(float_array_of_int_list values)
~y:(float_array_of_int_list counts) () ;
OCamlR_grDevices.dev_off ()
Workflow.path_plugin ~descr:"alistats.histogram" f
......@@ -188,20 +188,26 @@ let recall_precision_curve table =
] ;
]
let%workflow recall_precision_auc_table table =
let module RT = Codepitk.Result_table in
let { RT.oracle ; scores_per_meth } = RT.of_file [%path table] in
let labels = Option.value_exn oracle in
List.map scores_per_meth ~f:(fun (meth, scores) ->
let scores = Array.filter_opt scores in
let _, auc = Biocaml_unix.Bin_pred.recall_precision_curve ~labels ~scores in
meth, auc
)
let recall_precision_auc_table table =
let f = fun%workflow () ->
let module RT = Codepitk.Result_table in
let { RT.oracle ; scores_per_meth } = RT.of_file [%path table] in
let labels = Option.value_exn oracle in
List.map scores_per_meth ~f:(fun (meth, scores) ->
let scores = Array.filter_opt scores in
let _, auc = Biocaml_unix.Bin_pred.recall_precision_curve ~labels ~scores in
meth, auc
)
in
Workflow.plugin ~descr:"convergence_detection.recall_precision_curve" f
let%pworkflow oracle ~n_h0 ~n_ha =
let n_h0 = [%param n_h0] in
let n_ha = [%param n_ha] in
"Sites\tOracle"
:: (List.init n_h0 ~f:(fun i -> sprintf "%d\t0" (i + 1)))
@ (List.init n_ha ~f:(fun i -> sprintf "%d\t1" (n_h0 + i + 1)))
|> Out_channel.write_lines [%dest]
let oracle ~n_h0 ~n_ha =
let f = fun%workflow dest ->
let n_h0 = [%param n_h0] in
let n_ha = [%param n_ha] in
"Sites\tOracle"
:: (List.init n_h0 ~f:(fun i -> sprintf "%d\t0" (i + 1)))
@ (List.init n_ha ~f:(fun i -> sprintf "%d\t1" (n_h0 + i + 1)))
|> Out_channel.write_lines dest
in
Workflow.path_plugin ~descr:"convergence_detection.oracle" f
......@@ -81,18 +81,20 @@ module Make (Q : Query) = struct
let gene_tree d =
Tree_dataset.raxmlng_fna ~fna:(nucleotide_alignment d) ()
let%pworkflow tree_with_no_single_child ~branch_length_unit d :
newick file =
let tree_file = [%path tree ~branch_length_unit d] in
let open Phylogenetics in
let tree = Newick.from_file tree_file in
let tree =
Newick.map_inner_tree tree
~f:
Codepitk.Convergence_tree
.remove_nodes_with_single_child
let tree_with_no_single_child ~branch_length_unit d : newick file =
let f = fun%workflow dest ->
let tree_file = [%path tree ~branch_length_unit d] in
let open Phylogenetics in
let tree = Newick.from_file tree_file in
let tree =
Newick.map_inner_tree tree
~f:
Codepitk.Convergence_tree
.remove_nodes_with_single_child
in
Newick.to_file tree dest
in
Newick.to_file tree [%dest]
Workflow.path_plugin ~descr:"detection_pipeline.tree_with_no_single_child" f
let identical d =
let tree_sc =
......@@ -145,15 +147,18 @@ module Make (Q : Query) = struct
~faa:(amino_acid_alignment d) ()
|> Tamuri.results
let%pworkflow mock_tdg09 d =
match Biotk.Fasta.from_file [%path amino_acid_alignment d] with
| Ok (_, item :: _) ->
let mock_tdg09 d =
let f = fun%workflow dest ->
match Biotk.Fasta.from_file [%path amino_acid_alignment d] with
| Ok (_, item :: _) ->
let open Core_kernel in
let n = String.length item.sequence in
"Sites\tTdg09_1MinusFDR\tTdg09_1MinusLRT\tTdg09_prob_post"
:: List.init n ~f:(fun i -> sprintf "%d\t0.0\t0.0\tNA" (i + 1))
|> Out_channel.write_lines [%dest]
| _ -> failwith "couldn't read an item in fasta"
|> Out_channel.write_lines dest
| _ -> failwith "couldn't read an item in fasta"
in
Workflow.path_plugin ~descr:"detection_pipeline.mock_tdg09" f
let failsafe_tdg09 d = Workflow.trywith (tdg09 d) (mock_tdg09 d)
......@@ -213,21 +218,24 @@ module Make (Q : Query) = struct
let dnds_tree d = (dn_ds_dnds_trees d).dnds_tsv
let%pworkflow view_site query ~convergent_species ~site_pos =
let tree_path = [%path tree ~branch_length_unit:`Amino_acid query] in
let alignment_path = [%path amino_acid_alignment query] in
let module CS = Codepitk.Candidate_site in
let convergent_species = [%param convergent_species] in
let site_pos = [%param site_pos] in
let condition s =
if List.mem s convergent_species then `Convergent else `Ancestral
in
let tree = Phylogenetics.Newick.from_file tree_path in
let alignment =
Biotk.Fasta.from_file alignment_path
|> Base.Result.ok_or_failwith |> snd
let view_site query ~convergent_species ~site_pos =
let f = fun%workflow dest ->
let tree_path = [%path tree ~branch_length_unit:`Amino_acid query] in
let alignment_path = [%path amino_acid_alignment query] in
let module CS = Codepitk.Candidate_site in
let convergent_species = [%param convergent_species] in
let site_pos = [%param site_pos] in
let condition s =
if List.mem s convergent_species then `Convergent else `Ancestral
in
let tree = Phylogenetics.Newick.from_file tree_path in
let alignment =
Biotk.Fasta.from_file alignment_path
|> Base.Result.ok_or_failwith |> snd
in
CS.make ~condition tree alignment site_pos
|> CS.draw |> Biotk_croquis.Croquis.Layout.simple
|> Base.Fn.flip Biotk_croquis.Croquis.Layout.render_pdf dest
in
CS.make ~condition tree alignment site_pos
|> CS.draw |> Biotk_croquis.Croquis.Layout.simple
|> Base.Fn.flip Biotk_croquis.Croquis.Layout.render_pdf [%dest]
Workflow.path_plugin ~descr:"detection_pipeline.view_site" f
end
......@@ -141,16 +141,19 @@ let posterior_probabilities run_diffseldsparse : cpt file =
]
]
let%pworkflow results dir =
Filename.concat [%path dir] "myrun_1.sitepp"
|> In_channel.read_lines
|> Fn.flip List.drop 1
|> List.map ~f:(String.split ~on:'\t')
|> List.map ~f:(function
| site :: pp :: _ ->
Int.of_string site + 1, Float.of_string pp /. 100.
| _ -> failwith "parsing readdiffseldsparse results"
)
|> List.map ~f:(fun (site, pp) -> sprintf "%d\t%f" site pp)
|> (fun xs -> "Sites\tDiffseldsparse" :: xs)
|> Out_channel.write_lines [%dest]
let results dir =
let f = fun%workflow dest ->
Filename.concat [%path dir] "myrun_1.sitepp"
|> In_channel.read_lines
|> Fn.flip List.drop 1
|> List.map ~f:(String.split ~on:'\t')
|> List.map ~f:(function
| site :: pp :: _ ->
Int.of_string site + 1, Float.of_string pp /. 100.
| _ -> failwith "parsing readdiffseldsparse results"
)
|> List.map ~f:(fun (site, pp) -> sprintf "%d\t%f" site pp)
|> (fun xs -> "Sites\tDiffseldsparse" :: xs)
|> Out_channel.write_lines dest
in
Workflow.path_plugin ~descr:"diffseldsparse.results" f
(library
(name codepi)
(libraries core biotk biotope bistro.utils codepitk containers gsl
(libraries core biotk bistro-bio bistro.utils codepitk containers gsl
ocaml-r.graphics ocaml-r.grDevices phylogenetics.convergence
prc)
(preprocess
......
open Bistro
include Bistro_bio.Formats
class type bimbam = object
inherit text
method format : [`bimbam]
......
......@@ -9,32 +9,38 @@ class type relatedness_matrix =
method format : [ `gemma_relatedness_matrix ]
end
let%pworkflow genotype_of_fasta fasta =
let module G = Codepitk.Gemma in
G.write_genotypes ~alignment:[%path fasta] ~output:[%dest]
let genotype_of_fasta fasta =
let f = fun%workflow dest ->
let module G = Codepitk.Gemma in
G.write_genotypes ~alignment:[%path fasta] ~output:dest
in
Workflow.path_plugin ~descr:"gemma.genotype_of_fasta" f
let%pworkflow phenotype_of_tree nhx =
let open Phylogenetics in
let module U = Codepitk.Utils in
let collect_leaves t =
let rec node condition t acc =
match t with
| Tree.Node n -> List1.fold_right n.branches ~init:acc ~f:branch
| Leaf _ -> condition :: acc
and branch (Tree.Branch b) acc =
node
(Phylogenetics_convergence.Simulator.Branch_info.condition b.data)
b.tip acc
let phenotype_of_tree nhx =
let f = fun%workflow dest ->
let open Phylogenetics in
let module U = Codepitk.Utils in
let collect_leaves t =
let rec node condition t acc =
match t with
| Tree.Node n -> List1.fold_right n.branches ~init:acc ~f:branch
| Leaf _ -> condition :: acc
and branch (Tree.Branch b) acc =
node
(Phylogenetics_convergence.Simulator.Branch_info.condition b.data)
b.tip acc
in
node `Ancestral t []
in
node `Ancestral t []
in
let leaves = U.tree_from_file [%path nhx] |> collect_leaves in
let write_phenotypes leaves oc =
List.iter leaves ~f:(fun cond ->
fprintf oc "%d\n"
(match cond with `Ancestral -> 0 | `Convergent -> 1))
let leaves = U.tree_from_file [%path nhx] |> collect_leaves in
let write_phenotypes leaves oc =
List.iter leaves ~f:(fun cond ->
fprintf oc "%d\n"
(match cond with `Ancestral -> 0 | `Convergent -> 1))
in
Out_channel.with_file dest ~f:(write_phenotypes leaves)
in
Out_channel.with_file [%dest] ~f:(write_phenotypes leaves)
Workflow.path_plugin ~descr:"gemma.phenotype_of_tree" f
let template_of_lmm lmm =
int (match lmm with `Wald -> 1 | `LRT -> 2 | `Score -> 3 | `All -> 4)
......@@ -80,13 +86,16 @@ let calculate_relatedness_matrix ~mode ~genotype ~phenotype =
cmd "cp" [ tmp // relatedness_filename mode; dest ];
]
let%pworkflow[@version 5] result_table_of_output alignment gemma_output =
let module R = Codepitk.Gemma.Result_file in
match R.of_file [%path gemma_output] with
| Error msg -> failwith msg
| Ok result_file ->
let result_table_of_output alignment gemma_output =
let f = fun%workflow dest ->
let module R = Codepitk.Gemma.Result_file in
match R.of_file [%path gemma_output] with
| Error msg -> failwith msg
| Ok result_file ->
R.to_result_table result_file ~alignment:[%path alignment]
~site_aggregator:R.min_pvalue_aggregator
|> Result.map_error ~f:Phylogenetics.Alignment.show_parsing_error
|> Result.ok_or_failwith
|> Codepitk.Result_table.to_file ~output:[%dest]
|> Codepitk.Result_table.to_file ~output:dest
in
Workflow.path_plugin ~descr:"gemma.result_table_of_output" f
open Bistro
open Phylogenetics
open Codepitk
let%pworkflow[@version 2] test alignment tree =
let alignment =
Alignment.from_fasta [%path alignment]
|> Rresult.R.get_ok
let test alignment tree =
let f = fun%workflow dest ->
let alignment =
Alignment.from_fasta [%path alignment]
|> Rresult.R.get_ok
in
let tree =
Convergence_tree.from_file [%path tree]
|> Result.get_ok
in
Inhouse_lmm.test ~alignment ~tree
|> Inhouse_lmm.result_table_of_test
|> Result_table.to_file ~output:dest
in
let tree =
Convergence_tree.from_file [%path tree]
|> Result.get_ok
in
Inhouse_lmm.test ~alignment ~tree
|> Inhouse_lmm.result_table_of_test
|> Result_table.to_file ~output:[%dest]
Workflow.path_plugin ~descr:"inhouse_lmm.test" f
......@@ -2,40 +2,43 @@ open Core
open Bistro
open File_formats
let%pworkflow multinomial_ocaml_implementation ~meth ~(tree_sc:_ file) ~(faa:aminoacid_fasta file) (* : cpt file *) =
let open Phylogenetics in
let open Codepitk in
let module MT = Codepitk.Multinomial_test in
let meth = [%param meth] in
let test = match meth with
| `Asymptotic_LRT -> MT.LRT.asymptotic_test
| `Simulation_LRT -> MT.LRT.simulation_test ~sample_size:10_000
| `Asymptotic_sparse -> MT.Sparse.asymptotic_test
| `Simulation_sparse -> MT.Sparse.simulation_test ~sample_size:10_000
let multinomial_ocaml_implementation ~meth ~(tree_sc:_ file) ~(faa:aminoacid_fasta file) : cpt file =
let f = fun%workflow dest ->
let open Phylogenetics in
let open Codepitk in
let module MT = Codepitk.Multinomial_test in
let meth = [%param meth] in
let test = match meth with
| `Asymptotic_LRT -> MT.LRT.asymptotic_test
| `Simulation_LRT -> MT.LRT.simulation_test ~sample_size:10_000
| `Asymptotic_sparse -> MT.Sparse.asymptotic_test
| `Simulation_sparse -> MT.Sparse.simulation_test ~sample_size:10_000
in
let alignment =
Alignment.from_fasta [%path faa]
|> Rresult.R.get_ok
in
let tree =
Convergence_tree.from_file [%path tree_sc]
|> Rresult.R.failwith_error_msg
in
let site c0 c1 =
let c0 = (c0 : int Amino_acid.table :> int array) in
let c1 = (c1 : int Amino_acid.table :> int array) in
let d = MT.data ~x1:c0 ~x2:c1 in
let r = test d in
r._T_, r.pvalue
in
let res = Convergence_tree.alignment_counts_map tree alignment site in
let res_lines =
List.mapi res ~f:(fun i (lr, pval) ->
sprintf "%d\t%f\t%f" (i + 1) (1. -. pval) lr
)
in
let header = "Sites\tMultinomial_1mp\tMultinomial_LikelihoodRatio" in
Out_channel.write_lines dest (header :: res_lines)
in
let alignment =
Alignment.from_fasta [%path faa]
|> Rresult.R.get_ok
in
let tree =
Convergence_tree.from_file [%path tree_sc]
|> Rresult.R.failwith_error_msg
in
let site c0 c1 =
let c0 = (c0 : int Amino_acid.table :> int array) in
let c1 = (c1 : int Amino_acid.table :> int array) in
let d = MT.data ~x1:c0 ~x2:c1 in
let r = test d in
r._T_, r.pvalue
in
let res = Convergence_tree.alignment_counts_map tree alignment site in
let res_lines =
List.mapi res ~f:(fun i (lr, pval) ->
sprintf "%d\t%f\t%f" (i + 1) (1. -. pval) lr
)
in
let header = "Sites\tMultinomial_1mp\tMultinomial_LikelihoodRatio" in
Out_channel.write_lines [%dest] (header :: res_lines)
Workflow.path_plugin ~descr:"multinomial.multinomial_ocaml_implementation" f
let multinomial_asymptotic_lrt ~tree_sc ~faa = multinomial_ocaml_implementation ~meth:`Asymptotic_LRT ~tree_sc ~faa
let multinomial_simulation_lrt ~tree_sc ~faa = multinomial_ocaml_implementation ~meth:`Simulation_LRT ~tree_sc ~faa
......
This diff is collapsed.
......@@ -41,19 +41,22 @@ module Query = struct
let nucleotide_alignment q =
Workflow.input (alignment_path q)
let%pworkflow tree ~branch_length_unit:_ q =
let open Phylogenetics in
let q = [%param q] in
let convergent_leaves =
convergent_species q
|> String.Set.of_list
let tree ~branch_length_unit:_ q =
let f = fun%workflow dest ->
let open Phylogenetics in
let q = [%param q] in
let convergent_leaves =
convergent_species q
|> String.Set.of_list
in
Newick.from_file (nhx_path q)
|> Newick.map_inner_tree ~f:(fun tree ->
Tk.Convergence_tree.infer_binary_condition_on_branches tree ~convergent_leaves
|> Tk.Convergence_tree.reset_transitions
)
|> Fn.flip Newick.to_file dest
in
Newick.from_file (nhx_path q)
|> Newick.map_inner_tree ~f:(fun tree ->
Tk.Convergence_tree.infer_binary_condition_on_branches tree ~convergent_leaves
|> Tk.Convergence_tree.reset_transitions
)
|> Fn.flip Newick.to_file [%dest]
Workflow.path_plugin ~descr:"rubisco_dataset.tree" f
end
include Detection_pipeline.Make(Query)
......
......@@ -99,140 +99,145 @@ module Mutsel = struct
convergent_counts : int Amino_acid.table array ;
}
let%workflow benchmark q methods =
let open Phylogenetics in
let open Codepitk in
let open Codepitk.Simulator.Site_independent_mutsel in
let module Codon = Codon.Universal_genetic_code.NS in
let sim : simulation = [%eval simulation q] in
let result_paths = [%eval Bistro.Workflow.path_list (List.map methods ~f:(fun f -> f q))] in
let results =
List.map result_paths ~f:Cpt.of_file
|> Result.all
|> Rresult.R.failwith_error_msg
|> List.concat_map ~f:Cpt.columns
in
let method_labels, method_outputs = List.unzip results in
let n_h0 = Array.length sim.h0_params in
let n_ha = Array.length sim.ha_params in
let nsites = n_h0 + n_ha in
let amino_acid_vector_of_codon_vector xs =
Amino_acid.Vector.init (fun aa ->
List.fold Codon.all ~init:0. ~f:(fun acc c ->
if Amino_acid.equal aa (Codon.aa_of_codon c) then
acc +. xs.Codon.%(c)
else acc
)
)
in
let compute_profile p =
Mutsel.stationary_distribution p
|> amino_acid_vector_of_codon_vector
|> Amino_acid.Vector.to_array
in
let profiles =
Array.append sim.h0_params sim.ha_params
|> Array.map ~f:(fun (p1, p2) ->
compute_profile p1, compute_profile p2
)
in
let counts seqs i =
Amino_acid.Table.init (fun aa ->
let aa = Amino_acid.to_char aa in
List.count seqs ~f:(fun s ->
let codon_str = String.sub (s : Dna.t :> string) ~pos:(i * 3) ~len:3 in
let codon = match Codon.of_string codon_str with
| Some c -> c
| None -> assert false
in
Char.equal (Amino_acid.to_char (Codon.aa_of_codon codon)) aa)
)
in
let collect_counts cond =
let species = Convergence_tree.leaves sim.tree in
let seqs =
List.map2_exn sim.sequences species ~f:(fun s (_, cond_s) ->
if Poly.equal cond cond_s then Some s else None
let benchmark q methods =
let f = fun%workflow () ->
let open Phylogenetics in
let open Codepitk in
let open Codepitk.Simulator.Site_independent_mutsel in
let module Codon = Codon.Universal_genetic_code.NS in
let sim : simulation = [%eval simulation q] in
let result_paths = [%eval Bistro.Workflow.path_list (List.map methods ~f:(fun f -> f q))] in
let results =
List.map result_paths ~f:Cpt.of_file
|> Result.all
|> Rresult.R.failwith_error_msg
|> List.concat_map ~f:Cpt.columns
in
let method_labels, method_outputs = List.unzip results in
let n_h0 = Array.length sim.h0_params in
let n_ha = Array.length sim.ha_params in
let nsites = n_h0 + n_ha in
let amino_acid_vector_of_codon_vector xs =
Amino_acid.Vector.init (fun aa ->
List.fold Codon.all ~init:0. ~f:(fun acc c ->
if Amino_acid.equal aa (Codon.aa_of_codon c) then
acc +. xs.Codon.%(c)
else acc
)
)
|> List.filter_opt
in
Array.init nsites ~f:(