Commit 479c860c authored by Philippe Veber's avatar Philippe Veber
Browse files

Merge branch 'orthomam-as-fasta'

parents e7746c43 95c414ff
......@@ -12,7 +12,7 @@
(modules orthomam_app)
(libraries codepi)
(preprocess
(pps ppx_jane)))
(pps ppx_jane ppx_deriving.show)))
(executable
(name lmm_benchmark)
......@@ -33,4 +33,4 @@
(name matrix_exponential_bench)
(public_name matrix_exp)
(modules matrix_exponential_bench)
(libraries codepi phylogenetics core_bench))
\ No newline at end of file
(libraries codepi phylogenetics core_bench))
......@@ -4,14 +4,21 @@ module Top = Bistro_utils.Toplevel_eval.Make(struct let np = 3 let mem = 10 end)
let () =
try
Codepitk.Orthomam_db.make "/disk/data/omm"
Codepitk.Orthomam_db.make "_runs/omm"
|> Orthomam.(
site_ranking
~convergent_species:species_with_echolocation
~meth:`tdg09
~meth:`multinomial_asymptotic_lrt
)
|> Top.eval
|> Core.(Fn.flip List.take 10)
|> List.iter Codepitk.Candidate_site.(fun x -> Option.iter print_endline x.alignment_id)
|> List.iter Codepitk.Candidate_site.(fun x ->
Option.iter print_string x.alignment_id ;
Option.iter (Printf.printf " %f\n") x.score ;
let p1, p2 = Codepitk.Candidate_site.profiles x in
print_endline @@ [%show: float array] (p1 :> float array) ;
print_endline @@ [%show: float array] (p2 :> float array) ;
print_endline "====="
)
with
| Failure _ -> ()
| Failure msg -> prerr_endline msg
open Core_kernel
open Bistro
open File_formats
module U = Utils
open Codepitk
let ensembl_tree : newick file =
......@@ -20,21 +21,7 @@ let search_alignments ~pat db =
let family_id_of_alignment (alignment : Orthomam_db.alignment) =
Filename.basename (alignment :> string)
|> Fn.flip Filename.chop_suffix "_NT.fasta.ali"
|> String.chop_prefix_exn ~prefix:"all"
let fasta_of_phylip ali =
let f = fun%workflow dest ->
let ali =
[%path ali]
|> Phylogenetics.Phylip.read_exn ~strict:false
in
List.map ali.items ~f:(fun it ->
Biotk.Fasta.{ description = it.name ; sequence = it.sequence }
)
|> Biotk.Fasta.to_file dest
in
Workflow.path_plugin ~descr:"orthomam.fasta_of_phylip" f
|> Fn.flip Filename.chop_suffix "_NT.fasta"
type query = {
db : Orthomam_db.t ;
......@@ -47,28 +34,13 @@ let query (Alignment (db, alignment)) ~convergent_species =
let family_id q = family_id_of_alignment q.alignment
let remove_unobserved_sequences_from_alignment phylip : phylip file =
let f = fun%workflow dest ->
let phylip = [%path phylip] in
let module Phylip = Phylogenetics.Phylip in
let data = Phylip.read_exn ~strict:false phylip in
let threshold = data.sequence_length * 90 / 100 in
let items = List.filter data.items ~f:(fun it ->
String.count it.sequence ~f:(Char.equal '?') < threshold
)
in
let filtered_data = Phylip.make_exn items in
Phylip.write ~strict:false filtered_data dest
in
Workflow.path_plugin ~descr:"orthomam.remove_unobserved_sequences_from_alignment" f
let clip_tree_on_alignment ?(handle_tags = true) (tree:(#newick as 'a) file) (ali: phylip file) : 'a file =
let clip_tree_on_alignment ?(handle_tags = true) (tree:(#newick as 'a) file) (ali: #fasta file) : 'a file =
let f = fun%workflow dest ->
let open Phylogenetics in
let handle_tags = [%param handle_tags] in
let tree = Newick.from_file_exn [%path tree] in
let ali = Phylip.read_exn ~strict:false [%path ali] in
let ali_species = List.map ali.items ~f:(fun it -> it.name) in
let _, items = Biotk.Fasta.from_file_exn [%path ali] in
let ali_species = List.map items ~f:(fun it -> it.description) in
let remove_nodes_with_single_child =
if handle_tags then
Convergence_tree.remove_nodes_with_single_child
......@@ -163,14 +135,53 @@ let number_of_missing_sequences_repartition db =
Orthomam_db.missing_sequences_cdf [%param db] dest
)
let concatenate ?(nmissing = 0) ?seed db n : phylip file =
module Fasta_alignment = struct
open Biotk
type t = {
length : int ;
nb_sequences : int ;
items : Fasta.item list ;
}
let project ~species ali =
let table =
List.map ali.items ~f:(fun it -> it.description, it)
|> String.Table.of_alist_exn
in
let mock = String.make ali.length '?' in
let items = List.map species ~f:(fun sp ->
match String.Table.find table sp with
| Some it -> it
| None -> { Fasta.description = sp ; sequence = mock }
)
in
let nb_sequences = List.length species in
{ ali with items ; nb_sequences }
let from_file_exn fn =
match Fasta.from_file fn with
| Ok (_, (first :: others as items)) ->
let length = String.length first.sequence in
if List.for_all others ~f:(fun s -> String.length s.sequence = length) then
let nb_sequences = List.length items in
{ length ; nb_sequences ; items }
else failwith "not all sequences have the same length is this alignment"
| Ok (_, []) -> failwith "empty fasta file"
| Error msg -> failwith msg
end
let concatenate ?(nmissing = 0) ?seed db n : nucleotide_fasta file =
let f = fun%workflow dest ->
let db = [%param db] in
let seed = [%param seed] in
let n = [%param n] in
let open Phylogenetics in
let compare_items (x : Phylip.item) (y : Phylip.item) =
String.compare x.name y.name
let tree = [%path omm_tree_of_db db] in
let all_species =
let open Phylogenetics in
Newick.from_file_exn tree
|> Newick.with_inner_tree ~f:Tree.leaves
|> List.map ~f:(fun ni -> Option.value_exn ni.name)
in
let rec at_most_n_failures xs ~n ~f =
if n <= 0 then List.for_all xs ~f
......@@ -187,9 +198,9 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file =
Codon.of_string seq
|> Option.bind ~f:Codon.Universal_genetic_code.aa_of_codon
in
let full_sites_of_alignment (alignment : Phylip.t) =
if alignment.sequence_length mod 3 = 0 then
Codepitk.Range.fold 0 (alignment.sequence_length / 3) ~init:[] ~f:(fun acc j ->
let full_sites_of_alignment (alignment : Fasta_alignment.t) =
if alignment.length mod 3 = 0 then
Codepitk.Range.fold 0 (alignment.length / 3) ~init:[] ~f:(fun acc j ->
let column_is_quasi_full =
at_most_n_failures alignment.items ~n:nmissing ~f:(fun it ->
Option.is_some (aa_at_pos it.sequence j)
......@@ -204,11 +215,10 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file =
Array.of_list (full_sites_of_alignment al)
)
in
let build_alignment_from_sites (sites : (Phylip.t * int) array) =
let build_alignment_from_sites (sites : (Fasta_alignment.t * int) array) =
assert (Array.length sites > n) ;
let al0, _ = sites.(0) in
let nb_sequences = al0.number_of_sequences in
let sequences = Array.init nb_sequences ~f:(fun _ ->
let sequences = Array.init al0.nb_sequences ~f:(fun _ ->
Bytes.create (3 * n)
)
in
......@@ -217,7 +227,10 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file =
List.iteri al.items ~f:(fun j it ->
match aa_at_pos it.sequence p with
| Some _ ->
Bytes.set sequences.(j) (3 * i) it.sequence.[3 * p] ;
Bytes.set
sequences.(j)
(3 * i)
it.sequence.[3 * p] ;
Bytes.set sequences.(j) (3 * i + 1) it.sequence.[3 * p + 1] ;
Bytes.set sequences.(j) (3 * i + 2) it.sequence.[3 * p + 2]
| None ->
......@@ -227,18 +240,20 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file =
)
done ;
List.mapi al0.items ~f:(fun i it ->
{ Phylip.sequence = Bytes.to_string sequences.(i) ; name = it.name }
{ it with sequence = Bytes.to_string sequences.(i) }
)
in
let load_alignment (ali_fn : Orthomam_db.alignment) =
let ali = Fasta_alignment.from_file_exn (ali_fn :> string) in
if ali.length mod 3 = 0 then
Some (
Fasta_alignment.project ~species:all_species ali
)
else None
in
let all_alignments =
Orthomam_db.list_alignments db
|> List.filter_map ~f:(fun ali_fn ->
let ali = Phylip.read_exn ~strict:false (ali_fn :> string) in
if ali.sequence_length mod 3 = 0 then
Phylip.make_exn (List.sort ali.items ~compare:compare_items)
|> Option.some
else None
)
|> List.filter_map ~f:load_alignment
in
let all_full_sites = all_full_sites all_alignments in
if Array.length all_full_sites < n then failwith "not enough sites" ;
......@@ -250,7 +265,7 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file =
Gsl.Rng.set rng seed ;
Gsl.Randist.shuffle rng all_full_sites ;
let items = build_alignment_from_sites all_full_sites in
Phylip.write ~strict:false (Phylip.make_exn items) dest
Biotk.Fasta.to_file dest items
in
Workflow.path_plugin ~descr:"orthomam.concatenate" ~mem:(Workflow.int 4096) f
......@@ -296,31 +311,9 @@ let concatenate_calibration_figure ~nsites ~trees =
in
Workflow.path_plugin ~descr:"orthomam.concatenate_calibration_figure" f
let phylip_aa_of_nuc (ali : phylip file) : phylip file =
let f = fun%workflow dest ->
let open Phylogenetics in
let input_ali = Phylip.read_exn ~strict:false [%path ali] in
let items = List.map input_ali.items ~f:(fun it ->
let sequence =
it.sequence
|> Codepitk.Utils.translate_nucleotide_sequence_whatever_it_takes
in
{ it with Phylip.sequence }
)
in
Phylip.write ~strict:false (Phylip.make_exn items) dest
in
Workflow.path_plugin ~descr:"orthomam.phylip_aa_of_nuc" f
let iqtree_nexus_partition_file_of_alignment ali : nexus file =
let f = fun%workflow dest ->
let maybe_nb_columns =
let open Option in
In_channel.with_file [%path ali] ~f:In_channel.input_line >>= fun ali_first_line ->
String.lsplit2 ali_first_line ~on:'\t' >>= fun (_, n) ->
return (Int.of_string n)
in
let nb_columns = Option.value_exn ~message:"Could not parse phylip alignment first line" maybe_nb_columns in
let nb_columns = (Fasta_alignment.from_file_exn [%path ali]).length in
Out_channel.with_file dest ~f:(fun oc ->
fprintf oc
{|#nexus begin sets; charset part1 = 1-%d\3; charset part2 = 2-%d\3; charset part3 = 3-%d\3; charpartition mine = GTR+R:part1, GTR+R:part2, GTR+R:part3; end;|}
......@@ -335,7 +328,7 @@ let branch_length_estimation_concatenate db =
let nucleotide_tree_estimation db ali =
let open Bistro_bio in
let spp = iqtree_nexus_partition_file_of_alignment ali in
Iqtree.iqtree ~st:`DNA ~te:(omm_tree_of_db db) ~spp (`phylip ali)
Iqtree.iqtree ~st:`DNA ~te:(omm_tree_of_db db) ~spp (`fasta ali)
|> Iqtree.treefile
let estimated_nucleotide_tree db =
......@@ -346,17 +339,17 @@ let estimated_amino_acid_tree db =
let open Bistro_bio in
let ali =
branch_length_estimation_concatenate db
|> phylip_aa_of_nuc
|> U.amino_acid_fasta_of_nucleotide_fasta
in
let model = Iqtree.model_spec ~freq_type:`F ~rate_type:(`R None) `LG in
Iqtree.iqtree ~te:(omm_tree_of_db db) ~m:model (`phylip ali)
Iqtree.iqtree ~te:(omm_tree_of_db db) ~m:model (`fasta ali)
|> Iqtree.treefile
let estimated_codon_tree db =
let open Bistro_bio in
let ali = branch_length_estimation_concatenate db in
let model = Iqtree.model_spec ~freq_type:`F `ECMK07 in
Iqtree.iqtree ~te:(omm_tree_of_db db) ~m:model ~st:(`CODON 1) (`phylip ali)
Iqtree.iqtree ~te:(omm_tree_of_db db) ~m:model ~st:(`CODON 1) (`fasta ali)
|> Iqtree.treefile
let concatenate_calibration ~seed db =
......@@ -387,9 +380,8 @@ let concatenate_calibration ~seed db =
module Q = struct
type t = query
let alignment q : phylip file =
let alignment q : nucleotide_fasta file =
Workflow.input (q.alignment :> string)
|> remove_unobserved_sequences_from_alignment
let omm_tree_with_branch_lengths ~branch_length_unit = match branch_length_unit with
| `Nucleotide -> estimated_nucleotide_tree
......@@ -403,8 +395,7 @@ module Q = struct
q.convergent_species)
(alignment q)
let nucleotide_alignment q =
fasta_of_phylip (alignment q)
let nucleotide_alignment q = alignment q
end
include Q
......@@ -578,7 +569,7 @@ let ranking_of_results ~alignment_ids ~convergent_species (alignments : aminoaci
let site_ranking ?subset ~meth ~convergent_species db =
let detection_method, column_label = match meth with
| `multinomial_asymptotic_lrt ->
multinomial_asymptotic_lrt, "1MinusLRT"
multinomial_asymptotic_lrt, "Multinomial_1mp"
| `tdg09 ->
failsafe_tdg09, "Tdg09_1MinusFDR"
in
......@@ -638,13 +629,13 @@ module Binary_phenotype = struct
let parse (line:string list) =
match line with
| [name; genus; species; is_convergent] ->
| [name; genus; species; is_convergent] ->
(match bool_of_word is_convergent with
| Ok is_convergent -> Ok {name;genus;species;is_convergent}
| Error msg -> Error msg)
| Error msg -> Error msg)
| _ -> Error "Invalid line format encountered"
let list_of_file filename =
let list_of_file filename =
let open Result.Monad_infix in
let lines = In_channel.read_lines filename
|> List.map ~f:(String.split ~on:'\t') in
......@@ -652,11 +643,11 @@ module Binary_phenotype = struct
Result.all (List.map ~f:parse split_lines)
let convergent_species_list (phenotypes:t list) =
List.filter phenotypes ~f:(fun p -> p.is_convergent)
List.filter phenotypes ~f:(fun p -> p.is_convergent)
|> List.map ~f:(fun p -> p.name)
let load_convergent_species phenotype_tsv =
let f = fun%workflow () ->
let load_convergent_species phenotype_tsv =
let f = fun%workflow () ->
match list_of_file [%path phenotype_tsv] with
| Ok phenotypes -> convergent_species_list phenotypes
| Error msg -> failwith msg
......@@ -692,11 +683,12 @@ let rer_converge ?transform ?weighted ?scale ?continuous ?(summary_n_genes = 10)
let clipped_tree =
clip_tree_on_alignment ~handle_tags:false species_tree q
in
let gene_tree = Iqtree.iqtree
let gene_tree =
Iqtree.iqtree
~keep_ident:true ~te:clipped_tree
(*~m:model*) ~st:`DNA ~spp
(`phylip q)
|> Iqtree.treefile
(`fasta q)
|> Iqtree.treefile
in
RER.match_species_tree_position ~gene_tree ~clipped_species_tree:clipped_tree
)
......@@ -708,17 +700,15 @@ let rer_converge ?transform ?weighted ?scale ?continuous ?(summary_n_genes = 10)
let rds = RER.rer_converge ?transform ?weighted ?scale ?continuous
~min_specs:10 ~master_tree:species_tree ~gene_tree_set ~phenotypes () in
List.map RER.[All; Ancestral; Terminal] ~f:(fun clade ->
List.map RER.[All; Ancestral; Terminal] ~f:(fun clade ->
let result_table = RER.result_table_of_rds rds clade in
let best_candidate_summary = RER.best_candidate_summary
~n_genes:summary_n_genes rds clade
let best_candidate_summary = RER.best_candidate_summary
~n_genes:summary_n_genes rds clade
in
let rer_plot ~pat =
let rer_plot ~pat =
search_alignments ~pat db
|> List.map ~f:(fun (Alignment (_, ali)) -> family_id_of_alignment ali)
|> RER.plot_relative_rates rds clade
in
clade, { result_table; best_candidate_summary; rer_plot; }
)
......@@ -20,8 +20,6 @@ val query :
val family_id : query -> string
val alignment : query -> phylip file
val tree_of_db :
Orthomam_db.t ->
branch_length_unit:[`Nucleotide | `Amino_acid | `Codon] ->
......@@ -50,7 +48,7 @@ val concatenate :
?seed:int ->
Orthomam_db.t ->
int ->
phylip file
nucleotide_fasta file
(** Produces a concatenate of gene alignments in a
database. Alignments where strictly more than
[max_missing_sequences] leaves are missing are discarded, as well
......
......@@ -10,21 +10,21 @@ let make path =
let list_alignments path =
Sys.readdir path
|> Array.to_list
|> List.filter ~f:(String.is_suffix ~suffix:".ali")
|> List.filter ~f:(String.is_suffix ~suffix:".fasta")
|> List.map ~f:(Filename.concat path)
let tree path = Filename.concat path "all.tree"
let tree path = Filename.concat path "orthomam.nw"
let id_of_alignment ali =
String.chop_suffix_exn (Filename.basename ali) ~suffix:".ali"
String.chop_suffix_exn (Filename.basename ali) ~suffix:".fasta"
let is_gap c = Char.(c = '?' || c = '-')
let alignment_number_of_missing_sequences (ali : Phylogenetics.Phylip.t) =
let alignment_number_of_missing_sequences (ali : Biotk.Fasta.item list) =
let missing_sequence seq =
String.count seq ~f:is_gap = String.length seq
in
List.count ali.items ~f:(fun it -> missing_sequence it.sequence)
List.count ali ~f:(fun it -> missing_sequence it.sequence)
let remove_gaps s =
String.filter s ~f:(Fn.non is_gap)
......@@ -86,6 +86,7 @@ let check_orf seq : (unit, orf_defect) result =
|> Option.value_map ~f:Result.fail ~default:(Ok ())
type alignment_defect = [
| `Sequences_with_different_lengths
| `Length_not_multiple_of_3
| `Invalid_sequences of (string * aligned_sequence_defect) list
]
......@@ -113,33 +114,38 @@ let check_alignment_sequence seq =
if n mod 3 = 0 then loop 0
else invalid_arg "sequence length is not a multiple of 3"
let check_sequences_of_alignment ali =
List.filter_map ali.Phylogenetics.Phylip.items ~f:(fun it ->
let check_sequences_of_alignment (fasta : Biotk.Fasta.item list) =
List.filter_map fasta ~f:(fun it ->
match check_alignment_sequence it.sequence with
| Ok () -> None
| Error e -> Some (it.name, e)
| Error e -> Some (it.description, e)
)
let check_alignment ali : alignment_defect option =
let open Phylogenetics in
if ali.Phylip.sequence_length mod 3 = 0 then
match check_sequences_of_alignment ali with
| [] -> None
| _ :: _ as xs -> Some (`Invalid_sequences xs)
else
Some `Length_not_multiple_of_3
let check_alignment (fasta : Biotk.Fasta.item list) : alignment_defect option =
match fasta with
| [] -> None
| h :: t ->
let n = String.length h.sequence in
if n mod 3 = 0 then
Some `Length_not_multiple_of_3
else if List.exists t ~f:(fun s -> String.length s.sequence <> n) then
Some `Sequences_with_different_lengths
else
match check_sequences_of_alignment fasta with
| [] -> None
| _ :: _ as xs -> Some (`Invalid_sequences xs)
let missing_sequences_cdf db dest =
let open Phylogenetics in
let all_alignments =
list_alignments db
|> List.map ~f:(Phylip.read_exn ~strict:false)
|> List.map ~f:Biotk.Fasta.from_file_exn
|> List.map ~f:snd
in
let filtered_alignment =
List.filter all_alignments ~f:(fun ali ->
match check_alignment ali with
| None -> true
| Some `Length_not_multiple_of_3 -> false
| Some (`Length_not_multiple_of_3 | `Sequences_with_different_lengths) -> false
| Some (`Invalid_sequences _) -> true
)
in
......
......@@ -21,6 +21,7 @@ val check_orf : string -> (unit, orf_defect) result
val remove_gaps : string -> string
type alignment_defect = [
| `Sequences_with_different_lengths
| `Length_not_multiple_of_3
| `Invalid_sequences of (string * aligned_sequence_defect) list
]
......@@ -30,4 +31,4 @@ and aligned_sequence_defect = [
| `Stop_codon_at_last_position
]
val check_alignment : Phylogenetics.Phylip.t -> alignment_defect option
val check_alignment : Biotk.Fasta.item list -> alignment_defect option
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment