Commit 95c414ff authored by Philippe Veber's avatar Philippe Veber
Browse files

Use official, fasta based, distribution of orthomam

parent e7746c43
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
(modules orthomam_app) (modules orthomam_app)
(libraries codepi) (libraries codepi)
(preprocess (preprocess
(pps ppx_jane))) (pps ppx_jane ppx_deriving.show)))
(executable (executable
(name lmm_benchmark) (name lmm_benchmark)
...@@ -33,4 +33,4 @@ ...@@ -33,4 +33,4 @@
(name matrix_exponential_bench) (name matrix_exponential_bench)
(public_name matrix_exp) (public_name matrix_exp)
(modules matrix_exponential_bench) (modules matrix_exponential_bench)
(libraries codepi phylogenetics core_bench)) (libraries codepi phylogenetics core_bench))
\ No newline at end of file
...@@ -4,14 +4,21 @@ module Top = Bistro_utils.Toplevel_eval.Make(struct let np = 3 let mem = 10 end) ...@@ -4,14 +4,21 @@ module Top = Bistro_utils.Toplevel_eval.Make(struct let np = 3 let mem = 10 end)
let () = let () =
try try
Codepitk.Orthomam_db.make "/disk/data/omm" Codepitk.Orthomam_db.make "_runs/omm"
|> Orthomam.( |> Orthomam.(
site_ranking site_ranking
~convergent_species:species_with_echolocation ~convergent_species:species_with_echolocation
~meth:`tdg09 ~meth:`multinomial_asymptotic_lrt
) )
|> Top.eval |> Top.eval
|> Core.(Fn.flip List.take 10) |> Core.(Fn.flip List.take 10)
|> List.iter Codepitk.Candidate_site.(fun x -> Option.iter print_endline x.alignment_id) |> List.iter Codepitk.Candidate_site.(fun x ->
Option.iter print_string x.alignment_id ;
Option.iter (Printf.printf " %f\n") x.score ;
let p1, p2 = Codepitk.Candidate_site.profiles x in
print_endline @@ [%show: float array] (p1 :> float array) ;
print_endline @@ [%show: float array] (p2 :> float array) ;
print_endline "====="
)
with with
| Failure _ -> () | Failure msg -> prerr_endline msg
open Core_kernel open Core_kernel
open Bistro open Bistro
open File_formats open File_formats
module U = Utils
open Codepitk open Codepitk
let ensembl_tree : newick file = let ensembl_tree : newick file =
...@@ -20,21 +21,7 @@ let search_alignments ~pat db = ...@@ -20,21 +21,7 @@ let search_alignments ~pat db =
let family_id_of_alignment (alignment : Orthomam_db.alignment) = let family_id_of_alignment (alignment : Orthomam_db.alignment) =
Filename.basename (alignment :> string) Filename.basename (alignment :> string)
|> Fn.flip Filename.chop_suffix "_NT.fasta.ali" |> Fn.flip Filename.chop_suffix "_NT.fasta"
|> String.chop_prefix_exn ~prefix:"all"
let fasta_of_phylip ali =
let f = fun%workflow dest ->
let ali =
[%path ali]
|> Phylogenetics.Phylip.read_exn ~strict:false
in
List.map ali.items ~f:(fun it ->
Biotk.Fasta.{ description = it.name ; sequence = it.sequence }
)
|> Biotk.Fasta.to_file dest
in
Workflow.path_plugin ~descr:"orthomam.fasta_of_phylip" f
type query = { type query = {
db : Orthomam_db.t ; db : Orthomam_db.t ;
...@@ -47,28 +34,13 @@ let query (Alignment (db, alignment)) ~convergent_species = ...@@ -47,28 +34,13 @@ let query (Alignment (db, alignment)) ~convergent_species =
let family_id q = family_id_of_alignment q.alignment let family_id q = family_id_of_alignment q.alignment
let remove_unobserved_sequences_from_alignment phylip : phylip file = let clip_tree_on_alignment ?(handle_tags = true) (tree:(#newick as 'a) file) (ali: #fasta file) : 'a file =
let f = fun%workflow dest ->
let phylip = [%path phylip] in
let module Phylip = Phylogenetics.Phylip in
let data = Phylip.read_exn ~strict:false phylip in
let threshold = data.sequence_length * 90 / 100 in
let items = List.filter data.items ~f:(fun it ->
String.count it.sequence ~f:(Char.equal '?') < threshold
)
in
let filtered_data = Phylip.make_exn items in
Phylip.write ~strict:false filtered_data dest
in
Workflow.path_plugin ~descr:"orthomam.remove_unobserved_sequences_from_alignment" f
let clip_tree_on_alignment ?(handle_tags = true) (tree:(#newick as 'a) file) (ali: phylip file) : 'a file =
let f = fun%workflow dest -> let f = fun%workflow dest ->
let open Phylogenetics in let open Phylogenetics in
let handle_tags = [%param handle_tags] in let handle_tags = [%param handle_tags] in
let tree = Newick.from_file_exn [%path tree] in let tree = Newick.from_file_exn [%path tree] in
let ali = Phylip.read_exn ~strict:false [%path ali] in let _, items = Biotk.Fasta.from_file_exn [%path ali] in
let ali_species = List.map ali.items ~f:(fun it -> it.name) in let ali_species = List.map items ~f:(fun it -> it.description) in
let remove_nodes_with_single_child = let remove_nodes_with_single_child =
if handle_tags then if handle_tags then
Convergence_tree.remove_nodes_with_single_child Convergence_tree.remove_nodes_with_single_child
...@@ -163,14 +135,53 @@ let number_of_missing_sequences_repartition db = ...@@ -163,14 +135,53 @@ let number_of_missing_sequences_repartition db =
Orthomam_db.missing_sequences_cdf [%param db] dest Orthomam_db.missing_sequences_cdf [%param db] dest
) )
let concatenate ?(nmissing = 0) ?seed db n : phylip file = module Fasta_alignment = struct
open Biotk
type t = {
length : int ;
nb_sequences : int ;
items : Fasta.item list ;
}
let project ~species ali =
let table =
List.map ali.items ~f:(fun it -> it.description, it)
|> String.Table.of_alist_exn
in
let mock = String.make ali.length '?' in
let items = List.map species ~f:(fun sp ->
match String.Table.find table sp with
| Some it -> it
| None -> { Fasta.description = sp ; sequence = mock }
)
in
let nb_sequences = List.length species in
{ ali with items ; nb_sequences }
let from_file_exn fn =
match Fasta.from_file fn with
| Ok (_, (first :: others as items)) ->
let length = String.length first.sequence in
if List.for_all others ~f:(fun s -> String.length s.sequence = length) then
let nb_sequences = List.length items in
{ length ; nb_sequences ; items }
else failwith "not all sequences have the same length is this alignment"
| Ok (_, []) -> failwith "empty fasta file"
| Error msg -> failwith msg
end
let concatenate ?(nmissing = 0) ?seed db n : nucleotide_fasta file =
let f = fun%workflow dest -> let f = fun%workflow dest ->
let db = [%param db] in let db = [%param db] in
let seed = [%param seed] in let seed = [%param seed] in
let n = [%param n] in let n = [%param n] in
let open Phylogenetics in let tree = [%path omm_tree_of_db db] in
let compare_items (x : Phylip.item) (y : Phylip.item) = let all_species =
String.compare x.name y.name let open Phylogenetics in
Newick.from_file_exn tree
|> Newick.with_inner_tree ~f:Tree.leaves
|> List.map ~f:(fun ni -> Option.value_exn ni.name)
in in
let rec at_most_n_failures xs ~n ~f = let rec at_most_n_failures xs ~n ~f =
if n <= 0 then List.for_all xs ~f if n <= 0 then List.for_all xs ~f
...@@ -187,9 +198,9 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file = ...@@ -187,9 +198,9 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file =
Codon.of_string seq Codon.of_string seq
|> Option.bind ~f:Codon.Universal_genetic_code.aa_of_codon |> Option.bind ~f:Codon.Universal_genetic_code.aa_of_codon
in in
let full_sites_of_alignment (alignment : Phylip.t) = let full_sites_of_alignment (alignment : Fasta_alignment.t) =
if alignment.sequence_length mod 3 = 0 then if alignment.length mod 3 = 0 then
Codepitk.Range.fold 0 (alignment.sequence_length / 3) ~init:[] ~f:(fun acc j -> Codepitk.Range.fold 0 (alignment.length / 3) ~init:[] ~f:(fun acc j ->
let column_is_quasi_full = let column_is_quasi_full =
at_most_n_failures alignment.items ~n:nmissing ~f:(fun it -> at_most_n_failures alignment.items ~n:nmissing ~f:(fun it ->
Option.is_some (aa_at_pos it.sequence j) Option.is_some (aa_at_pos it.sequence j)
...@@ -204,11 +215,10 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file = ...@@ -204,11 +215,10 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file =
Array.of_list (full_sites_of_alignment al) Array.of_list (full_sites_of_alignment al)
) )
in in
let build_alignment_from_sites (sites : (Phylip.t * int) array) = let build_alignment_from_sites (sites : (Fasta_alignment.t * int) array) =
assert (Array.length sites > n) ; assert (Array.length sites > n) ;
let al0, _ = sites.(0) in let al0, _ = sites.(0) in
let nb_sequences = al0.number_of_sequences in let sequences = Array.init al0.nb_sequences ~f:(fun _ ->
let sequences = Array.init nb_sequences ~f:(fun _ ->
Bytes.create (3 * n) Bytes.create (3 * n)
) )
in in
...@@ -217,7 +227,10 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file = ...@@ -217,7 +227,10 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file =
List.iteri al.items ~f:(fun j it -> List.iteri al.items ~f:(fun j it ->
match aa_at_pos it.sequence p with match aa_at_pos it.sequence p with
| Some _ -> | Some _ ->
Bytes.set sequences.(j) (3 * i) it.sequence.[3 * p] ; Bytes.set
sequences.(j)
(3 * i)
it.sequence.[3 * p] ;
Bytes.set sequences.(j) (3 * i + 1) it.sequence.[3 * p + 1] ; Bytes.set sequences.(j) (3 * i + 1) it.sequence.[3 * p + 1] ;
Bytes.set sequences.(j) (3 * i + 2) it.sequence.[3 * p + 2] Bytes.set sequences.(j) (3 * i + 2) it.sequence.[3 * p + 2]
| None -> | None ->
...@@ -227,18 +240,20 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file = ...@@ -227,18 +240,20 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file =
) )
done ; done ;
List.mapi al0.items ~f:(fun i it -> List.mapi al0.items ~f:(fun i it ->
{ Phylip.sequence = Bytes.to_string sequences.(i) ; name = it.name } { it with sequence = Bytes.to_string sequences.(i) }
) )
in in
let load_alignment (ali_fn : Orthomam_db.alignment) =
let ali = Fasta_alignment.from_file_exn (ali_fn :> string) in
if ali.length mod 3 = 0 then
Some (
Fasta_alignment.project ~species:all_species ali
)
else None
in
let all_alignments = let all_alignments =
Orthomam_db.list_alignments db Orthomam_db.list_alignments db
|> List.filter_map ~f:(fun ali_fn -> |> List.filter_map ~f:load_alignment
let ali = Phylip.read_exn ~strict:false (ali_fn :> string) in
if ali.sequence_length mod 3 = 0 then
Phylip.make_exn (List.sort ali.items ~compare:compare_items)
|> Option.some
else None
)
in in
let all_full_sites = all_full_sites all_alignments in let all_full_sites = all_full_sites all_alignments in
if Array.length all_full_sites < n then failwith "not enough sites" ; if Array.length all_full_sites < n then failwith "not enough sites" ;
...@@ -250,7 +265,7 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file = ...@@ -250,7 +265,7 @@ let concatenate ?(nmissing = 0) ?seed db n : phylip file =
Gsl.Rng.set rng seed ; Gsl.Rng.set rng seed ;
Gsl.Randist.shuffle rng all_full_sites ; Gsl.Randist.shuffle rng all_full_sites ;
let items = build_alignment_from_sites all_full_sites in let items = build_alignment_from_sites all_full_sites in
Phylip.write ~strict:false (Phylip.make_exn items) dest Biotk.Fasta.to_file dest items
in in
Workflow.path_plugin ~descr:"orthomam.concatenate" ~mem:(Workflow.int 4096) f Workflow.path_plugin ~descr:"orthomam.concatenate" ~mem:(Workflow.int 4096) f
...@@ -296,31 +311,9 @@ let concatenate_calibration_figure ~nsites ~trees = ...@@ -296,31 +311,9 @@ let concatenate_calibration_figure ~nsites ~trees =
in in
Workflow.path_plugin ~descr:"orthomam.concatenate_calibration_figure" f Workflow.path_plugin ~descr:"orthomam.concatenate_calibration_figure" f
let phylip_aa_of_nuc (ali : phylip file) : phylip file =
let f = fun%workflow dest ->
let open Phylogenetics in
let input_ali = Phylip.read_exn ~strict:false [%path ali] in
let items = List.map input_ali.items ~f:(fun it ->
let sequence =
it.sequence
|> Codepitk.Utils.translate_nucleotide_sequence_whatever_it_takes
in
{ it with Phylip.sequence }
)
in
Phylip.write ~strict:false (Phylip.make_exn items) dest
in
Workflow.path_plugin ~descr:"orthomam.phylip_aa_of_nuc" f
let iqtree_nexus_partition_file_of_alignment ali : nexus file = let iqtree_nexus_partition_file_of_alignment ali : nexus file =
let f = fun%workflow dest -> let f = fun%workflow dest ->
let maybe_nb_columns = let nb_columns = (Fasta_alignment.from_file_exn [%path ali]).length in
let open Option in
In_channel.with_file [%path ali] ~f:In_channel.input_line >>= fun ali_first_line ->
String.lsplit2 ali_first_line ~on:'\t' >>= fun (_, n) ->
return (Int.of_string n)
in
let nb_columns = Option.value_exn ~message:"Could not parse phylip alignment first line" maybe_nb_columns in
Out_channel.with_file dest ~f:(fun oc -> Out_channel.with_file dest ~f:(fun oc ->
fprintf oc fprintf oc
{|#nexus begin sets; charset part1 = 1-%d\3; charset part2 = 2-%d\3; charset part3 = 3-%d\3; charpartition mine = GTR+R:part1, GTR+R:part2, GTR+R:part3; end;|} {|#nexus begin sets; charset part1 = 1-%d\3; charset part2 = 2-%d\3; charset part3 = 3-%d\3; charpartition mine = GTR+R:part1, GTR+R:part2, GTR+R:part3; end;|}
...@@ -335,7 +328,7 @@ let branch_length_estimation_concatenate db = ...@@ -335,7 +328,7 @@ let branch_length_estimation_concatenate db =
let nucleotide_tree_estimation db ali = let nucleotide_tree_estimation db ali =
let open Bistro_bio in let open Bistro_bio in
let spp = iqtree_nexus_partition_file_of_alignment ali in let spp = iqtree_nexus_partition_file_of_alignment ali in
Iqtree.iqtree ~st:`DNA ~te:(omm_tree_of_db db) ~spp (`phylip ali) Iqtree.iqtree ~st:`DNA ~te:(omm_tree_of_db db) ~spp (`fasta ali)
|> Iqtree.treefile |> Iqtree.treefile
let estimated_nucleotide_tree db = let estimated_nucleotide_tree db =
...@@ -346,17 +339,17 @@ let estimated_amino_acid_tree db = ...@@ -346,17 +339,17 @@ let estimated_amino_acid_tree db =
let open Bistro_bio in let open Bistro_bio in
let ali = let ali =
branch_length_estimation_concatenate db branch_length_estimation_concatenate db
|> phylip_aa_of_nuc |> U.amino_acid_fasta_of_nucleotide_fasta
in in
let model = Iqtree.model_spec ~freq_type:`F ~rate_type:(`R None) `LG in let model = Iqtree.model_spec ~freq_type:`F ~rate_type:(`R None) `LG in
Iqtree.iqtree ~te:(omm_tree_of_db db) ~m:model (`phylip ali) Iqtree.iqtree ~te:(omm_tree_of_db db) ~m:model (`fasta ali)
|> Iqtree.treefile |> Iqtree.treefile
let estimated_codon_tree db = let estimated_codon_tree db =
let open Bistro_bio in let open Bistro_bio in
let ali = branch_length_estimation_concatenate db in let ali = branch_length_estimation_concatenate db in
let model = Iqtree.model_spec ~freq_type:`F `ECMK07 in let model = Iqtree.model_spec ~freq_type:`F `ECMK07 in
Iqtree.iqtree ~te:(omm_tree_of_db db) ~m:model ~st:(`CODON 1) (`phylip ali) Iqtree.iqtree ~te:(omm_tree_of_db db) ~m:model ~st:(`CODON 1) (`fasta ali)
|> Iqtree.treefile |> Iqtree.treefile
let concatenate_calibration ~seed db = let concatenate_calibration ~seed db =
...@@ -387,9 +380,8 @@ let concatenate_calibration ~seed db = ...@@ -387,9 +380,8 @@ let concatenate_calibration ~seed db =
module Q = struct module Q = struct
type t = query type t = query
let alignment q : phylip file = let alignment q : nucleotide_fasta file =
Workflow.input (q.alignment :> string) Workflow.input (q.alignment :> string)
|> remove_unobserved_sequences_from_alignment
let omm_tree_with_branch_lengths ~branch_length_unit = match branch_length_unit with let omm_tree_with_branch_lengths ~branch_length_unit = match branch_length_unit with
| `Nucleotide -> estimated_nucleotide_tree | `Nucleotide -> estimated_nucleotide_tree
...@@ -403,8 +395,7 @@ module Q = struct ...@@ -403,8 +395,7 @@ module Q = struct
q.convergent_species) q.convergent_species)
(alignment q) (alignment q)
let nucleotide_alignment q = let nucleotide_alignment q = alignment q
fasta_of_phylip (alignment q)
end end
include Q include Q
...@@ -578,7 +569,7 @@ let ranking_of_results ~alignment_ids ~convergent_species (alignments : aminoaci ...@@ -578,7 +569,7 @@ let ranking_of_results ~alignment_ids ~convergent_species (alignments : aminoaci
let site_ranking ?subset ~meth ~convergent_species db = let site_ranking ?subset ~meth ~convergent_species db =
let detection_method, column_label = match meth with let detection_method, column_label = match meth with
| `multinomial_asymptotic_lrt -> | `multinomial_asymptotic_lrt ->
multinomial_asymptotic_lrt, "1MinusLRT" multinomial_asymptotic_lrt, "Multinomial_1mp"
| `tdg09 -> | `tdg09 ->
failsafe_tdg09, "Tdg09_1MinusFDR" failsafe_tdg09, "Tdg09_1MinusFDR"
in in
...@@ -638,13 +629,13 @@ module Binary_phenotype = struct ...@@ -638,13 +629,13 @@ module Binary_phenotype = struct
let parse (line:string list) = let parse (line:string list) =
match line with match line with
| [name; genus; species; is_convergent] -> | [name; genus; species; is_convergent] ->
(match bool_of_word is_convergent with (match bool_of_word is_convergent with
| Ok is_convergent -> Ok {name;genus;species;is_convergent} | Ok is_convergent -> Ok {name;genus;species;is_convergent}
| Error msg -> Error msg) | Error msg -> Error msg)
| _ -> Error "Invalid line format encountered" | _ -> Error "Invalid line format encountered"
let list_of_file filename = let list_of_file filename =
let open Result.Monad_infix in let open Result.Monad_infix in
let lines = In_channel.read_lines filename let lines = In_channel.read_lines filename
|> List.map ~f:(String.split ~on:'\t') in |> List.map ~f:(String.split ~on:'\t') in
...@@ -652,11 +643,11 @@ module Binary_phenotype = struct ...@@ -652,11 +643,11 @@ module Binary_phenotype = struct
Result.all (List.map ~f:parse split_lines) Result.all (List.map ~f:parse split_lines)
let convergent_species_list (phenotypes:t list) = let convergent_species_list (phenotypes:t list) =
List.filter phenotypes ~f:(fun p -> p.is_convergent) List.filter phenotypes ~f:(fun p -> p.is_convergent)
|> List.map ~f:(fun p -> p.name) |> List.map ~f:(fun p -> p.name)
let load_convergent_species phenotype_tsv = let load_convergent_species phenotype_tsv =
let f = fun%workflow () -> let f = fun%workflow () ->
match list_of_file [%path phenotype_tsv] with match list_of_file [%path phenotype_tsv] with
| Ok phenotypes -> convergent_species_list phenotypes | Ok phenotypes -> convergent_species_list phenotypes
| Error msg -> failwith msg | Error msg -> failwith msg
...@@ -692,11 +683,12 @@ let rer_converge ?transform ?weighted ?scale ?continuous ?(summary_n_genes = 10) ...@@ -692,11 +683,12 @@ let rer_converge ?transform ?weighted ?scale ?continuous ?(summary_n_genes = 10)
let clipped_tree = let clipped_tree =
clip_tree_on_alignment ~handle_tags:false species_tree q clip_tree_on_alignment ~handle_tags:false species_tree q
in in
let gene_tree = Iqtree.iqtree let gene_tree =
Iqtree.iqtree
~keep_ident:true ~te:clipped_tree ~keep_ident:true ~te:clipped_tree
(*~m:model*) ~st:`DNA ~spp (*~m:model*) ~st:`DNA ~spp
(`phylip q) (`fasta q)
|> Iqtree.treefile |> Iqtree.treefile
in in
RER.match_species_tree_position ~gene_tree ~clipped_species_tree:clipped_tree RER.match_species_tree_position ~gene_tree ~clipped_species_tree:clipped_tree
) )
...@@ -708,17 +700,15 @@ let rer_converge ?transform ?weighted ?scale ?continuous ?(summary_n_genes = 10) ...@@ -708,17 +700,15 @@ let rer_converge ?transform ?weighted ?scale ?continuous ?(summary_n_genes = 10)
let rds = RER.rer_converge ?transform ?weighted ?scale ?continuous let rds = RER.rer_converge ?transform ?weighted ?scale ?continuous
~min_specs:10 ~master_tree:species_tree ~gene_tree_set ~phenotypes () in ~min_specs:10 ~master_tree:species_tree ~gene_tree_set ~phenotypes () in
List.map RER.[All; Ancestral; Terminal] ~f:(fun clade -> List.map RER.[All; Ancestral; Terminal] ~f:(fun clade ->
let result_table = RER.result_table_of_rds rds clade in let result_table = RER.result_table_of_rds rds clade in
let best_candidate_summary = RER.best_candidate_summary let best_candidate_summary = RER.best_candidate_summary
~n_genes:summary_n_genes rds clade ~n_genes:summary_n_genes rds clade
in in
let rer_plot ~pat = let rer_plot ~pat =
search_alignments ~pat db search_alignments ~pat db
|> List.map ~f:(fun (Alignment (_, ali)) -> family_id_of_alignment ali) |> List.map ~f:(fun (Alignment (_, ali)) -> family_id_of_alignment ali)
|> RER.plot_relative_rates rds clade |> RER.plot_relative_rates rds clade
in in
clade, { result_table; best_candidate_summary; rer_plot; } clade, { result_table; best_candidate_summary; rer_plot; }
) )
...@@ -20,8 +20,6 @@ val query : ...@@ -20,8 +20,6 @@ val query :
val family_id : query -> string val family_id : query -> string
val alignment : query -> phylip file
val tree_of_db : val tree_of_db :
Orthomam_db.t -> Orthomam_db.t ->
branch_length_unit:[`Nucleotide | `Amino_acid | `Codon] -> branch_length_unit:[`Nucleotide | `Amino_acid | `Codon] ->
...@@ -50,7 +48,7 @@ val concatenate : ...@@ -50,7 +48,7 @@ val concatenate :
?seed:int -> ?seed:int ->
Orthomam_db.t -> Orthomam_db.t ->
int -> int ->
phylip file nucleotide_fasta file
(** Produces a concatenate of gene alignments in a (** Produces a concatenate of gene alignments in a
database. Alignments where strictly more than database. Alignments where strictly more than
[max_missing_sequences] leaves are missing are discarded, as well [max_missing_sequences] leaves are missing are discarded, as well
......
...@@ -10,21 +10,21 @@ let make path = ...@@ -10,21 +10,21 @@ let make path =
let list_alignments path = let list_alignments path =
Sys.readdir path Sys.readdir path
|> Array.to_list |> Array.to_list
|> List.filter ~f:(String.is_suffix ~suffix:".ali") |> List.filter ~f:(String.is_suffix ~suffix:".fasta")
|> List.map ~f:(Filename.concat path) |> List.map ~f:(Filename.concat path)
let tree path = Filename.concat path "all.tree" let tree path = Filename.concat path "orthomam.nw"
let id_of_alignment ali = let id_of_alignment ali =
String.chop_suffix_exn (Filename.basename ali) ~suffix:".ali" String.chop_suffix_exn (Filename.basename ali) ~suffix:".fasta"
let is_gap c = Char.(c = '?' || c = '-') let is_gap c = Char.(c = '?' || c = '-')
let alignment_number_of_missing_sequences (ali : Phylogenetics.Phylip.t) = let alignment_number_of_missing_sequences (ali : Biotk.Fasta.item list) =
let missing_sequence seq = let missing_sequence seq =
String.count seq ~f:is_gap = String.length seq String.count seq ~f:is_gap = String.length seq
in in
List.count ali.items ~f:(fun it -> missing_sequence it.sequence) List.count ali ~f:(fun it -> missing_sequence it.sequence)
let remove_gaps s = let remove_gaps s =
String.filter s ~f:(Fn.non is_gap) String.filter s ~f:(Fn.non is_gap)
...@@ -86,6 +86,7 @@ let check_orf seq : (unit, orf_defect) result = ...@@ -86,6 +86,7 @@ let check_orf seq : (unit, orf_defect) result =
|> Option.value_map ~f:Result.fail ~default:(Ok ()) |> Option.value_map ~f:Result.fail ~default:(Ok ())
type alignment_defect = [ type alignment_defect = [
| `Sequences_with_different_lengths
| `Length_not_multiple_of_3 | `Length_not_multiple_of_3
| `Invalid_sequences of (string * aligned_sequence_defect) list | `Invalid_sequences of (string * aligned_sequence_defect) list
] ]
...@@ -113,33 +114,38 @@ let check_alignment_sequence seq = ...@@ -113,33 +114,38 @@ let check_alignment_sequence seq =
if n mod 3