Commit 8bfb1fb9 authored by Philippe Veber's avatar Philippe Veber
Browse files

Alistat: refactoring of nucleotide_fasta_gc

- avoid n^2 string copy
- allow sequences of various sizes
- don't expect sequences to be of length 0 modulo 3
parent 9fb805cc
......@@ -10,17 +10,22 @@ let is_gc ?thirdpos:(tp=false) i c =
| true when not ((i mod 3) = 2) -> false
| _ -> c = 'g' || c = 'G' || c = 'c' || c = 'C'
let string_from_fasta fa =
let strings_from_fasta fa =
match Alignment.from_fasta fa with
| Ok al -> al.sequences |> Array.fold ~init:"" ~f:(fun acc s -> String.concat [acc ; s])
| Ok al -> al.sequences
| Error `Fasta_parser_error _ -> failwith "parsing error"
| Error `Msg msg -> failwith msg
let nucleotide_fasta_gc ?thirdpos:(tp=false) fa =
let als = string_from_fasta fa in
let len = if tp then String.length als / 3 else String.length als in
let sum = String.foldi ~init:0 ~f:(fun i a c -> a + if is_gc ~thirdpos:tp i c then 1 else 0) als in
(float_of_int sum) /. (float_of_int len)
let seqs = strings_from_fasta fa in
let gc_counts = Array.map seqs ~f:(fun seq ->
let len = if tp then String.length seq / 3 else String.length seq in
let sum = String.foldi ~init:0 ~f:(fun i a c -> a + if is_gc ~thirdpos:tp i c then 1 else 0) seq in
float len, float sum
)
in
let total_len = Array.fold gc_counts ~init:0. ~f:(fun acc (n,_) -> acc +. n) in
Array.fold gc_counts ~init:0. ~f:(fun acc (len, k) -> acc +. k *. len /. total_len)
let main ~alignment () =
match Alignment.from_fasta alignment with
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment