Commit a897beaf authored by Philippe Veber's avatar Philippe Veber
Browse files

Simulator: save distance btw profiles at leaves

this commit introduces a measure of how difficult a particular
parameter setting is in average. it computes an expected profile at
the convergent leaves given branch lengths (actually the height of the
tree is taken, which is an upper bound).

Note that I suppressed a check that used to avoid simulations with two
profiles having the same preferred amino acid.
parent 8bfb1fb9
......@@ -127,7 +127,8 @@ let rec nucleotide_alignment = function
| Convdet_simulation { n_h0 ; n_ha ; profiles ; ne_s ; gBGC ; branch_factor ; seed ; _ } as sim ->
let tree = tree sim in
let fitness_profiles = Workflow.input profiles in
Simulator.simulator ~branch_factor ~n_ha ~n_h0 ~ne_s ~gBGC ~tree ~seed ~fitness_profiles ()
let fna, _ = Simulator.simulator ~branch_factor ~n_ha ~n_h0 ~ne_s ~gBGC ~tree ~seed ~fitness_profiles () in
fna
let amino_acid_alignment d = Bppsuite.fna2faa (nucleotide_alignment d)
......
......@@ -2,6 +2,14 @@ open Core_kernel
let%pworkflow simulator ?branch_factor ?seed ~n_h0 ~n_ha ~ne_s ~gBGC ~tree ~fitness_profiles () =
let () = Option.iter ~f:Random.init [%param seed] in
let rec tree_height t = Phylogenetics.Tree.(
match t.branches with
| [] -> 0.
| xs -> List.fold xs ~init:Float.neg_infinity ~f:(fun acc b ->
Float.max acc (fst b.branch_data +. tree_height b.tip)
)
)
in
let n_h0 = [%param n_h0] in
let n_ha = [%param n_ha] in
let ne_s = [%param ne_s] in
......@@ -20,46 +28,71 @@ let%pworkflow simulator ?branch_factor ?seed ~n_h0 ~n_ha ~ne_s ~gBGC ~tree ~fitn
|> Array.get fitness_profiles
|> Convdet.Amino_acid.vector_of_array_exn
in
let random_h0_param () =
let p = random_profile () in
{ base_param with scaled_fitness = p }
let selected_profiles = Array.init n_h0 ~f:(fun _ -> random_profile ()) in
let selected_profile_pairs = Array.init n_ha ~f:(fun _ ->
random_profile (), random_profile ())
in
let most_probable_aa (pref : float Convdet.Amino_acid.vector) =
let arr = Array.mapi (pref :> float array) ~f:(fun i x -> x, i) in
match Array.max_elt ~compare:Poly.compare arr with
| None -> assert false
| Some (_, i) -> i
in
let rec random_ha_param () =
let p = random_profile () in
let q = random_profile () in
if p = q || most_probable_aa p = most_probable_aa q then random_ha_param ()
let param_of_fitness f = { base_param with scaled_fitness = f } in
let params i =
if i < n_h0 then Fn.const (param_of_fitness selected_profiles.(i))
else
let p = { base_param with scaled_fitness = p } in
let q = { base_param with scaled_fitness = q } in
let fp, fq = selected_profile_pairs.(i - n_h0) in
let p = param_of_fitness fp in
let q = param_of_fitness fq in
(function
| 0 -> p
| 1 -> q
| _ -> assert false)
in
let params i =
if i < n_h0 then Fn.const (random_h0_param ())
else random_ha_param () in
let root_condition = Convdet.Simulator.Mutsel.root_condition tree in
let root_dists = Array.init (n_h0 + n_ha) ~f:(fun i ->
Convdet.Mutsel.stationary_distribution (params i root_condition)
)
in
let dists =
let owl_row_vector pi = Convdet.Mutsel.NSCodon.(Owl.Mat.of_array (pi : float vector :> float array) 1 card) in
let tau = tree_height tree in
Array.init n_ha ~f:(fun i ->
let i = n_h0 + i in
let pi_p = Convdet.Mutsel.stationary_distribution (params i root_condition) in
let param_q = params i (1 - root_condition) in
let pmat_q =
Convdet.Rate_matrix.transition_probability_matrix
~tau
~rates:(Convdet.Mutsel.rate_matrix param_q :> float array array)
in
let owl_pi_p = owl_row_vector pi_p in
let owl_pmat_q = Owl.Mat.of_arrays pmat_q in
match Owl.Mat.(to_array @@ l2norm (dot owl_pi_p owl_pmat_q - owl_pi_p)) with
| [| x |] -> x
| _ -> assert false
)
in
let root = Convdet.Simulator.Mutsel.hmm0 ~len:(n_h0 + n_ha) ~dist:(Array.get root_dists) in
let ali = Convdet.Simulator.Mutsel.alignment tree ~root params in
let species_name =
Phylogenetics.Tree.leaves tree
|> List.map ~f:(fun { name } -> Option.value_exn name) in
Out_channel.with_file [%dest] ~f:(fun oc ->
List.iter2_exn species_name ali ~f:(fun description sequence ->
fprintf oc ">%s\n%s\n" description (sequence :> string)
)
)
let save_alignment fn =
Out_channel.with_file fn ~f:(fun oc ->
List.iter2_exn species_name ali ~f:(fun description sequence ->
fprintf oc ">%s\n%s\n" description (sequence :> string)
)
)
in
let save_distances fn =
Array.map dists ~f:Float.to_string
|> Array.to_list
|> Out_channel.write_lines fn
in
Core.Unix.mkdir_p [%dest] ;
save_alignment (Filename.concat [%dest] "simulation.fa") ;
save_distances (Filename.concat [%dest] "distances.tsv")
let simulator ?branch_factor ?seed ~n_h0 ~n_ha ~ne_s ~gBGC ~tree ~fitness_profiles () =
let dir = simulator ?branch_factor ?seed ~n_h0 ~n_ha ~ne_s ~gBGC ~tree ~fitness_profiles () in
Bistro.Workflow.select dir ["simulation.fa"],
Bistro.Workflow.select dir ["distances.tsv"]
let%pworkflow pair_tree ~branch_length1 ~branch_length2 ~npairs =
let open Phylogenetics in
......
......@@ -11,7 +11,7 @@ val simulator :
tree:nhx pworkflow ->
fitness_profiles:#text_file pworkflow ->
unit ->
nucleotide_fasta pworkflow
nucleotide_fasta pworkflow * text_file pworkflow
val pair_tree :
branch_length1:float ->
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment