Commit 702df100 authored by Philippe Veber's avatar Philippe Veber
Browse files

refactored simulation to use tk

parent 194e8dde
......@@ -32,7 +32,7 @@ type t =
}
| Convdet_simulation of {
tree : tree ;
branch_factor : float ;
branch_scale : float ;
profiles : string ;
n_h0 : int ;
n_ha : int ;
......@@ -60,7 +60,7 @@ let bppseqgen_simulation ~hyp ~tree ~profiles ~nb_sites ~seed =
seed ;
}
let convdet_simulation ?(branch_factor = 1.) ?(ne_s = 1., 1.) ?(gBGC = 0., 0.) ?(seed = 0) ~tree ~profiles ~n_h0 ~n_ha () =
let convdet_simulation ?(branch_scale = 1.) ?(ne_s = 1., 1.) ?(gBGC = 0., 0.) ?(seed = 0) ~tree ~profiles ~n_h0 ~n_ha () =
Convdet_simulation {
tree ;
profiles ;
......@@ -68,7 +68,7 @@ let convdet_simulation ?(branch_factor = 1.) ?(ne_s = 1., 1.) ?(gBGC = 0., 0.) ?
n_ha ;
ne_s ;
gBGC ;
branch_factor ;
branch_scale ;
seed : int ;
}
......@@ -127,11 +127,11 @@ let rec nucleotide_alignment = function
let h0 = nucleotide_alignment (Bppseqgen_simulation { hypothesis = H0 (Fixed ne_s) ; profiles ; seed ; nb_sites = n_h0 ; tree }) in
let ha = nucleotide_alignment (Bppseqgen_simulation { hypothesis = HaPC (Fixed ne_s) ; profiles ; seed ; nb_sites = n_ha ; tree }) in
Utils.fasta_cappend h0 ha
| Convdet_simulation { n_h0 ; n_ha ; profiles ; ne_s ; gBGC ; branch_factor ; seed ; _ } as sim ->
| Convdet_simulation { n_h0 ; n_ha ; profiles ; ne_s ; gBGC ; branch_scale ; seed ; _ } as sim ->
let tree = tree ~branch_length_unit:`Nucleotide sim in
let fitness_profiles = Workflow.input profiles in
Simulator.simulator ~branch_factor ~n_ha ~n_h0 ~ne_s ~gBGC ~tree ~seed ~fitness_profiles ()
|> fst
Simulator.simulation ~branch_scale ~n_ha ~n_h0 ~ne_s ~gBGC ~tree ~seed ~fitness_profiles ()
|> Simulator.alignment_of_simulation
include Detection_pipeline.Make(struct
type nonrec t = t
......@@ -207,8 +207,8 @@ let benchmark2 d =
]
(* param exploration for SMBE paper *)
(* type branch_factor_t = float *)
let branch_factor_range = [ 1.; 3.; 6.; 9. ]
(* type branch_scale_t = float *)
let branch_scale_range = [ 1.; 3.; 6.; 9. ]
type gBGC_t = Global of float | Convergent of float * float
let gBGC_range =
......@@ -221,7 +221,7 @@ let gBGC_range =
type param_t = float * gBGC_t
let explore_params ~(f: param_t -> _) =
List.map branch_factor_range ~f:(fun (bf:float) ->
List.map branch_scale_range ~f:(fun (bf:float) ->
List.map gBGC_range ~f:(fun (gbgc:gBGC_t) -> ((bf, gbgc), f (bf, gbgc)))
) |> List.concat
......@@ -230,7 +230,7 @@ let simu_of_param ?n_h0:(n_h0=50) (p: param_t) =
convdet_simulation
~tree:(NHX "example/trees_analyses/C4AmaranthaceaePolyroot.nhx")
~profiles:"example/aa_fitness/263SelectedProfiles.tsv"
~branch_factor:bf
~branch_scale:bf
~gBGC:(match gbgc with
| Convergent (a,c) -> (a, c)
| Global g -> (g, g))
......
......@@ -29,7 +29,7 @@ val bppseqgen_simulation :
t
val convdet_simulation :
?branch_factor:float ->
?branch_scale:float ->
?ne_s:float * float ->
?gBGC:float * float ->
?seed:int ->
......
open Core_kernel
let%pworkflow simulator ?branch_factor ?seed ~n_h0 ~n_ha ~ne_s:(ne_s0, ne_s1) ~gBGC:(gBGC0, gBGC1) ~tree ~fitness_profiles () =
let%workflow simulation ?branch_scale ?seed ~n_h0 ~n_ha ~ne_s:(ne_s0, ne_s1) ~gBGC:(gBGC0, gBGC1) ~tree ~fitness_profiles () =
let open Phylogenetics in
let module Convsim = Phylogenetics_convergence.Simulator in
let () = Option.iter ~f:Random.init [%param seed] in
let n_h0 = [%param n_h0] in
let n_ha = [%param n_ha] in
......@@ -10,97 +9,48 @@ let%pworkflow simulator ?branch_factor ?seed ~n_h0 ~n_ha ~ne_s:(ne_s0, ne_s1) ~g
let ne_s1 = [%param ne_s1] in
let gBGC0 = [%param gBGC0] in
let gBGC1 = [%param gBGC1] in
let branch_factor = [%param branch_factor] in
let tree = Codepitk.Utils.tree_from_file ?alpha:branch_factor [%path tree] in
let fitness_profiles = Phylogenetics_convergence.Profile_tsv.(read [%path fitness_profiles] |> to_fitness) in
let rescale_fitness beta = Amino_acid.Vector.map ~f:(( *. ) beta) in
let base_param =
let p = Mutsel.random_param ~alpha_nucleotide:10. ~alpha_fitness:0.1 in
{ p with omega = 1. }
in
let random_profile beta =
Random.int (Array.length fitness_profiles)
|> Array.get fitness_profiles
|> Amino_acid.Vector.of_array_exn
|> rescale_fitness beta
in
let h0_params = Array.init n_h0 ~f:(fun _ ->
let profile = random_profile 1. in
let p = { base_param with gBGC = gBGC0 ; scaled_fitness = rescale_fitness ne_s0 profile } in
let q = { base_param with gBGC = gBGC1 ; scaled_fitness = rescale_fitness ne_s1 profile } in
(p, q)
)
in
let most_probable_aa (pref : Amino_acid.vector) =
let pref = Amino_acid.Vector.to_array pref in
let arr = Array.mapi pref ~f:(fun i x -> x, i) in
match Array.max_elt ~compare:Poly.compare arr with
| None -> assert false
| Some (_, i) -> i
in
let ha_params = Array.init n_ha ~f:(fun _ ->
let rec loop () =
let p = random_profile ne_s0 in
let q = random_profile ne_s1 in
if Poly.equal p q || most_probable_aa p = most_probable_aa q then loop ()
else
let p = { base_param with scaled_fitness = p ; gBGC = gBGC0 } in
let q = { base_param with scaled_fitness = q ; gBGC = gBGC1 } in
(p, q)
in
loop ()
)
in
let params_of_cond i cond =
let p, q =
if i < n_h0 then h0_params.(i)
else ha_params.(i - n_h0)
in
match cond with
| `Ancestral -> p
| `Convergent -> q
let branch_scale = [%param branch_scale] in
let tree =
Codepitk.Convergence_tree.from_file [%path tree]
|> Rresult.R.get_ok
in
let params i bi =
params_of_cond i (Convsim.Branch_info.condition bi)
let fitness_profiles =
Phylogenetics_convergence.Profile_tsv.(
read [%path fitness_profiles]
|> to_fitness
|> Array.map ~f: Amino_acid.Vector.of_array_exn
) in
Codepitk.Simulator.Site_independent_mutsel.simulation
?branch_scale ?seed ~n_h0 ~n_ha ~gBGC:(gBGC0, gBGC1)
~ne_s:(ne_s0, ne_s1) ~tree ~fitness_profiles ()
let%pworkflow alignment_of_simulation sim =
let sim : Codepitk.Simulator.Site_independent_mutsel.simulation =
[%eval sim]
in
let root_condition = `Ancestral in
let root_dists = Array.init (n_h0 + n_ha) ~f:(fun i ->
Mutsel.stationary_distribution (params_of_cond i root_condition)
|> Mutsel.NSCodon.Vector.to_array
|> Mutsel.NSCodon.Table.of_array_exn
let species_name = Phylogenetics.Tree.leaves sim.tree in
Out_channel.with_file [%dest] ~f:(fun oc ->
List.iter2_exn species_name sim.sequences ~f:(fun description sequence ->
fprintf oc ">%s\n%s\n" description (sequence :> string)
)
)
let%pworkflow fitness_histogram sim =
let open Phylogenetics in
let sim : Codepitk.Simulator.Site_independent_mutsel.simulation =
[%eval sim]
in
let root = Convsim.Mutsel.hmm0 ~len:(n_h0 + n_ha) ~dist:(Array.get root_dists) in
let ali = Convsim.Mutsel.alignment tree ~root params in
let species_name =
Phylogenetics.Tree.leaves tree
|> List.map ~f:(fun { name } -> Option.value_exn name) in
let save_alignment fn =
Out_channel.with_file fn ~f:(fun oc ->
List.iter2_exn species_name ali ~f:(fun description sequence ->
fprintf oc ">%s\n%s\n" description (sequence :> string)
)
let params = Array.append sim.h0_params sim.ha_params in
let data =
Array.fold params ~init:[] ~f:(fun acc (p,q) ->
(Amino_acid.Vector.to_array p.scaled_fitness) ::
(Amino_acid.Vector.to_array q.scaled_fitness) :: acc
)
|> Array.concat
in
let save_fitness_histogram dest =
let data =
Array.fold (Array.append h0_params ha_params) ~init:[] ~f:(fun acc (p,q) ->
(Amino_acid.Vector.to_array p.scaled_fitness) :: (Amino_acid.Vector.to_array q.scaled_fitness) :: acc
)
|> Array.concat
in
OCamlR_grDevices.pdf dest ;
ignore (OCamlR_graphics.hist data : OCamlR_graphics.hist) ;
OCamlR_grDevices.dev_off ()
in
Core.Unix.mkdir_p [%dest] ;
save_alignment (Filename.concat [%dest] "simulation.fa") ;
save_fitness_histogram (Filename.concat [%dest] "fitness_histogram.pdf")
let simulator ?branch_factor ?seed ~n_h0 ~n_ha ~ne_s ~gBGC ~tree ~fitness_profiles () =
let dir = simulator ?branch_factor ?seed ~n_h0 ~n_ha ~ne_s ~gBGC ~tree ~fitness_profiles () in
Bistro.Workflow.select dir ["simulation.fa"],
Bistro.Workflow.select dir ["fitness_histogram.pdf"]
OCamlR_grDevices.pdf [%dest] ;
ignore (OCamlR_graphics.hist data : OCamlR_graphics.hist) ;
OCamlR_grDevices.dev_off ()
let%pworkflow pair_tree ~branch_length1 ~branch_length2 ~npairs =
let open Phylogenetics in
......
open Bistro
open File_formats
open Codepitk
val simulator :
?branch_factor:float ->
val simulation :
?branch_scale:float ->
?seed:int ->
n_h0:int ->
n_ha:int ->
......@@ -11,7 +12,15 @@ val simulator :
tree:nhx file ->
fitness_profiles:#text file ->
unit ->
nucleotide_fasta file * pdf file
Simulator.Site_independent_mutsel.simulation workflow
val alignment_of_simulation :
Simulator.Site_independent_mutsel.simulation workflow ->
nucleotide_fasta file
val fitness_histogram :
Simulator.Site_independent_mutsel.simulation workflow ->
pdf file
val pair_tree :
branch_length1:float ->
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment