Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
VEBER Philippe
codepi
Commits
bbdd8e37
Commit
bbdd8e37
authored
Nov 17, 2020
by
Philippe Veber
Browse files
Simulation_dataset: separate in-house simulation and bppseqgen's
parent
265897c0
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
585 additions
and
454 deletions
+585
-454
lib/simulation_dataset.ml
lib/simulation_dataset.ml
+515
-374
lib/simulation_dataset.mli
lib/simulation_dataset.mli
+70
-80
No files found.
lib/simulation_dataset.ml
View file @
bbdd8e37
...
...
@@ -15,28 +15,13 @@ type tree =
branch_length2
:
float
;
}
type
'
s
benchmark
=
{
method_labels
:
string
list
;
method_outputs
:
float
option
array
list
;
average_precision
:
(
float
*
(
float
*
float
))
list
;
site_model
:
'
s
array
;
ancestral_counts
:
int
Phylogenetics
.
Amino_acid
.
table
array
;
convergent_counts
:
int
Phylogenetics
.
Amino_acid
.
table
array
;
}
module
type
S
=
sig
type
t
type
site_model
include
Detection_pipeline
.
Query
with
type
t
:=
t
include
Detection_pipeline
.
S
with
type
query
:=
t
val
alignment_plot
:
t
->
svg
file
type
query
val
benchmark
:
t
->
text
file
list
->
site_model
benchmark
workflow
include
Detection_pipeline
.
Query
with
type
t
:=
query
include
Detection_pipeline
.
S
with
type
query
:=
query
val
rds_of_benchmark
:
site_model
benchmark
workflow
->
rds
file
val
alignment_plot
:
query
->
svg
file
end
let
tree_workflow
=
function
...
...
@@ -46,7 +31,18 @@ let tree_workflow = function
npairs
}
->
Simulator
.
pair_tree
~
branch_length1
~
branch_length2
~
npairs
module
Mutsel_param
=
struct
module
Make
(
Q
:
Detection_pipeline
.
Query
)
=
struct
include
Detection_pipeline
.
Make
(
Q
)
let
alignment_plot
d
=
Convergence_detection
.
plot_convergent_sites
~
tree
:
(
Q
.
tree
~
branch_length_unit
:
`Amino_acid
d
)
~
alignment
:
(
amino_acid_alignment
d
)
~
detection_results
:
(
multinomial_asymptotic_lrt
d
)
()
end
module
Mutsel_query
=
struct
type
t
=
{
tree
:
tree
;
branch_scale
:
float
;
...
...
@@ -73,165 +69,34 @@ module Mutsel_param = struct
let
tree
=
tree_workflow
tree
in
let
fitness_profiles
=
Workflow
.
input
profiles
in
Simulator
.
simulation
~
branch_scale
~
n_ha
~
n_h0
~
ne_s
~
gBGC
~
tree
~
seed
~
fitness_profiles
()
end
type
t
=
|
Bppseqgen
of
{
hypothesis
:
Convergence_hypothesis
.
t
;
tree
:
tree
;
profiles
:
string
;
nb_sites
:
int
;
seed
:
int
;
}
|
Bppseqgen_mixed
of
{
tree
:
tree
;
profiles
:
string
;
seed
:
int
;
n_h0
:
int
;
n_ha
:
int
;
ne_s
:
float
;
}
|
Mutsel
of
Mutsel_param
.
t
let
bppseqgen_mixed
?
(
ne_s
=
1
.
)
?
(
seed
=
0
)
~
tree
~
profiles
~
n_h0
~
n_ha
()
=
Bppseqgen_mixed
{
tree
;
profiles
;
seed
;
n_ha
;
n_h0
;
ne_s
;
}
let
bppseqgen
~
hyp
~
tree
~
profiles
~
nb_sites
~
seed
=
Bppseqgen
{
hypothesis
=
hyp
;
tree
;
profiles
;
nb_sites
;
seed
;
}
let
mutsel
?
branch_scale
?
ne_s
?
gBGC
?
seed
~
tree
~
profiles
~
n_h0
~
n_ha
()
=
Mutsel
(
Mutsel_param
.
make
?
branch_scale
?
ne_s
?
gBGC
?
seed
~
tree
~
profiles
~
n_h0
~
n_ha
()
)
let
of_mutsel_param
p
=
Mutsel
p
let
tree
~
branch_length_unit
:_
=
function
|
Bppseqgen
{
tree
;
_
}
|
Bppseqgen_mixed
{
tree
;
_
}
|
Mutsel
{
tree
;
_
}
->
tree_workflow
tree
let
seed
=
function
|
Bppseqgen_mixed
s
->
s
.
seed
|
Bppseqgen
s
->
s
.
seed
|
Mutsel
s
->
s
.
seed
let
profile
~
nb_sites
~
profiles
~
seed
=
Profile
.
profile_l_of_splitted_profile
~
nb_cat
:
All
~
nb_sites
profiles
~
seed
:
(
calc_fixed_seed
~
str
:
profiles
seed
)
let
bppseqgen_simulation
sim
~
hypothesis
~
nb_sites
~
profiles
=
let
model_prefix
=
Convergence_hypothesis
.
string_of_model
hypothesis
in
let
descr
=
sprintf
".%s"
model_prefix
in
let
profile
=
profile
~
nb_sites
~
profiles
~
seed
:
(
seed
sim
)
in
let
profile_f
=
profile
.
profile_f
in
let
profile_c
=
profile
.
profile_c
in
Bppsuite
.
Bppseqgen
.
multi_profiles
~
descr
~
input_tree
:
(
tree
~
branch_length_unit
:
`Nucleotide
sim
)
~
hypothesis
~
profile_f
~
profile_c
~
seed
:
(
seed
sim
)
let
rec
nucleotide_alignment
=
function
|
Bppseqgen
{
hypothesis
;
nb_sites
;
profiles
;
_
}
as
sim
->
bppseqgen_simulation
sim
~
hypothesis
~
nb_sites
~
profiles
|>
Bppsuite
.
Bppseqgen
.
alignment
|
Bppseqgen_mixed
{
profiles
;
seed
;
n_h0
;
n_ha
;
ne_s
;
tree
}
->
let
h0
=
nucleotide_alignment
(
Bppseqgen
{
hypothesis
=
H0
(
Fixed
ne_s
)
;
profiles
;
seed
;
nb_sites
=
n_h0
;
tree
})
in
let
ha
=
nucleotide_alignment
(
Bppseqgen
{
hypothesis
=
HaPC
(
Fixed
ne_s
)
;
profiles
;
seed
;
nb_sites
=
n_ha
;
tree
})
in
Utils
.
fasta_cappend
h0
ha
|
Mutsel
param
->
Mutsel_param
.
simulation
param
let
nucleotide_alignment
p
=
simulation
p
|>
Simulator
.
alignment_of_simulation
include
Detection_pipeline
.
Make
(
struct
type
nonrec
t
=
t
let
tree
=
tree
let
nucleotide_alignment
=
nucleotide_alignment
end
)
let
tree
~
branch_length_unit
:_
{
tree
;
_
}
=
tree_workflow
tree
end
let
alignment_plot
d
=
Convergence_detection
.
plot_convergent_sites
~
tree
:
(
tree
~
branch_length_unit
:
`Amino_acid
d
)
~
alignment
:
(
amino_acid_alignment
d
)
~
detection_results
:
(
multinomial_asymptotic_lrt
d
)
()
let
oracle
d
=
let
n_h0
,
n_ha
=
match
d
with
|
Bppseqgen
{
nb_sites
;
hypothesis
;
_
}
->
(
match
hypothesis
with
|
H0
_
->
nb_sites
,
0
|
HaPC
_
|
HaPCOC
_
->
0
,
nb_sites
)
|
Bppseqgen_mixed
{
n_h0
;
n_ha
;
_
}
|
Mutsel
{
n_h0
;
n_ha
;
_
}
->
n_h0
,
n_ha
in
Convergence_detection
.
oracle
~
n_h0
~
n_ha
module
Mutsel
=
struct
type
query
=
Mutsel_query
.
t
let
query
=
Mutsel_query
.
make
let
simulation
=
Mutsel_query
.
simulation
let
multinomial_benchmark
d
=
Utils
.
recall_precision_curve
~
oracle
:
(
oracle
d
)
~
labels
:
[
"LRT"
;
"LRTsim"
;
"sparse"
;
"sparse_sim"
]
~
results
:
[
multinomial_asymptotic_lrt
d
,
1
;
multinomial_simulation_lrt
d
,
1
;
multinomial_asymptotic_sparse
d
,
1
;
multinomial_simulation_sparse
d
,
1
;
]
include
Make
(
Mutsel_query
)
let
result_table
?
(
mode
=
`fast
)
d
=
Convergence_detection
.
merge_result_tables
~
multinomial
:
(
multinomial_asymptotic_lrt
d
)
~
tdg09
:
(
tdg09
d
)
~
identical
:
(
identical
d
)
~
topological
:
(
topological
d
)
~
pcoc
:
(
match
mode
with
|
`fast
->
pcoc
~
gamma
:
false
~
ncat
:
10
d
|
`full
->
pcoc
d
)
?
diffsel
:
(
match
mode
with
|
`fast
->
None
|
`full
->
Some
(
diffsel
d
)
)
~
oracle
:
(
oracle
d
)
()
type
benchmark
=
{
method_labels
:
string
list
;
method_outputs
:
float
option
array
list
;
average_precision
:
(
float
*
(
float
*
float
))
list
;
site_model
:
[
`Convergent
|
`Non_convergent
]
array
;
ancestral_counts
:
int
Phylogenetics
.
Amino_acid
.
table
array
;
convergent_counts
:
int
Phylogenetics
.
Amino_acid
.
table
array
;
}
let
benchmark
?
mode
d
=
result_table
?
mode
d
|>
Convergence_detection
.
recall_precision_curve
let
benchmark2
d
=
Utils
.
recall_precision_curve
~
oracle
:
(
oracle
d
)
~
labels
:
[
"identical"
;
"topological"
;
"multinomial"
;
"pcoc"
;
"tdg09"
]
~
results
:
[
identical
d
,
1
;
topological
d
,
1
;
multinomial_asymptotic_lrt
d
,
1
;
pcoc
d
,
3
;
tdg09
d
,
1
;
]
let
%
pworkflow
benchmark_statistics
simulation
~
labels
~
results
=
let
%
pworkflow
benchmark_statistics
simulation
~
labels
~
results
=
let
open
Phylogenetics
in
let
open
Codepitk
in
let
open
OCamlR_base
in
...
...
@@ -356,28 +221,28 @@ let%pworkflow benchmark_statistics simulation ~labels ~results =
in
saveRDS
~
file
:
[
%
dest
]
(
List_
.
to_sexp
data
)
(* param exploration for SMBE paper *)
(* type branch_scale_t = float *)
let
branch_scale_range
=
[
1
.;
3
.;
6
.;
9
.
]
(* param exploration for SMBE paper *)
(* type branch_scale_t = float *)
let
branch_scale_range
=
[
1
.;
3
.;
6
.;
9
.
]
type
gBGC_t
=
Global
of
float
|
Convergent
of
float
*
float
let
gBGC_range
=
type
gBGC_t
=
Global
of
float
|
Convergent
of
float
*
float
let
gBGC_range
=
let
range
=
[
0
.;
2
.;
4
.;
8
.;
16
.;
32
.;
64
.;
]
in
List
.
concat
[
(* List.map ~f:(fun x -> Global x) range ; *)
List
.
map
~
f
:
(
fun
x
->
Convergent
(
0
.,
x
))
range
;
]
type
param_t
=
float
*
gBGC_t
type
param_t
=
float
*
gBGC_t
let
explore_params
~
(
f
:
param_t
->
_
)
=
let
explore_params
~
(
f
:
param_t
->
_
)
=
List
.
map
branch_scale_range
~
f
:
(
fun
(
bf
:
float
)
->
List
.
map
gBGC_range
~
f
:
(
fun
(
gbgc
:
gBGC_t
)
->
((
bf
,
gbgc
)
,
f
(
bf
,
gbgc
)))
)
|>
List
.
concat
let
simu_of_param
?
n_h0
:
(
n_h0
=
50
)
(
p
:
param_t
)
=
let
simu_of_param
?
n_h0
:
(
n_h0
=
50
)
(
p
:
param_t
)
=
let
bf
,
gbgc
=
p
in
m
utsel
M
utsel
_query
.
make
~
tree
:
(
NHX
(
Workflow
.
input
"example/trees_analyses/C4AmaranthaceaePolyroot.nhx"
))
~
profiles
:
"example/aa_fitness/263SelectedProfiles.tsv"
~
branch_scale
:
bf
...
...
@@ -389,15 +254,15 @@ let simu_of_param ?n_h0:(n_h0=50) (p: param_t) =
~
n_h0
()
let
filter_results
~
(
f
:
_
->
bool
)
(
results
:
(
param_t
*
_
)
list
)
=
let
filter_results
~
(
f
:
_
->
bool
)
(
results
:
(
param_t
*
_
)
list
)
=
List
.
filter
results
~
f
:
(
fun
(
_
,
x
)
->
f
x
)
type
record_t
=
{
type
record_t
=
{
gc_means_ancestral
:
([
`first
|
`second
|
`third
]
*
float
)
list
;
gc_means_convergent
:
([
`first
|
`second
|
`third
]
*
float
)
list
}
}
let
%
workflow
record_of_simu
s
=
let
%
workflow
record_of_simu
s
=
let
tree
=
[
%
path
tree
~
branch_length_unit
:
`Nucleotide
s
]
in
let
nucleotide_alignment
=
[
%
path
nucleotide_alignment
s
]
in
let
gc_mean_from_simu
~
pos
=
...
...
@@ -411,13 +276,13 @@ let%workflow record_of_simu s =
gc_means_convergent
=
[(
`first
,
m1_c
.
gc_mean
)
;
(
`second
,
m2_c
.
gc_mean
)
;
(
`third
,
m3_c
.
gc_mean
)]
}
let
expected_gc
=
[
let
expected_gc
=
[
(
`first
,
(
0
.
3326
,
0
.
5157
,
0
.
5589
,
0
.
6080
,
0
.
8621
))
;
(
`second
,
(
0
.
2102
,
0
.
3784
,
0
.
4160
,
0
.
4626
,
0
.
7499
))
;
(
`third
,
(
0
.
2242
,
0
.
4852
,
0
.
6274
,
0
.
7358
,
0
.
9575
))
]
]
let
quartile
(
min_
,
fq_
,
mean_
,
tq_
,
max_
)
x
=
let
quartile
(
min_
,
fq_
,
mean_
,
tq_
,
max_
)
x
=
match
Float
.(
x
<
min_
,
x
<
fq_
,
x
<
mean_
,
x
<
tq_
,
x
<
max_
)
with
|
true
,
_
,
_
,
_
,
_
->
`below_min
|
false
,
true
,
_
,
_
,
_
->
`first
...
...
@@ -426,7 +291,7 @@ let quartile (min_, fq_, mean_, tq_, max_) x =
|
_
,
_
,
_
,
false
,
true
->
`fourth
|
_
,
_
,
_
,
_
,
false
->
`over_max
let
adjacent
q1
q2
=
let
adjacent
q1
q2
=
match
q1
,
q2
with
|
`first
,
`first
|
`second
,
`second
|
`third
,
`third
|
`fourth
,
`fourth
...
...
@@ -435,13 +300,13 @@ let adjacent q1 q2 =
|
`third
,
`fourth
|
`fourth
,
`third
->
true
|
_
->
false
let
quartile_of_record
(
r
:
record_t
)
=
let
quartile_of_record
(
r
:
record_t
)
=
List
.
map
r
.
gc_means_convergent
~
f
:
(
fun
(
q
,
x
)
->
let
q_list
=
List
.
Assoc
.
find_exn
expected_gc
~
equal
:
(
fun
x
y
->
Caml
.(
x
=
y
))
q
in
quartile
q_list
x
)
let
realistic_result
(
r
:
record_t
)
=
let
realistic_result
(
r
:
record_t
)
=
match
quartile_of_record
r
with
|
[
q1
;
q2
;
q3
]
->
adjacent
q1
q2
&&
adjacent
q2
q3
&&
adjacent
q1
q3
|
_
->
failwith
"oh no"
...
...
@@ -449,3 +314,279 @@ let realistic_result (r: record_t) =
(* let v = g.gc_stat.gc_variance_among_sequences in
Float.(v >= 8.388e-05 && v <= 5.262e-02) *)
(* let%workflow benchmark_statistics q methods =
* let open Phylogenetics in
* let open Codepitk in
* let open OCamlR_base in
* let open Codepitk.Simulator.Site_independent_mutsel in
* let module Codon = Codon.Universal_genetic_code.NS in
* let results_paths = [%eval Workflow.path_list (List.map methods ~f:(fun f -> f q))] in
* let sim : simulation = [%eval simulation q] in
* let load_results fn =
* let open Codepitk in
* let df = Dataframe.from_file fn |> Rresult.R.failwith_error_msg in
* List.init (Dataframe.ncols df - 1) ~f:(fun i ->
* match Dataframe.get_col df (i + 1) with
* | Some (Floats xs) -> Array.map xs ~f:Option.some
* | Some (Float_opts xs) -> xs
* | Some (Ints xs) -> Array.map xs ~f:(fun i -> Some (Float.of_int i))
* | Some (Int_opts xs) -> Array.map xs ~f:(Option.map ~f:Float.of_int)
* | Some _ -> failwith "expected a numeric column at pos 1"
* | None -> assert false
* )
* in
* let results = List.map2_exn result_paths (List.map ~f:snd results) ~f:load_results in
* let labels = [%param labels] in
* let n_h0 = Array.length sim.h0_params in
* let n_ha = Array.length sim.ha_params in
* let nsites = n_h0 + n_ha in
* let columns = List.map2_exn labels results ~f:(fun l r ->
* l, `Numeric (Numeric.of_array_opt r)
* ) in
* let amino_acid_vector_of_codon_vector xs =
* Amino_acid.Vector.init (fun aa ->
* List.fold Codon.all ~init:0. ~f:(fun acc c ->
* if Amino_acid.equal aa (Codon.aa_of_codon c) then
* acc +. xs.Codon.%(c)
* else acc
* )
* )
* in
* let collect_profiles sel =
* Array.append sim.h0_params sim.ha_params
* |> Array.map ~f:(fun p ->
* sel p
* |> Mutsel.stationary_distribution
* |> amino_acid_vector_of_codon_vector
* |> Amino_acid.Vector.to_array
* )
* |> Numeric.Matrix.of_arrays
* in
* let ancestral_profiles = collect_profiles fst in
* let convergent_profiles = collect_profiles snd in
* let counts seqs i =
* let t =
* Amino_acid.Table.init (fun aa ->
* let aa = Amino_acid.to_char aa in
* List.count seqs ~f:(fun s ->
* let codon_str = String.sub (s : Dna.t :> string) ~pos:(i * 3) ~len:3 in
* let codon = match Codon.of_string codon_str with
* | Some c -> c
* | None -> assert false
* in
* Char.equal (Amino_acid.to_char (Codon.aa_of_codon codon)) aa)
* )
* in
* (t :> int array)
* in
* let collect_counts cond =
* let species = Convergence_tree.leaves sim.tree in
* let seqs =
* List.map2_exn sim.sequences species ~f:(fun s (_, cond_s) ->
* if Poly.equal cond cond_s then Some s else None
* )
* |> List.filter_opt
* in
* Array.init nsites ~f:(counts seqs)
* |> Integer.Matrix.of_arrays
* in
* let ancestral_counts = collect_counts `Ancestral in
* let convergent_counts = collect_counts `Convergent in
* let make_classification_data x y =
* Prc.Classification_data (
* List.init (Array.length x) ~f:(fun i ->
* match x.(i), y.(i) with
* | Some x_i, Some y_i -> Some (x_i, y_i)
* | None, _ | _, None -> None
* )
* |> List.filter_opt
* )
* in
* let estimates, lower_bounds, upper_bounds =
* let oracle = Array.init nsites ~f:(fun i -> if i < n_h0 then Some false else Some true) in
* List.map results ~f:(fun scores ->
* let Prc.Classification_data xs as data = make_classification_data scores oracle in
* let n = List.count xs ~f:snd in
* let theta_hat = Prc.auc_trapezoidal_lt data in
* let lb, ub = Prc.logit_confidence_interval ~alpha:0.05 ~theta_hat ~n in
* theta_hat, lb, ub
* )
* |> List.unzip3
* in
* let open OCamlR_base in
* let auc_estimates = Dataframe.create [
* "method", `Character (Character.of_list labels) ;
* "estimate", `Numeric (Numeric.of_list estimates) ;
* "lower_bound", `Numeric (Numeric.of_list lower_bounds) ;
* "upper_bound", `Numeric (Numeric.of_list upper_bounds) ;
* ]
* in
* let oracle =
* Array.(
* append
* (map sim.h0_profiles ~f:(Fn.const false))
* (map sim.ha_profiles ~f:(Fn.const true))
* )
* |> Logical.of_array
* in
* let results = Dataframe.create columns in
* let data = List_.create [
* Some "results", Dataframe.to_sexp results ;
* Some "oracle", Logical.to_sexp oracle ;
* Some "ancestral_profiles", Numeric.Matrix.to_sexp ancestral_profiles ;
* Some "convergent_profiles", Numeric.Matrix.to_sexp convergent_profiles ;
* Some "ancestral_counts", Integer.Matrix.to_sexp ancestral_counts ;
* Some "convergent_counts", Integer.Matrix.to_sexp convergent_counts ;
* Some "auc_estimates", Dataframe.to_sexp auc_estimates ;
* ]
* in
* saveRDS ~file:[%dest] (List_.to_sexp data) *)
end
module
Bppseqgen
=
struct
type
t
=
|
Bppseqgen
of
{
hypothesis
:
Convergence_hypothesis
.
t
;
tree
:
tree
;
profiles
:
string
;
nb_sites
:
int
;
seed
:
int
;
}
|
Bppseqgen_mixed
of
{
tree
:
tree
;
profiles
:
string
;
seed
:
int
;
n_h0
:
int
;
n_ha
:
int
;
ne_s
:
float
;
}
let
bppseqgen_mixed
?
(
ne_s
=
1
.
)
?
(
seed
=
0
)
~
tree
~
profiles
~
n_h0
~
n_ha
()
=
Bppseqgen_mixed
{
tree
;
profiles
;
seed
;
n_ha
;
n_h0
;
ne_s
;
}
let
bppseqgen
~
hyp
~
tree
~
profiles
~
nb_sites
~
seed
=
Bppseqgen
{
hypothesis
=
hyp
;
tree
;
profiles
;
nb_sites
;
seed
;
}
let
tree
~
branch_length_unit
:_
=
function
|
Bppseqgen
{
tree
;
_
}
|
Bppseqgen_mixed
{
tree
;
_
}
->
tree_workflow
tree
let
seed
=
function
|
Bppseqgen_mixed
s
->
s
.
seed
|
Bppseqgen
s
->
s
.
seed
let
profile
~
nb_sites
~
profiles
~
seed
=
Profile
.
profile_l_of_splitted_profile
~
nb_cat
:
All
~
nb_sites
profiles
~
seed
:
(
calc_fixed_seed
~
str
:
profiles
seed
)
let
bppseqgen_simulation
sim
~
hypothesis
~
nb_sites
~
profiles
=
let
model_prefix
=
Convergence_hypothesis
.
string_of_model
hypothesis
in
let
descr
=
sprintf
".%s"
model_prefix
in
let
profile
=
profile
~
nb_sites
~
profiles
~
seed
:
(
seed
sim
)
in
let
profile_f
=
profile
.
profile_f
in
let
profile_c
=
profile
.
profile_c
in
Bppsuite
.
Bppseqgen
.
multi_profiles
~
descr
~
input_tree
:
(
tree
~
branch_length_unit
:
`Nucleotide
sim
)
~
hypothesis
~
profile_f
~
profile_c
~
seed
:
(
seed
sim
)
let
rec
nucleotide_alignment
=
function
|
Bppseqgen
{
hypothesis
;
nb_sites
;
profiles
;
_
}
as
sim
->
bppseqgen_simulation
sim
~
hypothesis
~
nb_sites
~
profiles
|>
Bppsuite
.
Bppseqgen
.
alignment
|
Bppseqgen_mixed
{
profiles
;
seed
;
n_h0
;
n_ha
;
ne_s
;
tree
}
->
let
h0
=
nucleotide_alignment
(
Bppseqgen
{
hypothesis
=
H0
(
Fixed
ne_s
)
;
profiles
;
seed
;
nb_sites
=
n_h0
;
tree
})
in
let
ha
=
nucleotide_alignment
(
Bppseqgen
{
hypothesis
=
HaPC
(
Fixed
ne_s
)
;
profiles
;
seed
;
nb_sites
=
n_ha
;
tree
})
in
Utils
.
fasta_cappend
h0
ha
include
Detection_pipeline
.
Make
(
struct
type
nonrec
t
=
t
let
tree
=
tree
let
nucleotide_alignment
=
nucleotide_alignment
end
)
let
alignment_plot
d
=
Convergence_detection
.
plot_convergent_sites
~
tree
:
(
tree
~
branch_length_unit
:
`Amino_acid
d
)
~
alignment
:
(
amino_acid_alignment
d
)
~
detection_results
:
(
multinomial_asymptotic_lrt
d
)
()
let
oracle
d
=
let
n_h0
,
n_ha
=
match
d
with
|
Bppseqgen
{
nb_sites
;
hypothesis
;
_
}
->
(
match
hypothesis
with
|
H0
_
->
nb_sites
,
0
|
HaPC
_
|
HaPCOC
_
->
0
,
nb_sites
)
|
Bppseqgen_mixed
{
n_h0
;
n_ha
;
_
}
->
n_h0
,
n_ha
in
Convergence_detection
.
oracle
~
n_h0
~
n_ha
let
multinomial_benchmark
d
=
Utils
.
recall_precision_curve
~
oracle
:
(
oracle
d
)
~
labels
:
[
"LRT"
;
"LRTsim"
;
"sparse"
;
"sparse_sim"
]
~
results
:
[
multinomial_asymptotic_lrt
d
,
1
;
multinomial_simulation_lrt
d
,
1
;
multinomial_asymptotic_sparse
d
,
1
;
multinomial_simulation_sparse
d
,
1
;
]
let
result_table
?
(
mode
=
`fast
)
d
=
Convergence_detection
.
merge_result_tables
~
multinomial
:
(
multinomial_asymptotic_lrt
d
)
~
tdg09
:
(
tdg09
d
)
~
identical
:
(
identical
d
)
~
topological
:
(
topological
d
)
~
pcoc
:
(
match
mode
with
|
`fast
->
pcoc
~
gamma
:
false
~
ncat
:
10
d
|
`full
->
pcoc
d
)
?
diffsel
:
(
match
mode
with
|
`fast
->
None
|
`full
->
Some
(
diffsel
d
)
)
~
oracle
:
(
oracle
d
)
()
let
benchmark
?
mode
d
=
result_table
?
mode
d
|>
Convergence_detection
.
recall_precision_curve
let
benchmark2
d
=
Utils
.
recall_precision_curve
~
oracle
:
(
oracle
d
)
~
labels
:
[
"identical"
;
"topological"
;
"multinomial"
;
"pcoc"
;
"tdg09"
]
~
results
:
[
identical
d
,
1
;
topological
d
,
1
;
multinomial_asymptotic_lrt
d
,
1
;
pcoc
d
,
3
;
tdg09
d
,
1
;
]
end
lib/simulation_dataset.mli
View file @
bbdd8e37