Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
VEBER Philippe
codepi
Commits
bbdd8e37
Commit
bbdd8e37
authored
Nov 17, 2020
by
Philippe Veber
Browse files
Simulation_dataset: separate in-house simulation and bppseqgen's
parent
265897c0
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
585 additions
and
454 deletions
+585
-454
lib/simulation_dataset.ml
lib/simulation_dataset.ml
+515
-374
lib/simulation_dataset.mli
lib/simulation_dataset.mli
+70
-80
No files found.
lib/simulation_dataset.ml
View file @
bbdd8e37
...
...
@@ -15,28 +15,13 @@ type tree =
branch_length2
:
float
;
}
type
'
s
benchmark
=
{
method_labels
:
string
list
;
method_outputs
:
float
option
array
list
;
average_precision
:
(
float
*
(
float
*
float
))
list
;
site_model
:
'
s
array
;
ancestral_counts
:
int
Phylogenetics
.
Amino_acid
.
table
array
;
convergent_counts
:
int
Phylogenetics
.
Amino_acid
.
table
array
;
}
module
type
S
=
sig
type
t
type
site_model
include
Detection_pipeline
.
Query
with
type
t
:=
t
include
Detection_pipeline
.
S
with
type
query
:=
t
val
alignment_plot
:
t
->
svg
file
type
query
val
benchmark
:
t
->
text
file
list
->
site_model
benchmark
workflow
include
Detection_pipeline
.
Query
with
type
t
:=
query
include
Detection_pipeline
.
S
with
type
query
:=
query
val
rds_of_benchmark
:
site_model
benchmark
workflow
->
rds
file
val
alignment_plot
:
query
->
svg
file
end
let
tree_workflow
=
function
...
...
@@ -46,7 +31,18 @@ let tree_workflow = function
npairs
}
->
Simulator
.
pair_tree
~
branch_length1
~
branch_length2
~
npairs
module
Mutsel_param
=
struct
module
Make
(
Q
:
Detection_pipeline
.
Query
)
=
struct
include
Detection_pipeline
.
Make
(
Q
)
let
alignment_plot
d
=
Convergence_detection
.
plot_convergent_sites
~
tree
:
(
Q
.
tree
~
branch_length_unit
:
`Amino_acid
d
)
~
alignment
:
(
amino_acid_alignment
d
)
~
detection_results
:
(
multinomial_asymptotic_lrt
d
)
()
end
module
Mutsel_query
=
struct
type
t
=
{
tree
:
tree
;
branch_scale
:
float
;
...
...
@@ -73,379 +69,524 @@ module Mutsel_param = struct
let
tree
=
tree_workflow
tree
in
let
fitness_profiles
=
Workflow
.
input
profiles
in
Simulator
.
simulation
~
branch_scale
~
n_ha
~
n_h0
~
ne_s
~
gBGC
~
tree
~
seed
~
fitness_profiles
()
end
type
t
=
|
Bppseqgen
of
{
hypothesis
:
Convergence_hypothesis
.
t
;
tree
:
tree
;
profiles
:
string
;
nb_sites
:
int
;
seed
:
int
;
}
|
Bppseqgen_mixed
of
{
tree
:
tree
;
profiles
:
string
;
seed
:
int
;
n_h0
:
int
;
n_ha
:
int
;
ne_s
:
float
;
}
|
Mutsel
of
Mutsel_param
.
t
let
nucleotide_alignment
p
=
simulation
p
|>
Simulator
.
alignment_of_simulation
let
bppseqgen_mixed
?
(
ne_s
=
1
.
)
?
(
seed
=
0
)
~
tree
~
profiles
~
n_h0
~
n_ha
()
=
Bppseqgen_mixed
{
tree
;
profiles
;
seed
;
n_ha
;
n_h0
;
ne_s
;
}
let
tree
~
branch_length_unit
:_
{
tree
;
_
}
=
tree_workflow
tree
end
let
bppseqgen
~
hyp
~
tree
~
profiles
~
nb_sites
~
seed
=
Bppseqgen
{
hypothesis
=
hyp
;
tree
;
profiles
;
nb_sites
;
seed
;
}
let
mutsel
?
branch_scale
?
ne_s
?
gBGC
?
seed
~
tree
~
profiles
~
n_h0
~
n_ha
()
=
Mutsel
(
Mutsel_param
.
make
?
branch_scale
?
ne_s
?
gBGC
?
seed
~
tree
~
profiles
~
n_h0
~
n_ha
()
)
let
of_mutsel_param
p
=
Mutsel
p
let
tree
~
branch_length_unit
:_
=
function
|
Bppseqgen
{
tree
;
_
}
|
Bppseqgen_mixed
{
tree
;
_
}
|
Mutsel
{
tree
;
_
}
->
tree_workflow
tree
let
seed
=
function
|
Bppseqgen_mixed
s
->
s
.
seed
|
Bppseqgen
s
->
s
.
seed
|
Mutsel
s
->
s
.
seed
let
profile
~
nb_sites
~
profiles
~
seed
=
Profile
.
profile_l_of_splitted_profile
~
nb_cat
:
All
~
nb_sites
profiles
~
seed
:
(
calc_fixed_seed
~
str
:
profiles
seed
)
let
bppseqgen_simulation
sim
~
hypothesis
~
nb_sites
~
profiles
=
let
model_prefix
=
Convergence_hypothesis
.
string_of_model
hypothesis
in
let
descr
=
sprintf
".%s"
model_prefix
in
let
profile
=
profile
~
nb_sites
~
profiles
~
seed
:
(
seed
sim
)
in
let
profile_f
=
profile
.
profile_f
in
let
profile_c
=
profile
.
profile_c
in
Bppsuite
.
Bppseqgen
.
multi_profiles
~
descr
~
input_tree
:
(
tree
~
branch_length_unit
:
`Nucleotide
sim
)
~
hypothesis
~
profile_f
~
profile_c
~
seed
:
(
seed
sim
)
let
rec
nucleotide_alignment
=
function
|
Bppseqgen
{
hypothesis
;
nb_sites
;
profiles
;
_
}
as
sim
->
bppseqgen_simulation
sim
~
hypothesis
~
nb_sites
~
profiles
|>
Bppsuite
.
Bppseqgen
.
alignment
|
Bppseqgen_mixed
{
profiles
;
seed
;
n_h0
;
n_ha
;
ne_s
;
tree
}
->
let
h0
=
nucleotide_alignment
(
Bppseqgen
{
hypothesis
=
H0
(
Fixed
ne_s
)
;
profiles
;
seed
;
nb_sites
=
n_h0
;
tree
})
in
let
ha
=
nucleotide_alignment
(
Bppseqgen
{
hypothesis
=
HaPC
(
Fixed
ne_s
)
;
profiles
;
seed
;
nb_sites
=
n_ha
;
tree
})
in
Utils
.
fasta_cappend
h0
ha
|
Mutsel
param
->
Mutsel_param
.
simulation
param
|>
Simulator
.
alignment_of_simulation
module
Mutsel
=
struct
type
query
=
Mutsel_query
.
t
let
query
=
Mutsel_query
.
make
let
simulation
=
Mutsel_query
.
simulation
include
Detection_pipeline
.
Make
(
struct
type
nonrec
t
=
t
let
tree
=
tree
let
nucleotide_alignment
=
nucleotide_alignment
end
)
let
alignment_plot
d
=
Convergence_detection
.
plot_convergent_sites
~
tree
:
(
tree
~
branch_length_unit
:
`Amino_acid
d
)
~
alignment
:
(
amino_acid_alignment
d
)
~
detection_results
:
(
multinomial_asymptotic_lrt
d
)
()
let
oracle
d
=
let
n_h0
,
n_ha
=
match
d
with
|
Bppseqgen
{
nb_sites
;
hypothesis
;
_
}
->
(
match
hypothesis
with
|
H0
_
->
nb_sites
,
0
|
HaPC
_
|
HaPCOC
_
->
0
,
nb_sites
)
|
Bppseqgen_mixed
{
n_h0
;
n_ha
;
_
}
|
Mutsel
{
n_h0
;
n_ha
;
_
}
->
n_h0
,
n_ha
in
Convergence_detection
.
oracle
~
n_h0
~
n_ha
let
multinomial_benchmark
d
=
Utils
.
recall_precision_curve
~
oracle
:
(
oracle
d
)
~
labels
:
[
"LRT"
;
"LRTsim"
;
"sparse"
;
"sparse_sim"
]
~
results
:
[
multinomial_asymptotic_lrt
d
,
1
;
multinomial_simulation_lrt
d
,
1
;
multinomial_asymptotic_sparse
d
,
1
;
multinomial_simulation_sparse
d
,
1
;
]
include
Make
(
Mutsel_query
)
let
result_table
?
(
mode
=
`fast
)
d
=
Convergence_detection
.
merge_result_tables
~
multinomial
:
(
multinomial_asymptotic_lrt
d
)
~
tdg09
:
(
tdg09
d
)
~
identical
:
(
identical
d
)
~
topological
:
(
topological
d
)
~
pcoc
:
(
match
mode
with
|
`fast
->
pcoc
~
gamma
:
false
~
ncat
:
10
d
|
`full
->
pcoc
d
)
?
diffsel
:
(
match
mode
with
|
`fast
->
None
|
`full
->
Some
(
diffsel
d
)
)
~
oracle
:
(
oracle
d
)
()
let
benchmark
?
mode
d
=
result_table
?
mode
d
|>
Convergence_detection
.
recall_precision_curve
let
benchmark2
d
=
Utils
.
recall_precision_curve
~
oracle
:
(
oracle
d
)
~
labels
:
[
"identical"
;
"topological"
;
"multinomial"
;
"pcoc"
;
"tdg09"
]
~
results
:
[
identical
d
,
1
;
topological
d
,
1
;
multinomial_asymptotic_lrt
d
,
1
;
pcoc
d
,
3
;
tdg09
d
,
1
;
]
type
benchmark
=
{
method_labels
:
string
list
;
method_outputs
:
float
option
array
list
;
average_precision
:
(
float
*
(
float
*
float
))
list
;
site_model
:
[
`Convergent
|
`Non_convergent
]
array
;
ancestral_counts
:
int
Phylogenetics
.
Amino_acid
.
table
array
;
convergent_counts
:
int
Phylogenetics
.
Amino_acid
.
table
array
;
}
let
%
pworkflow
benchmark_statistics
simulation
~
labels
~
results
=
let
open
Phylogenetics
in
let
open
Codepitk
in
let
open
OCamlR_base
in
let
open
Codepitk
.
Simulator
.
Site_independent_mutsel
in
let
module
Codon
=
Codon
.
Universal_genetic_code
.
NS
in
let
load_results
fn
col
=
let
%
pworkflow
benchmark_statistics
simulation
~
labels
~
results
=
let
open
Phylogenetics
in
let
open
Codepitk
in
let
df
=
Dataframe
.
from_file
fn
|>
Rresult
.
R
.
failwith_error_msg
in
match
Dataframe
.
get_col
df
col
with
|
Some
(
Floats
xs
)
->
Array
.
map
xs
~
f
:
Option
.
some
|
Some
(
Float_opts
xs
)
->
xs
|
Some
(
Ints
xs
)
->
Array
.
map
xs
~
f
:
(
fun
i
->
Some
(
Float
.
of_int
i
))
|
Some
(
Int_opts
xs
)
->
Array
.
map
xs
~
f
:
(
Option
.
map
~
f
:
Float
.
of_int
)
|
_
->
failwith
"expected a numeric column at pos 1"
in
let
sim
:
simulation
=
[
%
eval
simulation
]
in
let
result_paths
=
[
%
eval
Bistro
.
Workflow
.
path_list
(
List
.
map
~
f
:
fst
results
)]
in
let
results
=
List
.
map2_exn
result_paths
(
List
.
map
~
f
:
snd
results
)
~
f
:
load_results
in
let
labels
=
[
%
param
labels
]
in
let
n_h0
=
Array
.
length
sim
.
h0_params
in
let
n_ha
=
Array
.
length
sim
.
ha_params
in
let
nsites
=
n_h0
+
n_ha
in
let
columns
=
List
.
map2_exn
labels
results
~
f
:
(
fun
l
r
->
l
,
`Numeric
(
Numeric
.
of_array_opt
r
)
)
in
let
amino_acid_vector_of_codon_vector
xs
=
Amino_acid
.
Vector
.
init
(
fun
aa
->
List
.
fold
Codon
.
all
~
init
:
0
.
~
f
:
(
fun
acc
c
->
if
Amino_acid
.
equal
aa
(
Codon
.
aa_of_codon
c
)
then
acc
+.
xs
.
Codon
.
%
(
c
)
else
acc
)
)
in
let
collect_profiles
sel
=
Array
.
append
sim
.
h0_params
sim
.
ha_params
|>
Array
.
map
~
f
:
(
fun
p
->
sel
p
|>
Mutsel
.
stationary_distribution
|>
amino_acid_vector_of_codon_vector
|>
Amino_acid
.
Vector
.
to_array
)
|>
Numeric
.
Matrix
.
of_arrays
in
let
ancestral_profiles
=
collect_profiles
fst
in
let
convergent_profiles
=
collect_profiles
snd
in
let
counts
seqs
i
=
let
t
=
Amino_acid
.
Table
.
init
(
fun
aa
->
let
aa
=
Amino_acid
.
to_char
aa
in
List
.
count
seqs
~
f
:
(
fun
s
->
let
codon_str
=
String
.
sub
(
s
:
Dna
.
t
:>
string
)
~
pos
:
(
i
*
3
)
~
len
:
3
in
let
codon
=
match
Codon
.
of_string
codon_str
with
|
Some
c
->
c
|
None
->
assert
false
in
Char
.
equal
(
Amino_acid
.
to_char
(
Codon
.
aa_of_codon
codon
))
aa
)
let
open
OCamlR_base
in
let
open
Codepitk
.
Simulator
.
Site_independent_mutsel
in
let
module
Codon
=
Codon
.
Universal_genetic_code
.
NS
in
let
load_results
fn
col
=
let
open
Codepitk
in
let
df
=
Dataframe
.
from_file
fn
|>
Rresult
.
R
.
failwith_error_msg
in
match
Dataframe
.
get_col
df
col
with
|
Some
(
Floats
xs
)
->
Array
.
map
xs
~
f
:
Option
.
some
|
Some
(
Float_opts
xs
)
->
xs
|
Some
(
Ints
xs
)
->
Array
.
map
xs
~
f
:
(
fun
i
->
Some
(
Float
.
of_int
i
))
|
Some
(
Int_opts
xs
)
->
Array
.
map
xs
~
f
:
(
Option
.
map
~
f
:
Float
.
of_int
)
|
_
->
failwith
"expected a numeric column at pos 1"
in
let
sim
:
simulation
=
[
%
eval
simulation
]
in
let
result_paths
=
[
%
eval
Bistro
.
Workflow
.
path_list
(
List
.
map
~
f
:
fst
results
)]
in
let
results
=
List
.
map2_exn
result_paths
(
List
.
map
~
f
:
snd
results
)
~
f
:
load_results
in
let
labels
=
[
%
param
labels
]
in
let
n_h0
=
Array
.
length
sim
.
h0_params
in
let
n_ha
=
Array
.
length
sim
.
ha_params
in
let
nsites
=
n_h0
+
n_ha
in
let
columns
=
List
.
map2_exn
labels
results
~
f
:
(
fun
l
r
->
l
,
`Numeric
(
Numeric
.
of_array_opt
r
)
)
in
let
amino_acid_vector_of_codon_vector
xs
=
Amino_acid
.
Vector
.
init
(
fun
aa
->
List
.
fold
Codon
.
all
~
init
:
0
.
~
f
:
(
fun
acc
c
->
if
Amino_acid
.
equal
aa
(
Codon
.
aa_of_codon
c
)
then
acc
+.
xs
.
Codon
.
%
(
c
)
else
acc
)
)
in
(
t
:>
int
array
)
in
let
collect_counts
cond
=
let
species
=
Convergence_tree
.
leaves
sim
.
tree
in
let
seqs
=
List
.
map2_exn
sim
.
sequences
species
~
f
:
(
fun
s
(
_
,
cond_s
)
->
if
Poly
.
equal
cond
cond_s
then
Some
s
else
None
let
collect_profiles
sel
=
Array
.
append
sim
.
h0_params
sim
.
ha_params
|>
Array
.
map
~
f
:
(
fun
p
->
sel
p
|>
Mutsel
.
stationary_distribution
|>
amino_acid_vector_of_codon_vector
|>
Amino_acid
.
Vector
.
to_array
)
|>
List
.
filter_opt
|>
Numeric
.
Matrix
.
of_arrays
in
Array
.
init
nsites
~
f
:
(
counts
seqs
)
|>
Integer
.
Matrix
.
of_arrays
in
let
ancestral_counts
=
collect_counts
`Ancestral
in
let
convergent_counts
=
collect_counts
`Convergent
in
let
make_classification_data
x
y
=
Prc
.
Classification_data
(
List
.
init
(
Array
.
length
x
)
~
f
:
(
fun
i
->
match
x
.
(
i
)
,
y
.
(
i
)
with
|
Some
x_i
,
Some
y_i
->
Some
(
x_i
,
y_i
)
|
None
,
_
|
_
,
None
->
None
let
ancestral_profiles
=
collect_profiles
fst
in
let
convergent_profiles
=
collect_profiles
snd
in
let
counts
seqs
i
=
let
t
=
Amino_acid
.
Table
.
init
(
fun
aa
->
let
aa
=
Amino_acid
.
to_char
aa
in
List
.
count
seqs
~
f
:
(
fun
s
->
let
codon_str
=
String
.
sub
(
s
:
Dna
.
t
:>
string
)
~
pos
:
(
i
*
3
)
~
len
:
3
in
let
codon
=
match
Codon
.
of_string
codon_str
with
|
Some
c
->
c
|
None
->
assert
false
in
Char
.
equal
(
Amino_acid
.
to_char
(
Codon
.
aa_of_codon
codon
))
aa
)
)
in
(
t
:>
int
array
)
in
let
collect_counts
cond
=
let
species
=
Convergence_tree
.
leaves
sim
.
tree
in
let
seqs
=
List
.
map2_exn
sim
.
sequences
species
~
f
:
(
fun
s
(
_
,
cond_s
)
->
if
Poly
.
equal
cond
cond_s
then
Some
s
else
None
)
|>
List
.
filter_opt
in
Array
.
init
nsites
~
f
:
(
counts
seqs
)
|>
Integer
.
Matrix
.
of_arrays
in
let
ancestral_counts
=
collect_counts
`Ancestral
in
let
convergent_counts
=
collect_counts
`Convergent
in
let
make_classification_data
x
y
=
Prc
.
Classification_data
(
List
.
init
(
Array
.
length
x
)
~
f
:
(
fun
i
->
match
x
.
(
i
)
,
y
.
(
i
)
with
|
Some
x_i
,
Some
y_i
->
Some
(
x_i
,
y_i
)
|
None
,
_
|
_
,
None
->
None
)
|>
List
.
filter_opt
)
in
let
estimates
,
lower_bounds
,
upper_bounds
=
let
oracle
=
Array
.
init
nsites
~
f
:
(
fun
i
->
if
i
<
n_h0
then
Some
false
else
Some
true
)
in
List
.
map
results
~
f
:
(
fun
scores
->
let
Prc
.
Classification_data
xs
as
data
=
make_classification_data
scores
oracle
in
let
n
=
List
.
count
xs
~
f
:
snd
in
let
theta_hat
=
Prc
.
auc_trapezoidal_lt
data
in
let
lb
,
ub
=
Prc
.
logit_confidence_interval
~
alpha
:
0
.
05
~
theta_hat
~
n
in
theta_hat
,
lb
,
ub
)
|>
List
.
filter_opt
)
in
let
estimates
,
lower_bounds
,
upper_bounds
=
let
oracle
=
Array
.
init
nsites
~
f
:
(
fun
i
->
if
i
<
n_h0
then
Some
false
else
Some
true
)
in
List
.
map
results
~
f
:
(
fun
scores
->
let
Prc
.
Classification_data
xs
as
data
=
make_classification_data
scores
oracle
in
let
n
=
List
.
count
xs
~
f
:
snd
in
let
theta_hat
=
Prc
.
auc_trapezoidal_lt
data
in
let
lb
,
ub
=
Prc
.
logit_confidence_interval
~
alpha
:
0
.
05
~
theta_hat
~
n
in
theta_hat
,
lb
,
ub
|>
List
.
unzip3
in
let
open
OCamlR_base
in
let
auc_estimates
=
Dataframe
.
create
[
"method"
,
`Character
(
Character
.
of_list
labels
)
;
"estimate"
,
`Numeric
(
Numeric
.
of_list
estimates
)
;
"lower_bound"
,
`Numeric
(
Numeric
.
of_list
lower_bounds
)
;
"upper_bound"
,
`Numeric
(
Numeric
.
of_list
upper_bounds
)
;
]
in
let
oracle
=
Array
.(
append
(
map
sim
.
h0_profiles
~
f
:
(
Fn
.
const
false
))
(
map
sim
.
ha_profiles
~
f
:
(
Fn
.
const
true
))
)
|>
List
.
unzip3
in
let
open
OCamlR_base
in
let
auc_estimates
=
Dataframe
.
create
[
"method"
,
`Character
(
Character
.
of_list
labels
)
;
"estimate"
,
`Numeric
(
Numeric
.
of_list
estimates
)
;
"lower_bound"
,
`Numeric
(
Numeric
.
of_list
lower_bounds
)
;
"upper_bound"
,
`Numeric
(
Numeric
.
of_list
upper_bounds
)
;
]
in
let
oracle
=
Array
.(
append
(
map
sim
.
h0_profiles
~
f
:
(
Fn
.
const
false
))
(
map
sim
.
ha_profiles
~
f
:
(
Fn
.
const
true
))
)
|>
Logical
.
of_array
in
let
results
=
Dataframe
.
create
columns
in
let
data
=
List_
.
create
[
Some
"results"
,
Dataframe
.
to_sexp
results
;
Some
"oracle"
,
Logical
.
to_sexp
oracle
;
Some
"ancestral_profiles"
,
Numeric
.
Matrix
.
to_sexp
ancestral_profiles
;
Some
"convergent_profiles"
,
Numeric
.
Matrix
.
to_sexp
convergent_profiles
;
Some
"ancestral_counts"
,
Integer
.
Matrix
.
to_sexp
ancestral_counts
;
Some
"convergent_counts"
,
Integer
.
Matrix
.
to_sexp
convergent_counts
;
Some
"auc_estimates"
,
Dataframe
.
to_sexp
auc_estimates
;
|>
Logical
.
of_array
in
let
results
=
Dataframe
.
create
columns
in
let
data
=
List_
.
create
[
Some
"results"
,
Dataframe
.
to_sexp
results
;
Some
"oracle"
,
Logical
.
to_sexp
oracle
;
Some
"ancestral_profiles"
,
Numeric
.
Matrix
.
to_sexp
ancestral_profiles
;
Some
"convergent_profiles"
,
Numeric
.
Matrix
.
to_sexp
convergent_profiles
;
Some
"ancestral_counts"
,
Integer
.
Matrix
.
to_sexp
ancestral_counts
;
Some
"convergent_counts"
,
Integer
.
Matrix
.
to_sexp
convergent_counts
;
Some
"auc_estimates"
,
Dataframe
.
to_sexp
auc_estimates
;
]
in
saveRDS
~
file
:
[
%
dest
]
(
List_
.
to_sexp
data
)
(* param exploration for SMBE paper *)
(* type branch_scale_t = float *)
let
branch_scale_range
=
[
1
.;
3
.;
6
.;
9
.
]
type
gBGC_t
=
Global
of
float
|
Convergent
of
float
*
float
let
gBGC_range
=
let
range
=
[
0
.;
2
.;
4
.;
8
.;
16
.;
32
.;
64
.;
]
in
List
.
concat
[
(* List.map ~f:(fun x -> Global x) range ; *)
List
.
map
~
f
:
(
fun
x
->
Convergent
(
0
.,
x
))
range
;
]
in
saveRDS
~
file
:
[
%
dest
]
(
List_
.
to_sexp
data
)
(* param exploration for SMBE paper *)
(* type branch_scale_t = float *)
let
branch_scale_range
=
[
1
.;
3
.;
6
.;
9
.
]
type
gBGC_t
=
Global
of
float
|
Convergent
of
float
*
float
let
gBGC_range
=
let
range
=
[
0
.;
2
.;
4
.;
8
.;
16
.;
32
.;
64
.;
]
in
List
.
concat
[
(* List.map ~f:(fun x -> Global x) range ; *)
List
.
map
~
f
:
(
fun
x
->
Convergent
(
0
.,
x
))
range
;
]
type
param_t
=
float
*
gBGC_t
let
explore_params
~
(
f
:
param_t
->
_
)
=
List
.
map
branch_scale_range
~
f
:
(
fun
(
bf
:
float
)
->
List
.
map
gBGC_range
~
f
:
(
fun
(
gbgc
:
gBGC_t
)
->
((
bf
,
gbgc
)
,
f
(
bf
,
gbgc
)))
)
|>
List
.
concat
let
simu_of_param
?
n_h0
:
(
n_h0
=
50
)
(
p
:
param_t
)
=
let
bf
,
gbgc
=
p
in
mutsel
~
tree
:
(
NHX
(
Workflow
.
input
"example/trees_analyses/C4AmaranthaceaePolyroot.nhx"
))
~
profiles
:
"example/aa_fitness/263SelectedProfiles.tsv"
~
branch_scale
:
bf
~
gBGC
:
(
match
gbgc
with
|
Convergent
(
a
,
c
)
->
(
a
,
c
)
|
Global
g
->
(
g
,
g
))
~
ne_s
:
(
4
.,
4
.
)
~
n_ha
:
0
~
n_h0
()
let
filter_results
~
(
f
:
_
->
bool
)
(
results
:
(
param_t
*
_
)
list
)
=
List
.
filter
results
~
f
:
(
fun
(
_
,
x
)
->
f
x
)
type
record_t
=
{
gc_means_ancestral
:
([
`first
|
`second
|
`third
]
*
float
)
list
;
gc_means_convergent
:
([
`first
|
`second
|
`third
]
*
float
)
list
}
let
%
workflow
record_of_simu
s
=
let
tree
=
[
%
path
tree
~
branch_length_unit
:
`Nucleotide
s
]
in
let
nucleotide_alignment
=
[
%
path
nucleotide_alignment
s
]
in
let
gc_mean_from_simu
~
pos
=
Alistats
.
nucleotide_fasta_gc_ac
~
pos
tree
nucleotide_alignment
in
let
(
m1_a
,
m1_c
)
,
(
m2_a
,
m2_c
)
,
(
m3_a
,
m3_c
)
=
gc_mean_from_simu
~
pos
:
`first
,
gc_mean_from_simu
~
pos
:
`second
,
gc_mean_from_simu
~
pos
:
`third
in
{
gc_means_ancestral
=
[(
`first
,
m1_a
.
gc_mean
)
;
(
`second
,
m2_a
.
gc_mean
)
;
(
`third
,
m3_a
.
gc_mean
)]
;
gc_means_convergent
=
[(
`first
,
m1_c
.
gc_mean
)
;
(
`second
,
m2_c
.
gc_mean
)
;
(
`third
,
m3_c
.
gc_mean
)]
type
param_t
=
float
*
gBGC_t
let
explore_params
~
(
f
:
param_t
->
_
)
=
List
.
map
branch_scale_range
~
f
:
(
fun
(
bf
:
float
)
->
List
.
map
gBGC_range
~
f
:
(
fun
(
gbgc
:
gBGC_t
)
->
((
bf
,
gbgc
)
,
f
(
bf
,
gbgc
)))
)
|>
List
.
concat
let
simu_of_param
?
n_h0
:
(
n_h0
=
50
)
(
p
:
param_t
)
=
let
bf
,
gbgc
=
p
in
Mutsel_query
.
make
~
tree
:
(
NHX
(
Workflow
.
input
"example/trees_analyses/C4AmaranthaceaePolyroot.nhx"
))
~
profiles
:
"example/aa_fitness/263SelectedProfiles.tsv"
~
branch_scale
:
bf
~
gBGC
:
(
match
gbgc
with
|
Convergent
(
a
,
c
)
->
(
a
,
c
)
|
Global
g
->
(
g
,
g
))
~
ne_s
:
(
4
.,
4
.
)
~
n_ha
:
0
~
n_h0
()
let
filter_results
~
(
f
:
_
->
bool
)
(
results
:
(
param_t
*
_
)
list
)
=
List
.
filter
results
~
f
:
(
fun
(
_
,
x
)
->
f
x
)
type
record_t
=
{
gc_means_ancestral
:
([
`first
|
`second
|
`third
]
*
float
)
list
;
gc_means_convergent
:
([
`first
|
`second
|
`third
]
*
float
)
list
}
let
expected_gc
=
[
(
`first
,
(
0
.
3326
,
0
.
5157
,
0
.
5589
,
0
.
6080
,
0
.
8621
))
;
(
`second
,
(
0
.
2102
,
0
.
3784
,
0
.
4160
,
0
.
4626
,
0
.
7499
))
;
(
`third
,
(
0
.
2242
,
0
.
4852
,
0
.
6274
,
0
.
7358
,
0
.
9575
))
]
let
quartile
(
min_
,
fq_
,
mean_
,
tq_
,
max_
)
x
=
match
Float
.(
x
<
min_
,
x
<
fq_
,
x
<
mean_
,
x
<
tq_
,
x
<
max_
)
with
|
true
,
_
,
_
,
_
,
_
->
`below_min
|
false
,
true
,
_
,
_
,
_
->
`first
|
_
,
false
,
true
,
_
,
_
->
`second
|
_
,
_
,
false
,
true
,
_
->
`third
|
_
,
_
,
_
,
false
,
true
->
`fourth
|
_
,
_
,
_
,
_
,
false
->
`over_max
let
adjacent
q1
q2
=
match
q1
,
q2
with
|
`first
,
`first
|
`second
,
`second
|
`third
,
`third
|
`fourth
,
`fourth
|
`first
,
`second
|
`second
,
`first
|
`second
,
`third
|
`third
,
`second
|
`third
,
`fourth
|
`fourth
,
`third
->
true