Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Docker-in-Docker (DinD) capabilities of public runners deactivated.
More info
Open sidebar
VEBER Philippe
codepi
Commits
74b927db
Commit
74b927db
authored
Nov 11, 2020
by
Philippe Veber
Browse files
generate R-readable summary out of benchmark
parent
4434509f
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
228 additions
and
40 deletions
+228
-40
app/lmm_benchmark.ml
app/lmm_benchmark.ml
+39
-9
lib/simulation_dataset.ml
lib/simulation_dataset.ml
+163
-31
lib/simulation_dataset.mli
lib/simulation_dataset.mli
+26
-0
No files found.
app/lmm_benchmark.ml
View file @
74b927db
...
...
@@ -4,6 +4,7 @@ open Codepi
open
Codepi
.
File_formats
type
dataset
=
{
label
:
string
;
tree
:
nhx
file
;
rooted
:
bool
;
branch_scale
:
float
;
...
...
@@ -11,27 +12,31 @@ type dataset = {
}
let
besnard2009
=
{
label
=
"besnard2009"
;
tree
=
Bistro
.
Workflow
.
input
"data/besnard2009/besnard2009.nhx"
;
rooted
=
true
;
branch_scale
=
3
.
;
branch_scale
=
10
.
;
ne_s
=
4
.,
4
.
;
}
let
oneline_rodent
=
{
label
=
"online_rodent"
;
tree
=
Bistro
.
Workflow
.
input
"data/online_rodent/online_rodent.nhx"
;
rooted
=
true
;
branch_scale
=
3
.
;
branch_scale
=
10
.
;
ne_s
=
4
.,
4
.
;
}
let
rubisco
=
{
label
=
"rubisco"
;
tree
=
Rubisco_dataset
.(
Path
"data/rubisco"
|>
Query
.
tree
~
branch_length_unit
:
`Amino_acid
)
;
rooted
=
false
;
branch_scale
=
3
.
;
branch_scale
=
10
.
;
ne_s
=
4
.,
4
.
;
}
let
orthomam_echolocation
=
{
label
=
"orthomam_echolocation"
;
tree
=
(
Orthomam
.
tree_of_db
~
branch_length_unit
:
`Amino_acid
...
...
@@ -39,7 +44,7 @@ let orthomam_echolocation = {
(
Codepitk
.
Orthomam_db
.
make
"omm"
)
)
;
rooted
=
false
;
branch_scale
=
3
.
;
branch_scale
=
10
.
;
ne_s
=
4
.,
4
.
;
}
...
...
@@ -63,10 +68,10 @@ let methods = Simulation_dataset.[
meth
topological
"topological"
~
requires_rooted_tree
:
true
;
]
let
benchmark
{
tree
=
t
;
rooted
;
ne_s
;
branch_scale
}
=
let
benchmark
{
tree
=
t
;
rooted
;
ne_s
;
branch_scale
;
_
}
=
let
open
Simulation_dataset
in
let
sim
=
convdet_simulation
~
seed
:
3
~
tree
:
(
NHX
t
)
~
branch_scale
~
ne_s
convdet_simulation
~
seed
:
42
~
tree
:
(
NHX
t
)
~
branch_scale
~
ne_s
~
profiles
:
"example/aa_fitness/263SelectedProfiles.tsv"
~
n_h0
:
900
~
n_ha
:
100
()
in
...
...
@@ -80,6 +85,25 @@ let benchmark { tree = t ; rooted ; ne_s ; branch_scale } =
in
Utils
.
average_precision_plot
~
oracle
:
(
oracle
sim
)
~
labels
~
results
let
benchmark_rds
?
(
seed
=
42
)
{
tree
=
t
;
rooted
;
ne_s
;
branch_scale
;
_
}
=
let
open
Simulation_dataset
in
let
param
=
Convdet_simulation_param
.
make
~
seed
~
tree
:
(
NHX
t
)
~
branch_scale
~
ne_s
~
profiles
:
"example/aa_fitness/263SelectedProfiles.tsv"
~
n_h0
:
900
~
n_ha
:
100
()
in
let
sim
=
convdet_simulation_of_param
param
in
let
simulation
=
Convdet_simulation_param
.
simulation
param
in
let
results
,
labels
=
List
.
filter_map
methods
~
f
:
(
fun
m
->
if
not
m
.
requires_rooted_tree
||
rooted
then
Some
((
m
.
result
sim
,
m
.
col
)
,
m
.
label
)
else
None
)
|>
List
.
unzip
in
benchmark_statistics
simulation
~
labels
~
results
let
report
=
let
module
H
=
Bistro_utils
.
Html_report
in
H
.
make
~
title
:
"Codepi benchmark"
[
...
...
@@ -98,7 +122,13 @@ let report =
|>
H
.
render
let
()
=
let
w
=
report
in
let
open
Bistro_utils
.
Repo
in
let
datasets
=
[
besnard2009
;
rubisco
;
oneline_rodent
;
orthomam_echolocation
]
in
let
repo
=
item
[
"report.html"
]
report
::
List
.
map
datasets
~
f
:
(
fun
d
->
item
[
d
.
label
^
".rds"
]
(
benchmark_rds
d
)
)
in
let
loggers
=
[
Bistro_utils
.
Console_logger
.
create
()
]
in
Bistro_engine
.
Scheduler
.
simple_eval_exn
~
loggers
~
np
:
4
~
mem
:
(
`GB
4
)
(
Bistro
.
Workflow
.
path
w
)
|>
print_endline
build_main
~
loggers
~
np
:
4
~
mem
:
(
`GB
4
)
repo
~
outdir
:
"res"
lib/simulation_dataset.ml
View file @
74b927db
...
...
@@ -15,6 +15,42 @@ type tree =
branch_length2
:
float
;
}
let
tree_workflow
=
function
|
NHX
w
->
w
|
Pair_tree
{
branch_length1
;
branch_length2
;
npairs
}
->
Simulator
.
pair_tree
~
branch_length1
~
branch_length2
~
npairs
module
Convdet_simulation_param
=
struct
type
t
=
{
tree
:
tree
;
branch_scale
:
float
;
profiles
:
string
;
n_h0
:
int
;
n_ha
:
int
;
ne_s
:
float
*
float
;
gBGC
:
float
*
float
;
seed
:
int
;
}
let
make
?
(
branch_scale
=
1
.
)
?
(
ne_s
=
1
.,
1
.
)
?
(
gBGC
=
0
.,
0
.
)
?
(
seed
=
0
)
~
tree
~
profiles
~
n_h0
~
n_ha
()
=
{
tree
;
profiles
;
n_h0
;
n_ha
;
ne_s
;
gBGC
;
branch_scale
;
seed
:
int
;
}
let
simulation
{
n_h0
;
n_ha
;
profiles
;
ne_s
;
gBGC
;
branch_scale
;
seed
;
tree
;
_
}
=
let
tree
=
tree_workflow
tree
in
let
fitness_profiles
=
Workflow
.
input
profiles
in
Simulator
.
simulation
~
branch_scale
~
n_ha
~
n_h0
~
ne_s
~
gBGC
~
tree
~
seed
~
fitness_profiles
()
end
type
t
=
|
Bppseqgen_simulation
of
{
hypothesis
:
Convergence_hypothesis
.
t
;
...
...
@@ -31,16 +67,7 @@ type t =
n_ha
:
int
;
ne_s
:
float
;
}
|
Convdet_simulation
of
{
tree
:
tree
;
branch_scale
:
float
;
profiles
:
string
;
n_h0
:
int
;
n_ha
:
int
;
ne_s
:
float
*
float
;
gBGC
:
float
*
float
;
seed
:
int
;
}
|
Convdet_simulation
of
Convdet_simulation_param
.
t
let
bppseqgen_mixed_simulation
?
(
ne_s
=
1
.
)
?
(
seed
=
0
)
~
tree
~
profiles
~
n_h0
~
n_ha
()
=
Bppseqgen_mixed
{
...
...
@@ -61,28 +88,16 @@ let bppseqgen_simulation ~hyp ~tree ~profiles ~nb_sites ~seed =
seed
;
}
let
convdet_simulation
?
(
branch_scale
=
1
.
)
?
(
ne_s
=
1
.,
1
.
)
?
(
gBGC
=
0
.,
0
.
)
?
(
seed
=
0
)
~
tree
~
profiles
~
n_h0
~
n_ha
()
=
Convdet_simulation
{
tree
;
profiles
;
n_h0
;
n_ha
;
ne_s
;
gBGC
;
branch_scale
;
seed
:
int
;
}
let
convdet_simulation
?
branch_scale
?
ne_s
?
gBGC
?
seed
~
tree
~
profiles
~
n_h0
~
n_ha
()
=
Convdet_simulation
(
Convdet_simulation_param
.
make
?
branch_scale
?
ne_s
?
gBGC
?
seed
~
tree
~
profiles
~
n_h0
~
n_ha
()
)
let
convdet_simulation_of_param
p
=
Convdet_simulation
p
let
tree
~
branch_length_unit
:_
=
function
|
Bppseqgen_simulation
{
tree
;
_
}
|
Bppseqgen_mixed
{
tree
;
_
}
|
Convdet_simulation
{
tree
;
_
}
->
match
tree
with
|
NHX
w
->
w
|
Pair_tree
{
branch_length1
;
branch_length2
;
npairs
}
->
Simulator
.
pair_tree
~
branch_length1
~
branch_length2
~
npairs
tree_workflow
tree
let
seed
=
function
|
Bppseqgen_mixed
s
->
s
.
seed
...
...
@@ -115,10 +130,8 @@ let rec nucleotide_alignment = function
let
h0
=
nucleotide_alignment
(
Bppseqgen_simulation
{
hypothesis
=
H0
(
Fixed
ne_s
)
;
profiles
;
seed
;
nb_sites
=
n_h0
;
tree
})
in
let
ha
=
nucleotide_alignment
(
Bppseqgen_simulation
{
hypothesis
=
HaPC
(
Fixed
ne_s
)
;
profiles
;
seed
;
nb_sites
=
n_ha
;
tree
})
in
Utils
.
fasta_cappend
h0
ha
|
Convdet_simulation
{
n_h0
;
n_ha
;
profiles
;
ne_s
;
gBGC
;
branch_scale
;
seed
;
_
}
as
sim
->
let
tree
=
tree
~
branch_length_unit
:
`Nucleotide
sim
in
let
fitness_profiles
=
Workflow
.
input
profiles
in
Simulator
.
simulation
~
branch_scale
~
n_ha
~
n_h0
~
ne_s
~
gBGC
~
tree
~
seed
~
fitness_profiles
()
|
Convdet_simulation
param
->
Convdet_simulation_param
.
simulation
param
|>
Simulator
.
alignment_of_simulation
include
Detection_pipeline
.
Make
(
struct
...
...
@@ -194,6 +207,125 @@ let benchmark2 d =
tdg09
d
,
1
;
]
let
%
pworkflow
benchmark_statistics
simulation
~
labels
~
results
=
let
open
Phylogenetics
in
let
open
Codepitk
in
let
open
OCamlR_base
in
let
open
Codepitk
.
Simulator
.
Site_independent_mutsel
in
let
load_results
fn
col
=
let
open
Codepitk
in
let
df
=
Dataframe
.
from_file
fn
|>
Rresult
.
R
.
failwith_error_msg
in
match
Dataframe
.
get_col
df
col
with
|
Some
(
Floats
xs
)
->
Array
.
map
xs
~
f
:
Option
.
some
|
Some
(
Float_opts
xs
)
->
xs
|
Some
(
Ints
xs
)
->
Array
.
map
xs
~
f
:
(
fun
i
->
Some
(
Float
.
of_int
i
))
|
Some
(
Int_opts
xs
)
->
Array
.
map
xs
~
f
:
(
Option
.
map
~
f
:
Float
.
of_int
)
|
_
->
failwith
"expected a numeric column at pos 1"
in
let
sim
:
simulation
=
[
%
eval
simulation
]
in
let
result_paths
=
[
%
eval
Bistro
.
Workflow
.
path_list
(
List
.
map
~
f
:
fst
results
)]
in
let
results
=
List
.
map2_exn
result_paths
(
List
.
map
~
f
:
snd
results
)
~
f
:
load_results
in
let
labels
=
[
%
param
labels
]
in
let
n_h0
=
Array
.
length
sim
.
h0_params
in
let
n_ha
=
Array
.
length
sim
.
ha_params
in
let
nsites
=
n_h0
+
n_ha
in
let
columns
=
List
.
map2_exn
labels
results
~
f
:
(
fun
l
r
->
l
,
`Numeric
(
Numeric
.
of_array_opt
r
)
)
in
let
amino_acid_vector_of_codon_vector
xs
=
let
module
Codon
=
Codon
.
Universal_genetic_code
.
NS
in
Amino_acid
.
Vector
.
init
(
fun
aa
->
List
.
fold
Codon
.
all
~
init
:
0
.
~
f
:
(
fun
acc
c
->
if
Amino_acid
.
equal
aa
(
Codon
.
aa_of_codon
c
)
then
acc
+.
xs
.
Codon
.
%
(
c
)
else
acc
)
)
in
let
collect_profiles
sel
=
Array
.
append
sim
.
h0_params
sim
.
ha_params
|>
Array
.
map
~
f
:
(
fun
p
->
sel
p
|>
Mutsel
.
stationary_distribution
|>
amino_acid_vector_of_codon_vector
|>
Amino_acid
.
Vector
.
to_array
)
|>
Numeric
.
Matrix
.
of_arrays
in
let
ancestral_profiles
=
collect_profiles
fst
in
let
convergent_profiles
=
collect_profiles
snd
in
let
counts
seqs
i
=
let
t
=
Amino_acid
.
Table
.
init
(
fun
aa
->
let
aa
=
Amino_acid
.
to_char
aa
in
List
.
count
seqs
~
f
:
(
fun
s
->
Char
.
equal
(
s
:
Dna
.
t
:>
string
)
.
[
i
]
aa
)
)
in
(
t
:>
int
array
)
in
let
collect_counts
cond
=
let
species
=
Convergence_tree
.
leaves
sim
.
tree
in
let
seqs
=
List
.
map2_exn
sim
.
sequences
species
~
f
:
(
fun
s
(
_
,
cond_s
)
->
if
Poly
.
equal
cond
cond_s
then
Some
s
else
None
)
|>
List
.
filter_opt
in
Array
.
init
nsites
~
f
:
(
counts
seqs
)
|>
Integer
.
Matrix
.
of_arrays
in
let
ancestral_counts
=
collect_counts
`Ancestral
in
let
convergent_counts
=
collect_counts
`Convergent
in
let
make_classification_data
x
y
=
Prc
.
Classification_data
(
List
.
init
(
Array
.
length
x
)
~
f
:
(
fun
i
->
match
x
.
(
i
)
,
y
.
(
i
)
with
|
Some
x_i
,
Some
y_i
->
Some
(
x_i
,
y_i
)
|
None
,
_
|
_
,
None
->
None
)
|>
List
.
filter_opt
)
in
let
estimates
,
lower_bounds
,
upper_bounds
=
let
oracle
=
Array
.
init
nsites
~
f
:
(
fun
i
->
if
i
<
n_h0
then
Some
false
else
Some
true
)
in
List
.
map
results
~
f
:
(
fun
scores
->
let
Prc
.
Classification_data
xs
as
data
=
make_classification_data
scores
oracle
in
let
n
=
List
.
count
xs
~
f
:
snd
in
let
theta_hat
=
Prc
.
auc_trapezoidal_lt
data
in
let
lb
,
ub
=
Prc
.
logit_confidence_interval
~
alpha
:
0
.
05
~
theta_hat
~
n
in
theta_hat
,
lb
,
ub
)
|>
List
.
unzip3
in
let
open
OCamlR_base
in
let
auc_estimates
=
Dataframe
.
create
[
"method"
,
`Character
(
Character
.
of_list
labels
)
;
"estimate"
,
`Numeric
(
Numeric
.
of_list
estimates
)
;
"lower_bound"
,
`Numeric
(
Numeric
.
of_list
lower_bounds
)
;
"upper_bound"
,
`Numeric
(
Numeric
.
of_list
upper_bounds
)
;
]
in
let
oracle
=
Array
.(
append
(
map
sim
.
h0_profiles
~
f
:
(
Fn
.
const
false
))
(
map
sim
.
ha_profiles
~
f
:
(
Fn
.
const
true
))
)
|>
Logical
.
of_array
in
let
results
=
Dataframe
.
create
columns
in
let
data
=
List_
.
create
[
Some
"results"
,
Dataframe
.
to_sexp
results
;
Some
"oracle"
,
Logical
.
to_sexp
oracle
;
Some
"ancestral_profiles"
,
Numeric
.
Matrix
.
to_sexp
ancestral_profiles
;
Some
"convergent_profiles"
,
Numeric
.
Matrix
.
to_sexp
convergent_profiles
;
Some
"ancestral_counts"
,
Integer
.
Matrix
.
to_sexp
ancestral_counts
;
Some
"convergent_counts"
,
Integer
.
Matrix
.
to_sexp
convergent_counts
;
Some
"auc_estimates"
,
Dataframe
.
to_sexp
auc_estimates
;
]
in
saveRDS
~
file
:
[
%
dest
]
(
List_
.
to_sexp
data
)
(* param exploration for SMBE paper *)
(* type branch_scale_t = float *)
let
branch_scale_range
=
[
1
.;
3
.;
6
.;
9
.
]
...
...
lib/simulation_dataset.mli
View file @
74b927db
...
...
@@ -29,6 +29,24 @@ val bppseqgen_simulation :
seed
:
int
->
t
module
Convdet_simulation_param
:
sig
type
t
val
make
:
?
branch_scale
:
float
->
?
ne_s
:
float
*
float
->
?
gBGC
:
float
*
float
->
?
seed
:
int
->
tree
:
tree
->
profiles
:
string
->
n_h0
:
int
->
n_ha
:
int
->
unit
->
t
val
simulation
:
t
->
Codepitk
.
Simulator
.
Site_independent_mutsel
.
simulation
workflow
end
val
convdet_simulation
:
?
branch_scale
:
float
->
?
ne_s
:
float
*
float
->
...
...
@@ -41,6 +59,8 @@ val convdet_simulation :
unit
->
t
val
convdet_simulation_of_param
:
Convdet_simulation_param
.
t
->
t
include
Detection_pipeline
.
Query
with
type
t
:=
t
include
Detection_pipeline
.
S
with
type
query
:=
t
...
...
@@ -55,6 +75,12 @@ val benchmark : ?mode:[ `fast | `full ] -> t -> svg file
val
benchmark2
:
t
->
pdf
file
val
benchmark_statistics
:
Codepitk
.
Simulator
.
Site_independent_mutsel
.
simulation
workflow
->
labels
:
string
list
->
results
:
(
text
file
*
int
)
list
->
binary_file
file
(** stuff for gbgc SBME paper *)
type
gBGC_t
=
|
Global
of
float
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment