Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
VEBER Philippe
codepi
Commits
2bcb3e92
Commit
2bcb3e92
authored
Aug 31, 2018
by
Carine Rey
Browse files
add a new couple with indels
parent
aed9c6ab
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
59 additions
and
18 deletions
+59
-18
lib/dataset.ml
lib/dataset.ml
+8
-0
lib/pipeline.ml
lib/pipeline.ml
+27
-15
lib/ready_dataset.ml
lib/ready_dataset.ml
+21
-0
lib/scripts/calc_simu_infos.py
lib/scripts/calc_simu_infos.py
+3
-3
No files found.
lib/dataset.ml
View file @
2bcb3e92
...
...
@@ -23,3 +23,11 @@ let repo ~preview dataset_l =
|>
Repo
.
shift
"Results_per_hypothesis"
)
|>
List
.
concat
let
add_indels_to_dataset
d
=
let
p
=
0
.
33
in
let
model_prefix
=
sprintf
"%s_0.33_i"
d
.
model_prefix
in
let
tree_prefix
=
d
.
tree_prefix
in
let
is_real
=
d
.
is_real
in
let
dataset
=
Ready_dataset
.
add_indels_to_ready_dataset
~
p
:
0
.
33
d
.
dataset
in
{
model_prefix
;
tree_prefix
;
is_real
;
dataset
}
lib/pipeline.ml
View file @
2bcb3e92
...
...
@@ -120,7 +120,7 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr
let
ready_dataset
=
{
Ready_dataset
.
input_tree
=
input_tree
;
tree_dataset
;
fna
;
faa
;
fna_infos
}
in
{
Dataset
.
model_prefix
;
is_real
=
false
;
tree_prefix
;
dataset
=
ready_dataset
}
let
derive_from_tree
~
tree_dir
~
tree
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
=
let
derive_from_tree
~
tree_dir
~
tree
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
~
add_indels
=
let
tree_prefix
=
Filename
.
chop_extension
tree
in
let
input_tree
=
input
(
Filename
.
concat
tree_dir
tree
)
in
let
tree_dataset
=
Tree_dataset
.
prepare
~
descr
:
(
"simulated_data."
^
tree_prefix
)
input_tree
in
...
...
@@ -160,17 +160,25 @@ let derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~ns ~no_Ne ~n
let
dataset_per_hypo
=
List
.
map
models
~
f
:
(
fun
model
->
derive_from_model
~
model
~
input_tree
~
tree_dataset
~
tree_prefix
~
profile
~
preview
~
ns
)
in
let
ready_dataset_H0
=
(
derive_from_model
~
model
:
H0_NeG5
~
input_tree
~
tree_dataset
~
tree_prefix
~
profile
~
preview
~
ns
)
.
dataset
in
let
dataset_H0
=
derive_from_model
~
model
:
H0_NeG5
~
input_tree
~
tree_dataset
~
tree_prefix
~
profile
~
preview
~
ns
in
let
ready_dataset_H0
=
dataset_H0
.
dataset
in
let
indel_H0
=
Dataset
.
add_indels_to_dataset
dataset_H0
in
let
dataset_HaPC
=
derive_from_model
~
model
:
HaPC_NeG5
~
input_tree
~
tree_dataset
~
tree_prefix
~
profile
~
preview
~
ns
in
let
ready_dataset_HaPC
=
dataset_HaPC
.
dataset
in
let
indel_HaPC
=
Dataset
.
add_indels_to_dataset
dataset_HaPC
in
let
ready_dataset_HaPCOC
=
(
derive_from_model
~
model
:
HaPCOC
~
input_tree
~
tree_dataset
~
tree_prefix
~
profile
~
preview
~
ns
)
.
dataset
in
let
ready_dataset_HaPC
=
(
derive_from_model
~
model
:
HaPC_NeG5
~
input_tree
~
tree_dataset
~
tree_prefix
~
profile
~
preview
~
ns
)
.
dataset
in
let
_concat_H0HaPCOC
=
{
Dataset
.
model_prefix
=
"H0_NeG5+HaPCOC"
;
tree_prefix
;
is_real
=
false
;
dataset
=
Ready_dataset
.
paste
ready_dataset_H0
ready_dataset_HaPCOC
}
in
let
concat_H0HaPC
=
{
Dataset
.
model_prefix
=
"H0_NeG5+HaPC_NeG5"
;
tree_prefix
;
is_real
=
false
;
dataset
=
Ready_dataset
.
paste
ready_dataset_H0
ready_dataset_HaPC
}
in
let
dataset_concat_hypos
=
if
use_concat
then
[
concat_H0HaPC
;]
else
[]
in
List
.
concat
[
dataset_per_hypo
;
dataset_concat_hypos
]
let
dataset_with_indels
=
if
add_indels
then
[
indel_H0
;
indel_HaPC
;]
else
[]
in
List
.
concat
[
dataset_per_hypo
;
dataset_concat_hypos
;
dataset_with_indels
]
let
derive_sim
~
tree_dir
~
trees
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
=
let
derive_sim
~
tree_dir
~
trees
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
~
add_indels
=
List
.
map
trees
~
f
:
(
fun
tree
->
derive_from_tree
~
tree_dir
~
tree
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
)
derive_from_tree
~
tree_dir
~
tree
~
profile
~
preview
~
use_concat
~
add_indels
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
)
|>
List
.
concat
...
...
@@ -295,16 +303,16 @@ let derive_det ~dataset_l ~preview ~fast_mode =
List
.
map
dataset_l
~
f
:
(
fun
dataset
->
derive_from_dataset
~
preview
~
dataset
~
fast_mode
)
let
derive_profile
?
(
indir
=
""
)
?
(
ns
=
0
)
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile
~
use_concat
~
only_simu
~
seed
()
=
let
derive_profile
?
(
indir
=
""
)
?
(
ns
=
0
)
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile
~
use_concat
~
add_indels
~
only_simu
~
seed
()
=
let
trees
=
Array
.
to_list
@@
Sys
.
readdir
tree_dir
in
let
simu_dataset_l
=
derive_sim
~
tree_dir
~
trees
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
in
let
simu_dataset_l
=
derive_sim
~
tree_dir
~
trees
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
add_indels
~
ne_test
in
let
post_analyses_simu
=
Post_analyses
.
post_analyses_simu_of_simu_dataset_l
~
simu_dataset_l
in
let
repo_of_post_analyses_simu
=
Post_analyses
.
repo_of_post_analyses_simu
~
post_analyses_simu
in
let
repo_and_post_analyses_per_tree_simu
=
List
.
map
trees
~
f
:
(
fun
tree
->
(*to keep together all models per tree*)
let
trees
=
[
tree
]
in
let
tree_prefix
=
Filename
.
chop_extension
tree
in
let
dataset_l
=
derive_sim
~
tree_dir
~
trees
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
in
derive_sim
~
tree_dir
~
trees
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
~
add_indels
in
let
dataset_results_l
=
if
only_simu
then
[]
...
...
@@ -359,21 +367,21 @@ let detection_main ~outdir ~indir ?(np = 2) ?(mem = 2) ~preview ~fast_mode () =
let
repo
=
repo_of_dataset_results_l
~
dataset_results_l
in
Repo
.
build
~
outdir
~
np
~
mem
:
(
`GB
mem
)
~
logger
repo
let
simulation_main
~
outdir
?
(
ns
=
0
)
?
(
np
=
2
)
?
(
mem
=
2
)
~
tree_dir
~
profile_fn
~
preview
~
use_concat
~
no_Ne
~
no_HaPC
~
seed
()
=
let
simulation_main
~
outdir
?
(
ns
=
0
)
?
(
np
=
2
)
?
(
mem
=
2
)
~
tree_dir
~
profile_fn
~
preview
~
use_concat
~
add_indels
~
no_Ne
~
no_HaPC
~
seed
()
=
let
nb_sites
=
if
ns
=
0
then
(
if
preview
then
20
else
50
)
else
ns
in
let
profile
=
Profile
.
profile_l_of_splitted_profile
~
nb_cat
:
1
~
nb_sites
profile_fn
in
let
trees
=
Array
.
to_list
@@
Sys
.
readdir
tree_dir
in
let
dataset_l
=
derive_sim
~
tree_dir
~
trees
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
:
false
in
let
dataset_l
=
derive_sim
~
tree_dir
~
trees
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
add_indels
~
ne_test
:
false
in
let
repo
=
Dataset
.
repo
dataset_l
~
preview
in
Repo
.
build
~
outdir
~
np
~
mem
:
(
`GB
mem
)
~
logger
repo
let
validation_main
~
outdir
?
(
indir
=
""
)
?
(
ns
=
0
)
?
(
np
=
2
)
?
(
mem
=
2
)
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile_fn
~
use_concat
~
only_simu
?
(
seed
=
Random
.
int
Int
.
max_value
)
()
=
let
validation_main
~
outdir
?
(
indir
=
""
)
?
(
ns
=
0
)
?
(
np
=
2
)
?
(
mem
=
2
)
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile_fn
~
use_concat
~
add_indels
~
only_simu
?
(
seed
=
Random
.
int
Int
.
max_value
)
()
=
printf
"Global seed: %i
\n
"
seed
;
(* simulated trees *)
Random
.
init
seed
;
let
nb_sites
=
if
ns
=
0
then
(
if
preview
then
20
else
50
)
else
ns
in
let
profile
=
Profile
.
profile_l_of_splitted_profile
~
nb_cat
:
3
~
nb_sites
profile_fn
in
let
sim_repo_l
=
derive_profile
~
indir
~
ns
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile
~
use_concat
~
only_simu
~
seed
()
in
let
sim_repo_l
=
derive_profile
~
indir
~
ns
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile
~
use_concat
~
add_indels
~
only_simu
~
seed
()
in
(* real trees *)
let
indir_dataset_l
=
if
indir
=
""
then
[]
else
parse_input_data
indir
in
let
dataset_l
=
indir_dataset_l
in
...
...
@@ -413,6 +421,8 @@ let simulation_command =
flag
"--mem"
(
optional
int
)
~
doc
:
"INT Available memory (in GB)"
and
use_concat
=
flag
"--use-concat"
no_arg
~
doc
:
" Use concatenation H0+Ha_pcoc"
and
add_indels
=
flag
"--add-indels"
no_arg
~
doc
:
" add indels in H*NeG5"
and
tree_dir
=
flag
"--tree-dir"
(
required
string
)
~
doc
:
"PATH Path to tree directory"
and
profile_fn
=
...
...
@@ -420,7 +430,7 @@ let simulation_command =
and
seed
=
flag
"--seed"
(
optional
int
)
~
doc
:
"INT Global seed"
in
simulation_main
~
outdir
?
ns
?
np
?
mem
~
no_Ne
~
no_HaPC
~
tree_dir
~
profile_fn
~
preview
~
use_concat
~
seed
simulation_main
~
outdir
?
ns
?
np
?
mem
~
no_Ne
~
no_HaPC
~
tree_dir
~
profile_fn
~
preview
~
use_concat
~
add_indels
~
seed
]
let
detection_command
=
...
...
@@ -468,6 +478,8 @@ let validation_command =
flag
"--only-simu"
no_arg
~
doc
:
" mode only simulation"
and
use_concat
=
flag
"--use-concat"
no_arg
~
doc
:
" Use concatenation H0+Ha_pcoc"
and
add_indels
=
flag
"--add-indels"
no_arg
~
doc
:
" add indels in H*NeG5"
and
ns
=
flag
"--ns"
(
optional
int
)
~
doc
:
"INT Number of sites to simulate (WARNING: will be multiplicated per 3)"
and
np
=
...
...
@@ -481,5 +493,5 @@ let validation_command =
and
seed
=
flag
"--seed"
(
optional
int
)
~
doc
:
"INT Global seed"
in
validation_main
~
outdir
?
indir
?
ns
?
np
?
mem
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile_fn
~
use_concat
~
only_simu
?
seed
validation_main
~
outdir
?
indir
?
ns
?
np
?
mem
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile_fn
~
use_concat
~
only_simu
~
add_indels
?
seed
]
lib/ready_dataset.ml
View file @
2bcb3e92
...
...
@@ -68,3 +68,24 @@ let paste d1 d2 =
let
fna_infos
=
Some
(
paste_fna_infos
~
fna_infos_l
)
in
let
ready_dataset
=
of_raw
{
Raw_dataset
.
input_tree
=
r_d1
.
input_tree
;
fna
;
fna_infos
}
in
ready_dataset
let
add_indels_to_fna
~
(
p
:
float
)
(
fna
:
nucleotide_fasta
workflow
)
:
nucleotide_fasta
workflow
=
let
env
=
docker_image
~
account
:
"carinerey"
~
name
:
"python_basics"
~
tag
:
"07252018"
()
in
workflow
~
descr
:
(
"add_indels"
)
[
cmd
"python"
~
env
[
file_dump
(
string
Scripts
.
add_indels
)
;
opt
"-p"
float
p
;
opt
"-a"
dep
fna
;
opt
"-o"
ident
dest
;
string
"-c"
;
]
]
let
add_indels_to_ready_dataset
~
p
d
=
let
r_d
=
to_raw
d
in
let
fna
=
add_indels_to_fna
~
p
r_d
.
fna
in
let
fna_infos
=
r_d
.
fna_infos
in
let
ready_dataset
=
of_raw
{
Raw_dataset
.
input_tree
=
r_d
.
input_tree
;
fna
;
fna_infos
}
in
ready_dataset
lib/scripts/calc_simu_infos.py
View file @
2bcb3e92
...
...
@@ -153,7 +153,7 @@ df_leaves_l = []
n_sites
=
ali
.
get_alignment_length
()
Sites
=
[
i
+
1
for
i
in
range
(
n_sites
)]
AA
=
[
"A"
,
"R"
,
"N"
,
"D"
,
"C"
,
"E"
,
"Q"
,
"G"
,
"H"
,
"I"
,
"L"
,
"K"
,
"M"
,
"F"
,
"P"
,
"S"
,
"T"
,
"W"
,
"Y"
,
"V"
]
AA
=
[
"A"
,
"R"
,
"N"
,
"D"
,
"C"
,
"E"
,
"Q"
,
"G"
,
"H"
,
"I"
,
"L"
,
"K"
,
"M"
,
"F"
,
"P"
,
"S"
,
"T"
,
"W"
,
"Y"
,
"V"
,
"-"
]
#init_AA_dict = {aa:0 for aa in AA}
Conv_AA_per_site_l
=
[{
aa
:
0
for
aa
in
AA
}
for
i
in
range
(
n_sites
)]
NonConv_AA_per_site_l
=
[{
aa
:
0
for
aa
in
AA
}
for
i
in
range
(
n_sites
)]
...
...
@@ -178,10 +178,10 @@ OnlyNonConv_ConvAA = [0] * n_sites #B = nb AA only in (NonConvLeaves AND ConvLea
for
i
in
range
(
n_sites
):
logger
.
info
(
i
)
ConvAA
=
set
([
aa
for
aa
,
v
in
Conv_AA_per_site_l
[
i
].
items
()
if
v
>
0
])
ConvAA
=
set
([
aa
for
aa
,
v
in
Conv_AA_per_site_l
[
i
].
items
()
if
v
>
0
and
aa
!=
"-"
])
logger
.
debug
(
Conv_AA_per_site_l
[
i
])
logger
.
debug
(
ConvAA
)
NonConvAA
=
set
([
aa
for
aa
,
v
in
NonConv_AA_per_site_l
[
i
].
items
()
if
v
>
0
])
NonConvAA
=
set
([
aa
for
aa
,
v
in
NonConv_AA_per_site_l
[
i
].
items
()
if
v
>
0
and
aa
!=
"-"
])
logger
.
debug
(
NonConv_AA_per_site_l
[
i
])
logger
.
debug
(
NonConvAA
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment