Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
VEBER Philippe
codepi
Commits
5fc3531f
Commit
5fc3531f
authored
Jul 31, 2018
by
Carine Rey
Browse files
use predefined pairs of profiles in function of their distance
parent
f22a7cf1
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
86 additions
and
59 deletions
+86
-59
etc/docker/python_basics/Dockerfile
etc/docker/python_basics/Dockerfile
+2
-1
lib/bppsuite.ml
lib/bppsuite.ml
+20
-24
lib/bppsuite.mli
lib/bppsuite.mli
+2
-1
lib/diffsel.ml
lib/diffsel.ml
+2
-2
lib/pipeline.ml
lib/pipeline.ml
+23
-18
lib/profile.ml
lib/profile.ml
+35
-11
lib/scripts/calc_simu_infos.py
lib/scripts/calc_simu_infos.py
+2
-2
No files found.
etc/docker/python_basics/Dockerfile
View file @
5fc3531f
...
...
@@ -28,4 +28,5 @@ RUN pip install docopt
RUN
pip
install
matplotlib
RUN
apt-get update
&&
\
apt-get
install
--no-install-recommends
-qy
\
python-tk
python-tk
\
xvfb
lib/bppsuite.ml
View file @
5fc3531f
...
...
@@ -46,11 +46,11 @@ let bppseqgen ?(descr="") ~nb_sites ~tree ~config : nucleotide_fasta workflow =
)
]
/
selector
[
"seq.fa"
]
let
conf_file_bppseqgen_multi_profiles
~
tree
~
profile_
w
~
ne_c
~
ne_a
~
config
~
nb_sites_per_profile
=
let
conf_file_bppseqgen_multi_profiles
~
tree
~
profile_
f
~
ne_c
~
ne_a
~
config
~
nb_sites_per_profile
=
seq
~
sep
:
"
\n
"
(
[
assign
"input.tree.file"
(
dep
tree
)
;
assign
"PROFILE_F"
(
dep
profile_
w
)
;
assign
"PROFILE_F"
(
dep
profile_
f
)
;
assign
"number_of_sites"
(
int
nb_sites_per_profile
)
;
assign
"NE_1"
(
float
ne_a
)
;
assign
"NE_C"
(
float
ne_c
)
;
...
...
@@ -59,31 +59,27 @@ let conf_file_bppseqgen_multi_profiles ~tree ~profile_w ~ne_c ~ne_a ~config ~nb_
@
config
)
let
bppseqgen_multi_profiles_script
~
config
~
nb_combis
~
out
~
profile_
w
=
let
bppseqgen_multi_profiles_script
~
config
~
out
~
profile_
c
=
let
vars
=
[
"FINAL_OUT"
,
ident
out
;
"PARAM"
,
config
;
"PROFILE_F"
,
dep
profile_w
;
"NB_COMBI_PROFILES"
,
int
nb_combis
;
"PROFILE_C"
,
dep
profile_c
;
]
in
bash_script
vars
{
|
start_i
=
1
end_i
=$
NB_COMBI_PROFILES
NB_COLS
=
`head
-
n
1
$
PROFILE_F
|
awk
'
{
print
NF
}
'
`
rm
-
f
$
FINAL_OUT
.
info
for
((
i
=
start_i
;
i
<=
end_i
;
i
++
))
i
=
0
while
read
-
r
line
do
echo
"i: $i"
shuf
-
i1
-$
NB_COLS
-
n2
-
o
random_num
COL_M1
=
`head
-
n
1
random_num
`
COL_M2
=
`tail
-
n
1
random_num
`
echo
-
e
$
COL_M1'
\
t'
$
COL_M2
>>
$
FINAL_OUT
.
info
bppseqgen
param
=$
PARAM
i
=$
i
COL_M1
=$
COL_M1
COL_M2
=$
COL_M2
output
.
sequence
.
file
=
out_int_
"$i"
.
fa
done
echo
"i: $i"
((
i
++
))
name
=
"$line"
COL_M1
=
`echo
$
line
|
cut
-
f
1
-
d
" "
`
COL_M2
=
`echo
$
line
|
cut
-
f
2
-
d
" "
`
bppseqgen
param
=$
PARAM
i
=$
i
COL_M1
=$
COL_M1
COL_M2
=$
COL_M2
output
.
sequence
.
file
=
out_int_
"$i"
.
fa
done
<
"$PROFILE_C"
cp
$
PROFILE_C
$
FINAL_OUT
.
info
#
horizontal
concatenation
of
fasta
catfasta2phyml
.
pl
-
f
out_int_
*
>
$
FINAL_OUT
...
...
@@ -91,7 +87,7 @@ let bppseqgen_multi_profiles_script ~config ~nb_combis ~out ~profile_w =
|
}
let
bppseqgen_multi_profiles
?
(
descr
=
""
)
~
profile_
w
~
nb_sites
~
tree
~
config
~
ne_c
~
ne_a
:
bppseqgen_multi_profiles
directory
workflow
=
let
bppseqgen_multi_profiles
?
(
descr
=
""
)
~
profile_
f
~
profile_c
~
nb_sites
~
tree
~
config
~
ne_c
~
ne_a
:
bppseqgen_multi_profiles
directory
workflow
=
let
nb_sites_per_profile
=
1
in
let
nb_combis
=
Pervasives
.(
nb_sites
/
nb_sites_per_profile
)
in
let
config_f
=
dest
//
"config.bpp"
in
...
...
@@ -102,8 +98,8 @@ let bppseqgen_multi_profiles ?(descr="") ~profile_w ~nb_sites ~tree ~config ~ne_
mkdir_p
dest
;
mkdir_p
tmp
;
cd
tmp
;
cmd
"cat"
~
stdout
:
config_f
[(
file_dump
(
conf_file_bppseqgen_multi_profiles
~
tree
~
profile_
w
~
config
~
ne_c
~
ne_a
~
nb_sites_per_profile
))];
cmd
"bash"
[(
file_dump
(
bppseqgen_multi_profiles_script
~
config
:
config_f
~
nb_combis
~
out
~
profile_
w
))];
cmd
"cat"
~
stdout
:
config_f
[(
file_dump
(
conf_file_bppseqgen_multi_profiles
~
tree
~
profile_
f
~
config
~
ne_c
~
ne_a
~
nb_sites_per_profile
))];
cmd
"bash"
[(
file_dump
(
bppseqgen_multi_profiles_script
~
config
:
config_f
~
out
~
profile_
c
))];
]
)
]
...
...
@@ -138,7 +134,7 @@ let conf_file_bppseqman_fa2phy ~fna =
seq
~
sep
:
"
\n
"
[
assign
"input.sequence.file"
(
dep
fna
)
;
assign
"output.sequence.file"
dest
;
assign
"output.sequence.format"
(
string
"Phylip"
)
;
assign
"output.sequence.format"
(
string
"Phylip
(order=interleaved, type=extended)
"
)
;
string
{
|
input
.
alignment
=
true
input
.
sequence
.
remove_stop_codons
=
no
input
.
sequence
.
sites_to_use
=
all
...
...
@@ -147,7 +143,7 @@ let conf_file_bppseqman_fa2phy ~fna =
]
let
fa2phy
~
(
fna
:
nucleotide_fasta
workflow
)
:
nucleotide_phylip
workflow
=
workflow
~
descr
:
"bppsuite.fa2phy"
[
workflow
~
descr
:
"bppsuite.fa2phy
_interleaved
"
[
cmd
"bppseqman"
~
env
[
assign
"param"
(
file_dump
(
conf_file_bppseqman_fa2phy
~
fna
))
;
]
...
...
lib/bppsuite.mli
View file @
5fc3531f
...
...
@@ -13,7 +13,8 @@ val bppseqgen :
val
bppseqgen_multi_profiles
:
?
descr
:
string
->
profile_w
:
_
workflow
->
profile_f
:
text_file
workflow
->
profile_c
:
text_file
workflow
->
nb_sites
:
int
->
tree
:
nhx
workflow
->
config
:
Bistro
.
Template
.
t
list
->
...
...
lib/diffsel.ml
View file @
5fc3531f
...
...
@@ -85,7 +85,7 @@ let diffsel ~(phy_n:nucleotide_phylip workflow) ~(tree: _ workflow) ~(w_every:in
opt
"-x"
seq
[
int
w_every
;
string
" "
;
int
n_cycles
];
ident
chainname
;
];
cmd
"bash"
[(
file_dump
(
diffsel_add_iterations_script
~
chainname
))];
cmd
"bash"
[(
file_dump
(
diffsel_add_iterations_script
~
chainname
))];
]
)
]
...
...
@@ -112,7 +112,7 @@ let check_conv run_diffsel : text_file directory workflow =
]
;
cmd
"cp"
[
string
"DiffselMCMCConvergenceAnalysis.html"
;
ident
out
]
;
cmd
"cp"
[
string
"new_iterations.txt"
;
ident
nb_new_iterations
]
]
]
)
]
...
...
lib/pipeline.ml
View file @
5fc3531f
...
...
@@ -47,7 +47,7 @@ let parse_input_data indir =
let
derive_from_model
~
model
~
input_tree
~
tree_dataset
~
tree_prefix
~
profile
~
preview
~
ns
=
let
model_prefix
=
Convergence_hypothesis
.
string_of_model
model
in
let
nb_sites
=
if
ns
=
0
then
(
if
preview
then
20
else
50
)
else
ns
in
let
nb_sites
=
ns
in
let
nodes
=
Tree_dataset
.
nodes
tree_dataset
model
in
let
tree
=
Tree_dataset
.
tree
tree_dataset
`Simulation
in
let
descr
=
"."
^
model_prefix
^
"."
^
tree_prefix
in
...
...
@@ -74,8 +74,9 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr
|
H0_SmallNeInBigNe
->
10
.
|
_
->
1
.
in
let
profile_w
=
profile
.
profile_w
in
let
run_fna
=
Bppsuite
.
bppseqgen_multi_profiles
~
descr
~
nb_sites
~
tree
~
config
:
config_p
~
profile_w
~
ne_c
~
ne_a
in
let
profile_f
=
profile
.
profile_f
in
let
profile_c
=
profile
.
profile_c
in
let
run_fna
=
Bppsuite
.
bppseqgen_multi_profiles
~
descr
~
nb_sites
~
tree
~
config
:
config_p
~
profile_f
~
profile_c
~
ne_c
~
ne_a
in
let
fna
=
Bppsuite
.
bppseqgen_multi_profiles_get_fa
run_fna
in
let
fna_infos
=
Some
(
Bppsuite
.
bppseqgen_multi_profiles_get_info
run_fna
)
in
...
...
@@ -128,8 +129,10 @@ let derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~ns ~no_Ne ~n
)
in
let
ready_dataset_H0
=
(
derive_from_model
~
model
:
H0
~
input_tree
~
tree_dataset
~
tree_prefix
~
profile
~
preview
~
ns
)
.
dataset
in
let
ready_dataset_HaPCOC
=
(
derive_from_model
~
model
:
HaPCOC
~
input_tree
~
tree_dataset
~
tree_prefix
~
profile
~
preview
~
ns
)
.
dataset
in
let
concat_H0Ha
=
{
Dataset
.
model_prefix
=
"H0+HaPCOC"
;
tree_prefix
;
is_real
=
false
;
dataset
=
Ready_dataset
.
paste
ready_dataset_H0
ready_dataset_HaPCOC
}
in
let
dataset_concat_hypos
=
if
use_concat
then
[
concat_H0Ha
;]
else
[]
in
let
ready_dataset_HaPC
=
(
derive_from_model
~
model
:
HaPC
~
input_tree
~
tree_dataset
~
tree_prefix
~
profile
~
preview
~
ns
)
.
dataset
in
let
concat_H0HaPCOC
=
{
Dataset
.
model_prefix
=
"H0+HaPCOC"
;
tree_prefix
;
is_real
=
false
;
dataset
=
Ready_dataset
.
paste
ready_dataset_H0
ready_dataset_HaPCOC
}
in
let
concat_H0HaPC
=
{
Dataset
.
model_prefix
=
"H0+HaPC"
;
tree_prefix
;
is_real
=
false
;
dataset
=
Ready_dataset
.
paste
ready_dataset_H0
ready_dataset_HaPC
}
in
let
dataset_concat_hypos
=
if
use_concat
then
[
concat_H0HaPC
;]
else
[]
in
List
.
concat
[
dataset_per_hypo
;
dataset_concat_hypos
]
let
derive_sim
~
tree_dir
~
trees
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
=
...
...
@@ -269,7 +272,7 @@ let derive_profile ?(indir = "") ?(ns = 0) ~preview ~fast_mode ~no_Ne ~ne_test ~
let
dataset_l
=
derive_sim
~
tree_dir
~
trees
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
in
let
dataset_results_l
=
if
only_simu
then
if
only_simu
then
[]
else
derive_det
~
dataset_l
~
preview
~
fast_mode
...
...
@@ -288,8 +291,13 @@ let derive_profile ?(indir = "") ?(ns = 0) ~preview ~fast_mode ~no_Ne ~ne_test ~
let
all_post_analyses_per_tree
=
List
.
map
repo_and_post_analyses_per_tree_simu
~
f
:
(
fun
(
r
,
p
)
->
p
)
in
let
profile_prefix
=
profile
.
profile_n
in
let
repo_post_analyses_all_trees
=
Post_analyses
.
repo_post_analyses_all_trees_of_all_post_analyses_per_tree
~
all_post_analyses_per_tree
~
profile_prefix
in
let
repo
=
repo_of_post_analyses_simu
@
all_repo_per_tree_simu
@
repo_post_analyses_all_trees
in
Repo
.
shift
profile_prefix
repo
let
repo_post_analyses_all_trees
=
if
only_simu
then
[]
else
repo_post_analyses_all_trees
in
let
repo
=
repo_of_post_analyses_simu
@
all_repo_per_tree_simu
@
repo_post_analyses_all_trees
in
Repo
.
shift
profile_prefix
repo
let
logger
=
...
...
@@ -306,9 +314,8 @@ let detection_main ~outdir ~indir ?(np = 2) ?(mem = 2) ~preview ~fast_mode () =
Repo
.
build
~
outdir
~
np
~
mem
:
(
`GB
mem
)
~
logger
repo
let
simulation_main
~
outdir
?
(
ns
=
0
)
?
(
np
=
2
)
?
(
mem
=
2
)
~
tree_dir
~
profile_fn
~
preview
~
use_concat
~
no_Ne
~
no_HaPC
()
=
let
profile_w
=
input
profile_fn
in
let
profile_n
=
Filename
.
chop_extension
profile_fn
in
let
profile
=
{
profile_w
;
profile_n
}
in
let
nb_sites
=
if
ns
=
0
then
(
if
preview
then
20
else
50
)
else
ns
in
let
profile
=
Profile
.
profile_l_of_splitted_profile
~
nb_cat
:
1
~
nb_sites
profile_fn
in
let
trees
=
Array
.
to_list
@@
Sys
.
readdir
tree_dir
in
let
dataset_l
=
derive_sim
~
tree_dir
~
trees
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
:
false
in
let
repo
=
Dataset
.
repo
dataset_l
~
preview
in
...
...
@@ -316,16 +323,14 @@ let simulation_main ~outdir ?(ns = 0) ?(np = 2) ?(mem = 2) ~tree_dir ~profile_fn
let
validation_main
~
outdir
?
(
indir
=
""
)
?
(
ns
=
0
)
?
(
np
=
2
)
?
(
mem
=
2
)
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile_fn
~
use_concat
~
only_simu
()
=
(* simulated trees *)
let
profile_l
=
Profile
.
profile_l_of_splitted_profile
(
Profile
.
split_profile
(
input
profile_fn
))
in
let
sim_repo_l
=
List
.
map
profile_l
~
f
:
(
fun
profile
->
derive_profile
~
indir
~
ns
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile
~
use_concat
~
only_simu
()
)
|>
List
.
concat
in
let
nb_sites
=
if
ns
=
0
then
(
if
preview
then
20
else
50
)
else
ns
in
let
profile
=
Profile
.
profile_l_of_splitted_profile
~
nb_cat
:
3
~
nb_sites
profile_fn
in
let
sim_repo_l
=
derive_profile
~
indir
~
ns
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile
~
use_concat
~
only_simu
()
in
(* real trees *)
let
indir_dataset_l
=
if
indir
=
""
then
[]
else
parse_input_data
indir
in
let
dataset_l
=
indir_dataset_l
in
let
dataset_results_l
=
if
only_simu
then
if
only_simu
then
[]
else
derive_det
~
dataset_l
~
preview
~
fast_mode
...
...
@@ -412,7 +417,7 @@ let validation_command =
and
use_concat
=
flag
"--use-concat"
no_arg
~
doc
:
" Use concatenation H0+Ha_pcoc"
and
ns
=
flag
"--ns"
(
optional
int
)
~
doc
:
"INT Number of sites to simulate"
flag
"--ns"
(
optional
int
)
~
doc
:
"INT Number of sites to simulate
(WARNING: will be multiplicated per 3)
"
and
np
=
flag
"--np"
(
optional
int
)
~
doc
:
"INT Number of available processors"
and
mem
=
...
...
lib/profile.ml
View file @
5fc3531f
...
...
@@ -4,12 +4,13 @@ open Bistro.EDSL
open
File_formats
type
profile
=
{
profile_w
:
text_file
workflow
;
profile_n
:
string
;
profile_f
:
text_file
workflow
;
profile_c
:
text_file
workflow
;
profile_n
:
string
;
}
let
split_profile
profile_f
n
:
text_file
directory
workflow
=
let
split_profile
~
nb_sites
~
dist_bins
profile_f
:
text_file
directory
workflow
=
let
env
=
docker_image
~
account
:
"carinerey"
~
name
:
"python_basics"
~
tag
:
"07252018"
()
in
let
package
=
tmp
//
"diffsel_script_utils.py"
in
let
script
=
tmp
//
"generate_pairs.py"
in
...
...
@@ -20,6 +21,8 @@ let split_profile profile_fn : text_file directory workflow =
mkdir_p
tmp
;
mkdir_p
dest
;
cd
tmp
;
cmd
"("
[
string
"Xvfb :1 -screen 0 1024x768x16 & )"
];
cmd
"export"
[
string
"DISPLAY=:1"
];
cmd
"cp"
[
file_dump
(
string
Scripts
.
diffsel_script_utils
)
;
package
]
;
cmd
"cp"
[
file_dump
(
string
Scripts
.
generate_pairs
)
;
script
]
;
...
...
@@ -28,18 +31,39 @@ let split_profile profile_fn : text_file directory workflow =
cmd
"python"
[
string
"generate_pairs.py"
;
opt
"-o"
ident
prefix
;
dep
profile_fn
;
opt
"-s"
int
nb_sites
;
opt
"-b"
string
dist_bins
;
dep
profile_f
;
]
]
)
]
let
profile_l_of_splitted_profile
splitted_profile
=
let
p0
=
splitted_profile
/
selector
[
"profile_0.tsv"
]
in
let
p1
=
splitted_profile
/
selector
[
"profile_1.tsv"
]
in
let
p2
=
splitted_profile
/
selector
[
"profile_2.tsv"
]
in
[{
profile_w
=
p0
;
profile_n
=
"p0"
};
{
profile_w
=
p1
;
profile_n
=
"p1"
};
{
profile_w
=
p2
;
profile_n
=
"p2"
};
let
cat_file
~
(
f_l
:
text_file
workflow
list
)
:
text_file
workflow
=
workflow
~
descr
:
"cat.text_files"
[
cmd
"cat"
~
stdout
:
dest
(
List
.
concat
[
List
.
map
f_l
~
f
:
(
fun
f
->
dep
f
)
;
])
]
let
profile_l_of_splitted_profile
~
nb_cat
~
nb_sites
profile_fn
=
let
profile_f
=
input
profile_fn
in
let
prefix
=
Filename
.
chop_extension
profile_fn
in
let
dist_bins
=
match
nb_cat
with
|
3
->
"[0.01,0.4],[0.4,0.6],[0.6,2]"
|
1
->
"[0.01,2]"
|
_
->
failwith
{
|
nbcat
must
be
1
or
3
|
}
in
let
splitted_profile
=
split_profile
~
nb_sites
~
dist_bins
profile_f
in
match
nb_cat
with
|
3
->
(
let
p0
=
splitted_profile
/
selector
[
"profile_0.tsv"
]
in
let
p1
=
splitted_profile
/
selector
[
"profile_1.tsv"
]
in
let
p2
=
splitted_profile
/
selector
[
"profile_2.tsv"
]
in
{
profile_c
=
cat_file
[
p0
;
p1
;
p2
]
;
profile_n
=
prefix
^
"_3categories"
;
profile_f
};
)
|
1
->
(
let
p0
=
splitted_profile
/
selector
[
"profile_0.tsv"
]
in
{
profile_c
=
p0
;
profile_n
=
prefix
;
profile_f
})
|
_
->
failwith
{
|
nbcat
must
be
1
or
3
|
}
lib/scripts/calc_simu_infos.py
View file @
5fc3531f
...
...
@@ -205,7 +205,7 @@ df_final = df_final[["Sites","NbConvAA","NbNonConvAA","NbOnlyConvAA","NbOnlyNonC
#===================================================================================================
if
AliInfoFile
:
try
:
profil_df
=
pd
.
read_csv
(
AliInfoFile
.
name
,
sep
=
"
\t
"
,
names
=
[
"Pf_anc"
,
"Pf_conv"
])
profil_df
=
pd
.
read_csv
(
AliInfoFile
.
name
,
sep
=
"
\t
"
,
names
=
[
"Pf_anc"
,
"Pf_conv"
,
"dist"
])
except
Exception
as
exc
:
logger
.
error
(
str
(
exc
))
sys
.
exit
(
1
)
...
...
@@ -219,7 +219,7 @@ if AliInfoFile:
# reorder col names
profil_df
=
profil_df
[[
"Sites"
,
"Pf_anc"
,
"Pf_conv"
]]
profil_df
=
profil_df
[[
"Sites"
,
"Pf_anc"
,
"Pf_conv"
,
"dist"
]]
logger
.
info
(
"AliInfoFile (%s) ok after checking"
,
AliInfoFile
.
name
)
df_final
=
pd
.
merge
(
df_final
,
profil_df
,
on
=
"Sites"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment