Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Docker-in-Docker (DinD) capabilities of public runners deactivated.
More info
Open sidebar
VEBER Philippe
codepi
Commits
961383a1
Commit
961383a1
authored
Jun 22, 2018
by
Carine Rey
Browse files
add empirical aa freqencies in simulations
parent
2d3aa01b
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
123 additions
and
31 deletions
+123
-31
lib/bppsuite.ml
lib/bppsuite.ml
+19
-10
lib/bppsuite.mli
lib/bppsuite.mli
+2
-0
lib/convergence_detection.ml
lib/convergence_detection.ml
+2
-1
lib/convergence_detection.mli
lib/convergence_detection.mli
+1
-0
lib/convergence_hypothesis.ml
lib/convergence_hypothesis.ml
+30
-0
lib/pipeline.ml
lib/pipeline.ml
+20
-16
lib/scripts/parse_input_tree.py
lib/scripts/parse_input_tree.py
+49
-4
No files found.
lib/bppsuite.ml
View file @
961383a1
open
Bistro
.
EDSL
open
Bistro
.
Std
open
Core
open
Bistro
.
Std
open
Bistro
.
EDSL
open
Bistro_bioinfo
.
Std
open
File_formats
...
...
@@ -10,22 +11,30 @@ let assign k v =
seq
~
sep
:
"="
[
string
k
;
v
]
let
conf_file_bppseqgen
~
tree
~
nb_sites
~
config
=
let
conf_file_bppseqgen
~
tree
~
out
~
nb_sites
~
config
=
seq
~
sep
:
"
\n
"
(
[
assign
"input.tree.file"
(
dep
tree
)
;
assign
"output.sequence.file"
des
t
;
assign
"output.sequence.file"
ou
t
;
assign
"number_of_sites"
(
int
nb_sites
)
;
]
@
config
)
let
bppseqgen
~
nb_sites
~
tree
~
config
:
nucleotide_fasta
workflow
=
workflow
~
descr
:
"bppsuite.bppseqgen"
[
cmd
"bppseqgen"
~
env
[
assign
"param"
(
file_dump
(
conf_file_bppseqgen
~
tree
~
nb_sites
~
config
))
;
]
]
let
bppseqgen
?
(
descr
=
""
)
?
profile_f
~
nb_sites
~
tree
~
config
:
nucleotide_fasta
workflow
=
let
config_f
=
dest
//
"config.bpp"
in
let
out
=
dest
//
"seq.fa"
in
workflow
~
descr
:
(
"bppsuite.bppseqgen"
^
descr
)
[
docker
env
(
and_list
[
mkdir_p
dest
;
cmd
"cat"
~
stdout
:
config_f
[(
file_dump
(
conf_file_bppseqgen
~
tree
~
out
~
nb_sites
~
config
))];
cmd
"bppseqgen"
~
env
[
assign
"param"
config_f
;
]
]
)
]
/
selector
[
"seq.fa"
]
let
conf_file_bppseqman_fna2faa
~
fna
=
...
...
lib/bppsuite.mli
View file @
961383a1
...
...
@@ -3,6 +3,8 @@ open Bistro_bioinfo.Std
open
File_formats
val
bppseqgen
:
?
descr
:
string
->
?
profile_f
:
_
workflow
->
nb_sites
:
int
->
tree
:
nhx
workflow
->
config
:
Bistro
.
Template
.
t
list
->
...
...
lib/convergence_detection.ml
View file @
961383a1
...
...
@@ -11,7 +11,7 @@ type det_out =
|
Pcoc_out
|
Diffsel_out
let
pcoc
?
gamma
~
(
faa
:
aminoacid_fasta
workflow
)
~
(
tree
:_
workflow
)
:
(*`pcoc*)
det_out
directory
workflow
=
let
pcoc
?
plot_complete
?
gamma
~
(
faa
:
aminoacid_fasta
workflow
)
~
(
tree
:_
workflow
)
:
(*`pcoc*)
det_out
directory
workflow
=
let
env
=
docker_image
~
account
:
"carinerey"
~
name
:
"pcoc"
~
tag
:
"06212018"
()
in
workflow
~
descr
:
"convergence_detection.pcoc"
[
cmd
"pcoc_det.py"
~
env
[
...
...
@@ -20,6 +20,7 @@ let pcoc ?gamma ~(faa:aminoacid_fasta workflow) ~(tree:_ workflow) : (*`pcoc*)
opt
"-aa"
dep
faa
;
opt
"-o"
ident
dest
;
option
(
flag
string
"--gamma"
)
gamma
;
option
(
flag
string
"--plot --plot_complete_ali"
)
plot_complete
;
]
]
...
...
lib/convergence_detection.mli
View file @
961383a1
...
...
@@ -12,6 +12,7 @@ type det_out =
|
Diffsel_out
val
pcoc
:
?
plot_complete
:
bool
->
?
gamma
:
bool
->
faa
:
aminoacid_fasta
workflow
->
tree
:
_
workflow
->
...
...
lib/convergence_hypothesis.ml
View file @
961383a1
...
...
@@ -10,6 +10,9 @@ let string_of_model m = match m with
|
H0
->
"H0"
|
Ha
->
"Ha"
let
assign
k
v
=
seq
~
sep
:
"="
[
string
k
;
v
]
let
bpp_config_base
=
{
|
alphabet
=
Codon
(
letter
=
DNA
)
genetic_code
=
Standard
...
...
@@ -38,3 +41,30 @@ let bpp_config nodes hyp = [
|
Ha
->
bpp_config_Ha
)
;
]
let
bpp_config_H0_F
~
profile_f
=
seq
~
sep
:
"
\n
"
[
assign
"COL_M1"
(
int
2
)
;
(* string {|nonhomogeneous.number_of_models = 1 |} ; Mv in insert nodes*)
seq
[
string
"model1=Codon_AAFit(model=K80, fitness=FromModel(model=JTT92+F(frequencies=Empirical(file="
;
dep
profile_f
;
string
", col=$(COL_M1)))))"
]
;
]
let
bpp_config_Ha_F
~
profile_f
=
seq
~
sep
:
"
\n
"
[
(* string {|nonhomogeneous.number_of_models = 3 or 2 |} ; Mv in insert nodes*)
assign
"COL_M1"
(
int
2
)
;
assign
"COL_M2"
(
int
85
)
;
assign
"modelT"
(
string
"model2"
)
;
assign
"modelC"
(
string
"model3"
)
;
seq
[
string
"model1=Codon_AAFit(model=K80, fitness=FromModel(model=JTT92+F(frequencies=Empirical(file="
;
dep
profile_f
;
string
", col=$(COL_M1)))))"
]
;
seq
[
string
"$(modelT)=OneChange(model=Codon_AAFit(model=K80, fitness=FromModel(model=JTT92+F(frequencies=Empirical(file="
;
dep
profile_f
;
string
", col=$(COL_M2))))))"
]
;
seq
[
string
"$(modelC)=Codon_AAFit(model=K80, fitness=FromModel(model=JTT92+F(frequencies=Empirical(file="
;
dep
profile_f
;
string
", col=$(COL_M2)))))"
]
;
]
let
bpp_config_F
nodes
hyp
profile_f
=
[
string
bpp_config_base
;
insert
nodes
;
match
hyp
with
|
H0
->
bpp_config_H0_F
~
profile_f
|
Ha
->
bpp_config_Ha_F
~
profile_f
;
]
lib/pipeline.ml
View file @
961383a1
...
...
@@ -53,29 +53,31 @@ let repo_of_dataset_l ~preview dataset_l =
)
|>
List
.
concat
let
derive_from_model
~
model
~
tree
~
tree_dataset
~
tree_prefix
~
preview
=
let
derive_from_model
~
model
~
tree
~
tree_dataset
~
tree_prefix
~
profile_f
~
preview
=
let
model_prefix
=
Convergence_hypothesis
.
string_of_model
model
in
let
nb_sites
=
if
preview
then
10
else
100
in
let
nodes
=
Tree_dataset
.
nodes
tree_dataset
model
in
let
config
=
Convergence_hypothesis
.
bpp_config
nodes
model
in
let
config
=
Convergence_hypothesis
.
bpp_config
_F
nodes
model
profile_f
in
let
tree
=
Tree_dataset
.
tree
tree_dataset
`Simulation
in
let
fna
=
Bppsuite
.
bppseqgen
~
nb_sites
~
tree
~
config
in
let
descr
=
"."
^
model_prefix
^
"."
^
tree_prefix
in
let
fna
=
Bppsuite
.
bppseqgen
~
descr
~
nb_sites
~
tree
~
config
~
profile_f
in
let
faa
=
Bppsuite
.
fna2faa
~
fna
in
let
ready_dataset
=
{
input_tree
=
tree
;
tree_dataset
;
fna
;
faa
}
in
let
model_prefix
=
Convergence_hypothesis
.
string_of_model
model
in
{
model_prefix
;
tree_prefix
;
ready_dataset
}
let
derive_from_tree
~
tree_dir
~
tree
~
preview
=
let
derive_from_tree
~
tree_dir
~
tree
~
profile_f
~
preview
=
let
tree_prefix
=
Filename
.
chop_extension
tree
in
let
tree
=
input
(
Filename
.
concat
tree_dir
tree
)
in
let
tree_dataset
=
Tree_dataset
.
prepare
tree
in
let
models
=
Convergence_hypothesis
.[
H0
;
Ha
]
in
List
.
map
models
~
f
:
(
fun
model
->
derive_from_model
~
model
~
tree
~
tree_dataset
~
tree_prefix
~
preview
derive_from_model
~
model
~
tree
~
tree_dataset
~
tree_prefix
~
profile_f
~
preview
)
let
derive_sim
~
tree_dir
~
trees
~
preview
=
let
derive_sim
~
tree_dir
~
trees
~
profile_fn
~
preview
=
let
profile_f
=
input
profile_fn
in
List
.
map
trees
~
f
:
(
fun
tree
->
derive_from_tree
~
tree_dir
~
tree
~
preview
)
derive_from_tree
~
tree_dir
~
tree
~
profile_f
~
preview
)
|>
List
.
concat
...
...
@@ -104,8 +106,8 @@ let derive_from_det_meth ~det_meth ~dataset ~preview =
let
w_every
=
if
preview
then
1
else
10
in
let
n_cycles
=
if
preview
then
100
else
1000
in
let
det_result
=
match
det_meth
with
|
Pcoc
->
Convergence_detection
.
pcoc
~
gamma
:
false
~
faa
~
tree
:
pcoc_tree
|
Pcoc_gamma
->
Convergence_detection
.
pcoc
~
gamma
:
true
~
faa
~
tree
:
pcoc_tree
|
Pcoc
->
Convergence_detection
.
pcoc
~
plot_complete
:
true
~
gamma
:
false
~
faa
~
tree
:
pcoc_tree
|
Pcoc_gamma
->
Convergence_detection
.
pcoc
~
plot_complete
:
true
~
gamma
:
true
~
faa
~
tree
:
pcoc_tree
|
Diffsel
->
Convergence_detection
.
diffsel
~
phy_n
~
tree
:
diffsel_tree
~
w_every
~
n_cycles
in
{
det_meth
;
det_result
;
dataset
}
...
...
@@ -119,22 +121,22 @@ let derive_from_dataset ~dataset ~preview=
derive_from_det_meth
~
det_meth
~
dataset
~
preview
)
let
derive_det
~
dataset_l
~
preview
=
let
derive_det
~
dataset_l
~
profile_fn
~
preview
=
List
.
map
dataset_l
~
f
:
(
fun
dataset
->
derive_from_dataset
~
preview
~
dataset
)
|>
List
.
concat
let
main
~
outdir
?
(
np
=
2
)
?
(
mem
=
2
)
~
tree_dir
~
preview
()
=
let
main
~
outdir
?
(
np
=
2
)
?
(
mem
=
2
)
~
tree_dir
~
profile_fn
~
preview
()
=
let
logger
=
Logger
.
tee
[
Console_logger
.
create
()
;
Dot_output
.
create
"dag.dot"
(*dot -Tpdf example/dag.dot -o dag.pdf*)
]
in
let
trees
=
Array
.
to_list
@@
Sys
.
readdir
tree_dir
in
let
dataset_l
=
derive_sim
~
tree_dir
~
trees
~
preview
in
let
det_results_l
=
derive_det
~
dataset_l
~
preview
in
let
dataset_l
=
derive_sim
~
tree_dir
~
trees
~
profile_fn
~
preview
in
let
det_results_l
=
derive_det
~
dataset_l
~
profile_fn
~
preview
in
let
repo
=
[
repo_of_dataset_l
dataset_l
~
preview
;
repo_of_dataset_l
dataset_l
~
preview
;
repo_of_det_results_l
det_results_l
;
]
|>
List
.
concat
...
...
@@ -156,6 +158,8 @@ let command =
flag
"--mem"
(
optional
int
)
~
doc
:
"INT Available memory (in GB)"
and
tree_dir
=
flag
"--tree-dir"
(
required
string
)
~
doc
:
"PATH Path to tree directory"
and
profile_fn
=
flag
"--profile-fn"
(
required
string
)
~
doc
:
"PATH Path to profile file"
in
main
~
outdir
?
np
?
mem
~
tree_dir
~
preview
main
~
outdir
?
np
?
mem
~
tree_dir
~
profile_fn
~
preview
]
lib/scripts/parse_input_tree.py
View file @
961383a1
...
...
@@ -107,9 +107,15 @@ if t:
features
.
remove
(
"ND"
)
if
not
"Condition"
in
features
:
logger
.
error
(
"
\"
Condition
\"
must be in the detected tags.
\"
Condition:1
\"
must identify
branch
es under the convergent model under Ha"
)
logger
.
error
(
"
\"
Condition
\"
must be in the detected tags.
\"
Condition:1
\"
must identify
nod
es under the convergent model under Ha"
)
sys
.
exit
(
1
)
if
not
"Transition"
in
features
:
logger
.
error
(
"
\"
Transition
\"
must be in the detected tags.
\"
Transition:1
\"
must identify nodes where there are transitions"
)
sys
.
exit
(
1
)
if
"Transition"
in
features
:
logger
.
info
(
"
\"
Transition
\"
is in the detected tags.
\"
Transition:1
\"
will be use to indicate OneChange model on this node"
)
# Check existing bl
branch_lengths
=
[]
nodeId
=
0
...
...
@@ -140,18 +146,57 @@ t.write(format=1, features=["ND"], outfile = "%s/tree.only_node_ids.nhx" %(OutDi
#### -> 1 line: all nodes ids
all_node_ids
=
range
(
nodeId
-
1
)
with
open
(
"%s/tree.H0.node_ids"
%
(
OutDirName
),
"w"
)
as
output_H0_node_ids
:
output_H0_node_ids
.
write
(
"nonhomogeneous.number_of_models = 1"
)
output_H0_node_ids
.
write
(
"
\n
"
)
output_H0_node_ids
.
write
(
"model1.nodes_id="
+
","
.
join
(
map
(
str
,
all_node_ids
)))
### tree.Ha.node_ids: alternative hypothesis
#### -> 2 lines: 1) node ids under the ancestral model
#### 2) node ids under the convergent model
conv_node_ids
=
[
n
.
ND
for
n
in
t
.
search_nodes
(
Condition
=
"1"
)]
trans_node_ids
=
[
n
.
ND
for
n
in
t
.
search_nodes
(
Transition
=
"1"
)]
conv_node_ids
=
[
i
for
i
in
conv_node_ids
if
i
not
in
trans_node_ids
]
not_conv_node_ids
=
[
i
for
i
in
all_node_ids
if
i
not
in
conv_node_ids
+
trans_node_ids
]
logger
.
info
(
conv_node_ids
)
logger
.
info
(
trans_node_ids
)
logger
.
info
(
not_conv_node_ids
)
if
trans_node_ids
and
conv_node_ids
:
nonhomogeneous_number_of_models
=
"nonhomogeneous.number_of_models = 3"
models
=
"
\n
"
.
join
([
"modelT = model2"
,
"modelC = model3"
])
node_ids
=
"
\n
"
.
join
([
"model1.nodes_id="
+
","
.
join
(
map
(
str
,
not_conv_node_ids
)),
"model2.nodes_id="
+
","
.
join
(
map
(
str
,
trans_node_ids
)),
"model3.nodes_id="
+
","
.
join
(
map
(
str
,
conv_node_ids
))
])
elif
trans_node_ids
and
not
conv_node_ids
:
nonhomogeneous_number_of_models
=
"nonhomogeneous.number_of_models = 2"
models
=
"
\n
"
.
join
([
"modelT = model2"
,
"modelC = model3"
])
node_ids
=
"
\n
"
.
join
([
"model1.nodes_id="
+
","
.
join
(
map
(
str
,
not_conv_node_ids
)),
"model2.nodes_id="
+
","
.
join
(
map
(
str
,
trans_node_ids
))
])
elif
not
trans_node_ids
and
conv_node_ids
:
nonhomogeneous_number_of_models
=
"nonhomogeneous.number_of_models = 2"
models
=
"
\n
"
.
join
([
"modelT = model3"
,
"modelC = model2"
])
node_ids
=
"
\n
"
.
join
([
"model1.nodes_id="
+
","
.
join
(
map
(
str
,
not_conv_node_ids
)),
"model2.nodes_id="
+
","
.
join
(
map
(
str
,
conv_node_ids
))
])
else
:
logger
.
error
(
"No convergent nodes"
)
sys
.
exit
(
2
)
with
open
(
"%s/tree.Ha.node_ids"
%
(
OutDirName
),
"w"
)
as
output_Ha_node_ids
:
conv_node_ids
=
[
n
.
ND
for
n
in
t
.
search_nodes
(
Condition
=
"1"
)]
not_conv_node_ids
=
[
i
for
i
in
all_node_ids
if
i
not
in
conv_node_ids
]
output_Ha_node_ids
.
write
(
"model2.nodes_id="
+
","
.
join
(
map
(
str
,
conv_node_ids
)))
output_Ha_node_ids
.
write
(
nonhomogeneous_number_of_models
)
output_Ha_node_ids
.
write
(
"
\n
"
)
output_Ha_node_ids
.
write
(
models
)
output_Ha_node_ids
.
write
(
"
\n
"
)
output_Ha_node_ids
.
write
(
"model1.nodes_id="
+
","
.
join
(
map
(
str
,
not_conv_
node_ids
)
))
output_Ha_node_ids
.
write
(
node_ids
)
######### output trees for detection
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment