Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
VEBER Philippe
codepi
Commits
8f3a909f
Commit
8f3a909f
authored
Jul 23, 2018
by
Carine Rey
Browse files
add msd_meth (not working well yet due to output msd format but compile)
parent
6ae4542d
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
342 additions
and
10 deletions
+342
-10
etc/docker/msd/Dockerfile
etc/docker/msd/Dockerfile
+5
-5
lib/convergence_detection.ml
lib/convergence_detection.ml
+6
-0
lib/convergence_detection.mli
lib/convergence_detection.mli
+1
-0
lib/msd.ml
lib/msd.ml
+39
-0
lib/pipeline.ml
lib/pipeline.ml
+8
-4
lib/scripts/merge_det_results.py
lib/scripts/merge_det_results.py
+9
-1
lib/scripts/parse_input_msd.py
lib/scripts/parse_input_msd.py
+148
-0
lib/scripts/parse_output_msd.py
lib/scripts/parse_output_msd.py
+126
-0
No files found.
etc/docker/msd/Dockerfile
View file @
8f3a909f
FROM
debian:stretch
# base
FROM
carinerey/python_basics:07182018
MAINTAINER
Carine Rey carine.rey@ens-lyon.org
# add msd to python_basics
RUN
apt-get update
&&
\
apt-get
install
--no-install-recommends
-qy
\
git
\
make
\
gcc
\
libgsl-dev
\
...
...
@@ -11,9 +13,7 @@ RUN apt-get update && \
build-essential
\
libblas-dev libatlas-base-dev
COPY
Convergence-modif/ /msd
WORKDIR
/msd/src
RUN
make all
RUN
cp
msd /usr/local/bin
lib/convergence_detection.ml
View file @
8f3a909f
...
...
@@ -16,6 +16,7 @@ type result = [
|
`Topological_WAG
of
[
`topological
]
directory
workflow
|
`Tdg09
of
[
`tdg09
]
directory
workflow
|
`Multinomial
of
[
`multinomial
]
directory
workflow
|
`Msd
of
[
`msd
]
directory
workflow
]
let
meth_string_of_result
=
function
...
...
@@ -29,6 +30,7 @@ let meth_string_of_result = function
|
`Topological_WAG
_
->
"topological_WAG"
|
`Tdg09
_
->
"tdg09"
|
`Multinomial
_
->
"multinomial"
|
`Msd
_
->
"msd"
type
dataset_res
=
{
model_prefix
:
string
;
...
...
@@ -53,6 +55,7 @@ let merge_results ~res_by_tools : text_file workflow =
|
`Topological_WAG
d
->
Topological
.
results
d
|
`Tdg09
d
->
Tamuri
.
results
d
|
`Multinomial
d
->
Multinomial
.
results
d
|
`Msd
d
->
Msd
.
results
d
in
let
opt
=
match
res
with
|
`Pcoc
_
->
string
"--pcoc"
...
...
@@ -65,6 +68,7 @@ let merge_results ~res_by_tools : text_file workflow =
|
`Topological_WAG
_
->
string
"--topological_WAG"
|
`Tdg09
_
->
string
"--tdg09"
|
`Multinomial
_
->
string
"--multinomial"
|
`Msd
_
->
string
"--msd"
in
seq
~
sep
:
" "
[
opt
;
dep
w
]
)
...
...
@@ -92,6 +96,7 @@ let plot_merge_results ?t_choices ~plot_all_sites ~(res_by_tools:result list) ~t
|
`Topological_WAG
_
->
"Topological_WAG01"
|
`Tdg09
_
->
"Tdg09_1MinusFDR,Tdg09_prob_post"
|
`Multinomial
_
->
"Mutinomial_1MinusLRT"
|
`Msd
_
->
"Msd_1MinusP"
in
string
opt
)
|>
seq
~
sep
:
","
...
...
@@ -108,6 +113,7 @@ let plot_merge_results ?t_choices ~plot_all_sites ~(res_by_tools:result list) ~t
|
`Topological_WAG
_
->
"Topological_WAG01:0.9"
|
`Tdg09
_
->
"Tdg09_1MinusFDR:0.99,Tdg09_prob_post:0.99"
|
`Multinomial
_
->
"Mutinomial_1MinusLRT:0.95"
|
`Msd
_
->
"Msd_1MinusP:0.95"
in
string
opt
)
|>
seq
~
sep
:
","
...
...
lib/convergence_detection.mli
View file @
8f3a909f
...
...
@@ -16,6 +16,7 @@ type result = [
|
`Topological_WAG
of
[
`topological
]
directory
workflow
|
`Tdg09
of
[
`tdg09
]
directory
workflow
|
`Multinomial
of
[
`multinomial
]
directory
workflow
|
`Msd
of
[
`msd
]
directory
workflow
]
val
meth_string_of_result
:
result
->
string
...
...
lib/msd.ml
0 → 100644
View file @
8f3a909f
open
Core
open
Bistro
.
Std
open
Bistro
.
EDSL
open
Bistro_bioinfo
.
Std
open
File_formats
let
env
=
docker_image
~
account
:
"carinerey"
~
name
:
"msd"
~
tag
:
"07232018"
()
let
msd
~
(
faa
:
aminoacid_fasta
workflow
)
~
(
tree_sc
:_
workflow
)
:
[
`msd
]
directory
workflow
=
let
map_table
=
tmp
//
"map.tsv"
in
let
tree_nw
=
tmp
//
"tree.nw"
in
let
out
=
dest
//
"out.tsv"
in
workflow
~
descr
:
"convergence_detection.run_msd"
[
mkdir_p
dest
;
(*./msd -t 1 -o <nom fichier de sortie> -e 0.05 <phylogénie Newick> <table caractère convergent> <fichier de simulation fasta> *)
cmd
"python"
~
env
[
file_dump
(
string
Scripts
.
parse_input_msd
)
;
opt
"-i"
dep
tree_sc
;
opt
"-o"
ident
tree_nw
;
opt
"-m"
ident
map_table
;
];
cmd
"msd"
~
env
[
opt
"-t"
int
1
;
opt
"-o"
ident
out
;
opt
"-e"
float
1
.
;
dep
tree_sc
;
ident
map_table
;
dep
faa
;
];
]
let
results
run_msd
:
text_file
workflow
=
workflow
~
descr
:
"convergence_detection.parse_msd"
[
cmd
"python"
~
env
[
file_dump
(
string
Scripts
.
parse_output_msd
)
;
opt
"-i"
dep
(
run_msd
/
selector
[
"out.tsv"
]);
opt
"-o"
ident
dest
;
];
]
lib/pipeline.ml
View file @
8f3a909f
...
...
@@ -151,6 +151,7 @@ let repo_of_detection_result res =
|
`Topological_WAG
w
->
item
[
"Topological_WAG.results.tsv"
]
(
Topological
.
results
w
)
|
`Tdg09
w
->
item
[
"Tdg09.results.tsv"
]
(
Tamuri
.
results
w
)
|
`Multinomial
w
->
item
[
"Multinomial.results.tsv"
]
(
Multinomial
.
results
w
)
|
`Msd
w
->
item
[
"Msd.results.tsv"
]
(
Msd
.
results
w
)
]
;
[
match
res
with
...
...
@@ -164,10 +165,11 @@ let repo_of_detection_result res =
|
`Topological_WAG
w
->
item
[
"raw_results"
]
w
|
`Tdg09
w
->
item
[
"raw_results"
]
w
|
`Multinomial
w
->
item
[
"raw_results"
]
w
|
`Msd
w
->
item
[
"raw_results"
]
w
]
;
match
res
with
|
`Diffsel
w
->
[
item
[
"chain_convergence_checking.html"
]
((
Diffsel
.
check_conv
w
)
/
selector
[
"out.html"
])]
|
_
->
[]
match
res
with
|
`Diffsel
w
->
[
item
[
"chain_convergence_checking.html"
]
((
Diffsel
.
check_conv
w
)
/
selector
[
"out.html"
])]
|
_
->
[]
]
|>
List
.
concat
|>
Repo
.
shift
det_meth_prefix
|>
Repo
.
shift
"Detection_tools"
...
...
@@ -213,6 +215,7 @@ let derive_from_det_meth ~det_meth ~(dataset : Dataset.t) ~preview =
|
`Topological_LG
->
`Topological_LG
(
Topological
.
topological
~
faa
~
tree
:
tree_id
~
tree_conv
~
prot_model
:
"LG08"
)
|
`Topological_WAG
->
`Topological_WAG
(
Topological
.
topological
~
faa
~
tree
:
tree_id
~
tree_conv
~
prot_model
:
"WAG01"
)
|
`Multinomial
->
`Multinomial
(
Multinomial
.
multinomial
~
faa
~
tree_id
~
tree_sc
)
|
`Msd
->
`Msd
(
Msd
.
msd
~
faa
~
tree_sc
)
let
derive_from_dataset
~
dataset
~
preview
~
fast_mode
=
...
...
@@ -221,7 +224,8 @@ let derive_from_dataset ~dataset ~preview ~fast_mode=
`Tdg09
;
`Identical_LG
;
`Topological_LG
;
`Multinomial
]
;
`Multinomial
;
`Msd
]
;
if
preview
then
[]
else
...
...
lib/scripts/merge_det_results.py
View file @
8f3a909f
...
...
@@ -72,6 +72,8 @@ availableOptions.add_argument('--tdg09', type=str,
help
=
"tdg09 output name"
,
default
=
None
)
availableOptions
.
add_argument
(
'--multinomial'
,
type
=
str
,
help
=
"multinomial output name"
,
default
=
None
)
availableOptions
.
add_argument
(
'--msd'
,
type
=
str
,
help
=
"msd output name"
,
default
=
None
)
##############
...
...
@@ -90,6 +92,7 @@ df_topological_LG = pd.DataFrame()
df_topological_WAG
=
pd
.
DataFrame
()
df_tdg09
=
pd
.
DataFrame
()
df_multinomial
=
pd
.
DataFrame
()
df_msd
=
pd
.
DataFrame
()
OutName
=
args
.
output
if
args
.
pcoc
:
...
...
@@ -137,6 +140,10 @@ if args.topological_WAG :
if
args
.
tdg09
:
df_tdg09
=
pd
.
read_csv
(
args
.
tdg09
,
sep
=
"
\t
"
)
if
args
.
msd
:
df_msd
=
pd
.
read_csv
(
args
.
msd
,
sep
=
"
\t
"
)
df_msd
.
rename
(
columns
=
{
'1MinusP'
:
'Msd_1MinusP'
},
inplace
=
True
)
if
args
.
multinomial
:
df_multinomial
=
pd
.
read_csv
(
args
.
multinomial
,
sep
=
"
\t
"
)
df_multinomial
=
df_multinomial
[[
'Sites'
,
'1MinusLRT'
]]
...
...
@@ -148,7 +155,8 @@ df_list = [df for df in [df_pcoc, df_pcoc_gamma, df_pcoc_C60,
df_identical_LG
,
df_identical_WAG
,
df_topological_LG
,
df_topological_WAG
,
df_tdg09
,
df_multinomial
df_multinomial
,
df_msd
]
if
not
df
.
empty
]
df_list_len
=
[
df
.
shape
[
0
]
for
df
in
df_list
]
...
...
lib/scripts/parse_input_msd.py
0 → 100644
View file @
8f3a909f
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import
argparse
import
sys
import
os
import
logging
from
ete3
import
Tree
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser
=
argparse
.
ArgumentParser
(
prog
=
"parse_input_msd.py"
,
description
=
''
)
parser
.
add_argument
(
'--version'
,
action
=
'version'
,
version
=
'%(prog)s 1.0'
)
parser
.
add_argument
(
'--debug'
,
action
=
"store_true"
,
help
=
"debug mode"
,
default
=
False
)
##############
requiredOptions
=
parser
.
add_argument_group
(
'REQUIRED OPTIONS'
)
requiredOptions
.
add_argument
(
'-i'
,
"--input_tree"
,
type
=
argparse
.
FileType
(
'r'
),
help
=
'Input tree filename'
,
required
=
True
)
requiredOptions
.
add_argument
(
'-m'
,
'--output_map'
,
type
=
str
,
help
=
"Output map filename"
,
required
=
True
)
requiredOptions
.
add_argument
(
'-o'
,
'--output_tree'
,
type
=
str
,
help
=
"Output tree name"
,
required
=
True
)
##############
### Option parsing
args
=
parser
.
parse_args
()
InputTreeFile
=
args
.
input_tree
OutputTreeFile
=
args
.
output_tree
OutputMapFile
=
args
.
output_map
#===================================================================================================
# Set up output directory and logger
#===================================================================================================
### Set up the logger
# create logger
logger
=
logging
.
getLogger
(
"parse_input_tree"
)
logger
.
setLevel
(
logging
.
DEBUG
)
ch
=
logging
.
StreamHandler
()
if
args
.
debug
:
ch
.
setLevel
(
logging
.
DEBUG
)
else
:
ch
.
setLevel
(
logging
.
INFO
)
# create formatter and add it to the handlers
formatter_ch
=
logging
.
Formatter
(
'%(asctime)s - %(levelname)s - %(message)s'
)
ch
.
setFormatter
(
formatter_ch
)
logger
.
addHandler
(
ch
)
logger
.
debug
(
sys
.
argv
)
#===================================================================================================
# Read input tree
#===================================================================================================
try
:
t
=
Tree
(
InputTreeFile
.
name
)
except
Exception
as
exc
:
logger
.
error
(
str
(
exc
))
sys
.
exit
(
1
)
if
t
:
features
=
[]
for
n
in
t
.
traverse
(
"postorder"
):
# get all features:
features
.
extend
(
list
(
set
(
dir
(
n
))
-
set
(
dir
(
Tree
()))))
features
=
list
(
set
(
features
))
# list(set(*)) = remove duplicates
logger
.
info
(
"No detected tag"
if
not
features
else
"Detected tags: "
+
", "
.
join
([
f
for
f
in
features
]))
if
"ND"
in
features
:
logger
.
warning
(
"
\"
ND
\"
is in the detected tags but it will be removed by the programm"
)
features
.
remove
(
"ND"
)
if
not
"Condition"
in
features
:
logger
.
error
(
"
\"
Condition
\"
must be in the detected tags.
\"
Condition:1
\"
must identify nodes under the convergent model under Ha"
)
sys
.
exit
(
1
)
if
not
"Transition"
in
features
:
logger
.
error
(
"
\"
Transition
\"
must be in the detected tags.
\"
Transition:1
\"
must identify nodes where there are transitions"
)
sys
.
exit
(
1
)
if
"Transition"
in
features
:
logger
.
info
(
"
\"
Transition
\"
is in the detected tags.
\"
Transition:1
\"
will be use to indicate OneChange model on this node"
)
# Check existing bl
branch_lengths
=
[]
nodeId
=
0
for
node
in
t
.
traverse
(
"postorder"
):
node
.
add_features
(
ND
=
nodeId
)
branch_lengths
.
append
(
node
.
dist
)
nodeId
=
nodeId
+
1
logger
.
debug
(
"Branch length %s"
,
branch_lengths
)
if
set
(
branch_lengths
)
==
set
([
1.0
]):
logger
.
warning
(
"No branch length in %s, all branch length will be put to 1"
,
tree_filename
)
if
not
(
t
and
branch_lengths
):
sys
.
exit
(
1
)
logger
.
info
(
"Tree (%s) ok after checking"
,
InputTreeFile
.
name
)
#===================================================================================================
# Create output files
#===================================================================================================
######### output trees for msd
### tree.nw
t
.
write
(
format
=
1
,
outfile
=
OutputTreeFile
)
### map table
conv_leaves
=
[
n
.
name
for
n
in
t
.
search_nodes
(
Condition
=
"1"
)
if
n
.
is_leaf
()]
nonconv_leaves
=
[
n
.
name
for
n
in
t
.
search_nodes
(
Condition
=
"0"
)
if
n
.
is_leaf
()]
with
open
(
OutputMapFile
,
"w"
)
as
MAP
:
MAP
.
write
(
"
\n
"
.
join
([
"%s
\t
%s"
%
(
n
,
1
)
for
n
in
conv_leaves
]))
MAP
.
write
(
"
\n
"
)
MAP
.
write
(
"
\n
"
.
join
([
"%s
\t
%s"
%
(
n
,
0
)
for
n
in
nonconv_leaves
]))
MAP
.
write
(
"
\n
"
)
lib/scripts/parse_output_msd.py
0 → 100644
View file @
8f3a909f
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import
argparse
import
sys
import
os
import
logging
import
pandas
as
pd
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser
=
argparse
.
ArgumentParser
(
prog
=
"parse_output_msd.py"
,
description
=
''
)
parser
.
add_argument
(
'--version'
,
action
=
'version'
,
version
=
'%(prog)s 1.0'
)
parser
.
add_argument
(
'--debug'
,
action
=
"store_true"
,
help
=
"debug mode"
,
default
=
False
)
##############
requiredOptions
=
parser
.
add_argument_group
(
'REQUIRED OPTIONS'
)
requiredOptions
.
add_argument
(
'-i'
,
"--input"
,
type
=
argparse
.
FileType
(
'r'
),
help
=
'Input filename'
,
required
=
True
)
requiredOptions
.
add_argument
(
'-o'
,
'--output'
,
type
=
str
,
help
=
"Output filename"
,
required
=
True
)
##############
### Option parsing
args
=
parser
.
parse_args
()
InputFile
=
args
.
input
OutputFileName
=
args
.
output
#===================================================================================================
# Set up output directory and logger
#===================================================================================================
### Set up the logger
# create logger
logger
=
logging
.
getLogger
(
"parse_input_tree"
)
logger
.
setLevel
(
logging
.
DEBUG
)
ch
=
logging
.
StreamHandler
()
if
args
.
debug
:
ch
.
setLevel
(
logging
.
DEBUG
)
else
:
ch
.
setLevel
(
logging
.
INFO
)
# create formatter and add it to the handlers
formatter_ch
=
logging
.
Formatter
(
'%(asctime)s - %(levelname)s - %(message)s'
)
ch
.
setFormatter
(
formatter_ch
)
logger
.
addHandler
(
ch
)
logger
.
debug
(
sys
.
argv
)
#===================================================================================================
# Read input
#===================================================================================================
lines
=
InputFile
.
readlines
()
InputFile
.
close
()
res_line
=
lines
[
1
].
strip
()
# colnames :
#name status length unknown percent pvalue brute pvalue corrected number of convergent sites convergent sites
res_line_split
=
res_line
.
split
(
"
\t
"
)
n_sites
=
int
(
res_line_split
[
2
])
prob_post
=
[
"NA"
]
*
(
n_sites
)
try
:
p_value
=
1
-
float
(
res_line_split
[
5
])
except
:
p_value
=
1
try
:
convergent_sites
=
res_line_split
[
7
].
split
(
","
)
except
:
# if no convergent sites, convergent sites field don't exist
convergent_sites
=
[]
for
cs
in
convergent_sites
:
prob_post
[
int
(
cs
)
-
1
]
=
p_value
Sites
=
[
i
+
1
for
i
in
range
(
n_sites
)]
df_final
=
pd
.
DataFrame
({
'Sites'
:
Sites
,
'1MinusP'
:
prob_post
})
df_final
=
df_final
[[
"Sites"
,
"1MinusP"
]]
#===================================================================================================
# Create output files
#===================================================================================================
if
df_final
.
shape
[
0
]
!=
n_sites
:
print
(
"Format error!"
)
sys
.
exit
(
42
)
df_final
.
to_csv
(
OutputFileName
,
sep
=
"
\t
"
,
index
=
False
)
print
(
"Write summary output in %s"
%
OutputFileName
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment