Commit 8f3a909f authored by Carine Rey

add msd_meth (not working well yet because of the msd output format, but it compiles)

parent 6ae4542d
# base
FROM carinerey/python_basics:07182018
MAINTAINER Carine Rey carine.rey@ens-lyon.org
# add msd to python_basics
RUN apt-get update && \
apt-get install --no-install-recommends -qy \
git \
make \
gcc \
libgsl-dev \
@@ -11,9 +13,7 @@ RUN apt-get update && \
build-essential \
libblas-dev libatlas-base-dev
COPY Convergence-modif/ /msd
WORKDIR /msd/src
RUN make all
RUN cp msd /usr/local/bin
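# A minimal sketch for checking the resulting image locally (these commands are an assumption,
# not part of the pipeline; the tag matches the one referenced by the OCaml rule):
#   docker build -t carinerey/msd:07232018 .
#   docker run --rm carinerey/msd:07232018 which msd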
@@ -16,6 +16,7 @@ type result = [
| `Topological_WAG of [`topological] directory workflow
| `Tdg09 of [`tdg09] directory workflow
| `Multinomial of [`multinomial] directory workflow
| `Msd of [`msd] directory workflow
]
let meth_string_of_result = function
@@ -29,6 +30,7 @@ let meth_string_of_result = function
| `Topological_WAG _ -> "topological_WAG"
| `Tdg09 _ -> "tdg09"
| `Multinomial _ -> "multinomial"
| `Msd _ -> "msd"
type dataset_res = {
model_prefix : string ;
@@ -53,6 +55,7 @@ let merge_results ~res_by_tools : text_file workflow =
| `Topological_WAG d -> Topological.results d
| `Tdg09 d -> Tamuri.results d
| `Multinomial d -> Multinomial.results d
| `Msd d -> Msd.results d
in
let opt = match res with
| `Pcoc _ -> string "--pcoc"
@@ -65,6 +68,7 @@ let merge_results ~res_by_tools : text_file workflow =
| `Topological_WAG _ -> string "--topological_WAG"
| `Tdg09 _ -> string "--tdg09"
| `Multinomial _ -> string "--multinomial"
| `Msd _ -> string "--msd"
in
seq ~sep:" " [opt; dep w]
)
@@ -92,6 +96,7 @@ let plot_merge_results ?t_choices ~plot_all_sites ~(res_by_tools:result list) ~t
| `Topological_WAG _ -> "Topological_WAG01"
| `Tdg09 _ -> "Tdg09_1MinusFDR,Tdg09_prob_post"
| `Multinomial _ -> "Mutinomial_1MinusLRT"
| `Msd _ -> "Msd_1MinusP"
in
string opt
) |> seq ~sep:","
@@ -108,6 +113,7 @@ let plot_merge_results ?t_choices ~plot_all_sites ~(res_by_tools:result list) ~t
| `Topological_WAG _ -> "Topological_WAG01:0.9"
| `Tdg09 _ -> "Tdg09_1MinusFDR:0.99,Tdg09_prob_post:0.99"
| `Multinomial _ -> "Mutinomial_1MinusLRT:0.95"
| `Msd _ -> "Msd_1MinusP:0.95"
in
string opt
) |> seq ~sep:","
@@ -16,6 +16,7 @@ type result = [
| `Topological_WAG of [`topological] directory workflow
| `Tdg09 of [`tdg09] directory workflow
| `Multinomial of [`multinomial] directory workflow
| `Msd of [`msd] directory workflow
]
val meth_string_of_result : result -> string
open Core
open Bistro.Std
open Bistro.EDSL
open Bistro_bioinfo.Std
open File_formats
let env = docker_image ~account:"carinerey" ~name:"msd" ~tag:"07232018" ()
let msd ~(faa:aminoacid_fasta workflow) ~(tree_sc:_ workflow) : [`msd] directory workflow =
let map_table = tmp // "map.tsv" in
let tree_nw = tmp // "tree.nw" in
let out = dest // "out.tsv" in
workflow ~descr:"convergence_detection.run_msd" [
mkdir_p dest;
(* ./msd -t 1 -o <output file name> -e 0.05 <Newick phylogeny> <convergent character table> <fasta simulation file> *)
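(* Step 1: parse_input_msd writes a plain Newick copy of tree_sc (without the NHX tags) to tree_nw and a tab-separated leaf-to-condition map to map_table. *)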
cmd "python" ~env [
file_dump (string Scripts.parse_input_msd) ;
opt "-i" dep tree_sc;
opt "-o" ident tree_nw;
opt "-m" ident map_table;
];
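(* Step 2: run msd itself; its results are written to out.tsv under dest and parsed later by parse_output_msd. *)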
cmd "msd" ~env [
opt "-t" int 1;
opt "-o" ident out ;
opt "-e" float 1. ;
ident tree_nw;
ident map_table;
dep faa;
];
]
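(* Convert msd's raw out.tsv into the two-column Sites / 1MinusP table consumed by the merge script. *)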
let results run_msd : text_file workflow =
workflow ~descr:"convergence_detection.parse_msd" [
cmd "python" ~env [
file_dump (string Scripts.parse_output_msd) ;
opt "-i" dep (run_msd / selector ["out.tsv"]);
opt "-o" ident dest;
];
]
@@ -151,6 +151,7 @@ let repo_of_detection_result res =
| `Topological_WAG w -> item ["Topological_WAG.results.tsv"] (Topological.results w)
| `Tdg09 w -> item ["Tdg09.results.tsv"] (Tamuri.results w)
| `Multinomial w -> item ["Multinomial.results.tsv"] (Multinomial.results w)
| `Msd w -> item ["Msd.results.tsv"] (Msd.results w)
] ;
[
match res with
@@ -164,10 +165,11 @@ let repo_of_detection_result res =
| `Topological_WAG w -> item ["raw_results"] w
| `Tdg09 w -> item ["raw_results"] w
| `Multinomial w -> item ["raw_results"] w
| `Msd w -> item ["raw_results"] w
] ;
match res with
| `Diffsel w -> [item ["chain_convergence_checking.html"] ((Diffsel.check_conv w) / selector ["out.html"])]
| _ -> []
] |> List.concat
|> Repo.shift det_meth_prefix
|> Repo.shift "Detection_tools"
@@ -213,6 +215,7 @@ let derive_from_det_meth ~det_meth ~(dataset : Dataset.t) ~preview =
| `Topological_LG -> `Topological_LG (Topological.topological ~faa ~tree:tree_id ~tree_conv ~prot_model:"LG08")
| `Topological_WAG -> `Topological_WAG (Topological.topological ~faa ~tree:tree_id ~tree_conv ~prot_model:"WAG01")
| `Multinomial -> `Multinomial (Multinomial.multinomial ~faa ~tree_id ~tree_sc)
| `Msd -> `Msd (Msd.msd ~faa ~tree_sc)
let derive_from_dataset ~dataset ~preview ~fast_mode=
@@ -221,7 +224,8 @@ let derive_from_dataset ~dataset ~preview ~fast_mode=
`Tdg09;
`Identical_LG;
`Topological_LG;
`Multinomial] ;
`Multinomial;
`Msd] ;
if preview then
[]
else
@@ -72,6 +72,8 @@ availableOptions.add_argument('--tdg09', type=str,
help="tdg09 output name", default = None)
availableOptions.add_argument('--multinomial', type=str,
help="multinomial output name", default = None)
availableOptions.add_argument('--msd', type=str,
help="msd output name", default = None)
##############
@@ -90,6 +92,7 @@ df_topological_LG = pd.DataFrame()
df_topological_WAG = pd.DataFrame()
df_tdg09 = pd.DataFrame()
df_multinomial = pd.DataFrame()
df_msd = pd.DataFrame()
OutName = args.output
if args.pcoc :
@@ -137,6 +140,10 @@ if args.topological_WAG :
if args.tdg09 :
df_tdg09 = pd.read_csv(args.tdg09, sep="\t")
if args.msd :
df_msd = pd.read_csv(args.msd, sep="\t")
df_msd.rename(columns={'1MinusP': 'Msd_1MinusP'}, inplace=True)
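# parse_output_msd already emits only the Sites and 1MinusP columns, so no further column selection is needed here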
if args.multinomial :
df_multinomial = pd.read_csv(args.multinomial, sep="\t")
df_multinomial = df_multinomial[['Sites','1MinusLRT']]
@@ -148,7 +155,8 @@ df_list = [df for df in [df_pcoc, df_pcoc_gamma, df_pcoc_C60,
df_identical_LG, df_identical_WAG,
df_topological_LG, df_topological_WAG,
df_tdg09,
df_multinomial
df_multinomial,
df_msd
] if not df.empty ]
df_list_len = [df.shape[0] for df in df_list]
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import argparse
import sys
import os
import logging
from ete3 import Tree
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser = argparse.ArgumentParser(prog="parse_input_msd.py",
description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument('-i', "--input_tree", type=argparse.FileType('r'),
help='Input tree filename', required=True)
requiredOptions.add_argument('-m', '--output_map', type=str,
help="Output map filename", required=True)
requiredOptions.add_argument('-o', '--output_tree', type=str,
help="Output tree name", required=True)
##############
### Option parsing
args = parser.parse_args()
InputTreeFile = args.input_tree
OutputTreeFile = args.output_tree
OutputMapFile = args.output_map
#===================================================================================================
# Set up output directory and logger
#===================================================================================================
### Set up the logger
# create logger
logger = logging.getLogger("parse_input_tree")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
if args.debug:
ch.setLevel(logging.DEBUG)
else:
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter_ch = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter_ch)
logger.addHandler(ch)
logger.debug(sys.argv)
#===================================================================================================
# Read input tree
#===================================================================================================
try:
t=Tree(InputTreeFile.name)
except Exception as exc:
logger.error(str(exc))
sys.exit(1)
if t:
features = []
for n in t.traverse("postorder"): # get all features:
features.extend(list(set(dir(n)) - set(dir(Tree()))))
features = list(set(features)) # list(set(*)) = remove duplicates
logger.info("No detected tags" if not features else "Detected tags: " + ", ".join(features))
if "ND" in features:
logger.warning("\"ND\" is in the detected tags but it will be removed by the program")
features.remove("ND")
if not "Condition" in features:
logger.error("\"Condition\" must be in the detected tags. \"Condition:1\" must identify nodes under the convergent model under Ha")
sys.exit(1)
if not "Transition" in features:
logger.error("\"Transition\" must be in the detected tags. \"Transition:1\" must identify nodes where there are transitions")
sys.exit(1)
if "Transition" in features:
logger.info("\"Transition\" is in the detected tags. \"Transition:1\" will be used to indicate the OneChange model on this node")
# Check existing bl
branch_lengths = []
nodeId = 0
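# number every node with an ND feature and record each branch length, so we can check below that the tree actually carries branch lengths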
for node in t.traverse("postorder"):
node.add_features(ND=nodeId)
branch_lengths.append(node.dist)
nodeId = nodeId + 1
logger.debug("Branch length %s", branch_lengths)
if set(branch_lengths) == set([1.0]):
logger.warning("No branch lengths in %s, all branch lengths will be set to 1", InputTreeFile.name)
if not (t and branch_lengths):
sys.exit(1)
logger.info("Tree (%s) ok after checking", InputTreeFile.name)
#===================================================================================================
# Create output files
#===================================================================================================
######### output trees for msd
### tree.nw
t.write(format=1, outfile = OutputTreeFile)
### map table
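# The map table written below is a two-column, tab-separated file: leaf name <TAB> condition
# (1 for leaves under the convergent condition, 0 for the others). Hypothetical example:
#   sp1    1
#   sp2    1
#   sp3    0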
conv_leaves = [n.name for n in t.search_nodes(Condition = "1") if n.is_leaf()]
nonconv_leaves = [n.name for n in t.search_nodes(Condition = "0") if n.is_leaf()]
with open(OutputMapFile, "w") as MAP:
MAP.write("\n".join(["%s\t%s" %(n, 1) for n in conv_leaves]))
MAP.write("\n")
MAP.write("\n".join(["%s\t%s" %(n, 0) for n in nonconv_leaves]))
MAP.write("\n")
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import argparse
import sys
import os
import logging
import pandas as pd
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser = argparse.ArgumentParser(prog="parse_output_msd.py",
description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument('-i', "--input", type=argparse.FileType('r'),
help='Input filename', required=True)
requiredOptions.add_argument('-o', '--output', type=str,
help="Output filename", required=True)
##############
### Option parsing
args = parser.parse_args()
InputFile = args.input
OutputFileName = args.output
#===================================================================================================
# Set up output directory and logger
#===================================================================================================
### Set up the logger
# create logger
logger = logging.getLogger("parse_output_msd")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
if args.debug:
ch.setLevel(logging.DEBUG)
else:
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter_ch = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter_ch)
logger.addHandler(ch)
logger.debug(sys.argv)
#===================================================================================================
# Read input
#===================================================================================================
lines = InputFile.readlines()
InputFile.close()
res_line = lines[1].strip()
# colnames :
# name, status, length, unknown percent, raw p-value, corrected p-value, number of convergent sites, convergent sites
res_line_split = res_line.split("\t")
n_sites=int(res_line_split[2])
prob_post=["NA"] * (n_sites)
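# msd reports a single p-value for the whole run; below, 1 - p is assigned to every site it lists as convergent and "NA" is kept for all other sites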
try:
p_value = 1 - float(res_line_split[5])
except (IndexError, ValueError):
p_value = 1
try:
convergent_sites = res_line_split[7].split(",")
except IndexError: # if there are no convergent sites, the convergent sites field does not exist
convergent_sites = []
for cs in convergent_sites:
prob_post[int(cs) -1] = p_value
Sites = [i +1 for i in range(n_sites)]
df_final = pd.DataFrame({'Sites': Sites, '1MinusP' : prob_post})
df_final = df_final[["Sites","1MinusP"]]
#===================================================================================================
# Create output files
#===================================================================================================
if df_final.shape[0] != n_sites:
print("Format error!")
sys.exit(42)
df_final.to_csv(OutputFileName, sep = "\t", index = False)
print("Wrote summary output to %s" % OutputFileName)