Commit 3f225416 authored by Carine Rey's avatar Carine Rey
Browse files

add Topological and Tdg09 detection meth (not working yet)

parent 8ee06d5a
FROM debian:stretch
RUN apt-get update && \
apt-get install --no-install-recommends -qy \
python-pip \
python-dev \
build-essential \
python2.7-minimal \
python-numpy \
python-pandas \
python-setuptools \
python-dev \
pyqt4-dev-tools \
xauth \
libcurl4-openssl-dev \
libxml2-dev \
libssl-dev \
libcairo2-dev \
openjdk-8-jre \
wget
RUN pip install --upgrade pip
RUN pip install ete3==3.0.0b35
RUN pip install biopython==1.72
# getting tdg09
WORKDIR /tdg09
RUN wget https://github.com/tamuri/tdg09/archive/v1.1.2.tar.gz &&\
tar xzf v1.1.2.tar.gz &&\
rm v1.1.2.tar.gz && \
printf "#!/bin/bash \
\njava -cp /tdg09/tdg09-1.1.2/dist/tdg09.jar tdg09.Analyse \$@ \
\n" > /usr/bin/tdg09.sh && chmod +x /usr/bin/tdg09.sh
#! /bin/bash
set -e
IMAGE_NAME=tdg09
DOCKERFILE_DIR=.
TAG=v1.1.2
REPO=carinerey/$IMAGE_NAME:$TAG
docker build -t $REPO -f ./Dockerfile $DOCKERFILE_DIR
if [[ $1 == "push_yes" ]]
then
docker push $REPO
fi
......@@ -12,6 +12,9 @@ type result = [
| `Diffsel_bis of [`diffsel] directory workflow
| `Identical_LG of [`identical] directory workflow
| `Identical_WAG of [`identical] directory workflow
| `Topological_LG of [`topological] directory workflow
| `Topological_WAG of [`topological] directory workflow
| `Tdg09 of [`tdg09] directory workflow
]
let meth_string_of_result = function
......@@ -21,6 +24,9 @@ let meth_string_of_result = function
| `Diffsel_bis _ -> "diffsel_bis"
| `Identical_LG _ -> "identical_LG"
| `Identical_WAG _ -> "identical_WAG"
| `Topological_LG _ -> "topological_LG"
| `Topological_WAG _ -> "topological_WAG"
| `Tdg09 _ -> "Tdg09"
type dataset_res = {
model_prefix : string ;
......@@ -40,6 +46,9 @@ let merge_results ~res_by_tools : text_file workflow =
| `Diffsel_bis d -> Diffsel.selector d
| `Identical_LG d -> Identical.results d
| `Identical_WAG d -> Identical.results d
| `Topological_LG d -> Topological.results d
| `Topological_WAG d -> Topological.results d
| `Tdg09 d -> Tamuri.results d
in
let opt = match res with
| `Pcoc _ -> string "--pcoc"
......@@ -48,6 +57,9 @@ let merge_results ~res_by_tools : text_file workflow =
| `Diffsel_bis _ -> string "--diffsel_bis"
| `Identical_LG _ -> string "--identical_LG"
| `Identical_WAG _ -> string "--identical_WAG"
| `Topological_LG _ -> string "--topological_LG"
| `Topological_WAG _ -> string "--topological_WAG"
| `Tdg09 _ -> string "--tdg09"
in
seq ~sep:" " [opt; dep w]
)
......@@ -74,6 +86,9 @@ let plot_merge_results ~(res_by_tools:result list) ~tree ~faa ~tsv : svg workflo
| `Diffsel_bis _ -> "Diffsel_bis"
| `Identical_LG _ -> "Identical_LG08"
| `Identical_WAG _ -> "Identical_WAG01"
| `Topological_LG _ -> "Topological_LG08"
| `Topological_WAG _ -> "Topological_WAG01"
| `Tdg09 _ -> "Tdg09"
in
string opt
) |> seq ~sep:","
......@@ -86,6 +101,9 @@ let plot_merge_results ~(res_by_tools:result list) ~tree ~faa ~tsv : svg workflo
| `Diffsel_bis _ -> "Diffsel_bis:0.9"
| `Identical_LG _ -> "Identical_LG08:0.9"
| `Identical_WAG _ -> "Identical_WAG01:0.9"
| `Topological_LG _ -> "Topological_LG08:0.9"
| `Topological_WAG _ -> "Topological_WAG01:0.9"
| `Tdg09 _ -> "Tdg09:0.9"
in
string opt
) |> seq ~sep:","
......
......@@ -13,6 +13,9 @@ type result = [
| `Diffsel_bis of [`diffsel] directory workflow
| `Identical_LG of [`identical] directory workflow
| `Identical_WAG of [`identical] directory workflow
| `Topological_LG of [`topological] directory workflow
| `Topological_WAG of [`topological] directory workflow
| `Tdg09 of [`tdg09] directory workflow
]
val meth_string_of_result : result -> string
......
......@@ -51,6 +51,9 @@ let repo_of_detection_result res =
| `Diffsel_bis w -> item ["diffsel_bis.results.tsv"] (Diffsel.selector w)
| `Identical_LG w -> item ["Identical_LG.results.tsv"] (Identical.results w)
| `Identical_WAG w -> item ["Identical_WAG.results.tsv"] (Identical.results w)
| `Topological_LG w -> item ["Topological_LG.results.tsv"] (Topological.results w)
| `Topological_WAG w -> item ["Topological_WAG.results.tsv"] (Topological.results w)
| `Tdg09 w -> item ["Tdg09.results.tsv"] (Tamuri.results w)
) ;
(
match res with
......@@ -60,6 +63,9 @@ let repo_of_detection_result res =
| `Diffsel_bis w -> item ["raw_results"] w
| `Identical_LG w -> item ["raw_results"] w
| `Identical_WAG w -> item ["raw_results"] w
| `Topological_LG w -> item ["raw_results"] w
| `Topological_WAG w -> item ["raw_results"] w
| `Tdg09 w -> item ["raw_results"] w
) ;
]
|> Repo.shift det_meth_prefix
......@@ -90,23 +96,27 @@ let derive_from_det_meth ~det_meth ~(dataset : Dataset.t) ~preview =
let tree_sc = Tree_dataset.tree dataset.dataset.tree_dataset `Detection in
let tree_id = Tree_dataset.tree dataset.dataset.tree_dataset `Simulation in
let diffsel_tree = Tree_dataset.diffsel_tree dataset.dataset.tree_dataset in
let tree_conv = Tree_dataset.topological_tree dataset.dataset.tree_dataset in
let w_every = if preview then 1 else 1 in
let n_cycles = if preview then 10 else 2000 in
match det_meth with
| `Pcoc -> `Pcoc (Pcoc.pcoc ~plot_complete:true ~gamma:false ~faa ~tree:tree_sc)
| `Pcoc_gamma -> `Pcoc_gamma (Pcoc.pcoc ~plot_complete: true ~gamma:true ~faa ~tree:tree_sc)
| `Tdg09 -> `Tdg09 (Tamuri.tdg09 ~faa ~tree:tree_sc)
| `Diffsel -> `Diffsel (Diffsel.diffsel ~phy_n ~tree:diffsel_tree ~w_every ~n_cycles)
| `Diffsel_bis -> `Diffsel_bis (Diffsel.diffsel ~phy_n ~tree:diffsel_tree ~w_every ~n_cycles)
| `Identical_LG -> `Identical_LG (Identical.identical ~faa ~tree_id ~tree_sc ~prot_model:"LG08")
| `Identical_WAG -> `Identical_WAG (Identical.identical ~faa ~tree_id ~tree_sc ~prot_model:"WAG01")
| `Topological_LG -> `Topological_LG (Topological.topological ~faa ~tree:tree_id ~tree_conv ~prot_model:"LG08")
| `Topological_WAG -> `Topological_WAG (Topological.topological ~faa ~tree:tree_id ~tree_conv ~prot_model:"WAG01")
let derive_from_dataset ~dataset ~preview =
let det_meths = [`Pcoc;`Pcoc_gamma;
`Tdg09;
`Diffsel;`Diffsel_bis;
`Identical_LG;`Identical_WAG] in
`Identical_LG;`Identical_WAG;
`Topological_LG;`Topological_WAG] in
let res_by_tools = List.map det_meths ~f:(fun det_meth ->
derive_from_det_meth ~det_meth ~dataset ~preview
) in
......
......@@ -25,6 +25,7 @@ let repo rd =
item ["tree.only_convergent_tags.nhx" ] (Tree_dataset.tree rd.tree_dataset `Detection) ;
item ["tree.only_node_ids.nhx" ] (Tree_dataset.tree rd.tree_dataset `Simulation) ;
item ["tree.diffsel" ] (Tree_dataset.diffsel_tree rd.tree_dataset) ;
item ["tree.convergent_topology" ] (Tree_dataset.topological_tree rd.tree_dataset) ;
item ["simulated_sequences.fna"] rd.fna ;
item ["simulated_sequences.faa"] rd.faa ;
]
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import argparse
import sys
import os, re
import logging
import pandas as pd
from ete3 import Tree, NodeStyle, TreeStyle, TextFace
from Bio import AlignIO
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser = argparse.ArgumentParser(prog="calc_topological.py",
description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument('-bppml_non_conv', "--bppml_non_conv", type=argparse.FileType('r'),
help='bppml_non_conv filename', required=True)
requiredOptions.add_argument('-bppml_conv', "--bppml_conv", type=argparse.FileType('r'),
help='bppml_conv filename', required=True)
requiredOptions.add_argument('-o', '--output', type=str,
help="Output name", required=True)
##############
### Option parsing
args = parser.parse_args()
Bppml_non_conv = args.bppml_non_conv
Bppml_conv = args.bppml_conv
OutFile = args.output
#===================================================================================================
# Set up output directory and logger
#===================================================================================================
### Set up the logger
# create logger
logger = logging.getLogger("calc_identical")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
if args.debug:
ch.setLevel(logging.DEBUG)
else:
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter_ch = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter_ch)
logger.addHandler(ch)
logger.debug(sys.argv)
#===================================================================================================
#
#===================================================================================================
#proba_df_melt["Sites"] = pd.to_numeric(proba_df_melt["Sites"].str.replace('[','').str.replace(']',''))
n_sites = 20
Sites = [i +1 for i in range(n_sites)]
df_final = pd.DataFrame({'Sites': Sites,
'Topological' : 0})
df_final = df_final[["Sites","Topological"]]
#===================================================================================================
# Create output files
#===================================================================================================
if df_final.shape[0] != n_sites:
logger.error("Format error!")
sys.exit(42)
df_final.to_csv(OutFile, sep = "\t", index = False)
logger.info("Write summary output in %s", OutFile)
......@@ -62,6 +62,12 @@ availableOptions.add_argument('--identical_LG', type=str,
help="Identical_LG output name", default = None)
availableOptions.add_argument('--identical_WAG', type=str,
help="Identical_WAG output name", default = None)
availableOptions.add_argument('--topological_LG', type=str,
help="topological_LG output name", default = None)
availableOptions.add_argument('--topological_WAG', type=str,
help="topological_WAG output name", default = None)
availableOptions.add_argument('--tdg09', type=str,
help="tdg09 output name", default = None)
##############
......@@ -75,6 +81,9 @@ df_diffsel = pd.DataFrame()
df_diffsel_bis = pd.DataFrame()
df_identical_LG = pd.DataFrame()
df_identical_WAG = pd.DataFrame()
df_topological_LG = pd.DataFrame()
df_topological_WAG = pd.DataFrame()
df_tdg09 = pd.DataFrame()
OutName = args.output
if args.pcoc :
......@@ -103,10 +112,24 @@ if args.identical_WAG :
df_identical_WAG = pd.read_csv(args.identical_WAG, sep="\t")
df_identical_WAG.rename(columns={'Identical': 'Identical_WAG01'}, inplace=True)
if args.topological_LG :
df_topological_LG = pd.read_csv(args.topological_LG, sep="\t")
df_topological_LG.rename(columns={'Topological': 'Topological_LG08'}, inplace=True)
if args.topological_WAG :
df_topological_WAG = pd.read_csv(args.topological_WAG, sep="\t")
df_topological_WAG.rename(columns={'Topological': 'Topological_WAG01'}, inplace=True)
if args.tdg09 :
df_tdg09 = pd.read_csv(args.tdg09, sep="\t")
df_list = [df for df in [df_pcoc, df_pcoc_gamma,
df_diffsel, df_diffsel_bis,
df_identical_LG, df_identical_WAG] if not df.empty ]
df_identical_LG, df_identical_WAG,
df_topological_LG, df_topological_WAG,
df_tdg09
] if not df.empty ]
df_list_len = [df.shape[0] for df in df_list]
if len(set(df_list_len)) != 1:
......
......@@ -205,9 +205,76 @@ with open("%s/tree.Ha.node_ids" %(OutDirName), "w") as output_Ha_node_ids:
t.write(format=1, features=["Condition","Transition"], outfile = "%s/tree.only_convergent_tags.nhx" %(OutDirName))
### tree.diffsel: a diffsel input formated tree
for n in t.traverse():
tdiffsel = t.copy(method="deepcopy")
for n in tdiffsel.traverse():
if n.ND in conv_node_ids+trans_node_ids:
n.dist = 2
elif n.ND in not_conv_node_ids:
n.dist = 1
t.write(format=1, outfile = "%s/tree.diffsel" %(OutDirName))
tdiffsel.write(format=1, outfile = "%s/tree.diffsel" %(OutDirName))
### tree.topological : a diffsel input formated tree
def build_conv_topo(t):
conv_nodes_ids = [n.ND for n in t.search_nodes(Condition="1")]
tconv = t.copy(method="deepcopy")
for n in tconv.iter_leaves():
n.add_features(L=1)
for n in tconv.traverse():
n.add_features(COPY=0)
# get the most recent ancestral node of all the convergent clades
l_convergent_clades = tconv.search_nodes(Condition="1")
common_anc_conv=tconv.get_common_ancestor(l_convergent_clades)
# duplicate it at its same location (branch lenght = 0). we get
# a duplicated subtree with subtrees A and B (A == B)
dist_dup = common_anc_conv.dist
if not common_anc_conv.is_root():
dup_point = common_anc_conv.add_sister(name="dup_point",dist=0.000001)
dup_point_root = False
else:
dup_point = Tree()
dup_point_root = True
dup_point.dist=0.000001
dup_point.add_features(ND=0,Transition="0", Condition="0")
common_anc_conv.detach()
common_anc_conv_copy = common_anc_conv.copy(method="deepcopy")
# tag duplicated nodes:
for n in common_anc_conv_copy.traverse():
n.COPY=1
if n.ND not in conv_nodes_ids and not n.is_root():
n.dist=0.000001
# pruned A from all branches not leading to any convergent clade
l_leaves_to_keep_A = common_anc_conv.search_nodes(COPY=0, Condition="0", L=1)
#logger.debug("A: %s",l_leaves_to_keep_A)
common_anc_conv.prune(l_leaves_to_keep_A, preserve_branch_length=True)
# pruned B from all branches not leading to any non-convergent clade
l_leaves_to_keep_B = common_anc_conv_copy.search_nodes(COPY=1, Condition="1", L=1)
#logger.debug("B : %s", l_leaves_to_keep_B)
common_anc_conv_copy.prune(l_leaves_to_keep_B, preserve_branch_length=True)
dup_point.add_child(common_anc_conv_copy)
dup_point.add_child(common_anc_conv)
tconv = dup_point.get_tree_root()
nodeId = 0
for node in tconv.traverse("postorder"):
node.ND = nodeId
nodeId += 1
return tconv
tconv = build_conv_topo(t)
tconv.write(format=1, outfile = "%s/tree.topological" %(OutDirName))
#!/usr/bin/python
# -*- coding: utf-8 -*-
import argparse
import pandas as pd
import sys
# LIGNE ESSAI
# python parse_results_tdg09.py -ftdg09 toto -o out
### Option defining
parser = argparse.ArgumentParser(prog="parse_results_tdg09.py",
description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument('-tdg09', "--file", type=argparse.FileType('r'),
help='Input filename', required=True)
requiredOptions.add_argument('-o', '--output', type=str,
help="Output name", required=True)
### Option parsing
args = parser.parse_args()
Tdg09File = args.file
OutFile = args.output
searchlines = Tdg09File.readlines()
Tdg09File.close()
n_sites=int(searchlines[9].split()[1])
fdr=["NA"] * (n_sites)
for i,line in enumerate(searchlines):
if "LrtResults:\n" in line:
for l in searchlines[(i+2):(i+n_sites+1)]:
if not 'FullResults:\n' in l:
ll=l.split()
if ll:
index = int(ll[2][:-1])
value = float(ll[6].replace(',','.'))
#print ll
#print index
#print value
fdr[index-1]=1-value
else:
break
# Sites Tamuri
Sites = [i +1 for i in range(n_sites)]
df_final = pd.DataFrame({'Sites': Sites,
'Tdg09' : fdr})
df_final = df_final[["Sites","Tdg09"]]
#===================================================================================================
# Create output files
#===================================================================================================
if df_final.shape[0] != n_sites:
print("Format error!")
sys.exit(42)
df_final.to_csv(OutFile, sep = "\t", index = False)
print("Write summary output in %s" %OutFile)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import argparse
import sys
import os
import logging
import ete3
import Bio
from ete3 import Tree, NodeStyle, TreeStyle, TextFace
from Bio import AlignIO
# LIGNE ESSAI
# python rename_input_tree_ali_tdg09.py -t tree.only_convergent_tags.nhx -a simulated_sequences.faa -o out
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser = argparse.ArgumentParser(prog="rename_input_tree_ali_tdg09.py",
description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument('-t', "--tree", type=argparse.FileType('r'),
help='Input tree filename', required=True)
requiredOptions.add_argument('-a', "--alignment", type=argparse.FileType('r'),
help='Input alignment', required=True)
requiredOptions.add_argument('-out_a', '--output_ali', type=str,
help="Output ali name", required=True)
requiredOptions.add_argument('-out_t', '--output_tree', type=str,
help="Output tree name", required=True)
##############
### Option parsing
args = parser.parse_args()
TreeFile = args.tree
AliFile = args.alignment
OutAli = args.output_ali
OutTree = args.output_tree
#===================================================================================================
# Set up output directory and logger
#===================================================================================================
### Set up the logger
# create logger
logger = logging.getLogger("parse_input_tree")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
if args.debug:
ch.setLevel(logging.DEBUG)
else:
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter_ch = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter_ch)
logger.addHandler(ch)
logger.debug(sys.argv)
#===================================================================================================
# Read input tree
#===================================================================================================
try:
t=Tree(TreeFile.name)
except Exception as exc:
logger.error(str(exc))
sys.exit(1)
logger.info("Tree (%s) ok after checking", TreeFile.name)
#===================================================================================================
# Read input alignment
#===================================================================================================
try:
alignment = AlignIO.read(AliFile.name, "fasta")
except Exception as exc:
logger.error(str(exc))
sys.exit(1)
logger.info("Ali (%s) ok after checking", AliFile.name)
#===================================================================================================
# Create output files
#===================================================================================================
######## RENAME TREE LEAVES AND SEQUENCES FOR TDG09
nodeId = 0