Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit 3f225416 authored by Carine Rey's avatar Carine Rey
Browse files

add Topological and Tdg09 detection meth (not working yet)

parent 8ee06d5a
FROM debian:stretch
RUN apt-get update && \
apt-get install --no-install-recommends -qy \
python-pip \
python-dev \
build-essential \
python2.7-minimal \
python-numpy \
python-pandas \
python-setuptools \
python-dev \
pyqt4-dev-tools \
xauth \
libcurl4-openssl-dev \
libxml2-dev \
libssl-dev \
libcairo2-dev \
openjdk-8-jre \
wget
RUN pip install --upgrade pip
RUN pip install ete3==3.0.0b35
RUN pip install biopython==1.72
# getting tdg09
WORKDIR /tdg09
RUN wget https://github.com/tamuri/tdg09/archive/v1.1.2.tar.gz &&\
tar xzf v1.1.2.tar.gz &&\
rm v1.1.2.tar.gz && \
printf "#!/bin/bash \
\njava -cp /tdg09/tdg09-1.1.2/dist/tdg09.jar tdg09.Analyse \$@ \
\n" > /usr/bin/tdg09.sh && chmod +x /usr/bin/tdg09.sh
#! /bin/bash
set -e
IMAGE_NAME=tdg09
DOCKERFILE_DIR=.
TAG=v1.1.2
REPO=carinerey/$IMAGE_NAME:$TAG
docker build -t $REPO -f ./Dockerfile $DOCKERFILE_DIR
if [[ $1 == "push_yes" ]]
then
docker push $REPO
fi
......@@ -12,6 +12,9 @@ type result = [
| `Diffsel_bis of [`diffsel] directory workflow
| `Identical_LG of [`identical] directory workflow
| `Identical_WAG of [`identical] directory workflow
| `Topological_LG of [`topological] directory workflow
| `Topological_WAG of [`topological] directory workflow
| `Tdg09 of [`tdg09] directory workflow
]
let meth_string_of_result = function
......@@ -21,6 +24,9 @@ let meth_string_of_result = function
| `Diffsel_bis _ -> "diffsel_bis"
| `Identical_LG _ -> "identical_LG"
| `Identical_WAG _ -> "identical_WAG"
| `Topological_LG _ -> "topological_LG"
| `Topological_WAG _ -> "topological_WAG"
| `Tdg09 _ -> "Tdg09"
type dataset_res = {
model_prefix : string ;
......@@ -40,6 +46,9 @@ let merge_results ~res_by_tools : text_file workflow =
| `Diffsel_bis d -> Diffsel.selector d
| `Identical_LG d -> Identical.results d
| `Identical_WAG d -> Identical.results d
| `Topological_LG d -> Topological.results d
| `Topological_WAG d -> Topological.results d
| `Tdg09 d -> Tamuri.results d
in
let opt = match res with
| `Pcoc _ -> string "--pcoc"
......@@ -48,6 +57,9 @@ let merge_results ~res_by_tools : text_file workflow =
| `Diffsel_bis _ -> string "--diffsel_bis"
| `Identical_LG _ -> string "--identical_LG"
| `Identical_WAG _ -> string "--identical_WAG"
| `Topological_LG _ -> string "--topological_LG"
| `Topological_WAG _ -> string "--topological_WAG"
| `Tdg09 _ -> string "--tdg09"
in
seq ~sep:" " [opt; dep w]
)
......@@ -74,6 +86,9 @@ let plot_merge_results ~(res_by_tools:result list) ~tree ~faa ~tsv : svg workflo
| `Diffsel_bis _ -> "Diffsel_bis"
| `Identical_LG _ -> "Identical_LG08"
| `Identical_WAG _ -> "Identical_WAG01"
| `Topological_LG _ -> "Topological_LG08"
| `Topological_WAG _ -> "Topological_WAG01"
| `Tdg09 _ -> "Tdg09"
in
string opt
) |> seq ~sep:","
......@@ -86,6 +101,9 @@ let plot_merge_results ~(res_by_tools:result list) ~tree ~faa ~tsv : svg workflo
| `Diffsel_bis _ -> "Diffsel_bis:0.9"
| `Identical_LG _ -> "Identical_LG08:0.9"
| `Identical_WAG _ -> "Identical_WAG01:0.9"
| `Topological_LG _ -> "Topological_LG08:0.9"
| `Topological_WAG _ -> "Topological_WAG01:0.9"
| `Tdg09 _ -> "Tdg09:0.9"
in
string opt
) |> seq ~sep:","
......
......@@ -13,6 +13,9 @@ type result = [
| `Diffsel_bis of [`diffsel] directory workflow
| `Identical_LG of [`identical] directory workflow
| `Identical_WAG of [`identical] directory workflow
| `Topological_LG of [`topological] directory workflow
| `Topological_WAG of [`topological] directory workflow
| `Tdg09 of [`tdg09] directory workflow
]
val meth_string_of_result : result -> string
......
......@@ -51,6 +51,9 @@ let repo_of_detection_result res =
| `Diffsel_bis w -> item ["diffsel_bis.results.tsv"] (Diffsel.selector w)
| `Identical_LG w -> item ["Identical_LG.results.tsv"] (Identical.results w)
| `Identical_WAG w -> item ["Identical_WAG.results.tsv"] (Identical.results w)
| `Topological_LG w -> item ["Topological_LG.results.tsv"] (Topological.results w)
| `Topological_WAG w -> item ["Topological_WAG.results.tsv"] (Topological.results w)
| `Tdg09 w -> item ["Tdg09.results.tsv"] (Tamuri.results w)
) ;
(
match res with
......@@ -60,6 +63,9 @@ let repo_of_detection_result res =
| `Diffsel_bis w -> item ["raw_results"] w
| `Identical_LG w -> item ["raw_results"] w
| `Identical_WAG w -> item ["raw_results"] w
| `Topological_LG w -> item ["raw_results"] w
| `Topological_WAG w -> item ["raw_results"] w
| `Tdg09 w -> item ["raw_results"] w
) ;
]
|> Repo.shift det_meth_prefix
......@@ -90,23 +96,27 @@ let derive_from_det_meth ~det_meth ~(dataset : Dataset.t) ~preview =
let tree_sc = Tree_dataset.tree dataset.dataset.tree_dataset `Detection in
let tree_id = Tree_dataset.tree dataset.dataset.tree_dataset `Simulation in
let diffsel_tree = Tree_dataset.diffsel_tree dataset.dataset.tree_dataset in
let tree_conv = Tree_dataset.topological_tree dataset.dataset.tree_dataset in
let w_every = if preview then 1 else 1 in
let n_cycles = if preview then 10 else 2000 in
match det_meth with
| `Pcoc -> `Pcoc (Pcoc.pcoc ~plot_complete:true ~gamma:false ~faa ~tree:tree_sc)
| `Pcoc_gamma -> `Pcoc_gamma (Pcoc.pcoc ~plot_complete: true ~gamma:true ~faa ~tree:tree_sc)
| `Tdg09 -> `Tdg09 (Tamuri.tdg09 ~faa ~tree:tree_sc)
| `Diffsel -> `Diffsel (Diffsel.diffsel ~phy_n ~tree:diffsel_tree ~w_every ~n_cycles)
| `Diffsel_bis -> `Diffsel_bis (Diffsel.diffsel ~phy_n ~tree:diffsel_tree ~w_every ~n_cycles)
| `Identical_LG -> `Identical_LG (Identical.identical ~faa ~tree_id ~tree_sc ~prot_model:"LG08")
| `Identical_WAG -> `Identical_WAG (Identical.identical ~faa ~tree_id ~tree_sc ~prot_model:"WAG01")
| `Topological_LG -> `Topological_LG (Topological.topological ~faa ~tree:tree_id ~tree_conv ~prot_model:"LG08")
| `Topological_WAG -> `Topological_WAG (Topological.topological ~faa ~tree:tree_id ~tree_conv ~prot_model:"WAG01")
let derive_from_dataset ~dataset ~preview =
let det_meths = [`Pcoc;`Pcoc_gamma;
`Tdg09;
`Diffsel;`Diffsel_bis;
`Identical_LG;`Identical_WAG] in
`Identical_LG;`Identical_WAG;
`Topological_LG;`Topological_WAG] in
let res_by_tools = List.map det_meths ~f:(fun det_meth ->
derive_from_det_meth ~det_meth ~dataset ~preview
) in
......
......@@ -25,6 +25,7 @@ let repo rd =
item ["tree.only_convergent_tags.nhx" ] (Tree_dataset.tree rd.tree_dataset `Detection) ;
item ["tree.only_node_ids.nhx" ] (Tree_dataset.tree rd.tree_dataset `Simulation) ;
item ["tree.diffsel" ] (Tree_dataset.diffsel_tree rd.tree_dataset) ;
item ["tree.convergent_topology" ] (Tree_dataset.topological_tree rd.tree_dataset) ;
item ["simulated_sequences.fna"] rd.fna ;
item ["simulated_sequences.faa"] rd.faa ;
]
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import argparse
import sys
import os, re
import logging
import pandas as pd
from ete3 import Tree, NodeStyle, TreeStyle, TextFace
from Bio import AlignIO
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser = argparse.ArgumentParser(prog="calc_topological.py",
description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument('-bppml_non_conv', "--bppml_non_conv", type=argparse.FileType('r'),
help='bppml_non_conv filename', required=True)
requiredOptions.add_argument('-bppml_conv', "--bppml_conv", type=argparse.FileType('r'),
help='bppml_conv filename', required=True)
requiredOptions.add_argument('-o', '--output', type=str,
help="Output name", required=True)
##############
### Option parsing
args = parser.parse_args()
Bppml_non_conv = args.bppml_non_conv
Bppml_conv = args.bppml_conv
OutFile = args.output
#===================================================================================================
# Set up output directory and logger
#===================================================================================================
### Set up the logger
# create logger
logger = logging.getLogger("calc_identical")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
if args.debug:
ch.setLevel(logging.DEBUG)
else:
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter_ch = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter_ch)
logger.addHandler(ch)
logger.debug(sys.argv)
#===================================================================================================
#
#===================================================================================================
#proba_df_melt["Sites"] = pd.to_numeric(proba_df_melt["Sites"].str.replace('[','').str.replace(']',''))
n_sites = 20
Sites = [i +1 for i in range(n_sites)]
df_final = pd.DataFrame({'Sites': Sites,
'Topological' : 0})
df_final = df_final[["Sites","Topological"]]
#===================================================================================================
# Create output files
#===================================================================================================
if df_final.shape[0] != n_sites:
logger.error("Format error!")
sys.exit(42)
df_final.to_csv(OutFile, sep = "\t", index = False)
logger.info("Write summary output in %s", OutFile)
......@@ -62,6 +62,12 @@ availableOptions.add_argument('--identical_LG', type=str,
help="Identical_LG output name", default = None)
availableOptions.add_argument('--identical_WAG', type=str,
help="Identical_WAG output name", default = None)
availableOptions.add_argument('--topological_LG', type=str,
help="topological_LG output name", default = None)
availableOptions.add_argument('--topological_WAG', type=str,
help="topological_WAG output name", default = None)
availableOptions.add_argument('--tdg09', type=str,
help="tdg09 output name", default = None)
##############
......@@ -75,6 +81,9 @@ df_diffsel = pd.DataFrame()
df_diffsel_bis = pd.DataFrame()
df_identical_LG = pd.DataFrame()
df_identical_WAG = pd.DataFrame()
df_topological_LG = pd.DataFrame()
df_topological_WAG = pd.DataFrame()
df_tdg09 = pd.DataFrame()
OutName = args.output
if args.pcoc :
......@@ -103,10 +112,24 @@ if args.identical_WAG :
df_identical_WAG = pd.read_csv(args.identical_WAG, sep="\t")
df_identical_WAG.rename(columns={'Identical': 'Identical_WAG01'}, inplace=True)
if args.topological_LG :
df_topological_LG = pd.read_csv(args.topological_LG, sep="\t")
df_topological_LG.rename(columns={'Topological': 'Topological_LG08'}, inplace=True)
if args.topological_WAG :
df_topological_WAG = pd.read_csv(args.topological_WAG, sep="\t")
df_topological_WAG.rename(columns={'Topological': 'Topological_WAG01'}, inplace=True)
if args.tdg09 :
df_tdg09 = pd.read_csv(args.tdg09, sep="\t")
df_list = [df for df in [df_pcoc, df_pcoc_gamma,
df_diffsel, df_diffsel_bis,
df_identical_LG, df_identical_WAG] if not df.empty ]
df_identical_LG, df_identical_WAG,
df_topological_LG, df_topological_WAG,
df_tdg09
] if not df.empty ]
df_list_len = [df.shape[0] for df in df_list]
if len(set(df_list_len)) != 1:
......
......@@ -205,9 +205,76 @@ with open("%s/tree.Ha.node_ids" %(OutDirName), "w") as output_Ha_node_ids:
t.write(format=1, features=["Condition","Transition"], outfile = "%s/tree.only_convergent_tags.nhx" %(OutDirName))
### tree.diffsel: a diffsel input formated tree
for n in t.traverse():
tdiffsel = t.copy(method="deepcopy")
for n in tdiffsel.traverse():
if n.ND in conv_node_ids+trans_node_ids:
n.dist = 2
elif n.ND in not_conv_node_ids:
n.dist = 1
t.write(format=1, outfile = "%s/tree.diffsel" %(OutDirName))
tdiffsel.write(format=1, outfile = "%s/tree.diffsel" %(OutDirName))
### tree.topological : a diffsel input formated tree
def build_conv_topo(t):
conv_nodes_ids = [n.ND for n in t.search_nodes(Condition="1")]
tconv = t.copy(method="deepcopy")
for n in tconv.iter_leaves():
n.add_features(L=1)
for n in tconv.traverse():
n.add_features(COPY=0)
# get the most recent ancestral node of all the convergent clades
l_convergent_clades = tconv.search_nodes(Condition="1")
common_anc_conv=tconv.get_common_ancestor(l_convergent_clades)
# duplicate it at its same location (branch lenght = 0). we get
# a duplicated subtree with subtrees A and B (A == B)
dist_dup = common_anc_conv.dist
if not common_anc_conv.is_root():
dup_point = common_anc_conv.add_sister(name="dup_point",dist=0.000001)
dup_point_root = False
else:
dup_point = Tree()
dup_point_root = True
dup_point.dist=0.000001
dup_point.add_features(ND=0,Transition="0", Condition="0")
common_anc_conv.detach()
common_anc_conv_copy = common_anc_conv.copy(method="deepcopy")
# tag duplicated nodes:
for n in common_anc_conv_copy.traverse():
n.COPY=1
if n.ND not in conv_nodes_ids and not n.is_root():
n.dist=0.000001
# pruned A from all branches not leading to any convergent clade
l_leaves_to_keep_A = common_anc_conv.search_nodes(COPY=0, Condition="0", L=1)
#logger.debug("A: %s",l_leaves_to_keep_A)
common_anc_conv.prune(l_leaves_to_keep_A, preserve_branch_length=True)
# pruned B from all branches not leading to any non-convergent clade
l_leaves_to_keep_B = common_anc_conv_copy.search_nodes(COPY=1, Condition="1", L=1)
#logger.debug("B : %s", l_leaves_to_keep_B)
common_anc_conv_copy.prune(l_leaves_to_keep_B, preserve_branch_length=True)
dup_point.add_child(common_anc_conv_copy)
dup_point.add_child(common_anc_conv)
tconv = dup_point.get_tree_root()
nodeId = 0
for node in tconv.traverse("postorder"):
node.ND = nodeId
nodeId += 1
return tconv
tconv = build_conv_topo(t)
tconv.write(format=1, outfile = "%s/tree.topological" %(OutDirName))
#!/usr/bin/python
# -*- coding: utf-8 -*-
import argparse
import pandas as pd
import sys
# LIGNE ESSAI
# python parse_results_tdg09.py -ftdg09 toto -o out
### Option defining
parser = argparse.ArgumentParser(prog="parse_results_tdg09.py",
description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument('-tdg09', "--file", type=argparse.FileType('r'),
help='Input filename', required=True)
requiredOptions.add_argument('-o', '--output', type=str,
help="Output name", required=True)
### Option parsing
args = parser.parse_args()
Tdg09File = args.file
OutFile = args.output
searchlines = Tdg09File.readlines()
Tdg09File.close()
n_sites=int(searchlines[9].split()[1])
fdr=["NA"] * (n_sites)
for i,line in enumerate(searchlines):
if "LrtResults:\n" in line:
for l in searchlines[(i+2):(i+n_sites+1)]:
if not 'FullResults:\n' in l:
ll=l.split()
if ll:
index = int(ll[2][:-1])
value = float(ll[6].replace(',','.'))
#print ll
#print index
#print value
fdr[index-1]=1-value
else:
break
# Sites Tamuri
Sites = [i +1 for i in range(n_sites)]
df_final = pd.DataFrame({'Sites': Sites,
'Tdg09' : fdr})
df_final = df_final[["Sites","Tdg09"]]
#===================================================================================================
# Create output files
#===================================================================================================
if df_final.shape[0] != n_sites:
print("Format error!")
sys.exit(42)
df_final.to_csv(OutFile, sep = "\t", index = False)
print("Write summary output in %s" %OutFile)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import argparse
import sys
import os
import logging
import ete3
import Bio
from ete3 import Tree, NodeStyle, TreeStyle, TextFace
from Bio import AlignIO
# LIGNE ESSAI
# python rename_input_tree_ali_tdg09.py -t tree.only_convergent_tags.nhx -a simulated_sequences.faa -o out
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser = argparse.ArgumentParser(prog="rename_input_tree_ali_tdg09.py",
description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument('-t', "--tree", type=argparse.FileType('r'),
help='Input tree filename', required=True)
requiredOptions.add_argument('-a', "--alignment", type=argparse.FileType('r'),
help='Input alignment', required=True)
requiredOptions.add_argument('-out_a', '--output_ali', type=str,
help="Output ali name", required=True)
requiredOptions.add_argument('-out_t', '--output_tree', type=str,
help="Output tree name", required=True)
##############
### Option parsing
args = parser.parse_args()
TreeFile = args.tree
AliFile = args.alignment
OutAli = args.output_ali
OutTree = args.output_tree
#===================================================================================================
# Set up output directory and logger
#===================================================================================================
### Set up the logger
# create logger
logger = logging.getLogger("parse_input_tree")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
if args.debug:
ch.setLevel(logging.DEBUG)
else:
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter_ch = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter_ch)
logger.addHandler(ch)
logger.debug(sys.argv)
#===================================================================================================
# Read input tree
#===================================================================================================
try:
t=Tree(TreeFile.name)
except Exception as exc:
logger.error(str(exc))
sys.exit(1)
logger.info("Tree (%s) ok after checking", TreeFile.name)
#===================================================================================================
# Read input alignment
#===================================================================================================
try:
alignment = AlignIO.read(AliFile.name, "fasta")
except Exception as exc:
logger.error(str(exc))
sys.exit(1)
logger.info("Ali (%s) ok after checking", AliFile.name)
#===================================================================================================
# Create output files
#===================================================================================================
######## RENAME TREE LEAVES AND SEQUENCES FOR TDG09
nodeId = 0