Commit 2f05ec1c authored by LANORE Vincent's avatar LANORE Vincent
Browse files

Added script to compute inferred tree lengths from ali+tree

parent 22cc0113
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Vincent Lanore <vincent.lanore@gmail.com>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
"""Computes the length of trees inferred from an alignment.
Usage:
compute_tree_length.py [options...] -t <tree> -a <align>
Options:
-a, --align <align> the alignment file to analyse
-t, --tree <tree> the corresponding tree file
-h, --help show this help message and exit"""
from diffsel_script_utils import STEP, MESSAGE, data, param
#===================================================================================================
STEP("Parsing command line arguments")
from docopt import docopt
args = docopt(__doc__)
ali_filename = args["--align"]
MESSAGE("Input alignment is " + param(ali_filename))
tree_filename = args["--tree"]
MESSAGE("Input tree is " + param(tree_filename))
#===================================================================================================
STEP("Convert alignment to amino acids")
from subprocess import check_output, CalledProcessError
from os.path import isfile, splitext
# if problems with seaview:
# sed -i $'s/\t/ /g' *.ali
base_ali_filename = splitext(ali_filename)[0]
MESSAGE("Alignment base filename is " + data(base_ali_filename))
aa_ali_filename = base_ali_filename + ".aa.phy"
if (isfile(aa_ali_filename)):
MESSAGE("Amino acid alignment file " + data(aa_ali_filename) + " already present")
else:
MESSAGE("Conversion using seaview...")
try:
check_output(["seaview", "-convert", "-o", aa_ali_filename, "-output-format", "phylip", "-translate", ali_filename])
except CalledProcessError as e:
print(e.output.decode("ascii"))
exit(1)
MESSAGE("Done. Amino acid alignment file is " + data(aa_ali_filename))
#===================================================================================================
STEP("Infer trees from alignment")
inferred_tree_nt_filename = ali_filename + "_phyml_tree.txt"
inferred_tree_aa_filename = aa_ali_filename + "_phyml_tree.txt"
if (isfile(inferred_tree_nt_filename)):
MESSAGE("Inferred tree " + data(inferred_tree_nt_filename) + " already exists")
else:
MESSAGE("Running phyml on nucleotide alignment...")
try:
check_output(["phyml", "-u", tree_filename, "-o", "lr", "-i", ali_filename, "-m", "GTR", "-d", "nt"])
except CalledProcessError as e:
print(e.output.decode("ascii"))
exit(1)
MESSAGE("Done.")
if (isfile(inferred_tree_aa_filename)):
MESSAGE("Inferred tree " + data(inferred_tree_aa_filename) + " already exists")
else:
MESSAGE("Running phyml on amino acid alignment...")
try:
check_output(["phyml", "-u", tree_filename, "-o", "lr", "-i", aa_ali_filename, "-m", "LG", "-d", "aa"])
except CalledProcessError as e:
print(e.output.decode("ascii"))
exit(1)
MESSAGE("Done.")
#===================================================================================================
STEP("Computingt tree lengths")
from ete3 import Tree
def length(tf):
tree = Tree(tf)
bl =0
for n in tree.traverse("postorder"):
bl += n.dist
return bl
trees = [["tree", tree_filename], ["inferred_tree_aa", inferred_tree_aa_filename], ["inferred_tree_nt", inferred_tree_nt_filename]]
list(map(lambda x: x.append(length(x[1])), trees))
for t in trees:
MESSAGE("Tree " + data(t[0]) + " has length " + data(t[2]))
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment