Commit f99b6287 authored by Carine Rey's avatar Carine Rey Committed by Philippe Veber
Browse files

auto rm leaves if lacking sequences

parent e9b996b3
>Abildgaar
-TTGACATCCGCCAAGAGTCAGACCGCCACACTGATGTGCTTGATGCCATCACACAATAC
CTTGGAATTGGGTCCTACCGTGAGTGGTCTGAGGAGAAGCGTCAGGAGTGGCTCCTTTCC
GAACTCACTGGGAAACGTCCTCTTATCCCCCACGATTTT---CCCCAGACAGAAGAAATT
AAAGATGTCCTTGACACCCTACATGTCATTGCTGAGCTCCCCTCAGACAACTTTGGGGCC
TATATCATTTCCATGGCTACCTCCCCATCAGATGTTTTGGCTGTGGAGCTGTTGCAGCGA
GAGTGCCATGTGAAGCAACCACTGCGTGTCGTGCCACTGTTCGAGAAGCTTGCCGATCTT
GAGGCAGCTCCTGCCGCTGTGGCGCGTTTGTTCTCGATAGACTGGTACAGAAACAGAATC
AATGGCAAGCAAGAGGTTATGATTGGGTACTCTGACTCTGGCAAGGATGCTGGGCGGTTT
TCAGCCGCATGGCAGTTGTACAAGAGCCAGGCTGAGCTTGTAAAGGTTGCAAAACAATTC
GGTGTGAAGCTCACCATGTTCCACGGTCGTGGTGGGACTGTTGGTAGGGGTGGAGGCCCC
ACCCACTTGGCTATCCTGTCTCAGCCACCAGATACAATTCATGGGTCTCTCAGGGTCACA
GTCCAGGGAGAGGTCATTGAGCAATCATTTGGTGAGGAGCATCTCTGCTTCAGGACTCTG
CAGAGGTTCACAGCTGCTACCTTGGAGCATGGCATGCACCCACCGGTGTCACCAAAGCCT
GAGTGGGCTGCACTAATGGACGAGATGGCAATCATTGCCACCGAGGAGTACCGCTCAATT
GTGTTCAAGGAGCCGCGCTTTGTTGAGTATTTCCGCTTGGCTACACCTGAGCTGGAGTAT
GGACGTATGAACATTGGCAGCCGTCCATCCAAGCGCAAACCAAGTGGTGGCATTGAATCA
CTTCGTGCCATTCCGTGGATCTTTGCATGGACACAGACCCGCTTCCACCTGCCTGTCTGG
CTCGGCTTCGGTGCTGCATTTAAACATGTGATTGAGAAGGATGTGCGCAACATCCACATG
CTCCGGGAGATGTACAATGAGTGGCCGTTCTTCAGGGTCACCATTGATCTGGTGGAGATG
GTTTTCGCTAAGGGTGACCCCGGAATTGCATCATTGTATGACAAGCTGTTGGTGTCTGAG
GATTTGTGGCCGTTTGGAGAGAAATTGAGGGCTAATTACGAAGAAACCAAGAATCTCCTT
CTTCAGGTTGCGGGCCACAAGGACCTTCTTGAAGGAGATCCCTACCTAAAGCAAAGACTG
CGCCTTCGTGATTCCTACATCACAACCCTCAACGTGTTACAAGCATACACTCTX
>Actinosch
------------------------------------------------------------
-----------------------------TGAGGAGAAGCGTCAGGAGTGGCTCCTCGCC
GAACTCACTGGGAAACGTCCCCTTATCCCCCACGATTTT---CCCCAGACAGAAGAAATT
AAAGATGTCCTTGACACCCTTCATGTCATTGCTGAGCTCCCCTCGGACAACTTTGGGGCC
TACATCATTTCCATGGCTACCTCCCCATCAGATGTTTTAGCTGTGGAGCTGTTGCAGCGT
GAGTGCCATGTGAAGCAACCACTGCGTGTCGTGCCGCTGTTTGAGAAGCTTGCTGATCTT
GAGGCAGCTCCTGCTGCTGTGGCACGCTTATTTTCCATAGACTGGTACAGAAACAGAATA
AATGGCAAGCAAGAGGTTATGATTGGGTACTCTGACTCTGGCAAGGATGCTGGGCGGTTT
TCAGCCGCATGGCAGTTGTACAAGAGCCAGGCTGAGCTTGTGAAGGTTGCAAAACAATTT
GGTGTAAAGCTCACCATGTTCCATGGTCGTGGTGGAACTGTGGGTAGGGGTGGAGGCCCC
ACCCACTTGGCTATCCTGTCTCAGCCACCAGATACTATTCATGGGTCACTCAGGGTCACA
GTGCAGGGAGAGGTCATTGAGCAATCATTTGGCGAGGAGCATCTCTGCTTCAGGACTCTG
CAGAGGTTCACAGCTGCTACCTTGGAGCATGGTATGCATCCACCGGTGTCGCCAAAGCCA
GAATGGGCTGCACTTATGGACGAGATGGCGGTTATTGCCACTGAGGAGTACCGCTCAATT
GTGTTCAAGGAGCCACGCTTTGTTGAGTACTTCCGCCTGGCTACACCGGAGCCGGAGTAT
GGACGTATGAACATTGGCAGCCGACCATCCAAGCGCAAACCAAGTGGTGGCATTGAGTCA
CTTCGTGCCATTCCATGGATCTTTGCATGGACACAGACCCGTTTCCACCTGCCTGTGTGG
CTCGGCTTCGGTGCTGCATTTAATCATGTGATTGCGAAGGATGTGCGCCACATCCACATG
CTCCAGGAGATGTACAATGAGTGGCCATTCTTCAGGGTCACCATTGATCTGGTGGAGATG
GTTTTCGCCAAGGGTGACCCTGGAATTGCAGCGTTATATGACAAGCTGTTGGTGTCTGAG
GATCTGTGGACTTTTGGGGAGAGATTGAGGTCTAATTATGAAGAAACCAAGAATCTCCTT
CTTCAGGTTGCGGGCCACAAGGACCTTCTTGAA---------------------------
-----------------------------------------------------X
>Baumea
------------------------------------------------------------
-----------------------------TGAGGAGAAACGTCAGGAGTGGCTTCTCTCT
......
......@@ -38,13 +38,13 @@ with the format "gene1.fna", "gene2.fna",... |} in
let tree_prefix = Filename.chop_extension input_tree in
let input_tree = Workflow.input (Filename.concat indir (Filename.concat dataset_prefix input_tree)) in
let fna_l = Array.to_list @@ Sys.readdir (Filename.concat indir (dataset_prefix ^ "/" ^ fna_dir)) in
printf "%i files detected in %s\n" (List.length fna_l) fna_dir;
printf " %i files detected in %s\n" (List.length fna_l) fna_dir;
List.map fna_l ~f:(function fna ->
let fna_prefix = Filename.chop_extension fna in
printf "%s: %s\n" fna_prefix (Filename.concat indir (Filename.concat (dataset_prefix ^ "/" ^ fna_dir) fna));
let fna = Workflow.input (Filename.concat indir (Filename.concat (dataset_prefix ^ "/" ^ fna_dir) fna)) in
let filtered_input_tree = Raw_dataset.filter_input_tree ~descr:fna_prefix ~tree:input_tree ~fna () in
let fna_infos = None in
let raw_dataset = Raw_dataset.{input_tree; fna; fna_infos} in
let raw_dataset = Raw_dataset.{input_tree=filtered_input_tree; fna; fna_infos} in
let dataset = {Dataset.model_prefix = fna_prefix;
is_real = true;
tree_prefix = tree_prefix;
......
open Bistro
open File_formats
open Bistro_utils
open Bistro.Shell_dsl
open Core
type t = {
input_tree : nhx pworkflow ;
......@@ -15,11 +18,19 @@ let repo ~prefix rd =
]
let repo_realdata ~tree_prefix ~ali_prefix rd =
List.concat [
Repo.[
item [tree_prefix ^ ".nhx"] rd.input_tree ;
];
Repo.[
Repo.[
item [ali_prefix ^ ".fna"] rd.fna ;
] |> Repo.shift "Alignments"
item [tree_prefix ^ "." ^ ali_prefix ^ ".nhx"] rd.input_tree ;
]
|> Repo.shift ali_prefix
|> Repo.shift "Alignments"
let filter_input_tree ?(descr="") ~(tree:_ workflow) ~(fna:nucleotide_fasta pworkflow) () : nhx pworkflow =
Workflow.shell ~descr:("filter_input_tree."^descr) [
cmd "python" ~img:Env.env_py [
file_dump (string Scripts.filter_input_tree) ;
opt "-t" dep tree;
opt "-a" dep fna ;
opt "-o" ident dest ;
]
]
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import argparse
import sys
import os
import logging
from ete3 import Tree, NodeStyle, TreeStyle, TextFace
from Bio import AlignIO, SeqIO, Seq, SeqRecord
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser = argparse.ArgumentParser(prog="parse_input_tree.py",
description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument('-t', "--tree", type=argparse.FileType('r'),
help='Input tree filename', required=True)
requiredOptions.add_argument('-a', "--ali", type=argparse.FileType('r'),
help='Alignment filename', required=True)
requiredOptions.add_argument('-o', '--output', type=str,
help="Output filename", required=True)
##############
### Option parsing
args = parser.parse_args()
TreeFile = args.tree
AliFile = args.ali
OutDirName = args.output
#===================================================================================================
# Set up output directory and logger
#===================================================================================================
### Set up the logger
# create logger
logger = logging.getLogger("parse_input_tree")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
if args.debug:
ch.setLevel(logging.DEBUG)
else:
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter_ch = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter_ch)
logger.addHandler(ch)
logger.debug(sys.argv)
#===================================================================================================
# Read input tree
#===================================================================================================
try:
t=Tree(TreeFile.name)
except Exception as exc:
logger.error(str(exc))
sys.exit(1)
if t:
features = []
for n in t.traverse("postorder"): # get all features:
features.extend(list(set(dir(n)) - set(dir(Tree()))))
features = list(set(features)) # list(set(*)) = remove duplicates
logger.info("No detected tag" if not features else "Detected tags: "+", ".join([f for f in features]))
#check rooted tree
nb_root_children = len(t.get_children())
if nb_root_children > 2:
logger.error("Unrooted tree")
sys.exit(1)
if "ND" in features:
logger.warning("\"ND\" is in the detected tags but it will be removed by the programm")
features.remove("ND")
if not "Condition" in features:
logger.error("\"Condition\" must be in the detected tags. \"Condition:1\" must identify nodes under the convergent model under Ha")
sys.exit(1)
if not "Transition" in features:
logger.error("\"Transition\" must be in the detected tags. \"Transition:1\" must identify nodes where there are transitions")
sys.exit(1)
if "Transition" in features:
logger.info("\"Transition\" is in the detected tags. \"Transition:1\" will be use to indicate OneChange model on this node")
# Check existing bl
branch_lengths = []
nodeId = 0
for node in t.traverse("postorder"):
node.add_features(ND=nodeId)
branch_lengths.append(node.dist)
nodeId = nodeId + 1
logger.debug("Branch length %s", branch_lengths)
if set(branch_lengths) == set([1.0]):
logger.warning("No branch length in %s, all branch length will be put to 1",tree_filename)
if not (t and branch_lengths):
sys.exit(1)
logger.info("Tree (%s) ok after checking", TreeFile.name)
#===================================================================================================
# Read input alignment
#===================================================================================================
try:
ali = AlignIO.read(AliFile.name, "fasta")
except Exception as exc:
logger.error(str(exc))
sys.exit(1)
leaves_names = [ l.name for l in t.get_leaves()]
seq_names = [ s.name for s in ali]
logger.debug("leaves names: %s",leaves_names )
logger.debug("sequences names: %s",seq_names )
seq_not_in_tree = set(seq_names) - set(leaves_names)
leaves_not_in_ali = set(leaves_names) - set(seq_names)
if len(set(seq_names)) != len(seq_names):
logger.error("There are duplicated sequence names")
sys.exit(1)
if len(set(leaves_names)) != len(leaves_names):
logger.error("There are duplicated leaf names")
sys.exit(1)
sp_present=[]
if seq_not_in_tree:
logger.error("Some sequences do not match with a leaf: %s",",".join(list(seq_not_in_tree)))
logger.error("All sequence names must match with a leaf")
sys.exit(1)
elif leaves_not_in_ali:
logger.warning("Some leaves do not match with a sequence: %s",",".join(list(leaves_not_in_ali)))
sp_present = seq_names
else:
logger.info("Sequence and leaf names match.")
#===================================================================================================
# Create output tree without lacking species
#===================================================================================================
trim_t = t.copy(method="deepcopy")
if sp_present:
trim_t.prune(sp_present)
sister_T = True
while sister_T:
sister_T=False
for n in trim_t.traverse("postorder"):
children = n.get_children()
if len(children) == 2:
if hasattr(children[0], "Transition" ) and hasattr(children[1], "Transition" ):
if children[0].Transition == "1" and children[1].Transition == "1":
n.Transition = "1"
n.Condition = "1"
children[0].Transition == "0"
children[1].Transition == "0"
sister_T = True
new_leaves_names = [ l.name for l in trim_t.get_leaves()]
logger.debug("new leaves names: %s",new_leaves_names )
logger.debug("sequences names: %s",seq_names )
seq_not_in_tree = set(seq_names) - set(new_leaves_names)
leaves_not_in_ali = set(new_leaves_names) - set(seq_names)
if leaves_not_in_ali:
logger.error("Some leaves do not match with a sequence: %s",",".join(list(leaves_not_in_ali)))
sys.exit(1)
else:
logger.info("Sequence and leaf names match.")
trim_t.write(format=1, features=["Condition","Transition"], outfile=args.output, format_root_node=True)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment