Commit 5b401815 authored by Carine Rey's avatar Carine Rey
Browse files

add a new workflow to parse input trees

parent fbf4e99b
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import argparse
import sys
import os
import logging
from ete3 import Tree, NodeStyle, TreeStyle, TextFace
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser = argparse.ArgumentParser(prog="parse_input_tree.py",
description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument('-t', "--tree", type=argparse.FileType('r'),
help='Input tree filename', required=True)
requiredOptions.add_argument('-o', '--output_dir', type=str,
help="Output directory name", required=True)
##############
### Option parsing
args = parser.parse_args()
TreeFile = args.tree
OutDirName = args.output_dir
#===================================================================================================
# Set up output directory and logger
#===================================================================================================
### Set up the logger
# create logger
logger = logging.getLogger("parse_input_tree")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
if args.debug:
ch.setLevel(logging.DEBUG)
else:
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter_ch = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter_ch)
logger.addHandler(ch)
logger.debug(sys.argv)
### Set up the output directory
if os.path.isdir(OutDirName):
pass
elif OutDirName: # if OutDirName is not a empty string we create the directory
os.makedirs(OutDirName)
logger.debug("mkdir %s", OutDirName)
#===================================================================================================
# Read input tree
#===================================================================================================
try:
t=Tree(TreeFile.name)
except Exception as exc:
logger.error(str(exc))
sys.exit(1)
if t:
features = []
for n in t.traverse("postorder"): # get all features:
features.extend(list(set(dir(n)) - set(dir(Tree()))))
features = list(set(features)) # list(set(*)) = remove duplicates
logger.info("No detected tag" if not features else "Detected tags: "+", ".join([f for f in features]))
if "ND" in features:
logger.warning("\"ND\" is in the detected tags but it will be removed by the programm")
features.remove("ND")
if not "Condition" in features:
logger.error("\"Condition\" must be in the detected tags. \"Condition:1\" must identify branches under the convergent model under Ha")
sys.exit(1)
# Check existing bl
branch_lengths = []
nodeId = 0
for node in t.traverse("postorder"):
node.add_features(ND=nodeId)
branch_lengths.append(node.dist)
nodeId = nodeId + 1
logger.debug("Branch length %s", branch_lengths)
if set(branch_lengths) == set([1.0]):
logger.warning("No branch length in %s, all branch length will be put to 1",tree_filename)
if not (t and branch_lengths):
sys.exit(1)
logger.info("Tree (%s) ok after checking", TreeFile.name)
#===================================================================================================
# Create output files
#===================================================================================================
######### output trees for simulation
### tree.only_node_ids.nhx: a tree with numbered nodes
t.write(format=1, features=["ND"], outfile = "%s/tree.only_node_ids.nhx" %(OutDirName),format_root_node=True)
### tree.H0.node_ids: null hypothesis
#### -> 1 line: all nodes ids
all_node_ids = range(nodeId-1)
with open("%s/tree.H0.node_ids" %(OutDirName), "w") as output_H0_node_ids:
output_H0_node_ids.write(",".join(map(str, all_node_ids)))
### tree.Ha.node_ids: alternative hypothesis
#### -> 2 lines: 1) node ids under the ancestral model
#### 2) node ids under the convergent model
with open("%s/tree.Ha.node_ids" %(OutDirName), "w") as output_Ha_node_ids:
conv_node_ids = [n.ND for n in t.search_nodes(Condition = "1")]
not_conv_node_ids = [i for i in all_node_ids if i not in conv_node_ids]
output_Ha_node_ids.write(",".join(map(str, conv_node_ids)))
output_Ha_node_ids.write("\n")
output_Ha_node_ids.write(",".join(map(str, not_conv_node_ids)))
######### output trees for detection
### tree.only_convergent_tags.nhx: a tree with only Condition and Transition tags
t.write(format=1, features=["Condition","Transition"], outfile = "%s/tree.only_convergent_tags.nhx" %(OutDirName))
### tree.diffsel: a diffsel input formated tree
for n in t.traverse():
if n.ND in conv_node_ids:
n.dist = 2
elif n.ND in not_conv_node_ids:
n.dist = 1
t.write(format=1, outfile = "%s/tree.diffsel" %(OutDirName))
...@@ -10,3 +10,10 @@ class type nucleotide_fasta = object ...@@ -10,3 +10,10 @@ class type nucleotide_fasta = object
inherit fasta inherit fasta
method alphabet : [`Nucleotide] method alphabet : [`Nucleotide]
end end
type output_parse_input_tree =
| Nodes_H0
| Nodes_Ha
| Tree4detect
| Tree4simu
| Tree_diffsel
open Core open Core
open Bistro_utils open Bistro_utils
open Bistro.EDSL open Bistro.EDSL
open File_formats
let parse_input_tree ~tree = workflow ~descr:"utils.parse_input_tree" [
(*let env = docker_image ~account:"carinerey" ~name:"ete3:3.0.0b35" () in*)
cmd "python" (*~env*) [
string "etc/utils/bin/parse_input_tree.py";
opt "-t" dep tree;
opt "-o" ident dest;
]
]
let select_out parsed_tree t = match t with
| Nodes_H0 -> parsed_tree / selector [ "tree.H0.node_ids" ]
| Nodes_Ha -> parsed_tree / selector [ "tree.Ha.node_ids" ]
| Tree4detect -> parsed_tree / selector [ "tree.only_convergent_tags.nhx" ]
| Tree4simu -> parsed_tree / selector [ "tree.only_node_ids.nhx" ]
| Tree_diffsel -> parsed_tree / selector [ "tree.diffsel" ]
let derive_from_tree ~tree_dir ~tree ~preview = let derive_from_tree ~tree_dir ~tree ~preview =
let tree = input (Filename.concat tree_dir tree) in let tree = input (Filename.concat tree_dir tree) in
let nb_sites = if preview then 10 else 100 in let nb_sites = if preview then 10 else 100 in
let fna = Bppsuite.bppseqgen ~tree ~nb_sites in let parsed_tree = parse_input_tree ~tree in
let fna = Bppsuite.bppseqgen ~nb_sites ~tree:(select_out parsed_tree Tree4simu) in
Repo.[ Repo.[
item ["simulated_sequences.fna"] fna ; item ["simulated_sequences.fna"] fna ;
] ]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment