Commit 8ee06d5a authored by Carine Rey's avatar Carine Rey
Browse files

add the "identical" method

parent b7c70b84
......@@ -10,6 +10,8 @@ type result = [
| `Pcoc_gamma of [`pcoc] directory workflow
| `Diffsel of [`diffsel] directory workflow
| `Diffsel_bis of [`diffsel] directory workflow
| `Identical_LG of [`identical] directory workflow
| `Identical_WAG of [`identical] directory workflow
]
let meth_string_of_result = function
......@@ -17,6 +19,8 @@ let meth_string_of_result = function
| `Pcoc_gamma _ -> "pcoc_gamma"
| `Diffsel _ -> "diffsel"
| `Diffsel_bis _ -> "diffsel_bis"
| `Identical_LG _ -> "identical_LG"
| `Identical_WAG _ -> "identical_WAG"
type dataset_res = {
model_prefix : string ;
......@@ -30,16 +34,20 @@ let merge_results ~res_by_tools : text_file workflow =
let env = docker_image ~account:"carinerey" ~name:"ete3" ~tag:"3.0.0b35" () in
let command = List.map res_by_tools ~f:(fun res ->
let w = match res with
| `Pcoc d
| `Pcoc d -> Pcoc.results d
| `Pcoc_gamma d -> Pcoc.results d
| `Diffsel d -> Diffsel.selector d
| `Diffsel_bis d -> Diffsel.selector d
| `Identical_LG d -> Identical.results d
| `Identical_WAG d -> Identical.results d
in
let opt = match res with
| `Pcoc _ -> string "--pcoc"
| `Pcoc_gamma _ -> string "--pcoc_gamma"
| `Diffsel _ -> string "--diffsel"
| `Diffsel_bis _ -> string "--diffsel_bis"
| `Identical_LG _ -> string "--identical_LG"
| `Identical_WAG _ -> string "--identical_WAG"
in
seq ~sep:" " [opt; dep w]
)
......@@ -53,7 +61,7 @@ let merge_results ~res_by_tools : text_file workflow =
]
let plot_merge_results ~res_by_tools ~tree ~faa ~tsv : svg workflow =
let plot_merge_results ~(res_by_tools:result list) ~tree ~faa ~tsv : svg workflow =
let plot_all_sites = true in
(*let env = docker_image ~account:"carinerey" ~name:"ete3" ~tag:"3.0.0b35" () in*)
let env = docker_image ~account:"carinerey" ~name:"pcoc" ~tag:"06212018" () in
......@@ -64,6 +72,8 @@ let plot_merge_results ~res_by_tools ~tree ~faa ~tsv : svg workflow =
| `Pcoc_gamma _ -> "PCOC_gamma"
| `Diffsel _ -> "Diffsel"
| `Diffsel_bis _ -> "Diffsel_bis"
| `Identical_LG _ -> "Identical_LG08"
| `Identical_WAG _ -> "Identical_WAG01"
in
string opt
) |> seq ~sep:","
......@@ -74,6 +84,8 @@ let plot_merge_results ~res_by_tools ~tree ~faa ~tsv : svg workflow =
| `Pcoc_gamma _ -> "PCOC_gamma:0.99"
| `Diffsel _ -> "Diffsel:0.9"
| `Diffsel_bis _ -> "Diffsel_bis:0.9"
| `Identical_LG _ -> "Identical_LG08:0.9"
| `Identical_WAG _ -> "Identical_WAG01:0.9"
in
string opt
) |> seq ~sep:","
......
......@@ -11,6 +11,8 @@ type result = [
| `Pcoc_gamma of [`pcoc] directory workflow
| `Diffsel of [`diffsel] directory workflow
| `Diffsel_bis of [`diffsel] directory workflow
| `Identical_LG of [`identical] directory workflow
| `Identical_WAG of [`identical] directory workflow
]
val meth_string_of_result : result -> string
......
open Core
open Bistro.Std
open Bistro.EDSL
open Bistro_bioinfo.Std
open File_formats
(* [assign key value] renders one "key=value" line of a Bio++ (bpp) option
   file, where [value] is an already-built EDSL fragment. *)
let assign key value =
  seq ~sep:"=" [ string key ; value ]
(* [conf_file_bppml ~tree ~faa ~out ~config] renders the text of a bppml
   option file: fixed options for a user-supplied Nhx tree and a protein
   alignment, followed by the caller-provided extra [config] lines.
   "$(OUT)" is expanded by bppml itself from the "OUT" variable. *)
let conf_file_bppml ~tree ~faa ~out ~config =
  let base_options = [
    assign "OUT" out ;
    assign "input.sequence.file" (dep faa) ;
    assign "alphabet" (string "Protein") ;
    assign "input.tree.file" (dep tree) ;
    assign "init.tree" (string "user") ;
    assign "input.tree.format" (string "Nhx") ;
    assign "optimization.topology" (string "false") ;
    assign "output.tree.file" (string "$(OUT)/tree.nhx") ;
    assign "output.tree.format" (string "Nhx") ;
    assign "output.infos" (string "$(OUT)/infos.tsv") ;
    assign "output.estimates" (string "$(OUT)/estimates.tsv") ;
  ] in
  seq ~sep:"\n" (base_options @ config)
(* [bppml ?descr ~faa ~tree ~config] builds a workflow running bppml
   (Bio++ maximum-likelihood estimation) inside a Docker container on the
   given protein alignment [faa] and Nhx tree [tree]. [config] holds extra
   option lines (e.g. the substitution model) appended to the generated
   configuration. The result directory contains the config file plus the
   bppml outputs declared in [conf_file_bppml]. *)
let bppml ?(descr="") ~faa ~tree ~config : _ workflow =
(* Docker image providing the Bio++ suite binaries. *)
let env = docker_image ~account:"carinerey" ~name:"bppsuite" ~tag:"06182018" () in
(* The generated option file is materialized inside the destination dir. *)
let config_f = dest // "config_bppml.bpp" in
(* "OUT" config variable: points bppml's outputs at the destination dir. *)
let out = ident dest in
workflow ~descr:("bppsuite.bppml" ^ descr) [
docker env (
and_list [
mkdir_p dest;
(* Dump the rendered configuration to config_f via `cat`'s stdout. *)
cmd "cat" ~stdout:config_f [(file_dump (conf_file_bppml ~tree ~faa ~out ~config ))];
(* bppml is driven entirely by "param=<file>". *)
cmd "bppml" [
assign "param" config_f;
]
]
)
]
(* [conf_file_bppancestor ~tree ~faa ~out ~config] renders the text of a
   bppancestor option file: ancestral sequence reconstruction on a
   user-supplied Nhx tree and protein alignment, with per-site state
   probabilities written to "$(OUT)/sites.tsv". The caller-provided
   [config] lines are appended last. *)
let conf_file_bppancestor ~tree ~faa ~out ~config =
  let base_options = [
    assign "OUT" out ;
    assign "input.sequence.file" (dep faa) ;
    assign "alphabet" (string "Protein") ;
    assign "input.tree.file" (dep tree) ;
    assign "init.tree" (string "user") ;
    assign "input.tree.format" (string "Nhx") ;
    assign "optimization.topology" (string "false") ;
    assign "output.sequence.file" (string "$(OUT)/output_anc.fa") ;
    assign "asr.add_extant" (string "true") ;
    assign "asr.probabilities" (string "true") ;
    assign "output.sites.file" (string "$(OUT)/sites.tsv") ;
    assign "output.nodes.file" (string "$(OUT)/nodes.tsv") ;
  ] in
  seq ~sep:"\n" (base_options @ config)
(* [bppancestor ?descr ~faa ~tree ~config] builds a workflow running
   bppancestor (Bio++ ancestral state reconstruction) inside a Docker
   container on the protein alignment [faa] and Nhx tree [tree]. [config]
   holds extra option lines appended to the generated configuration.
   Outputs (ancestral fasta, per-site probabilities, per-node table) land
   in the destination directory, as declared in [conf_file_bppancestor]. *)
let bppancestor ?(descr="") ~faa ~tree ~config : _ workflow =
(* Same Bio++ suite image as [bppml]. *)
let env = docker_image ~account:"carinerey" ~name:"bppsuite" ~tag:"06182018" () in
let config_f = dest // "config_bppancestor.bpp" in
(* "OUT" config variable: points outputs at the destination dir. *)
let out = ident dest in
workflow ~descr:("bppsuite.bppancestor" ^ descr) [
docker env (
and_list [
mkdir_p dest;
(* Dump the rendered configuration to config_f via `cat`'s stdout. *)
cmd "cat" ~stdout:config_f [(file_dump (conf_file_bppancestor ~tree ~faa ~out ~config))];
(* bppancestor is driven entirely by "param=<file>". *)
cmd "bppancestor" [
assign "param" config_f;
]
]
)
]
(* [identical ~tree_id ~tree_sc ~faa ~prot_model] builds the "identical"
   convergence-detection workflow: bppancestor reconstructs ancestral
   states on [tree_id] under [prot_model] (e.g. "LG08" or "WAG01"), then
   the calc_identical Python script scores, per site, identical convergent
   substitutions on the transition branches annotated in [tree_sc].
   Results: out1.tsv (per-site summary, see [results]) and out2.tsv
   (detailed table). *)
let identical ~(tree_id:_ workflow) ~(tree_sc:_ workflow) ~(faa:aminoacid_fasta workflow) ~prot_model : [`identical] directory workflow =
  (* Single bpp option shared by both runs: the substitution model. *)
  let config = [assign "model" (string prot_model)] in
  let out1 = dest // "out1.tsv" in  (* per-site summary table *)
  let out2 = dest // "out2.tsv" in  (* detailed per-transition table *)
  (* NOTE(review): the bppml run was built but never consumed (the script
     only reads bppancestor's sites.tsv), so the workflow never executed;
     the underscore silences the unused-value warning. TODO confirm whether
     bppml estimates were meant to feed bppancestor. *)
  let _run_bppml = bppml ~descr:"" ~tree:tree_id ~config ~faa in
  let run_bppancestor = bppancestor ~descr:"" ~tree:tree_id ~faa ~config in
  (* Per-site ancestral-state probabilities produced by bppancestor. *)
  let proba = run_bppancestor / selector ["sites.tsv"] in
  let env = docker_image ~account:"carinerey" ~name:"pcoc" ~tag:"07022018" () in
  workflow ~descr:("identical."^prot_model) [
    mkdir_p dest ;  (* mkdir_p for consistency with bppml/bppancestor *)
    cmd "python" ~env [
      file_dump (string Scripts.calc_identical) ;
      opt "-t" dep tree_sc;
      opt "-a" dep faa;
      opt "-p" dep proba;
      opt "-o" ident out1 ;
      opt "-o2" ident out2 ;
    ]
  ]
(* [results run_identical] selects the per-site summary table (out1.tsv)
   out of an [identical] run's output directory. *)
let results run_identical : text_file workflow =
run_identical / selector ["out1.tsv"]
......@@ -49,6 +49,8 @@ let repo_of_detection_result res =
| `Pcoc_gamma w -> item ["pcoc_gamma.results.tsv"] (Pcoc.results w)
| `Diffsel w -> item ["diffsel.results.tsv"] (Diffsel.selector w)
| `Diffsel_bis w -> item ["diffsel_bis.results.tsv"] (Diffsel.selector w)
| `Identical_LG w -> item ["Identical_LG.results.tsv"] (Identical.results w)
| `Identical_WAG w -> item ["Identical_WAG.results.tsv"] (Identical.results w)
) ;
(
match res with
......@@ -56,6 +58,8 @@ let repo_of_detection_result res =
| `Pcoc_gamma w -> item ["raw_results"] w
| `Diffsel w -> item ["raw_results"] w
| `Diffsel_bis w -> item ["raw_results"] w
| `Identical_LG w -> item ["raw_results"] w
| `Identical_WAG w -> item ["raw_results"] w
) ;
]
|> Repo.shift det_meth_prefix
......@@ -83,21 +87,26 @@ let derive_from_det_meth ~det_meth ~(dataset : Dataset.t) ~preview =
let faa = dataset.dataset.faa in
let fna = dataset.dataset.fna in
let phy_n = Bppsuite.fa2phy ~fna in
let pcoc_tree = Tree_dataset.tree dataset.dataset.tree_dataset `Detection in
let tree_sc = Tree_dataset.tree dataset.dataset.tree_dataset `Detection in
let tree_id = Tree_dataset.tree dataset.dataset.tree_dataset `Simulation in
let diffsel_tree = Tree_dataset.diffsel_tree dataset.dataset.tree_dataset in
let w_every = if preview then 1 else 1 in
let n_cycles = if preview then 10 else 2000 in
match det_meth with
| `Pcoc -> `Pcoc (Pcoc.pcoc ~plot_complete:true ~gamma:false ~faa ~tree:pcoc_tree)
| `Pcoc_gamma -> `Pcoc_gamma (Pcoc.pcoc ~plot_complete: true ~gamma:true ~faa ~tree:pcoc_tree)
| `Pcoc -> `Pcoc (Pcoc.pcoc ~plot_complete:true ~gamma:false ~faa ~tree:tree_sc)
| `Pcoc_gamma -> `Pcoc_gamma (Pcoc.pcoc ~plot_complete: true ~gamma:true ~faa ~tree:tree_sc)
| `Diffsel -> `Diffsel (Diffsel.diffsel ~phy_n ~tree:diffsel_tree ~w_every ~n_cycles)
| `Diffsel_bis -> `Diffsel_bis (Diffsel.diffsel ~phy_n ~tree:diffsel_tree ~w_every ~n_cycles)
| `Identical_LG -> `Identical_LG (Identical.identical ~faa ~tree_id ~tree_sc ~prot_model:"LG08")
| `Identical_WAG -> `Identical_WAG (Identical.identical ~faa ~tree_id ~tree_sc ~prot_model:"WAG01")
let derive_from_dataset ~dataset ~preview =
let det_meths = [`Pcoc;`Pcoc_gamma;`Diffsel;`Diffsel_bis] in
let det_meths = [`Pcoc;`Pcoc_gamma;
`Diffsel;`Diffsel_bis;
`Identical_LG;`Identical_WAG] in
let res_by_tools = List.map det_meths ~f:(fun det_meth ->
derive_from_det_meth ~det_meth ~dataset ~preview
) in
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import argparse
import sys
import os, re
import logging
import pandas as pd
from ete3 import Tree, NodeStyle, TreeStyle, TextFace
from Bio import AlignIO
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
# Command-line interface of calc_identical.
# NOTE(review): prog name below looks copy-pasted from parse_input_tree.py — confirm.
parser = argparse.ArgumentParser(prog="parse_input_tree.py",
description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
# --debug raises the console log level to DEBUG (see logger setup below).
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
# -t: annotated tree (Nhx with Condition/Transition tags, checked later).
requiredOptions.add_argument('-t', "--tree", type=argparse.FileType('r'),
help='Tree filename', required=True)
# -a: protein alignment in fasta format.
requiredOptions.add_argument('-a', "--ali", type=argparse.FileType('r'),
help='Alignment filename', required=True)
# -p: bppancestor per-site state probability table (TSV).
requiredOptions.add_argument('-p', "--proba", type=argparse.FileType('r'),
help='AA probability per site filename', required=True)
# -o: per-site summary output; -o2: optional detailed output.
requiredOptions.add_argument('-o', '--output', type=str,
help="Output name", required=True)
requiredOptions.add_argument('-o2', '--output2', type=str,
help="Output name (bilan)", required=False)
##############
### Option parsing
args = parser.parse_args()
TreeFile = args.tree
AliFile = args.ali
ProbaFile = args.proba
OutFile = args.output
OutFile2 = args.output2
#===================================================================================================
# Set up output directory and logger
#===================================================================================================
### Set up the logger
# Console logger: DEBUG when --debug was given, INFO otherwise.
# (Indentation of the if/else was lost in this copy and is restored here.)
logger = logging.getLogger("calc_identical")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
if args.debug:
    ch.setLevel(logging.DEBUG)
else:
    ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter_ch = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter_ch)
logger.addHandler(ch)
# Record the full command line for reproducibility.
logger.debug(sys.argv)
#===================================================================================================
# Read input tree
#===================================================================================================
# Parse the annotated tree; abort with exit code 1 on any parsing error.
# (Block indentation was lost in this copy and is restored here; the whole
# check/annotation section is assumed to sit under `if t:` — TODO confirm.)
try:
    t = Tree(TreeFile.name)
except Exception as exc:
    logger.error(str(exc))
    sys.exit(1)

if t:
    # Collect every custom tag (feature) attached to the nodes, i.e. any
    # attribute not present on a bare Tree() node.
    features = []
    for n in t.traverse("postorder"):  # get all features
        features.extend(list(set(dir(n)) - set(dir(Tree()))))
    features = list(set(features))  # list(set(*)) = remove duplicates
    logger.info("No detected tag" if not features else "Detected tags: "+", ".join([f for f in features]))

    # Any pre-existing "ND" tag is discarded: ids are re-assigned below.
    if "ND" in features:
        logger.warning("\"ND\" is in the detected tags but it will be removed by the programm")
        features.remove("ND")

    # add ND: number every node 0..nodeId-1 in postorder.
    nodeId = 0
    for node in t.traverse("postorder"):
        node.add_features(ND=nodeId)
        nodeId = nodeId + 1

    # Both tags are mandatory for convergence detection.
    if not "Condition" in features:
        logger.error("\"Condition\" must be in the detected tags. \"Condition:1\" must identify nodes under the convergent model under Ha")
        sys.exit(1)
    if not "Transition" in features:
        logger.error("\"Transition\" must be in the detected tags. \"Transition:1\" must identify nodes where there are transitions")
        sys.exit(1)

    # All assigned ids are 0..nodeId-1, so range(nodeId) covers them all.
    # (The original had range(nodeId-1), an off-by-one; the variable is
    # currently unused downstream.)
    all_node_ids = range(nodeId)
    # Transition branches: the tagged nodes, their parents, and the tagged leaves.
    trans_node_ids = [n.ND for n in t.search_nodes(Transition = "1")]
    anc_trans_node_ids = [n.up.ND for n in t.search_nodes(Transition = "1")]
    leaves_trans_node_ids = [n for n in t.search_nodes(Transition = "1") if n.is_leaf()]
    logger.info("Trans nodes: %s",trans_node_ids)
    logger.info("anc_trans_node_ids: %s",anc_trans_node_ids)
    logger.info("leaves_trans_node_ids nodes: %s",[n.ND for n in leaves_trans_node_ids])
    logger.info("Tree (%s) ok after checking", TreeFile.name)
#===================================================================================================
# Read input alignment
#===================================================================================================
# Read the fasta alignment and index sequences by their id; abort on error.
# (Block indentation was lost in this copy and is restored here.)
try:
    ali = AlignIO.read(AliFile, "fasta")
    ali_d = {}
    for s in ali:
        ali_d[s.id] = s
except Exception as exc:
    logger.error(str(exc))
    sys.exit(1)
logger.info("Ali (%s) ok after checking", AliFile.name)

# Observed residues at transition leaves count with probability 1.0,
# mirroring the (Sites, node, prob, AA) layout of the proba table below.
df_leaves_l = []
n_sites = ali.get_alignment_length()
Sites = [i + 1 for i in range(n_sites)]  # 1-based site numbering
prob = 1.0
# NOTE(review): assumes at least one transition leaf; pd.concat([]) raises otherwise.
for n in leaves_trans_node_ids:
    seq = ali_d[n.name].seq
    node = n.ND
    AA = list(seq)
    df_leaves_l.append(pd.DataFrame({'Sites': Sites,
                                     'node' : node,
                                     'prob' : prob,
                                     'AA' : AA})
                       )
df_leaves = pd.concat(df_leaves_l)
#===================================================================================================
# Read proba per site table
#===================================================================================================
# Read bppancestor's per-site ancestral-state probability table (TSV);
# abort on error. (try/except indentation was lost in this copy; restored.)
try:
    proba_df = pd.read_csv(ProbaFile.name, sep = "\t")
except Exception as exc:
    logger.error(str(exc))
    sys.exit(1)
# Drop the "max.*" summary columns, keep one "prob.<node>.<AA>" column per state.
proba_df.drop(labels=[c for c in proba_df.columns if re.match("max.", c)], axis=1, inplace=True)
# Melt to long format: one row per (site, node, amino acid) probability.
proba_df_melt = pd.melt(proba_df, id_vars=['Sites', 'is.complete', 'is.constant', 'lnL', 'rc', 'pr'],
                        #value_vars = [c for c in proba_df.columns if re.match("prob", c)],
                        var_name ="node",
                        value_name = "prob")
# Split "prob.<node>.<AA>" into node id and amino acid.
# NOTE(review): these str.replace patterns are regexes ('.' is a wildcard,
# no anchoring) and rely on the old pandas default regex=True — fragile;
# confirm against the pandas version pinned in the docker image.
proba_df_melt["node"] = proba_df_melt["node"].str.replace('prob.','')
proba_df_melt["AA"] = proba_df_melt["node"]
proba_df_melt["node"] = proba_df_melt["node"].str.replace('.[A-Z]','')
proba_df_melt["AA"] = proba_df_melt["AA"].str.replace('[0-9]*\\.','')
# Keep only transition nodes and their parents.
proba_df_melt = proba_df_melt[proba_df_melt["node"].isin(map(str,trans_node_ids+anc_trans_node_ids))]
proba_df_melt["node"] = pd.to_numeric(proba_df_melt["node"])
# Sites come bracketed (e.g. "[12]"); strip brackets before converting.
proba_df_melt["Sites"] = pd.to_numeric(proba_df_melt["Sites"].str.replace('[','').str.replace(']',''))
# filter col names
proba_df_melt = proba_df_melt[["Sites","node","prob","AA"]]
#get only the best aa per site
proba_df_melt_filtered = proba_df_melt.groupby(['Sites', 'node']).apply(lambda x: x.loc[x.prob.idxmax(),["prob", "AA"]]).reset_index()
logger.info("Proba file (%s) ok after checking", ProbaFile.name)
# Combine reconstructed internal states with observed leaf states.
df_prob_ok = pd.concat([proba_df_melt_filtered, df_leaves])
df_prob_ok.sort_values(by='Sites', inplace=True)
df_prob_ok.reset_index(drop=True,inplace=True)
#===================================================================================================
# Detect convergent events
#===================================================================================================
# For each transition branch (parent -> tagged child), join parent and child
# best states per site; a substitution is the pair (AA_up -> AA_child) with
# joint probability prob_up * prob_child.
# (Loop/if indentation was lost in this copy and is restored here.)
df_T_l = []
T_id = 1  # NOTE(review): unused — confirm it can be dropped.
for n_T in t.search_nodes(Transition = "1"):
    n_T_up_ND = n_T.up.ND
    n_T_ND = n_T.ND
    logger.debug("%i -> %i", n_T_up_ND, n_T_ND)
    df_up = df_prob_ok[df_prob_ok["node"] == n_T_up_ND]
    df_child = df_prob_ok[df_prob_ok["node"] == n_T_ND]
    df_up_child = pd.merge(df_up, df_child, on = "Sites", suffixes=('_up', '_child'))
    df_up_child["T_AA"] = df_up_child["AA_up"] + "->" + df_up_child["AA_child"]
    df_up_child["prob"] = df_up_child["prob_up"] * df_up_child["prob_child"]
    df_up_child["T_node"] = df_up_child["node_up"].astype(str) + "->" + df_up_child["node_child"].astype(str)
    df_up_child = df_up_child[["Sites","T_node","T_AA","prob","AA_up","AA_child"]]
    logger.debug(df_up_child)
    df_T_l.append(df_up_child)
df_T = pd.concat(df_T_l)
df_T.sort_values(by='Sites', inplace=True)
df_T.reset_index(drop=True,inplace=True)
# Keep real substitutions (state change) with joint probability > 0.5.
df_T_conv = df_T[df_T["AA_up"] != df_T["AA_child"]]
df_T_conv_filtered = df_T_conv[df_T_conv["prob"] > 0.5]
logger.info(df_T_conv)
# Count, per site, how many transitions end in each amino acid.
df_T_conv_filtered = df_T_conv_filtered.groupby(['Sites']).apply(lambda x: x.AA_child.value_counts()).reset_index() #FRAGILE
logger.info(df_T_conv_filtered)
# NOTE(review): `> 1` treats a single remaining row like the empty case —
# confirm `> 0` was not intended.
if df_T_conv_filtered.shape[0] > 1 :
    df_T_conv_filtered.rename(index=str, columns={"level_1": "AA", "AA_child": "Freq"}, inplace = True)
    # Per site, keep the most frequent convergent amino acid.
    df_T_conv_filtered_max_AA = df_T_conv_filtered.groupby(['Sites']).apply(lambda x: x.loc[x.Freq.idxmax(),["Freq"]]).reset_index()
    Sites_NA = [s for s in Sites if s not in set(df_T_conv_filtered.Sites)]
else:
    Sites_NA = Sites
    df_T_conv_filtered_max_AA = pd.DataFrame()
# Pad sites without convergent events with Freq = 0.
df_empty_data = pd.DataFrame({'Sites': Sites_NA,
                              'Freq' : 0})
df_final = pd.concat([ df for df in [df_empty_data,df_T_conv_filtered_max_AA] if not df.empty])
df_final.sort_values(by='Sites', inplace=True)
nb_T = len(trans_node_ids)
# A single identical substitution is not convergence: zero out Freq <= 1.
# (.loc replaces the original chained assignment, which pandas may not
# write through and warns about.)
df_final.loc[df_final["Freq"] <= 1, "Freq"] = 0
# Score = fraction of transition branches sharing the same substitution.
df_final["Identical"] = df_final["Freq"] / nb_T
df_final = df_final[["Sites","Identical"]]
#===================================================================================================
# Create output files
#===================================================================================================
# Sanity check: exactly one score row per alignment site.
# (if-block indentation was lost in this copy and is restored here.)
if df_final.shape[0] != n_sites:
    logger.error("Format error!")
    sys.exit(42)
# Per-site summary (Sites, Identical) — consumed by the merge/plot scripts.
df_final.to_csv(OutFile, sep = "\t", index = False)
logger.info("Write summary output in %s", OutFile)
# Optional detailed per-transition table.
if OutFile2:
    df_T.to_csv(OutFile2, sep = "\t", index = False)
    logger.info("Write detailed output in %s", OutFile2)
......@@ -58,6 +58,10 @@ availableOptions.add_argument('--diffsel', type=str,
help="Diffsel output name", default = None)
availableOptions.add_argument('--diffsel_bis', type=str,
help="Diffsel_bis output name", default = None)
availableOptions.add_argument('--identical_LG', type=str,
help="Identical_LG output name", default = None)
availableOptions.add_argument('--identical_WAG', type=str,
help="Identical_WAG output name", default = None)
##############
......@@ -69,6 +73,8 @@ df_pcoc = pd.DataFrame()
df_pcoc_gamma = pd.DataFrame()
df_diffsel = pd.DataFrame()
df_diffsel_bis = pd.DataFrame()
df_identical_LG = pd.DataFrame()
df_identical_WAG = pd.DataFrame()
OutName = args.output
if args.pcoc :
......@@ -86,11 +92,26 @@ if args.diffsel_bis :
df_diffsel_bis.rename(columns={'Diffsel': 'Diffsel_bis'}, inplace=True)
if args.pcoc_gamma :
if args.diffsel :
df_diffsel = pd.read_csv(args.diffsel, sep="\t")
if args.identical_LG :
df_identical_LG = pd.read_csv(args.identical_LG, sep="\t")
df_identical_LG.rename(columns={'Identical': 'Identical_LG08'}, inplace=True)
if args.identical_WAG :
df_identical_WAG = pd.read_csv(args.identical_WAG, sep="\t")
df_identical_WAG.rename(columns={'Identical': 'Identical_WAG01'}, inplace=True)
df_list = [df for df in [df_pcoc, df_pcoc_gamma,
df_diffsel, df_diffsel_bis,
df_identical_LG, df_identical_WAG] if not df.empty ]
df_list = [df for df in [df_pcoc, df_pcoc_gamma, df_diffsel, df_diffsel_bis] if not df.empty ]
df_list_len = [df.shape[0] for df in df_list]
if len(set(df_list_len)) != 1:
print("ERROR: all files have not the same number of rows")
sys.exit(1)
df_final = reduce(lambda x, y: pd.merge(x, y, on = 'Sites', how='outer'), df_list)
......
......@@ -81,7 +81,7 @@ MESSAGE("Output file is "+param(out_file))
methods_to_be_plotted = args.methods_to_be_plotted
if methods_to_be_plotted:
methods_to_be_plotted = methods_to_be_plotted.split(",")
methods_to_be_plotted = [m for m in methods_to_be_plotted.split(",") if m]
MESSAGE("Methods to be plotted: "+", ".join([param(meth) for meth in methods_to_be_plotted]))
threshold_by_method = args.threshold_by_method
dic_threshold_by_method = {}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment