Commit 1b01ac9a authored by Carine Rey's avatar Carine Rey
Browse files

add a final plot

parent 2fb2c7da
......@@ -22,7 +22,8 @@ type dataset_res = {
model_prefix : string ;
tree_prefix : string ;
res_by_tools: det_result list ;
merged_results : text_file workflow
merged_results : text_file workflow ;
plot_merged_results : svg workflow ;
}
let pcoc ?plot_complete ?gamma ~(faa:aminoacid_fasta workflow) ~(tree:_ workflow) : (*`pcoc TODO*) det_out directory workflow =
......@@ -71,8 +72,8 @@ let diffsel ~(phy_n:nucleotide_phylip workflow) ~(tree: _ workflow) ~(w_every:in
cmd "ls" [dep tree]; (* required dep to link the file in the env *)
(*python diffsel_analyze_result.py [-r /path/to/readdiffsel] [-o output_file] chainname *)
cmd "cat" ~stdout:package [ file_dump (string Scripts.diffsel_script_utils) ] ;
cmd "cat" ~stdout:script [ file_dump (string Scripts.diffsel_analyze_result) ] ;
cmd "cp" [ file_dump (string Scripts.diffsel_script_utils) ; package] ;
cmd "cp" [ file_dump (string Scripts.diffsel_analyze_result) ; script] ;
cmd "python" [
string "diffsel_analyze_result.py" ;
......@@ -111,3 +112,46 @@ let merge_results ~res_by_tools : text_file workflow =
]
let plot_merge_results ~res_by_tools ~tree ~faa ~tsv : svg workflow =
let plot_all_sites = true in
(*let env = docker_image ~account:"carinerey" ~name:"ete3" ~tag:"3.0.0b35" () in*)
let env = docker_image ~account:"carinerey" ~name:"pcoc" ~tag:"06212018" () in
(* use of pcoc env due to its working X server for dra plot with ete3 *)
let meths = List.map res_by_tools ~f:(fun res ->
let def_meth = res.det_meth in
let opt = match def_meth with
| Pcoc -> string "PCOC"
| Pcoc_gamma -> string "PCOC_gamma"
| Diffsel -> string "Diffsel"
in
opt
) |> seq ~sep:","
in
let package_diffsel_script_utils = tmp // "diffsel_script_utils.py" in
let package_plot_data = tmp // "plot_data.py" in
let script_plot_convergent_sites = tmp // "plot_convergent_sites.py" in
let out = dest // "results.svg" in
workflow ~descr:"convergence_detection.plot_results" [
docker env (
and_list [
mkdir_p tmp ;
mkdir_p dest ;
cd tmp ;
cmd "cp" [ file_dump (string Scripts.diffsel_script_utils) ; package_diffsel_script_utils ] ;
cmd "cp" [ file_dump (string Scripts.plot_data) ; package_plot_data] ;
cmd "cp" [ file_dump (string Scripts.plot_convergent_sites); script_plot_convergent_sites ] ;
cmd "python" [
string "plot_convergent_sites.py" ;
opt "-msa" dep faa ;
opt "-tsv" dep tsv ;
opt "-tree" dep tree ;
opt "-out" ident out ;
opt "-meth" ident meths ;
flag string "--all_sites" plot_all_sites ;
]
]
)
]
......@@ -22,7 +22,8 @@ type dataset_res = {
model_prefix : string ;
tree_prefix : string ;
res_by_tools: det_result list ;
merged_results : text_file workflow
merged_results : text_file workflow ;
plot_merged_results : svg workflow
}
val pcoc :
......@@ -43,3 +44,10 @@ val diffsel :
val merge_results :
res_by_tools : det_result list ->
text_file workflow
val plot_merge_results :
res_by_tools : det_result list ->
tree : _ workflow ->
faa : aminoacid_fasta workflow ->
tsv : text_file workflow ->
svg workflow
......@@ -93,28 +93,30 @@ let repo_of_dataset_results_l ~dataset_results_l =
List.map dataset_results_l ~f:(fun dataset_results ->
let det_results_l = dataset_results.res_by_tools in
let merged_results = dataset_results.merged_results in
let merged_results_item = Repo.shift dataset_results.tree_prefix (Repo.shift dataset_results.model_prefix [(Repo.item ["merged_results.tsv"] merged_results) ]) in
[ merged_results_item ;
List.map det_results_l ~f:(fun det_results ->
let model_prefix = det_results.dataset.model_prefix in
let tree_prefix = det_results.dataset.tree_prefix in
let det_meth = det_results.det_meth in
let det_meth_prefix = string_of_det_meth det_meth in
let w = det_results.det_result
let plot_merge_results = dataset_results.plot_merged_results in
let merged_results_item = Repo.item ["merged_results.tsv"] merged_results in
let plot_merged_results_item = Repo.item ["plot_merged_results"] plot_merge_results in
[ [ merged_results_item ; plot_merged_results_item ] ;
List.map det_results_l ~f:(fun det_results ->
let det_meth = det_results.det_meth in
let det_meth_prefix = string_of_det_meth det_meth in
let w = det_results.det_result
in
let repo_d = Repo.shift "Detection_tools" (Repo.shift det_meth_prefix (
Repo.[
match det_meth with
| Pcoc -> item ["pcoc.results.tsv"] w
| Pcoc_gamma -> item ["pcoc_gamma.results.tsv"] w
| Diffsel -> item ["diffsel.results.tsv"] w
]
))
in
let repo_d = Repo.shift "Detection_tools" (Repo.shift det_meth_prefix (
Repo.[
match det_meth with
| Pcoc -> item ["pcoc.results.tsv"] w
| Pcoc_gamma -> item ["pcoc_gamma.results.tsv"] w
| Diffsel -> item ["diffsel.results.tsv"] w
]
))
in
Repo.shift tree_prefix (Repo.shift model_prefix repo_d)
) |> List.concat
repo_d
) |> List.concat
] |> List.concat
|> Repo.shift dataset_results.model_prefix
|> Repo.shift dataset_results.tree_prefix
)
|> List.concat
......@@ -142,9 +144,13 @@ let derive_from_dataset ~dataset ~preview =
derive_from_det_meth ~det_meth ~dataset ~preview
) in
let merged_results = merge_results ~res_by_tools in
let tsv = merged_results in
let faa = dataset.ready_dataset.faa in
let tree = Tree_dataset.tree dataset.ready_dataset.tree_dataset `Detection in
let plot_merged_results = plot_merge_results ~res_by_tools ~tsv ~faa ~tree in
let model_prefix = dataset.model_prefix in
let tree_prefix = dataset.tree_prefix in
{model_prefix; tree_prefix; res_by_tools ; merged_results}
{model_prefix; tree_prefix; res_by_tools ; merged_results ; plot_merged_results}
let derive_det ~dataset_l ~profile_fn ~preview=
List.map dataset_l ~f:(fun dataset ->
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) and École Normale Supérieure de Lyon (2018)
# Contributors:
# - Vincent Lanore <vincent.lanore@gmail.com>
# - Carine Rey <carine.rey@ens-lyon.org>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
from ete3 import Tree, NodeStyle, TreeStyle, TextFace
from diffsel_script_utils import *
from plot_data import *
import pandas as pd
from Bio import AlignIO, SeqIO, Seq, SeqRecord
import sys
#===================================================================================================
STEP("Parsing command line arguments")
from argparse import ArgumentParser, FileType
parser = ArgumentParser(description='''Tool to plot convergent sites from a result table, a tree and a MSA.
ex:\n\n
python script/plot_convergent_sites.py -tsv example/tree4plot.tsv -msa example/tree4plot.fa -tree example/tree4plot_annotated.nw -out example/tree4plot.svg -meth Meth3,Meth1 -t Meth3:0.8,Meth1:70
''')
parser.add_argument('-tsv', metavar="table.tsv", type=FileType('r'),
help='the result table (tabular file, with a column named "Sites" and the others named by the name of the convergent detection tool)', required=True)
parser.add_argument('-msa', metavar="msa.fa", type=FileType('r'),
help='the msa file (fasta format)', required=True)
parser.add_argument('-tree', metavar="tree.nhx", type=FileType('r'),
help='the tree file (NHX format with the "Condition" tag)', required=True)
parser.add_argument('-out', metavar="output.svg", type=str,
help='the output file (svg format)', required=True)
parser.add_argument('-meth', dest="methods_to_be_plotted", type=str,
metavar="\"meth1,meth3\"", help="columns of the table file to use (default:all)",
default=None)
parser.add_argument('-t', dest="threshold_by_method", type=str,
metavar="\"meth1:0.85,meth3:70\"", help="threshold to filter site by method in the table file (default:0.99 or 99)",
default=None)
parser.add_argument('--all_sites', dest="plot_all_sites", action="store_true",
help="Plot all sites (default:False)",
default=False)
parser.add_argument('-title', dest="title", type=str,
metavar="\"title\"", help="Plot title (default:None)",
default=None)
args = parser.parse_args()
tsv_file = args.tsv
MESSAGE("Tabular file is "+param(tsv_file.name))
msa_file = args.msa
MESSAGE("MSA file is "+param(msa_file.name))
tree_file = args.tree
MESSAGE("Tree file is "+param(tree_file.name))
out_file = args.out
MESSAGE("Output file is "+param(out_file))
methods_to_be_plotted = args.methods_to_be_plotted
if methods_to_be_plotted:
methods_to_be_plotted = methods_to_be_plotted.split(",")
MESSAGE("Methods to be plotted: "+", ".join([param(meth) for meth in methods_to_be_plotted]))
threshold_by_method = args.threshold_by_method
dic_threshold_by_method = {}
if threshold_by_method:
dic_threshold_by_method = {mt.split(":")[0]:float(mt.split(":")[1]) for mt in threshold_by_method.split(",") }
MESSAGE("Threshold by method: "+", ".join([(param(key)+": "+param(value)) for key,value in dic_threshold_by_method.items()]))
#===================================================================================================
STEP("Reading input files:")
MESSAGE("Reading the result table (%s)"%param(tsv_file.name))
result_table = pd.read_table(tsv_file)
colnames = list(result_table)
if "Sites" in colnames:
SUBMESSAGE(success() + "\"Sites\" column is present")
colnames.remove("Sites")
else:
SUBMESSAGE(failure() + "\"Sites\" column is not present in the input table")
sys.exit(1)
SUBMESSAGE("Detected methods: " + ", ".join([data(m) for m in colnames]))
if methods_to_be_plotted:
methods_to_be_plotted = list(set(colnames) & set(methods_to_be_plotted))
SUBMESSAGE("Methods which will be used after filtering: " + ", ".join([param(m) for m in methods_to_be_plotted]))
else:
methods_to_be_plotted = colnames
SUBMESSAGE("All methods will be used")
if not methods_to_be_plotted:
SUBMESSAGE(failure() + "No method to be plot. Check your table file and your option.")
sys.exit(1)
MESSAGE("Computing thresholds to filter sites:")
methode_scales = {}
for meth in methods_to_be_plotted:
max_meth = max(result_table[meth])
if max_meth > 1:
methode_scales[meth] = 100
else:
methode_scales[meth] = 1
if meth in dic_threshold_by_method:
threshold = dic_threshold_by_method[meth]
elif max_meth > 1:
threshold = 99
dic_threshold_by_method[meth] = threshold
else:
threshold = 0.99
dic_threshold_by_method[meth] = threshold
nb_sites = sum(result_table[meth] > threshold)
SUBMESSAGE("%s: > %s (%s sites)" %(param(meth), data(threshold), data(nb_sites)))
MESSAGE("Reading the msa")
try:
alignment = AlignIO.read(msa_file, "fasta")
except Exception as exc:
SUBMESSAGE(failure() + str(exc))
sys.exit(1)
nb_seq=len(alignment)
nb_sites=len(alignment[0].seq)
SUBMESSAGE(success() + "There are %s sequences of %s sites" %(data(nb_seq), data(nb_sites)))
seq_names = [ s.name for s in alignment]
MESSAGE("Reading the tree")
try:
t=Tree(tree_file.name)
except Exception as exc:
SUBMESSAGE(failure() + str(exc))
sys.exit(1)
leaves_names = [ l.name for l in t.get_leaves()]
nb_leaves = len(leaves_names)
SUBMESSAGE(success() + "There are %s leaves" %(data(nb_leaves)))
MESSAGE("Check link between sequence names and leaf names")
seq_names_not_in_leaves = list(set(seq_names) - set(leaves_names))
leaf_names_not_in_seq = list(set(leaves_names) - set(seq_names))
if seq_names_not_in_leaves:
SUBMESSAGE(warning() + "These sequences are not in the tree:\n\t-%s" %("\n\t-".join(seq_names_not_in_leaves)))
if leaf_names_not_in_seq:
SUBMESSAGE(warning() + "These leaves are not in the alignment:\n\t-%s" %("\n\t-".join(leaf_names_not_in_seq)))
if set(leaves_names) != set(seq_names) or len(leaves_names) != len(seq_names):
if len(leaves_names) != len(seq_names):
SUBMESSAGE(failure() + "There are not the same number of leaves and sequences")
if set(leaves_names) != set(seq_names):
SUBMESSAGE(failure() + "Sequence names and leaf names are not identical.")
sys.exit(1)
else:
SUBMESSAGE(success() + "Sequence names and leaves match!")
MESSAGE("Detect existing tags in the tree")
features = []
for n in t.traverse("postorder"): # get all features:
features.extend(list(set(dir(n)) - set(dir(Tree()))))
features = list(set(features)) # list(set(*)) = remove duplicates
if "Condition" in features:
SUBMESSAGE(success() + "Tag \"Condition\" detected")
else:
SUBMESSAGE(warning() + "Tag \"Condition\" not detected")
if "Transition" in features:
SUBMESSAGE(success() + "Tag \"Transition\" detected")
#===================================================================================================
STEP("Prepare plot")
def unlist(list2d):
return [item for sublist in list2d for item in sublist]
def filter_l(l, pos):
new_l = [None]*len(pos)
for i in range(len(pos)):
p = pos[i]
new_l[i] = l[p-1]
return new_l
MESSAGE("Filter alignment")
bilan_f = {}
all_pos = range(1, nb_sites +1)
dict_pos_filtered = {}
for meth in methods_to_be_plotted:
dict_pos_filtered[meth] = result_table["Sites"][result_table[meth]>dic_threshold_by_method[meth]].tolist()
all_filtered_position = list(set(unlist(dict_pos_filtered.values())))
all_filtered_position.sort()
if args.plot_all_sites:
dict_pos_filtered["union"] = all_pos
else:
dict_pos_filtered["union"] = all_filtered_position
# filtered ali:
filtered_ali_filename = out_file + ".filtered_ali.faa"
for meth in ["union"]: #methods_to_be_plotted + ["union"]:
filtered_ali = []
for seq in alignment:
new_seq = SeqRecord.SeqRecord(Seq.Seq("".join(filter_l(list(seq.seq),dict_pos_filtered[meth]))), seq.id, "", "")
filtered_ali.append(new_seq)
SeqIO.write(filtered_ali, filtered_ali_filename, "fasta")
if meth == "union":
methstr = "union"
else:
methstr = meth
SUBMESSAGE("Conserved sites: %s" %(", ".join(map(data, dict_pos_filtered ["union"]))))
#===================================================================================================
STEP("Draw plot")
MESSAGE("Writing plot to " + data(out_file))
positions_to_highlight = None
# all model
if dict_pos_filtered["union"]:
dict_values_pcoc_filtered_model = {}
for meth in methods_to_be_plotted:
dict_values_pcoc_filtered_model[meth] = result_table[meth][result_table["Sites"].isin(dict_pos_filtered["union"])].tolist()
meth = "union"
make_tree_ali_detect_combi(tree_file.name, filtered_ali_filename, out_file,
dict_benchmark = dict_values_pcoc_filtered_model,
x_values= dict_pos_filtered[meth], hp=positions_to_highlight,
methode_scales = methode_scales, methode_thresholds=dic_threshold_by_method,
title = args.title)
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment