Commit 39680754 authored by Carine Rey

prep dn ds diversity analyses

parent 399325fe
@@ -41,6 +41,12 @@ type simu_infos = {
tree_prefix: string ;
}
type dn_ds_tsv = {
dn_ds: Testnh.dn_ds_t;
model_prefix: string ;
tree_prefix: string ;
}
type reinfered_trees = {
reinfered_tree_nt : nw workflow ;
reinfered_tree_aa : nw workflow ;
@@ -54,6 +60,7 @@ type post_analyses_simu = {
simu_infos_plot : text_file workflow ;
trees_plot : text_file workflow ;
bl_trees_tsv : text_file workflow ;
dn_ds_tsv_l : dn_ds_tsv list;
}
let is_hyp ~hyp (dataset_results :dataset_res) =
@@ -420,10 +427,20 @@ let post_analyses_simu_of_simu_dataset_l ~simu_dataset_l =
model_prefix = dataset.model_prefix
}
) in
let dn_ds_tsv_l = List.map simu_dataset_l ~f:(fun simu_dataset ->
let rd = simu_dataset.dataset in
let tree_prefix = simu_dataset.tree_prefix in
let model_prefix = simu_dataset.model_prefix in
let tree = rd.input_tree in
let fna = rd.fna in
let faa = rd.faa in
let dn_ds = Testnh.calc_dn_ds ~faa ~tree ~fna in
{tree_prefix; model_prefix; dn_ds}
) in
let simu_infos_plot = group_simu_infos ~simu_infos_l / selector ["out.pdf"] in
let trees_plot = plot_trees ~reinfered_tree_l / selector ["out.pdf"] in
let bl_trees_tsv = plot_trees ~reinfered_tree_l / selector ["out.tsv"] in
-  {simu_infos_l; simu_infos_plot; trees_plot; bl_trees_tsv}
+  {simu_infos_l; simu_infos_plot; trees_plot; bl_trees_tsv; dn_ds_tsv_l}
@@ -499,6 +516,15 @@ let repo_of_post_analyses_simu ~post_analyses_simu =
|> Repo.shift "tsv"
) |> List.concat
);
(List.map post_analyses_simu.dn_ds_tsv_l ~f:(fun dn_ds_tsv ->
Repo.[
item [dn_ds_tsv.tree_prefix ^ "@" ^ dn_ds_tsv.model_prefix ^ "@dn.nw"] dn_ds_tsv.dn_ds.dn_tree;
item [dn_ds_tsv.tree_prefix ^ "@" ^ dn_ds_tsv.model_prefix ^ "@ds.nw"] dn_ds_tsv.dn_ds.ds_tree;
item [dn_ds_tsv.tree_prefix ^ "@" ^ dn_ds_tsv.model_prefix ^ "@diversity.tsv"] dn_ds_tsv.dn_ds.diversity_tsv;
item [dn_ds_tsv.tree_prefix ^ "@" ^ dn_ds_tsv.model_prefix ^ "@length.tsv"] dn_ds_tsv.dn_ds.length_tsv;
]) |> List.concat
|> Repo.shift "dn_ds_tsv"
);
] |> List.concat
|> Repo.shift "Simulation_details"
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# - Bastien Boussau <bastien.boussau@univ-lyon1.fr>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import argparse
import sys
import os, re
import logging
import pandas as pd
from ete3 import Tree
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser = argparse.ArgumentParser(prog="calc_bl.py",
                                 description='Compute the total branch length of the dN and dS trees.')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument("--dn", type=argparse.FileType('r'),
help='dn tree filename', required=True)
requiredOptions.add_argument("--ds", type=argparse.FileType('r'),
help='ds tree filename', required=True)
requiredOptions.add_argument('-o', '--output', type=str,
help="Output name", required=True)
##############
### Option parsing
args = parser.parse_args()
DnFile = args.dn.name
DsFile = args.ds.name
OutFile = args.output
#===================================================================================================
# output
#===================================================================================================
def length(tf):
    # Sum the branch lengths (n.dist) over every node of the tree.
    tree = Tree(tf)
    bl = 0
    for n in tree.traverse("postorder"):
        bl += n.dist
    return bl
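# A minimal sketch of what length() computes: for the newick
# "((A:0.1,B:0.2):0.3,C:0.4);" the sum over every n.dist is
# 0.1 + 0.2 + 0.3 + 0.4 = 1.0 (plus whatever default dist ete3
# assigns to the unlabelled root).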
with open(OutFile, "w") as File:
    # Write a header line followed by a single data row.
    File.write("dn_tree_len\tds_tree_len\n")
    File.write(str(length(DnFile)) + "\t" + str(length(DsFile)) + "\n")
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# - Bastien Boussau <bastien.boussau@univ-lyon1.fr>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import argparse
import sys
import os, re
import logging
import pandas as pd
from Bio import AlignIO
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser = argparse.ArgumentParser(prog="calc_diversity.py",
                                 description='Compute per-site amino-acid diversity of a protein alignment.')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument('-a', "--ali", type=argparse.FileType('r'),
help='Alignment filename', required=True)
requiredOptions.add_argument('-o', '--output', type=str,
help="Output name", required=True)
##############
### Option parsing
args = parser.parse_args()
AliFile = args.ali
OutFile = args.output
#===================================================================================================
# Set up the logger
#===================================================================================================
### Set up the logger
# create logger
logger = logging.getLogger("calc_diversity")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
if args.debug:
    ch.setLevel(logging.DEBUG)
else:
    ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter_ch = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter_ch)
logger.addHandler(ch)
logger.debug(sys.argv)
#===================================================================================================
# Read input alignment
#===================================================================================================
try:
    ali = AlignIO.read(AliFile, "fasta")
    ali_d = {}
    for s in ali:
        ali_d[s.id] = s
except Exception as exc:
    logger.error(str(exc))
    sys.exit(1)
logger.info("Ali (%s) ok after checking", AliFile.name)
df_leaves_l = []
n_sites = ali.get_alignment_length()
Sites = [i + 1 for i in range(n_sites)]
for s in ali:
    seq = ali_d[s.id].seq
    AA = list(seq)
    df_leaves_l.append(pd.DataFrame({'Sites': Sites,
                                     'name': s.id,
                                     'AA': AA}))
if df_leaves_l:
    df_leaves = pd.concat(df_leaves_l)
else:
    df_leaves = pd.DataFrame()
logger.debug(df_leaves)
#===================================================================================================
# Construct per site multinomial vectors
#===================================================================================================
# Map each amino acid to its slot in the 20-element count vector;
# gaps and ambiguous characters share index 20 and are ignored.
orderedAA = {
    "A": 0,  "C": 1,  "D": 2,  "E": 3,  "F": 4,
    "G": 5,  "H": 6,  "I": 7,  "K": 8,  "L": 9,
    "M": 10, "N": 11, "P": 12, "Q": 13, "R": 14,
    "S": 15, "T": 16, "V": 17, "W": 18, "Y": 19,
    "X": 20, "-": 20, "?": 20, "*": 20,
}
# Alanine Ala A
# Cysteine Cys C
# Aspartic AciD Asp D
# Glutamic Acid Glu E
# Phenylalanine Phe F
# Glycine Gly G
# Histidine His H
# Isoleucine Ile I
# Lysine Lys K
# Leucine Leu L
# Methionine Met M
# AsparagiNe Asn N
# Proline Pro P
# Glutamine Gln Q
# ARginine Arg R
# Serine Ser S
# Threonine Thr T
# Valine Val V
# Tryptophan Trp W
# TYrosine Tyr Y
# Per-condition count/probability vectors (not used further in this script).
condition0Counts = [0] * 20
condition1Counts = [0] * 20
condition0Probas = [0.0] * 20
condition1Probas = [0.0] * 20
commonProbas = [0.0] * 20
# Now we go site by site: for each site we compute the number of non-gap
# characters and the diversity (number of distinct residues).
# diversity_numTot computes the diversity of one site along with its number of non-gaps.
def diversity_numTot(vectorAll):
    logger.debug("vectorAll")
    logger.debug(vectorAll)
    numNonGaps = sum(vectorAll)
    logger.debug("numNonGaps: ")
    logger.debug(numNonGaps)
    diversity = 20 - vectorAll.count(0)
    logger.debug("diversity: ")
    logger.debug(diversity)
    return diversity, numNonGaps
def applyDiversity_numTot(x2, orderedAA):
    # Count the residues of one alignment column; characters missing from
    # orderedAA (e.g. lowercase or ambiguity codes) are treated like gaps.
    vec = [0] * 20
    for aa in x2['AA']:
        idx = orderedAA.get(aa, 20)
        if idx != 20:
            vec[idx] += 1
    return diversity_numTot(vec)
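# A worked sketch: for one column with residues ["A", "A", "C", "-", "Y"],
# vec ends up with counts A=2, C=1, Y=1 (the gap maps to index 20 and is
# skipped), so diversity_numTot returns diversity = 3 and numNonGaps = 4.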
df_leaves_grouped = df_leaves.groupby(['Sites']).apply(lambda x: applyDiversity_numTot(x, orderedAA))
logger.info("Computations done")
sites = []
diversities = []
nongaps = []
for i in range(1, df_leaves_grouped.shape[0] + 1):
    sites.append(i)
    diversities.append(df_leaves_grouped[i][0])
    nongaps.append(df_leaves_grouped[i][1])
df_final = pd.DataFrame({'Sites':sites, 'Diversities':diversities, 'Nongaps':nongaps})
df_final = df_final[["Sites","Diversities","Nongaps"]]
logger.info("Final dataframe built")
#===================================================================================================
# Create output files
#===================================================================================================
if df_final.shape[0] != n_sites:
    logger.error("Format error: %s rows in the final table but %s sites in the alignment",
                 df_final.shape[0], n_sites)
    sys.exit(42)
df_final.to_csv(OutFile, sep="\t", index=False)
logger.info("Wrote summary output to %s", OutFile)
open Core
open Bistro.Std
open Bistro.EDSL
open Bistro_bioinfo.Std
open File_formats
let env = Env.env_bppsuite
let assign k v =
seq ~sep:"=" [ string k ; v ]
let bash_script args code =
let prelude =
args
|> List.map ~f:(fun (k, v) ->
assign k v
)
|> seq ~sep:"\n"
in
seq ~sep:"\n" [ prelude ; string code ]
let bppml_config_pre_mapnh =
[ string "
# Sequences:
# The alphabet to use:
# DNA, RNA or Protein
alphabet = Codon(letter=DNA)
genetic_code=Standard
# The sequence file to use (sequences must be aligned!)
###input.sequence.file=$(DATA).fasta
# The alignment format:
input.sequence.format = Fasta
#Sites to use:
# all, nogap or complete (=only resolved chars)
input.sequence.sites_to_use = complete
# Specify a maximum amount of gaps: may be an absolute number or a percentage.
input.sequence.max_gap_allowed = 50%
# Remove stop codons
input.sequence.remove_stop_codons=yes
# We will need this:
input.sequence.remove_saturated_sites=yes
# ----------------------------------------------------------------------------------------
# Input tree file
# ----------------------------------------------------------------------------------------
# user or random
init.tree = user
input.tree.format = Nhx
init.brlen.method = Equal(value=1.0)
# ----------------------------------------------------------------------------------------
# Model specification
# ----------------------------------------------------------------------------------------
# See the manual for a description of the syntax and available options.
#
model = YN98(kappa=1, omega=1, frequencies=F1X4, initFreqs=observed, initFreqs.observedPseudoCount=1)
rate_distribution = Constant #Gamma(n=4, alpha=0.5)
# Likelihood recursion option:
# - simple: derivatives takes more time to compute, but likelihood computation is faster.
# For big data sets, it can save a lot of memory usage too, particularly when the data are compressed.
# - double: Uses more memory and need more time to compute likelihood, due to the double recursion.
# Analytical derivatives are however faster to compute.
# This option has no effect in the following cases:
# - Topology estimation: this requires a double recursive algorithm,
# - Optimization with a molecular clock: a simple recursion with data compression is used in this case,
# due to the impossibility of computing analytical derivatives.
likelihood.recursion = simple
# Site compression for the simple recursion:
# - simple: identical sites are not computed twice
# - recursive: look for site patterns to save computation time during optimization, but
# requires extra time for building the patterns.
# This is usually the best option, particularly for nucleotide data sets.
likelihood.recursion_simple.compression = recursive
# ----------------------------------------------------------------------------------------
# Optimization
# ----------------------------------------------------------------------------------------
# Should we reestimate likelihood parameters? Tree topology will not be optimized.
# (recommended)
# Method to use for optimizing numerical parameters:
# - None, no optimization performed
# - DB derivatives for branch lengths + Brent for other parameters
# - FullD derivatives for all parameters, using numerical derivatives for non-branch lengths parameters.
optimization = FullD(derivatives=Newton)
# Tell if the parameter should be transformed in order to remove constraints.
# This can improve the optimization, but might be a bit slower.
optimization.reparametrization = no
# Final optimization step, may be useful if numerical derivatives are used:
# powell or simplex or none.
optimization.final = none
# Set the quantity of output to the screen:
optimization.verbose = 3
# Parameters to ignore (for instance equilibrium frequencies)
optimization.ignore_parameters = YN98.freq*
# Maximum number of likelihood evaluations:
optimization.max_number_f_eval = 10000
# Precision to reach:
optimization.tolerance = 1 #0.000001
# A file where to dump error or warning messages:
optimization.message_handler = $(DATA).ml_h.messages
# A file where to dump optimization steps (a file path or std for standard output):
optimization.profiler = $(DATA).ml_h.profile
# Shall we optimize tree topology as well?
optimization.topology = no
# Algorithm to use for topology estimation: only NNI for now
optimization.topology.algorithm = NNI
# NNI method: fast, better or phyml
# You should use the phyml option, since it is really more efficient!
optimization.topology.algorithm_nni.method = phyml
# Number of phyml topology movement steps before reoptimizing parameters:
optimization.topology.nstep = 4
# Shall we estimate parameters before looking for topology movements?
optimization.topology.numfirst = no
# Tolerances: These numbers should not be too low, in order to save computation
# time and also for a better topology estimation.
# The optimization.tolerance parameter will be used for the final optimization
# of numerical parameters.
#
# Tolerance for the prior-topology estimation
optimization.topology.tolerance.before = 100
# Tolerance for the during-topology estimation
optimization.topology.tolerance.during = 100
# Shall we first scale the tree before optimizing parameters? [deprecated]
optimization.scale_first = yes
# Should we write the resulting tree? none or file name.
output.tree.file = $(DATA).ml_h.dnd
output.tree.format = Newick
# Alignment information log file (site specific rates, etc):
output.infos = $(DATA).ml_h.infos
# Write numerical parameter estimated values:
output.estimates = $(DATA).ml_h.params.bpp
# ----------------------------------------------------------------------------------------
# Bootstrap
# ----------------------------------------------------------------------------------------
bootstrap.number = 0
# Tell if numerical parameters should be kept to their initial value when bootstrapping:
bootstrap.approximate = no
# Set this to yes for detailed output when bootstrapping.
bootstrap.verbose = no
bootstrap.output.file =
" ]
let mapnh_config =
[ string "
alphabet = Codon(letter=DNA) #DNA
genetic_code=Standard
# Set input tree file. Original branch lengths will be used for mapping.
input.tree.file = $(DATA).ml_h.dnd
input.tree.format = Newick
#We convert the tree to Nhx format for use with PartNH:
output.tree_with_id.file = $(DATA).nhx
#Where to output cluster tree:
output.cluster_tree.file = $(DATA).cluster_join.dnd
input.sequence.format = Fasta
input.sequence.sites_to_use = all
# Remove stop codons
input.sequence.remove_stop_codons = yes
map.type = DnDs(code=VertebrateMitochondrial)
# Here we use the estimated homogeneous model for mapping:
model = YN98(frequencies=F3X4,kappa=4.256962136755, omega=0.034993311532, freq_Codon.1_Full.theta=0.456066719064, freq_Codon.1_Full.theta1=0.588175796088, freq_Codon.1_Full.theta2=0.447949418086, freq_Codon.2_Full.theta=0.386701598114, freq_Codon.2_Full.theta1=0.319381792085, freq_Codon.2_Full.theta2=0.307606839923, freq_Codon.3_Full.theta=0.357395860624, freq_Codon.3_Full.theta1=0.656914626158, freq_Codon.3_Full.theta2=0.129523474419)
# Saturation threshold:
count.max = 3
test.global = no
test.branch = yes
test.branch.neighbor = no # yes or no (overridden on the mapnh command line below)
test.branch.negbrlen = no
test.branch.auto_cluster = Marginal(threshold=3)
output.counts.tree.prefix = CountsTree
"
]
let conf_file_bppml ~tree ~fna ~out ~config =
seq ~sep:"\n" (
[
assign "DATA" (out) ;
assign "input.sequence.file" (dep fna) ;
assign "input.tree.file" (dep tree) ;
]
@ config
)
let conf_file_mapnh ~fna ~out ~config =
seq ~sep:"\n" (
[
assign "DATA" (out) ;
assign "input.sequence.file" (dep fna) ;
]
@ config
)
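(* A sketch of the file head that conf_file_bppml generates (paths resolved by Bistro):
     DATA=<dest>
     input.sequence.file=<path to fna>
     input.tree.file=<path to tree>
   followed by the template above; conf_file_mapnh is identical minus the tree line. *)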
let bppml_mapnh ?(descr="") ~fna ~tree : _ directory workflow =
let env = Env.env_bppsuite in
let config_bppml_f = tmp // "config_bppml.bpp" in
let config_mapnh_f = tmp // "config_mapnh.bpp" in
let out = ident dest in
workflow ~descr:("testnh.bppml_mapnh" ^ descr) [
docker env (
and_list [
mkdir_p dest;
cd dest;
cmd "cat" ~stdout:config_bppml_f [(file_dump (conf_file_bppml ~tree ~fna ~out ~config:bppml_config_pre_mapnh))];
cmd "cat" ~stdout:config_mapnh_f [(file_dump (conf_file_mapnh ~fna ~out ~config:mapnh_config ))];
cmd "bppml" [
assign "param" config_bppml_f;
];
cmd "mapnh" [
assign "param" config_mapnh_f;
assign "--noninteractive" (string "yes");
assign "test.branch.neighbor" (string "no");
assign "output.cluster_tree.file" (string "simu.cluster_free.dnd");
]
]
)
]
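(* The workflow above chains the two bpp tools: bppml first optimizes the model
   and branch lengths, then mapnh maps DnDs substitution counts onto the tree;
   its output directory is what calc_dn_ds selects from below. *)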
let get_dl_dn_ds ~(dn_tree:_ workflow) ~(ds_tree:_ workflow): text_file workflow =
let env = Env.env_py in
workflow ~descr:("get_bl_dn_ds") [
cmd "python" ~env [
file_dump (string Scripts.calc_bl) ;
opt "--dn" dep dn_tree;
opt "--ds" dep ds_tree;
opt "-o" ident dest ;
]
]
let aa_diversity ~(faa : aminoacid_fasta workflow) : text_file workflow =
let env = Env.env_py in
workflow ~descr:("diversity") [
cmd "python" ~env [
file_dump (string Scripts.calc_diversity) ;
opt "-a" dep faa;
opt "-o" ident dest ;
]
]
type dn_ds_t = {
dn_tree : text_file workflow;
ds_tree : text_file workflow;
length_tsv : text_file workflow;
diversity_tsv : text_file workflow;
}
let calc_dn_ds ~tree ~faa ~(fna:nucleotide_fasta workflow) : dn_ds_t =
let run_bppml = bppml_mapnh ~descr:"" ~tree ~fna in
let dn_tree = run_bppml / selector ["mapping_counts_per_type_dN.dnd"]