Commit 4b3ad11d authored by Carine Rey's avatar Carine Rey Committed by Philippe Veber
Browse files

add calc_dnds

parent 4fd11201
......@@ -35,7 +35,7 @@ push_docker:
test:
cd example && \
reviewphiltrans validation --outdir outdir_test --tree-dir trees_test --profile-fn aa_fitness/263SelectedProfiles.tsv --ns 5 --np 4 --seed 4256073781403810077 && \
reviewphiltrans validation --outdir outdir_test --tree-dir trees_test --profile-fn aa_fitness/263SelectedProfiles.tsv --diffsel --ns 5 --np 4 --seed 4256073781403810077 && \
reviewphiltrans validation --outdir outdir_test --tree-dir trees_test --profile-fn aa_fitness/263SelectedProfiles.tsv --dnds --ns 5 --np 4 --seed 4256073781403810077 && \
mv dag.dot dagtest_val.dot && \
dot -Tsvg dagtest_val.dot -o dagtest_val.svg
......@@ -46,7 +46,19 @@ test:
.PHONY: realdata_test
realdata_test:
cd example && \
reviewphiltrans realdata --outdir outdir_realdata_test --indir real_data --np 4 --seed 4256073781403810077
reviewphiltrans realdata --outdir outdir_realdata_test --indir real_data --np 4 --seed 4256073781403810077 &&\
reviewphiltrans realdata --dnds --outdir outdir_realdata_test --indir real_data --np 4 --seed 4256073781403810077
# -----------------------------------------------------------------------
# Realdata
# -----------------------------------------------------------------------
.PHONY: realdata
realdata:
cd /mnt && \
mkdir -p wd_reviewphiltrans &&\
cd wd_reviewphiltrans &&\
reviewphiltrans realdata --outdir realdata_online_rodents_ensemblkeep --indir /home/ubuntu/data_4_reviewphiltrans_rongeur_ensemblkeep --np 32 --mem 160 --seed 4256073781403810077
# -----------------------------------------------------------------------
......
......@@ -5,6 +5,7 @@ type t = {
model_prefix: string ;
tree_prefix : string ;
is_real: bool ;
calc_dnds: bool ;
dataset : Ready_dataset.t ;
seed : int ;
}
......@@ -17,7 +18,7 @@ let repo dataset_l =
let model_prefix = dataset.model_prefix in
let tree_prefix = dataset.tree_prefix in
if dataset.is_real then
let repo_realdata = Ready_dataset.repo_realdata ~ali_prefix:model_prefix ~tree_prefix dataset.dataset in
let repo_realdata = Ready_dataset.repo_realdata ~ali_prefix:model_prefix ~tree_prefix dataset.dataset ~calc_dnds:dataset.calc_dnds in
repo_realdata
|> Repo.shift "Dataset"
|> Repo.shift tree_prefix
......@@ -46,4 +47,5 @@ let add_indels_to_dataset d ~seed =
let new_seed = Hashtbl.hash seed in
let indel_seed = Hashtbl.hash new_seed in
let dataset = Ready_dataset.add_indels_to_ready_dataset ~p ~seed:indel_seed d.dataset in
{model_prefix; tree_prefix; is_real; dataset; seed = new_seed}
let calc_dnds = d.calc_dnds in
{model_prefix; tree_prefix; is_real; calc_dnds; dataset; seed = new_seed}
......@@ -5,7 +5,7 @@ open Convergence_detection
open Convergence_hypothesis
open Profile
let parse_input_data ~seed indir =
let parse_input_data ~seed ~calc_dnds indir =
let error_message = {|
I need a file "tree.nhx" containing the annotated tree and
a directory "Alignments" containing fasta alignments (in nt)
......@@ -47,6 +47,7 @@ with the format "gene1.fna", "gene2.fna",... |} in
let raw_dataset = Raw_dataset.{input_tree=filtered_input_tree; fna; fna_infos} in
let dataset = {Dataset.model_prefix = fna_prefix;
is_real = true;
calc_dnds;
tree_prefix = tree_prefix;
dataset = Ready_dataset.of_raw ~descr:("real_data." ^ tree_prefix) raw_dataset;
seed;
......@@ -74,7 +75,7 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~se
let fna_infos = Some (Bppsuite.Bppseqgen.info run_fna) in
let faa = Bppsuite.fna2faa fna in
let ready_dataset = { Ready_dataset.input_tree = input_tree ; tree_dataset ; fna; faa; fna_infos } in
{ Dataset.model_prefix; is_real= false; tree_prefix; dataset = ready_dataset; seed }
{ Dataset.model_prefix; is_real= false; calc_dnds = false; tree_prefix; dataset = ready_dataset; seed }
let derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~add_indels ~seed =
let tree_prefix = Filename.chop_extension tree in
......@@ -98,8 +99,8 @@ let derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~add_indels ~
derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~seed
) in
let _concat_H0HaPCOC = {Dataset.model_prefix="H0_NeG5+HaPCOC"; tree_prefix; is_real = false; dataset = Ready_dataset.paste dataset_H0_NeG5.dataset dataset_HaPCOC.dataset; seed=(dataset_H0_NeG5.seed + dataset_HaPCOC.seed |> Hashtbl.hash) } in
let concat_H0HaPC = {Dataset.model_prefix="H0_NeG5+HaPC_NeG5"; tree_prefix; is_real = false; dataset = Ready_dataset.paste dataset_H0_NeG5.dataset dataset_HaPC_NeG5.dataset; seed=(dataset_H0_NeG5.seed + dataset_HaPC_NeG5.seed |> Hashtbl.hash) } in
let _concat_H0HaPCOC = {Dataset.model_prefix="H0_NeG5+HaPCOC"; tree_prefix; is_real = false; calc_dnds = false; dataset = Ready_dataset.paste dataset_H0_NeG5.dataset dataset_HaPCOC.dataset; seed=(dataset_H0_NeG5.seed + dataset_HaPCOC.seed |> Hashtbl.hash) } in
let concat_H0HaPC = {Dataset.model_prefix="H0_NeG5+HaPC_NeG5"; tree_prefix; is_real = false; calc_dnds = false; dataset = Ready_dataset.paste dataset_H0_NeG5.dataset dataset_HaPC_NeG5.dataset; seed=(dataset_H0_NeG5.seed + dataset_HaPC_NeG5.seed |> Hashtbl.hash) } in
let dataset_concat_hypos = if use_concat then [concat_H0HaPC;] else [] in
let dataset_with_indels = if add_indels then [indel_H0_NeG5 ; indel_HaPC_NeG5] else [] in
List.concat [ (*dataset_basis_hyps;*) dataset_per_hypo ; dataset_concat_hypos; dataset_with_indels ]
......@@ -319,7 +320,7 @@ let loggers = [
time_logger#logger ;
]
let validation_main ~outdir ?(indir = "") ?(ns = 0) ?(np = 2) ?(mem = 2) ~preview ~use_diffsel ~use_c60 ~tree_dir ~profile_fn ~use_concat ~add_indels ~only_simu ?(seed = Random.int Int.max_value) () =
let validation_main ~outdir ?(indir = "") ?(ns = 0) ?(np = 2) ?(mem = 2) ~preview ~use_diffsel ~use_c60 ~calc_dnds ~tree_dir ~profile_fn ~use_concat ~add_indels ~only_simu ?(seed = Random.int Int.max_value) () =
printf "Global seed: %i\n" seed;
Out_channel.write_all "global.seed" ~data:(string_of_int seed);
(* simulated trees *)
......@@ -328,7 +329,7 @@ let validation_main ~outdir ?(indir = "") ?(ns = 0) ?(np = 2) ?(mem = 2) ~previe
let profile = Profile.profile_l_of_splitted_profile ~nb_cat:All ~nb_sites profile_fn ~seed:(Random.int Int.max_value) in
let sim_repo_l = derive_profile ~preview ~use_diffsel ~use_c60 ~tree_dir ~profile ~use_concat ~add_indels ~only_simu ~seed () in
(* real trees *)
let indir_dataset_l = if indir = "" then [] else parse_input_data ~seed indir in
let indir_dataset_l = if indir = "" then [] else parse_input_data ~seed ~calc_dnds indir in
let dataset_l = indir_dataset_l in
let dataset_results_l =
if only_simu then
......@@ -361,6 +362,8 @@ let validation_command =
flag "--diffsel" no_arg ~doc:" use the diffsel method (very slow)."
and use_c60 =
flag "--c60" no_arg ~doc:" use the pcoc c60 method (slow)."
and calc_dnds =
flag "--dnds" no_arg ~doc:" calculate dn ds dnds trees (slow)."
and only_simu =
flag "--only-simu" no_arg ~doc:" mode only simulation"
and use_concat =
......@@ -380,17 +383,17 @@ let validation_command =
and seed =
flag "--seed" (optional int) ~doc:"INT Global seed"
in
validation_main ~outdir ?indir ?ns ?np ?mem ~preview ~use_diffsel ~use_c60 ~tree_dir ~profile_fn ~use_concat ~only_simu ~add_indels ?seed
validation_main ~outdir ?indir ?ns ?np ?mem ~preview ~use_diffsel ~use_c60 ~calc_dnds ~tree_dir ~profile_fn ~use_concat ~only_simu ~add_indels ?seed
]
let realdata_main ~outdir ~indir ~preview ~use_diffsel ~use_c60 ?(np = 2) ?(mem = 2) ?(seed = Random.int Int.max_value) () =
let realdata_main ~outdir ~indir ~preview ~use_diffsel ~use_c60 ~calc_dnds ?(np = 2) ?(mem = 2) ?(seed = Random.int Int.max_value) () =
(* seed-related stuff *)
printf "Global seed: %i\n" seed;
Out_channel.write_all "global.seed" ~data:(string_of_int seed);
Random.init seed ;
(* real trees *)
let dataset_l = parse_input_data ~seed indir in
let dataset_l = parse_input_data ~seed ~calc_dnds indir in
let dataset_results_l = derive_det ~dataset_l ~preview ~use_diffsel ~use_c60 in
let repo_real_trees = [
Dataset.repo dataset_l ;
......@@ -416,6 +419,8 @@ let realdata_command =
flag "--diffsel" no_arg ~doc:" use the diffsel method (very slow)."
and use_c60 =
flag "--c60" no_arg ~doc:" use the pcoc c60 method (slow)."
and calc_dnds =
flag "--dnds" no_arg ~doc:" calculate dn ds dnds trees (slow)."
and np =
flag "--np" (optional int) ~doc:"INT Number of available processors"
and mem =
......@@ -423,5 +428,5 @@ let realdata_command =
and seed =
flag "--seed" (optional int) ~doc:"INT Global seed"
in
realdata_main ~outdir ~indir ?np ?mem ~preview ~use_diffsel ~use_c60 ?seed
realdata_main ~outdir ~indir ?np ?mem ~preview ~use_diffsel ~use_c60 ~calc_dnds ?seed
]
......@@ -47,8 +47,9 @@ let repo rd =
]
|> List.concat
let repo_realdata ~tree_prefix ~ali_prefix rd =
let repo_realdata ~tree_prefix ~ali_prefix ~calc_dnds rd =
let phy_nt = (Bppsuite.fna2phy ~fna:rd.fna) in
let dnds_realdata = Testnh.dn_ds_trees_real_data ~ali_prefix ~tree:rd.input_tree ~fna:rd.fna () in
List.concat [
Repo.[
item [ali_prefix ^ ".fna"] rd.fna ;
......@@ -59,6 +60,14 @@ let repo_realdata ~tree_prefix ~ali_prefix rd =
item [ali_prefix ^ ".nt.phy"] phy_nt ;
item [ali_prefix ^ ".tree.diffsel" ] (Tree_dataset.diffsel_tree rd.tree_dataset) ;
] |> Repo.shift "ForDiffsel";
if calc_dnds then
Repo.[
item [ali_prefix ^ ".dn.tsv"] dnds_realdata.dn_tsv ;
item [ali_prefix ^ ".ds.tsv"] dnds_realdata.ds_tsv ;
item [ali_prefix ^ ".dnds.tsv" ] dnds_realdata.dnds_tsv ;
] |> Repo.shift "ForRERconverge"
else
[];
]
|> Repo.shift ali_prefix
|> Repo.shift "Alignments"
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# - Bastien Boussau <bastien.boussau@univ-lyon1.fr>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import argparse
import sys
import os, re
import logging
import pandas as pd
from ete3 import Tree
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser = argparse.ArgumentParser(prog="format_dn_ds.py",
description='')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action="store_true",
help="debug mode",
default=False)
##############
requiredOptions = parser.add_argument_group('REQUIRED OPTIONS')
requiredOptions.add_argument("--dn", type=argparse.FileType('r'),
help='dn tree filename', required=True)
requiredOptions.add_argument("--ds", type=argparse.FileType('r'),
help='ds tree filename', required=True)
requiredOptions.add_argument("--gene_name", type=str,
help='gene name', required=True)
requiredOptions.add_argument('-o', '--output_dir', type=str,
help="Output name", required=True)
##############
### Option parsing
args = parser.parse_args()
DnFile = args.dn.name
DsFile = args.ds.name
GeneName = args.gene_name
OutDir = args.output_dir
#===================================================================================================
# ouput
#===================================================================================================
Dn_tree = Tree(DnFile)
Ds_tree = Tree(DsFile)
DnDs_tree = Dn_tree.copy()
i=0
n_ds_bl = {}
for n_ds in Ds_tree.traverse("postorder"):
n_ds_bl[i] = n_ds.dist
i+=1
i=0
for n in DnDs_tree.traverse("postorder"):
if n_ds_bl[i] != 0:
n.dist = n.dist / float(n_ds_bl[i])
else:
n.dist = 0
i+=1
Dn_tree_str = Dn_tree.write(format=1,format_root_node=True)
Ds_tree_str = Ds_tree.write(format=1,format_root_node=True)
DnDs_tree_str = DnDs_tree.write(format=1,format_root_node=True)
with open(OutDir+"/dn_tree.tsv", "w") as File:
File.write(GeneName + "\t" + Dn_tree_str)
File.write("\n")
with open(OutDir+"/ds_tree.tsv", "w") as File:
File.write(GeneName + "\t" + Ds_tree_str)
File.write("\n")
with open(OutDir+"/dnds_tree.tsv", "w") as File:
File.write(GeneName + "\t" + DnDs_tree_str)
File.write("\n")
......@@ -20,7 +20,7 @@ genetic_code=Standard
input.sequence.format = Fasta
#Sites to use:
# all, nogap or complete (=only resolved chars)
input.sequence.sites_to_use = complete
input.sequence.sites_to_use = all
# Specify a maximum amount of gaps: may be an absolute number or a percentage.
input.sequence.max_gap_allowed = 50%
# Remove stop codons
......@@ -33,14 +33,13 @@ input.sequence.remove_saturated_sites=yes
# user or random
init.tree = user
input.tree.format = Nhx
init.brlen.method = Equal(value=1.0)
# ----------------------------------------------------------------------------------------
# Model specification
# ----------------------------------------------------------------------------------------
# See the manual for a description of the syntax and available options.
#
model = YN98(kappa=1, omega=1, frequencies=F1X4, initFreqs=observed, initFreqs.observedPseudoCount=1)
rate_distribution = Constant #Gamma(n=4, alpha=0.5)
rate_distribution = Gamma(n=4)
# Likelihood recursion option:
# - simple: derivatives takes more time to compute, but likelihood computation is faster.
# For big data sets, it can save a lot of memory usage too, particularly when the data are compressed.
......@@ -76,37 +75,20 @@ optimization.final = none
# Set the quantity of output to the screen:
optimization.verbose = 3
# Parameters to ignore (for instance equilibrium frequencies)
optimization.ignore_parameters = YN98.freq*
optimization.ignore_parameters = YN98.*
# Maximum number of likelihood evaluations:
optimization.max_number_f_eval = 10000
optimization.max_number_f_eval = 1000
# Precision to reach:
optimization.tolerance = 1 #0.000001
optimization.tolerance = 0.000001
# idem for error or warning messages:
optimization.message_handler = $(DATA).ml_h.messages
# A file where to dump optimization steps (a file path or std for standard output)
optimization.profiler = $(DATA).ml_h.profile
# Shall we optimize tree topology as well?
optimization.topology = no
# Algorithm to use for topology estimation: only NNI for now
optimization.topology.algorithm = NNI
# NNI method: fast, better or phyml
# You should use the phyml option, since it is really more efficient!
optimization.topology.algorithm_nni.method = phyml
# Number of phyml topology movement steps before reoptimizing parameters:
optimization.topology.nstep = 4
# Shall we estimate parameters before looking for topology movements?
optimization.topology.numfirst = no
# Tolerances: These numbers should not be too low, in order to save computation
# time and also for a better topology estimation.
# The optimization.tolerance parameter will be used for the final optimization
# of numerical parameters.
#
# Tolerance for the prior-topology estimation
optimization.topology.tolerance.before = 100
# Tolerance for the during-topology estimation
optimization.topology.tolerance.during = 100
# Shall we first scale the tree before optimizing parameters? [deprecated]
optimization.scale_first = yes
optimization.scale_first = no
# Should we write the resulting tree? none or file name.
output.tree.file = $(DATA).ml_h.dnd
output.tree.format = Newick
......@@ -118,11 +100,6 @@ output.estimates = $(DATA).ml_h.params.bpp
# Bootstrap
# ----------------------------------------------------------------------------------------
bootstrap.number = 0
# Tell if numerical parameters should be kept to their initial value when bootstrapping:
bootstrap.approximate = no
# Set this to yes for detailed output when bootstrapping.
bootstrap.verbose = no
bootstrap.output.file =
" ]
let mapnh_config =
......@@ -143,18 +120,15 @@ input.sequence.sites_to_use = all
input.sequence.remove_stop_codons = yes
input.sequence.remove_saturated_sites = yes
map.type = DnDs(code=VertebrateMitochondrial)
map.type = DnDs(code=Standard)
# Here we use the estimated homogeneous model for mapping:
model = YN98(frequencies=F3X4,kappa=4.256962136755, omega=0.034993311532, freq_Codon.1_Full.theta=0.456066719064, freq_Codon.1_Full.theta1=0.588175796088, freq_Codon.1_Full.theta2=0.447949418086, freq_Codon.2_Full.theta=0.386701598114, freq_Codon.2_Full.theta1=0.319381792085, freq_Codon.2_Full.theta2=0.307606839923, freq_Codon.3_Full.theta=0.357395860624, freq_Codon.3_Full.theta1=0.656914626158, freq_Codon.3_Full.theta2=0.129523474419)
model = YN98(kappa=1, omega=1, frequencies=F1X4, initFreqs=observed, initFreqs.observedPseudoCount=1)
# Saturation threshold:
count.max = 3
#count.max = 3
test.global = no
test.branch = yes
test.branch.neighbor = #yes or no
test.branch.negbrlen = no
test.branch.auto_cluster = Marginal(threshold=3)
test.branch = no
output.counts.tree.prefix = CountsTree
......@@ -243,3 +217,33 @@ let calc_dn_ds ~tree ~faa ~(fna:nucleotide_fasta pworkflow) : dn_ds_t =
let diversity_tsv = aa_diversity ~faa in
{dn_tree; ds_tree; length_tsv; diversity_tsv}
type dnds_real = {
dn_tsv : text_file pworkflow;
ds_tsv : text_file pworkflow;
dnds_tsv : text_file pworkflow;
}
let format_dn_ds ~ali_prefix ~(dn_tree:_ workflow) ~(ds_tree:_ workflow): directory pworkflow =
let img = Env.env_py in
Workflow.shell ~descr:("format_dn_ds."^ali_prefix) [
mkdir_p dest;
cmd "python" ~img [
file_dump (string Scripts.format_dn_ds) ;
opt "--dn" dep dn_tree;
opt "--ds" dep ds_tree;
opt "--gene_name" string ali_prefix;
opt "-o" ident dest ;
]
]
let dn_ds_trees_real_data ~ali_prefix ~tree ~fna () : dnds_real=
let run_bppml = bppml_mapnh ~descr:("."^ali_prefix) ~tree ~fna in
let dn_tree = Workflow.select run_bppml ["mapping_counts_per_type_dN.dnd"] in
let ds_tree = Workflow.select run_bppml ["mapping_counts_per_type_dS.dnd"] in
let formated_dnds_trees = format_dn_ds ~ali_prefix ~dn_tree ~ds_tree in
let dn_tsv = Workflow.select formated_dnds_trees ["dn_tree.tsv"] in
let ds_tsv = Workflow.select formated_dnds_trees ["ds_tree.tsv"] in
let dnds_tsv = Workflow.select formated_dnds_trees ["dnds_tree.tsv"] in
{dn_tsv; ds_tsv; dnds_tsv}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment