Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
VEBER Philippe
codepi
Commits
4b3ad11d
Commit
4b3ad11d
authored
Apr 01, 2019
by
Carine Rey
Committed by
Philippe Veber
Sep 20, 2019
Browse files
add calc_dnds
parent
4fd11201
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
191 additions
and
52 deletions
+191
-52
Makefile
Makefile
+14
-2
lib/dataset.ml
lib/dataset.ml
+4
-2
lib/pipeline.ml
lib/pipeline.ml
+15
-10
lib/ready_dataset.ml
lib/ready_dataset.ml
+10
-1
lib/scripts/format_dn_ds.py
lib/scripts/format_dn_ds.py
+107
-0
lib/testnh.ml
lib/testnh.ml
+41
-37
No files found.
Makefile
View file @
4b3ad11d
...
...
@@ -35,7 +35,7 @@ push_docker:
test
:
cd
example
&&
\
reviewphiltrans validation
--outdir
outdir_test
--tree-dir
trees_test
--profile-fn
aa_fitness/263SelectedProfiles.tsv
--ns
5
--np
4
--seed
4256073781403810077
&&
\
reviewphiltrans validation
--outdir
outdir_test
--tree-dir
trees_test
--profile-fn
aa_fitness/263SelectedProfiles.tsv
--d
iffsel
--ns
5
--np
4
--seed
4256073781403810077
&&
\
reviewphiltrans validation
--outdir
outdir_test
--tree-dir
trees_test
--profile-fn
aa_fitness/263SelectedProfiles.tsv
--d
nds
--ns
5
--np
4
--seed
4256073781403810077
&&
\
mv
dag.dot dagtest_val.dot
&&
\
dot
-Tsvg
dagtest_val.dot
-o
dagtest_val.svg
...
...
@@ -46,7 +46,19 @@ test:
.PHONY
:
realdata_test
realdata_test
:
cd
example
&&
\
reviewphiltrans realdata
--outdir
outdir_realdata_test
--indir
real_data
--np
4
--seed
4256073781403810077
reviewphiltrans realdata
--outdir
outdir_realdata_test
--indir
real_data
--np
4
--seed
4256073781403810077
&&
\
reviewphiltrans realdata
--dnds
--outdir
outdir_realdata_test
--indir
real_data
--np
4
--seed
4256073781403810077
# -----------------------------------------------------------------------
# Realdata
# -----------------------------------------------------------------------
.PHONY
:
realdata
realdata
:
cd
/mnt
&&
\
mkdir
-p
wd_reviewphiltrans
&&
\
cd
wd_reviewphiltrans
&&
\
reviewphiltrans realdata
--outdir
realdata_online_rodents_ensemblkeep
--indir
/home/ubuntu/data_4_reviewphiltrans_rongeur_ensemblkeep
--np
32
--mem
160
--seed
4256073781403810077
# -----------------------------------------------------------------------
...
...
lib/dataset.ml
View file @
4b3ad11d
...
...
@@ -5,6 +5,7 @@ type t = {
model_prefix
:
string
;
tree_prefix
:
string
;
is_real
:
bool
;
calc_dnds
:
bool
;
dataset
:
Ready_dataset
.
t
;
seed
:
int
;
}
...
...
@@ -17,7 +18,7 @@ let repo dataset_l =
let
model_prefix
=
dataset
.
model_prefix
in
let
tree_prefix
=
dataset
.
tree_prefix
in
if
dataset
.
is_real
then
let
repo_realdata
=
Ready_dataset
.
repo_realdata
~
ali_prefix
:
model_prefix
~
tree_prefix
dataset
.
dataset
in
let
repo_realdata
=
Ready_dataset
.
repo_realdata
~
ali_prefix
:
model_prefix
~
tree_prefix
dataset
.
dataset
~
calc_dnds
:
dataset
.
calc_dnds
in
repo_realdata
|>
Repo
.
shift
"Dataset"
|>
Repo
.
shift
tree_prefix
...
...
@@ -46,4 +47,5 @@ let add_indels_to_dataset d ~seed =
let
new_seed
=
Hashtbl
.
hash
seed
in
let
indel_seed
=
Hashtbl
.
hash
new_seed
in
let
dataset
=
Ready_dataset
.
add_indels_to_ready_dataset
~
p
~
seed
:
indel_seed
d
.
dataset
in
{
model_prefix
;
tree_prefix
;
is_real
;
dataset
;
seed
=
new_seed
}
let
calc_dnds
=
d
.
calc_dnds
in
{
model_prefix
;
tree_prefix
;
is_real
;
calc_dnds
;
dataset
;
seed
=
new_seed
}
lib/pipeline.ml
View file @
4b3ad11d
...
...
@@ -5,7 +5,7 @@ open Convergence_detection
open
Convergence_hypothesis
open
Profile
let
parse_input_data
~
seed
indir
=
let
parse_input_data
~
seed
~
calc_dnds
indir
=
let
error_message
=
{
|
I
need
a
file
"tree.nhx"
containing
the
annotated
tree
and
a
directory
"Alignments"
containing
fasta
alignments
(
in
nt
)
...
...
@@ -47,6 +47,7 @@ with the format "gene1.fna", "gene2.fna",... |} in
let
raw_dataset
=
Raw_dataset
.{
input_tree
=
filtered_input_tree
;
fna
;
fna_infos
}
in
let
dataset
=
{
Dataset
.
model_prefix
=
fna_prefix
;
is_real
=
true
;
calc_dnds
;
tree_prefix
=
tree_prefix
;
dataset
=
Ready_dataset
.
of_raw
~
descr
:
(
"real_data."
^
tree_prefix
)
raw_dataset
;
seed
;
...
...
@@ -74,7 +75,7 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~se
let
fna_infos
=
Some
(
Bppsuite
.
Bppseqgen
.
info
run_fna
)
in
let
faa
=
Bppsuite
.
fna2faa
fna
in
let
ready_dataset
=
{
Ready_dataset
.
input_tree
=
input_tree
;
tree_dataset
;
fna
;
faa
;
fna_infos
}
in
{
Dataset
.
model_prefix
;
is_real
=
false
;
tree_prefix
;
dataset
=
ready_dataset
;
seed
}
{
Dataset
.
model_prefix
;
is_real
=
false
;
calc_dnds
=
false
;
tree_prefix
;
dataset
=
ready_dataset
;
seed
}
let
derive_from_tree
~
tree_dir
~
tree
~
profile
~
preview
~
use_concat
~
add_indels
~
seed
=
let
tree_prefix
=
Filename
.
chop_extension
tree
in
...
...
@@ -98,8 +99,8 @@ let derive_from_tree ~tree_dir ~tree ~profile ~preview ~use_concat ~add_indels ~
derive_from_model
~
model
~
input_tree
~
tree_dataset
~
tree_prefix
~
profile
~
seed
)
in
let
_concat_H0HaPCOC
=
{
Dataset
.
model_prefix
=
"H0_NeG5+HaPCOC"
;
tree_prefix
;
is_real
=
false
;
dataset
=
Ready_dataset
.
paste
dataset_H0_NeG5
.
dataset
dataset_HaPCOC
.
dataset
;
seed
=
(
dataset_H0_NeG5
.
seed
+
dataset_HaPCOC
.
seed
|>
Hashtbl
.
hash
)
}
in
let
concat_H0HaPC
=
{
Dataset
.
model_prefix
=
"H0_NeG5+HaPC_NeG5"
;
tree_prefix
;
is_real
=
false
;
dataset
=
Ready_dataset
.
paste
dataset_H0_NeG5
.
dataset
dataset_HaPC_NeG5
.
dataset
;
seed
=
(
dataset_H0_NeG5
.
seed
+
dataset_HaPC_NeG5
.
seed
|>
Hashtbl
.
hash
)
}
in
let
_concat_H0HaPCOC
=
{
Dataset
.
model_prefix
=
"H0_NeG5+HaPCOC"
;
tree_prefix
;
is_real
=
false
;
calc_dnds
=
false
;
dataset
=
Ready_dataset
.
paste
dataset_H0_NeG5
.
dataset
dataset_HaPCOC
.
dataset
;
seed
=
(
dataset_H0_NeG5
.
seed
+
dataset_HaPCOC
.
seed
|>
Hashtbl
.
hash
)
}
in
let
concat_H0HaPC
=
{
Dataset
.
model_prefix
=
"H0_NeG5+HaPC_NeG5"
;
tree_prefix
;
is_real
=
false
;
calc_dnds
=
false
;
dataset
=
Ready_dataset
.
paste
dataset_H0_NeG5
.
dataset
dataset_HaPC_NeG5
.
dataset
;
seed
=
(
dataset_H0_NeG5
.
seed
+
dataset_HaPC_NeG5
.
seed
|>
Hashtbl
.
hash
)
}
in
let
dataset_concat_hypos
=
if
use_concat
then
[
concat_H0HaPC
;]
else
[]
in
let
dataset_with_indels
=
if
add_indels
then
[
indel_H0_NeG5
;
indel_HaPC_NeG5
]
else
[]
in
List
.
concat
[
(*dataset_basis_hyps;*)
dataset_per_hypo
;
dataset_concat_hypos
;
dataset_with_indels
]
...
...
@@ -319,7 +320,7 @@ let loggers = [
time_logger
#
logger
;
]
let
validation_main
~
outdir
?
(
indir
=
""
)
?
(
ns
=
0
)
?
(
np
=
2
)
?
(
mem
=
2
)
~
preview
~
use_diffsel
~
use_c60
~
tree_dir
~
profile_fn
~
use_concat
~
add_indels
~
only_simu
?
(
seed
=
Random
.
int
Int
.
max_value
)
()
=
let
validation_main
~
outdir
?
(
indir
=
""
)
?
(
ns
=
0
)
?
(
np
=
2
)
?
(
mem
=
2
)
~
preview
~
use_diffsel
~
use_c60
~
calc_dnds
~
tree_dir
~
profile_fn
~
use_concat
~
add_indels
~
only_simu
?
(
seed
=
Random
.
int
Int
.
max_value
)
()
=
printf
"Global seed: %i
\n
"
seed
;
Out_channel
.
write_all
"global.seed"
~
data
:
(
string_of_int
seed
);
(* simulated trees *)
...
...
@@ -328,7 +329,7 @@ let validation_main ~outdir ?(indir = "") ?(ns = 0) ?(np = 2) ?(mem = 2) ~previe
let
profile
=
Profile
.
profile_l_of_splitted_profile
~
nb_cat
:
All
~
nb_sites
profile_fn
~
seed
:
(
Random
.
int
Int
.
max_value
)
in
let
sim_repo_l
=
derive_profile
~
preview
~
use_diffsel
~
use_c60
~
tree_dir
~
profile
~
use_concat
~
add_indels
~
only_simu
~
seed
()
in
(* real trees *)
let
indir_dataset_l
=
if
indir
=
""
then
[]
else
parse_input_data
~
seed
indir
in
let
indir_dataset_l
=
if
indir
=
""
then
[]
else
parse_input_data
~
seed
~
calc_dnds
indir
in
let
dataset_l
=
indir_dataset_l
in
let
dataset_results_l
=
if
only_simu
then
...
...
@@ -361,6 +362,8 @@ let validation_command =
flag
"--diffsel"
no_arg
~
doc
:
" use the diffsel method (very slow)."
and
use_c60
=
flag
"--c60"
no_arg
~
doc
:
" use the pcoc c60 method (slow)."
and
calc_dnds
=
flag
"--dnds"
no_arg
~
doc
:
" calculate dn ds dnds trees (slow)."
and
only_simu
=
flag
"--only-simu"
no_arg
~
doc
:
" mode only simulation"
and
use_concat
=
...
...
@@ -380,17 +383,17 @@ let validation_command =
and
seed
=
flag
"--seed"
(
optional
int
)
~
doc
:
"INT Global seed"
in
validation_main
~
outdir
?
indir
?
ns
?
np
?
mem
~
preview
~
use_diffsel
~
use_c60
~
tree_dir
~
profile_fn
~
use_concat
~
only_simu
~
add_indels
?
seed
validation_main
~
outdir
?
indir
?
ns
?
np
?
mem
~
preview
~
use_diffsel
~
use_c60
~
calc_dnds
~
tree_dir
~
profile_fn
~
use_concat
~
only_simu
~
add_indels
?
seed
]
let
realdata_main
~
outdir
~
indir
~
preview
~
use_diffsel
~
use_c60
?
(
np
=
2
)
?
(
mem
=
2
)
?
(
seed
=
Random
.
int
Int
.
max_value
)
()
=
let
realdata_main
~
outdir
~
indir
~
preview
~
use_diffsel
~
use_c60
~
calc_dnds
?
(
np
=
2
)
?
(
mem
=
2
)
?
(
seed
=
Random
.
int
Int
.
max_value
)
()
=
(* seed-related stuff *)
printf
"Global seed: %i
\n
"
seed
;
Out_channel
.
write_all
"global.seed"
~
data
:
(
string_of_int
seed
);
Random
.
init
seed
;
(* real trees *)
let
dataset_l
=
parse_input_data
~
seed
indir
in
let
dataset_l
=
parse_input_data
~
seed
~
calc_dnds
indir
in
let
dataset_results_l
=
derive_det
~
dataset_l
~
preview
~
use_diffsel
~
use_c60
in
let
repo_real_trees
=
[
Dataset
.
repo
dataset_l
;
...
...
@@ -416,6 +419,8 @@ let realdata_command =
flag
"--diffsel"
no_arg
~
doc
:
" use the diffsel method (very slow)."
and
use_c60
=
flag
"--c60"
no_arg
~
doc
:
" use the pcoc c60 method (slow)."
and
calc_dnds
=
flag
"--dnds"
no_arg
~
doc
:
" calculate dn ds dnds trees (slow)."
and
np
=
flag
"--np"
(
optional
int
)
~
doc
:
"INT Number of available processors"
and
mem
=
...
...
@@ -423,5 +428,5 @@ let realdata_command =
and
seed
=
flag
"--seed"
(
optional
int
)
~
doc
:
"INT Global seed"
in
realdata_main
~
outdir
~
indir
?
np
?
mem
~
preview
~
use_diffsel
~
use_c60
?
seed
realdata_main
~
outdir
~
indir
?
np
?
mem
~
preview
~
use_diffsel
~
use_c60
~
calc_dnds
?
seed
]
lib/ready_dataset.ml
View file @
4b3ad11d
...
...
@@ -47,8 +47,9 @@ let repo rd =
]
|>
List
.
concat
let
repo_realdata
~
tree_prefix
~
ali_prefix
rd
=
let
repo_realdata
~
tree_prefix
~
ali_prefix
~
calc_dnds
rd
=
let
phy_nt
=
(
Bppsuite
.
fna2phy
~
fna
:
rd
.
fna
)
in
let
dnds_realdata
=
Testnh
.
dn_ds_trees_real_data
~
ali_prefix
~
tree
:
rd
.
input_tree
~
fna
:
rd
.
fna
()
in
List
.
concat
[
Repo
.[
item
[
ali_prefix
^
".fna"
]
rd
.
fna
;
...
...
@@ -59,6 +60,14 @@ let repo_realdata ~tree_prefix ~ali_prefix rd =
item
[
ali_prefix
^
".nt.phy"
]
phy_nt
;
item
[
ali_prefix
^
".tree.diffsel"
]
(
Tree_dataset
.
diffsel_tree
rd
.
tree_dataset
)
;
]
|>
Repo
.
shift
"ForDiffsel"
;
if
calc_dnds
then
Repo
.[
item
[
ali_prefix
^
".dn.tsv"
]
dnds_realdata
.
dn_tsv
;
item
[
ali_prefix
^
".ds.tsv"
]
dnds_realdata
.
ds_tsv
;
item
[
ali_prefix
^
".dnds.tsv"
]
dnds_realdata
.
dnds_tsv
;
]
|>
Repo
.
shift
"ForRERconverge"
else
[]
;
]
|>
Repo
.
shift
ali_prefix
|>
Repo
.
shift
"Alignments"
...
...
lib/scripts/format_dn_ds.py
0 → 100644
View file @
4b3ad11d
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Carine Rey <carine.rey@ens-lyon.org>
# - Bastien Boussau <bastien.boussau@univ-lyon1.fr>
# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.
# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".
# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.
# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
import
argparse
import
sys
import
os
,
re
import
logging
import
pandas
as
pd
from
ete3
import
Tree
#===================================================================================================
# inputs
#===================================================================================================
### Option defining
parser
=
argparse
.
ArgumentParser
(
prog
=
"format_dn_ds.py"
,
description
=
''
)
parser
.
add_argument
(
'--version'
,
action
=
'version'
,
version
=
'%(prog)s 1.0'
)
parser
.
add_argument
(
'--debug'
,
action
=
"store_true"
,
help
=
"debug mode"
,
default
=
False
)
##############
requiredOptions
=
parser
.
add_argument_group
(
'REQUIRED OPTIONS'
)
requiredOptions
.
add_argument
(
"--dn"
,
type
=
argparse
.
FileType
(
'r'
),
help
=
'dn tree filename'
,
required
=
True
)
requiredOptions
.
add_argument
(
"--ds"
,
type
=
argparse
.
FileType
(
'r'
),
help
=
'ds tree filename'
,
required
=
True
)
requiredOptions
.
add_argument
(
"--gene_name"
,
type
=
str
,
help
=
'gene name'
,
required
=
True
)
requiredOptions
.
add_argument
(
'-o'
,
'--output_dir'
,
type
=
str
,
help
=
"Output name"
,
required
=
True
)
##############
### Option parsing
args
=
parser
.
parse_args
()
DnFile
=
args
.
dn
.
name
DsFile
=
args
.
ds
.
name
GeneName
=
args
.
gene_name
OutDir
=
args
.
output_dir
#===================================================================================================
# ouput
#===================================================================================================
Dn_tree
=
Tree
(
DnFile
)
Ds_tree
=
Tree
(
DsFile
)
DnDs_tree
=
Dn_tree
.
copy
()
i
=
0
n_ds_bl
=
{}
for
n_ds
in
Ds_tree
.
traverse
(
"postorder"
):
n_ds_bl
[
i
]
=
n_ds
.
dist
i
+=
1
i
=
0
for
n
in
DnDs_tree
.
traverse
(
"postorder"
):
if
n_ds_bl
[
i
]
!=
0
:
n
.
dist
=
n
.
dist
/
float
(
n_ds_bl
[
i
])
else
:
n
.
dist
=
0
i
+=
1
Dn_tree_str
=
Dn_tree
.
write
(
format
=
1
,
format_root_node
=
True
)
Ds_tree_str
=
Ds_tree
.
write
(
format
=
1
,
format_root_node
=
True
)
DnDs_tree_str
=
DnDs_tree
.
write
(
format
=
1
,
format_root_node
=
True
)
with
open
(
OutDir
+
"/dn_tree.tsv"
,
"w"
)
as
File
:
File
.
write
(
GeneName
+
"
\t
"
+
Dn_tree_str
)
File
.
write
(
"
\n
"
)
with
open
(
OutDir
+
"/ds_tree.tsv"
,
"w"
)
as
File
:
File
.
write
(
GeneName
+
"
\t
"
+
Ds_tree_str
)
File
.
write
(
"
\n
"
)
with
open
(
OutDir
+
"/dnds_tree.tsv"
,
"w"
)
as
File
:
File
.
write
(
GeneName
+
"
\t
"
+
DnDs_tree_str
)
File
.
write
(
"
\n
"
)
lib/testnh.ml
View file @
4b3ad11d
...
...
@@ -20,7 +20,7 @@ genetic_code=Standard
input.sequence.format = Fasta
#Sites to use:
# all, nogap or complete (=only resolved chars)
input.sequence.sites_to_use =
complete
input.sequence.sites_to_use =
all
# Specify a maximum amount of gaps: may be an absolute number or a percentage.
input.sequence.max_gap_allowed = 50%
# Remove stop codons
...
...
@@ -33,14 +33,13 @@ input.sequence.remove_saturated_sites=yes
# user or random
init.tree = user
input.tree.format = Nhx
init.brlen.method = Equal(value=1.0)
# ----------------------------------------------------------------------------------------
# Model specification
# ----------------------------------------------------------------------------------------
# See the manual for a description of the syntax and available options.
#
model = YN98(kappa=1, omega=1, frequencies=F1X4, initFreqs=observed, initFreqs.observedPseudoCount=1)
rate_distribution =
Constant #Gamma(n=4, alpha=0.5
)
rate_distribution =
Gamma(n=4
)
# Likelihood recursion option:
# - simple: derivatives takes more time to compute, but likelihood computation is faster.
# For big data sets, it can save a lot of memory usage too, particularly when the data are compressed.
...
...
@@ -76,37 +75,20 @@ optimization.final = none
# Set the quantity of output to the screen:
optimization.verbose = 3
# Parameters to ignore (for instance equilibrium frequencies)
optimization.ignore_parameters = YN98.
freq
*
optimization.ignore_parameters = YN98.*
# Maximum number of likelihood evaluations:
optimization.max_number_f_eval = 1000
0
optimization.max_number_f_eval = 1000
# Precision to reach:
optimization.tolerance =
1 #
0.000001
optimization.tolerance = 0.000001
# idem for error or warning messages:
optimization.message_handler = $(DATA).ml_h.messages
# A file where to dump optimization steps (a file path or std for standard output)
optimization.profiler = $(DATA).ml_h.profile
# Shall we optimize tree topology as well?
optimization.topology = no
# Algorithm to use for topology estimation: only NNI for now
optimization.topology.algorithm = NNI
# NNI method: fast, better or phyml
# You should use the phyml option, since it is really more efficient!
optimization.topology.algorithm_nni.method = phyml
# Number of phyml topology movement steps before reoptimizing parameters:
optimization.topology.nstep = 4
# Shall we estimate parameters before looking for topology movements?
optimization.topology.numfirst = no
# Tolerances: These numbers should not be too low, in order to save computation
# time and also for a better topology estimation.
# The optimization.tolerance parameter will be used for the final optimization
# of numerical parameters.
#
# Tolerance for the prior-topology estimation
optimization.topology.tolerance.before = 100
# Tolerance for the during-topology estimation
optimization.topology.tolerance.during = 100
# Shall we first scale the tree before optimizing parameters? [deprecated]
optimization.scale_first =
yes
optimization.scale_first =
no
# Should we write the resulting tree? none or file name.
output.tree.file = $(DATA).ml_h.dnd
output.tree.format = Newick
...
...
@@ -118,11 +100,6 @@ output.estimates = $(DATA).ml_h.params.bpp
# Bootstrap
# ----------------------------------------------------------------------------------------
bootstrap.number = 0
# Tell if numerical parameters should be kept to their initial value when bootstrapping:
bootstrap.approximate = no
# Set this to yes for detailed output when bootstrapping.
bootstrap.verbose = no
bootstrap.output.file =
"
]
let
mapnh_config
=
...
...
@@ -143,18 +120,15 @@ input.sequence.sites_to_use = all
input.sequence.remove_stop_codons = yes
input.sequence.remove_saturated_sites = yes
map.type = DnDs(code=
VertebrateMitochondrial
)
map.type = DnDs(code=
Standard
)
# Here we use the estimated homogeneous model for mapping:
model = YN98(
frequencies=F3X4,kappa=4.256962136755, omega=0.034993311532, freq_Codon.1_Full.theta=0.456066719064, freq_Codon.1_Full.theta1=0.588175796088, freq_Codon.1_Full.theta2=0.447949418086, freq_Codon.2_Full.theta=0.386701598114, freq_Codon.2_Full.theta1=0.319381792085, freq_Codon.2_Full.theta2=0.307606839923, freq_Codon.3_Full.theta=0.357395860624, freq_Codon.3_Full.theta1=0.656914626158, freq_Codon.3_Full.theta2=0.129523474419
)
model = YN98(
kappa=1, omega=1, frequencies=F1X4, initFreqs=observed, initFreqs.observedPseudoCount=1
)
# Saturation threshold:
count.max = 3
#
count.max = 3
test.global = no
test.branch = yes
test.branch.neighbor = #yes or no
test.branch.negbrlen = no
test.branch.auto_cluster = Marginal(threshold=3)
test.branch = no
output.counts.tree.prefix = CountsTree
...
...
@@ -243,3 +217,33 @@ let calc_dn_ds ~tree ~faa ~(fna:nucleotide_fasta pworkflow) : dn_ds_t =
let
diversity_tsv
=
aa_diversity
~
faa
in
{
dn_tree
;
ds_tree
;
length_tsv
;
diversity_tsv
}
type
dnds_real
=
{
dn_tsv
:
text_file
pworkflow
;
ds_tsv
:
text_file
pworkflow
;
dnds_tsv
:
text_file
pworkflow
;
}
let
format_dn_ds
~
ali_prefix
~
(
dn_tree
:_
workflow
)
~
(
ds_tree
:_
workflow
)
:
directory
pworkflow
=
let
img
=
Env
.
env_py
in
Workflow
.
shell
~
descr
:
(
"format_dn_ds."
^
ali_prefix
)
[
mkdir_p
dest
;
cmd
"python"
~
img
[
file_dump
(
string
Scripts
.
format_dn_ds
)
;
opt
"--dn"
dep
dn_tree
;
opt
"--ds"
dep
ds_tree
;
opt
"--gene_name"
string
ali_prefix
;
opt
"-o"
ident
dest
;
]
]
let
dn_ds_trees_real_data
~
ali_prefix
~
tree
~
fna
()
:
dnds_real
=
let
run_bppml
=
bppml_mapnh
~
descr
:
(
"."
^
ali_prefix
)
~
tree
~
fna
in
let
dn_tree
=
Workflow
.
select
run_bppml
[
"mapping_counts_per_type_dN.dnd"
]
in
let
ds_tree
=
Workflow
.
select
run_bppml
[
"mapping_counts_per_type_dS.dnd"
]
in
let
formated_dnds_trees
=
format_dn_ds
~
ali_prefix
~
dn_tree
~
ds_tree
in
let
dn_tsv
=
Workflow
.
select
formated_dnds_trees
[
"dn_tree.tsv"
]
in
let
ds_tsv
=
Workflow
.
select
formated_dnds_trees
[
"ds_tree.tsv"
]
in
let
dnds_tsv
=
Workflow
.
select
formated_dnds_trees
[
"dnds_tree.tsv"
]
in
{
dn_tsv
;
ds_tsv
;
dnds_tsv
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment