Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
VEBER Philippe
codepi
Commits
e96940c6
Commit
e96940c6
authored
Dec 15, 2018
by
Philippe Veber
Browse files
refactored bppseqgen
parent
6fbc0cea
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
139 additions
and
181 deletions
+139
-181
Makefile
Makefile
+2
-2
lib/bppsuite.ml
lib/bppsuite.ml
+112
-66
lib/bppsuite.mli
lib/bppsuite.mli
+21
-29
lib/convergence_hypothesis.ml
lib/convergence_hypothesis.ml
+0
-57
lib/debug.ml
lib/debug.ml
+0
-20
lib/pipeline.ml
lib/pipeline.ml
+4
-7
No files found.
Makefile
View file @
e96940c6
...
...
@@ -57,8 +57,8 @@ clean:
rm
-rf
example/_bistro
rm
-rf
example/outdir
.PHONY
:
clean
_
test
clean
_
test
:
.PHONY
:
clean
-
test
clean
-
test
:
rm
-rf
example/_bistro
rm
-rf
example/outdir_test
rm
-rf
example/report.log
...
...
lib/bppsuite.ml
View file @
e96940c6
...
...
@@ -4,8 +4,6 @@ open Bistro.EDSL
open
Bistro_bioinfo
.
Std
open
File_formats
type
bppseqgen_multi_profiles
let
env
=
Env
.
env_bppsuite
let
assign
k
v
=
...
...
@@ -31,44 +29,87 @@ let conf_file_bppseqgen ~tree ~out ~nb_sites ~config =
@
config
)
(** [bppseqgen ?descr ~nb_sites ~tree ~config] builds a workflow that runs
    the [bppseqgen] program inside the [env] docker image and returns the
    simulated alignment.

    Steps, in order:
    - create [dest];
    - dump the template produced by [conf_file_bppseqgen] into
      [dest/config.bpp] (through [cat] with redirected stdout);
    - call [bppseqgen] with the [param] argument bound to that file.

    The result is the [seq.fa] file selected from the output directory.
    [descr] is appended verbatim to the ["bppsuite.bppseqgen"] workflow
    description. *)
let bppseqgen ?(descr="") ~nb_sites ~tree ~config : nucleotide_fasta workflow =
  let config_f = dest // "config.bpp" in
  let out = dest // "seq.fa" in
  let write_config =
    cmd "cat" ~stdout:config_f [
      (file_dump (conf_file_bppseqgen ~tree ~out ~nb_sites ~config)) ;
    ]
  in
  let simulate = cmd "bppseqgen" [ assign "param" config_f ] in
  let run_dir =
    workflow ~descr:("bppsuite.bppseqgen" ^ descr) [
      docker env (and_list [ mkdir_p dest ; write_config ; simulate ])
    ]
  in
  run_dir / selector [ "seq.fa" ]
module
Bppseqgen
=
struct
(* Option text common to the generated bppseqgen configurations: it is the
   first element of the list built by [bpp_config_F], wrapped with [string].
   It sets the alphabet, genetic code, input tree format, internal sequence
   output, the nonhomogeneous mode and the rate distribution.
   NOTE(review): the line breaks and spacing inside this quoted literal were
   lost when the file was extracted; the text is presumably one [key=value]
   option per line -- restore it from the repository before relying on it. *)
let
bpp_config_base
=
{
|
alphabet
=
Codon
(
letter
=
DNA
)
genetic_code
=
Standard
input
.
tree
.
format
=
Nhx
output
.
internal
.
sequences
=
no
nonhomogeneous
=
general
rate_distribution
=
Constant
()
|
}
let
conf_file_bppseqgen_multi_profiles
~
tree
~
profile_f
~
ne_c
~
ne_a
~
config
~
nb_sites_per_profile
=
seq
~
sep
:
"
\n
"
(
[
assign
"input.tree.file"
(
dep
tree
)
;
assign
"input.tree.scale"
(
int
3
)
;
assign
"PROFILE_F"
(
dep
profile_f
)
;
assign
"number_of_sites"
(
int
nb_sites_per_profile
)
;
assign
"NE_1"
(
float
ne_a
)
;
assign
"NE_C"
(
float
ne_c
)
;
assign
"NE_T"
(
float
ne_c
)
;
(* let bpp_config_H0_F= seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPCOC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2)" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
] *)
(** Model section used for the [H0] hypothesis (see [bpp_config_of_model]).

    It declares two [Codon_AAFit] models that both read fitness column
    [$(COL_M1)] of [$(PROFILE_F)]: [model1] with [Ns=$(NE_1)] and [model2]
    with [Ns=$(NE_C)]. Root frequencies are taken from [model1]. The three
    lines are joined with newlines. *)
let bpp_config_H0_F_Ne =
  let model1 =
    seq [ string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ]
  in
  let model2 =
    seq [ string "model2=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_C))" ]
  in
  let root_freq =
    seq [ string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ]
  in
  seq ~sep:"\n" [ model1 ; model2 ; root_freq ]
@
config
)
let
bppseqgen_multi_profiles_script
~
config
~
out
~
profile_c
~
seed
=
let
vars
=
[
"FINAL_OUT"
,
ident
out
;
"PARAM"
,
config
;
"PROFILE_C"
,
ident
profile_c
;
"RANDOM"
,
int
seed
;
(** Model section used for the [HaPCOC] hypothesis (see
    [bpp_config_of_model]).

    - [model1]: [Codon_AAFit] on fitness column [$(COL_M1)], [Ns=$(NE_1)];
    - [modelT]: a [OneChange] wrapper ([register=DnDs], [numReg=2],
      [Ns=$(NE_T)]) around a [Codon_AAFit] on column [$(COL_M2)];
    - [modelC]: [Codon_AAFit] on column [$(COL_M2)], [Ns=$(NE_C)];
    - root frequencies are taken from [model1].

    The four lines are joined with newlines. *)
let bpp_config_HaPCOC_F_Ne =
  let model1 =
    seq [ string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ]
  in
  let model_t =
    seq [ string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2, Ns=$(NE_T))" ]
  in
  let model_c =
    seq [ string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ]
  in
  let root_freq =
    seq [ string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ]
  in
  seq ~sep:"\n" [ model1 ; model_t ; model_c ; root_freq ]
(** Model section used for the [HaPC] hypothesis (see [bpp_config_of_model]).

    - [model1]: [Codon_AAFit] on fitness column [$(COL_M1)], [Ns=$(NE_1)];
    - [modelT]: [Codon_AAFit] on column [$(COL_M2)], [Ns=$(NE_T)];
    - [modelC]: [Codon_AAFit] on column [$(COL_M2)], [Ns=$(NE_C)];
    - root frequencies are taken from [model1].

    The four lines are joined with newlines. *)
let bpp_config_HaPC_F_Ne =
  let model1 =
    seq [ string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ]
  in
  let model_t =
    seq [ string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_T))" ]
  in
  let model_c =
    seq [ string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ]
  in
  let root_freq =
    seq [ string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ]
  in
  seq ~sep:"\n" [ model1 ; model_t ; model_c ; root_freq ]
(** Picks the model section matching a convergence hypothesis. The payload
    carried by each constructor is ignored; only the constructor selects
    the template. *)
let bpp_config_of_model (m : Convergence_hypothesis.t) =
  match m with
  | Convergence_hypothesis.H0 _ -> bpp_config_H0_F_Ne
  | Convergence_hypothesis.HaPC _ -> bpp_config_HaPC_F_Ne
  | Convergence_hypothesis.HaPCOC _ -> bpp_config_HaPCOC_F_Ne
(** Template fragments for hypothesis [m]: the shared [bpp_config_base]
    text first, then the hypothesis-specific model section. *)
let bpp_config_F m =
  let base = string bpp_config_base in
  let models = bpp_config_of_model m in
  [ base ; models ]
in
bash_script
vars
{
|
(** Template of the bppseqgen configuration file for a multi-profile run.

    The file starts with [key=value] assignments (tree file, a fixed tree
    scale of 3, the profile file, the number of sites, and the [NE_1],
    [NE_C] and [NE_T] variables), followed by the fragments returned by
    [bpp_config_F hypothesis]. All entries are separated by newlines.

    [NE_1] receives [ne_a]; both [NE_C] and [NE_T] receive [ne_c]. *)
let conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~ne_c ~ne_a ~hypothesis ~nb_sites_per_profile =
  let assignments = [
    assign "input.tree.file" (dep tree) ;
    assign "input.tree.scale" (int 3) ;
    assign "PROFILE_F" (dep profile_f) ;
    assign "number_of_sites" (int nb_sites_per_profile) ;
    assign "NE_1" (float ne_a) ;
    assign "NE_C" (float ne_c) ;
    assign "NE_T" (float ne_c) ;
  ]
  in
  seq ~sep:"\n" (assignments @ bpp_config_F hypothesis)
let
bppseqgen_multi_profiles_script
~
config
~
out
~
profile_c
~
seed
=
let
vars
=
[
"FINAL_OUT"
,
ident
out
;
"PARAM"
,
config
;
"PROFILE_C"
,
ident
profile_c
;
"RANDOM"
,
int
seed
;
]
in
bash_script
vars
{
|
i
=
0
while
read
-
r
line
...
...
@@ -89,39 +130,44 @@ let bppseqgen_multi_profiles_script ~config ~out ~profile_c ~seed =
|
}
(** [multi_profiles ?descr ~profile_f ~profile_c ~tree ~tree_dataset
    ~hypothesis ~ne_c ~ne_a ~seed] builds the multi-profile simulation
    workflow, run inside the [env] docker image.

    Steps, in order:
    - create [dest] and [tmp], then move into [tmp];
    - write [dest/config.bpp] by concatenating (with [cat]) the template
      from [conf_file_bppseqgen_multi_profiles], a newline, and the node
      file obtained from [Tree_dataset.nodes tree_dataset hypothesis];
    - run with [bash] the script built by
      [bppseqgen_multi_profiles_script], giving it the configuration path,
      the [dest/seq.fa] output path, the [profile_c] file and [seed].

    The number of sites per profile is fixed to 1. [descr] is appended
    verbatim to the ["bppsuite.bppseqgen"] workflow description. *)
let multi_profiles ?(descr="") ~profile_f ~profile_c ~tree ~tree_dataset ~hypothesis ~ne_c ~ne_a ~seed =
  let nb_sites_per_profile = 1 in
  let config_f = dest // "config.bpp" in
  let out = dest // "seq.fa" in
  (* The profile file is used as is; an earlier variant truncated it with
     [head -n nb_sites] into [tmp // "profiles_c.tsv"]. *)
  let profile_c_ok = dep profile_c in
  let nodes = Tree_dataset.nodes tree_dataset hypothesis in
  let write_config =
    cmd "cat" ~stdout:config_f [
      file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~hypothesis ~ne_c ~ne_a ~nb_sites_per_profile) ;
      file_dump (seq ~sep:"\n" [ string "\n" ]) ;
      dep nodes ;
    ]
  in
  let run_script =
    cmd "bash" [
      (file_dump (bppseqgen_multi_profiles_script ~config:config_f ~out ~profile_c:profile_c_ok ~seed)) ;
    ]
  in
  workflow ~descr:("bppsuite.bppseqgen" ^ descr) [
    docker env (
      and_list [
        mkdir_p dest ;
        mkdir_p tmp ;
        cd tmp ;
        write_config ;
        run_script ;
      ]
    )
  ]
(** Selects the [seq.fa] file inside the directory produced by a simulation
    run. *)
let alignment run : nucleotide_fasta workflow =
  let fasta = selector [ "seq.fa" ] in
  run / fasta
(** Selects the [seq.fa.info] file inside the directory produced by a
    simulation run. *)
let info run : text_file workflow =
  let info_file = selector [ "seq.fa.info" ] in
  run / info_file
end
(** [bppseqgen_multi_profiles ?descr ~profile_f ~profile_c ~nb_sites ~tree
    ~config ~ne_c ~ne_a ~nodes ~seed] builds the multi-profile simulation
    workflow, run inside the [env] docker image.

    Steps, in order:
    - create [dest] and [tmp], then move into [tmp];
    - write [dest/config.bpp] by concatenating (with [cat]) the template
      from [conf_file_bppseqgen_multi_profiles], a newline, and the
      [nodes] file;
    - run with [bash] the script built by
      [bppseqgen_multi_profiles_script], giving it the configuration path,
      the [dest/seq.fa] output path, the [profile_c] file and [seed].

    The number of sites per profile is fixed to 1; [nb_sites] is accepted
    but not read by this function. [descr] is appended verbatim to the
    ["bppsuite.bppseqgen"] workflow description. *)
let bppseqgen_multi_profiles ?(descr="") ~profile_f ~profile_c ~nb_sites ~tree ~config ~ne_c ~ne_a ~nodes ~seed : bppseqgen_multi_profiles directory workflow =
  let nb_sites_per_profile = 1 in
  let config_f = dest // "config.bpp" in
  let out = dest // "seq.fa" in
  (* The profile file is used as is; an earlier variant truncated it with
     [head -n nb_sites] into [tmp // "profiles_c.tsv"]. *)
  let profile_c_ok = dep profile_c in
  let write_config =
    cmd "cat" ~stdout:config_f [
      file_dump (conf_file_bppseqgen_multi_profiles ~tree ~profile_f ~config ~ne_c ~ne_a ~nb_sites_per_profile) ;
      file_dump (seq ~sep:"\n" [ string "\n" ]) ;
      dep nodes ;
    ]
  in
  let run_script =
    cmd "bash" [
      (file_dump (bppseqgen_multi_profiles_script ~config:config_f ~out ~profile_c:profile_c_ok ~seed)) ;
    ]
  in
  workflow ~descr:("bppsuite.bppseqgen" ^ descr) [
    docker env (
      and_list [
        mkdir_p dest ;
        mkdir_p tmp ;
        cd tmp ;
        write_config ;
        run_script ;
      ]
    )
  ]
(** Selects the [seq.fa] file inside the directory produced by a
    multi-profile run. *)
let bppseqgen_multi_profiles_get_fa run : nucleotide_fasta workflow =
  let fasta = selector [ "seq.fa" ] in
  run / fasta
(** Selects the [seq.fa.info] file inside the directory produced by a
    multi-profile run. *)
let bppseqgen_multi_profiles_get_info run : text_file workflow =
  let info_file = selector [ "seq.fa.info" ] in
  run / info_file
let
conf_file_bppseqman_fna2faa
~
fna
=
seq
~
sep
:
"
\n
"
[
...
...
lib/bppsuite.mli
View file @
e96940c6
...
...
@@ -2,35 +2,27 @@ open Bistro.Std
open
Bistro_bioinfo
.
Std
open
File_formats
type
bppseqgen_multi_profiles
(** Single-configuration bppseqgen run. Takes the number of sites, an NHX
    tree and extra configuration templates; yields a nucleotide FASTA
    alignment. [descr] is an optional suffix for the workflow description. *)
val bppseqgen :
  ?descr:string ->
  nb_sites:int ->
  tree:nhx workflow ->
  config:Bistro.Template.t list ->
  nucleotide_fasta workflow
(** Multi-profile bppseqgen run. Takes two profile files, a number of
    sites, an NHX tree, extra configuration templates, two [float]
    parameters ([ne_c], [ne_a]), a node file and a seed; yields an output
    directory to be read with {!bppseqgen_multi_profiles_get_fa} and
    {!bppseqgen_multi_profiles_get_info}. *)
val bppseqgen_multi_profiles :
  ?descr:string ->
  profile_f:text_file workflow ->
  profile_c:text_file workflow ->
  nb_sites:int ->
  tree:nhx workflow ->
  config:Bistro.Template.t list ->
  ne_c:float ->
  ne_a:float ->
  nodes:text_file workflow ->
  seed:int ->
  bppseqgen_multi_profiles directory workflow
(** Nucleotide FASTA alignment contained in a multi-profile run directory. *)
val bppseqgen_multi_profiles_get_fa :
  bppseqgen_multi_profiles directory workflow ->
  nucleotide_fasta workflow
(** Text file of run information contained in a multi-profile run
    directory. *)
val bppseqgen_multi_profiles_get_info :
  bppseqgen_multi_profiles directory workflow ->
  text_file workflow
(** Sequence simulation with bppseqgen. *)
module Bppseqgen : sig
  (** Multi-profile simulation. Takes two profile files, an NHX tree, a
      tree dataset directory, the convergence hypothesis to simulate, two
      [float] parameters ([ne_c], [ne_a]) and a seed; yields an output
      directory to be read with {!alignment} and {!info}. [descr] is an
      optional suffix for the workflow description. *)
  val multi_profiles :
    ?descr:string ->
    profile_f:text_file workflow ->
    profile_c:text_file workflow ->
    tree:nhx workflow ->
    tree_dataset:[`tree_dataset] directory workflow ->
    hypothesis:Convergence_hypothesis.t ->
    ne_c:float ->
    ne_a:float ->
    seed:int ->
    [`bppseqgen] directory workflow

  (** Nucleotide FASTA alignment contained in a run directory. *)
  val alignment :
    [`bppseqgen] directory workflow ->
    nucleotide_fasta workflow

  (** Text file of run information contained in a run directory. *)
  val info :
    [`bppseqgen] directory workflow ->
    text_file workflow
end
val
fna2faa
:
fna
:
nucleotide_fasta
workflow
->
...
...
lib/convergence_hypothesis.ml
View file @
e96940c6
...
...
@@ -37,60 +37,3 @@ let h0_hapc_forall nes_list =
(** [assign k v] is the template [k=v]: the literal key [k], an equals
    sign, then the template [v]. *)
let assign k v =
  let key = string k in
  seq ~sep:"=" [ key ; v ]
(* Option text common to the generated bppseqgen configurations: it is the
   first element of the list built by [bpp_config_F], wrapped with [string].
   It sets the alphabet, genetic code, input tree format, internal sequence
   output, the nonhomogeneous mode and the rate distribution.
   NOTE(review): the line breaks and spacing inside this quoted literal were
   lost when the file was extracted; the text is presumably one [key=value]
   option per line -- restore it from the repository before relying on it. *)
let
bpp_config_base
=
{
|
alphabet
=
Codon
(
letter
=
DNA
)
genetic_code
=
Standard
input
.
tree
.
format
=
Nhx
output
.
internal
.
sequences
=
no
nonhomogeneous
=
general
rate_distribution
=
Constant
()
|
}
(* let bpp_config_H0_F= seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPCOC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2)" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
]
let bpp_config_HaPC_F = seq ~sep:"\n" [
seq [string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)))" ] ;
seq [string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)))" ] ;
seq [string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ] ;
] *)
(** Model section used for the [H0] hypothesis (see [bpp_config_of_model]).

    It declares two [Codon_AAFit] models that both read fitness column
    [$(COL_M1)] of [$(PROFILE_F)]: [model1] with [Ns=$(NE_1)] and [model2]
    with [Ns=$(NE_C)]. Root frequencies are taken from [model1]. The three
    lines are joined with newlines. *)
let bpp_config_H0_F_Ne =
  let model1 =
    seq [ string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ]
  in
  let model2 =
    seq [ string "model2=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_C))" ]
  in
  let root_freq =
    seq [ string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ]
  in
  seq ~sep:"\n" [ model1 ; model2 ; root_freq ]
(** Model section used for the [HaPCOC] hypothesis (see
    [bpp_config_of_model]).

    - [model1]: [Codon_AAFit] on fitness column [$(COL_M1)], [Ns=$(NE_1)];
    - [modelT]: a [OneChange] wrapper ([register=DnDs], [numReg=2],
      [Ns=$(NE_T)]) around a [Codon_AAFit] on column [$(COL_M2)];
    - [modelC]: [Codon_AAFit] on column [$(COL_M2)], [Ns=$(NE_C)];
    - root frequencies are taken from [model1].

    The four lines are joined with newlines. *)
let bpp_config_HaPCOC_F_Ne =
  let model1 =
    seq [ string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ]
  in
  let model_t =
    seq [ string "modelT=OneChange(model=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2))), register=DnDs, numReg=2, Ns=$(NE_T))" ]
  in
  let model_c =
    seq [ string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ]
  in
  let root_freq =
    seq [ string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ]
  in
  seq ~sep:"\n" [ model1 ; model_t ; model_c ; root_freq ]
(** Model section used for the [HaPC] hypothesis (see [bpp_config_of_model]).

    - [model1]: [Codon_AAFit] on fitness column [$(COL_M1)], [Ns=$(NE_1)];
    - [modelT]: [Codon_AAFit] on column [$(COL_M2)], [Ns=$(NE_T)];
    - [modelC]: [Codon_AAFit] on column [$(COL_M2)], [Ns=$(NE_C)];
    - root frequencies are taken from [model1].

    The four lines are joined with newlines. *)
let bpp_config_HaPC_F_Ne =
  let model1 =
    seq [ string "model1=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M1)), Ns=$(NE_1))" ]
  in
  let model_t =
    seq [ string "modelT=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_T))" ]
  in
  let model_c =
    seq [ string "modelC=Codon_AAFit(model=K80, fitness=Empirical(file=$(PROFILE_F), col=$(COL_M2)), Ns=$(NE_C))" ]
  in
  let root_freq =
    seq [ string "nonhomogeneous.root_freq=FromModel(model=$(model1))" ]
  in
  seq ~sep:"\n" [ model1 ; model_t ; model_c ; root_freq ]
(** Picks the model section matching a convergence hypothesis. The payload
    carried by each constructor is ignored; only the constructor selects
    the template. *)
let bpp_config_of_model = function
  | H0 _ -> bpp_config_H0_F_Ne
  | HaPC _ -> bpp_config_HaPC_F_Ne
  | HaPCOC _ -> bpp_config_HaPCOC_F_Ne
(** Template fragments for hypothesis [m]: the shared [bpp_config_base]
    text first, then the hypothesis-specific model section. *)
let bpp_config_F m =
  let base = string bpp_config_base in
  let models = bpp_config_of_model m in
  [ base ; models ]
lib/debug.ml
View file @
e96940c6
...
...
@@ -17,23 +17,3 @@ let workflow_of_template t =
]
(** [config_file ?tree_dir ?tree ()] builds a debugging workflow whose
    output is the bppseqgen configuration generated for the
    [H0 (Fixed 1.)] hypothesis, followed by a newline and the node file
    computed by [Tree_dataset.nodes] for the prepared tree dataset.

    @param tree_dir directory containing the input tree. Defaults to the
      path that used to be hard-coded here, which only exists on the
      original author's machine; pass another directory to run elsewhere.
    @param tree file name of the NHX tree inside [tree_dir] (default
      ["tree_small_bl.nhx"]). Its extension-less name is appended to the
      ["simulated_data."] description of the tree dataset. *)
let config_file
    ?(tree_dir = "/home/pveber/w/reviewphiltrans/example/trees_test/")
    ?(tree = "tree_small_bl.nhx")
    () =
  let open Bistro.EDSL in
  let model = Convergence_hypothesis.(H0 (Fixed 1.)) in
  let input_tree = Bistro.EDSL.input (Filename.concat tree_dir tree) in
  let tree_prefix = Filename.chop_extension tree in
  let tree_dataset =
    Tree_dataset.prepare ~descr:("simulated_data." ^ tree_prefix) input_tree
  in
  let nodes = Tree_dataset.nodes tree_dataset model in
  workflow [
    cmd "cat" ~stdout:dest [
      (* Configuration text for [model], terminated by a newline entry. *)
      file_dump (Bistro.EDSL.seq ~sep:"\n" (Convergence_hypothesis.bpp_config_F model @ [ string "\n" ])) ;
      dep nodes ;
    ] ;
  ]
(** Debug entry point: builds the configuration workflow and hands it to
    [less]. *)
let main () =
  let wf = config_file () in
  less wf
lib/pipeline.ml
View file @
e96940c6
...
...
@@ -53,8 +53,6 @@ let calc_fixed_seed ~(str:string) (seed:int) : int =
let
derive_from_model
~
model
~
input_tree
~
tree_dataset
~
tree_prefix
~
profile
~
preview
~
ns
~
seed
=
let
model_prefix
=
Convergence_hypothesis
.
string_of_model
model
in
let
nb_sites
=
ns
in
let
nodes
=
Tree_dataset
.
nodes
tree_dataset
model
in
let
tree
=
Tree_dataset
.
tree
tree_dataset
`Simulation
in
let
descr
=
"."
^
model_prefix
^
"."
^
tree_prefix
in
(* only 1 profile or 1 couple of profiles*)
...
...
@@ -62,7 +60,6 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr
let fna = Bppsuite.bppseqgen ~descr ~nb_sites ~tree ~config in
*)
(* with several profiles or couples of profiles *)
let
config_p
=
Convergence_hypothesis
.
bpp_config_F
model
in
let
ne_g
=
neg_of_model
model
in
let
ne_c
=
nec_of_model
model
in
let
ne_a
=
ne_g
in
...
...
@@ -70,9 +67,9 @@ let derive_from_model ~model ~input_tree ~tree_dataset ~tree_prefix ~profile ~pr
let
profile_c
=
profile
.
profile_c
in
(*let seed = Random.int Int.max_value in*)
let
seed
=
calc_fixed_seed
~
str
:
descr
seed
in
let
run_fna
=
Bppsuite
.
b
ppseqgen
_
multi_profiles
~
descr
~
nb_sites
~
tree
~
nodes
~
config
:
config_p
~
profile_f
~
profile_c
~
ne_c
~
ne_a
~
seed
in
let
fna
=
Bppsuite
.
b
ppseqgen
_multi_profiles_get_fa
run_fna
in
let
fna_infos
=
Some
(
Bppsuite
.
b
ppseqgen
_multi_profiles_get_
info
run_fna
)
in
let
run_fna
=
Bppsuite
.
B
ppseqgen
.
multi_profiles
~
descr
~
tree
~
tree_dataset
~
hypothesis
:
model
~
profile_f
~
profile_c
~
ne_c
~
ne_a
~
seed
in
let
fna
=
Bppsuite
.
B
ppseqgen
.
alignment
run_fna
in
let
fna_infos
=
Some
(
Bppsuite
.
B
ppseqgen
.
info
run_fna
)
in
let
faa
=
Bppsuite
.
fna2faa
~
fna
in
let
ready_dataset
=
{
Ready_dataset
.
input_tree
=
input_tree
;
tree_dataset
;
fna
;
faa
;
fna_infos
}
in
...
...
@@ -411,4 +408,4 @@ let realdata_command =
flag
"--seed"
(
optional
int
)
~
doc
:
"INT Global seed"
in
realdata_main
~
outdir
~
indir
?
np
?
mem
~
preview
~
use_diffsel
~
use_c60
?
seed
]
\ No newline at end of file
]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment