Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
VEBER Philippe
codepi
Commits
591a053a
Commit
591a053a
authored
Sep 04, 2018
by
LANORE Vincent
Browse files
Pair generation now takes a seed.
parent
bb0c512b
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
25 additions
and
19 deletions
+25
-19
lib/pipeline.ml
lib/pipeline.ml
+3
-3
lib/profile.ml
lib/profile.ml
+4
-3
lib/scripts/generate_pairs.py
lib/scripts/generate_pairs.py
+18
-13
No files found.
lib/pipeline.ml
View file @
591a053a
...
...
@@ -375,7 +375,7 @@ let detection_main ~outdir ~indir ?(np = 2) ?(mem = 2) ~preview ~fast_mode ?(see
let
simulation_main
~
outdir
?
(
ns
=
0
)
?
(
np
=
2
)
?
(
mem
=
2
)
~
tree_dir
~
profile_fn
~
preview
~
use_concat
~
no_Ne
~
no_HaPC
~
add_indels
?
(
seed
=
Random
.
int
Int
.
max_value
)
()
=
let
nb_sites
=
if
ns
=
0
then
(
if
preview
then
20
else
50
)
else
ns
in
let
profile
=
Profile
.
profile_l_of_splitted_profile
~
nb_cat
:
1
~
nb_sites
profile_fn
in
let
profile
=
Profile
.
profile_l_of_splitted_profile
~
nb_cat
:
1
~
nb_sites
profile_fn
~
seed
:
(
Random
.
int
Int
.
max_value
)
in
let
trees
=
Array
.
to_list
@@
Sys
.
readdir
tree_dir
in
let
dataset_l
=
derive_sim
~
tree_dir
~
trees
~
profile
~
preview
~
use_concat
~
ns
~
no_Ne
~
no_HaPC
~
ne_test
:
false
~
add_indels
~
seed
in
...
...
@@ -384,11 +384,11 @@ let simulation_main ~outdir ?(ns = 0) ?(np = 2) ?(mem = 2) ~tree_dir ~profile_fn
let
validation_main
~
outdir
?
(
indir
=
""
)
?
(
ns
=
0
)
?
(
np
=
2
)
?
(
mem
=
2
)
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile_fn
~
use_concat
~
add_indels
~
only_simu
?
(
seed
=
Random
.
int
Int
.
max_value
)
()
=
printf
"Global seed: %i
\n
"
seed
;
Out_channel
.
write_all
(
Filename
.
concat
outdir
"global.seed"
)
~
data
:
(
string_of_int
seed
);
Out_channel
.
write_all
"global.seed"
~
data
:
(
string_of_int
seed
);
(* simulated trees *)
Random
.
init
seed
;
let
nb_sites
=
if
ns
=
0
then
(
if
preview
then
20
else
50
)
else
ns
in
let
profile
=
Profile
.
profile_l_of_splitted_profile
~
nb_cat
:
3
~
nb_sites
profile_fn
in
let
profile
=
Profile
.
profile_l_of_splitted_profile
~
nb_cat
:
3
~
nb_sites
profile_fn
~
seed
:
(
Random
.
int
Int
.
max_value
)
in
let
sim_repo_l
=
derive_profile
~
indir
~
ns
~
preview
~
fast_mode
~
no_Ne
~
ne_test
~
no_HaPC
~
tree_dir
~
profile
~
use_concat
~
add_indels
~
only_simu
~
seed
()
in
(* real trees *)
let
indir_dataset_l
=
if
indir
=
""
then
[]
else
parse_input_data
~
seed
indir
in
...
...
lib/profile.ml
View file @
591a053a
...
...
@@ -10,7 +10,7 @@ type profile = {
}
let
split_profile
~
nb_sites
~
dist_bins
profile_f
:
text_file
directory
workflow
=
let
split_profile
~
nb_sites
~
dist_bins
profile_f
~
seed
:
text_file
directory
workflow
=
let
env
=
docker_image
~
account
:
"carinerey"
~
name
:
"python_basics"
~
tag
:
"07252018"
()
in
let
package
=
tmp
//
"diffsel_script_utils.py"
in
let
script
=
tmp
//
"generate_pairs.py"
in
...
...
@@ -33,6 +33,7 @@ let split_profile ~nb_sites ~dist_bins profile_f : text_file directory workflow
opt
"-o"
ident
prefix
;
opt
"-s"
int
nb_sites
;
opt
"-b"
string
dist_bins
;
opt
"-r"
int
seed
;
dep
profile_f
;
]
]
...
...
@@ -48,7 +49,7 @@ let cat_file ~(f_l: text_file workflow list) : text_file workflow =
]
let
profile_l_of_splitted_profile
~
nb_cat
~
nb_sites
profile_fn
=
let
profile_l_of_splitted_profile
~
nb_cat
~
nb_sites
profile_fn
~
seed
=
let
profile_f
=
input
profile_fn
in
let
prefix
=
Filename
.
chop_extension
(
Filename
.
basename
profile_fn
)
in
let
dist_bins
=
match
nb_cat
with
...
...
@@ -56,7 +57,7 @@ let profile_l_of_splitted_profile ~nb_cat ~nb_sites profile_fn =
|
1
->
"[0.01,0.7]"
|
_
->
failwith
{
|
nbcat
must
be
1
or
3
|
}
in
let
splitted_profile
=
split_profile
~
nb_sites
~
dist_bins
profile_f
in
let
splitted_profile
=
split_profile
~
nb_sites
~
dist_bins
profile_f
~
seed
in
match
nb_cat
with
|
3
->
(
let
_p0
=
splitted_profile
/
selector
[
"profile_0.tsv"
]
in
...
...
lib/scripts/generate_pairs.py
View file @
591a053a
...
...
@@ -33,20 +33,21 @@ Usage:
generate_pairs.py [options...] -o <output-prefix> <profiles-file>
Positional arguments:
profiles-file the file to read profile from
profiles-file
the file to read profile from
Options:
-h, --help show this help message and exit
-h, --help
show this help message and exit
-o, --output-prefix <filename>
output prefix; files will be names <prefix>_1.tsv,
<prefix>_2.tsv and so on
-b, --bins <spec> bin specification in the form of a list of intervals
[default: [0.01,0.4],[0.4,0.6],[0.6,2]]
-s, --bin-size <size> number of pairs per bin [default: 100]
-e, --euclid use euclidien distance (instead of Jensen-Shannon
distance) [default: False]
-p, --plot show plot of pair distributions, otherwise it will
be written to a png file [default: False]"""
output prefix; files will be names <prefix>_1.tsv,
<prefix>_2.tsv and so on
-b, --bins <spec> bin specification in the form of a list of intervals
[default: [0.01,0.4],[0.4,0.6],[0.6,2]]
-s, --bin-size <size> number of pairs per bin [default: 100]
-e, --euclid use euclidien distance (instead of Jensen-Shannon
distance) [default: False]
-p, --plot show plot of pair distributions, otherwise it will
be written to a png file [default: False]
-r, --random-seed <int> a seed for the random generator [default: -1]"""
from
diffsel_script_utils
import
*
...
...
@@ -67,6 +68,10 @@ disttype = "euclidian" if args["--euclid"] else "Jensen-Shannon"
MESSAGE
(
"Distance to use is "
+
param
(
disttype
))
showplot
=
args
[
"--plot"
]
MESSAGE
(
"Showing distance histogram is set to "
+
param
(
showplot
==
1
))
rseed
=
int
(
args
[
"--random-seed"
][
0
])
rseed
=
random
.
randint
(
0
,
sys
.
maxsize
)
if
rseed
==
-
1
else
rseed
MESSAGE
(
"Random seed is "
+
param
(
rseed
))
random
.
seed
(
rseed
)
#===================================================================================================
STEP
(
"Parsing bin specification"
)
...
...
@@ -147,7 +152,7 @@ def jensen_shannon_distance(p1, p2):
return
sqrt
(
jensen_shannon_divergence
(
p1
,
p2
))
def
entropy_p
(
p
):
print
(
p
)
#
print(p)
return
entropy
(
p
)
MESSAGE
(
"Preparing a dataframe for every bin..."
)
...
...
@@ -196,7 +201,7 @@ else:
plt
.
savefig
(
plotfile
)
#===================================================================================================
print
(
step
(
"Writing result to file"
)
)
STEP
(
"Writing result to file"
)
from
numpy
import
array_split
for
i
in
range
(
nb_bins
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment