Commit 591a053a authored by LANORE Vincent
Browse files

Pair generation now takes a seed.

parent bb0c512b
......@@ -375,7 +375,7 @@ let detection_main ~outdir ~indir ?(np = 2) ?(mem = 2) ~preview ~fast_mode ?(see
let simulation_main ~outdir ?(ns = 0) ?(np = 2) ?(mem = 2) ~tree_dir ~profile_fn ~preview ~use_concat ~no_Ne ~no_HaPC ~add_indels ?(seed = Random.int Int.max_value) () =
let nb_sites = if ns = 0 then (if preview then 20 else 50) else ns in
let profile = Profile.profile_l_of_splitted_profile ~nb_cat:1 ~nb_sites profile_fn in
let profile = Profile.profile_l_of_splitted_profile ~nb_cat:1 ~nb_sites profile_fn ~seed:(Random.int Int.max_value) in
let trees = Array.to_list @@ Sys.readdir tree_dir in
let dataset_l = derive_sim ~tree_dir ~trees ~profile ~preview ~use_concat ~ns ~no_Ne ~no_HaPC ~ne_test:false ~add_indels ~seed in
......@@ -384,11 +384,11 @@ let simulation_main ~outdir ?(ns = 0) ?(np = 2) ?(mem = 2) ~tree_dir ~profile_fn
let validation_main ~outdir ?(indir = "") ?(ns = 0) ?(np = 2) ?(mem = 2) ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile_fn ~use_concat ~add_indels ~only_simu ?(seed = Random.int Int.max_value) () =
printf "Global seed: %i\n" seed;
Out_channel.write_all (Filename.concat outdir "global.seed") ~data:(string_of_int seed);
Out_channel.write_all "global.seed" ~data:(string_of_int seed);
(* simulated trees *)
Random.init seed ;
let nb_sites = if ns = 0 then (if preview then 20 else 50) else ns in
let profile = Profile.profile_l_of_splitted_profile ~nb_cat:3 ~nb_sites profile_fn in
let profile = Profile.profile_l_of_splitted_profile ~nb_cat:3 ~nb_sites profile_fn ~seed:(Random.int Int.max_value) in
let sim_repo_l = derive_profile ~indir ~ns ~preview ~fast_mode ~no_Ne ~ne_test ~no_HaPC ~tree_dir ~profile ~use_concat ~add_indels ~only_simu ~seed () in
(* real trees *)
let indir_dataset_l = if indir = "" then [] else parse_input_data ~seed indir in
......
......@@ -10,7 +10,7 @@ type profile = {
}
let split_profile ~nb_sites ~dist_bins profile_f : text_file directory workflow =
let split_profile ~nb_sites ~dist_bins profile_f ~seed : text_file directory workflow =
let env = docker_image ~account:"carinerey" ~name:"python_basics" ~tag:"07252018" () in
let package = tmp // "diffsel_script_utils.py" in
let script = tmp // "generate_pairs.py" in
......@@ -33,6 +33,7 @@ let split_profile ~nb_sites ~dist_bins profile_f : text_file directory workflow
opt "-o" ident prefix ;
opt "-s" int nb_sites ;
opt "-b" string dist_bins ;
opt "-r" int seed ;
dep profile_f ;
]
]
......@@ -48,7 +49,7 @@ let cat_file ~(f_l: text_file workflow list) : text_file workflow =
]
let profile_l_of_splitted_profile ~nb_cat ~nb_sites profile_fn =
let profile_l_of_splitted_profile ~nb_cat ~nb_sites profile_fn ~seed =
let profile_f = input profile_fn in
let prefix = Filename.chop_extension (Filename.basename profile_fn) in
let dist_bins = match nb_cat with
......@@ -56,7 +57,7 @@ let profile_l_of_splitted_profile ~nb_cat ~nb_sites profile_fn =
| 1 -> "[0.01,0.7]"
| _ -> failwith {| nbcat must be 1 or 3 |}
in
let splitted_profile = split_profile ~nb_sites ~dist_bins profile_f in
let splitted_profile = split_profile ~nb_sites ~dist_bins profile_f ~seed in
match nb_cat with
| 3 -> (
let _p0 = splitted_profile / selector ["profile_0.tsv"] in
......
......@@ -33,20 +33,21 @@ Usage:
generate_pairs.py [options...] -o <output-prefix> <profiles-file>
Positional arguments:
profiles-file the file to read profile from
profiles-file the file to read profile from
Options:
-h, --help show this help message and exit
-h, --help show this help message and exit
-o, --output-prefix <filename>
output prefix; files will be names <prefix>_1.tsv,
<prefix>_2.tsv and so on
-b, --bins <spec> bin specification in the form of a list of intervals
[default: [0.01,0.4],[0.4,0.6],[0.6,2]]
-s, --bin-size <size> number of pairs per bin [default: 100]
-e, --euclid use euclidien distance (instead of Jensen-Shannon
distance) [default: False]
-p, --plot show plot of pair distributions, otherwise it will
be written to a png file [default: False]"""
output prefix; files will be names <prefix>_1.tsv,
<prefix>_2.tsv and so on
-b, --bins <spec> bin specification in the form of a list of intervals
[default: [0.01,0.4],[0.4,0.6],[0.6,2]]
-s, --bin-size <size> number of pairs per bin [default: 100]
-e, --euclid use euclidien distance (instead of Jensen-Shannon
distance) [default: False]
-p, --plot show plot of pair distributions, otherwise it will
be written to a png file [default: False]
-r, --random-seed <int> a seed for the random generator [default: -1]"""
from diffsel_script_utils import *
......@@ -67,6 +68,10 @@ disttype = "euclidian" if args["--euclid"] else "Jensen-Shannon"
MESSAGE("Distance to use is " + param(disttype))
showplot = args["--plot"]
MESSAGE("Showing distance histogram is set to " + param(showplot == 1))
rseed = int(args["--random-seed"][0])
rseed = random.randint(0, sys.maxsize) if rseed == -1 else rseed
MESSAGE("Random seed is " + param(rseed))
random.seed(rseed)
#===================================================================================================
STEP("Parsing bin specification")
......@@ -147,7 +152,7 @@ def jensen_shannon_distance(p1, p2):
return sqrt(jensen_shannon_divergence(p1, p2))
def entropy_p(p):
print(p)
# print(p)
return entropy(p)
MESSAGE("Preparing a dataframe for every bin...")
......@@ -196,7 +201,7 @@ else:
plt.savefig(plotfile)
#===================================================================================================
print(step("Writing result to file"))
STEP("Writing result to file")
from numpy import array_split
for i in range(nb_bins):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment