Commit f46a8534 authored by LANORE Vincent
Browse files

Added an option to specify number of pairs per distance class.

parent 5cb204b1
......@@ -33,15 +33,16 @@ Usage:
generate_pairs.py [options...] -o <output-prefix> <profiles-file>
Positional arguments:
profiles-file the file to read profile from
profiles-file the file to read profile from
Options:
-h, --help show this help message and exit
-h, --help show this help message and exit
-o, --output-prefix <filename>
output prefix; files will be names <prefix>_1.tsv,
<prefix>_2.tsv and so on
-b, --bins <spec> bin specification in the form of a list of intervals
[default: [0,0.4],[0.4,0.6],[0.6,2]]"""
output prefix; files will be names <prefix>_1.tsv,
<prefix>_2.tsv and so on
-b, --bins <spec> bin specification in the form of a list of intervals
[default: [0,0.4],[0.4,0.6],[0.6,2]]
-s, --bin-size <size> number of pairs per bin [default: 100]"""
from diffsel_script_utils import *
......@@ -56,6 +57,8 @@ out = args["--output-prefix"]
MESSAGE("Output prefix is " + param(out))
binspec = args["--bins"][0]
MESSAGE("Bin specification is " + param(binspec))
binsize = int(args["--bin-size"][0])
MESSAGE("Bin size is " + param(binsize))
#===================================================================================================
STEP("Parsing bin specification")
......@@ -137,8 +140,7 @@ columns = ["p1_" + str(i) for i in range(20)] + ["p2_" + str(i) for i in range(2
pair_bins = [pd.DataFrame(columns = columns) for x in range(nb_bins)]
MESSAGE("Picking profile pairs and computing distances...")
nb_pairs_per_bin = 100
while min(map(lambda b: b.shape[0], pair_bins)) < nb_pairs_per_bin:
while min(map(lambda b: b.shape[0], pair_bins)) < binsize:
p1 = pick(profiles)
p2 = pick(profiles)
dist = jensen_shannon_distance(p1, p2)
......@@ -146,7 +148,7 @@ while min(map(lambda b: b.shape[0], pair_bins)) < nb_pairs_per_bin:
# dist = euclidian_distance(p1, p2)
for i in range(nb_bins):
if in_bin(dist, i) and pair_bins[i].shape[0] < nb_pairs_per_bin:
if in_bin(dist, i) and pair_bins[i].shape[0] < binsize:
new_row = p1.tolist() + p2.tolist() + [dist]
pair_bins[i].loc[len(pair_bins[i])] = new_row
break
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment