Commit 5cb204b1 authored by LANORE Vincent
Browse files

New bin logic now properly works :)

parent 31565450
...@@ -61,8 +61,9 @@ MESSAGE("Bin specification is " + param(binspec)) ...@@ -61,8 +61,9 @@ MESSAGE("Bin specification is " + param(binspec))
STEP("Parsing bin specification") STEP("Parsing bin specification")
try: try:
binspec = binspec.split("],[") binspec = binspec.split("],[")
nb_bins = len(binspec)
binspec[0] = binspec[0][1:] # removing [ and ] at the beginning and end binspec[0] = binspec[0][1:] # removing [ and ] at the beginning and end
binspec[len(binspec)-1] = binspec[len(binspec)-1][:-1] binspec[nb_bins-1] = binspec[nb_bins-1][:-1]
binspec = list(map(lambda i: list(map(float, i.split(","))), binspec)) binspec = list(map(lambda i: list(map(float, i.split(","))), binspec))
if False in list(map(lambda i: len(i) == 2, binspec)): if False in list(map(lambda i: len(i) == 2, binspec)):
...@@ -75,12 +76,12 @@ try: ...@@ -75,12 +76,12 @@ try:
else: else:
SUCCESS("All intervals seem to be properly constructed.") SUCCESS("All intervals seem to be properly constructed.")
for i in range(len(binspec) - 1): for i in range(nb_bins - 1):
if binspec[i][1] > binspec[i+1][0]: if binspec[i][1] > binspec[i+1][0]:
FAILURE("Bins do not seem to be disjoint and/or are not in ascending order.") FAILURE("Bins do not seem to be disjoint and/or are not in ascending order.")
SUCCESS("Bins are disjoint and in ascending order.") SUCCESS("Bins are disjoint and in ascending order.")
MESSAGE("Found " + data(len(binspec)) + " bins") MESSAGE("Found " + data(nb_bins) + " bins")
list(map(lambda i: SUBMESSAGE("from " + data(i[0]) + " to " + data(i[1])), binspec)) list(map(lambda i: SUBMESSAGE("from " + data(i[0]) + " to " + data(i[1])), binspec))
except: except:
FAILURE("Bin specification was improperly formatted!") FAILURE("Bin specification was improperly formatted!")
...@@ -116,6 +117,9 @@ from math import sqrt ...@@ -116,6 +117,9 @@ from math import sqrt
def pick(profiles): # returns a profile in the form of a numpy array def pick(profiles): # returns a profile in the form of a numpy array
return profiles.sample().values[0] return profiles.sample().values[0]
def in_bin(dist, i):
    """Return True when `dist` lies inside bin `i` of the module-level `binspec`.

    Each entry of `binspec` is a [lower, upper] pair; both bounds are
    inclusive, so a value equal to either bound counts as inside the bin.
    """
    return binspec[i][0] <= dist <= binspec[i][1]
def euclidian_distance(p1, p2): def euclidian_distance(p1, p2):
return norm(p1 - p2) return norm(p1 - p2)
...@@ -128,48 +132,40 @@ def jensen_shannon_divergence(p1, p2): # https://stackoverflow.com/questions/158 ...@@ -128,48 +132,40 @@ def jensen_shannon_divergence(p1, p2): # https://stackoverflow.com/questions/158
def jensen_shannon_distance(p1, p2): def jensen_shannon_distance(p1, p2):
return sqrt(jensen_shannon_divergence(p1, p2)) return sqrt(jensen_shannon_divergence(p1, p2))
MESSAGE("Preparing dataframe...") MESSAGE("Preparing a dataframe for every bin...")
columns = ["p1_" + str(i) for i in range(20)] + ["p2_" + str(i) for i in range(20)] + ["distance"] columns = ["p1_" + str(i) for i in range(20)] + ["p2_" + str(i) for i in range(20)] + ["distance"]
pairs = pd.DataFrame(columns = columns) pair_bins = [pd.DataFrame(columns = columns) for x in range(nb_bins)]
MESSAGE("Picking profile pairs and computing distances...") MESSAGE("Picking profile pairs and computing distances...")
nb_pairs = 300 nb_pairs_per_bin = 100
try: while min(map(lambda b: b.shape[0], pair_bins)) < nb_pairs_per_bin:
import progressbar
bar = progressbar.ProgressBar()
myrange = bar(range(nb_pairs))
except:
myrange = range(nb_pairs)
print("-- Progressbar is not installed! Cannot display progress. Please wait for a while...")
for i in myrange:
p1 = pick(profiles) p1 = pick(profiles)
p2 = pick(profiles) p2 = pick(profiles)
# dist = jensen_shannon_divergence(p1, p2)
dist = jensen_shannon_distance(p1, p2) dist = jensen_shannon_distance(p1, p2)
# dist = jensen_shannon_divergence(p1, p2)
# dist = euclidian_distance(p1, p2) # dist = euclidian_distance(p1, p2)
new_row = p1.tolist() + p2.tolist() + [dist]
pairs.loc[len(pairs)] = new_row
# pairs["distance"].hist(bins = 40) for i in range(nb_bins):
# plt.show() if in_bin(dist, i) and pair_bins[i].shape[0] < nb_pairs_per_bin:
new_row = p1.tolist() + p2.tolist() + [dist]
pair_bins[i].loc[len(pair_bins[i])] = new_row
break
MESSAGE("Post-treatment of pair bins...")
for i in range(nb_bins):
# sort_values returns a new DataFrame instead of sorting in place, so the
# result must be stored back for the bin to actually be ordered by distance.
pair_bins[i] = pair_bins[i].sort_values(by=["distance"])
pair_bins[i]["distance"].hist(bins = 40)
plt.show()
#=================================================================================================== #===================================================================================================
print(step("Writing result to file")) print(step("Writing result to file"))
from numpy import array_split from numpy import array_split
MESSAGE("Sorting pairs by distance...") for i in range(nb_bins):
pairs = pairs.sort_values(by=["distance"])
MESSAGE("Separating pairs into chunks...")
nb_classes = 3
chunks = array_split(range(nb_pairs), nb_classes)
for i in range(nb_classes):
filename_out = out + "_" + str(i) + ".tsv" filename_out = out + "_" + str(i) + ".tsv"
MESSAGE("Writing pairs to " + param(filename_out)) MESSAGE("Writing pairs to " + param(filename_out))
columns[0] = "#p1_0" # columns[0] = "#p1_0"
#pairs.iloc[chunks[i]].to_csv(filename_out, sep='\t', index=False, header=columns) #pairs.iloc[chunks[i]].to_csv(filename_out, sep='\t', index=False, header=columns)
pairs.iloc[chunks[i]].to_csv(filename_out, sep='\t', index=False, header=False) pair_bins[i].to_csv(filename_out, sep='\t', index=False, header=False)
MESSAGE("Done :)") MESSAGE("Done :)")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment