Commit 2ec82128 authored by LANORE Vincent
Browse files

Added Jensen-Shannon divergence and distance + added progress bar

parent b255b6ba
...@@ -77,27 +77,49 @@ MESSAGE("Profiles file contains " + data(profiles.shape[0]) + " profiles.") ...@@ -77,27 +77,49 @@ MESSAGE("Profiles file contains " + data(profiles.shape[0]) + " profiles.")
STEP("Picking random pairs") STEP("Picking random pairs")
from numpy.linalg import norm from numpy.linalg import norm
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from scipy.stats import entropy
from math import sqrt
def pick(profiles):
    """Draw one random profile and return it as a one-dimensional array.

    Args:
        profiles: Table-like object exposing ``.sample()`` (presumably a
            pandas DataFrame with one profile per row; confirm with caller).

    Returns:
        The values of the single sampled row.
    """
    # sample() yields a one-row frame; [0] unwraps that row from the 2-D
    # ``.values`` array so callers get a flat vector.
    sampled_row = profiles.sample(n=1)
    return sampled_row.values[0]
def euclidian_distance(p1, p2): def euclidian_distance(p1, p2):
return norm(p1 - p2) return norm(p1 - p2)
def pick(profiles): # returns a profile in the form of a numpy array def jensen_shannon_divergence(p1, p2): # https://stackoverflow.com/questions/15880133/jensen-shannon-divergence
return profiles.sample().values P = p1 / norm(p1, ord=1)
Q = p2 / norm(p2, ord=1)
M = 0.5 * (P + Q)
return 0.5 * (entropy(P, M) + entropy(Q, M))
def jensen_shannon_distance(p1, p2):
    """Return the Jensen-Shannon distance between two profiles.

    The distance is the square root of
    ``jensen_shannon_divergence(p1, p2)``.

    Args:
        p1: First profile, passed through to ``jensen_shannon_divergence``.
        p2: Second profile, passed through to ``jensen_shannon_divergence``.

    Returns:
        The non-negative square root of the divergence.
    """
    divergence = jensen_shannon_divergence(p1, p2)
    # Floating-point rounding in the summed terms can leave a tiny negative
    # value for near-identical profiles; math.sqrt would then raise
    # ValueError, so clamp at zero before taking the root.
    return sqrt(max(divergence, 0.0))
MESSAGE("Preparing dataframe...") MESSAGE("Preparing dataframe...")
columns = ["p1_" + str(i) for i in range(20)] + ["p2_" + str(i) for i in range(20)] + ["distance"] columns = ["p1_" + str(i) for i in range(20)] + ["p2_" + str(i) for i in range(20)] + ["distance"]
pairs = pd.DataFrame(columns = columns) pairs = pd.DataFrame(columns = columns)
MESSAGE("Picking profile pairs and computing distances...") MESSAGE("Picking profile pairs and computing distances...")
for i in range(1000): nb_pairs = 1000
# Build the iteration range for the pair-picking loop. The optional
# `progressbar` package is used when available; otherwise fall back to a
# plain range and tell the user that no progress will be shown.
try:
    import progressbar
except ImportError:
    # Only a missing package triggers the fallback; a bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit and mislabel other errors.
    myrange = range(nb_pairs)
    print("-- Progressbar is not installed! Cannot display progress. Please wait for a while...")
else:
    bar = progressbar.ProgressBar()
    myrange = bar(range(nb_pairs))
for i in myrange:
p1 = pick(profiles) p1 = pick(profiles)
p2 = pick(profiles) p2 = pick(profiles)
dist = euclidian_distance(p1, p2) # dist = jensen_shannon_divergence(p1, p2)
new_row = p1.tolist()[0] + p2.tolist()[0] + [dist] dist = jensen_shannon_distance(p1, p2)
# dist = euclidian_distance(p1, p2)
new_row = p1.tolist() + p2.tolist() + [dist]
pairs.loc[len(pairs)] = new_row pairs.loc[len(pairs)] = new_row
# pairs["distance"].hist(bins = 40) pairs["distance"].hist(bins = 40)
# plt.show() plt.show()
#=================================================================================================== #===================================================================================================
print(step("Writing result to file")) print(step("Writing result to file"))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment