Commit a2c8e384 authored by LANORE Vincent
Browse files

Changed generate_pairs to output a table of pair indices instead of raw profiles + fixed sorting

parent f84c8737
......@@ -117,6 +117,8 @@ if max(profiles_sum) < 1.02 and min(profiles_sum) > 0.98:
else:
FAILURE("Some profiles don't sum to 1 (sum is comprised between " + data(min(profiles_sum)) + " and " + data(max(profiles_sum)) + ").")
MESSAGE("Profiles file contains " + data(profiles.shape[0]) + " profiles.")
MESSAGE("Adding index column...")
profiles["index"] = range(len(profiles))
#===================================================================================================
STEP("Picking random pairs")
......@@ -126,7 +128,8 @@ from scipy.stats import entropy
from math import sqrt
def pick(profiles):
    """Draw one random row from ``profiles`` and split it into profile and index.

    The row's last value is returned as the index and every value before it
    as the profile. This relies on the "index" column having been appended
    as the last column of ``profiles`` before this function is called.

    Returns:
        dict with two keys:
            "p" -- the profile values (all columns but the last) as an array
            "i" -- the value of the last column (the profile's index)
    """
    # sample() yields a one-row frame; .values[0] unwraps it to a flat array.
    line = profiles.sample().values[0]
    return dict(p=line[:-1], i=line[-1])
def in_bin(dist, i):
    """Return True when ``dist`` falls inside bin ``i`` of ``binspec``.

    ``binspec[i][0]`` and ``binspec[i][1]`` are used as the lower and upper
    bounds of the bin; both bounds are inclusive.
    """
    return binspec[i][0] <= dist <= binspec[i][1]
......@@ -144,7 +147,8 @@ def jensen_shannon_distance(p1, p2):
return sqrt(jensen_shannon_divergence(p1, p2))
MESSAGE("Preparing a dataframe for every bin...")
columns = ["p1_" + str(i) for i in range(20)] + ["p2_" + str(i) for i in range(20)] + ["distance"]
# columns = ["p1_" + str(i) for i in range(20)] + ["p2_" + str(i) for i in range(20)] + ["distance"]
columns = ["p1", "p2", "distance"]
pair_bins = [pd.DataFrame(columns = columns) for x in range(nb_bins)]
MESSAGE("Picking profile pairs and computing distances...")
......@@ -155,16 +159,16 @@ while min(map(lambda b: b.shape[0], pair_bins)) < binsize:
p1 = pick(profiles)
p2 = pick(profiles)
if disttype == "Jensen-Shannon":
dist = jensen_shannon_distance(p1, p2)
# dist = jensen_shannon_divergence(p1, p2)
dist = jensen_shannon_distance(p1["p"], p2["p"])
# dist = jensen_shannon_divergence(p1["p"], p2["p"])
elif disttype == "euclidian":
dist = euclidian_distance(p1, p2)
dist = euclidian_distance(p1["p"], p2["p"])
else:
FAILURE("Unknown distance type " + data(disttype))
for i in range(nb_bins):
if in_bin(dist, i) and pair_bins[i].shape[0] < binsize:
new_row = p1.tolist() + p2.tolist() + [dist]
new_row = [p1["i"], p2["i"], dist]
pair_bins[i].loc[len(pair_bins[i])] = new_row
nb_ok += 1
break
......@@ -172,7 +176,7 @@ MESSAGE("%s%% of drawn pairs were kept." % data("%.1f" % (100 * nb_ok / nb_it)))
MESSAGE("Post-treatment of pair bins...")
for i in range(nb_bins):
pair_bins[i].sort_values(by=["distance"])
pair_bins[i] = pair_bins[i].sort_values(by=["distance"])
plot_bins = int(float(pair_bins[i]["distance"].max() - pair_bins[i]["distance"].min()) / 0.01)
pair_bins[i]["distance"].hist(bins = plot_bins)
plt.title("Distributions of pair distance classes")
......@@ -193,6 +197,8 @@ for i in range(nb_bins):
MESSAGE("Writing pairs to " + param(filename_out))
# columns[0] = "#p1_0"
#pairs.iloc[chunks[i]].to_csv(filename_out, sep='\t', index=False, header=columns)
pair_bins[i]["p1"] = pair_bins[i]["p1"].astype(int)
pair_bins[i]["p2"] = pair_bins[i]["p2"].astype(int)
pair_bins[i].to_csv(filename_out, sep='\t', index=False, header=False)
MESSAGE("Done :)")
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment