Commit a2c8e384 by LANORE Vincent

### Changed generate_pairs to output a table of pair indices instead of raw profiles + fixed sorting

parent f84c8737
 ... ... @@ -117,6 +117,8 @@ if max(profiles_sum) < 1.02 and min(profiles_sum) > 0.98: else: FAILURE("Some profiles don't sum to 1 (sum is comprised between " + data(min(profiles_sum)) + " and " + data(max(profiles_sum)) + ").") MESSAGE("Profiles file contains " + data(profiles.shape[0]) + " profiles.") MESSAGE("Adding index column...") profiles["index"] = range(len(profiles)) #=================================================================================================== STEP("Picking random pairs") ... ... @@ -126,7 +128,8 @@ from scipy.stats import entropy from math import sqrt def pick(profiles): # returns a profile in the form of a numpy array return profiles.sample().values[0] line = profiles.sample().values[0] return dict(p = line[:-1], i = line[len(line) - 1]) def in_bin(dist, i): return binspec[i][0] <= dist and binspec[i][1] >= dist; ... ... @@ -144,7 +147,8 @@ def jensen_shannon_distance(p1, p2): return sqrt(jensen_shannon_divergence(p1, p2)) MESSAGE("Preparing a dataframe for every bin...") columns = ["p1_" + str(i) for i in range(20)] + ["p2_" + str(i) for i in range(20)] + ["distance"] # columns = ["p1_" + str(i) for i in range(20)] + ["p2_" + str(i) for i in range(20)] + ["distance"] columns = ["p1", "p2", "distance"] pair_bins = [pd.DataFrame(columns = columns) for x in range(nb_bins)] MESSAGE("Picking profile pairs and computing distances...") ... ... 
@@ -155,16 +159,16 @@ while min(map(lambda b: b.shape[0], pair_bins)) < binsize: p1 = pick(profiles) p2 = pick(profiles) if disttype == "Jensen-Shannon": dist = jensen_shannon_distance(p1, p2) # dist = jensen_shannon_divergence(p1, p2) dist = jensen_shannon_distance(p1["p"], p2["p"]) # dist = jensen_shannon_divergence(p1["p"], p2["p"]) elif disttype == "euclidian": dist = euclidian_distance(p1, p2) dist = euclidian_distance(p1["p"], p2["p"]) else: FAILURE("Unknown distance type " + data(disttype)) for i in range(nb_bins): if in_bin(dist, i) and pair_bins[i].shape[0] < binsize: new_row = p1.tolist() + p2.tolist() + [dist] new_row = [p1["i"], p2["i"], dist] pair_bins[i].loc[len(pair_bins[i])] = new_row nb_ok += 1 break ... ... @@ -172,7 +176,7 @@ MESSAGE("%s%% of drawn pairs were kept." % data("%.1f" % (100 * nb_ok / nb_it))) MESSAGE("Post-treatment of pair bins...") for i in range(nb_bins): pair_bins[i].sort_values(by=["distance"]) pair_bins[i] = pair_bins[i].sort_values(by=["distance"]) plot_bins = int(float(pair_bins[i]["distance"].max() - pair_bins[i]["distance"].min()) / 0.01) pair_bins[i]["distance"].hist(bins = plot_bins) plt.title("Distributions of pair distance classes") ... ... @@ -193,6 +197,8 @@ for i in range(nb_bins): MESSAGE("Writing pairs to " + param(filename_out)) # columns[0] = "#p1_0" #pairs.iloc[chunks[i]].to_csv(filename_out, sep='\t', index=False, header=columns) pair_bins[i]["p1"] = pair_bins[i]["p1"].astype(int) pair_bins[i]["p2"] = pair_bins[i]["p2"].astype(int) pair_bins[i].to_csv(filename_out, sep='\t', index=False, header=False) MESSAGE("Done :)")
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!