#!/usr/bin/python # -*- coding: utf-8 -*- # Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018) # Contributors: # - Vincent Lanore # This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for # convergence detection programs. # This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software. # You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and # INRIA at the following URL "http://www.cecill.info". # As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users # are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive # licensors have only limited liability. # In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or # reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated # to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth # computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements # in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in # the same conditions as regards security. # The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept # its terms. """Scripts that reads aminoacid profiles from a file, draws random pairs among them, and outputs said pairs sorted in distance classes. Usage: generate_pairs.py [options...] -o Positional arguments: profiles-file the file to read profile from Options: -h, --help show this help message and exit -o, --output-prefix output prefix; files will be names _1.tsv, _2.tsv and so on -b, --bins bin specification in the form of a list of intervals [default: [0.01,0.4],[0.4,0.6],[0.6,2]] -s, --bin-size number of pairs per bin [default: 100] -e, --euclid use euclidien distance (instead of Jensen-Shannon distance) [default: False] -p, --plot show plot of pair distributions, otherwise it will be written to a png file [default: False]""" from diffsel_script_utils import * #=================================================================================================== STEP("Parsing command line arguments") from docopt import docopt args = docopt(__doc__) profiles_file = args[""] MESSAGE("Profile file is " + param(profiles_file)) out = args["--output-prefix"] MESSAGE("Output prefix is " + param(out)) binspec = args["--bins"][0] MESSAGE("Bin specification is " + param(binspec)) binsize = int(args["--bin-size"][0]) MESSAGE("Bin size is " + param(binsize)) disttype = "euclidian" if args["--euclid"] else "Jensen-Shannon" MESSAGE("Distance to use is " + param(disttype)) showplot = args["--plot"] MESSAGE("Showing distance histogram is set to " + param(showplot == 1)) #=================================================================================================== STEP("Parsing bin specification") try: binspec = binspec.split("],[") nb_bins = len(binspec) binspec[0] = binspec[0][1:] # removing [ and ] at the beginning and end binspec[nb_bins-1] = binspec[nb_bins-1][:-1] binspec = list(map(lambda i: list(map(float, i.split(","))), binspec)) if False in list(map(lambda i: len(i) == 2, binspec)): FAILURE("Some interval was not of length 2.") else: SUCCESS("All intervals are of length 2.") if False in list(map(lambda i: i[0] < i[1], binspec)): FAILURE("Some interval was improperly constructed (lower bound above upper bound).") else: SUCCESS("All intervals seem to be properly constructed.") for i in range(nb_bins - 1): if binspec[i][1] > binspec[i+1][0]: FAILURE("Bins do not seem to be disjoint and/or are not in ascending order.") SUCCESS("Bins are disjoint and in ascending order.") MESSAGE("Found " + data(nb_bins) + " bins") list(map(lambda i: SUBMESSAGE("from " + data(i[0]) + " to " + data(i[1])), binspec)) except: FAILURE("Bin specification was improperly formatted!") #=================================================================================================== STEP("Reading profiles from file") import pandas as pd MESSAGE("Reading tsv file") try: profiles = pd.read_csv(profiles_file, sep="\t", header=None).transpose() SUCCESS("Profiles file seems to exist.") except: FAILURE("Something went wrong while trying to open profiles file.") if profiles.shape[1] != 20: FAILURE("Profiles file does not seem to contain 20 lines.") else: SUCCESS("Profiles file contains 20 lines.") profiles_sum = profiles.sum(axis=1) if max(profiles_sum) < 1.02 and min(profiles_sum) > 0.98: SUCCESS("All profiles sum to 1.") else: FAILURE("Some profiles don't sum to 1 (sum is comprised between " + data(min(profiles_sum)) + " and " + data(max(profiles_sum)) + ").") MESSAGE("Profiles file contains " + data(profiles.shape[0]) + " profiles.") MESSAGE("Adding index column...") profiles["index"] = range(len(profiles)) #=================================================================================================== STEP("Picking random pairs") from numpy.linalg import norm import matplotlib.pyplot as plt from scipy.stats import entropy from math import sqrt def pick(profiles): # returns a profile in the form of a numpy array line = profiles.sample().values[0] return dict(p = line[:-1], i = line[len(line) - 1]) def in_bin(dist, i): return binspec[i][0] <= dist and binspec[i][1] >= dist; def euclidian_distance(p1, p2): return norm(p1 - p2) def jensen_shannon_divergence(p1, p2): # https://stackoverflow.com/questions/15880133/jensen-shannon-divergence P = p1 / norm(p1, ord=1) Q = p2 / norm(p2, ord=1) M = 0.5 * (P + Q) return 0.5 * (entropy(P, M) + entropy(Q, M)) def jensen_shannon_distance(p1, p2): return sqrt(jensen_shannon_divergence(p1, p2)) MESSAGE("Preparing a dataframe for every bin...") # columns = ["p1_" + str(i) for i in range(20)] + ["p2_" + str(i) for i in range(20)] + ["distance"] columns = ["p1", "p2", "distance"] pair_bins = [pd.DataFrame(columns = columns) for x in range(nb_bins)] MESSAGE("Picking profile pairs and computing distances...") nb_ok = 0 nb_it = 0 while min(map(lambda b: b.shape[0], pair_bins)) < binsize: nb_it += 1 p1 = pick(profiles) p2 = pick(profiles) if disttype == "Jensen-Shannon": dist = jensen_shannon_distance(p1["p"], p2["p"]) # dist = jensen_shannon_divergence(p1["p"], p2["p"]) elif disttype == "euclidian": dist = euclidian_distance(p1["p"], p2["p"]) else: FAILURE("Unknown distance type " + data(disttype)) for i in range(nb_bins): if in_bin(dist, i) and pair_bins[i].shape[0] < binsize: new_row = [p1["i"], p2["i"], dist] pair_bins[i].loc[len(pair_bins[i])] = new_row nb_ok += 1 break MESSAGE("%s%% of drawn pairs were kept." % data("%.1f" % (100 * nb_ok / nb_it))) MESSAGE("Post-treatment of pair bins...") for i in range(nb_bins): pair_bins[i] = pair_bins[i].sort_values(by=["distance"]) plot_bins = int(float(pair_bins[i]["distance"].max() - pair_bins[i]["distance"].min()) / 0.01) plot_bins = max(1, plot_bins) pair_bins[i]["distance"].hist(bins = plot_bins) plt.title("Distributions of pair distance classes") if showplot: MESSAGE("Showing plot...") plt.show() else: plotfile = out + "_plot.png" MESSAGE("Saving plot to " + data(plotfile)) plt.savefig(plotfile) #=================================================================================================== print(step("Writing result to file")) from numpy import array_split for i in range(nb_bins): filename_out = out + "_" + str(i) + ".tsv" MESSAGE("Writing pairs to " + param(filename_out)) # columns[0] = "#p1_0" #pairs.iloc[chunks[i]].to_csv(filename_out, sep='\t', index=False, header=columns) pair_bins[i]["p1"] = pair_bins[i]["p1"].astype(int) + 1 pair_bins[i]["p2"] = pair_bins[i]["p2"].astype(int) + 1 pair_bins[i].to_csv(filename_out, sep='\t', index=False, header=False) MESSAGE("Done :)")