generate_pairs.py 8.65 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright or Copr. Centre National de la Recherche Scientifique (CNRS) (2018)
# Contributors:
# - Vincent Lanore <vincent.lanore@gmail.com>

# This software is a computer program whose purpose is to provide a set of scripts for pre and post processing of data for
# convergence detection programs.

# This software is governed by the CeCILL-C license under French law and abiding by the rules of distribution of free software.
# You can use, modify and/ or redistribute the software under the terms of the CeCILL-C license as circulated by CEA, CNRS and
# INRIA at the following URL "http://www.cecill.info".

# As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, users
# are provided only with a limited warranty and the software's author, the holder of the economic rights, and the successive
# licensors have only limited liability.

# In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or developing or
# reproducing the software by the user in light of its specific status of free software, that may mean that it is complicated
# to manipulate, and that also therefore means that it is reserved for developers and experienced professionals having in-depth
# computer knowledge. Users are therefore encouraged to load and test the software's suitability as regards their requirements
# in conditions enabling the security of their systems and/or data to be ensured and, more generally, to use and operate it in
# the same conditions as regards security.

# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.

29 30
"""Scripts that reads aminoacid profiles from a file, draws random pairs
among them, and outputs said pairs sorted in distance classes.
31 32

Usage:
33
  generate_pairs.py [options...] -o <output-prefix> <profiles-file>
34 35

Positional arguments:
36
  profiles-file          the file to read profile from
37 38

Options:
39
  -h, --help             show this help message and exit
40
  -o, --output-prefix <filename>
41 42 43
                         output prefix; files will be names <prefix>_1.tsv,
                         <prefix>_2.tsv and so on
  -b, --bins <spec>      bin specification in the form of a list of intervals
44
                         [default: [0.01,0.4],[0.4,0.6],[0.6,2]]
45
  -s, --bin-size <size>  number of pairs per bin [default: 100]
46 47
  -e, --euclid           use euclidien distance (instead of Jensen-Shannon
                         distance) [default: False]
48 49
  -p, --plot             show plot of pair distributions, otherwise it will
                         be written to a png file [default: False]"""
50 51 52 53 54 55 56 57 58 59

from diffsel_script_utils import *

#===================================================================================================
STEP("Parsing command line arguments")
from docopt import docopt

args = docopt(__doc__)
profiles_file = args["<profiles-file>"]
MESSAGE("Profile file is " + param(profiles_file))
60 61 62 63
out = args["--output-prefix"]
MESSAGE("Output prefix is " + param(out))
binspec = args["--bins"][0]
MESSAGE("Bin specification is " + param(binspec))
64 65
binsize = int(args["--bin-size"][0])
MESSAGE("Bin size is " + param(binsize))
66 67
disttype = "euclidian" if args["--euclid"] else "Jensen-Shannon"
MESSAGE("Distance to use is " + param(disttype))
68 69
showplot = args["--plot"]
MESSAGE("Showing distance histogram is set to " + param(showplot == 1))
70 71 72 73 74

#===================================================================================================
STEP("Parsing bin specification")
try:
    binspec = binspec.split("],[")
75
    nb_bins = len(binspec)
76
    binspec[0] = binspec[0][1:] # removing [ and ] at the beginning and end
77
    binspec[nb_bins-1] = binspec[nb_bins-1][:-1]
78 79 80 81 82 83 84 85 86 87 88 89
    binspec = list(map(lambda i: list(map(float, i.split(","))), binspec))

    if False in list(map(lambda i: len(i) == 2, binspec)):
        FAILURE("Some interval was not of length 2.")
    else:
        SUCCESS("All intervals are of length 2.")

    if False in list(map(lambda i: i[0] < i[1], binspec)):
        FAILURE("Some interval was improperly constructed (lower bound above upper bound).")
    else:
        SUCCESS("All intervals seem to be properly constructed.")

90
    for i in range(nb_bins - 1):
91 92 93 94
        if binspec[i][1] > binspec[i+1][0]:
            FAILURE("Bins do not seem to be disjoint and/or are not in ascending order.")
    SUCCESS("Bins are disjoint and in ascending order.")

95
    MESSAGE("Found " + data(nb_bins) + " bins")
96 97 98
    list(map(lambda i: SUBMESSAGE("from " + data(i[0]) + " to " + data(i[1])), binspec))
except:
    FAILURE("Bin specification was improperly formatted!")
99 100 101 102 103 104

#===================================================================================================
STEP("Reading profiles from file")
import pandas as pd

MESSAGE("Reading tsv file")
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
try:
    profiles = pd.read_csv(profiles_file, sep="\t", header=None).transpose()
    SUCCESS("Profiles file seems to exist.")
except:
    FAILURE("Something went wrong while trying to open profiles file.")
if profiles.shape[1] != 20:
    FAILURE("Profiles file does not seem to contain 20 lines.")
else:
    SUCCESS("Profiles file contains 20 lines.")
profiles_sum = profiles.sum(axis=1)
if max(profiles_sum) < 1.02 and min(profiles_sum) > 0.98:
    SUCCESS("All profiles sum to 1.")
else:
    FAILURE("Some profiles don't sum to 1 (sum is comprised between " + data(min(profiles_sum)) + " and " + data(max(profiles_sum)) + ").")
MESSAGE("Profiles file contains " + data(profiles.shape[0]) + " profiles.")
120 121
MESSAGE("Adding index column...")
profiles["index"] = range(len(profiles))
122 123 124 125

#===================================================================================================
STEP("Picking random pairs")
from numpy.linalg import norm
126
import matplotlib.pyplot as plt
127 128 129 130
from scipy.stats import entropy
from math import sqrt

def pick(profiles): # returns a profile in the form of a numpy array
131 132
    line = profiles.sample().values[0]
    return dict(p = line[:-1], i = line[len(line) - 1])
133

134 135 136
def in_bin(dist, i):
    return binspec[i][0] <= dist and binspec[i][1] >= dist;

137 138 139
def euclidian_distance(p1, p2):
    return norm(p1 - p2)

140 141 142 143 144 145 146 147
def jensen_shannon_divergence(p1, p2): # https://stackoverflow.com/questions/15880133/jensen-shannon-divergence
    P = p1 / norm(p1, ord=1)
    Q = p2 / norm(p2, ord=1)
    M = 0.5 * (P + Q)
    return 0.5 * (entropy(P, M) + entropy(Q, M))

def jensen_shannon_distance(p1, p2):
    return sqrt(jensen_shannon_divergence(p1, p2))
148

149
MESSAGE("Preparing a dataframe for every bin...")
150 151
# columns = ["p1_" + str(i) for i in range(20)] + ["p2_" + str(i) for i in range(20)] + ["distance"]
columns = ["p1", "p2", "distance"]
152
pair_bins = [pd.DataFrame(columns = columns) for x in range(nb_bins)]
153 154

MESSAGE("Picking profile pairs and computing distances...")
155 156
nb_ok = 0
nb_it = 0
157
while min(map(lambda b: b.shape[0], pair_bins)) < binsize:
158
    nb_it += 1
159 160
    p1 = pick(profiles)
    p2 = pick(profiles)
161
    if disttype == "Jensen-Shannon":
162 163
        dist = jensen_shannon_distance(p1["p"], p2["p"])
        # dist = jensen_shannon_divergence(p1["p"], p2["p"])
164
    elif disttype == "euclidian":
165
        dist = euclidian_distance(p1["p"], p2["p"])
166 167
    else:
        FAILURE("Unknown distance type " + data(disttype))
168

169
    for i in range(nb_bins):
170
        if in_bin(dist, i) and pair_bins[i].shape[0] < binsize:
171
            new_row = [p1["i"], p2["i"], dist]
172
            pair_bins[i].loc[len(pair_bins[i])] = new_row
173
            nb_ok += 1
174
            break
175
MESSAGE("%s%% of drawn pairs were kept." % data("%.1f" % (100 * nb_ok / nb_it)))
176 177 178

MESSAGE("Post-treatment of pair bins...")
for i in range(nb_bins):
179
    pair_bins[i] = pair_bins[i].sort_values(by=["distance"])
180
    plot_bins = int(float(pair_bins[i]["distance"].max() - pair_bins[i]["distance"].min()) / 0.01)
181
    plot_bins = max(1, plot_bins)
182
    pair_bins[i]["distance"].hist(bins = plot_bins)
183
plt.title("Distributions of pair distance classes")
184
if showplot:
185
    MESSAGE("Showing plot...")
186
    plt.show()
187 188 189 190
else:
    plotfile = out + "_plot.png"
    MESSAGE("Saving plot to " + data(plotfile))
    plt.savefig(plotfile)
191 192 193

#===================================================================================================
print(step("Writing result to file"))
194
from numpy import array_split
195

196
for i in range(nb_bins):
197 198
    filename_out = out + "_" + str(i) + ".tsv"
    MESSAGE("Writing pairs to " + param(filename_out))
199
    # columns[0] = "#p1_0"
Carine Rey's avatar
Carine Rey committed
200
    #pairs.iloc[chunks[i]].to_csv(filename_out, sep='\t', index=False, header=columns)
Carine Rey's avatar
Carine Rey committed
201 202
    pair_bins[i]["p1"] = pair_bins[i]["p1"].astype(int) + 1
    pair_bins[i]["p2"] = pair_bins[i]["p2"].astype(int) + 1
203
    pair_bins[i].to_csv(filename_out, sep='\t', index=False, header=False)
204 205

MESSAGE("Done :)")