Commit 0067c5c2 authored by LANORE Vincent's avatar LANORE Vincent
Browse files

generate_pairs now outputs separate files for distance classes (separation...

generate_pairs now outputs separate files for distance classes (separation into classes is very simple so far :/)
parent 2ec82128
......@@ -26,19 +26,20 @@
# The fact that you are presently reading this means that you have had knowledge of the CeCILL-C license and that you accept
# its terms.
"""Scripts that reads aminoacid profiles from a file, draws random pairs among them,
and outputs said pairs sorted in distance classes.
"""Scripts that reads aminoacid profiles from a file, draws random pairs
among them, and outputs said pairs sorted in distance classes.
Usage:
generate_pairs.py [options...] -o <output-file> <profiles-file>
generate_pairs.py [options...] -o <output-prefix> <profiles-file>
Positional arguments:
profiles-file the file to read profile from
profiles-file the file to read profiles from
Options:
-h, --help show this help message and exit
-o, --output-file <filename>
output file"""
output prefix; files will be named <prefix>_0.tsv,
<prefix>_1.tsv and so on
from diffsel_script_utils import *
......@@ -100,7 +101,7 @@ columns = ["p1_" + str(i) for i in range(20)] + ["p2_" + str(i) for i in range(2
pairs = pd.DataFrame(columns = columns)
MESSAGE("Picking profile pairs and computing distances...")
nb_pairs = 1000
nb_pairs = 300
try:
import progressbar
bar = progressbar.ProgressBar()
......@@ -118,14 +119,24 @@ for i in myrange:
new_row = p1.tolist() + p2.tolist() + [dist]
pairs.loc[len(pairs)] = new_row
pairs["distance"].hist(bins = 40)
plt.show()
# pairs["distance"].hist(bins = 40)
# plt.show()
#===================================================================================================
print(step("Writing result to file"))
from numpy import array_split
MESSAGE("Writing pairs to " + param(out))
columns[0] = "#p1_0"
pairs.to_csv(out, sep='\t', index=False, header=columns)
MESSAGE("Done :)")
MESSAGE("Sorting pairs by distance...")
pairs = pairs.sort_values(by=["distance"])
MESSAGE("Separating pairs into chunks...")
nb_classes = 3
chunks = array_split(range(nb_pairs), nb_classes)
for i in range(nb_classes):
filename_out = out + "_" + str(i) + ".tsv"
MESSAGE("Writing pairs to " + param(filename_out))
columns[0] = "#p1_0"
pairs.iloc[chunks[i]].to_csv(filename_out, sep='\t', index=False, header=columns)
MESSAGE("Done :)")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment