Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
VEBER Philippe
codepi
Commits
5cb204b1
Commit
5cb204b1
authored
Jul 26, 2018
by
LANORE Vincent
Browse files
New bin logic now properly works :)
parent
31565450
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
26 additions
and
30 deletions
+26
-30
lib/scripts/generate_pairs.py
lib/scripts/generate_pairs.py
+26
-30
No files found.
lib/scripts/generate_pairs.py
View file @
5cb204b1
...
...
@@ -61,8 +61,9 @@ MESSAGE("Bin specification is " + param(binspec))
STEP
(
"Parsing bin specification"
)
try
:
binspec
=
binspec
.
split
(
"],["
)
nb_bins
=
len
(
binspec
)
binspec
[
0
]
=
binspec
[
0
][
1
:]
# removing [ and ] at the beginning and end
binspec
[
len
(
binspec
)
-
1
]
=
binspec
[
len
(
binspec
)
-
1
][:
-
1
]
binspec
[
nb_bins
-
1
]
=
binspec
[
nb_bins
-
1
][:
-
1
]
binspec
=
list
(
map
(
lambda
i
:
list
(
map
(
float
,
i
.
split
(
","
))),
binspec
))
if
False
in
list
(
map
(
lambda
i
:
len
(
i
)
==
2
,
binspec
)):
...
...
@@ -75,12 +76,12 @@ try:
else
:
SUCCESS
(
"All intervals seem to be properly constructed."
)
for
i
in
range
(
len
(
binspec
)
-
1
):
for
i
in
range
(
nb_bins
-
1
):
if
binspec
[
i
][
1
]
>
binspec
[
i
+
1
][
0
]:
FAILURE
(
"Bins do not seem to be disjoint and/or are not in ascending order."
)
SUCCESS
(
"Bins are disjoint and in ascending order."
)
MESSAGE
(
"Found "
+
data
(
len
(
binspec
)
)
+
" bins"
)
MESSAGE
(
"Found "
+
data
(
nb_bins
)
+
" bins"
)
list
(
map
(
lambda
i
:
SUBMESSAGE
(
"from "
+
data
(
i
[
0
])
+
" to "
+
data
(
i
[
1
])),
binspec
))
except
:
FAILURE
(
"Bin specification was improperly formatted!"
)
...
...
@@ -116,6 +117,9 @@ from math import sqrt
def pick(profiles):
    """Draw one random profile and return it as a numpy array.

    `profiles` must expose `.sample()` whose result has `.values`
    (presumably a pandas DataFrame with one profile per row -- confirm
    against the caller).
    """
    drawn = profiles.sample()
    # The sample is indexed at row 0 to unwrap it into a plain array.
    return drawn.values[0]
def in_bin(dist, i):
    """Tell whether `dist` falls inside bin number `i`.

    Reads the module-level `binspec`, where `binspec[i][0]` is the lower
    bound and `binspec[i][1]` the upper bound of bin `i`. Both bounds are
    inclusive, so a value sitting exactly on a boundary shared by two
    adjacent bins is reported as belonging to both.

    Args:
        dist: the distance value to test.
        i: index of the bin in `binspec`.

    Returns:
        True if lower bound <= dist <= upper bound, False otherwise.
    """
    bounds = binspec[i]
    # Chained comparison: the upper bound is only consulted when the
    # lower-bound test already succeeded, as in the original `and` form.
    return bounds[0] <= dist <= bounds[1]
def euclidian_distance(p1, p2):
    """Return `norm(p1 - p2)` for two profiles.

    `norm` is brought into scope elsewhere in the file; given this
    function's name it is presumably an L2 norm such as
    `numpy.linalg.norm` -- confirm against the file's imports.
    """
    difference = p1 - p2
    return norm(difference)
...
...
@@ -128,48 +132,40 @@ def jensen_shannon_divergence(p1, p2): # https://stackoverflow.com/questions/158
def jensen_shannon_distance(p1, p2):
    """Return the square root of `jensen_shannon_divergence(p1, p2)`."""
    divergence = jensen_shannon_divergence(p1, p2)
    return sqrt(divergence)
MESSAGE
(
"Preparing dataframe..."
)
MESSAGE
(
"Preparing
a
dataframe
for every bin
..."
)
columns
=
[
"p1_"
+
str
(
i
)
for
i
in
range
(
20
)]
+
[
"p2_"
+
str
(
i
)
for
i
in
range
(
20
)]
+
[
"distance"
]
pairs
=
pd
.
DataFrame
(
columns
=
columns
)
pair
_bin
s
=
[
pd
.
DataFrame
(
columns
=
columns
)
for
x
in
range
(
nb_bins
)]
MESSAGE
(
"Picking profile pairs and computing distances..."
)
nb_pairs
=
300
try
:
import
progressbar
bar
=
progressbar
.
ProgressBar
()
myrange
=
bar
(
range
(
nb_pairs
))
except
:
myrange
=
range
(
nb_pairs
)
print
(
"-- Progressbar is not installed! Cannot display progress. Please wait for a while..."
)
for
i
in
myrange
:
nb_pairs_per_bin
=
100
while
min
(
map
(
lambda
b
:
b
.
shape
[
0
],
pair_bins
))
<
nb_pairs_per_bin
:
p1
=
pick
(
profiles
)
p2
=
pick
(
profiles
)
# dist = jensen_shannon_divergence(p1, p2)
dist
=
jensen_shannon_distance
(
p1
,
p2
)
# dist = jensen_shannon_divergence(p1, p2)
# dist = euclidian_distance(p1, p2)
new_row
=
p1
.
tolist
()
+
p2
.
tolist
()
+
[
dist
]
pairs
.
loc
[
len
(
pairs
)]
=
new_row
# pairs["distance"].hist(bins = 40)
# plt.show()
for
i
in
range
(
nb_bins
):
if
in_bin
(
dist
,
i
)
and
pair_bins
[
i
].
shape
[
0
]
<
nb_pairs_per_bin
:
new_row
=
p1
.
tolist
()
+
p2
.
tolist
()
+
[
dist
]
pair_bins
[
i
].
loc
[
len
(
pair_bins
[
i
])]
=
new_row
break
MESSAGE
(
"Post-treatment of pair bins..."
)
for
i
in
range
(
nb_bins
):
pair_bins
[
i
].
sort_values
(
by
=
[
"distance"
])
pair_bins
[
i
][
"distance"
].
hist
(
bins
=
40
)
plt
.
show
()
#===================================================================================================
print
(
step
(
"Writing result to file"
))
from
numpy
import
array_split
MESSAGE
(
"Sorting pairs by distance..."
)
pairs
=
pairs
.
sort_values
(
by
=
[
"distance"
])
MESSAGE
(
"Separating pairs into chunks..."
)
nb_classes
=
3
chunks
=
array_split
(
range
(
nb_pairs
),
nb_classes
)
for
i
in
range
(
nb_classes
):
for
i
in
range
(
nb_bins
):
filename_out
=
out
+
"_"
+
str
(
i
)
+
".tsv"
MESSAGE
(
"Writing pairs to "
+
param
(
filename_out
))
columns
[
0
]
=
"#p1_0"
#
columns[0] = "#p1_0"
#pairs.iloc[chunks[i]].to_csv(filename_out, sep='\t', index=False, header=columns)
pair
s
.
iloc
[
chunk
s
[
i
]
]
.
to_csv
(
filename_out
,
sep
=
'
\t
'
,
index
=
False
,
header
=
False
)
pair
_bin
s
[
i
].
to_csv
(
filename_out
,
sep
=
'
\t
'
,
index
=
False
,
header
=
False
)
MESSAGE
(
"Done :)"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment