Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
77423465
Commit
77423465
authored
Feb 14, 2020
by
LE GAC Renaud
Browse files
Add scripts/citations.py to exercise extraction and plot of citations.
parent
6ce6e0d3
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
255 additions
and
0 deletions
+255
-0
scripts/citations.py
scripts/citations.py
+255
-0
No files found.
scripts/citations.py
0 → 100644
View file @
77423465
""" NAME
citations
SYNOPSIS
Produce a status report showing citations
DESCRIPTION
OPTIONS
-h, --help
Display the help and exit.
EXAMPLE
> cd ...limbra/scripts
> run script test_limbra citations.py
AUTHOR
R. Le Gac -- Feb 2020
"""
import
csv
import
logging
import
matplotlib.pyplot
as
plt
import
matplotlib.ticker
as
ticker
import
numpy
as
np
import
pandas
as
pd
import
re
from
graph_tools
import
mplstyle
from
invenio_tools
import
CdsException
,
InvenioStore
from
matplotlib.backends.backend_pdf
import
PdfPages
from
plugin_dbui
import
get_id
# CSV file written by collect_citations() and read back by plot_citations();
# one line per publication: id, url, number of citations.
CSVFN = "/opt/web2py/applications/limbra/scripts/citations.csv"

# PDF file in which plot_citations() stores its figures.
PDFFN = "/opt/web2py/applications/limbra/scripts/citations.pdf"

# Recognise an inspirehep URL at the beginning of a string.
# The dots are escaped so that they only match a literal "." and not any
# character (e.g. "inspirehepXnet" is rejected).
REX_INS = re.compile(r"https?://inspirehep\.net/")
def cli():
    """Entry point of the script.

    Only the plotting step is run. The collection step is kept below as a
    comment so that it can be switched on again by hand.

    """
    # collect_citations()
    plot_citations()
def collect_citations():
    """Collect the number of citations of the publications into CSVFN.

    The publications are selected in the database when their category is
    the one with the code "ACL" and when their origin field contains
    "inspirehep". For each of them, the inspirehep URL is extracted from
    the origin field, the number of citations is requested to the store
    and the line ``id, url, citations`` is written in the CSV file.

    A publication for which the URL or the number of citations cannot be
    obtained is logged and skipped, so that a single faulty record does
    not abort the whole collection.

    Note:
        ``db`` is not defined in this file. Presumably it is injected by
        the web2py environment running the script -- to be confirmed.

    """
    logger = logging.getLogger("web2py.app.limbra")
    logger.info("-" * 79)
    logger.info("start collect citations...")

    publications = db.publications
    store = InvenioStore("inspirehep.net")

    # get the list of articles stored in inspirehep
    id_acl = get_id(db.categories, code="ACL")

    query = (publications.id_categories == id_acl) & \
            (publications.origin.contains("inspirehep"))

    iterrow = db(query).iterselect(publications.id, publications.origin)

    # interrogate inspirehep to get the number of citations
    # save data into a local file
    with open(CSVFN, "w", newline="\n") as csvfile:
        writer = csv.writer(csvfile)

        for row in iterrow:
            # The database selection only checks that origin contains
            # "inspirehep". get_rowid_url raises IndexError when none of
            # the elements of origin is recognised as an inspirehep URL.
            try:
                rowid, url = get_rowid_url(row)
            except IndexError:
                logger.warning(
                    " error no inspirehep url for publication %s", row.id)
                continue

            logger.info("%s", url)

            try:
                citations = get_citations(store, url)
            except (CdsException, ValueError) as e:
                # report through the application logger, like the other
                # messages of this function, then go to the next record
                logger.info(" error %s", e)
                continue

            writer.writerow((rowid, url, citations))

    logger.info("end of collect")
    logger.info("-" * 79)
def get_citations(store, url):
    """Request the number of citations of one record.

    The store is interrogated with the output format "recjson" restricted
    to the field "number_of_citations" and a timeout of 10. The value is
    read from the first element of the JSON reply.

    Args:
        store (InvenioStore):
            object providing the method ``interogate``.
        url (str):
            address of the record.

    Returns:
        int:
            number of citations. It is ``None`` when the first element
            of the reply has no key "number_of_citations".

    """
    reply = store.interogate(
        url,
        timeout=10,
        of="recjson",
        ot="number_of_citations",
    )

    first_record = reply.json()[0]
    return first_record.get("number_of_citations")
def get_rowid_url(row):
    """Associate the identifier of a publication with its inspirehep URL.

    The origin field is split on ", " and the first element matching
    REX_INS at its beginning is kept.

    Args:
        row (pyDAL.Row):
            row of the publications table with at least the fields
            id and origin.

    Returns:
        tuple:
            (rowid (int), url (str))

    Raises:
        IndexError:
            when no element of the origin field matches REX_INS.

    """
    candidates = row.origin.split(", ")
    inspire_urls = list(filter(REX_INS.match, candidates))

    return (row.id, inspire_urls[0])
def plot_citations():
    """Plot the citations stored in CSVFN and save the figures in PDFFN.

    The PDF file contains three pages:

        1. the number of publications per range of citations,
        2. the box plot of the citations per scientific domain,
        3. the box plot of the citations per team.

    Note:
        ``db`` is not defined in this file. Presumably it is injected by
        the web2py environment running the script -- to be confirmed.

    """
    logger = logging.getLogger("web2py.app.limbra")
    logger.info("-" * 79)
    logger.info("start plot citations...")

    mplstyle()
    df_publis = pd.read_csv(CSVFN, names=["id", "url", "citations"])

    # the context manager closes the PDF file even when a page fails
    with PdfPages(PDFFN) as pdf:
        _plot_overview(pdf, df_publis)

        df_publis = _add_teams_and_projects(df_publis)
        for column in ("domain", "team"):
            _plot_boxplot(pdf, df_publis, column)


def _plot_overview(pdf, df_publis):
    """Add to the PDF the number of publications per range of citations.

    Args:
        pdf (PdfPages):
            the PDF file receiving the figure.
        df_publis (pandas.DataFrame):
            one row per publication with at least the column citations.

    """
    # ........................................................................
    #
    # overview in the style of inspirehep (histogram of all citations)
    #
    citations = df_publis.citations

    # The last edge has to stay above 499.5, otherwise the edges are not
    # increasing and numpy.histogram raises a ValueError. This happens when
    # no publication reaches 499 citations or when the file is empty.
    top = citations.max()
    last_edge = 500.5 if pd.isna(top) else max(top + 1, 500.5)

    bins = [-0.5, 0.5, 9.5, 49.5, 99.5, 249.5, 499.5, last_edge]
    hist, _ = np.histogram(citations, bins)
    ind = np.arange(len(hist))

    fig = plt.figure()
    ax = fig.add_subplot(211)
    ax.bar(ind, hist)
    ax.minorticks_on()

    # one tick per bar, so that each label sits under its own bar
    # whatever the ticks selected automatically by matplotlib
    xlabels = ("0",
               "1 à 9",
               "10 à 49",
               "50 à 99",
               "100 à 249",
               "250 à 499",
               "+500")
    ax.set_xticks(ind)
    ax.set_xticklabels(xlabels)
    ax.grid(True)

    ax.set_xlabel("Number of citations", horizontalalignment='right', x=1.)
    ax.set_ylabel("Number of publications", horizontalalignment='right', y=1.)

    # no minor ticks between the bars
    ax.xaxis.set_minor_locator(ticker.NullLocator())

    txt = [f"articles : {len(df_publis)}",
           f"citations: {citations.sum()}",
           f"citations/article (avg): {citations.mean():.1f}"]

    ax.text(0.72, 0.8, "\n".join(txt),
            bbox=dict(facecolor="white", alpha=0.5),
            family="monospace",
            fontsize=5,
            transform=ax.transAxes)

    pdf.savefig(fig)

    # release the figure once it is stored in the PDF file
    plt.close(fig)


def _add_teams_and_projects(df_publis):
    """Merge the rows of the teams and projects tables into the citations.

    Args:
        df_publis (pandas.DataFrame):
            one row per publication with at least the column id.

    Returns:
        pandas.DataFrame:
            the publications found both in df_publis and in the database
            selection, extended with the columns of the teams and projects
            tables. The columns id_team, id_project and agencies are
            removed. The columns domain and team used by the box plots
            are presumably provided by the teams table -- to be confirmed.

    """
    publications = db.publications

    # get (id_team, id_project) associated to each publication
    id_acl = get_id(db.categories, code="ACL")

    query = (publications.id_categories == id_acl) & \
            (publications.origin.contains("inspirehep"))

    rows = db(query).select(publications.id,
                            publications.id_teams,
                            publications.id_projects)

    df_publis_teams = (pd.DataFrame(rows.as_list())
                       .rename(columns={"id_teams": "id_team",
                                        "id_projects": "id_project"}))

    df_publis = pd.merge(df_publis, df_publis_teams, how="inner", on="id")

    # expand id_team to domain and team
    df_teams = (pd.DataFrame(db(db.teams).select().as_list())
                .rename(columns={"id": "id_team"}))

    df_publis = (pd.merge(df_publis, df_teams, how="inner", on="id_team")
                 .drop(["id_team"], axis="columns"))

    # expand id_project to project
    df_projects = (pd.DataFrame(db(db.projects).select().as_list())
                   .rename(columns={"id": "id_project"}))

    df_publis = (pd.merge(df_publis, df_projects, how="inner", on="id_project")
                 .drop(["id_project", "agencies"], axis="columns"))

    return df_publis


def _plot_boxplot(pdf, df_publis, by):
    """Add to the PDF the box plot of the citations grouped by a column.

    Publications with 500 citations or more and those of the domain
    "Hors Equipe" are left out.

    Args:
        pdf (PdfPages):
            the PDF file receiving the figure.
        df_publis (pandas.DataFrame):
            one row per publication with at least the columns citations,
            domain and the one named by the argument by.
        by (str):
            name of the column used to group the publications.

    """
    fig = plt.figure()
    ax = fig.add_subplot(121)

    selection = \
        (df_publis.citations < 500) & (df_publis.domain != "Hors Equipe")

    df = df_publis[selection]
    df.boxplot("citations", by=by, ax=ax, grid=True, rot=20)

    ax.minorticks_on()
    ax.xaxis.set_minor_locator(ticker.NullLocator())

    # the label of the vertical axis is moved along y, as for the histogram
    ax.set_ylabel("Number of citations", horizontalalignment='right', y=1.)

    pdf.savefig(fig)

    # release the figure once it is stored in the PDF file
    plt.close(fig)
if __name__ == "__main__":
    # Executed as a script: run the command line entry point and
    # leave the interpreter with the exit status 0.
    import sys

    cli()
    sys.exit(0)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment