Commit 77423465 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Add scripts/citations.py to exercise extraction and plot of citations.

parent 6ce6e0d3
""" NAME
citations
SYNOPSIS
Produce a status report showing citations
DESCRIPTION
OPTIONS
-h, --help
Display the help and exit.
EXAMPLE
> cd ...limbra/scripts
> run script test_limbra citations.py
AUTHOR
R. Le Gac -- Feb 2020
"""
import csv
import logging
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import re
from graph_tools import mplstyle
from invenio_tools import CdsException, InvenioStore
from matplotlib.backends.backend_pdf import PdfPages
from plugin_dbui import get_id
CSVFN = "/opt/web2py/applications/limbra/scripts/citations.csv"
PDFFN = "/opt/web2py/applications/limbra/scripts/citations.pdf"
REX_INS = re.compile(r"https?://inspirehep.net/")
def cli():
    """Entry point of the script: produce the citations report.

    The collect step hits inspirehep for every publication and is slow;
    it is kept commented out and run on demand to refresh the CSV cache.
    """
    # collect_citations()  # uncomment to refresh CSVFN from inspirehep
    plot_citations()
def collect_citations():
    """Collect the number of citations for each ACL publication.

    Scan the publications recorded in inspirehep, query the store for
    their citation count and save ``(rowid, url, citations)`` triplets
    into the CSV file ``CSVFN``.

    Relies on the web2py global ``db``.
    """
    logger = logging.getLogger("web2py.app.limbra")
    logger.info("-"*79)
    logger.info("start collect citations...")

    publications = db.publications
    store = InvenioStore("inspirehep.net")

    # get the list of articles stored in inspirehep
    id_acl = get_id(db.categories, code="ACL")
    query = (db.publications.id_categories == id_acl) & \
            (db.publications.origin.contains("inspirehep"))
    iterrow = db(query).iterselect(publications.id, publications.origin)

    # interrogate inspirehep to get the number of citations
    # and save data into a local file
    with open(CSVFN, "w", newline="\n") as csvfile:
        writer = csv.writer(csvfile)
        for rowid, url in map(get_rowid_url, iterrow):
            logger.info(f"  {url}")
            try:
                citations = get_citations(store, url)
                writer.writerow((rowid, url, citations))
            except (CdsException, ValueError) as e:
                # skip this publication but keep a trace in the log
                # (use the app logger, not the root logger)
                logger.info(f"  error {e}")

    logger.info("end of collect")
    logger.info("-"*79)
def get_citations(store, url):
    """Return the citation count of the record located at *url*.

    Args:
        store (InvenioStore):
            store used to interrogate inspirehep.
        url (str):
            URL of the publication record.

    Returns:
        int:
            number of citations
    """
    # NOTE: ``interogate`` (sic) is the InvenioStore API method name
    options = {"of": "recjson", "ot": "number_of_citations"}
    response = store.interogate(url, timeout=10, **options)
    records = response.json()
    return records[0].get("number_of_citations")
def get_rowid_url(row):
    """Extract the row identifier and the inspirehep URL of a publication.

    Args:
        row (pyDAL.Row):
            row of the publications table with at least the fields
            ``id`` and ``origin``.

    Returns:
        tuple:
            (rowid (int), url (str))
    """
    # origin is a comma-separated list of URLs; keep the inspirehep one
    # (indexing raises IndexError when no inspirehep URL is present)
    candidates = [url for url in row.origin.split(", ") if REX_INS.match(url)]
    return (row.id, candidates[0])
def plot_citations():
    """Plot the citations report into the PDF file ``PDFFN``.

    Read the citation counts cached in ``CSVFN`` and produce:
    * an overall histogram à la inspirehep,
    * a box plot per scientific domain,
    * a box plot per team.

    Relies on the web2py global ``db``.
    """
    logger = logging.getLogger("web2py.app.limbra")
    logger.info("-"*79)
    logger.info("start plot citations...")

    mplstyle()
    df_publis = pd.read_csv(CSVFN, names=["id", "url", "citations"])

    # the context manager guarantees the PDF file is properly closed
    # (and therefore valid) even when a plotting step raises
    with PdfPages(PDFFN) as pdf:

        # ....................................................................
        #
        # overview à la inspirehep (histogram of all citations)
        #
        citations = df_publis.citations
        bins = [-0.5, 0.5, 9.5, 49.5, 99.5, 249.5, 499.5, citations.max()+1]
        hist, dummy = np.histogram(citations, bins)
        ind = np.arange(len(hist))

        fig = plt.figure()
        ax = plt.subplot(211)
        ax.bar(ind, hist)
        ax.minorticks_on()

        xlabels = ("",
                   "0",
                   "1 à 9",
                   "10 à 49",
                   "50 à 99",
                   "100 à 249",
                   "250 à 499",
                   "+500")
        ax.xaxis.set_ticklabels(xlabels)

        ax.grid(True)
        ax.set_xlabel("Number of citations", horizontalalignment='right', x=1.)
        ax.set_ylabel("Number of publications", horizontalalignment='right', y=1.)
        ax.xaxis.set_minor_locator(ticker.NullLocator())

        txt = [
            f"articles : {len(df_publis)}",
            f"citations: {citations.sum()}",
            f"citations/article (avg): {citations.mean():.1f}"]

        ax.text(
            0.72, 0.8, "\n".join(txt),
            bbox=dict(facecolor="white", alpha=0.5),
            family="monospace",
            fontsize=5,
            transform=ax.transAxes)

        pdf.savefig(fig)

        # ....................................................................
        #
        # per scientific domain and per team
        #
        publications = db.publications

        # get (id_team, id_project) associated with each publication
        id_acl = get_id(db.categories, code="ACL")
        query = (db.publications.id_categories == id_acl) & \
                (db.publications.origin.contains("inspirehep"))
        rows = db(query).select(publications.id,
                                publications.id_teams,
                                publications.id_projects)

        df_publis_teams = (pd.DataFrame(rows.as_list())
                           .rename(columns={"id_teams": "id_team",
                                            "id_projects": "id_project"}))

        # fix: ``on`` takes the merge column name — the original passed a
        # duplicated list (["id", "id"]) which is not a valid key spec
        df_publis = pd.merge(df_publis, df_publis_teams,
                             how="inner",
                             on="id")

        # expand id_team to domain and team
        df_teams = (pd.DataFrame(db(db.teams).select().as_list())
                    .rename(columns={"id": "id_team"}))
        df_publis = (pd.merge(df_publis, df_teams,
                              how="inner",
                              on="id_team")
                     .drop(["id_team"], axis="columns"))

        # expand id_project to project
        df_projects = (pd.DataFrame(db(db.projects).select().as_list())
                       .rename(columns={"id": "id_project"}))
        df_publis = (pd.merge(df_publis, df_projects,
                              how="inner",
                              on="id_project")
                     .drop(["id_project", "agencies"], axis="columns"))

        # common selection for the box plots: drop the few publications
        # with 500+ citations (outliers) and those outside a physics team
        query = (df_publis.citations < 500) & (df_publis.domain != "Hors Equipe")
        df = df_publis[query]

        # one page per grouping criterion (the two sections were identical)
        for criterion in ("domain", "team"):
            fig = plt.figure()
            ax = plt.subplot(121)
            df.boxplot("citations", by=criterion, ax=ax, grid=True, rot=20)
            ax.minorticks_on()
            ax.xaxis.set_minor_locator(ticker.NullLocator())
            # fix: the position keyword for a ylabel is ``y`` — the
            # original passed ``x`` (copy/paste from set_xlabel)
            ax.set_ylabel("Number of citations", horizontalalignment='right', y=1.)
            pdf.savefig(fig)

    logger.info("end of plot")
    logger.info("-"*79)
# script entry point when executed through the web2py "run script" command
if __name__ == "__main__":
    import sys
    cli()
    sys.exit(0)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment