Commit 03db8cdc authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Add modules citations_tools.py.

parent 913ce635
......@@ -21,69 +21,35 @@ def citations():
"""Return graphs showing citations
"""
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
from citations_tools import get_acl, histogram
from plugin_dbui import Selector
# select articles stored in inspirehep according to user criteria
selector = Selector(
virtdb.graph_citation_selector,
exclude_fields=("id_graphs"))
selector.append_query(db.publications.id == db.citations.id_publications)
query = selector.query(db.publications)
rows = db(query).iterselect(db.publications.id,
db.teams.team,
db.teams.domain,
db.projects.project,
db.citations.date,
db.citations.count)
columns = ["id_publi", "date", "citations", "domain", "team", "project"]
data = (
(el.publications.id,
el.citations.date,
el.citations.count,
el.teams.domain,
el.teams.team,
el.projects.project) for el in rows)
df = (pd.DataFrame(data, columns=columns)
.groupby(["id_publi", "date"])
.last())
exclude_fields=("id_graphs", "year_start", "year_end"))
data = df.citations
bins = [-0.5, 0.5, 9.5, 49.5, 99.5, 249.5, 499.5, data.max()+1]
hist, dummy = np.histogram(data, bins)
ind = np.arange(len(hist))
year_start = selector.year_start
year_end = selector.year_end
mplstyle()
fig = Figure()
ax = fig.subplots(nrows=1, ncols=1)
ax.bar(ind, hist)
ax.minorticks_on()
if isinstance(year_start, int) and isinstance(year_end, int):
query = db.publications.year >= year_start
query &= db.publications.year <= year_end
selector.append_query(query)
xlabels = ("",
"0",
"1 à 9",
"10 à 49",
"50 à 99",
"100 à 249",
"250 à 499",
"+500")
elif isinstance(year_start, int):
selector.append_query(db.publications.year == year_start)
ax.xaxis.set_ticklabels(xlabels)
ax.grid(True)
acl = get_acl(db, selector)
ax.set_xlabel("Number of citations", horizontalalignment='right', x=1.)
ax.set_ylabel("Number of publications", horizontalalignment='right', y=1.)
# histograms citations for the last scan
df = (acl
.groupby(["id_publication", "scan"])
.last())
ax.xaxis.set_minor_locator(ticker.NullLocator())
fig = histogram(df)
# save the figure
buf = io.BytesIO()
fig.savefig(buf, format="svg")
data = base64.b64encode(buf.getbuffer()).decode("ascii")
......
......@@ -49,6 +49,7 @@
'Article': 'Article',
'article': 'article',
'Articles': 'Articles',
'articles analysed:': 'articles analysés: ',
'Articles deposited in arXiv not yet published': 'Articles déposés dans arXiv pas encore publiés',
'Articles published': 'Articles publiés',
'ASCL': 'ASCL',
......@@ -128,6 +129,8 @@
'Check that the publication URL corresponds to a pdf file.': 'Check that the publication URL corresponds to a pdf file.',
'Check to delete': 'Check to delete',
'citations': 'citations',
'citations/article (avg):': 'citations/article (avg): ',
'citations:': 'citations:',
'Click on the link %(link)s to verify your email': 'Click on the link %(link)s to verify your email',
'Click on the link http://localhost:8000/limbra/default/user/reset_password/%(key)s to reset your password': 'Click on the link http://localhost:8000/limbra/default/user/reset_password/%(key)s to reset your password',
'Client IP': 'Client IP',
......@@ -475,6 +478,7 @@
'patent': 'brevet',
'PDF file url': 'URL du pdf',
'Period': 'Période',
'period:': 'période: ',
'PHD ': 'PHD ',
'PhD Thesis, ...': 'Doctorat, habilitation à diriger les recherches, ...',
'PhDs': 'PhDs',
......
""" Collections of tools to built plots related to citations
"""
import numpy as np
import pandas as pd
from gluon import current
from graph_tools import mplstyle
from matplotlib.figure import Figure
import matplotlib.ticker as ticker
def get_acl(db, selector):
    """Load publications and their citation counts into a DataFrame.

    The condition joining ``publications`` with ``citations`` is appended
    to *selector* (the selector is therefore modified in place), so only
    publications having at least one row in the ``citations`` table are
    returned. One row is produced per record yielded by the database.

    Args:
        db (pyDAL.DAL):
            database connection.

        selector (dbui.Selector):
            user criteria.

    Returns:
        pandas.DataFrame: with the columns, in this order

            * id_publication -- ``publications.id``
            * year -- ``publications.year``
            * scan -- ``citations.date``
            * citations -- ``citations.count``
            * domain -- ``teams.domain``
            * team -- ``teams.team``
            * project -- ``projects.project``

    """
    publications = db.publications
    citations = db.citations

    # keep only the publications for which citations have been recorded
    selector.append_query(publications.id == citations.id_publications)
    criteria = selector.query(publications)

    records = db(criteria).iterselect(
        publications.id,
        publications.year,
        db.teams.team,
        db.teams.domain,
        db.projects.project,
        citations.date,
        citations.count)

    def _as_tuples():
        # lazily reorder each database record to match the column order
        for record in records:
            yield (record.publications.id,
                   record.publications.year,
                   record.citations.date,
                   record.citations.count,
                   record.teams.domain,
                   record.teams.team,
                   record.projects.project)

    names = ["id_publication",
             "year",
             "scan",
             "citations",
             "domain",
             "team",
             "project"]

    return pd.DataFrame(_as_tuples(), columns=names)
def h_index(df):
    """Interpolated h-index for the given set of publications.

    Publications are ranked 1, 2, 3, ... by decreasing number of citations.
    The h-index is the largest rank ``h`` such that the publication at that
    rank has at least ``h`` citations. To get a continuous value, the
    citation curve is linearly interpolated between the last rank fulfilling
    the condition and the first rank failing it, and the abscissa where the
    curve crosses the diagonal ``citations == rank`` is returned. The result
    therefore lies in ``[h, h + 1)``.

    Args:
        df (pandas.DataFrame):
            only the ``citations`` column is used, one row per publication.

    Returns:
        float:
            * ``0.0`` when *df* is empty or no publication is cited.
            * the number of publications when each of them has at least
              as many citations as its rank (the curve never crosses
              the diagonal).

    """
    data = (df.citations.sort_values(ascending=False)
              .reset_index(drop=True))

    # ranks are 1-based: rank r is the r-th most cited publication
    for rank, ncitations in enumerate(data, start=1):
        if ncitations >= rank:
            continue

        # the most cited publication already fails the condition
        if rank == 1:
            return 0.0

        n1 = rank - 1
        c1, c2 = data.iat[rank - 2], ncitations

        # Intersection of the segment (n1, c1) -> (n1 + 1, c2) with the
        # diagonal. Since c1 >= c2 the denominator is always >= 1, and
        # the expression reduces to n1 when c1 == n1.
        drop = c1 - c2
        return float((c1 + drop * n1) / (1 + drop))

    # no crossing found: every publication satisfies the condition
    return float(len(data))
def histogram(df):
    """Histogram of the number of citations per publication.

    Publications are counted in seven fixed ranges of citations
    (0, 1-9, 10-49, 50-99, 100-249, 250-499 and 500 or more).
    A text box summarises the period, the number of entries, the sum and
    the average of the citations as well as the h-index. The date of the
    most recent scan is written above the plot.

    Args:
        df (pandas.DataFrame):
            must provide the columns ``citations`` and ``year``.
            ``scan`` (date of the scan) can be either a column or
            an index level.

    Returns:
        matplotlib.figure.Figure

    """
    T = current.T

    # build the histogram
    citations = df.citations

    # The last bin is open-ended. Its upper edge has to stay above 499.5
    # even when no publication reaches 500 citations, otherwise the edges
    # are not monotonic and numpy.histogram raises a ValueError.
    # 500.5 comes first so that it also wins against a NaN maximum.
    upper_edge = max(500.5, citations.max() + 1)
    bins = [-0.5, 0.5, 9.5, 49.5, 99.5, 249.5, 499.5, upper_edge]

    hist, _ = np.histogram(citations, bins)
    index = np.arange(len(hist))

    # plot histogram
    mplstyle()

    fig = Figure()
    ax = fig.subplots(nrows=1, ncols=1)

    ax.bar(index, hist)
    ax.minorticks_on()

    xlabels = ("0",
               "1 à 9",
               "10 à 49",
               "50 à 99",
               "100 à 249",
               "250 à 499",
               "+500")

    # pin one tick under each bar so that labels cannot drift
    # with the automatic tick locator
    ax.set_xticks(index)
    ax.set_xticklabels(xlabels)

    ax.grid(True)

    ax.set_xlabel("Number of citations", horizontalalignment='right', x=1.)
    ax.set_ylabel("Number of publications", horizontalalignment='right', y=1.)

    # no minor ticks between the bars
    ax.xaxis.set_minor_locator(ticker.NullLocator())

    # legend
    # guard against h_index returning None
    hvalue = h_index(df)
    htext = "n/a" if hvalue is None else f"{hvalue:.1f}"

    txt = [
        f"{T('period:')} {df.year.min()} - {df.year.max()}",
        f"{T('articles analysed:')} {len(df)}",
        # the translation key already ends with a colon
        f"$\\sum${T('citations:')} {citations.sum()}",
        f"{T('citations/article (avg):')} {citations.mean():.1f}",
        f"h-index: {htext}"]

    ax.text(
        0.62, 0.80, "\n".join(txt),
        bbox=dict(facecolor="white", alpha=0.5),
        family="monospace",
        fontsize=7,
        transform=ax.transAxes)

    # reset_index makes scan reachable when it is an index level
    last_scan = df.reset_index().scan.max()
    msg = f"source: inspirehep.net -- last scan: {last_scan:%d %b %Y}"

    ax.text(0.01, 1.01, msg,
            fontsize=7,
            fontstyle="italic",
            transform=ax.transAxes)

    return fig
......@@ -7,7 +7,7 @@
#--------------------------------------------------------------------------
import urllib.parse
# encode special charactier bo be used in the image URI
# encode special characters to be used in the image URI
data = urllib.parse.quote(data)
}}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment