Commit c18fe61b authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Update controller graphs/citations to add estimator vs time.

parent 03db8cdc
......@@ -12,7 +12,6 @@ from graph_tools import (FROM_TO,
LABELY_YEAR,
linechart,
mplstyle,
savefig,
stackchart)
from models.selector import YEAR_SUBMIT
......@@ -21,10 +20,23 @@ def citations():
"""Return graphs showing citations
"""
from citations_tools import get_acl, histogram
import pandas as pd
from citations_tools import (estimator_versus_time,
get_acl,
h_index,
histogram)
from plugin_dbui import Selector
# select articles stored in inspirehep according to user criteria
mplstyle()
pd.set_option("display.width", None)
pd.set_option("display.max_rows", 500)
# ........................................................................
#
# user criteria
#
selector = Selector(
virtdb.graph_citation_selector,
exclude_fields=("id_graphs", "year_start", "year_end"))
......@@ -40,21 +52,87 @@ def citations():
elif isinstance(year_start, int):
selector.append_query(db.publications.year == year_start)
# ........................................................................
#
# prepare list of publications per scan
#
acl = get_acl(db, selector)
# histograms citations for the last scan
df = (acl
.groupby(["id_publication", "scan"])
.last())
fig = histogram(df)
# save the figure
buf = io.BytesIO()
fig.savefig(buf, format="svg")
data = base64.b64encode(buf.getbuffer()).decode("ascii")
return dict(data=data)
[["id_publication", "scan", "citations"]]
.set_index("id_publication"))
dct = {}
for scan in acl.scan.unique():
dct[scan] = (df[df.scan <= scan]
.citations
.groupby("id_publication").max())
df = pd.DataFrame(dct)
df = df[sorted(df.columns)]
last_scan = df.columns[-1]
legend = [
f"{T('period:')} {acl.year.min()} - {acl.year.max()}",
f"{T('articles analysed:')} {df[last_scan].count()}"]
# ........................................................................
#
# histogram citations for the last scan
#
fig1 = histogram(df[last_scan], last_scan, legend)
# ........................................................................
#
# Sum of citations as a function of time
#
fig2 = estimator_versus_time(df.sum(),
last_scan,
legend,
delta=True,
xtitle="$\sum$ citations")
# ........................................................................
#
# Average citations per article as a function of time
#
fig3 = estimator_versus_time(df.mean(),
last_scan,
legend,
xtitle="mean citations / article")
# ........................................................................
#
# h-index as a function of time
#
fig4 = estimator_versus_time(df.apply(h_index),
last_scan,
legend,
xtitle="h-index")
# ........................................................................
#
# rendering
#
dct = {}
buf1 = io.BytesIO()
fig1.savefig(buf1, format="svg")
dct["fig1"] = base64.b64encode(buf1.getbuffer()).decode("ascii")
buf2 = io.BytesIO()
fig2.savefig(buf2, format="svg")
dct["fig2"] = base64.b64encode(buf2.getbuffer()).decode("ascii")
buf3 = io.BytesIO()
fig3.savefig(buf3, format="svg")
dct["fig3"] = base64.b64encode(buf3.getbuffer()).decode("ascii")
buf4 = io.BytesIO()
fig4.savefig(buf4, format="svg")
dct["fig4"] = base64.b64encode(buf4.getbuffer()).decode("ascii")
return dict(**dct)
def dashboard():
......
......@@ -473,7 +473,7 @@ def update_citations():
# interrogate inspirehep.net
try:
url = rex_ins.search(row.origin).group(1)
rep = store.interogate(url, timeout=30, **kwargs)
rep = store.interogate(url, timeout=60, **kwargs)
count = rep.json()[0].get("number_of_citations")
except AttributeError:
......
......@@ -5,12 +5,82 @@ import numpy as np
import pandas as pd
from datetime import timedelta
from gluon import current
from graph_tools import mplstyle
from matplotlib.figure import Figure
import matplotlib.ticker as ticker
def estimator_versus_time(ts, last_scan, legend, delta=False, xtitle=""):
    """Plot a citations estimator as a function of time.

    Args:
        ts (pandas.Series):
            values of the estimator. The index is used as the time axis
            and must support the subtraction of a ``timedelta``.
        last_scan (datetime.datetime):
            date of the last scan updating citations,
            shown in the text above the plot
        legend (list):
            lines of text displayed in the legend box
        delta (bool):
            when true, add a second plot showing the difference
            between consecutive values, y(n) - y(n-1)
        xtitle (str):
            label of the vertical axis of the main plot

    Returns:
        matplotlib.figure.Figure

    """
    # a series with a single point is extended by the same value
    # one day earlier, so that a line can be drawn
    if len(ts) == 1:
        first = ts.index[0]
        # explicit positional access: ts[0] on a date-like index relies on
        # a positional fallback which is deprecated in recent pandas
        value = ts.iloc[0]
        ts = pd.Series((value, value), (first - timedelta(days=1), first))

    fig = Figure()

    if delta:
        grid = fig.subplots(nrows=2, ncols=1)
        ax = grid[0]
    else:
        ax = fig.subplots(nrows=1, ncols=1)

    ts.plot(ax=ax, grid=True, drawstyle="steps-mid")

    ax.minorticks_on()
    ax.set_xlabel("time", horizontalalignment='right', x=1.)
    ax.set_ylabel(xtitle, horizontalalignment='right', y=1.)

    # legend (placed lower when the figure is split in two plots)
    ytxt = (0.80 if delta else 0.91)

    ax.text(
        0.03, ytxt, "\n".join(legend),
        bbox=dict(facecolor="white", alpha=0.5),
        family="monospace",
        fontsize=7,
        transform=ax.transAxes)

    msg = f"source: inspirehep.net -- last scan: {last_scan:%d %b %Y}"
    ax.text(0.01, 1.01, msg,
            fontsize=7,
            fontstyle="italic",
            transform=ax.transAxes)

    if not delta:
        return fig

    # second plot: difference between consecutive values
    ax = grid[1]
    ts.diff().plot(ax=ax,
                   grid=True,
                   linewidth=0,
                   marker="d", markerfacecolor="orange", markersize=8,
                   ylim=(-0.01, None))

    ax.minorticks_on()
    ax.set_xlabel("time", horizontalalignment='right', x=1.)
    # raw string: "\D" is not a valid escape sequence in a plain literal
    ax.set_ylabel(r"$\Delta(y_n - y_{n-1})$",
                  horizontalalignment='right', y=1.)

    return fig
def get_acl(db, selector):
"""Return articles with their number of citations matching user criteria.
......@@ -66,52 +136,52 @@ def get_acl(db, selector):
return pd.DataFrame(data, columns=columns)
def h_index(sr):
    """h-index for the given set of publications.

    The publications are ranked by decreasing number of citations.
    The h-index is the rank at which the number of citations equals
    the rank. When the equality falls between two consecutive ranks,
    the value is obtained by linear interpolation between them.

    Args:
        sr (pandas.Series):
            number of citations per publication.
            Missing values (NaN) are ignored.

    Returns:
        float:
            the h-index. It is the number of publications when every
            publication has more citations than its rank, and 0 for
            an empty set.

    """
    data = (sr.dropna()
              .sort_values(ascending=False)
              .reset_index(drop=True))

    # number of citations of the publication at the previous rank
    previous = None

    for npubli, ncitations in enumerate(data, start=1):

        if ncitations == npubli:
            return float(npubli)

        if ncitations < npubli:
            n1 = npubli - 1

            # the most cited publication has less than one citation
            if n1 == 0:
                return 0.0

            # Straight line through (n1, c1) and (n1 + 1, c2),
            # intersected with citations == rank.
            # At this point c1 > n1, otherwise the loop would have
            # returned at the previous rank. The slope is <= 0,
            # thus the denominator is never null.
            c1, c2 = previous, ncitations
            slope = c2 - c1
            return float((c1 - slope * n1) / (1 - slope))

        previous = ncitations

    # not enough publications
    # take the number of publications as estimator
    # the true h-index is bigger
    return float(len(data))
def histogram(df):
def histogram(citations, last_scan, legend):
"""histogram the number of citation à la inspirehep.
Args:
df (pandas.DataFrame):
* id_publication (int)
* year (int)
* scan (datetime.date)
* citations (int)
* domain (str)
* team (str)
* project (str)
citations (pandas.Series):
* index is id_publication
* value is the number of citations
last_scan (datetime.datetime)
date of the last scan updating citations
legend (list):
Returns:
matplotlib.Figure
......@@ -120,21 +190,22 @@ def histogram(df):
T = current.T
# build the histogram
citations = df.citations
bins = [-0.5, 0.5, 9.5, 49.5, 99.5, 249.5, 499.5, citations.max()+1]
upper_limit = citations.max()+1
upper_limit = (500 if upper_limit <= 500 else upper_limit)
bins = [-0.5, 0.5, 9.5, 49.5, 99.5, 249.5, 499.5, upper_limit]
hist, dummy = np.histogram(citations, bins)
index = np.arange(len(hist))
# plot histogram
mplstyle()
fig = Figure()
ax = fig.subplots(nrows=1, ncols=1)
ax.bar(index, hist)
df = pd.DataFrame({"all": hist})
df.plot.bar(ax=ax, grid=True, legend=False, rot=0, width=0.9)
ax.minorticks_on()
xlabels = ("",
"0",
xlabels = ("0",
"1 à 9",
"10 à 49",
"50 à 99",
......@@ -143,7 +214,6 @@ def histogram(df):
"+500")
ax.xaxis.set_ticklabels(xlabels)
ax.grid(True)
ax.set_xlabel("Number of citations", horizontalalignment='right', x=1.)
ax.set_ylabel("Number of publications", horizontalalignment='right', y=1.)
......@@ -151,21 +221,19 @@ def histogram(df):
ax.xaxis.set_minor_locator(ticker.NullLocator())
# legend
txt = [
f"{T('period:')} {df.year.min()} - {df.year.max()}",
f"{T('articles analysed:')} {len(df)}",
txt = list(legend)
txt.extend([
f"$\sum${T('citations:')}: {citations.sum()}",
f"{T('citations/article (avg):')} {citations.mean():.1f}",
f"h-index: {h_index(df):.1f}"]
f"h-index: {h_index(citations):.1f}"])
ax.text(
0.62, 0.80, "\n".join(txt),
0.62, 0.82, "\n".join(txt),
bbox=dict(facecolor="white", alpha=0.5),
family="monospace",
fontsize=7,
transform=ax.transAxes)
last_scan = df.reset_index().scan.max()
msg = f"source: inspirehep.net -- last scan: {last_scan:%d %b %Y}"
ax.text(0.01, 1.01, msg,
fontsize=7,
......
......@@ -8,10 +8,40 @@
import urllib.parse
# encode special characters to be used in the image URI
data = urllib.parse.quote(data)
fig1 = urllib.parse.quote(fig1)
fig2 = urllib.parse.quote(fig2)
fig3 = urllib.parse.quote(fig3)
fig4 = urllib.parse.quote(fig4)
}}
<img src="data:image/svg+xml;base64,{{=data}}" height=400></img>
<h2 class="dbui-h2" style="margin-bottom: 0ex; font-variant: small-caps;">
1 General
</h2>
<img src="data:image/svg+xml;base64,{{=fig1}}" height=400>
<h2 class="dbui-h2" style="margin-bottom: 0ex; font-variant: small-caps;">
2 Evolution of estimators as function of time
</h2>
<h2 class="dbui-h2" style="margin-bottom: 0ex;margin-top: 1ex;">
2.1 Total number of citations
</h2>
<img src="data:image/svg+xml;base64,{{=fig2}}" height=400>
<h2 class="dbui-h2" style="margin-bottom: 0ex;margin-top: 0ex;">
2.2 Average number of citations per article
</h2>
<img src="data:image/svg+xml;base64,{{=fig3}}" height=400>
<h2 class="dbui-h2" style="margin-bottom: 0ex;margin-top: 0ex;">
2.3 h-index
</h2>
<img src="data:image/svg+xml;base64,{{=fig4}}" height=400>
{{
from datetime import datetime
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment