Commit 00c76f28 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Merge branch '78-citations' into 'master'

Resolve "Add the number of citations"

Closes #78

See merge request !101
parents 6ce6e0d3 90af58ed
""" Controllers for cron jobs
"""
import json
from pathlib import Path
from plugin_dbui import JSONEncoder
# location of the cron-job log files, relative to the application folder
PATH_CITATIONS_LOG = "static/cron/citations"
def citations():
    """Display last logs of the cron job updating citations table.

    The log file is parsed into ``item: value`` pairs and rendered
    through an ``Ext.grid.Panel`` backed by an ``Ext.data.ArrayStore``.
    """
    filename = f"{request.application}.log"
    path = Path(request.folder, PATH_CITATIONS_LOG, filename)

    if not path.exists():
        return "No log for the cron job updating citations table!!!"

    # configure the Ext.data.ArrayStore
    # keep only "item: value" lines; translate the item label
    data = []
    for line in path.read_text().split("\n"):
        if ":" not in line:
            continue
        item, _, value = line.partition(":")
        data.append([T(item.replace("\t", "").strip()), value.strip()])

    # remove the name of the application
    if data:
        del data[0]

    # improve scan duration: drop the fractional part of the seconds
    if len(data) > 3 and "." in data[2][1]:
        duration = data[2][1]
        data[2][1] = duration[:duration.index(".")]

    fields = [
        {"name": "item", "type": "string"},
        {"name": "value", "type": "string"}]

    cfg = dict(fields=fields, data=data)

    # configure the Ext.grid.Panel
    columns = [
        {"text": "item", "dataIndex": "item", "flex": 1},
        {"text": "value", "dataIndex": "value", "flex": 0.8, "align": "end"}]

    return dict(cfg_store=json.dumps(cfg, cls=JSONEncoder),
                columns=json.dumps(columns, cls=JSONEncoder))
""" Controllers for building graphs using pandas library
"""
import base64
import io
from matplotlib.figure import Figure
from citations_tools import (PLOT_CITATIONS_AVG,
PLOT_CITATIONS_HINDEX,
PLOT_CITATIONS_OVERVIEW,
PLOT_CITATIONS_TOTAL)
from gluon import current
from gluon.storage import Storage
from graph_tools import (FROM_TO,
LABELY_YEAR,
linechart,
mplstyle,
savefig,
stackchart)
from matplotlib.figure import Figure
from models.selector import YEAR_SUBMIT
def citations():
    """Generate the citations figure selected by the user.

    The kind of plot (overview histogram, total, average or h-index
    versus time) and the year range come from the
    ``virtdb.graph_citation_selector`` form.

    Returns:
        dict:
            * fig (matplotlib.Figure): the selected citations plot.
    """
    import pandas as pd
    import matplotlib as mpl

    from citations_tools import (estimator_versus_time,
                                 get_acl,
                                 h_index,
                                 histogram)
    from plugin_dbui import Selector

    mplstyle()

    # rendering of the date tick labels for every time scale
    mpl.rcParams['date.autoformatter.year'] = '%Y'
    mpl.rcParams['date.autoformatter.month'] = '%b'
    mpl.rcParams['date.autoformatter.day'] = '%d %b'
    mpl.rcParams['date.autoformatter.hour'] = '%b %d %H'
    mpl.rcParams['date.autoformatter.minute'] = '%H:%M'
    mpl.rcParams['date.autoformatter.second'] = '%H:%M:%S'

    pd.set_option("display.width", None)
    pd.set_option("display.max_rows", 500)

    # ........................................................................
    #
    # user criteria
    #
    selector = Selector(
        virtdb.graph_citation_selector,
        exclude_fields=("graphs", "year_start", "year_end"))

    year_start = selector.year_start
    year_end = selector.year_end

    # restrict the publications to the requested year range;
    # a single year when only year_start is given
    if isinstance(year_start, int) and isinstance(year_end, int):
        query = db.publications.year >= year_start
        query &= db.publications.year <= year_end
        selector.append_query(query)

    elif isinstance(year_start, int):
        selector.append_query(db.publications.year == year_start)

    # ........................................................................
    #
    # prepare list of publications per scan
    #
    acl = get_acl(db, selector)

    cols = ["id_publication", "scan", "citations"]
    idx = ["id_publication"]
    fgroupby = False  # placeholder: per-group estimators are not active yet

    df = (acl[cols].set_index(idx))

    # one column per scan date: for each publication keep the largest
    # citation count observed up to that scan
    dct = {}
    for scan in acl.scan.unique():
        dct[scan] = (df[df.scan <= scan]
                     .citations
                     .groupby(idx).max())

    df = pd.DataFrame(dct)
    df = df[sorted(df.columns)]
    last_scan = df.columns[-1]

    legend = [
        f"{T('period:')} {acl.year.min()} - {acl.year.max()}",
        f"{T('articles analysed:')} {df[last_scan].count()}"]

    # ........................................................................
    #
    # histogram citations for the last scan
    #
    if selector.graphs == T(PLOT_CITATIONS_OVERVIEW):
        fig = histogram(df[last_scan], last_scan, legend)

    # ........................................................................
    #
    # Sum of citations as a function of time
    #
    elif selector.graphs == T(PLOT_CITATIONS_TOTAL):
        data = (df.groupby(level=1).sum().T if fgroupby else df.sum())
        fig = estimator_versus_time(data,
                                    last_scan,
                                    legend,
                                    delta=not fgroupby,
                                    logy=fgroupby,
                                    # raw string: "\s" is an invalid escape,
                                    # the TeX source must reach matplotlib
                                    xtitle=r"$\sum$ citations")

    # ........................................................................
    #
    # Average citations per article as a function of time
    #
    elif selector.graphs == T(PLOT_CITATIONS_AVG):
        data = (df.groupby(level=1).mean().T if fgroupby else df.mean())
        fig = estimator_versus_time(data,
                                    last_scan,
                                    legend,
                                    xtitle="mean citations / article")

    # ........................................................................
    #
    # h-index as a function of time
    #
    elif selector.graphs == T(PLOT_CITATIONS_HINDEX):
        data = (df.groupby(level=1).agg(h_index).T if fgroupby
                else df.apply(h_index))
        fig = estimator_versus_time(data,
                                    last_scan,
                                    legend,
                                    xtitle="h-index")

    # ........................................................................
    #
    # rendering
    #
    # NOTE(review): ``fig`` is unbound when selector.graphs matches none of
    # the known plots — presumably the selector widget restricts the choices;
    # confirm against the virtdb definition.
    response.view = f"graphs/index.{request.extension}"
    dct = dict(fig=fig)
    return dct
def dashboard():
"""Return a pre-configure linechart for public used.
Cumulative distribution for the publications are shown for
the current year.
Returns:
dict:
* fig (matplotlib.Figure)
"""
current_year = request.now.year
dashboard_start = current.app.dashboard_start_year
......@@ -73,12 +209,8 @@ def dashboard():
linechart(db, selector, target=ax2, title=title)
# delegate the rendering to the view
buf = io.BytesIO()
fig.savefig(buf, format="svg")
data = base64.b64encode(buf.getbuffer()).decode("ascii")
response.view = "graphs/index.html"
return dict(data=data)
return dict(fig=fig)
def publications_versus_time():
......@@ -103,12 +235,5 @@ def publications_versus_time():
linechart(db, selector, target=ax)
# delegate the rendering to the view
extension = request.extension
fmt = ("svg" if extension == "html" else extension)
buf = io.BytesIO()
fig.savefig(buf, format=fmt)
data = base64.b64encode(buf.getbuffer()).decode("ascii")
response.view = "graphs/index.%s" % extension
return dict(data=data)
response.view = f"graphs/index.{request.extension}"
return dict(fig=fig)
......@@ -492,15 +492,28 @@ def run_all():
for harvester in harvesters:
id_teams = harvester.id_teams
id_projects = harvester.id_projects
controller = harvester.controller
id_categories = harvester.id_categories
logger.info("-"*79)
logger.info(f"run harvester {harvester.controller}")
logger.info(f"run harvester {controller}")
logger.info(f" team: {id_teams}")
logger.info(f" project: {id_projects}")
logger.info(f" controller: {controller}")
logger.info(f" category: {id_categories}")
if id_teams is None or id_projects is None or id_categories is None:
continue
tool = build_harvester_tool(
db,
harvester.id_teams,
harvester.id_projects,
harvester.controller,
harvester.id_categories,
id_teams,
id_projects,
controller,
id_categories,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == MODE_DRY_RUN))
......@@ -514,15 +527,23 @@ def run_all():
logs.extend(tool.logs)
except ToolException as e:
log = tool.logs[-1]
msg = "<h4>Error on record %s (%s)</h4>" % (log.url, log.collection)
msg = ""
if len(tool.logs) > 0:
log = tool.logs[-1]
msg = f"<h4>Error on record {log.url} ({log.collection})</h4>"
msg += T(str(e))
logger.error(msg)
logger.info("-"*79)
return msg
except Exception as e:
msg = "<hr/>"
msg += CODE(traceback.format_exc()).xml()
msg += "<hr/>"
logger.error(msg)
logger.info("-"*79)
return msg
if logger.getEffectiveLevel() <= logging.INFO:
......
......@@ -130,3 +130,116 @@ def index():
return dict(cfg_store=json.dumps(cfg, cls=JSONEncoder),
team_project=repr_team_project(db, selector),
selector=selector)
def top_citations():
    """Controller to display publications with highest citations score.

    The selection (author role, team, project, year range, size of the
    top list) comes from the ``virtdb.top_citations_selector`` form.
    The result feeds an ``Ext.data.ArrayStore`` / ``Ext.grid.Panel`` pair.
    """
    citations = db.citations
    projects = db.projects
    publications = db.publications
    publishers = db.publishers
    teams = db.teams

    # get the user constraint.
    # the selector fields year_start, year_end are excluded
    # from the selector query.
    tpl = ("id_authors_roles",
           "id_teams",
           "id_projects",
           "top",
           "year_start", "year_end")

    selector = Selector(virtdb.top_citations_selector, exclude_fields=tpl)

    # query directive for the citations table
    # join citations with publications, projects, teams and publishers
    query = selector.query(citations)
    query &= publications.id_projects == projects.id
    query &= publications.id_teams == teams.id
    query &= publications.id_publishers == publishers.id

    # optional constraints — empty selector values mean "no filter"
    id_role = selector.id_authors_roles
    if len(id_role) > 0:
        query &= publications.id_authors_roles == id_role

    id_team = selector.id_teams
    if len(id_team) > 0:
        query &= publications.id_teams == id_team

    id_project = selector.id_projects
    if len(id_project) > 0:
        query &= publications.id_projects == id_project

    # year range: a single year when only year_start is given
    if selector.year_start and not selector.year_end:
        query &= (publications.year == selector.year_start)

    elif selector.year_start and selector.year_end:
        q_start = publications.year >= selector.year_start
        q_end = publications.year <= selector.year_end
        query &= ((q_start) & (q_end))

    # configure the Ext.data.ArrayStore for the grid
    # keep, per publication, the largest citation count over all scans,
    # order by decreasing count and keep the top N rows
    max_count = citations.count.max()

    irows = db(query).iterselect(
        teams.domain,
        teams.team,
        projects.project,
        publications.title,
        publishers.abbreviation,
        publications.volume,
        publications.year,
        publications.pages,
        max_count,
        groupby=citations.id_publications,
        orderby=~max_count,
        limitby=(0, selector.top))

    # NOTE(review): the aggregate is read back through the raw SQL alias in
    # ``_extra`` — presumably MySQL (backtick quoting); confirm the key if
    # the database backend ever changes.
    data = [
        [
            el.teams.domain,
            el.teams.team,
            el.projects.project,
            f"{el.publications.title}, "
            f"{el.publishers.abbreviation} {el.publications.volume} "
            f"({el.publications.year}) {el.publications.pages}",
            el._extra["MAX(`citations`.`count`)"],
            el.publications.year
        ] for el in irows]

    fields = [
        {"name": "domain", "type": "string"},
        {"name": "team", "type": "string"},
        {"name": "project", "type": "string"},
        {"name": "reference", "type": "string"},
        {"name": "citations", "type": "int"},
        {"name": "year", "type": "int"}]

    cfg = dict(fields=fields, data=data)

    # configure the grid
    columns = [
        {"xtype": "rownumberer"},
        {"text": T("domain"), "dataIndex": "domain", "flex": 1},
        {"text": T("team"), "dataIndex": "team", "flex": 0.9},
        {
            "text": T("project"),
            "dataIndex": "project",
            "flex": 0.5,
            "hidden": True
        },
        {"text": T("reference"), "dataIndex": "reference", "flex": 2.5},
        {
            "text": T("citations"),
            "dataIndex": "citations",
            "flex": 0.5,
            "align": "end"
        },
        {"text": T("year"), "dataIndex": "year", "flex": 0.5}
    ]

    return dict(cfg_store=json.dumps(cfg, cls=JSONEncoder),
                columns=json.dumps(columns, cls=JSONEncoder),
                team_project=repr_team_project(db, selector),
                selector=selector)
......@@ -424,3 +424,117 @@ def harvester():
raise HTTP(500, msg)
return
def update_citations():
    """Update the citations table.

    * select article in the inspirehep store according to user criteria
    * get the number of citations per document
    * update the citations table

    Returns:
        dict:
            * counters (gluon.storage.Storage): statistics of the scan.
            * team_project: representation of the user selection.
    """
    import datetime
    import requests

    from invenio_tools import InvenioStore
    from json.decoder import JSONDecodeError
    from reporting_tools import repr_team_project

    citations = db.citations
    id_acl = get_id(db.categories, code="ACL")
    kwargs = dict(of="recjson", ot="number_of_citations")
    publications = db.publications
    rex_ins = re.compile(r"(https?://inspirehep.net/record/\d+)")
    store = InvenioStore("inspirehep.net")
    today = datetime.date.today()

    # per-outcome statistics reported to the user at the end
    counters = Storage(article=0,
                       http_error=0,
                       insert=0,
                       json_error=0,
                       list_size=0,
                       not_list=0,
                       url_error=0)

    # get user requirement
    selector = Selector(virtdb.citation_selector)

    # get the list of article store in the inspirehep store
    selector.append_query(publications.id_categories == id_acl)
    selector.append_query(publications.origin.contains("inspirehep"))
    query = selector.query(publications)

    # get the number of citation and update the database table
    for row in db(query).iterselect(publications.id, publications.origin):
        counters.article += 1
        logger.debug(row.origin)

        # interrogate inspirehep.net
        # the JSON answer is expected to be a one-element list
        try:
            url = rex_ins.search(row.origin).group(1)
            # "interogate" (sic) is the InvenioStore API spelling
            rep = store.interogate(url, timeout=60, **kwargs)
            lst = rep.json()

            if not isinstance(lst, list):
                # plain string: the f-string had no placeholder
                logger.warning("JSON response is not a list")
                counters.not_list += 1
                continue

            if len(lst) != 1:
                logger.warning("size of the return list is not one")
                counters.list_size += 1
                continue

            count = lst[0].get("number_of_citations")

        except AttributeError:
            # rex_ins.search returned None — origin has no inspirehep URL
            logger.warning(f"inspirehep URL not well formed {row.origin}")
            counters.url_error += 1
            continue

        except JSONDecodeError:
            logger.warning("JSON decoding error")
            counters.json_error += 1
            continue

        except requests.exceptions.RequestException:
            logger.warning(f"HTTP error interrogating {url}")
            counters.http_error += 1
            continue

        # check if the number of count changes
        myset = db(citations.id_publications == row.id)
        if not myset.isempty():
            entries = myset.select(orderby=citations.date)
            last_count = entries.last().count
            logger.debug(f"last count {last_count} new one {count}")
            if last_count == count:
                continue

        # update the citations table
        # one row per (publication, day): re-running the same day updates
        # the existing row instead of inserting a new one
        logger.info(f"update {url} citations to {count}")
        counters.insert += 1

        idpubli = row.id
        citations.update_or_insert(
            (citations.date == today) & (citations.id_publications == idpubli),
            date=today,
            id_publications=idpubli,
            count=count)
        db.commit()

    # inform the user
    logger.info(f" number of article: {counters.article}")
    logger.info(f" bad inspirehep URL: {counters.url_error}")
    # fix: report http_error (the original logged url_error twice)
    logger.info(f" HTTP connection error: {counters.http_error}")
    logger.info(f" JSON decoding error: {counters.json_error}")
    logger.info(f" response is not a list: {counters.not_list}")
    logger.info(f" list size is not one: {counters.list_size}")
    logger.info(f" insert or update in db: {counters.insert}")

    return dict(counters=counters,
                team_project=repr_team_project(db, selector))
docs/db_schema/database.png

251 KB | W: | H:

docs/db_schema/database.png

221 KB | W: | H:

docs/db_schema/database.png
docs/db_schema/database.png
docs/db_schema/database.png
docs/db_schema/database.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -26,7 +26,7 @@
<type label="Upload" length="0" sql="upload" quote="'"/>
<type label="Password" length="0" sql="password" quote="'"/>
</group>
</datatypes><table x="70" y="142" name="publishers">
</datatypes><table x="71" y="206" name="publishers">
<row name="id" null="0" autoincrement="1">
<datatype>integer</datatype>
</row>
......@@ -583,4 +583,22 @@
<part>id</part>
</key>
</table>
<table x="79" y="70" name="citations">
<row name="id" null="1" autoincrement="1">
<datatype>integer</datatype>
<default>NULL</default></row>
<row name="date" null="0" autoincrement="0">
<datatype>datetime</datatype>
<default>'NULL'</default></row>
<row name="count" null="0" autoincrement="0">
<datatype>integer</datatype>
</row>
<row name="id_publications" null="0" autoincrement="0">
<datatype>integer</datatype>
<relation table="publications" row="id" />
</row>
<key type="PRIMARY" name="">
<part>id</part>
</key>
</table>
</sql>
.. include:: hyperlinks.txt
Les citations
-------------
Pour chaque article stocké dans *inspirehep.net*, le comptage du
nombre de citations est mis à jour régulièrement.
Limbra interroge *inspirehep.net* afin de stocker l'évolution
du nombre de citations en fonction du temps, pour chaque article.
Cette procédure est automatisée. Elle a lieu tous les deux à trois jours.
Plusieurs actions sont disponibles pour manipuler cette information:
* ``Rapports > citations``
génère des graphes globaux pour le laboratoire, pour une équipe, *etc*
* ``Rapports > top citations``
génère une table avec les articles les plus cités.
Cette action permet aussi d'extraire des données au format CSV
pour des analyses détaillées.
* ``taches planifiées > citations``