Commit 32f27735 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Merge branch 'master' into 'production'

Release 1.3.0

See merge request !104
parents 2e45be32 3cd2a6b5
...@@ -2,6 +2,16 @@ ...@@ -2,6 +2,16 @@
HEAD HEAD
1.3.0 (Apr 2020)
- Major release to work with the new inspirehep version.
The inspirehep store is divided into two "shelves": literature and conferences,
and provides a new API to search and to retrieve records.
The new API is open for conferences but not for literature.
Publications are in literature. In order to get them, we have to
use old.inspirehep.net which is based on the old API.
The store cds.cern.ch continues to work with the old API.
- Redesign the InvenioStore and its factory.
1.2.0 (Mar 2020) 1.2.0 (Mar 2020)
- Major release to handle citations count per article - Major release to handle citations count per article
- Several selectors, controllers and views have been added - Several selectors, controllers and views have been added
......
1.2.0 1.3.0
\ No newline at end of file \ No newline at end of file
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
import json import json
import re import re
from check_tools import check_publication from check_tools import check_publication
from gluon.storage import Storage from gluon.storage import Storage
from harvest_tools import DRY_RUN from harvest_tools import DRY_RUN
...@@ -18,7 +17,6 @@ from plugin_dbui import (CALLBACK_ERRORS, ...@@ -18,7 +17,6 @@ from plugin_dbui import (CALLBACK_ERRORS,
Selector, Selector,
to_fields) to_fields)
MODE_DRY_RUN = T(DRY_RUN) MODE_DRY_RUN = T(DRY_RUN)
MSG_NO_AUTHORS = "<br><br>Removing affiliation failed.<br>"\ MSG_NO_AUTHORS = "<br><br>Removing affiliation failed.<br>"\
"Use INSPIRES instead with the tool 'insert RECJSON'" "Use INSPIRES instead with the tool 'insert RECJSON'"
...@@ -437,36 +435,33 @@ def update_citations(): ...@@ -437,36 +435,33 @@ def update_citations():
import datetime import datetime
import requests import requests
from invenio_tools import InvenioStore from invenio_tools import CdsException, InvenioStore
from json.decoder import JSONDecodeError from json.decoder import JSONDecodeError
from reporting_tools import repr_team_project from reporting_tools import repr_team_project
today = datetime.date.today()
rex_ins = re.compile(r"https?://inspirehep.net/record/(\d+)")
citations = db.citations citations = db.citations
id_acl = get_id(db.categories, code="ACL")
kwargs = dict(of="recjson", ot="number_of_citations")
publications = db.publications publications = db.publications
rex_ins = re.compile(r"(https?://inspirehep.net/record/\d+)")
store = InvenioStore("inspirehep.net")
today = datetime.date.today()
counters = Storage(article=0, counters = Storage(article=0,
http_error=0, recid=0,
insert=0, failed=0,
json_error=0, insert=0)
list_size=0,
not_list=0,
url_error=0)
# get user requirement # get user requirement
selector = Selector(virtdb.citation_selector) selector = Selector(virtdb.citation_selector)
# get the list of article store in the inspirehep store # get the list of article store in the inspirehep store
id_acl = get_id(db.categories, code="ACL")
selector.append_query(publications.id_categories == id_acl) selector.append_query(publications.id_categories == id_acl)
selector.append_query(publications.origin.contains("inspirehep")) selector.append_query(publications.origin.contains("inspirehep"))
query = selector.query(publications) query = selector.query(publications)
# get the number of citation and update the database table # get the number of citation and update the database table
store = InvenioStore("inspirehep.net", shelf="literature")
for row in db(query).iterselect(publications.id, publications.origin): for row in db(query).iterselect(publications.id, publications.origin):
counters.article += 1 counters.article += 1
...@@ -474,35 +469,17 @@ def update_citations(): ...@@ -474,35 +469,17 @@ def update_citations():
# interrogate inspirehep.net # interrogate inspirehep.net
try: try:
url = rex_ins.search(row.origin).group(1) recid = rex_ins.search(row.origin).group(1)
rep = store.interogate(url, timeout=60, **kwargs) count = store.get_field(recid, "number_of_citations")
lst = rep.json()
if not isinstance(lst, list):
logger.warning(f"JSON response is not a list")
counters.not_list += 1
continue
if len(lst) != 1:
logger.warning(f"size of the return list is not one")
counters.list_size += 1
continue
count = lst[0].get("number_of_citations")
except AttributeError: except AttributeError:
logger.warning(f"inspirehep URL not well formed {row.origin}") logger.warning(f"record identifier not found in {row.origin}")
counters.url_error += 1 counters.recid += 1
continue
except JSONDecodeError:
logger.warning("JSON decoding error")
counters.json_error += 1
continue continue
except requests.exceptions.RequestException: except CdsException:
logger.warning(f"HTTP error interrogating {url}") logger.warning(f"failed to get citations for {recid}")
counters.http_error += 1 counters.failed += 1
continue continue
# check if the number of count changes # check if the number of count changes
...@@ -515,7 +492,7 @@ def update_citations(): ...@@ -515,7 +492,7 @@ def update_citations():
continue continue
# update the citations table # update the citations table
logger.info(f"update {url} citations to {count}") logger.info(f"update citations for {recid} by {count - last_count}")
counters.insert += 1 counters.insert += 1
idpubli = row.id idpubli = row.id
...@@ -529,11 +506,8 @@ def update_citations(): ...@@ -529,11 +506,8 @@ def update_citations():
# inform the user # inform the user
logger.info(f" number of article: {counters.article}") logger.info(f" number of article: {counters.article}")
logger.info(f" bad inspirehep URL: {counters.url_error}") logger.info(f" bad record identifier: {counters.recid}")
logger.info(f" HTTP connection error: {counters.url_error}") logger.info(f" failed to get citations: {counters.failed}")
logger.info(f" JSON decoding error: {counters.json_error}")
logger.info(f" response is not a list: {counters.not_list}")
logger.info(f" list size is not one: {counters.list_size}")
logger.info(f" insert or update in db: {counters.insert}") logger.info(f" insert or update in db: {counters.insert}")
return dict(counters=counters, return dict(counters=counters,
......
...@@ -10,10 +10,12 @@ invenio_tools.inveniostore.InvenioStore ...@@ -10,10 +10,12 @@ invenio_tools.inveniostore.InvenioStore
.. autosummary:: .. autosummary::
:toctree: inveniostore/ :toctree: inveniostore/
~InvenioStore.get_field
~InvenioStore.get_ids ~InvenioStore.get_ids
~InvenioStore.get_record ~InvenioStore.get_record
~InvenioStore.interogate ~InvenioStore.interrogate
~InvenioStore.last_search_url ~InvenioStore.last_search_url
~InvenioStore.search
......
invenio_tools.inveniostore.InvenioStore.get_field
=================================================
.. currentmodule:: invenio_tools.inveniostore
.. automethod:: InvenioStore.get_field
invenio_tools.inveniostore.InvenioStore.interogate
==================================================
.. currentmodule:: invenio_tools.inveniostore
.. automethod:: InvenioStore.interogate
\ No newline at end of file
invenio_tools.inveniostore.InvenioStore.interrogate
===================================================
.. currentmodule:: invenio_tools.inveniostore
.. automethod:: InvenioStore.interrogate
invenio_tools.inveniostore.InvenioStore.search
===============================================
.. currentmodule:: invenio_tools.inveniostore
.. automethod:: InvenioStore.search
...@@ -89,6 +89,7 @@ ...@@ -89,6 +89,7 @@
'Axis Label Converters': 'Axis Label Converters', 'Axis Label Converters': 'Axis Label Converters',
'Axis values are used to defined the title of each level in section. This field allows to replace some values by another ones. Rule: label1: "value1", label2: "value2",....': "Le nom de l'axe est utilisé comme titre pour le niveau. Ce champ permet de remplacer cette valeur par une autre. Régle: label1: 'value1', label2: 'value2',....", 'Axis values are used to defined the title of each level in section. This field allows to replace some values by another ones. Rule: label1: "value1", label2: "value2",....': "Le nom de l'axe est utilisé comme titre pour le niveau. Ce champ permet de remplacer cette valeur par une autre. Régle: label1: 'value1', label2: 'value2',....",
'Axis Vertical': 'Axis Vertical', 'Axis Vertical': 'Axis Vertical',
'Bad record identifer': 'Mauvais identifiant',
'basic': 'basic', 'basic': 'basic',
'Binary files': 'fichiers binaires', 'Binary files': 'fichiers binaires',
'Book': 'Ouvrage', 'Book': 'Ouvrage',
...@@ -251,6 +252,7 @@ ...@@ -251,6 +252,7 @@
'extract authors': 'extraire les auteurs', 'extract authors': 'extraire les auteurs',
'Fail to decode HTTP response': 'Echec du decodage de la réponse HTTP', 'Fail to decode HTTP response': 'Echec du decodage de la réponse HTTP',
'Fail to insert the new record in the database.': "Echec de l'insertion d'un nouvel enregistrement dans la base de donnée.", 'Fail to insert the new record in the database.': "Echec de l'insertion d'un nouvel enregistrement dans la base de donnée.",
'Failed to get citations': 'Echec pour obtenir le nombre de citations',
'Field': 'Champ', 'Field': 'Champ',
'Fields with identical value are not listed.': 'Les champs avec des valeurs identiques ne sont pas listés.', 'Fields with identical value are not listed.': 'Les champs avec des valeurs identiques ne sont pas listés.',
'Fill': 'Remplir', 'Fill': 'Remplir',
...@@ -388,6 +390,7 @@ ...@@ -388,6 +390,7 @@
'Load in the database': 'Chargé dans la base de données', 'Load in the database': 'Chargé dans la base de données',
'Loading failed': 'Echec du chargement', 'Loading failed': 'Echec du chargement',
'Loading...': 'Chargement en cours...', 'Loading...': 'Chargement en cours...',
'log cron jobs': 'journaux des taches planifiées',
'Log In': 'Log In', 'Log In': 'Log In',
'Logged in': 'Logged in', 'Logged in': 'Logged in',
'Logged out': 'Déconnexion', 'Logged out': 'Déconnexion',
......
...@@ -5,7 +5,6 @@ import logging ...@@ -5,7 +5,6 @@ import logging
import re import re
import traceback import traceback
from .base import (MSG_FIX_ORIGIN, from .base import (MSG_FIX_ORIGIN,
MSG_IN_DB, MSG_IN_DB,
ToolException) ToolException)
...@@ -19,7 +18,6 @@ from .msg import Msg ...@@ -19,7 +18,6 @@ from .msg import Msg
from .msgcollection import MsgCollection from .msgcollection import MsgCollection
from plugin_dbui import CALLBACK_ERRORS, get_id from plugin_dbui import CALLBACK_ERRORS, get_id
MSG_NO_CAT = 'Select a "category" !!!' MSG_NO_CAT = 'Select a "category" !!!'
MSG_NO_PROJECT = 'Select a "project" !!!' MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM = 'Select a "team" !!!' MSG_NO_TEAM = 'Select a "team" !!!'
...@@ -109,6 +107,7 @@ class Automaton(object): ...@@ -109,6 +107,7 @@ class Automaton(object):
* team or project or the publication category not defined * team or project or the publication category not defined
""" """
def __init__(self, def __init__(self,
db, db,
id_team, id_team,
...@@ -549,7 +548,7 @@ class Automaton(object): ...@@ -549,7 +548,7 @@ class Automaton(object):
rec_ids = [el for el in rec_ids if func(ctitle, host, el) == 0] rec_ids = [el for el in rec_ids if func(ctitle, host, el) == 0]
# process the remaining identifiers # process the remaining identifiers
(*map(self.process_recid, rec_ids), ) (*map(self.process_recid, rec_ids),)
def process_recjson(self, recjson): def process_recjson(self, recjson):
"""Process the publication provided as a JSON record: """Process the publication provided as a JSON record:
...@@ -670,13 +669,14 @@ class Automaton(object): ...@@ -670,13 +669,14 @@ class Automaton(object):
self.harvester.collections = collections self.harvester.collections = collections
# instantiate the store # instantiate the store
self.store = InvenioStore(host) shelf = ("literature" if host == "inspirehep.net" else None)
self.store = InvenioStore(host, shelf=shelf)
# list of collections # list of collections
collections = re.sub(" *, *", ",", collections).split(",") collections = re.sub(" *, *", ",", collections).split(",")
# process # process
(*map(self.process_collection, collections), ) (*map(self.process_collection, collections),)
def report(self): def report(self):
"""Build the processing report. """Build the processing report.
......
...@@ -21,7 +21,6 @@ from invenio_tools.recordpubli import PAPER_REFERENCE_KEYS ...@@ -21,7 +21,6 @@ from invenio_tools.recordpubli import PAPER_REFERENCE_KEYS
from plugin_dbui import CLEAN_SPACES, get_id, UNDEF_ID from plugin_dbui import CLEAN_SPACES, get_id, UNDEF_ID
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.") DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")
# Decode submitted date: DD MMM YYYY or DD MM YYYY # Decode submitted date: DD MMM YYYY or DD MM YYYY
...@@ -79,6 +78,7 @@ class CheckAndFix(object): ...@@ -79,6 +78,7 @@ class CheckAndFix(object):
"""A collection of tools to check and repair the content of record. """A collection of tools to check and repair the content of record.
""" """
def __init__(self): def __init__(self):
self.db = current.db self.db = current.db
...@@ -118,13 +118,11 @@ class CheckAndFix(object): ...@@ -118,13 +118,11 @@ class CheckAndFix(object):
# CDS has the opening and closing dates encoded as 20141231 # CDS has the opening and closing dates encoded as 20141231
if "opening_date" in meeting and "closing_date" in meeting: if "opening_date" in meeting and "closing_date" in meeting:
fmt = "%Y%m%d" val = meeting["opening_date"].replace("-", "")
opening = datetime.strptime(val, "%Y%m%d")
val = meeting["opening_date"] val = meeting["closing_date"].replace("-", "")
opening = datetime.strptime(val, fmt) closing = datetime.strptime(val, "%Y%m%d")
val = meeting["closing_date"]
closing = datetime.strptime(val, fmt)
return (opening, closing) return (opening, closing)
......
# Invenio tools
This section describes keywords which can be used to search records within
an invenio store. They depend on the invenio version.
## inspirehep.net (March 2020)
New version deployed in March 2020.
The previous version is available in the store old.inspirehep.net.
## old.inspirehep.net and cds.cern.ch
Search criteria are defined by keyword arguments.
The complete list of keyword arguments can be found at
[https://old.inspirehep.net/help/hacking/search-engine-api](
https://old.inspirehep.net/help/hacking/search-engine-api)
Examples of how to use the invenio API:
[https://old.inspirehep.net/info/hep/api?ln=fr#json_fnames](
https://old.inspirehep.net/info/hep/api?ln=fr#json_fnames)
List of keywords in the JSON record:
[https://github.com/inspirehep/invenio/blob/prod/modules/bibfield/etc/atlantis.cfg](
https://github.com/inspirehep/invenio/blob/prod/modules/bibfield/etc/atlantis.cfg)
cc (str):
current collection (*e.g.* "ATLAS").
The collection the user started to search/browse from.
c (str):
collection list (*e.g.* ["Theses", "Books"]).
The collections user may have selected/deselected when
starting to search from **cc**.
ec (str):
external collection list (*e.g.* ["CiteSeer", "Google"]).
The external collections may have been selected/deselected
by the user.
p (str):
pattern to search for (*e.g.* "ellis and muon or kaon").
f (str):
field to search within (*e.g.* "author").
rg (int):
records in groups of (*e.g.* "10").
Defines how many hits per collection in the search results
page are displayed.
sf (str):
sort field (*e.g.* "title").
so (str):
sort order ("a"=ascending, "d"=descending).
sp (str):
sort pattern (*e.g.* "CERN-") -- in case there are more
values in a sort field, this argument tells which one
to prefer.
rm (str):
ranking method (*e.g.* "jif").
Defines whether results should be ranked by some known
ranking method.
of (str):
output format (*e.g.* "hb").
Usually starting "h" means HTML output (and "hb" for HTML
brief, "hd" for HTML detailed), "x" means XML output,
"t" means plain text output, "id" means no output at all but to
return list of recIDs found. (Suitable for high-level API.)
ot (str):
output only these MARC tags (*e.g.* "100,700,909C0b").
Useful if only some fields are to be shown in the
output, e.g. for library to control some fields.
as (int):
advanced search ("0" means no, "1" means yes).
Whether search was called from within the advanced search
interface.
p1 (str):
first pattern to search for in the advanced search
interface. Much like **p**.
f1 (str):
first field to search within in the advanced search
interface. Much like **f**.
m1 (str):
first matching type in the advanced search interface.
("a" all of the words, "o" any of the words, "e" exact phrase,
"p" partial phrase, "r" regular expression).
op1 (str):
first operator, to join the first and the second unit
in the advanced search interface. ("a" add, "o" or, "n" not).
p2 (str):
second pattern to search for in the advanced search
interface. Much like **p**.
f2 (str):
second field to search within in the advanced search
interface. Much like **f**.
m2 (str):
second matching type in the advanced search interface.
("a" all of the words, "o" any of the words, "e" exact phrase,
"p" partial phrase, "r" regular expression).
op2 (str):
second operator, to join the second and the third unit
in the advanced search interface. ("a" add, "o" or, "n" not).
p3 (str):
third pattern to search for in the advanced search
interface. Much like **p**.
f3 (str):
third field to search within in the advanced search
interface. Much like **f**.
m3 (str):
third matching type in the advanced search interface.
("a" all of the words, "o" any of the words, "e" exact phrase,
"p" partial phrase, "r" regular expression).
sc (int):
split by collection ("0" no, "1" yes).
Governs whether we want to present the results in a single
huge list, or split by collection.
jrec (int):
jump to record (*e.g.* "234").
Used for navigation inside the search results.
recid (int):
display record ID (*e.g.* "20000").
Do not search/browse but go straight away to the Detailed
record page for the given recID.
recidb (int):
display record ID bis (*e.g.* "20010").
If greater than "recid", then display records from recid to
recidb. Useful for example for dumping records from the
database for reformatting.
sysno (str):
display old system SYS number (*e.g.* "").
If you migrate to Invenio from another system, and store your
old SYS call numbers, you can use them instead of recid if you
wish so.
id (int):
the same as recid, in case recid is not set.
For backwards compatibility.
idb (int):
the same as recidb, in case recidb is not set.
For backwards compatibility.
sysnb (str):
the same as sysno, in case sysno is not set.
For backwards compatibility.
action (str):
action to do. "SEARCH" for searching,
"Browse" for browsing. Default is to search.
d1 (str):
first datetime in full YYYY-mm-dd HH:MM:SS format
(*e.g.* "1998-08-23 12:34:56"). Useful for search limits
on creation/modification date (see "dt" argument below).
Note that "d1" takes precedence over d1y, d1m, d1d if these
are defined.
d1y (int):
first date's year (*e.g.* "1998").
Useful for search limits on creation/modification date.
d1m (int):
first date's month (*e.g.* "08").
Useful for search limits on creation/modification date.
d1d (int):
first date's day (*e.g.* "23").
Useful for search limits on creation/modification date.
d2 (str):
second datetime in full YYYY-mm-dd HH:MM:SS format
(*e.g.* "1998-09-02 12:34:56"). Useful for search limits
on creation/modification date (see "dt" argument below).
Note that "d2" takes precedence over d2y, d2m, d2d
if these are defined.
d2y (int):
second date's year (*e.g.* "1998").
Useful for search limits on creation/modification date.
d2m (int):
second date's month (*e.g.* "09").
Useful for search limi