Commit 32f27735 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Merge branch 'master' into 'production'

Release 1.3.0

See merge request !104
parents 2e45be32 3cd2a6b5
......@@ -2,6 +2,16 @@
HEAD
1.3.0 (Apr 2020)
- Major release to work with the new inspirehep version.
The inspirehep store is divided into "shelves" (literature and conferences)
and provides a new API to search and to retrieve records.
The new API is open for conferences but not for literature.
Publications are in literature. In order to get them, we have to
use old.inspirehep.net which is based on the old API.
The store cds.cern.ch continues to work with the old API.
- Redesign the InvenioStore and its factory.
1.2.0 (Mar 2020)
- Major release to handle citations count per article
- Several selectors, controllers and views have been added
......
1.2.0
\ No newline at end of file
1.3.0
\ No newline at end of file
......@@ -4,7 +4,6 @@
import json
import re
from check_tools import check_publication
from gluon.storage import Storage
from harvest_tools import DRY_RUN
......@@ -18,7 +17,6 @@ from plugin_dbui import (CALLBACK_ERRORS,
Selector,
to_fields)
MODE_DRY_RUN = T(DRY_RUN)
MSG_NO_AUTHORS = "<br><br>Removing affiliation failed.<br>"\
"Use INSPIRES instead with the tool 'insert RECJSON'"
......@@ -437,36 +435,33 @@ def update_citations():
import datetime
import requests
from invenio_tools import InvenioStore
from invenio_tools import CdsException, InvenioStore
from json.decoder import JSONDecodeError
from reporting_tools import repr_team_project
today = datetime.date.today()
rex_ins = re.compile(r"https?://inspirehep.net/record/(\d+)")
citations = db.citations
id_acl = get_id(db.categories, code="ACL")
kwargs = dict(of="recjson", ot="number_of_citations")
publications = db.publications
rex_ins = re.compile(r"(https?://inspirehep.net/record/\d+)")
store = InvenioStore("inspirehep.net")
today = datetime.date.today()
counters = Storage(article=0,
http_error=0,
insert=0,
json_error=0,
list_size=0,
not_list=0,
url_error=0)
recid=0,
failed=0,
insert=0)
# get user requirement
selector = Selector(virtdb.citation_selector)
# get the list of article store in the inspirehep store
id_acl = get_id(db.categories, code="ACL")
selector.append_query(publications.id_categories == id_acl)
selector.append_query(publications.origin.contains("inspirehep"))
query = selector.query(publications)
# get the number of citation and update the database table
store = InvenioStore("inspirehep.net", shelf="literature")
for row in db(query).iterselect(publications.id, publications.origin):
counters.article += 1
......@@ -474,35 +469,17 @@ def update_citations():
# interrogate inspirehep.net
try:
url = rex_ins.search(row.origin).group(1)
rep = store.interogate(url, timeout=60, **kwargs)
lst = rep.json()
if not isinstance(lst, list):
logger.warning(f"JSON response is not a list")
counters.not_list += 1
continue
if len(lst) != 1:
logger.warning(f"size of the return list is not one")
counters.list_size += 1
continue
count = lst[0].get("number_of_citations")
recid = rex_ins.search(row.origin).group(1)
count = store.get_field(recid, "number_of_citations")
except AttributeError:
logger.warning(f"inspirehep URL not well formed {row.origin}")
counters.url_error += 1
continue
except JSONDecodeError:
logger.warning("JSON decoding error")
counters.json_error += 1
logger.warning(f"record identifier not found in {row.origin}")
counters.recid += 1
continue
except requests.exceptions.RequestException:
logger.warning(f"HTTP error interrogating {url}")
counters.http_error += 1
except CdsException:
logger.warning(f"failed to get citations for {recid}")
counters.failed += 1
continue
# check if the number of count changes
......@@ -515,7 +492,7 @@ def update_citations():
continue
# update the citations table
logger.info(f"update {url} citations to {count}")
logger.info(f"update citations for {recid} by {count - last_count}")
counters.insert += 1
idpubli = row.id
......@@ -529,11 +506,8 @@ def update_citations():
# inform the user
logger.info(f" number of article: {counters.article}")
logger.info(f" bad inspirehep URL: {counters.url_error}")
logger.info(f" HTTP connection error: {counters.url_error}")
logger.info(f" JSON decoding error: {counters.json_error}")
logger.info(f" response is not a list: {counters.not_list}")
logger.info(f" list size is not one: {counters.list_size}")
logger.info(f" bad record identifier: {counters.recid}")
logger.info(f" failed to get citations: {counters.failed}")
logger.info(f" insert or update in db: {counters.insert}")
return dict(counters=counters,
......
......@@ -10,10 +10,12 @@ invenio_tools.inveniostore.InvenioStore
.. autosummary::
:toctree: inveniostore/
~InvenioStore.get_field
~InvenioStore.get_ids
~InvenioStore.get_record
~InvenioStore.interogate
~InvenioStore.interrogate
~InvenioStore.last_search_url
~InvenioStore.search
......
invenio_tools.inveniostore.InvenioStore.get_field
=================================================
.. currentmodule:: invenio_tools.inveniostore
.. automethod:: InvenioStore.get_field
invenio_tools.inveniostore.InvenioStore.interogate
==================================================
.. currentmodule:: invenio_tools.inveniostore
.. automethod:: InvenioStore.interogate
\ No newline at end of file
invenio_tools.inveniostore.InvenioStore.interrogate
===================================================
.. currentmodule:: invenio_tools.inveniostore
.. automethod:: InvenioStore.interrogate
invenio_tools.inveniostore.InvenioStore.search
===============================================
.. currentmodule:: invenio_tools.inveniostore
.. automethod:: InvenioStore.search
......@@ -89,6 +89,7 @@
'Axis Label Converters': 'Axis Label Converters',
'Axis values are used to defined the title of each level in section. This field allows to replace some values by another ones. Rule: label1: "value1", label2: "value2",....': "Le nom de l'axe est utilisé comme titre pour le niveau. Ce champ permet de remplacer cette valeur par une autre. Régle: label1: 'value1', label2: 'value2',....",
'Axis Vertical': 'Axis Vertical',
'Bad record identifer': 'Mauvais identifiant',
'basic': 'basic',
'Binary files': 'fichiers binaires',
'Book': 'Ouvrage',
......@@ -251,6 +252,7 @@
'extract authors': 'extraire les auteurs',
'Fail to decode HTTP response': 'Echec du decodage de la réponse HTTP',
'Fail to insert the new record in the database.': "Echec de l'insertion d'un nouvel enregistrement dans la base de donnée.",
'Failed to get citations': 'Echec pour obtenir le nombre de citations',
'Field': 'Champ',
'Fields with identical value are not listed.': 'Les champs avec des valeurs identiques ne sont pas listés.',
'Fill': 'Remplir',
......@@ -388,6 +390,7 @@
'Load in the database': 'Chargé dans la base de données',
'Loading failed': 'Echec du chargement',
'Loading...': 'Chargement en cours...',
'log cron jobs': 'journaux des taches planifiées',
'Log In': 'Log In',
'Logged in': 'Logged in',
'Logged out': 'Déconnexion',
......
......@@ -5,7 +5,6 @@ import logging
import re
import traceback
from .base import (MSG_FIX_ORIGIN,
MSG_IN_DB,
ToolException)
......@@ -19,7 +18,6 @@ from .msg import Msg
from .msgcollection import MsgCollection
from plugin_dbui import CALLBACK_ERRORS, get_id
MSG_NO_CAT = 'Select a "category" !!!'
MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM = 'Select a "team" !!!'
......@@ -109,6 +107,7 @@ class Automaton(object):
* team or project or the publication category not defined
"""
def __init__(self,
db,
id_team,
......@@ -549,7 +548,7 @@ class Automaton(object):
rec_ids = [el for el in rec_ids if func(ctitle, host, el) == 0]
# process the remaining identifiers
(*map(self.process_recid, rec_ids), )
(*map(self.process_recid, rec_ids),)
def process_recjson(self, recjson):
"""Process the publication provided as a JSON record:
......@@ -670,13 +669,14 @@ class Automaton(object):
self.harvester.collections = collections
# instantiate the store
self.store = InvenioStore(host)
shelf = ("literature" if host == "inspirehep.net" else None)
self.store = InvenioStore(host, shelf=shelf)
# list of collections
collections = re.sub(" *, *", ",", collections).split(",")
# process
(*map(self.process_collection, collections), )
(*map(self.process_collection, collections),)
def report(self):
"""Build the processing report.
......
......@@ -21,7 +21,6 @@ from invenio_tools.recordpubli import PAPER_REFERENCE_KEYS
from plugin_dbui import CLEAN_SPACES, get_id, UNDEF_ID
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")
# Decode submitted date: DD MMM YYYY or DD MM YYY
......@@ -79,6 +78,7 @@ class CheckAndFix(object):
"""A collection of tools to check and repair the content of record.
"""
def __init__(self):
self.db = current.db
......@@ -118,13 +118,11 @@ class CheckAndFix(object):
# CDS has the opening and closing dates encoded as 20141231
if "opening_date" in meeting and "closing_date" in meeting:
fmt = "%Y%m%d"
val = meeting["opening_date"].replace("-", "")
opening = datetime.strptime(val, "%Y%m%d")
val = meeting["opening_date"]
opening = datetime.strptime(val, fmt)
val = meeting["closing_date"]
closing = datetime.strptime(val, fmt)
val = meeting["closing_date"].replace("-", "")
closing = datetime.strptime(val, "%Y%m%d")
return (opening, closing)
......
# Invenio tools
This section describes the keywords which can be used to search for records within
an invenio store. They depend on the invenio version.
## inspirehep.net (March 2020)
New version deployed in March 2020
The previous version is available in the store old.inspirehep.net.
## old.inspirehep.net and cds.cern.ch
Search criteria are defined by keyword arguments.
The complete list of keyword arguments can be found at
[https://old.inspirehep.net/help/hacking/search-engine-api](
https://old.inspirehep.net/help/hacking/search-engine-api)
Examples of how to use the invenio API:
[https://old.inspirehep.net/info/hep/api?ln=fr#json_fnames](
https://old.inspirehep.net/info/hep/api?ln=fr#json_fnames)
List of keywords in the JSON record:
[https://github.com/inspirehep/invenio/blob/prod/modules/bibfield/etc/atlantis.cfg](
https://github.com/inspirehep/invenio/blob/prod/modules/bibfield/etc/atlantis.cfg)
cc (str):
current collection (*e.g.* "ATLAS").
The collection the user started to search/browse from.
c (str):
collection list (*e.g.* ["Theses", "Books"]).
The collections user may have selected/deselected when
starting to search from **cc**.
ec (str):
external collection list (*e.g.* ["CiteSeer", "Google"]).
The external collections may have been selected/deselected
by the user.
p (str):
pattern to search for (*e.g.* "ellis and muon or kaon").
f (str):
field to search within (*e.g.* "author").
rg (int):
records in groups of (*e.g.* "10").
Defines how many hits per collection in the search results
page are displayed.
sf (str):
sort field (*e.g.* "title").
so (str):
sort order ("a"=ascending, "d"=descending).
sp (str):
sort pattern (*e.g.* "CERN-") -- in case there are more
values in a sort field, this argument tells which one
to prefer.
rm (str):
ranking method (*e.g.* "jif").
Defines whether results should be ranked by some known
ranking method.
of (str):
output format (*e.g.* "hb").
Usually starting "h" means HTML output (and "hb" for HTML
brief, "hd" for HTML detailed), "x" means XML output,
"t" means plain text output, "id" means no output at all but to
return list of recIDs found. (Suitable for high-level API.).
ot (str):
output only these MARC tags (*e.g.* "100,700,909C0b").
Useful if only some fields are to be shown in the
output, e.g. for library to control some fields.
as (int):
advanced search ("0" means no, "1" means yes).
Whether search was called from within the advanced search
interface.
p1 (str):
first pattern to search for in the advanced search
interface. Much like **p**.
f1 (str):
first field to search within in the advanced search
interface. Much like **f**.
m1 (str):
first matching type in the advanced search interface.
("a" all of the words, "o" any of the words, "e" exact phrase,
"p" partial phrase, "r" regular expression).
op1 (str):
first operator, to join the first and the second unit
in the advanced search interface. ("a" add, "o" or, "n" not).
p2 (str):
second pattern to search for in the advanced search
interface. Much like **p**.
f2 (str):
second field to search within in the advanced search
interface. Much like **f**.
m2 (str):
second matching type in the advanced search interface.
("a" all of the words, "o" any of the words, "e" exact phrase,
"p" partial phrase, "r" regular expression).
op2 (str):
second operator, to join the second and the third unit
in the advanced search interface. ("a" add, "o" or, "n" not).
p3 (str):
third pattern to search for in the advanced search
interface. Much like **p**.
f3 (str):
third field to search within in the advanced search
interface. Much like **f**.
m3 (str):
third matching type in the advanced search interface.
("a" all of the words, "o" any of the words, "e" exact phrase,
"p" partial phrase, "r" regular expression).
sc (int):
split by collection ("0" no, "1" yes).
Governs whether we want to present the results in a single
huge list, or split by collection.
jrec (int):
jump to record (*e.g.* "234").
Used for navigation inside the search results.
recid (int):
display record ID (*e.g.* "20000").
Do not search/browse but go straight away to the Detailed
record page for the given recID.
recidb (int):
display record ID bis (*e.g.* "20010").
If greater than "recid", then display records from recid to
recidb. Useful for example for dumping records from the
database for reformatting.
sysno (str):
display old system SYS number (*e.g.* "").
If you migrate to Invenio from another system, and store your
old SYS call numbers, you can use them instead of recid if you
wish so.
id (int):
the same as recid, in case recid is not set.
For backwards compatibility.
idb (int):
the same as recidb, in case recidb is not set.
For backwards compatibility.
sysnb (str):
the same as sysno, in case sysno is not set.
For backwards compatibility.
action (str):
action to do. "SEARCH" for searching,
"Browse" for browsing. Default is to search.
d1 (str):
first datetime in full YYYY-mm-dd HH:MM:SS format
(*e.g.* "1998-08-23 12:34:56"). Useful for search limits
on creation/modification date (see "dt" argument below).
Note that "d1" takes precedence over d1y, d1m, d1d if these
are defined.
d1y (int):
first date's year (*e.g.* "1998").
Useful for search limits on creation/modification date.
d1m (int):
first date's month (*e.g.* "08").
Useful for search limits on creation/modification date.
d1d (int):
first date's day (*e.g.* "23").
Useful for search limits on creation/modification date.
d2 (str):
second datetime in full YYYY-mm-dd HH:MM:SS format
(*e.g.* "1998-09-02 12:34:56"). Useful for search limits
on creation/modification date (see "dt" argument below).
Note that "d2" takes precedence over d2y, d2m, d2d
if these are defined.
d2y (int):
second date's year (*e.g.* "1998").
Useful for search limits on creation/modification date.
d2m (int):
second date's month (*e.g.* "09").
Useful for search limits on creation/modification date.
d2d (int):
second date's day (*e.g.* "02").
Useful for search limits on creation/modification date.
dt (str):
first and second date's type (*e.g.* "c").
Specifies whether to search in creation dates ("c") or in
modification dates ("m"). When dt is not set and d1*
and d2* are set, the default is "c".
verbose (int):
verbose level (0=min, 9=max).
Useful to print some internal information on the searching
process in case something goes wrong.
ap (int):
alternative patterns (0=no, 1=yes).
In case no exact match is found, the search engine can try
alternative patterns e.g. to replace non-alphanumeric
characters by a boolean query. ap defines if this is wanted.
......@@ -33,7 +33,7 @@ from .recordpubli import RecordPubli
from .recordthesis import RecordThesis
def load_record(host, record_id):
def load_record(host, record_id, shelf=None):
"""Helper function to load a single record from an invenio store.
Args:
......@@ -44,6 +44,23 @@ def load_record(host, record_id):
record_id (int):
the record identifier in the store
shelf (str):
section of the store containing records. It depends on the host.
Possible values are ``None``, ``literature``, ``conferences``
and ``institutions``
+----------------+--------------+-----------------------------+
| host | shelf | base API |
+----------------+--------------+-----------------------------+
| cds.cern.ch | None | https://cds.cern.ch/ |
+----------------+--------------+-----------------------------+
| inspirehep.net | None | https://old.inspirehep.net/ |
| inspirehep.net | literature | https://old.inspirehep.net/ |
| inspirehep.net | conferences | https://inspirehep.net/ |
| inspirehep.net | institutions | https://old.inspirehep.net/ |
+----------------+--------------+-----------------------------+
Returns:
Record:
either RecordPubli, RecordInst, RecordConf of RecordThesis.
......@@ -55,6 +72,6 @@ def load_record(host, record_id):
* no JSON object could be decoded.
"""
store = InvenioStore(host)
store = InvenioStore(host, shelf=shelf)
recjson = store.get_record(record_id)
return build_record(recjson)
""" invenio_tools.factory
"""
import requests
import re
from .base import (is_conference,
is_institute,
......@@ -9,17 +9,19 @@ from .base import (is_conference,
MSG_INV_CONF,
MSG_INV_CONF_KEY,
MSG_NO_CONF,
MSG_NO_CONF_ID_KEY,
REG_CONF,
REG_OAI)
REG_CONF)
from datetime import datetime
from .exception import CdsException
from .inveniostore import InvenioStore
from .inveniostore import CDS, INS, InvenioStore
from .recordconf import RecordConf
from .recordinst import RecordInst
from .recordpubli import RecordPubli
from .recordthesis import RecordThesis
REX_T = "\$\$t([\w, ]+)"
REX_U = "\$\$u([\w, ]+)"
def add_affiliation_keys(recjson):
"""A the affiliation keys to the record describing an institute:
......@@ -38,30 +40,29 @@ def add_affiliation_keys(recjson):
| corporate_note | identifier, futur_identifier, name |
+----------------+------------------------------------+
Args
recjson (dict): record data (MarcJSON)
Args:
recjson (dict):
record data (MarcJSON)
"""
url = "https://inspirehep.net/record/%i" % recjson["recid"]
rep = requests.get(url, params={"ot": "110", "of": "txt"})
store = InvenioStore("inspirehep", shelf="institutions")
# decode the string: '000recid 110__ $$aXXX$$bYYY$$tZZZ\n'
txt = rep.text.replace("\n", "")
li = txt[txt.find("$"):].split("$$")
url = f"https://old.inspirehep.net/record/{recjson['recid']}"