Commit b47b5b7e authored by LE GAC Renaud
Browse files

Restore Automaton, move Automaton._search_parameters to store

parent 14512643
......@@ -3,7 +3,6 @@
"""
import logging
import re
import traceback
from .base import (MSG_FIX_ORIGIN,
MSG_IN_DB,
......@@ -14,7 +13,7 @@ from .msg import Msg
from .msgcollection import MsgCollection
from plugin_dbui import CALLBACK_ERRORS, get_id
from store_tools import (CdsException,
InvenioStore,
build_store,
OAI_URL)
from store_tools.factory import build_record
......@@ -26,10 +25,6 @@ MSG_INSERT_FAIL = "Fail to insert the new record in the database."
OAI = "oai:%s:%i"
# search collection when using inspirehep
# require for "Hal Hidden"
REG_COLLECTION = re.compile(r"cc([A-Za-z ]+)(and|$)")
T2 = " "*2
T4 = " "*4
T6 = " "*6
......@@ -153,6 +148,9 @@ class Automaton(object):
self._id_preprint = get_id(db.categories, code="PRE")
self._id_article = get_id(db.categories, code="ACL")
# Keep track of the shelf for inspirehep.net
self.shelf = None
def _insert_in_db(self, log_year="", **fields):
"""Insert the record in the database, handling database exception.
......@@ -291,79 +289,6 @@ class Automaton(object):
return publication.id
def _search_parameters(self, collection):
    """Build the keywords to steer the URL search in invenio store.

    The main parameter is the collection and the date range defined
    in the selector (``self.year_start`` and ``self.year_end``).

    Args:
        collection (str):
            string defining the collection in the store.
            The syntax depends on the invenio store:

                * ``"find cn d0 and tc p and not tc c"``
                * ``"LHCb Papers"``.

    Returns:
        dict:
            the key are a sub-set of those defined in
            :meth:`store_tools.InvenioStore.get_ids`.

    Raises:
        ValueError:
            when both years are defined and one of them can not be
            converted to an integer.

    """
    year_start = self.year_start
    year_end = self.year_end

    # INSPIREHEP store
    if collection.startswith("find"):
        query = collection

        if year_start and year_end:
            # the years are converted since range() rejects strings
            first, last = int(year_start), int(year_end)
            sdates = " or ".join(f"date {el}" for el in range(first, last + 1))
            query += f" and ({sdates})"

        elif year_start or year_end:
            # only one bound is defined, search for that single year
            query += f" and date {year_start or year_end}"

        dic = dict(p=query,  # query à la spires
                   rg=1000,  # maximum number of records returned
                   sf="year",  # sort by date
                   so="d")  # descending order

        # handle the cc keyword (true inspirehep collection)
        match = REG_COLLECTION.search(query)
        if match:
            dic["cc"] = match.group(1).strip()

            # removing the cc clause can leave consecutive blanks behind,
            # collapse them before comparing with the bare "find" keyword
            remainder = REG_COLLECTION.sub("", query)
            dic["p"] = " ".join(remainder.split())

            if dic["p"] == "find":
                del dic["p"]

        return dic

    # CERN INVENIO store
    if year_start and year_end:
        first, last = int(year_start), int(year_end)
        rex = "|".join(str(el) for el in range(first, last + 1))
    else:
        # empty expression when the selector does not define any year,
        # otherwise the name would be unbound when building the dictionary
        rex = str(year_start or year_end or "")

    return dict(cc=collection,  # collection
                f1="year",  # search on year
                m1="r",  # use regular expression
                p1=rex,  # regular expression defining year
                sf="year",  # sort by date
                so="d")  # descending order
def check_record(self, record):
"""Check the content of the record in order to fix non-conformities.
Return ``False`` when non-conformities are found and can not be
......@@ -519,7 +444,11 @@ class Automaton(object):
collection_logs.append(MsgCollection(title=ctitle))
# get search parameters for the collection including user criteria
kwargs = self._search_parameters(collection)
kwargs = store.search_parameters(collection,
year_start=self.year_start,
year_end=self.year_end)
logger.debug(f"search parameters {kwargs}")
# get the list of record identifier matching the search criteria
try:
......@@ -563,15 +492,14 @@ class Automaton(object):
"""
logger = self.logger
logger.info(f"{T4}process record {recjson['recid']} (process_recjson)")
logger.info(f"{T4}process record (process_recjson)")
collection_logs = self.collection_logs
harvester = self.harvester
logs = self.logs
# instantiate the record
record = build_record(recjson)
record = build_record(recjson, shelf=self.shelf)
logger.debug(f"{T4}{record.title()[:72]}")
# start the log for the record
......@@ -669,8 +597,8 @@ class Automaton(object):
self.harvester.collections = collections
# instantiate the store
shelf = ("literature" if host == "inspirehep.net" else None)
self.store = InvenioStore(host, shelf=shelf)
self.shelf = ("literature" if host == "inspirehep.net" else None)
self.store = build_store(host, shelf=self.shelf)
# list of collections
collections = re.sub(" *, *", ",", collections).split(",")
......
......@@ -25,6 +25,7 @@ from .exception import (CdsException,
RecordException)
from .factory import build_record, build_store
from .inspirehepstore import InspirehepStore
from .inveniostore import InvenioStore
from .record import Record
from .recordconf import RecordConf
......
......@@ -239,30 +239,28 @@ def build_store(host=None, shelf=None):
shelf (str):
section of the store containing records. It depends on the host.
Possible values are ``None``, ``literature``, ``conferences``
and ``institutions``
Possible values are ``literature``, ``conferences`` and
``institutions``.
The correlation between host and shelf is in the table:
+----------------+--------------+-----------------------------+
| host | shelf | base API |
+----------------+--------------+-----------------------------+
| cds.cern.ch | None | https://cds.cern.ch/ |
+----------------+--------------+-----------------------------+
| inspirehep.net | None | https://old.inspirehep.net/ |
| inspirehep.net | literature | https://inspirehep.net/ |
| inspirehep.net | conferences | https://inspirehep.net/ |
| inspirehep.net | institutions | https://inspirehep.net/ |
+----------------+--------------+-----------------------------+
Returns:
InvenioStore
InvenioStore or InspirehepStore
"""
if host in CDS:
store = InvenioStore(host="cds.cern.ch")
elif host in INS and shelf is None:
store = InvenioStore(host="old.inspirehep.net")
elif host in INS and shelf in SHELFS:
store = InspirehepStore(host=host, shelf=shelf)
......
......@@ -207,3 +207,37 @@ class InspirehepStore(BaseStore):
raise CdsException(MSG_INVALID_RESPONSE)
return records
def search_parameters(self, collection, year_start=None, year_end=None):
    """Build the (key, value) pairs to steer the search for a collection.

    Args:
        collection (str):
            the collection in the store, *e.g.* ``LHCb Papers``.

                * find cn d0 and tc p and not tc c
                * find cc HAL Hidden a simpson, g and not tc c
                * other syntax accepted by inspirehep.net search engine

        year_start (str or int):
            first year of the date range. Ignored when empty.

        year_end (str or int):
            last year of the date range. Ignored when empty.

    Returns:
        dict:
            * ``q``: the collection query completed by the date criteria
            * ``size``: the number of records per page

    Raises:
        ValueError:
            when both years are defined and one of them can not be
            converted to an integer.

    """
    query = collection

    if year_start and year_end:
        # years are often given as strings, which range() rejects.
        # The bounds are ordered in order to never emit an empty "()".
        first, last = sorted((int(year_start), int(year_end)))
        sdates = " or ".join(f"date {year}" for year in range(first, last + 1))
        query += f" and ({sdates})"

    elif year_start or year_end:
        # only one bound is defined, search for that single year
        query += f" and date {year_start or year_end}"

    # get 100 records per page
    return dict(q=query, size=100)
......@@ -191,3 +191,39 @@ class InvenioStore(BaseStore):
return obj[0]
raise CdsException(MSG_INVALID_RESPONSE)
def search_parameters(self, collection, year_start=None, year_end=None):
    """Build the (key, value) pairs to steer the search for a collection.

    Args:
        collection (str):
            the collection in the store:

                * ``LHCb Papers``
                * other syntax accepted by cds.cern.ch search engine

        year_start (str or int):
            first year of the date range. Ignored when empty.

        year_end (str or int):
            last year of the date range. Ignored when empty.

    Returns:
        dict:
            the keys are ``cc``, ``f1``, ``m1``, ``p1``, ``sf`` and ``so``.
            The value of ``p1`` is always a string, empty when no year
            is defined.

    Raises:
        ValueError:
            when both years are defined and one of them can not be
            converted to an integer.

    """
    if year_start and year_end:
        # years are often given as strings, which range() rejects.
        # The bounds are ordered in order to never emit an empty expression.
        first, last = sorted((int(year_start), int(year_end)))
        rex = "|".join(str(year) for year in range(first, last + 1))

    else:
        # a single year, or nothing at all
        rex = str(year_start or year_end or "")

    return dict(cc=collection,  # collection
                f1="year",  # search on year
                m1="r",  # use regular expression
                p1=rex,  # regular expression defining year
                sf="year",  # sort by date
                so="d")  # descending order
......@@ -75,7 +75,9 @@ class RecordHepPubli(RecordHep, PluginAuthors, PluginPublicationInfo):
(author["inspire_roles"] if "inspire_roles" in author else [])
full_name = author["full_name"]
last_name, first_name = full_name.split(",")
idx = full_name.find(",")
last_name = full_name[:idx]
first_name = full_name[idx + 1:].strip()
dct = {"affiliation": "|".join(affiliations),
"first_name": first_name.strip(),
......
......@@ -75,13 +75,7 @@ def test_get_record_cds_01020():
recjson = store.get_record(1951625)
assert isinstance(recjson, dict)
def test_get_record_ins_old_01021():
    """Retrieve a record through the old inspirehep interface.

    The store is built for ``inspirehep.net`` without shelf, and the
    returned record is expected to be a dictionary without a usable
    ``$schema`` entry.

    """
    old_store = build_store("inspirehep.net", shelf=None)
    record = old_store.get_record(1319638)

    assert isinstance(record, dict)

    schema = record.get("$schema", None)
    assert schema is None
# v1.4.0 remove obsolete test_get_record_ins_old_01021():
def test_get_record_ins_literature_01022():
......
......@@ -60,7 +60,11 @@ def test__is_record_in_db_12001(svc):
assert rec_id_1 == rec_id_2
def test_process_recid_12002(svc):
# ............................................................................
#
# Process a record, collection and URL by using cds.cern.ch
#
def test_process_recid_cds_12010(svc):
"""Test the deepest method to retrieve a record.
"""
......@@ -78,10 +82,10 @@ def test_process_recid_12002(svc):
ctitle = "LHCb / article / %s" % collection
svc.collection_logs.append(MsgCollection(title=ctitle))
# get a list of ids
kwargs = svc._search_parameters(collection)
# get a list of identifier
kwargs = svc.store.search_parameters(collection, year_start="2010")
recids = svc.store.get_ids(**kwargs)
assert len(recids) > 0
assert len(recids) == 2
# try with the oldest one
recid = recids[-1]
......@@ -99,7 +103,7 @@ def test_process_recid_12002(svc):
del svc.store
def test_process_collection_12003(svc):
def test_process_collection_cds_12011(svc):
# mimic the previous stage process_url
collection = "LHCb Papers"
......@@ -119,10 +123,85 @@ def test_process_collection_12003(svc):
del svc.store
def test_process_url_cds_12004(svc):
def test_process_url_cds_12012(svc):
assert svc.process_url("cds.cern.ch", "LHCb Papers") is None
def test_process_url_ins_12005(svc):
    """Run ``process_url`` on inspirehep.net and expect ``None`` back."""
    query = "find cn lhcb and tc p and not tc c"
    outcome = svc.process_url("inspirehep.net", query)
    assert outcome is None
# ............................................................................
#
# Process a record, collection and URL by using inspirehep.net
#
def test_process_recid_ins_12020(svc):
    """Test the deepest method to retrieve a record.

    The stages normally performed by ``process_url`` and
    ``process_collection`` are mimicked by hand for inspirehep.net,
    then a single record identifier is processed.

    The attributes added to ``svc`` are removed in a ``finally`` clause.
    NOTE(review): the scope of the ``svc`` fixture is not visible here;
    if it is shared between tests, a failing assertion must not leave
    the store and the harvester attributes behind.

    """
    collection = "find cn LHCb and tc p and not tc c"
    host = "inspirehep.net"
    shelf = "literature"

    # build the store first: nothing has to be undone if it fails
    store = build_store(host, shelf=shelf)

    # reset
    svc.collection_logs = []
    svc.logs = []

    # mimic high level stage process_collection and process_url
    svc.harvester.host = host
    svc.harvester.collections = collection
    svc.shelf = shelf
    svc.store = store

    try:
        ctitle = "LHCb / article / %s" % collection
        svc.collection_logs.append(MsgCollection(title=ctitle))

        # get a list of identifier
        kwargs = svc.store.search_parameters(collection, year_start="2010")
        recids = svc.store.get_ids(**kwargs)
        assert len(recids) == 3

        # try with the first identifier
        # NOTE(review): presumably the oldest one, the ordering of the
        # identifiers returned by the store has to be confirmed
        recid = recids[0]
        assert svc.process_recid(recid) is None

        print(svc.logs)
        assert len(svc.logs) == 1
        assert svc.logs[-1].action is None
        assert svc.logs[-1].txt is None

    finally:
        # reset, even when one of the assertions fails
        svc.collection_logs = []
        svc.logs = []

        del svc.harvester.host
        del svc.harvester.collections
        del svc.store
def test_process_collection_ins_12021(svc):
    """Process a whole collection harvested from inspirehep.net.

    The stage normally performed by ``process_url`` is mimicked by hand.

    The attributes added to ``svc`` are removed in a ``finally`` clause.
    NOTE(review): the scope of the ``svc`` fixture is not visible here;
    if it is shared between tests, a failing assertion must not leave
    the store and the harvester attributes behind.

    """
    collection = "find cn LHCb and tc p and not tc c"
    host = "inspirehep.net"
    shelf = "literature"

    # build the store first: nothing has to be undone if it fails
    store = build_store(host, shelf=shelf)

    # mimic the previous stage process_url
    svc.harvester.host = host
    svc.harvester.collections = collection
    svc.shelf = shelf
    svc.store = store

    try:
        # do it
        assert svc.process_collection(collection) is None

    finally:
        # reset, even when the assertion fails
        svc.collection_logs = []
        svc.logs = []

        del svc.harvester.host
        del svc.harvester.collections
        del svc.store
def test_process_url_cds_12022(svc):
    """Run ``process_url`` from end to end and expect ``None`` back.

    Despite the ``cds`` in its name, this test targets inspirehep.net.

    """
    outcome = svc.process_url("inspirehep.net",
                              "find cn LHCb and tc p and not tc c")
    assert outcome is None
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment