""" harvest_tools.automaton """ import logging import re import traceback from .base import (MSG_FIX_ORIGIN, MSG_IN_DB, ToolException) from .checkandfix import CheckAndFix from gluon.storage import Storage from invenio_tools import (CdsException, InvenioStore, OAI_URL) from invenio_tools.factory import build_record from .msg import Msg from .msgcollection import MsgCollection from plugin_dbui import CALLBACK_ERRORS, get_id MSG_NO_CAT = 'Select a "category" !!!' MSG_NO_PROJECT = 'Select a "project" !!!' MSG_NO_TEAM = 'Select a "team" !!!' MSG_INSERT_FAIL = "Fail to insert the new record in the database." OAI = "oai:%s:%i" # search collection when using inspirehep # require for "Hal Hidden" REG_COLLECTION = re.compile(r"cc([A-Za-z ]+)(and|$)") T2 = " "*2 T4 = " "*4 T6 = " "*6 class Automaton(object): """Base class to search and process publications: * Decode the selector defining user criteria. * Search in the store publications matching user criteria. * Instantiate the record and check it. * Insert new records in the database. Note: The parameters of the search are defined by the current ``request``. The logic implements in the ``Automaton`` class is the following: #. Ask to the store, all the `record_id` satisfying the user request. #. Reject `record_id` contains in the *origin* field of a database entry. #. Request to the store, the JSON description of the publications and decode them. #. Reject the record for which the *secondary_oai_url* is contained in the *origin* field of a database entry. Update the *origin* field of the database record. #. Check that the *oai* of the publication is defined and well formed. Recover it, if it is not the case. At this stage the OAI is always defined. #. Reject temporarily publication. #. Check that *authors* are defined. Reject the publication if it is not the case. #. Check that *my institute* is in the list of the institutes signing the publication. Reject the publication if it is not the case. When the affiliation are not defined, try to recover this case, by finding the author of my institute signing the publication. This recovery procedure uses the *author rescue list*. Reject the record when the recovery procedure failed. #. Check that the *collaboration*, if defined, is well formed. Reject the publication if it is not the case #. Several check are applied depending on the publication type. #. At the end of this process, the publisher, the authors are formatted and the list of signatories of my institute extracted. Args: db (gluon.DAL): the database connection. id_team (int): the identifier of the team in the database. id_project (int): the identifier of the project in the database. automaton (str): the name of the automaton which will be used to process the data. Possible values are: ``articles``, ``notes``, ``preprints``, ``proceedings``, ``reports``, ``talks`` and ``theses``. id_category (int): the identifier of the category of publication year_start (int): starting year for the scan year_end (int): ending year of the scan dry_run (bool): new records are not inserted in the database when ``True``. 
    """

    def __init__(self,
                 db,
                 id_team,
                 id_project,
                 automaton,
                 id_category,
                 year_start=None,
                 year_end=None,
                 dry_run=True):

        # protection: team, project and category have to be defined
        if not id_team:
            raise ToolException(MSG_NO_TEAM)

        if not id_project:
            raise ToolException(MSG_NO_PROJECT)

        if not id_category:
            raise ToolException(MSG_NO_CAT)

        self.check = CheckAndFix()
        self.collection_logs = []
        self.controller = automaton
        self.db = db
        self.dry_run = dry_run
        self.id_category = id_category
        self.id_team = id_team
        self.id_project = id_project
        self.logs = []
        self.logger = logging.getLogger("web2py.app.limbra")
        self.store = None
        self.year_start = year_start
        self.year_end = year_end

        # construct the harvester Storage needed for the log
        self.harvester = Storage(id_teams=id_team,
                                 id_projects=id_project,
                                 controller=automaton,
                                 id_categories=id_category)

        # identifiers of the categories preprint and article
        # used by the method _is_record_in_db
        self._id_preprint = get_id(db.categories, code="PRE")
        self._id_article = get_id(db.categories, code="ACL")

    def _insert_in_db(self, log_year="", **fields):
        """Insert the record in the database, handling database exceptions.

        Args:
            log_year (str): year of the record for the log.

        Keyword Args:
            **fields: keyword arguments defining the record values to be
                inserted in the database.

        Returns:
            int: one when the record is inserted / updated in the
            database, zero otherwise.

        """
        db = self.db

        try:
            rec_id = db.publications.insert(**fields)
            if rec_id:
                return 1

            # the operation can be rejected by the callback
            # table._before_insert
            else:
                msg = MSG_INSERT_FAIL
                if CALLBACK_ERRORS in db.publications:
                    msg = db.publications._callback_errors

                # reduce the error message
                if isinstance(msg, list):
                    msg = "%s %s" % (msg[0], msg[-1])

                self.logs[-1].reject(msg, log_year)
                return 0

        # the operation can be rejected by the database
        except Exception as dbe:
            self.logs[-1].reject(str(dbe), log_year)
            return 0

    def _is_record_in_db(self,
                         collection_title,
                         host=None,
                         rec_id=None,
                         oai_url=None):
        """Return the database identifier when the publication is
        registered. The search is based on the ``origin`` field and on
        the primary OAI.

        Note:
            A new log entry is created when a record is found.

        Args:
            collection_title (str): the title of the collection of
                publications.

        Keyword Args:
            host (str): the store. Possible values are ``cds.cern.ch``
                or ``inspirehep.net``. To be used with *rec_id*.
            rec_id (int): the record identifier in the store.
            oai_url (str): the URL of the record in the store.
                Either use *host* and *rec_id* or *oai_url*.

        Returns:
            int: the id of the record in the database when a record is
            found, 0 otherwise.

        Raises:
            ValueError:
                * keyword arguments are not defined properly.
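        Example:
            The two calling conventions, as a sketch with hypothetical
            identifiers::

                # from a store host and a record identifier
                db_id = self._is_record_in_db(title,
                                              host="cds.cern.ch",
                                              rec_id=123456)

                # from the full OAI URL of the record
                db_id = self._is_record_in_db(
                    title, oai_url="http://cds.cern.ch/record/123456")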
        """
        db = self.db
        harvester = self.harvester

        # build the OAI URL
        if host is not None and rec_id is not None and oai_url is None:
            url = OAI_URL % (host, rec_id)

        elif host is None and rec_id is None and oai_url is not None:
            url = oai_url

        else:
            raise ValueError

        # protection against an empty URL
        if len(url) == 0:
            return 0

        # check the OAI
        query = db.publications.origin.contains(url)
        setrows = db(query)

        if setrows.count() == 0:
            return 0

        # one record found
        columns = [db.publications.id,
                   db.publications.id_categories,
                   db.publications.title,
                   db.publications.year]

        publication = setrows.select(*columns).first()

        # Note:
        # The categories of the publication and of the harvester have to
        # be equal. However, keep the record if it is a preprint when the
        # harvester looks for articles. This is required to transform a
        # preprint into an article.
        #
        # Categories can also disagree when the publication is an article
        # and the harvester looks for preprints. In that case, keep the
        # article.
        #
        if publication.id_categories != harvester.id_categories:

            is_preprint_to_article = \
                publication.id_categories == self._id_preprint \
                and harvester.id_categories == self._id_article

            if is_preprint_to_article:
                return 0

        # log
        self.logs.append(Msg(harvester=harvester,
                             collection=collection_title,
                             record_id=rec_id,
                             title=publication.title))

        self.logs[-1].idle(MSG_IN_DB, publication.year)

        logger = self.logger
        logger.debug("")
        logger.debug(f"{T2}record {rec_id} in db with id {publication.id}")

        return publication.id

    def _search_parameters(self, collection):
        """Build the keywords steering the URL search in the invenio
        store. The main parameters are the collection and the date range
        defined in the selector.

        Args:
            collection (str): string defining the collection in the store.
                The syntax depends on the invenio store:

                    * ``"find cn d0 and tc p and not tc c"``
                    * ``"LHCb Papers"``

        Returns:
            dict: the keys are a sub-set of those defined in
            :meth:`invenio_tools.InvenioStore.get_ids`.

        """
        year_start = self.year_start
        year_end = self.year_end

        # INSPIREHEP store
        if collection.startswith("find"):

            query = collection
            if year_start and not year_end:
                query += " and date %s" % year_start

            elif not year_start and year_end:
                query += " and date %s" % year_end

            elif year_start and year_end:
                query += " and date > %s and date < %s " \
                         % (year_start - 1, year_end + 1)

            dic = dict(p=query,    # query à la spires
                       rg=1000,    # maximum number of records returned
                       sf="year",  # sort by date
                       so="d")     # descending order

            # handle the cc keyword (true inspirehep collection)
            match = REG_COLLECTION.search(query)
            if match:
                dic["cc"] = match.group(1).strip()
                dic["p"] = REG_COLLECTION.sub("", query).strip()
                dic["p"] = dic["p"].replace("  ", " ")
                if dic["p"] == "find":
                    del dic["p"]

        # CERN INVENIO store
        else:
            # default: an empty pattern matches any year when
            # neither year_start nor year_end is defined
            rex = ""
            if year_start and not year_end:
                rex = str(year_start)

            elif not year_start and year_end:
                rex = str(year_end)

            elif year_start and year_end:
                li = [str(el) for el in range(year_start, year_end + 1)]
                rex = "|".join(li)

            dic = dict(cc=collection,  # collection
                       f1="year",      # search on year
                       m1="r",         # use regular expression
                       p1=rex,         # regular expression defining year
                       sf="year",      # sort by date
                       so="d")         # descending order

        return dic

    def check_record(self, record):
        """Check the content of the record in order to fix
        non-conformities. Return ``False`` when non-conformities are
        found and can not be corrected.

        Note:
            Some checks depend on the type of publication and have to be
            implemented in inherited classes.

        Note:
            The order of the checks matters. It should be OAI, temporary
            record, authors, my authors and then a series of checks
            specific to the publication type.

        Args:
            record (Record): JSON record describing the publication.

        Returns:
            bool: ``False`` when a non-conformity is found and can not
            be corrected.
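        Example:
            A sketch of the intended call sequence, as used by
            :meth:`process_recjson`::

                record = build_record(recjson)
                if self.check_record(record):
                    self.insert_record(record)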
""" self.logger.debug(f"{T4}check record (automaton)") try: # fix record with a missing OAI if not self.check.is_oai(record): oai = OAI % (self.harvester.host, record.id()) record["oai"] = {"value": oai} if self.check.is_bad_oai_used(record): self.logs[-1].idle(MSG_IN_DB, record.submitted()) return False self.check.temporary_record(record) self.check.authors(record) self.check.my_affiliation(record, self.id_project, self.id_team) self.check.collaboration(record) except Exception as e: self.logs[-1].reject(e, record=record) return False return True def get_record_by_fields(self, oai_url, year, **kwargs): """Get database record matching fields values defined in the keyword arguments. Note: This method is required to deal with publication entered by hand and found later by an harvester. Args: oai_url (str): the oai_url, *e.g.* ``http://cds.cern.ch/record/123456``. The origin field of the existing database record is update to **oai_url** when a match is found. year (int): the year of the publication. It is used by the search algorithm and by the logger. Keyword Args: kwargs (str): a series of key, value pair where the key is the name of a publications database field. Returns: tuple: ``(id, status)`` which contains the ``id`` of the record. The ``id`` is equal to ``None`` when there is no matching. The ``status`` is equal to one when the existing record was modified zero otherwise. """ self.logger.debug(f"{T6}get existing record by fields") # alias db = self.db logs = self.logs # add the publication year to search criteria if year: kwargs["year"] = year # look for an existing record rec_id = get_id(db.publications, **kwargs) if not rec_id: return (None, 0) # fix origin field publication = db.publications[rec_id] ok = publication.origin and publication.origin == oai_url if not ok: if not self.dry_run: publication = dict(origin=oai_url) logs[-1].modify(MSG_FIX_ORIGIN, year) return (rec_id, 1) logs[-1].idle(MSG_IN_DB, year) return (rec_id, 0) def insert_record(self, record): """Insert the record in the database. Note: This method depend on the type of publications. It has to be implemented for each inherited class. Args: record (Record): record describing the publication. Returns: int: one when the record is inserted / updated in the database, zero otherwise. """ return 0 def process_collection(self, collection): """Retrieve JSON objects from the invenio store and for the given collection. Corresponding records are inserted in the database. Args: collection (str): name of the collection to be interrogated. Note: * Design to never stop although exceptions are raised * Have a look to the attributes ``collection_logs`` and ``logs`` in order to understand what happen. 
""" logger = self.logger logger.debug(f"process collection {collection}") # alias collection_logs = self.collection_logs controller = self.controller host = self.harvester.host project = self.db.projects[self.id_project].project store = self.store # log collection information # A collection is identified as "Project Controller collection" ctitle = "%s / %s / %s" % (project, controller, collection) collection_logs.append(MsgCollection(title=ctitle)) # get search parameters for the collection including user criteria kwargs = self._search_parameters(collection) # get the list of record identifier matching the search criteria try: rec_ids = store.get_ids(**kwargs) except CdsException as error: logger.debug(f"exit process_collection: {error}") collection_logs[-1].url = store.last_search_url() collection_logs[-1].error = error return # log the number of record found for the collection collection_logs[-1].url = store.last_search_url() collection_logs[-1].found = len(rec_ids) if len(rec_ids) == 0: logger.debug(f"no records found in {collection}") return logger.debug(f"{len(rec_ids)} records found in {collection}") # remove form the list identifier already registered in the data base # and log them func = self._is_record_in_db rec_ids = [el for el in rec_ids if func(ctitle, host, el) == 0] # process the remaining identifiers (*map(self.process_recid, rec_ids), ) def process_recjson(self, recjson): """Process the publication provided as a JSON record: * instantiate the record (RecordPubli, REcordConf, RecordThesis) * check the record * insert new record in the database Args: recjson (dict): record provided by the store. """ logger = self.logger logger.debug(f"{T4}process record {recjson['recid']} (process_recjson)") collection_logs = self.collection_logs harvester = self.harvester logs = self.logs # instantiate the record record = build_record(recjson) logger.debug(f"{T4}{record.title()[:72]}") # start the log for the record logs.append(Msg(harvester=harvester, collection=collection_logs[-1].title, record_id=record.id(), title=record.title())) # check that the record is well formed # repair non-conformity as far as possible if not self.check_record(record): logger.debug(f"{T4}{logs[-1].txt}") return txt = ("(dry run)" if self.dry_run else "") logger.debug(f"{T4}insert record in the database {txt}") # insert the record in the database self.insert_record(record) if logger.getEffectiveLevel() == logging.DEBUG: log = logs[-1] action = log.action action = (action.upper() if isinstance(action, str) else action) logger.debug(f"{T4}log: {action} {log.txt}") def process_recid(self, rec_id): """Process the publication identified by its record identifier: * get the publication data from the store using its identifier * instantiate the record: ``RecordPubli``, ``RecordConf`` or ``RecordThesis`` * process OAI data * check the record * insert new record in the database Note: * Design to never stop although exception are raised * Have a look to the attribute ``collection_logs`` and ``logs`` in order to understand what happen. Args: rec_id (int): identifier of the publication in the store. 
""" logger = self.logger logger.debug("") logger.debug(f"{T2}get record {rec_id} (process_recid)") collection_logs = self.collection_logs harvester = self.harvester logs = self.logs try: recjson = self.store.get_record(rec_id) self.process_recjson(recjson) except Exception as e: logger.debug(f"{T2}{str(e)}") url = OAI_URL % (harvester.host, rec_id) logs.append(Msg(harvester=harvester, collection=collection_logs[-1].title, record_id=rec_id, title=url)) logs[-1].reject(e) return def process_url(self, host, collections): """Retrieve JSON objects from the invenio store and insert corresponding records in the database. Note: * Design to never stop although exceptions are raised * Have a look to the attributes ``collection_logs`` and ``logs`` in order to understand what happen. Args: host (str): host name to query for publications, either ``cds.cern.ch`` or ``inspirehep.net``. collections (str): list of collection to be interrogated. Collections are separated by a comma. """ self.logger.debug("") self.logger.debug(f"process URL search -- {host} -- {collections}") # extend harvester for logs self.harvester.host = host self.harvester.collections = collections # instantiate the store self.store = InvenioStore(host) # list of collections collections = re.sub(" *, *", ",", collections).split(",") # process (*map(self.process_collection, collections), ) def report(self): """Build the processing report. Returns: dict: * ``collection_logs`` list of :class:`MsgCollection` * ``controller`` str * ``logs`` list of :class:`Msg` * ``selector`` :class:`plugin_dbui.Selector` """ return dict(collection_logs=self.collection_logs, controller=self.controller, logs=self.logs)