""" harvest_tools.automaton """ import re import traceback from .base import (MSG_FIX_ORIGIN, MSG_IN_DB, search_synonym, ToolException) from .checkandfix import CheckAndFix from gluon.storage import Storage from invenio_tools import (CdsException, InvenioStore, Marc12, OAI_URL) from invenio_tools.factory import build_record from .msg import Msg from .msgcollection import MsgCollection from plugin_dbui import CALLBACK_ERRORS, get_id MSG_NO_CAT = 'Select a "category" !!!' MSG_NO_PROJECT = 'Select a "project" !!!' MSG_NO_TEAM = 'Select a "team" !!!' MSG_NSERT_FAIL = "Fail to insert the new record in the database." MSG_NO_OAI = "Reject no OAI identifier" MSG_WELL_FORM_OAI = "Reject OAI is not well formed" # search collection when using inspirehep # require for "Hal Hidden" REG_COLLECTION = re.compile(r"cc([A-Za-z ]+)(and|$)") class Automaton(object): """Base class to search and process publications: * Decode the selector defining user criteria. * Search in the store publications matching user criteria. * Instantiate the record and check it. * Insert new records in the database. Note: The parameters of the search are defined by the current ``request``. The logic implements in the ``Automaton`` class is the following: #. Ask to the store, all the `record_id` satisfying the user request. #. Reject `record_id` contains in the *origin* field of a database entry. #. Request to the store, the JSON description of the publications and decode them. #. Reject the record for which the *secondary_oai_url* is contained in the *origin* field of a database entry. Update the *origin* field of the database record. #. Check that the *oai* of the publication is defined and well formed. Recover it, if it is not the case. At this stage the OAI is always defined. #. Reject temporarily publication. #. Check that *authors* are defined. Reject the publication if it is not the case. #. Check that *my institute* is in the list of the institutes signing the publication. Reject the publication if it is not the case. When the affiliation are not defined, try to recover this case, by finding the author of my institute signing the publication. This recovery procedure uses the *author rescue list*. Reject the record when the recovery procedure failed. #. Check that the *collaboration*, if defined, is well formed. Reject the publication if it is not the case #. Several check are applied depending on the publication type. #. At the end of this process, the publisher, the authors are formatted and the list of signatories of my institute extracted. Args: db (gluon.DAL): the database connection. id_team (int): the identifier of the team in the database. id_project (int): the identifier of the project in the database. automaton (unicode): the name of the automaton which will be used to process the data. Possible values are: ``articles``, ``notes``, ``preprints``, ``proceedings``, ``reports``, ``talks`` and ``theses``. id_category (int): the identifier of the category of publication year_start (int): starting year for the scan year_end (int): ending year of the scan dry_run (bool): new records are not inserted in the database when ``True``. debug (bool): activate the verbose mode when ``True``. Raises: ToolException: * team or project or the publication category not defined """ def __init__(self, db, id_team, id_project, automaton, id_category, year_start=None, year_end=None, dry_run=True, debug=False): # protection team, project and/or category have to be defined if not id_team: raise ToolException(MSG_NO_TEAM) if not id_project: raise ToolException(MSG_NO_PROJECT) if not id_category: raise ToolException(MSG_NO_CAT) self.check = CheckAndFix() self.collection_logs = [] self.controller = automaton self.db = db self.dbg = debug self.dry_run = dry_run self.id_category = id_category self.id_team = id_team self.id_project = id_project self.logs = [] self.store = None self.year_start = year_start self.year_end = year_end # Construct harvester Storage needed for the log self.harvester = Storage(id_teams=id_team, id_projects=id_project, controller=automaton, id_categories=id_category) # Identifier of the categories preprint and articles # Used by the method _is_record_in_db self._id_preprint = get_id(db.categories, code="PRE") self._id_article = get_id(db.categories, code="ACL") def _insert_in_db(self, log_year="", **fields): """Insert the record in the database, handling database exception. Args: log_year (str): year of the record for the log Keyword Args: **fields: keyword arguments defining the record values to be inserted in the database. Returns: int: one when the record is inserted / updated in the database, zero otherwise. """ db = self.db try: rec_id = db.publications.insert(**fields) if rec_id: return 1 # operation can be reject by callback table._before_insert else: msg = MSG_NSERT_FAIL if CALLBACK_ERRORS in db.publications: msg = db.publications._callback_errors # reduce the error message if isinstance(msg, list): msg = "%s %s" % (msg[0], msg[-1]) self.logs[-1].reject(msg, log_year) return 0 # operation can be rejected by the database except Exception as dbe: self.logs[-1].reject(str(dbe), log_year) return 0 def _is_record_in_db(self, collection_title, host=None, rec_id=None, oai_url=None): """Return the database identifier when the publication is registered. The search is based on the ``origin`` field and on the primary OAI. Note: A new log entry is created when a record is found. Args: title (str): the title of the publication. Keyword Args: host (unicode): the store. possible values are ``cds.cern.ch`` or ``inspirehep.net``. To be used with *rec_id*. rec_id (int): the record identifier in the store oai_url (unicode): the URL of the record in the store. Either use *host* and *rec_id* or *oai_url* Returns: int: the id of the record in the database when a record is found, 0 otherwise. Raises: ValueError: * keyword arguments are not defined properly. """ db = self.db harvester = self.harvester # build the OAI URL if host is not None and rec_id is not None and oai_url is None: url = OAI_URL % (host, rec_id) elif host is None and rec_id is None and oai_url is not None: url = oai_url else: raise ValueError # protection empty URL if len(url) == 0: return 0 # check the OAI query = db.publications.origin.contains(url) setrows = db(query) if setrows.count() == 0: return 0 # one record found columns = [db.publications.id, db.publications.id_categories, db.publications.title, db.publications.year] publication = setrows.select(*columns).first() # Note: # The category for the publication and the harvester have to be equal. # However, keep the record if it is a preprint when the harvester # looks for articles. This is required to transform a preprint # into article # # Category can disagree when the publication is an article and # the harvester look for preprint. In that case, keep the article # if publication.id_categories != harvester.id_categories: is_preprint_to_article = \ publication.id_categories == self._id_preprint \ and harvester.id_categories == self._id_article if is_preprint_to_article: return 0 # log self.logs.append(Msg(harvester=harvester, collection=collection_title, record_id=rec_id, title=publication.title)) self.logs[-1].idle(MSG_IN_DB, publication.year) if self.dbg: print("\trecord already in db:", rec_id, "->", publication.id) return publication.id def _search_parameters(self, collection): """Build the keywords to steer the URL search in invenio store. The main parameter is the collection and the date range defined in the selector. Args: collection (unicode): string defining the collection in the store. The syntax depends on the invenio store: * ``"find cn d0 and tc p and not tc c"`` * ``"LHCb Papers"``. Returns: dict: the key are a sub-set of those defined in :meth:`invenio_tools.InvenioStore.get_ids`. """ year_start = self.year_start year_end = self.year_end # INSPIREHEP store if collection.startswith("find"): query = collection if year_start and not year_end: query += " and date %s" % year_start elif not year_start and year_end: query += " and date %s" % year_end elif year_start and year_end: query += " and date > %s and date < %s " \ % (year_start - 1, year_end + 1) dic = dict(p=query, # query à la spires rg=1000, # maximum number of records returned sf="year", # sort by date so="d") # descending order # handle the cc keyword (true inspirehep collection) match = REG_COLLECTION.search(query) if match: dic["cc"] = match.group(1).strip() dic["p"] = REG_COLLECTION.sub("", query).strip() dic["p"] = dic["p"].replace(" ", " ") if dic["p"] == "find": del dic["p"] # CERN INVENIO store else: if year_start and not year_end: rex = year_start elif not year_start and year_end: rex = year_end elif year_start and year_end: li = [str(el) for el in xrange(year_start, year_end + 1)] rex = "|".join(li) dic = dict(cc=collection, # collection f1="year", # search on year m1="r", # use regular expression p1=rex, # regular expression defining year sf="year", # sort by date so="d") # descending order return dic def check_record(self, record): """Check the content of the record in order to fix non-conformities. Return ``False`` when non-conformities are found and can not be corrected. Note: Some checks depend on the type of publications and have to be implemented in inherited class. Note: The order of the checks matter. It should be OAI, temporary record, authors, my authors and then a series of checks specific to the publication type. Args: record (Record): JSON record describing the publication. Returns: bool: ``False`` when a non-conformity is found and can not be corrected. """ if self.dbg: print("check record") try: self.check.recover_oai(record, self.harvester.host) if self.check.is_bad_oai_used(record): self.logs[-1].idle(MSG_IN_DB, record.submitted()) return False self.check.temporary_record(record) self.check.authors(record) self.check.my_affiliation(record, self.id_project, self.id_team) self.check.collaboration(record) except Exception as e: self.logs[-1].reject(e, record=record) return False return True def get_record_by_fields(self, oai_url, year, **kwargs): """Get database record matching fields values defined in the keyword arguments. Note: This method is required to deal with publication entered by hand and found later by an harvester. Args: oai_url (unicode): the oai_url, *e.g.* ``http://cds.cern.ch/record/123456``. The origin field of the existing database record is update to **oai_url** when a match is found. year (int): the year of the publication. It is used by the search algorithm and by the logger. Keyword Args: kwargs (unicode): a series of key, value pair where the key is the name of a publications database field. Returns: tuple: ``(id, status)`` which contains the ``id`` of the record. The ``id`` is equal to ``None`` when there is no matching. The ``status`` is equal to one when the existing record was modified zero otherwise. """ if self.dbg: print("get existing record by fields") # alias db = self.db logs = self.logs # add the publication year to search criteria if year: kwargs["year"] = year # look for an existing record rec_id = get_id(db.publications, **kwargs) if not rec_id: return (None, 0) # fix origin field publication = db.publications[rec_id] ok = publication.origin and publication.origin == oai_url if not ok: if not self.dry_run: publication = dict(origin=oai_url) logs[-1].modify(MSG_FIX_ORIGIN, year) return (rec_id, 1) logs[-1].idle(MSG_IN_DB, year) return (rec_id, 0) def insert_record(self, record): """Insert the record in the database. Note: This method depend on the type of publications. It has to be implemented for each inherited class. Args: record (Record): record describing the publication. Returns: int: one when the record is inserted / updated in the database, zero otherwise. """ return 0 def process_collection(self, collection): """"Retrieve JSON objects from the invenio store and for the given collection. Corresponding records are inserted in the database. Args: collection (unicode): name of the collection to be interrogated. Raises: CdsException: * keyword argument is invalid; * the server return an HTTP error; * JSON object can't be decoded * not well formed list of ids. """ if self.dbg: print("\nprocess collection", collection) # alias collection_logs = self.collection_logs controller = self.controller host = self.harvester.host project = self.db.projects[self.id_project].project store = self.store # log collection information # A collection is identified as "Project Controller collection" ctitle = "%s / %s / %s" % (project, controller, collection) collection_logs.append(MsgCollection(title=ctitle)) # get search parameters for the collection including user criteria kwargs = self._search_parameters(collection) # get the list of record identifier matching the search criteria try: rec_ids = store.get_ids(**kwargs) except CdsException as error: collection_logs[-1].url = store.last_search_url() collection_logs[-1].error = error return # log the number of record found for the collection collection_logs[-1].url = store.last_search_url() collection_logs[-1].found = len(rec_ids) if len(rec_ids) == 0: if self.dbg: print("\tNo records found in %s" % collection) return if self.dbg: print("\t%i records found in %s" % (len(rec_ids), collection)) # remove form the list identifier already registered in the data base # and log them func = self._is_record_in_db rec_ids = [el for el in rec_ids if func(ctitle, host, el) == 0] # process the remaining identifiers [self.process_recid(rec_id) for rec_id in rec_ids] def process_recid(self, rec_id): """Process the publication: * get the publication data from the store using its identifier * instantiate the record (RecordPubli, REcordConf, RecordThesis) * process OAI data * check the record * insert new record in the database Args: rec_id (int): identifier of the publication in the store. Raise: CdsException: * the server return an HTTP error. * no JSON object could be decoded. """ if self.dbg: print("\nprocessing record", rec_id) collection_logs = self.collection_logs harvester = self.harvester logs = self.logs # instantiate the record try: recjson = self.store.get_record(rec_id) record = build_record(recjson) if self.dbg: print("\t", record.title()) except Exception as e: print(traceback.format_exc()) url = OAI_URL % (harvester.host, rec_id) logs.append(Msg(harvester=harvester, collection=collection_logs[-1].title, record_id=rec_id, title=url)) logs[-1].reject(e) # start the log for the record logs.append(Msg(harvester=harvester, collection=collection_logs[-1].title, record_id=record.id(), title=record.title())) # check that the record is well formed # repair non-conformity as far as possible if not self.check_record(record): if self.dbg: print("\trecord rejected", logs[-1].txt) return if self.dbg: print("\tinsert record in the database") # insert the record in the database self.insert_record(record) if self.dbg: log = logs[-1] action = log.action action = (action.upper() if isinstance(action, str) else action) print("\tlog:", action, log.txt) def process_url(self, host, collections): """Retrieve JSON objects from the invenio store and insert corresponding records in the database. Args: host (unicode): host name to query for publications, either ``cds.cern.ch`` or ``inspirehep.net``. collections (unicode): list of collection to be interrogated. Raises: StoreException: when something goes wrong interrogating the store. CheckException: when the record has non-conformities. Exception: when the python code crashes. """ if self.dbg: print("process URL search") # extend harvester for logs self.harvester.host = host self.harvester.collections = collections # instantiate the store self.store = InvenioStore(host) # list of collections collections = re.sub(" *, *", ",", collections).split(",") # process [self.process_collection(collection) for collection in collections] def report(self): """Build the processing report. Returns: dict: * ``collection_logs`` list of :class:`MsgCollection` * ``controller`` unicode * ``logs`` list of :class:Msg * ``selector`` :class:`plugin_dbui.Selector` """ return dict(collection_logs=self.collection_logs, controller=self.controller, logs=self.logs) def search_collaboration(self, value): """Get the database collaboration identifier using synonyms. Args: value (unicode): the name of the collaboration. Returns: int: * the id of the collaboration record. * UNDEF_ID if value is not defined. Raises: ToolException: when more than one synonym is found or when the collaboration is not defined. """ return search_synonym(self.db.collaborations, "collaboration", value) def search_country(self, value): """Get the database country identifier using synonyms. Args: value (unicode): the name of the country. Returns: int: * the id of the country record. * UNDEF_ID if value is not defined. Raises: ToolException: when more than one synonym is found ot when the country is not defined. """ return search_synonym(self.db.countries, "country", value) def search_publisher(self, value): """Get the database publisher identifier using synonyms. Args: value (unicode): the abbreviation of the publisher. Returns: int: * the id of the publisher record. * UNDEF_ID if value is not defined. Raises: ToolException: when more than one synonym is found or when the publisher is not defined. """ return search_synonym(self.db.publishers, "abbreviation", value)