# -*- coding: utf-8 -*-
""" harvest_tools.automaton

"""
import re
import traceback

from base import format_author_fr, MSG_FIX_ORIGIN, MSG_IN_DB, ToolException
from gluon.storage import Storage
from invenio_tools import (CheckAndFix,
                           InvenioStore,
                           Marc12,
                           OAI_URL,
                           REG_OAI,
                           REG_YEAR)
from msg import Msg
from msgcollection import MsgCollection
from plugin_dbui import get_create_id, get_id, UNDEF_ID


MSG_NO_CAT = 'Select a "category" !!!'
MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM = 'Select a "team" !!!'
MSG_NO_OAI = "Reject no OAI identifier"
MSG_WELL_FORM_OAI = "Reject OAI is not well formed"


class Automaton(object):
    """Base class to search and process publications.

        - Decode the parameter of a selector defining user criteria.
        - Search for publications in the store, according to
          user criteria, or process an XML string.
        - Load records in the database.

    The parameters of the search as well as the parameters of the harvester
    are defined by the current request.

    """
    def __init__(self,
                 db,
                 id_team,
                 id_project,
                 controller,
                 id_category,
                 year_start=None,
                 year_end=None,
                 dry_run=True,
                 debug=False):
        """
        @note see C{build_harvester_tool} factory function building
        C{PublicationsTools}

        @type db: gluon.dal.DAL
        @param db:

        @type id_team: int
        @param id_team: Identifier of the team in the db

        @type id_project: int
        @param id_project: Identifier of the project in the db

        @type controller: unicode
        @param controller: Type of publication
        (i.e. 'article', 'proceedings', ...)

        @type id_category: int
        @param id_category: Identifier of the category of publication
        (i.e. ACL, ACTI, ...)

        @type year_start: int
        @keyword year_start: Start year of search (i.e. 2014)

        @type year_end: int
        @keyword year_end: End year of search (i.e. 2015)

        @type dry_run: boolean
        @keyword dry_run: True if no record is to be written to the db

        @type debug: bool
        @param debug: activate the debug mode

        @raise ToolException: when the team, the project or the category
        identifier is not defined.

        """
        self.collection_logs = []
        self.db = db
        self.id_team = id_team
        self.id_project = id_project
        self.controller = controller
        self.id_category = id_category
        self.year_start = year_start
        self.year_end = year_end
        self.dry_run = dry_run
        self.dbg = debug
        self.logs = []

        self.check = CheckAndFix()
        self.marc12 = Marc12()

        # check parameters
        # protection team, project and/or category have to be defined
        if not self.id_team:
            raise ToolException(MSG_NO_TEAM)

        if not self.id_project:
            raise ToolException(MSG_NO_PROJECT)

        if not self.id_category:
            raise ToolException(MSG_NO_CAT)

        # Construct harvester Storage needed for the log
        self.harvester = Storage(id_teams=self.id_team,
                                 id_projects=self.id_project,
                                 controller=self.controller,
                                 id_categories=self.id_category)

        # private cache for my_author rescue list:
        # __par is the key (year, id_project, id_team) of the cached value
        # __reference is the cached list of authors
        self.__par = None
        self.__reference = None

    def _is_in_db(self, rec_id, title):
        """Return C{True} if the record is already in the database.
        The search is based on the origin field.
        A new log entry is created when the record is found.

        @note: C{self.harvester.host} has to be defined. It is set by
        L{process_url}.

        @type rec_id: int
        @param rec_id: record identifier

        @type title: str
        @param title: title of the collection

        @rtype: bool

        """
        db = self.db
        harvester = self.harvester

        # check
        url = OAI_URL % (harvester.host, rec_id)
        db_id = get_id(db.publications, origin=url)

        if db_id is None:
            return False

        publication = db.publications[db_id]

        # same category for the publication and the harvester
        # keep the record if it is not the case
        # this is required to transform a preprint into article
        if publication.id_categories != harvester.id_categories:
            return False

        # log
        self.logs.append(Msg(harvester=self.harvester,
                             collection=title,
                             record_id=rec_id,
                             title=publication.title))

        self.logs[-1].idle(MSG_IN_DB, publication.year)

        return True

    def _search_parameters(self, collection):
        """Build the keywords to steer the URL search in invenio store.
        The main parameter is the collection and the date range defined
        in the selector. When no year is defined, no criterion on the
        date is applied.

        @type collection: unicode
        @param collection: statement defining the collection in the
        store, I{i.e.} C{"find cn d0 and tc p and not tc c"} or
        C{"LHCb Papers"}. The syntax depends on the invenio store.

        @rtype: dict
        @return: the key are a sub-set of those defined in
        L{invenio_tools.InvenioStore.get_ids}.

        """
        year_start = self.year_start
        year_end = self.year_end

        # INSPIREHEP store
        if collection.startswith('find'):

            query = collection

            if year_start and not year_end:
                query += " and date %s" % year_start

            elif not year_start and year_end:
                query += " and date %s" % year_end

            elif year_start and year_end:
                # strict inequalities, therefore widen the range by one year
                query += " and date > %s and date < %s " \
                         % (year_start - 1, year_end + 1)

            return dict(p=query,    # query à la spires
                        rg=1000,    # maximum number of records returned
                        sf='year',  # sort by date
                        so='d')     # descending order

        # CERN INVENIO store
        dic = dict(cc=collection,   # collection
                   sf='year',       # sort by date
                   so='d')          # descending order

        rex = None

        if year_start and year_end:
            years = range(year_start, year_end + 1)
            rex = '|'.join(str(year) for year in years)

        elif year_start:
            rex = year_start

        elif year_end:
            rex = year_end

        # the criterion on the year is only applied when a year is defined.
        # Previously, rex was unbound in that case and the method crashed.
        if rex is not None:
            dic.update(f1='year',   # search on year
                       m1='r',      # use regular expression
                       p1=rex)      # regular expression defining year

        return dic

    def _my_author_list(self, record):
        """Extract the rescue list for my authors in the database.
        The result is cached for the last (year, project, team) requested.

        @type record: L{Record}
        @param record:

        @rtype: list
        @return: empty when not defined

        """
        year = record.year()

        # try to recover year when not defined
        if not year:
            # published article, proceeding
            if "773" in record and "y" in record["773"]:
                year = record["773"]["y"]

            # start date of a conference
            elif "111" in record and "x" in record["111"]:
                year = record["111"]["x"]

            # end date of a conference
            elif "111" in record and "z" in record["111"]:
                year = record["111"]["z"]

            # submitted date
            elif "269" in record and "c" in record["269"]:
                year = record["269"]["c"]

            else:
                return []

        #
        # NOTE:
        # keep in mind that the CheckAndfix mechanism is not yet run
        # therefore year can be a list due to erratum, ...
        # Take the smallest value without re-ordering the list
        # which belongs to the record.
        #
        if isinstance(year, list):
            if not year:
                return []
            year = min(year)

        # the value can have several format 1992, 1992-12-31, ....
        m = REG_YEAR.search(year)
        if not m:
            return []

        year = m.group(1)

        # caching
        t = (year, self.id_project, self.id_team)
        if t == self.__par:
            return self.__reference

        # extract the list from the database
        row = self.db.my_authors(year=year,
                                 id_projects=self.id_project,
                                 id_teams=self.id_team)

        if row:
            self.__reference = row['authors'].split(', ')
        else:
            self.__reference = []

        # remember the key of the cached value,
        # otherwise the cache is never used
        self.__par = t

        return self.__reference

    def check_by_fields(self, **kwargs):
        """Check that a record already exists using the fields defined in
        the keyword arguments.

            - Fix the field origin when a match is found.
            - Actions are logged in the last entry of C{self.logs}.

        @keyword oai_url: typical value is "http://cds.cern.ch/record/123456"
        It is mandatory but it is not used for the search.

        @keyword year: it is mandatory since it is used in the log.

        @note: this method can be customised in inherited class to perform
        dedicated action.

        @rtype: tuple
        @return: the tuple (id, status). The id of the record or None.
        The status is equal to one when the existing record was modified
        zero otherwise

        """
        if self.dbg:
            print("check existing record by fields")

        db = self.db

        # origin can't be used for the search
        oai_url = kwargs.pop("oai_url")

        # look for an existing record
        rec_id = get_id(db.publications, **kwargs)
        if not rec_id:
            return (None, 0)

        # the origin field is fine, nothing to do
        origin = db.publications[rec_id].origin
        if origin and origin == oai_url:
            self.logs[-1].idle(MSG_IN_DB, kwargs["year"])
            return (rec_id, 0)

        # fix origin field
        # the modification is logged even in dry run mode
        if not self.dry_run:
            db.publications[rec_id] = dict(origin=oai_url)

        self.logs[-1].modify(MSG_FIX_ORIGIN, kwargs["year"])
        return (rec_id, 1)

    def check_collaboration(self, value):
        """Check that the collaboration exists in the database,
        create it if not.

        @type value: str or None
        @param value: the name of the collaboration.

        @rtype: int
        @return: the id of the collaboration, UNDEF_ID when not defined

        """
        if not value:
            return UNDEF_ID

        return get_create_id(self.db.collaborations, collaboration=value)

    def check_publisher(self, value):
        """Check that the publisher exists in the database,
        create it if not.

        @type value: str or None
        @param value: the abbreviation of the publisher name.

        @rtype: int
        @return: the id of the publisher, UNDEF_ID when not defined

        """
        if not value:
            return UNDEF_ID

        return get_create_id(self.db.publishers, abbreviation=value)

    def select_record(self, record):
        """C{True} when the C{record} is selected.
        This method checks and formats the author field.
        The record is rejected, and the reason is logged in the last
        entry of C{self.logs}, when one of the checks fails.

        @note: The checks depend on the type of publications and have to
        be implemented in inherited class.

        @type record: L{Record}
        @param record:

        @rtype: bool

        """
        if self.dbg:
            print("select record and check / format authors")

        try:
            self.check.temporary_record(record)
            self.check.authors(record)
            self.check.format_authors(record, format_author_fr)
            self.check.collaboration(record)

        # NOTE(review): BaseException is kept on purpose. The exceptions
        # raised by the checks are defined outside this file and may not
        # derive from Exception -- confirm before narrowing it.
        except BaseException as e:
            self.logs[-1].reject(e, record.year())
            return False

        return True

    def load_db(self, record):
        """Load the record in the database.

        @note: This method depends on the type of publications.
        It has to be implemented for each inherited class.
        This base implementation does nothing.

        @type record: L{Record}
        @param record:

        @rtype: int
        @return: one when the record is inserted / updated in the database
        zero otherwise.

        """
        return 0

    def process_xml(self, xml):
        """Decode the xml and load it in the database.

        @raise Exception: the type of exception depends on what happen:
            - L{ToolException} when project, team or category identifier
              are not defined.
            - C{StoreException} when something goes wrong interrogating the
              store.
            - C{Marc12Exception} when something goes wrong decoding the XML
              string returned by the store
            - C{CheckException} if the L{Record} is not valid
            - C{Exception} if the python code crashes

        @type xml: unicode
        @keyword xml: marc12 xml encoding of the publication record

        """
        if self.dbg:
            print("start processing %s" % self.__class__.__name__)
            print("decode request")
            print("get harvest parameters")

        # decode the XML request
        self.collection_logs.append(MsgCollection(found=1))
        self.decode_xml(xml)

    def process_url(self, host, collections):
        """Retrieve the xml from the invenio store and load it in the database

        An error met when searching a collection is stored in the
        collection log and the next collection is processed.
        An error met when processing a record is logged as a rejection
        and the next record is processed.

        @raise Exception: depending on what happen, can be StoreException,
        Marc12Exception, ...

        @type host: unicode
        @keyword host: Web host name to query for publication

        @type collections: unicode
        @keyword collections: Request string to send to the host to get
        the publications. Several collections are separated by a comma.

        """
        if self.dbg:
            print("process URL search")

        # extend harvester for logs
        self.harvester.host = host
        self.harvester.collections = collections

        store = InvenioStore(host)

        # list of collections
        collections = re.sub(' *, *', ',', collections).split(',')

        # alias
        controller = self.controller
        project = self.db.projects[self.id_project].project

        # extract the list of publications from the store for each collection
        # the search is performed on a range of creation date
        # if not defined all elements are returned
        #
        # The method used here minimises the memory usage
        # on the server as well as on the client side
        for collection in collections:

            # log collection information
            # A collection is identified as "Project Controller collection"
            title = "%s / %s / %s" % (project, controller, collection)
            self.collection_logs.append(MsgCollection(title=title))

            # search record in the harvester repository
            kwargs = self._search_parameters(collection)

            try:
                rec_ids = store.get_ids(**kwargs)

            except Exception as error:
                self.collection_logs[-1].url = store.last_search_url()
                self.collection_logs[-1].error = error
                continue

            self.collection_logs[-1].url = store.last_search_url()
            self.collection_logs[-1].found = len(rec_ids)

            if not rec_ids:
                continue

            if self.dbg:
                print('%i records found in %s' % (len(rec_ids), collection))

            for rec_id in rec_ids:

                if self.dbg:
                    print("\nprocessing record %s" % rec_id)

                try:
                    if self._is_in_db(rec_id, title):
                        continue

                    xml = store.get_record(rec_id)
                    self.decode_xml(xml)

                except Exception as e:
                    print(traceback.format_exc())
                    url = OAI_URL % (host, rec_id)
                    self.logs.append(Msg(harvester=self.harvester,
                                         collection=title,
                                         record_id=rec_id,
                                         title=url))
                    self.logs[-1].reject(e)

    def decode_xml(self, xml):
        """Decode the MARC XML string and load records in the database.
        A log entry is created for each record. A record is rejected,
        and not loaded, when its OAI identifier is missing or not well
        formed, or when L{select_record} returns C{False}.

        @type xml: unicode
        @param xml: MARC XML string

        """
        if self.dbg:
            print("process xml record")

        # NOTE: exceptions raised by the decoding
        # are not caught here but by the caller
        records = self.marc12(xml)

        # process individual record
        for record in records:

            if self.dbg:
                print("record decoded")

            # start the log for the record
            self.logs.append(Msg(harvester=self.harvester,
                                 collection=self.collection_logs[-1].title,
                                 record_id=record.id(),
                                 title=record.title()))

            # reject record with undefined OAI field
            oai = record.oai()
            if not oai:
                self.logs[-1].reject(MSG_NO_OAI, record.year())
                continue

            # reject record when the OAI is not well formed
            if not REG_OAI.match(oai):
                self.logs[-1].reject(MSG_WELL_FORM_OAI, record.year())
                continue

            # additional selection stage
            # at this step the validity of the record is checked
            # and non-conformities are repaired
            if not self.select_record(record):
                continue

            if self.dbg:
                print("start loading in the database")

            # load record in the database
            self.load_db(record)

            if self.dbg:
                print("%s %s" % (self.logs[-1].action.upper(),
                                 self.logs[-1].txt))

    def report(self):
        """Build the processing report.

        @rtype: dict
        @return:
            - C{collection_logs} (list) one L{MsgCollection}
              for each collection
            - C{controller} (str)
            - C{logs} (list) one L{Msg} for each publication

        """
        return dict(collection_logs=self.collection_logs,
                    controller=self.controller,
                    logs=self.logs)