# -*- coding: utf-8 -*-
""" harvest_tools.checkandfix

Checks and repairs applied to the records describing publications.

"""
import numpy as np
import re
import regex

from .base import search_synonym, ToolException
from datetime import datetime
from .exception import CheckException
from gluon import current
from invenio_tools import (DECODE_REF,
                           MSG_NO_CONF,
                           MSG_NO_THESIS,
                           OAI_URL,
                           RecordConf,
                           RecordThesis,
                           REG_OAI,
                           REG_YEAR)
from invenio_tools.recordpubli import PAPER_REFERENCE_KEYS
from itertools import imap
from plugin_dbui import CLEAN_SPACES, get_id

# String types accepted for a scalar MARC subfield.
# Under Python 2 ``basestring`` covers both ``str`` and ``unicode``.
try:
    _STRING_TYPES = (basestring,)
except NameError:
    _STRING_TYPES = (str,)

DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")

# Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")
DECODE_YYYY = re.compile(r"^(\d{4})$")

# Month abbreviation (English and French) to month number
MONTHS = {"Jan": "01",
          "Feb": "02",
          "Fev": "02",
          "Mar": "03",
          "Apr": "04",
          "Avr": "04",
          "May": "05",
          "Mai": "05",
          "Jun": "06",
          "Jul": "07",
          "Aug": "08",
          "Sep": "09",
          "Oct": "10",
          "Nov": "11",
          "Dec": "12"}

MSG_INVALID_HOST = "Invalid host"
MSG_NO_AUTHOR = "Reject no author(s)"
MSG_NO_CONF_DATE = "Reject no conference date"
MSG_NO_DATE = "Reject no submission date"
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
MSG_NO_REF = "Reject incomplete paper reference. Check "
MSG_NO_YEAR = "Reject no publication year"
MSG_TEMPORARY_RECORD = "Temporary record"
MSG_TO_MANY_DATE = "Reject to many submit date"
MSG_TO_MANY_FAUTHOR = "Reject to many first author"
MSG_TO_MANY_YEAR = "Reject to many year"
MSG_WELL_FORMED_CONF_DATES = "Reject conference dates is not well formed"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
MSG_WELL_FORMED_EDITOR = "Reject editor is not well formed"

OAI_INVENIO = "oai:%s:%s"

REG_COLLABORATION = re.compile(regex.REG_COLLABORATION)

# 12 - 15 Mar 2014
REG_CONF_DATES_1 = re.compile(r"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")

# 29 Feb - 1 Mar 2014
REG_CONF_DATES_2 = \
    re.compile(r"(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")

REG_CONF_DATES = re.compile(regex.REG_CONF_DATES)

REG_DOI = re.compile(r"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)")

REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)

REG_WELL_FORMED_CONF_DATES_1 = \
    re.compile(r"\d{2} - \d{2} [A-Z][a-z]{2} \d{4}")

REG_WELL_FORMED_CONF_DATES_2 = \
    re.compile(r"\d{2} [A-Z][a-z]{2} - \d{2} [A-Z][a-z]{2} \d{4}")

UNIVERSITY = "University"


class CheckAndFix(object):
    """A collection of tools to check and repair the content of a record.

    Most methods either return silently, possibly after modifying the
    record in place, or raise a ``CheckException`` carrying one of the
    ``MSG_*`` messages.

    """
    def __init__(self):

        self.db = current.db
        self.reg_institute = self._get_reg_institute()

        # private cache for my_author rescue list:
        # the key (year, id_project, id_team) and the associated list
        self.__par = None
        self.__reference = None

        # private cache for my authors list, keyed by record id
        self._my_authors = {}

    @staticmethod
    def _get_conference_dates(record):
        """Return the opening and closing dates of a conference.

        Args:
            record (RecordConf): record describing a conference
                proceeding or talk.

        Returns:
            tuple of datetime.datetime: opening and closing dates.

        Raises:
            ToolException: no conference date found.

        """
        if u"meeting_name" not in record:
            raise ToolException(MSG_NO_CONF_DATE)

        meeting = record[u"meeting_name"]
        meeting = (meeting[0] if isinstance(meeting, list) else meeting)

        # CDS has the opening and closing dates encoded as 20141231
        if u"opening_date" in meeting and u"closing_date" in meeting:
            fmt = "%Y%m%d"
            opening = datetime.strptime(meeting[u"opening_date"], fmt)
            closing = datetime.strptime(meeting[u"closing_date"], fmt)
            return (opening, closing)

        # both CDS and INSPIRE have the dates subfield.
        # Its absence means that no date is available at all: report it
        # with the documented exception rather than with a KeyError.
        if u"date" not in meeting:
            raise ToolException(MSG_NO_CONF_DATE)

        val = meeting[u"date"]
        fmt = "%d-%b-%Y"

        # date is encoded as 12 - 15 Mar 2014
        m = REG_CONF_DATES_1.match(val)
        if m:
            day1, day2, month, year = m.groups()
            opening = datetime.strptime(
                u"%s-%s-%s" % (day1, month, year), fmt)
            closing = datetime.strptime(
                u"%s-%s-%s" % (day2, month, year), fmt)
            return (opening, closing)

        # dates are encoded 29 Feb - 1 Mar 2014
        m = REG_CONF_DATES_2.match(val)
        if not m:
            raise ToolException(MSG_NO_CONF_DATE)

        day1, month1, day2, month2, year = m.groups()
        opening = datetime.strptime(u"%s-%s-%s" % (day1, month1, year), fmt)
        closing = datetime.strptime(u"%s-%s-%s" % (day2, month2, year), fmt)

        return (opening, closing)

    def _get_reg_institute(self):
        """Get the regular expression defining the affiliation of
        my institute.

        The value of ``current.app.reg_institute`` is used when it is
        defined. Otherwise, the expression is obtained by concatenating
        the affiliation keys stored in the database.
        Affiliation key can contain characters like ``(``, ``)``, ``&``,
        ``$``, ``+`` or ``?``. They are protected by a backslash.

        Returns:
            str:

        """
        # alias
        db = self.db
        reg_institute = current.app.reg_institute

        # regular expression for the affiliation keys
        # protect special character
        # add start and end of string for an exact match
        if not reg_institute:
            lst = []
            for row in db(db.affiliation_keys.id > 0).iterselect():
                val = row.key_u
                for char in ("(", ")", "&", "$", "+", "?"):
                    val = val.replace(char, "\\" + char)

                lst.append(r"(^|\|){}($|\|)".format(val))

            reg_institute = r"|".join(lst)

        return reg_institute

    def _get_author_rescue_list(self, record, id_project, id_team):
        """Get the rescue list for my authors.

        The list is read in the database table ``my_authors`` for the
        year of the record, the project and the team. The last result is
        cached.

        Args:
            record (RecordPubli): record describing a publication.
            id_project (int): identifier of the project in the database.
            id_team (int): identifier of the team in the database.

        Returns:
            list: empty when not defined

        """
        year = record.submitted()

        # try to recover year when not defined
        if not year:
            # published article, proceeding
            if record[u"publication_info"].year.iloc[0] != "":
                year = record[u"publication_info"].year.iloc[0]

            # start date of a conference
            elif record._get(u"meeting_name", u"opening_date") != u"":
                year = record._get(u"meeting_name", u"opening_date")

            # end date of a conference
            elif record._get(u"meeting_name", u"closing_date") != u"":
                year = record._get(u"meeting_name", u"closing_date")

            else:
                return []

        #
        # protection
        # submitted and paper year are protect against erratum, but ...
        #
        if isinstance(year, list):
            year.sort()
            year = year[0]

        # the value can have several format 1992, 1992-12-31, ....
        m = REG_YEAR.search(year)
        if not m:
            return []

        year = m.group(1)

        # caching
        key = (year, id_project, id_team)
        if key == self.__par:
            return self.__reference

        # extract the list from the database
        row = self.db.my_authors(year=year,
                                 id_projects=id_project,
                                 id_teams=id_team)

        if row:
            self.__reference = row['authors'].strip("\n").split(', ')
        else:
            self.__reference = []

        # remember which parameters the cached list belongs to,
        # otherwise the cache test above can never succeed
        self.__par = key

        return self.__reference

    def _is_synonym(self, tablename, value):
        """Check that the synonym field contains *value*.

        Args:
            tablename (str): name of the database table
            value (str): value to be searched

        Returns:
            bool: ``True`` if *one* row is found, ``False`` otherwise.

        """
        db = self.db
        query = db[tablename].synonyms.contains(value)

        return db(query).count() == 1

    def _recover_submitted(self, record):
        """Recover submitted date using conference, preprint or
        thesis information.

        * conference: the opening date formatted as ``YYYY-MM-DD``
        * thesis: the defense date
        * otherwise: ``20YY-MM`` decoded from the arXiv preprint number

        Args:
            record (RecordPubli): record describing a publication.

        Returns:
            unicode: empty when procedure failed

        Raises:
            ToolException: conference without conference dates.

        """
        val = u""

        if isinstance(record, RecordConf):
            opening, _ = self._get_conference_dates(record)
            return opening.strftime("%Y-%m-%d")

        elif isinstance(record, RecordThesis):
            val = record.these_defense()

        else:
            report = record.preprint_number()
            if report:
                m_arxiv = DECODE_ARXIV.match(report)
                if m_arxiv:
                    val = "20%s-%s" % (m_arxiv.group(1), m_arxiv.group(2))

        return val

    @staticmethod
    def authors(record):
        """Check that author fields are defined.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when there is no authors.

        """
        if not record.is_authors():
            raise CheckException(MSG_NO_AUTHOR)

    def collaboration(self, record):
        """Check the collaboration.
        Have a look to the synonyms when the collaboration is not
        well formed.

        Nothing is checked when the record has no collaboration.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the collaboration value is neither
                defined nor entered as a synonym.

        """
        val = record.collaboration()
        if not val:
            return

        # convert ToolException to CheckException
        try:
            search_synonym(self.db.collaborations, "collaboration", val)

        except ToolException as e:
            raise CheckException(*e.args)

    def country(self, record):
        """Check conference country.
        Have a look to the synonyms when the country does not exist.

        Nothing is checked when the record is not a ``RecordConf``.

        Args:
            record (RecordConf): record describing a talk or a proceeding.

        Raises:
            CheckException: the country is not defined nor entered
                as a synonym.

        """
        if not isinstance(record, RecordConf):
            return

        val = record.conference_country()

        # convert ToolException to CheckException
        try:
            search_synonym(self.db.countries, "country", val)

        except ToolException as e:
            raise CheckException(*e.args)

    def conference_date(self, record):
        """Check conference date and format it properly, either as
        ``12 - 15 Mar 2014`` or as ``29 Feb - 01 Mar 2014``.

        Nothing is done when the record is not a ``RecordConf``.

        Args:
            record (RecordConf): record describing a talk or a proceeding.

        Raises:
            CheckException: dates are not found.
            ToolException: dates are defined but cannot be decoded.

        """
        # conference information are available, i.e proceeding
        if not isinstance(record, RecordConf):
            return

        val = record.conference_dates()
        if len(val) == 0:
            raise CheckException(MSG_NO_CONF_DATE)

        # is it well formed
        if REG_WELL_FORMED_CONF_DATES_1.match(val):
            return

        if REG_WELL_FORMED_CONF_DATES_2.match(val):
            return

        # format the date properly
        opening, closing = self._get_conference_dates(record)

        if opening.month == closing.month:
            val = "%02i - %02i %s %i" % (opening.day,
                                         closing.day,
                                         opening.strftime("%b"),
                                         opening.year)
        else:
            val = "%02i %s - %02i %s %i" % (opening.day,
                                            opening.strftime("%b"),
                                            closing.day,
                                            closing.strftime("%b"),
                                            opening.year)

        meeting = record[u"meeting_name"]
        meeting = (meeting[0] if isinstance(meeting, list) else meeting)
        meeting[u"date"] = val

    def is_bad_oai_used(self, record):
        """Bad OAI is when the ``id`` in the OAI field is different from
        the ``record id``. This happens when an old record is redirected
        to new one.

        Args:
            record (RecordPubli): record describing a publication.

        Returns:
            bool: ``True`` when a record is found in the database with
            the bad OAI. ``False`` otherwise, including when the OAI is
            missing or cannot be decoded.

        """
        value = record.oai()
        if not value:
            return False

        match = REG_OAI.match(value)
        if match is None:
            return False

        if int(match.group(2)) == record.id():
            return False

        # a record with the bad OAI exists in the database
        bad_oai_url = OAI_URL % (match.group(1), match.group(2))
        if get_id(self.db.publications, origin=bad_oai_url):
            return True

        return False

    @staticmethod
    def format_authors(record, fmt="Last, First"):
        """Format the author names.

        Args:
            record (RecordPubli): record describing a publication.
            fmt (str): define the format for author names.
                Possible values are "First, Last", "F. Last", "Last",
                "Last, First" and "Last F."

        """
        record.reformat_authors(fmt)

    @staticmethod
    def format_editor(record):
        """Format the editor abbreviation.
        The encoding depends on the store::

            INVENIO:    Phys. Lett. B + volume 673
            INSPIREHEP: Phys.Lett + volume B673

        Standardise the answer as ``Phys. Lett. B``.
        Nothing is done when the record is not published.

        Args:
            record (RecordPubli): record describing a publication.

        """
        if not record.is_published():
            return

        publi = record[u"publication_info"]
        first = publi.iloc[0]
        editor = first.title
        volume = first.volume

        # add space after the dot Phys.Rev -> Phys. Rev
        editor = re.sub(r'\.([A-Z])', r'. \1', editor)

        # get the volume letter
        # the slice editor[-1:] is safe when the editor is empty
        m = re.match(r'([A-Z]+) *(\d+)', volume)
        if m and m.group(1) != editor[-1:]:
            editor = "%s %s" % (editor, m.group(1))
            volume = m.group(2)

        # remove stupid mistake
        editor = CLEAN_SPACES(editor)

        # write through the DataFrame: the row returned by iloc[0] may be
        # a copy, in which case modifying it leaves the record unchanged
        publi.loc[publi.index[0], ["title", "volume"]] = [editor, volume]

    @staticmethod
    def _rewrite_university(record, fix):
        """Apply *fix* to the university name(s) stored in ``502 $b``.

        Args:
            record (RecordThesis): record describing a thesis.
            fix (callable): receive a name and return the fixed one.

        """
        if "502" not in record or "b" not in record["502"]:
            return

        field = record["502"]
        names = field["b"]

        if isinstance(names, _STRING_TYPES):
            field["b"] = fix(names)

        elif isinstance(names, list):
            for i, name in enumerate(names):
                names[i] = fix(name)

    def format_universities(self, record):
        """Format the name of the university for PhD:

          * Fix the name of Aix-Marseille University
          * Replace U. by University

        The first rule is applied when the regular expression for my
        institute contains ``CPPM``, the second one otherwise.
        Nothing is done when the record is not a ``RecordThesis``.

        Args:
            record (RecordThesis): record describing a thesis.

        """
        # protection
        if not isinstance(record, RecordThesis):
            return

        # CPPM: fix the name of Aix-Marseille university
        # which depends on the year of the defense
        if "CPPM" in self.reg_institute:
            year = REG_YEAR.search(record.these_defense()).group(1)
            if int(year) < 2012:
                university = "Université de la Méditerrannée Aix-Marseille II"
            else:
                university = "Aix Marseille Université"

            def fix(name):
                return university if "Marseille" in name else name

        # Other: replace U. by University
        else:
            university = current.T(UNIVERSITY, lazy=False)

            def fix(name):
                if "U." in name:
                    return name.replace('U.', university)
                return name

        self._rewrite_university(record, fix)

    def get_my_authors(self, record, sep=", ", sort=False):
        """Get authors of my institutes signing the record.
        The information is append to the Record object via the attribute
        ``my_authors``.

        The list cached by ``my_affiliation`` is used when it exists
        for this record. In that case the ``sort`` argument is ignored.

        Args:
            record (RecordPubli): record describing a publication.
            sep (unicode): string separating author names. The default is
                the comma.
            sort (bool): sort authors by family name when true otherwise
                use the order of authors at the creation of the record

        Returns:
            unicode: the list of authors separated by the ``sep`` argument.

        Raises:
            CheckException: when the list is empty

        """
        # might have been computed when affiliation is checked
        rec_id = record.id()
        if rec_id in self._my_authors:
            value = sep.join(self._my_authors[rec_id])

        # find authors of my institute signing the record
        else:
            value = record.find_authors_by_affiliation(self.reg_institute,
                                                       sep,
                                                       sort)

        if len(value) == 0:
            raise CheckException(MSG_NO_MY_AUTHOR)

        record.my_authors = value

        return value

    @staticmethod
    def is_conference(record):
        """Check that the record described a conference talk / proceeding.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: the record is not associated to a conference.

        """
        if not isinstance(record, RecordConf):
            raise CheckException(MSG_NO_CONF)

    @staticmethod
    def is_thesis(record):
        """Check that the record described a thesis.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the record does not describe a thesis.

        """
        if not isinstance(record, RecordThesis):
            raise CheckException(MSG_NO_THESIS)

    def my_affiliation(self,
                       record,
                       id_project,
                       id_team,
                       fmt_rescue="F. Last",
                       sort=False):
        """Check that authors of my institute are signatories.
        Launch a recovery procedure when affiliations are not defined.
        It is based on the author rescue list stored in the database.

        When the rescue list is used, the authors found are cached for
        ``get_my_authors``. Their order follows the ``sort`` argument.

        Args:
            record (RecordPubli): record describing a publication.
            id_project (int): identifier of the project in the database
            id_team (int): identifier of the team in the database
            fmt_rescue (str): the format for the authors used in the
                rescue list
            sort (bool): sort authors by family name when true otherwise
                use the order of authors at the creation of the record

        Returns:
            str:
                * the found affiliation
                * an empty string when the rescue list is used.

        Raises:
            CheckException: when the rescue list is required but empty
                or because the intersection between the rescue list and
                the author is null.

        """
        value = record.find_affiliation(self.reg_institute)
        if len(value) > 0:
            return value

        # affiliation is not defined
        # try to recover using the authors rescue list
        rescue_list = self._get_author_rescue_list(record,
                                                   id_project,
                                                   id_team)
        if not rescue_list:
            raise CheckException(MSG_NO_MY_AUTHOR)

        # format the author in the same way as the rescue list
        fmt_ref = record._last_fmt_author
        record.reformat_authors(fmt_rescue)

        if sort:
            authors = (record[u"authors"][["last_name", "fmt_name"]]
                       .sort_values(by="last_name")
                       .fmt_name)

        else:
            authors = (record[u"authors"].fmt_name
                       .sort_index())

        # go back to the origin formatting
        record.reformat_authors(fmt_ref)

        # compute the intersection between the authors and the rescue list.
        # Walk through the authors, instead of intersecting two sets,
        # in order to keep the ordering selected above. Duplicates are
        # removed as a set would do.
        rescue = set(rescue_list)
        seen = set()
        my_authors = []
        for author in authors:
            if author in rescue and author not in seen:
                seen.add(author)
                my_authors.append(author)

        if not my_authors:
            raise CheckException(MSG_NO_MY_AUTHOR)

        # cache the result for a latter use
        self._my_authors[record.id()] = my_authors

        return ""

    @staticmethod
    def paper_reference(record):
        """Check that editor, page, volume and paper year are defined
        for a published paper. Repair it from doi when possible.

        Nothing is done when the record is published according to
        ``record.is_published()`` or when it has no doi.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            ToolException: when the paper reference cannot be repaired,
                either because the doi cannot be decoded or because the
                year is missing.

        """
        if record.is_published():
            return

        # paper reference can be incomplete or missing
        # is the paper published ? In that case the doi is defined
        if u"doi" not in record:
            return

        # what information is missing ?
        #    * df.columns are title, volume, year and pagination
        #    * df can contains one or more rows due to erratum.
        #    * assume that the first row is the oldest one and corresponds
        #      to the first publication
        #    * the row contains empty string when the record is not
        #      published.
        #    * iloc[0] returns a series where the index are the column's
        #      name
        #
        columns = (record[u"publication_info"].iloc[0]
                   .replace("", np.nan)
                   .dropna()
                   .index)

        missing = PAPER_REFERENCE_KEYS.difference(columns)

        # try to recover from the doi when it has the form
        # xx.yyyy/Publisher.Volume.Page
        m = REG_DOI.match(record[u"doi"])
        if not m:
            raise ToolException(MSG_NO_REF + str(list(missing)))

        for subfield in missing:
            if subfield == "title":
                # transform PhysRevD in Phys. Rev. D
                li = re.split(r"([A-Z][a-z]+)", m.group(1))
                title = ". ".join([el for el in li if len(el) > 0])
                record[u"publication_info"].loc[0, u"title"] = title

            elif subfield == "volume":
                record[u"publication_info"].loc[0, u"volume"] = m.group(2)

            elif subfield == "pagination":
                record[u"publication_info"].loc[0, u"pagination"] = \
                    m.group(3)

            elif subfield == "year":
                # the year is not encoded in the doi
                raise ToolException(MSG_NO_REF + "[year]")

    def publisher(self, record):
        """Check publisher.
        Have a look to the synonyms when the publisher does not exist.

        Nothing is checked when the record has no editor.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the publisher is not defined nor
                entered as a synonym.

        """
        val = record.paper_editor()
        if len(val) == 0:
            return

        # convert ToolException to CheckException
        try:
            search_synonym(self.db.publishers, "abbreviation", val)

        except ToolException as e:
            raise CheckException(*e.args)

    @staticmethod
    def recover_oai(record, host):
        """Recover the OAI identifier when it is not defined
        or not well form.

        Args:
            record (RecordPubli): record describing a publication.
            host (str): possible values are ``cds.cern.ch``
                or ``inspirehep.net``

        Raises:
            ValueError: when the host is not one of the possible values.

        """
        # Note:
        # For the record cds 1951625, possible values are:
        #     oai:cds.cern.ch:1951625 (if it does not exist in inspirehep)
        #     oai:cds.cern.ch:1951625, oai:inspirehep.net:1319638 (if it
        #     exist in both store)
        # In all the case the first OAI corresponds to the record.id()
        #
        oai = record.oai()
        if oai is not None and REG_OAI.match(oai):
            return

        # MARC field and subfield containing the OAI for each store
        if host == "cds.cern.ch":
            field, subfield = "0248", "a"

        elif host == "inspirehep.net":
            field, subfield = "909CO", "o"

        else:
            raise ValueError(MSG_INVALID_HOST)

        if field not in record:
            record[field] = {}

        record[field][subfield] = OAI_INVENIO % (host, record.id())

    def submitted(self, record):
        """Standardise the submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.
        Look for alternative when it is not defined.

        The decoded formats are ``22 Mar 2011``, ``22 03 2011`` and
        ``2011``. In the last case the date is recovered from the
        conference, preprint or thesis information.
        The result is stored in ``record["prepublication"]["date"]``.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the date is not found or is not
                well formed.
            ToolException: when the date has to be recovered from a
                conference without conference dates.

        """
        date = record.submitted()

        # recover missing date using conference, preprint, thesis
        # information
        if not date:
            date = self._recover_submitted(record)

        if not date:
            raise CheckException(MSG_NO_DATE)

        # 22 Mar 2011
        # an unknown month is a badly formed date, not a crash
        m = DECODE_DD_MMM_YYYY.match(date)
        if m:
            month = MONTHS.get(m.group(2).capitalize())
            if month is None:
                raise CheckException(MSG_WELL_FORMED_DATE)

            date = '%s-%s-%02i' % (m.group(3), month, int(m.group(1)))

        # 22 03 2011
        m = DECODE_DD_MM_YYYY.match(date)
        if m:
            data = (m.group(3), int(m.group(2)), int(m.group(1)))
            date = '%s-%02i-%02i' % data

        # 2011
        if DECODE_YYYY.match(date):
            date = self._recover_submitted(record)

        # check the minimum requirement is 2001-05
        if not REG_SUBMITTED.match(date):
            raise CheckException(MSG_WELL_FORMED_DATE)

        record[u"prepublication"][u"date"] = date

    @staticmethod
    def temporary_record(record):
        """Some records are marked temporary.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the record is marked temporary

        """
        # INSPIREHEP
        # Can be find by using the XML syntax:
        # http://inspirehep.net/search?500__a="*Temporary record*"
        #
        # or the corresponding JSON field:
        # http://inspirehep.net/comment="*Temporary record*"
        #
        if u"comment" not in record:
            return

        if record[u"comment"] == u"*Temporary record*":
            raise CheckException(MSG_TEMPORARY_RECORD)