""" harvest_tools.base """ import pandas as pd import re DRY_RUN = "dry run" MSG_CRASH = "Crash: %s" MSG_FIX_ORIGIN = "Fixed the origin field" MSG_IN_DB = "Already in the database" MSG_IS = "Reject publication is a {}" MSG_LOAD = "Load in the database" REX_OAI_CDS = re.compile(r"oai:cds") T4 = " "*4 T6 = " "*6 def family_name_fr(full_name): """Extract the family name when the full name is encoded as ``J. Doe``. Args: full_name (str): author name encoded according to French typographic rules. Returns: str: family name """ return full_name[full_name.rfind('. ') + 2:] def filter_logs(logs): """Filter on OAI to remove duplicated entries. Note: * Entries can be duplicated when user harvest several stores. * Prefer entries from inspirehep Args: logs (list): list of message (Msg). Returns: list """ data = [{"oais": dct["oais"]} for dct in logs] df = pd.DataFrame(data) # tag primary OAI as cds or ins df["first_oai"] = df.oais.str.extract(r"oai:(\w{3})", expand=True) # update origin to order oai as cds, ins df["oais"] = df.oais.apply(order_oais) # filter preserving inspirehep fltr = (df .sort_values(["first_oai", "oais"]) .oais .duplicated(keep="last")) return [logs[tpl[0]] for tpl in fltr.items() if tpl[1] is False] def get_rex_institute(db, app): """Get the regular expression defining the affiliation of my institute. It is obtained by concatenating the affiliation keys. Affiliation key can contains character like ``(``, ``)`` or ``&``. They are replaced by ``\(`` *etc*. Args: db (pydal.DAL): database connection app (gluon.storage.Storage): namespace defining the application Returns: str: """ # alias reg_institute = app.reg_institute # regular expression for the affiliation keys # protect special character # add start and end of string for an exact match if not reg_institute: lst = [] for row in db(db.affiliation_keys.id > 0).iterselect(): val = row.key_u val = (val .replace("(", "\(") .replace(")", "\)") .replace("&", "\&") .replace("$", "\$") .replace("+", "\+") .replace("?", "\?")) val = r"(?:^|\|){}(?:$|\|)" .format(val) lst.append(val) app.reg_institute = reg_institute = r"|".join(lst) return reg_institute def learn_my_authors(db, authors=None, id_project=None, id_team=None, year=None): """Train the rescue list of the authors of my institute. Authors which are not in the rescue list, are added. The rescue list is defined by the project, the team identifier and by the year. Warning: all keyword arguments have to be defined. Args: db (gluon.dal.DAL): database connection. authors (list): authors names id_project (int): the identifier of the project in the database. id_team (int): the identifier of the team in the database. year (int): the year """ # get the list of authors store in the database row = db.my_authors(id_projects=id_project, id_teams=id_team, year=year) # no entry in the database if row is None: db.my_authors[None] = dict(authors=authors, id_projects=id_project, id_teams=id_team, year=year) return database_authors = row.authors.split(', ') # compare with the input list # and extract authors which are not in the db new = set(authors.split(', ')) ref = set(database_authors) diff = new.difference(ref) # update the database if diff: # NOTE1: be careful with the string encoding # NOTE2: handle the case J. Foo and J. M. Foo are the same person elems = [] for elem in diff: if isinstance(elem, str): family_name = elem[elem.rfind('. ') + 2:] if family_name not in row.authors: elems.append(elem) database_authors.extend(elems) database_authors.sort(key=family_name_fr) db.my_authors[row.id] = dict(authors=', '.join(database_authors)) def order_oais(oais): """Order OAIS string as cds, inspirehep Args: oais (str): record identifier in stores Returns: str """ if oais is None: return "" if oais.count(",") != 1 or REX_OAI_CDS.match(oais): return oais u, v = (el.strip() for el in oais.split(",")) return f"{v}, {u}"