# -*- coding: utf-8 -*-
""" harvest_tools.base

Author-name helpers, synonym lookup and message constants.

"""
from exception import ToolException
from invenio_tools import REG_AUTHOR
from plugin_dbui import get_id, UNDEF_ID


DRY_RUN = "dry run"

MSG_CRASH = "Crash: %s"
MSG_FIX_ORIGIN = "Fixed the origin field"
MSG_IN_DB = "Already in the database"
MSG_LOAD = "Load in the database"
MSG_NO_ENTRY = "Reject %s is not defined"
MSG_TOOMANY_SYNONYM = "Reject too many %s synonyms"


def family_name_fr(full_name):
    """Extract the family name when the full name is encoded as ``J. Doe``.

    Everything after the first space is returned, so a multi-word family
    name is kept whole (``J. Le Foo`` gives ``Le Foo``) but a second initial
    is kept too (``J. M. Foo`` gives ``M. Foo``).

    Args:
        full_name (unicode):
            author name encoded according to French typographic rules.

    Returns:
        unicode: family name, or the input itself when it contains no space.

    """
    _, space, tail = full_name.partition(' ')
    return tail if space else full_name


def _family_after_initials(name):
    """Return the part of *name* following its last ``'. '``.

    The whole name is returned when it contains no ``'. '``.

    """
    return name.rpartition('. ')[2]


def format_author_fr(name):
    """Format the author name according to French typographic rules.

    Note:
        When the name does not match ``REG_AUTHOR``, or when no initial can
        be extracted from a hyphenated first name, the name is not
        reformatted. The final title-casing is applied in every case.

    Args:
        name (unicode):
            full name. Possible patterns are:

            * ``Family, L``
            * ``Family, P L``
            * ``Family, M -H``
            * ``Family Name, J``
            * ``Family-Name, J``
            * ``Family, F Name``
            * ``Family, First``

    Returns:
        unicode: the author name encoded as ``J.-P. Doe``. An empty string
        or ``None`` is returned unchanged.

    """
    # protection
    if not name:
        return name

    match = REG_AUTHOR.match(name)

    if not match:
        # unknown pattern: keep the name as it is
        result = name

    else:
        family = match.group(1)
        first = match.group(2)

        if match.group(3) and match.group(4):
            result = '%s.%s%s. %s' % (first[0],
                                      match.group(3)[0],
                                      match.group(4)[0],
                                      family)

        elif "-" in first:
            # Join as many initials as there are hyphenated parts, and
            # ignore empty parts (leading, trailing or doubled hyphen),
            # instead of assuming exactly two non-empty parts.
            initials = [part[0] for part in first.split("-") if part]
            if initials:
                result = "%s. %s" % (".-".join(initials), family)
            else:
                result = name

        else:
            result = '%s. %s' % (first[0], family)

    # avoid author name in upper case (R. LE FOO --> R. Le Foo)
    return result.title()


def learn_my_authors(db,
                     authors=None,
                     id_project=None,
                     id_team=None,
                     year=None):
    """Train the rescue list of the authors of my institute.

    Authors which are not in the rescue list are added. The rescue list is
    defined by the project, the team identifier and by the year.

    Warning:
        all keyword arguments have to be defined.

    Args:
        db (gluon.dal.DAL):
            database connection.

        authors (str):
            author names separated by ``', '``, each one encoded
            as ``J. Doe``.

        id_project (int):
            the identifier of the project in the database.

        id_team (int):
            the identifier of the team in the database.

        year (int):
            the year

    """
    # get the list of authors stored in the database
    row = db.my_authors(id_projects=id_project, id_teams=id_team, year=year)

    # no entry in the database: assigning to id 0 inserts a new record
    if not row:
        db.my_authors[0] = dict(authors=authors,
                                id_projects=id_project,
                                id_teams=id_team,
                                year=year)
        return

    database_authors = row.authors.split(', ')

    # authors of the input list which are not, verbatim, in the database
    diff = set(authors.split(', ')).difference(database_authors)
    if not diff:
        return

    # NOTE1: be careful with the string encoding
    # NOTE2: J. Foo and J. M. Foo are the same person, therefore a candidate
    #        is compared to the stored authors by its family name only.
    #        The comparison is an exact match on the family name: a
    #        substring test on the whole stored string would wrongly find
    #        "Li" when "J. Liu" is stored.
    known_families = set(_family_after_initials(el) for el in database_authors)

    elems = []
    for elem in diff:
        # an empty input yields an empty name, never store it
        if not elem:
            continue

        if isinstance(elem, unicode):
            elem = elem.encode('utf8')

        if _family_after_initials(elem) not in known_families:
            elems.append(elem)

    database_authors.extend(elems)
    database_authors.sort(key=family_name_fr)
    db.my_authors[row.id] = dict(authors=', '.join(database_authors))


def search_synonym(table, fieldname, value, create=False):
    """Get the database identifier for the record having the database field
    or the synonyms field matching the value.

    Note:
        The database table must have a field name *synonyms*.
        It contains a list of strings.

    Args:
        table (gluon.DAL.Table):
            database table.

        fieldname (unicode):
            field of the database table identified by its name.

        value (unicode):
            value to be matched.

        create (bool):
            when ``True`` and neither the field nor the synonyms match,
            insert a new record with ``fieldname`` set to ``value``.

    Returns:
        int:
            * the id of the database record.
            * UNDEF_ID if value is not defined.

    Raises:
        ToolException:
            * when no record matches and ``create`` is ``False``.
            * when more than one synonym is found.

    """
    if not value:
        return UNDEF_ID

    # first, look for the value in the field itself
    kwargs = {fieldname: value}
    id_rec = get_id(table, **kwargs)
    if id_rec is not None:
        return id_rec

    # nothing found, have a look to the synonyms field
    db = table._db
    setrows = db(table.synonyms.contains(value))
    ncount = setrows.count()

    # no synonym found, create the entry or reject
    if ncount == 0:
        if create:
            return table.insert(**kwargs)
        raise ToolException(MSG_NO_ENTRY % table._tablename)

    # one synonym found
    if ncount == 1:
        return setrows.select(table.id).first().id

    # more than one synonym - don't know what to choose
    raise ToolException(MSG_TOOMANY_SYNONYM % table._tablename)