base.py 4.17 KB
Newer Older
1 2 3 4
# -*- coding: utf-8 -*-
""" harvest_tools.base

"""
5
from invenio_tools import REG_AUTHOR
6

7
# Label for the dry-run mode; where it is consumed is not visible in this file.
DRY_RUN = "dry run"
8 9

# Message template: the ``%s`` placeholder has to be filled by the caller
# (presumably with the details of the crash -- confirm against the callers).
MSG_CRASH = "Crash: %s"
10 11 12
# Status messages, plain strings without placeholder.
# NOTE(review): presumably reported for each harvested record -- confirm
# against the callers, which are not visible in this file.
MSG_FIX_ORIGIN = "Fixed the origin field"
MSG_IN_DB = "Already in the database"
MSG_LOAD = "Load in the database"
13 14 15


def family_name_fr(full_name):
    """Extract the family name when the full name is encoded as ``J. Doe``.

    The first word of the name is always dropped. Any following word which
    is made of initials only (``M.`` or ``J.-P.``) is dropped too, so that
    ``J. Doe``, ``J.-P. Doe`` and ``J. M. Doe`` all give ``Doe``.
    Words which are not initials are kept: ``J. Le Foo`` gives ``Le Foo``
    and ``J. St. John`` gives ``St. John``.

    Args:
        full_name (unicode): author name encoded according to French
            typographic rules.

    Returns:
        unicode: family name. The full name is returned unchanged when it
        does not contain any space.

    """
    _, sep, rest = full_name.partition(' ')

    # a single word: nothing to strip
    if not sep:
        return full_name

    # skip the additional initials, but never the last word
    while True:
        word, sep, tail = rest.partition(' ')
        if not sep or not _is_initials_fr(word):
            return rest
        rest = tail


def _is_initials_fr(word):
    """Tell whether ``word`` is made of initials only: ``M.`` or ``J.-P.``."""
    return all(len(part) == 2 and part[0].isalpha() and part[1] == '.'
               for part in word.split('-'))


def format_author_fr(name):
    """Format the author name according to French typographic rules.

    Note:
        The name is only title-cased when it does not match ``REG_AUTHOR``
        or when no initial can be extracted from the first name.

    Args:
        name (unicode): full name. Possible patterns are:

            * ``Family, L``
            * ``Family, P L``
            * ``Family, M -H``
            * ``Family Name, J``
            * ``Family-Name, J``
            * ``Family, F Name``
            * ``Family, First``

    Returns:
        unicode: the author name encoded as ``J.-P. Doe``, always
        title-cased (``R. LE FOO`` becomes ``R. Le Foo``).
        An empty string or ``None`` is returned as it is.

    """
    # protection
    if name is None or name == '':
        return name

    # The pattern is defined in invenio_tools. From the way the groups are
    # used below: group 1 is the family name and group 2 the first name.
    # NOTE(review): groups 3 and 4 are presumably the separator and the
    # second first name -- confirm against the definition of REG_AUTHOR.
    match = REG_AUTHOR.match(name)

    if match:
        family = match.group(1)
        first = match.group(2)

        if match.group(3) and match.group(4):
            result = '%s.%s%s. %s' % (first[0], match.group(3)[0],
                                      match.group(4)[0], family)

        elif "-" in first:
            # One initial per non-empty part of the hyphenated first name.
            # Joining them works for any number of hyphens, where a fixed
            # format string would raise on anything but exactly two parts.
            initials = [el[0] for el in first.split("-") if el]
            if initials:
                result = "%s. %s" % (".-".join(initials), family)
            else:
                result = name

        else:
            result = '%s. %s' % (first[0], family)

    else:
        # formatting failed, keep the name
        result = name

    # avoid author name in upper case (R. LE FOO --> R. Le Foo)
    return result.title()


def learn_my_authors(db,
                     authors=None,
                     id_project=None,
                     id_team=None,
                     year=None):
    """Train the rescue list of the authors of my institute.

    The rescue list is the ``my_authors`` record selected by the project,
    the team identifier and the year. When the record does not exist it is
    created with ``authors``. Otherwise the authors whose family name is
    not yet in the list are added and the list is sorted by family name.

    Warning:
        all keyword arguments have to be defined.

    Args:
        db (gluon.dal.DAL): database connection.
        authors (str): author names, encoded as ``J. Doe`` and
            separated by ``', '``.
        id_project (int): the identifier of the project in the database.
        id_team (int): the identifier of the team in the database.
        year (int): the year

    """
    def _family(author):
        # Text after the last initial, so that J. Foo and J. M. Foo are
        # seen as the same person. The name is kept as it is when it
        # contains no initial.
        idx = author.rfind('. ')
        return author if idx == -1 else author[idx + 2:]

    # get the list of authors store in the database
    row = db.my_authors(id_projects=id_project,
                        id_teams=id_team,
                        year=year)

    # no entry in the database
    if not row:
        db.my_authors[0] = dict(authors=authors,
                                id_projects=id_project,
                                id_teams=id_team,
                                year=year)
        return

    database_authors = [el for el in row.authors.split(', ') if el]

    # authors of the input list which are not in the database
    diff = set(authors.split(', ')).difference(database_authors)
    if not diff:
        return

    # Compare family names one to one. A substring search in the stored
    # string would find "Li" in "J. Lin" and wrongly skip the new author.
    known_families = set(_family(el) for el in database_authors)

    elems = []
    for elem in diff:
        # be careful with the string encoding (Python 2): unicode names are
        # encoded in utf8 before they are compared and stored.
        if isinstance(elem, unicode):
            elem = elem.encode('utf8')

        # applies to every name, whatever its original type
        if _family(elem) not in known_families:
            elems.append(elem)

    # update the database
    database_authors.extend(elems)
    database_authors.sort(key=family_name_fr)
    db.my_authors[row.id] = dict(authors=', '.join(database_authors))


LE GAC Renaud's avatar
LE GAC Renaud committed
149 150
class ToolException(Exception):
    """Exception type of the harvester tools.

    It adds nothing to ``Exception``; it only provides a dedicated type
    to raise and to catch.

    """