base.py 5.88 KB
Newer Older
1 2 3 4
# -*- coding: utf-8 -*-
""" harvest_tools.base

"""
5
from exception import ToolException
6
from invenio_tools import REG_AUTHOR
7 8
from plugin_dbui import get_id, UNDEF_ID

9

10
# Label for the dry-run mode.
# NOTE(review): not used in this section of the file -- confirm the callers.
DRY_RUN = "dry run"

# Message templates. Those containing ``%s`` expect exactly one value
# supplied through the ``%`` operator.
MSG_CRASH = "Crash: %s"
MSG_FIX_ORIGIN = "Fixed the origin field"
MSG_IN_DB = "Already in the database"
MSG_LOAD = "Load in the database"

# Filled with the name of a database table by ``search_synonym``.
MSG_NO_ENTRY = "Reject %s is not defined"
MSG_TOOMANY_SYNONYM = "Reject too many %s synonyms"
18 19 20


def family_name_fr(full_name):
    """Extract the family name when the full name is encoded as ``J. Doe``.

    The family name is the text following the last initial, namely the
    last occurrence of a period followed by a space. Therefore
    ``J.-P. Doe`` and ``J. M. Doe`` both give ``Doe`` while ``J. Le Foo``
    gives ``Le Foo``.

    When the name contains no initial, the text following the first space
    is returned, or the whole string when there is no space at all.

    Args:
        full_name (unicode): author name encoded according to French
            typographic rules.

    Returns:
        unicode: family name

    """
    # search from the right in order to skip every initial
    # (J. M. Foo --> Foo and not M. Foo)
    pos = full_name.rfind('. ')
    if pos != -1:
        return full_name[pos + 2:]

    # no initial: find returns -1 when there is no space,
    # in that case the slice starts at 0 and the whole name is kept
    return full_name[full_name.find(' ') + 1:]


def format_author_fr(name):
    """Format the author name according to French typographic rules.

    Note:
        The name is not reformatted when it does not match the expected
        patterns. It is nevertheless title-cased, as any other result.

    Args:
        name (unicode): full name. Possible patterns are:

            * ``Family, L``
            * ``Family, P L``
            * ``Family, M -H``
            * ``Family Name, J``
            * ``Family-Name, J``
            * ``Family, F Name``
            * ``Family, First``

    Returns:
        unicode: the author name encoded as ``J.-P. Doe``.
        An empty string or ``None`` is returned as it is.

    """
    # protection
    if not name:
        return name

    # The pattern is defined in invenio_tools (not visible from here).
    # From the way the groups are used below:
    #   group 1 ends the result (family name),
    #   group 2 gives the first initial,
    #   groups 3 and 4, when both are set, give the character glued after
    #   the first period and the second initial.
    # NOTE(review): confirm against the definition of REG_AUTHOR.
    match = REG_AUTHOR.match(name)

    # reformat the name as L. Family
    # or keep it as it is
    if not match:
        result = name

    else:
        family = match.group(1)
        first = match.group(2)

        if match.group(3) and match.group(4):
            result = '%s.%s%s. %s' % (first[0], match.group(3)[0],
                                      match.group(4)[0], family)

        elif "-" in first:
            # hyphenated first name (Jean-Pierre --> J.-P.)
            # Cope with any number of parts and skip the empty ones
            # left by a leading, trailing or doubled hyphen.
            initials = [el[0] for el in first.split("-") if el]

            if initials:
                result = "%s. %s" % (".-".join(initials), family)
            else:
                # nothing but hyphens, the formatting failed
                result = name

        else:
            result = '%s. %s' % (first[0], family)

    # avoid author name in upper case (R. LE FOO --> R. Le Foo)
    return result.title()


def learn_my_authors(db,
                     authors=None,
                     id_project=None,
                     id_team=None,
                     year=None):
    """Train the rescue list of the authors of my institute.

    Authors which are not in the rescue list, are added.
    The rescue list is defined by the project, the team identifier and
    by the year.

    When there is no rescue list for the project, the team and the year,
    a new one is created with ``authors`` as it is.

    Warning:
        all keyword arguments have to be defined.

    Args:
        db (gluon.dal.DAL): database connection.
        authors (unicode): author names separated by a comma followed by
            a space, e.g. ``J. Doe, A. Foo``.
        id_project (int): the identifier of the project in the database.
        id_team (int): the identifier of the team in the database.
        year (int): the year

    """
    # get the list of authors store in the database
    row = db.my_authors(id_projects=id_project,
                        id_teams=id_team,
                        year=year)

    # no entry in the database
    if not row:
        db.my_authors[0] = dict(authors=authors,
                                id_projects=id_project,
                                id_teams=id_team,
                                year=year)
        return

    # nothing to learn
    if not authors:
        return

    database_authors = row.authors.split(', ')

    # compare with the input list
    # and extract authors which are not in the db
    diff = set(authors.split(', ')).difference(database_authors)
    if not diff:
        return

    elems = []
    for elem in diff:

        # NOTE1: be careful with the string encoding.
        # Only unicode strings have to be encoded, the others are
        # processed as they are.
        if isinstance(elem, unicode):
            elem = elem.encode('utf8')

        # NOTE2: handle the case J. Foo and J. M. Foo are the same person
        # by looking for the family name in the stored list.
        # Keep the whole name when it contains no initial.
        pos = elem.rfind('. ')
        family_name = elem[pos + 2:] if pos != -1 else elem

        # substring search in the stored string,
        # an empty family name is therefore never added
        if family_name not in row.authors:
            elems.append(elem)

    # update the database
    database_authors.extend(elems)
    database_authors.sort(key=family_name_fr)
    db.my_authors[row.id] = dict(authors=', '.join(database_authors))


154 155 156 157 158 159
def search_synonym(table, fieldname, value, create=False):
    """Get the database identifier for the record having the database field
    or the synonyms field matching the value.

    The field ``fieldname`` is tried first via ``get_id``. The *synonyms*
    field is only inspected when that search returns ``None``.

    Note:
        The database table must have a field name *synonyms*.
        It contains a list of strings.

    Args:
        table (gluon.DAL.Table): database table.
        fieldname (unicode): field of the database table
            identified by its name.
        value (unicode): value to be matched.
        create(bool): create a new entry in the database table when
            it is ``True``

    Returns:
        int:
            * the id of the database record, the one returned by the
              insert when the record has been created.
            * UNDEF_ID if value is not defined.

    Raises:
        ToolException:
            * when nothing matches and ``create`` is ``False``.
            * when more than one synonym is found.

    """
    if not value:
        return UNDEF_ID

    db = table._db

    # search criteria, also used as the content of a new record
    kwargs = {fieldname: value}

    id_rec = get_id(table, **kwargs)
    if id_rec is not None:
        return id_rec

    # nothing found, have a look to the synonyms field
    setrows = db(table.synonyms.contains(value))
    ncount = setrows.count()

    # no synonym found, create the entry or reject
    if ncount == 0:
        if create:
            return table.insert(**kwargs)

        raise ToolException(MSG_NO_ENTRY % table._tablename)

    # one synonym found
    if ncount == 1:
        return setrows.select(table.id).first().id

    # more than one synonyms - don't know what to choose
    raise ToolException(MSG_TOOMANY_SYNONYM % table._tablename)