base.py 4.51 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
# -*- coding: utf-8 -*-
""" harvest_tools.base

"""
import re


from gluon import current


# Message template for a crash; the C{%s} placeholder receives the details.
MSG_CRASH = "Crash: %s"

# The messages below go through the translator C{current.T}.
# NOTE(review): DRY_RUN is the only message built without C{lazy=False};
# confirm that the difference is intended.
DRY_RUN = current.T("dry run")
MSG_FIX_ORIGIN = current.T("Fixed the origin field", lazy=False)
MSG_IN_DB = current.T("Already in the database", lazy=False)
MSG_LOAD = current.T("Load in the database", lazy=False)


def family_name_fr(full_name):
    """Return the family name of an author encoded as C{J. Doe}.

    Everything located after the first space is returned. A name
    which does not contain any space is returned as it is.

    @type full_name: unicode
    @param full_name: the author name, I{e.g.} C{J. Doe}

    @rtype: unicode

    """
    # a single split on the first space leaves the remainder at the end
    pieces = full_name.split(' ', 1)
    return pieces[-1]


def fix_amu(record):
    """Normalise the name of the C{Aix Marseille University}.

    Each university of the record matching the regular expression
    C{current.app.reg_institute} is replaced by the name of the
    institute selected from the first four digits group found in the
    defense date (before 2012 or not).
    The list returned by C{record.these_universities()} is modified in place.

    @type record: L{Record}
    @param record: it has to provide the methods C{these_universities}
    and C{these_defense}.

    @rtype: unicode
    @return: the university names separated by comma.

    """
    names = record.these_universities()

    for pos, name in enumerate(names):

        # leave the other universities untouched
        if not re.search(current.app.reg_institute, name):
            continue

        # the defense year is only looked up for a matching university
        defense_year = int(
            re.search(r"(\d\d\d\d)", record.these_defense()).group(1))

        names[pos] = (
            u"Université de la Méditerrannée Aix-Marseille II"
            if defense_year < 2012
            else u"Aix Marseille Université")

    return ', '.join(names)


def format_author_fr(name):
    """Format the author name according to French typographic rules,
    I{i.e.} C{J.-P. Doe}.

    The input is expected as C{Family, First} where the first name part
    is made of names or initials separated by a space or a hyphen.
    The name is kept as it is when it does not follow that pattern
    or when no initial can be extracted from it.
    In all cases the result is title-cased.

    @type name: unicode
    @param name: the author name, I{e.g.} C{Doe, J-P}

    @rtype: unicode
    @return: the formatted name, or C{name} itself when it is
    empty or C{None}.

    """
    # protection
    if name == '' or name is None:
        return name

    # name are encoded Family, L
    #                  Family, P L
    #                  Family, M -H
    #                  Family Name, J
    #                  Family-Name, J
    #                  Family, F Name
    #                  Family, First
    # To avoid to deal with unicode character
    # look for non empty string \S
    match = re.match(r'(.+), (\S+)( |\-)*(\S+)*', name)

    # reformat the name as L. Family
    # or keep it as it is
    result = name
    if match:
        family, first, separator, second = match.groups()

        if separator and second:
            # two names separated by a space or a hyphen: P L, M -H
            result = '%s.%s%s. %s' % (first[0], separator[0],
                                      second[0], family)

        elif "-" in first:
            # compound name written without space: J-P, J-P-K, ...
            # Empty pieces (leading, trailing or doubled hyphen) are
            # skipped to avoid crashing on badly encoded names.
            initials = [el[0] for el in first.split("-") if el]
            if initials:
                result = '%s. %s' % ('.-'.join(initials), family)

        else:
            result = '%s. %s' % (first[0], family)

    # avoid author name in upper case (R. LE FOO --> R. Le Foo)
    return result.title()


def learn_my_authors(db,
                     authors=None,
                     id_project=None,
                     id_team=None,
                     year=None):
    """Train the rescue list of the authors of my institute,
    stored in the database, using the list C{authors} provided in argument.

    The record of the table C{my_authors} matching C{id_project},
    C{id_team} and C{year} is created when it does not exist.
    Otherwise the authors which are not yet in the record are appended
    to it and the stored list is sorted using L{family_name_fr}.

    @note: all keyword arguments have to be defined.

    @type db: gluon.dal.DAL
    @param db:

    @type authors: str or unicode
    @param authors: authors names separated by C{", "}

    @type id_project: int
    @param id_project: project identifier

    @type id_team: int
    @param id_team: team identifier

    @type year: int
    @param year:

    """
    # get the list of authors store in the database
    row = db.my_authors(id_projects=id_project,
                        id_teams=id_team,
                        year=year)

    # no entry in the database
    if not row:
        db.my_authors[0] = dict(authors=authors,
                                id_projects=id_project,
                                id_teams=id_team,
                                year=year)
        return

    database_authors = row.authors.split(', ')

    # compare with the input list
    # and extract authors which are not in the db
    diff = set(authors.split(', ')).difference(database_authors)
    if not diff:
        return

    # update the database
    elems = []
    for elem in diff:

        # NOTE1: be careful with the string encoding.
        # Only unicode names need to be encoded, byte strings are
        # processed as they are instead of being silently dropped.
        if isinstance(elem, unicode):
            elem = elem.encode('utf8')

        # NOTE2: handle the case J. Foo and J. M. Foo are the same person.
        # The family name is what follows the last initial; a name
        # without initial is used as a whole.
        family_name = elem.rpartition('. ')[2]

        # NOTE(review): substring test against the whole stored string,
        # a short family name also matches inside a longer one.
        if family_name not in row.authors:
            elems.append(elem)

    database_authors.extend(elems)
    database_authors.sort(key=family_name_fr)
    db.my_authors[row.id] = dict(authors=', '.join(database_authors))


class ToolException(Exception):
    """Exception type defined by the C{harvest_tools.base} module.

    It does not add anything to the standard C{Exception}.

    """