""" harvest_tools.base

"""
import pandas as pd
import re

# Labels and messages shared by the harvester tools.
# NOTE(review): the consumers live outside this module -- presumably these
# are used as log messages; confirm against the callers.
DRY_RUN = "dry run"

MSG_CRASH = "Crash: %s"  # printf-style placeholder
MSG_FIX_ORIGIN = "Fixed the origin field"
MSG_IN_DB = "Already in the database"
MSG_IS = "Reject publication is a {}"  # str.format placeholder
MSG_LOAD = "Load in the database"

# Literal text "oai:cds"; order_oais applies it with .match(), hence
# anchored at the beginning of the string.
REX_OAI_CDS = re.compile(r"oai:cds")

# Strings made of four and six spaces.
T4 = " "*4
T6 = " "*6

def family_name_fr(full_name):
    """Extract the family name when the full name is encoded as ``J. Doe``.

    The family name is the text following the last ``". "`` separator,
    therefore ``J. M. Doe`` also gives ``Doe``. A name which does not
    contain the separator is returned unchanged.

    Args:
        full_name (str):
            author name encoded according to French typographic rules.

    Returns:
        str:
            family name

    """
    # rpartition puts the whole string in the last slot when the separator
    # is missing, so a bare family name is not truncated.
    return full_name.rpartition('. ')[2]


def filter_logs(logs):
    """Filter on OAI to remove duplicated entries.

    Note:
        * Entries can be duplicated when user harvest several stores.
        * Prefer entries from inspirehep
        * The surviving entries are returned in the sorted order used to
          detect the duplicates, not in their input order.

    Args:
        logs (list):
            list of message (Msg). Each one is read as ``msg["oais"]``.

    Returns:
        list:
            the messages which are kept. Empty when *logs* is empty.

    """
    # a DataFrame built from an empty list has no "oais" column
    if not logs:
        return []

    df = pd.DataFrame([{"oais": msg["oais"]} for msg in logs])

    # tag primary OAI as cds or ins
    df["first_oai"] = df["oais"].str.extract(r"oai:(\w{3})", expand=False)

    # update origin to order oai as cds, ins
    df["oais"] = df["oais"].apply(order_oais)

    # filter preserving inspirehep:
    # "cds" sorts before "ins", and keep="last" flags every duplicate
    # but the one sorted last
    is_duplicate = (df
                    .sort_values(["first_oai", "oais"])
                    ["oais"]
                    .duplicated(keep="last"))

    # the index of the sorted frame is the position in logs
    return [logs[idx] for idx, dup in is_duplicate.items() if not dup]


def get_rex_institute(db, app):
    r"""Get the regular expression defining the affiliation of my institute.

    It is obtained by concatenating the affiliation keys.
    Affiliation key can contain characters like ``(``, ``)`` or ``&``.
    They are replaced by ``\(`` *etc*.

    The expression is built on the first call and stored in
    ``app.reg_institute``. Later calls return the stored value as long as
    it is not empty.

    Args:
        db (pydal.DAL):
            database connection

        app (gluon.storage.Storage):
            namespace defining the application

    Returns:
        str:
            the alternatives, one per affiliation key, joined by ``|``.

    """
    # already computed
    reg_institute = app.reg_institute
    if reg_institute:
        return reg_institute

    # protect special character
    # (a backslash is put in front of each of them in a single pass)
    protect = str.maketrans({char: "\\" + char for char in "()&$+?"})

    # regular expression for the affiliation keys
    # add start and end of string for an exact match
    lst = [r"(?:^|\|){}(?:$|\|)".format(row.key_u.translate(protect))
           for row in db(db.affiliation_keys.id > 0).iterselect()]

    # NOTE: without any affiliation key the result is an empty string,
    # which is falsy, so it is computed again on the next call.
    app.reg_institute = reg_institute = r"|".join(lst)

    return reg_institute


def learn_my_authors(db,
                     authors=None,
                     id_project=None,
                     id_team=None,
                     year=None):
    """Train the rescue list of the authors of my institute.

    Authors which are not in the rescue list, are added.
    The rescue list is defined by the project, the team identifier and
    by the year.

    An author is added only when his family name is not yet in the rescue
    list, in order to handle the case where ``J. Foo`` and ``J. M. Foo``
    are the same person.

    Warning:
        all keyword arguments have to be defined.

    Args:
        db (gluon.dal.DAL):
            database connection.

        authors (str):
            authors names, encoded as ``J. Doe`` and separated by ``", "``.

        id_project (int):
            the identifier of the project in the database.

        id_team (int):
            the identifier of the team in the database.

        year (int):
            the year

    """
    # get the list of authors store in the database
    row = db.my_authors(id_projects=id_project,
                        id_teams=id_team,
                        year=year)

    # no entry in the database
    if row is None:
        db.my_authors[None] = dict(authors=authors,
                                   id_projects=id_project,
                                   id_teams=id_team,
                                   year=year)
        return

    # nothing to learn
    if not authors:
        return

    # empty items are dropped, they appear when the stored field is empty
    stored = row.authors or ""
    database_authors = [el for el in stored.split(', ') if el]

    # compare with the input list
    # and extract authors which are not in the db
    candidates = {el for el in authors.split(', ') if el}
    candidates.difference_update(database_authors)

    if not candidates:
        return

    # compare family names one to one: a substring search in the stored
    # string would reject "Li" because of "Oliver"
    known_families = {family_name_fr(el) for el in database_authors}

    # sorted() makes the insertion order reproducible
    new_authors = [el for el in sorted(candidates)
                   if family_name_fr(el) not in known_families]

    # update the database
    database_authors.extend(new_authors)
    database_authors.sort(key=family_name_fr)
    db.my_authors[row.id] = dict(authors=', '.join(database_authors))


def order_oais(oais):
    """Order OAIS string as cds, inspirehep

    The string is returned unchanged unless it contains exactly one comma
    and does not begin with ``oai:cds``. In that case the two identifiers
    are stripped, swapped and joined by ``", "``.

    Args:
        oais (str):
            record identifier in stores. ``None`` is accepted.

    Returns:
        str:
            the ordered identifiers, or an empty string when *oais*
            is ``None``.

    """
    if oais is None:
        return ""

    # a single identifier, more than two, or cds already first
    if oais.count(",") != 1 or REX_OAI_CDS.match(oais):
        return oais

    u, v = (el.strip() for el in oais.split(","))
    return f"{v}, {u}"