wizards.py 15.7 KB
Newer Older
1 2 3
""" Wizards Controllers

"""
4
import json
5
import re
6

LE GAC Renaud's avatar
LE GAC Renaud committed
7
from check_tools import check_publication
8
from gluon.storage import Storage
9
from harvest_tools import DRY_RUN
10
from invenio_tools import CdsException, load_record
11
from plugin_dbui import (CALLBACK_ERRORS,
LE GAC Renaud's avatar
LE GAC Renaud committed
12
                         get_foreign_field,
13
                         get_id,
LE GAC Renaud's avatar
LE GAC Renaud committed
14 15 16
                         inline_alert,
                         is_foreign_field,
                         JSONEncoder,
LE GAC Renaud's avatar
LE GAC Renaud committed
17
                         Selector,
18
                         to_fields)
19

LE GAC Renaud's avatar
LE GAC Renaud committed
20
# translated label of the dry-run mode, compared to the mode of a selector
MODE_DRY_RUN = T(DRY_RUN)

# messages reported to the user by the wizards
MSG_EXISTING_KEY = "Keys already exist!"
MSG_NO_AFFILIATION = "Affiliation is not defined for the selected author."
MSG_NO_AUTHOR = "Author not found!"
MSG_NO_AUTHORS = (
    "<br><br>Removing affiliation failed.<br>"
    "Use INSPIRES instead with the tool 'insert RECJSON'"
)
MSG_NO_INSTITUTE = "Institute not found in the inspirehep database!"
MSG_NO_KEYS = "Affiliation keys are not defined!"
MSG_NO_PUBLICATION = "Publication not found!"
MSG_NO_SERVER = "Server is not reachable or respond badly!"
MSG_TO0_MANY_AFFILIATION = "More than one affiliation for the selected author!"
MSG_TO0_MANY_AUTHOR = "More than one author found!"
33

34

35
def affiliation_institute():
    """Register the affiliation keys of an institute.

    The institute record is loaded from the inspirehep.net store using
    ``request.vars.institute_id``. Each value of its ``corporate_note``
    field, plus the ``name`` of its ``corporate_name`` field when present,
    is a candidate key. Candidates which are not yet in the
    ``affiliation_keys`` table are inserted.

    Raises:
        HTTP: 500 when the store cannot be interrogated, when the institute
            is not found, when the record has no ``corporate_note`` field
            or when none of the candidate keys is new.

    """
    try:
        record = load_record("inspirehep.net", request.vars.institute_id)

    except CdsException:
        raise HTTP(500, T(MSG_NO_SERVER))

    if record is None:
        raise HTTP(500, T(MSG_NO_INSTITUTE))

    if "corporate_note" not in record:
        raise HTTP(500, T(MSG_NO_KEYS))

    # the keys defining the affiliation are the subfields identifier and
    # futur_identifier; they are not part of the standard JSON record but
    # are added by the factory
    note = record["corporate_note"]
    candidates = [note[subfield] for subfield in note]

    # sometimes the name of the institute is used (ATLAS Note, ...)
    corporate = record["corporate_name"]
    if isinstance(corporate, list):
        corporate = corporate[0]

    if "name" in corporate:
        candidates.append(corporate["name"])

    # load only the rules which do not exist yet
    table = db.affiliation_keys
    nb_added = 0

    for candidate in candidates:
        rule = dict(key_u=candidate, key_v="")

        if get_id(table, **rule) is None:
            # assigning to index 0 is the DAL shortcut to insert a record
            table[0] = rule
            nb_added += 1

    if nb_added == 0:
        raise HTTP(500, T(MSG_EXISTING_KEY))


def affiliation_publication():
    """Register the affiliation of an author as found in a publication.

    The request variables are:

        * family_name (str)
        * first_name (str)
        * publication_id
        * publication_store

    The publication is loaded from the store and the author is searched,
    ignoring the case, in the ``authors`` entry of the record. This entry
    is handled through the DataFrame API (``.str``, boolean mask,
    ``.iloc``). The affiliation of the author is inserted in the
    ``affiliation_keys`` table when it is not already there.

    Raises:
        HTTP: 500 when the store cannot be interrogated, when the
            publication is not found, when zero or several authors match,
            when the author has zero or several affiliations or when the
            key already exists.

    """
    # shortcuts
    # NOTE: a missing name is treated as an empty string, ending in
    # "Author not found!" rather than in an AttributeError on None
    rvars = request.vars
    family_name = (rvars.family_name or "").strip().lower()
    first_name = (rvars.first_name or "").strip().lower()

    # find the publication
    try:
        record = load_record(rvars.publication_store, rvars.publication_id)

    except CdsException:
        raise HTTP(500, T(MSG_NO_SERVER))

    if record is None:
        raise HTTP(500, T(MSG_NO_PUBLICATION))

    # find the author, the comparison is case insensitive
    authors = record["authors"]
    is_selected = (
        (authors.last_name.str.lower() == family_name)
        & (authors.first_name.str.lower() == first_name)
    )
    authors = authors[is_selected]

    nb_authors = len(authors)
    if nb_authors == 0:
        raise HTTP(500, T(MSG_NO_AUTHOR))

    if nb_authors > 1:
        raise HTTP(500, T(MSG_TO0_MANY_AUTHOR))

    # reject author with several affiliations (separated by "|")
    # the message is translated like all the others
    affiliation = authors.affiliation.iloc[0]

    if "|" in affiliation:
        raise HTTP(500, T(MSG_TO0_MANY_AFFILIATION))

    if len(affiliation) == 0:
        raise HTTP(500, T(MSG_NO_AFFILIATION))

    # load the new rule, provided that it does not exist
    key = dict(key_u=affiliation, key_v="")
    if get_id(db.affiliation_keys, **key) is not None:
        raise HTTP(500, T(MSG_EXISTING_KEY))

    # assigning to index 0 is the DAL shortcut to insert a record
    db.affiliation_keys[0] = key


141
def check_validate():
    """Check and validate publication records.

    The publications satisfying the criteria of the ``check_selector`` are
    scanned. Those already having the status ``OK`` are only counted. The
    others are verified with ``check_publication``. A publication without
    any issue is validated: its status is set to ``OK`` unless the selector
    mode is the dry run. A publication with issues is reported in the logs.

    Returns:
        dict:
            * counters (dict): per project, a Storage with the keys
              ``found``, ``ok`` and ``validated``
            * logs (list): one Storage per publication with issues with the
              keys ``category``, ``id``, ``ids``, ``project``, ``title``,
              ``txt`` and ``year``
            * publications_fields (str): JSON string built from
              ``to_fields(db.publications)``
            * selector (Selector)

    """
    counters = {}
    logs = []
    id_ok = db(db.status.code == "OK").select().first().id

    # get user requirement
    # NOTE: the trailing comma matters, ("mode") is a plain string and
    # not a tuple containing one field name
    selector = Selector(virtdb.check_selector, exclude_fields=("mode",))
    update_status = selector.mode != MODE_DRY_RUN

    # extract the publication satisfying selector criteria
    rows = selector.select(db.publications,
                           orderby=(db.projects.project, db.categories.code))

    # analyse the publications
    for row in rows:
        project = row.projects.project

        # initialise counters
        if project not in counters:
            counters[project] = Storage(found=0, ok=0, validated=0)

        counter = counters[project]
        counter.found += 1

        # skip publication already validated
        if row.status.code == "OK":
            counter.ok += 1
            continue

        # check the record
        txt, ids = check_publication(row)

        # log the publication with issues
        if txt:
            publication = row.publications
            logs.append(Storage(category=row.categories.code,
                                id=publication.id,
                                ids=ids,
                                project=project,
                                title=publication.title,
                                txt=txt,
                                year=publication.year))
            continue

        # update publication status
        counter.validated += 1
        if update_status:
            row.publications.update_record(id_status=id_ok)

    return dict(counters=counters,
                logs=logs,
                publications_fields=json.dumps(
                    to_fields(db.publications), cls=JSONEncoder),
                selector=selector)
201 202


203 204 205
def _foreign_value(table, rec_id, fieldname):
    """Return the value of fieldname for the record rec_id of table.

    The raw rec_id is returned when the reference is null or when the
    referenced record does not exist.

    """
    if rec_id is None:
        return rec_id

    row = table[rec_id]
    return rec_id if row is None else row[fieldname]


def compare_publications():
    """Compare the publication fields for two ids and show only the difference.
    The arguments of the URL are id1 and id2.

    Returns:
        * an inline alert when id1 or id2 is missing in the URL or when one
          of them does not correspond to a publication
        * otherwise, a dict delegating the rendering to the view with the
          keys ``data`` and ``title``. The former is a list of
          ``[label, value1, value2]``, sorted by field name and preceded by
          the ``["id", id1, id2]`` entry when the ids differ. The latter is
          the title of the first publication.

    """
    rvars = request.vars

    if "id1" not in rvars or "id2" not in rvars:
        return inline_alert(T("Error"), T("Specify id1 and id2 in the URL"))

    publications = db.publications
    row1 = publications[rvars.id1]
    row2 = publications[rvars.id2]

    # protection against ids which are not in the database
    if row1 is None or row2 is None:
        return inline_alert(T("Error"), T(MSG_NO_PUBLICATION))

    # find the fields of row2 which are different from those of row1
    # the difference is a set containing (key, value) tuples
    # sort it by field name since the order of a set is arbitrary
    differences = set(row2.items()).difference(row1.items())
    differences = sorted(differences, key=lambda item: item[0])

    data, idrow = [], []

    for fieldname, value2 in differences:

        # the row also contains methods
        if fieldname in ("delete_record", "update_record"):
            continue

        value1 = row1[fieldname]

        # the first data should contain the ids
        if fieldname == "id":
            idrow = ["id", value1, value2]
            continue

        field = publications[fieldname]

        # convert foreign fields
        if is_foreign_field(field):
            k_tablename, k_fieldname, _ = get_foreign_field(field)
            k_table = db[k_tablename]

            value1 = _foreign_value(k_table, value1, k_fieldname)
            value2 = _foreign_value(k_table, value2, k_fieldname)

        data.append([T(field.label), value1, value2])

    # add the ids as the first data
    # the protection covers the case in which id1=id2
    if idrow:
        data.insert(0, idrow)

    # delegate the rendering to the view
    return dict(data=data, title=row1.title)


255 256 257 258
def extract_authors():
    """Extract a list of authors in a string containing
    author name and their affiliation. It also extract authors for a
    given affiliation.

    The string and the affiliation are the fields ``authors`` and
    ``affiliation`` of the ``authors_selector``. Only the case in which the
    affiliation is based on number is handled::

        M.-H. Doe 13,34,m,56 , P. Schmidt 56 ,...
        J. DOe,56 P. Schmidt,67 ...

    NOTE: the case in which the affiliation is based on letter
    (J. Doeb,ae , P. Schmidtb,1 , H. Fooae,e, ...) is not implemented.

    Returns:
        * dict with the keys ``all`` and ``my_authors``. Each is a string
          in which the author names are separated by ``", "``. The latter
          contains the authors whose affiliation matches the selected one.
        * str: the message MSG_NO_AUTHORS when the decoding failed.

    """
    selector = Selector(virtdb.authors_selector)

    # remove stupid character in the authors string
    # NOTE: a missing string is treated as an empty one
    authors = re.sub(r"[\n\t\r\v\f]", "", selector.authors or "")

    # name of the author followed by its affiliation
    # raw string since the pattern contains backslashes
    rex = re.compile(r"([^\d]+)[ ,]?([\d,a-z\*]+)[ ,]")

    all_authors = []
    my_authors = []

    for match in rex.finditer(authors):
        author = match.group(1)

        # when the matching work, author contains 0 or 1 comma
        # use this property to detect when matching failed
        if author.count(",") > 1:
            return MSG_NO_AUTHORS

        author = author.strip().replace(",", "")
        all_authors.append(author)

        if match.group(2) == selector.affiliation:
            my_authors.append(author)

    return dict(all=", ".join(all_authors), my_authors=", ".join(my_authors))
312 313 314


def harvester():
    """Process the data send by the wizard harvester and fill the
    harvesters table. The data block contains the keys:

        * automaton (str)
        * collaboration (str)
        * collection (str)
        * people (list)
        * producer (str) either "collaboration" or "people"
        * project (int)
        * store (str) either cds.cern.ch or inspirehep.net

    The logic to interpret the data depends on the store.

    Raises:
        HTTP: 500 when the store is invalid, when the project or the
            category is unknown or when the insertion is rejected either
            by the database or by a callback of the table.

    """
    # publication category associated to each automaton
    category_codes = {"articles": "ACL",
                      "notes": "AP",
                      "preprints": "PRE",
                      "proceedings": "ACTI",
                      "reports": "AP",
                      "talks": "COM",
                      "theses": "PHD"}

    # shortcuts
    rvars = request.vars
    automaton = rvars.automaton
    organisation = db.organisation
    project = rvars.project
    store = rvars.store

    # protection
    if store not in ("cds.cern.ch", "inspirehep.net"):
        raise HTTP(500, "Invalid store !")

    values = Storage(controller=automaton,
                     host=store,
                     id_projects=project)

    # the team
    rec_id = get_id(organisation, id_projects=project)
    if rec_id is None:
        raise HTTP(500, "Project is unknown !")

    values.id_teams = organisation[rec_id].id_teams

    # the collection
    if store == "cds.cern.ch":
        values.collections = rvars.collection

    elif store == "inspirehep.net":
        collection = []

        collaboration = rvars.collaboration
        if collaboration:
            collection.append("cn %s" % collaboration)

        else:
            # a request variable with a single value is a string, not a list
            # protect against the iteration over its characters
            people = rvars.people
            if isinstance(people, str):
                people = [people]

            authors = ["a %s" % elt for elt in people]
            collection.append(" or ".join(authors))

        if automaton == "articles":
            collection.append("tc p and not tc c")

        elif automaton == "proceedings":
            collection.append("tc c")

        values.collections = "find %s" % " and ".join(collection)

    # the publication category
    # an unknown automaton ends in an unknown category
    id_category = None
    code = category_codes.get(automaton)
    if code is not None:
        id_category = get_id(db.categories, code=code)

    if id_category is None:
        raise HTTP(500, "Category is unknown !")

    values.id_categories = id_category

    # insert new values in the database
    msg = None
    try:
        if db.harvesters.insert(**values):
            return

        # operation can be reject by callback table._before_insert
        msg = "Fail to insert the new harvester in the database."
        if CALLBACK_ERRORS in db.harvesters:
            msg = db.harvesters._callback_errors

        # reduce the error message
        if isinstance(msg, list):
            msg = "%s %s" % (msg[0], msg[-1])

    # operation is rejected by the database
    except Exception as dbe:
        raise HTTP(500, str(dbe))

    # operation is rejected by the callback
    # NOTE raised outside the try block to avoid recursive exception generation
    if msg is not None:
        raise HTTP(500, msg)
425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453


def update_citations():
    """Update the citations table.

    * select article in the inspirehep store according to user criteria
    * get the number of citations per document
    * update the citations table

    An article is skipped, and the corresponding counter incremented, when
    its inspirehep URL cannot be extracted from its origin, when the HTTP
    request fails, when the response cannot be decoded or when it is not a
    list with a single element. Nothing is written when the number of
    citations is equal to the latest one stored for the article.

    Returns:
        dict:
            * counters (Storage): with the keys ``article``,
              ``http_error``, ``insert``, ``json_error``, ``list_size``,
              ``not_list`` and ``url_error``
            * team_project: the value returned by ``repr_team_project``

    """
    import datetime
    import requests

    from invenio_tools import InvenioStore
    from json.decoder import JSONDecodeError
    from reporting_tools import repr_team_project

    citations = db.citations
    id_acl = get_id(db.categories, code="ACL")
    kwargs = dict(of="recjson", ot="number_of_citations")
    publications = db.publications
    rex_ins = re.compile(r"(https?://inspirehep.net/record/\d+)")
    store = InvenioStore("inspirehep.net")
    today = datetime.date.today()

    counters = Storage(article=0,
                       http_error=0,
                       insert=0,
                       json_error=0,
                       list_size=0,
                       not_list=0,
                       url_error=0)

    # get user requirement
    selector = Selector(virtdb.citation_selector)

    # get the list of article store in the inspirehep store
    selector.append_query(publications.id_categories == id_acl)
    selector.append_query(publications.origin.contains("inspirehep"))

    query = selector.query(publications)

    # get the number of citation and update the database table
    for row in db(query).iterselect(publications.id, publications.origin):

        counters.article += 1
        logger.debug(row.origin)

        # interrogate inspirehep.net
        try:
            # AttributeError is raised when the search returns None
            url = rex_ins.search(row.origin).group(1)
            rep = store.interrogate(url, timeout=60, **kwargs)
            lst = rep.json()

            if not isinstance(lst, list):
                logger.warning("JSON response is not a list")
                counters.not_list += 1
                continue

            if len(lst) != 1:
                logger.warning("size of the return list is not one")
                counters.list_size += 1
                continue

            count = lst[0].get("number_of_citations")

        except AttributeError:
            logger.warning(f"inspirehep URL not well formed {row.origin}")
            counters.url_error += 1
            continue

        except JSONDecodeError:
            logger.warning("JSON decoding error")
            counters.json_error += 1
            continue

        except requests.exceptions.RequestException:
            logger.warning(f"HTTP error interrogating {url}")
            counters.http_error += 1
            continue

        # check if the number of count changes
        myset = db(citations.id_publications == row.id)
        if not myset.isempty():
            entries = myset.select(orderby=citations.date)
            last_count = entries.last().count
            logger.debug(f"last count {last_count} new one {count}")
            if last_count == count:
                continue

        # update the citations table
        # one entry per day and per publication
        logger.info(f"update {url} citations to {count}")
        counters.insert += 1
        idpubli = row.id

        citations.update_or_insert(
            (citations.date == today) & (citations.id_publications == idpubli),
            date=today,
            id_publications=idpubli,
            count=count)

        db.commit()

    # inform the user
    logger.info(f"        number of article: {counters.article}")
    logger.info(f"       bad inspirehep URL: {counters.url_error}")
    logger.info(f"    HTTP connection error: {counters.http_error}")
    logger.info(f"      JSON decoding error: {counters.json_error}")
    logger.info(f"   response is not a list: {counters.not_list}")
    logger.info(f"     list size is not one: {counters.list_size}")
    logger.info(f"   insert or update in db: {counters.insert}")

    return dict(counters=counters,
                team_project=repr_team_project(db, selector))