harvest.py 17.7 KB
Newer Older
1 2 3
""" Harvest Controllers

"""
4
import json
5
import logging
6
import traceback
7 8

from gluon import current
9
from gluon.restricted import RestrictedError
10
from harvest_tools import (build_harvester_tool,
LE GAC Renaud's avatar
LE GAC Renaud committed
11
                           DRY_RUN,
12
                           filter_logs,
13
                           get_rex_institute,
14
                           MsgCollection)
LE GAC Renaud's avatar
LE GAC Renaud committed
15
from plugin_dbui import (inline_alert,
16
                         Selector,
17 18
                         to_formPanel,
                         UNDEF_ID)
19
from requests.exceptions import RequestException
20 21
from store_tools import (CheckException,
                         load_record,
22
                         OAI_URL,
23
                         RecordCdsConfPaper,
24
                         RecordCdsThesis,
25 26
                         RecordHepConfPaper,
                         RecordHepThesis,
27 28
                         search_synonym,
                         ToolException)
29

LE GAC Renaud's avatar
LE GAC Renaud committed
30
# Translated label of the dry-run mode. It is compared with the mode
# returned by the selectors to decide whether a harvester runs dry.
MODE_DRY_RUN = T(DRY_RUN)

# Messages shown to the user. They are translated with T() where they
# are used by the controllers of this file.
MSG_GREMLIN = "Oops a gremlin..."
MSG_LOST_CONNECTION = "Lost HTTP connection (timeout or site unavailable)"
MSG_NO_AFFILIATION = "Affiliation keys are not defined !!!"
MSG_NO_HARVESTER = "No harvesters for your selection !!!"
MSG_NO_RECORD = "Sorry, the record does not exist."
36

LE GAC Renaud's avatar
LE GAC Renaud committed
37

38
def free_run():
    """Run a free harvester.

    All harvester parameters are defined via the selector
    ``virtdb.free_harvester_selector``.

    Returns:
        The report of the harvester tool completed with the key
        ``selector`` and rendered by the view ``harvest/layout.html``.
        When something goes wrong, an inline alert, the translated
        message of the ``ToolException`` or the HTML formatted traceback
        is returned instead.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    table = virtdb.free_harvester_selector
    fields = ("collections",
              "controller",
              "host",
              "id_projects",
              "id_teams",
              "id_categories",
              "ratio")

    try:
        selector = Selector(
            table,
            exclude_fields=("mode", "year_start", "year_end"))

        # all the fields listed above have to be filled by the user
        for el in fields:
            if not selector[el]:
                msg = T("All fields of the form have to be defined !!!")
                msg += "<br>"
                msg += T("The field '%s' is missing ...") % T(table[el].label)
                return inline_alert(T("Error"), msg)

        tool = build_harvester_tool(
            db,
            selector.id_teams,
            selector.id_projects,
            selector.controller,
            selector.id_categories,
            year_start=selector.year_start,
            year_end=selector.year_end,
            dry_run=(selector.mode == MODE_DRY_RUN))

        if not tool:
            return inline_alert(T("Error"), T("Select an harvester."))

        tool.process_url(selector.host, selector.collections)

    except ToolException as e:
        return T(str(e))

    # Exception and not BaseException: SystemExit and KeyboardInterrupt
    # must not be swallowed and turned into an HTML page.
    except Exception:
        msg = "<br><br><hr/>"
        msg += CODE(traceback.format_exc()).xml()
        msg += "<hr/>"
        return msg

    response.view = "harvest/layout.html"
    report = tool.report()
    report["selector"] = selector
    return report
95 96


97
def edit_insert():
    """Edit a record and insert it in the database.

    The record is loaded from the store selected by the user and it is
    used to pre-fill the values of the publication form.

    Note:
        Recovery procedures are applied to fix basic non-conformity, but
        no checks are run. The user is editing the record to fix problems.

    Returns:
        dict: with the keys ``cfg`` (form panel configuration built from
        ``db.publications``) and ``values`` (initial values of the form
        fields). An inline alert is returned instead when the user input
        is not valid or when the record does not exist.

    Raises:
        HTTP: status 500 when an exception occurs while the selector is
        decoded or while the record is loaded.

    """
    logger.debug("-"*72)
    logger.debug("start controller edit_insert...")

    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    fields = (
        "controller",
        "host",
        "id_projects",
        "id_teams",
        "id_categories",
        "record_id")

    table = virtdb.edit_insert_selector

    # ------------------------------------------------------------------------
    #
    # Get the record
    #
    try:
        # Protection
        #
        # NOTE
        # With plugin_dbui 0.7.1 it is possible to enter decimal value
        # for the record id (e.g by typing 1503,03 in the field)
        #
        # A missing record id (None) is not handled here: it is reported
        # by the loop below as a missing field instead of raising an
        # AttributeError which would end up as an HTTP 500.
        #
        raw_record_id = request.vars.Edit_insert_selectorRecord_id
        if raw_record_id is not None and not raw_record_id.isalnum():
            msg = T("The <i>record id</i> is not well formed.")
            msg += "<br>"
            msg += T("Use only digit character, no comma, no dot...")
            return inline_alert(T("Error"), msg)

        selector = Selector(table)

        for el in fields:
            if not selector[el]:
                msg = T("All fields of the form have to be defined !!!")
                msg += "<br>"
                msg += T("The field '%s' is missing ...") % T(table[el].label)
                return inline_alert(T("Error"), msg)

        # record
        host = selector.host
        shelf = ("literature" if host == "inspirehep.net" else None)
        logger.debug(f"load the record {host} {selector.record_id}")
        record = load_record(host, selector.record_id, shelf=shelf)

        if record is None:
            return inline_alert(T(MSG_GREMLIN), T(MSG_NO_RECORD))

    except Exception as e:
        logger.error(str(e))

        # log the exception in the web2py ticker system
        ticket = RestrictedError(layer="harvester.py",
                                 code="edit_insert",
                                 output="",
                                 environment=current.globalenv)
        ticket.log(request)

        # inform the user that something went wrong in the server
        raise HTTP(500, T(str(e)))

    # ------------------------------------------------------------------------
    #
    # Prepare the form
    #
    cfg = to_formPanel(db.publications)

    values = {
        "PublicationsTitle": record.title(),
        "PublicationsPreprint": record.preprint_number(),
        "PublicationsPublication_url": record.paper_url(),
        "PublicationsReport_numbers": record.report_number()}

    # ------------------------------------------------------------------------
    #
    # fix basic non-conformity (general)
    #  - is record with authors
    #  - is record with authors form my institute
    #  - standardise name of collaboration
    #  - format authors according to my format
    #  - extract authors form my institute signing the publication
    #  - check and fix submitted date
    #
    # authors
    try:
        record.check_and_fix(fmt_author="F. Last",
                             rex_institute=get_rex_institute(db, current.app),
                             sep_author=", ",
                             sort_author=True)

    except CheckException as e:
        # best effort: the user fixes the remaining problems in the form
        logger.debug(str(e))

    # the first author can be a list, flatten it for the form
    fauthor = record.first_author()
    if isinstance(fauthor, list):
        fauthor = ", ".join(fauthor)

    values["PublicationsFirst_author"] = fauthor
    values["PublicationsAuthors"] = record.authors()
    values["PublicationsAuthors_institute"] = record.my_authors

    # repeat collaboration check
    rec_id = UNDEF_ID
    try:
        rec_id = search_synonym(db.collaborations,
                                "collaboration",
                                record.collaboration())

    except ToolException as e:
        logger.debug(str(e))

    values["PublicationsId_collaborations"] = int(rec_id)

    # teams, project, categories
    values["PublicationsId_categories"] = int(selector.id_categories)
    values["PublicationsId_projects"] = int(selector.id_projects)
    values["PublicationsId_teams"] = int(selector.id_teams)

    # origin
    # Note:
    #  - It is always defined
    #  - Use a trivial algorithm to recover it
    oai_url = record.oai_url()
    if not oai_url:
        oai_url = OAI_URL % (selector.host, selector.record_id)

    values["PublicationsOrigin"] = oai_url

    # ------------------------------------------------------------------------
    #
    # fix basic non-conformity (article)
    #
    if selector.controller in ("articles", "proceedings"):

        # The default is set before the try block. Otherwise a failure of
        # format_editor would leave the collaboration id found above as
        # the publisher id.
        rec_id = UNDEF_ID
        try:
            record.format_editor()
            rec_id = search_synonym(db.publishers,
                                    "abbreviation",
                                    record.paper_editor())

        except ToolException as e:
            logger.debug(str(e))

        values["PublicationsId_publishers"] = int(rec_id)
        values["PublicationsVolume"] = record.paper_volume()
        values["PublicationsPages"] = record.paper_pages()

    # ------------------------------------------------------------------------
    #
    # fix basic non-conformity (conference)
    #
    if selector.controller in ("proceedings", "talks"):

        if isinstance(record, (RecordCdsConfPaper, RecordHepConfPaper)):

            try:
                record.check_conference_date()

            except CheckException as e:
                logger.debug(str(e))

            values["PublicationsConference_title"] = \
                record.conference_title()

            values["PublicationsConference_url"] = \
                record.conference_url()

            values["PublicationsConference_dates"] = \
                record.conference_dates()

            values["PublicationsConference_town"] = \
                record.conference_town()

            rec_id = UNDEF_ID
            try:
                rec_id = search_synonym(db.countries,
                                        "country",
                                        record.conference_country())

            except ToolException as e:
                logger.debug(str(e))

            values["PublicationsId_countries"] = int(rec_id)

            # reuse the flattened first author, the raw value can be a list
            values["PublicationsConference_speaker"] = fauthor

    # ------------------------------------------------------------------------
    #
    # fix basic non-conformity (thesis)
    #
    if selector.controller == "theses":

        if isinstance(record, (RecordCdsThesis, RecordHepThesis)):

            record.format_universities()

            values["PublicationsUniversities"] = \
                record.these_universities()

            values["PublicationsDirectors"] = record.these_directors()
            values["PublicationsDefense"] = record.these_defense()

    # ------------------------------------------------------------------------
    #
    # submitted date and year
    #
    values["PublicationsSubmitted"] = record.submitted()

    if record.is_published():
        year = record.paper_year()
    else:
        # the year is given by the first four characters of the date
        year = record.submitted()[0:4]

    values["PublicationsYear"] = year

    logger.debug("-"*72)
    return dict(cfg=cfg, values=values)


333 334
def insert_recjson():
    """Insert a recjson record in the database.

    The JSON string, the host and the harvester parameters are read from
    the selector ``virtdb.recjson_selector``.

    Returns:
        The report of the harvester tool completed with the key
        ``selector`` and rendered by the view ``harvest/layout.html``.
        When something goes wrong, an inline alert, the translated
        message of the ``ToolException`` or the HTML formatted traceback
        is returned instead.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    try:
        # NOTE: the trailing comma matters, ("mode") is a plain string
        selector = Selector(virtdb.recjson_selector, exclude_fields=("mode",))

        tool = build_harvester_tool(
            db,
            selector.id_teams,
            selector.id_projects,
            selector.controller,
            selector.id_categories,
            year_start=selector.year_start,
            year_end=selector.year_end,
            dry_run=(selector.mode == MODE_DRY_RUN))

        if not tool:
            return inline_alert(T("Error"), T("Select an harvester."))

        ctitle = "%s / %s " % (db.projects[selector.id_projects].project,
                               selector.controller)

        # set up the tool by hand since process_url is not called here
        tool.collection_logs = [MsgCollection(title=ctitle)]
        tool.harvester.host = selector.host
        tool.logs = []

        tool.shelf = \
            ("literature" if selector.host == "inspirehep.net" else None)

        # when the JSON string is a list, only its first element is used
        recjson = json.loads(selector.recjson)
        recjson = (recjson[0] if isinstance(recjson, list) else recjson)

        # for inspirehep.net the record is under the key "metadata"
        if selector.host == "inspirehep.net":
            recjson = recjson["metadata"]

        tool.process_recjson(recjson)

    except ToolException as e:
        return T(str(e))

    # Exception and not BaseException: SystemExit and KeyboardInterrupt
    # must not be swallowed and turned into an HTML page.
    except Exception:
        msg = "<br><br><hr/>"
        msg += CODE(traceback.format_exc()).xml()
        msg += "<hr/>"
        return msg

    response.view = "harvest/layout.html"
    report = tool.report()
    report["selector"] = selector
    return report
387 388


389
def run():
    """Run an harvester.

    Scan the cds/invenio stores to find publication during
    a given range of years and for a given team/project.
    Insert them in the database if they don't exist.

    The scanning is steered using the current request arguments as well as
    the harvest parameters associated to this action.

    Search arguments are defined via the harvester selector.

    Returns:
        dict: with the keys ``collection_logs``, ``controller``, ``logs``
        and ``selector``, rendered by the view ``harvest/layout``.
        When something goes wrong, an inline alert or an HTML message
        is returned instead.

    """
    logger.info("-"*79)
    logger.info(f"run harvester {request.vars.Harvester_selectorController}")

    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    # defined before the try block since it is used by the handler of
    # the ToolException which can be raised before any tool is built
    tool = None

    try:
        selector = Selector(
            virtdb.harvester_selector,
            exclude_fields=("mode", "year_start", "year_end"))

        # Get hosts and collections
        rows = selector.select(db.harvesters)
        if not rows:
            raise ToolException(T(MSG_NO_HARVESTER))

        collection_logs = []
        logs = []

        for row in rows:
            tool = build_harvester_tool(
                db,
                selector.id_teams,
                selector.id_projects,
                selector.controller,
                row.harvesters.id_categories,
                year_start=selector.year_start,
                year_end=selector.year_end,
                dry_run=(selector.mode == MODE_DRY_RUN))

            if not tool:
                return inline_alert(T("Error"), T("Select an harvester."))

            tool.process_url(row.harvesters.host, row.harvesters.collections)

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except RequestException:
        logger.error(MSG_LOST_CONNECTION)
        return inline_alert(T(MSG_GREMLIN), T(MSG_LOST_CONNECTION))

    except ToolException as e:
        # the last log, when it exists, identifies the faulty record
        msg = ""
        where = ""
        if tool is not None and len(tool.logs) > 0:
            log = tool.logs[-1]
            where = "Error on record %s (%s)" % (log.url, log.collection)
            msg = "<h4>%s</h4>" % where

        msg += T(str(e))

        # plain text in the log file, HTML for the user
        logger.error(f"{where} {e}".strip())
        return msg

    except Exception as e:
        logger.error(str(e))
        msg = "<hr/>"
        msg += CODE(traceback.format_exc()).xml()
        msg += "<hr/>"
        return msg

    if logger.getEffectiveLevel() <= logging.INFO:
        logger.info("")
        logger.info(f"end of run harvester {selector.controller}:")

        for el in collection_logs:
            logger.info(f"  {el.title}: {el.found}")

        logger.info("-"*79)

    # filter logs to remove duplicated entries
    logs = filter_logs(logs)

    # delegate rendering to the report view
    response.view = "harvest/layout.%s" % request.extension
    return dict(collection_logs=collection_logs,
                controller=selector.controller,
                logs=logs,
                selector=selector)
476 477


478 479
def run_all():
    """Run all harvesters in one go.

    The harvesters are read from the table ``db.harvesters``. They can be
    restricted to a team and / or a project via the selector
    ``virtdb.run_all_harvesters_selector``.

    Returns:
        dict: with the keys ``collection_logs``, ``controller``, ``logs``
        and ``selector``, rendered by the view ``harvest/layout``.
        When something goes wrong, an inline alert or an HTML message
        is returned instead.

    """
    logger.info("run all harvesters")

    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    collection_logs = []
    logs = []

    # defined before the try block since it is used by the handler of
    # the ToolException which can be raised before any tool is built
    tool = None

    try:
        selector = Selector(
            virtdb.run_all_harvesters_selector,
            exclude_fields=("mode", "year_start", "year_end"))

        # restrict the harvesters to the selected team and / or project
        query = None
        for fieldname in ("id_teams", "id_projects"):
            if selector[fieldname]:
                q = db.harvesters[fieldname] == selector[fieldname]
                query = (q if query is None else (query) & (q))

        harvesters = db(query).select(db.harvesters.ALL)
        if not len(harvesters):
            return inline_alert(T("Error"), T(MSG_NO_HARVESTER))

        for harvester in harvesters:

            id_teams = harvester.id_teams
            id_projects = harvester.id_projects
            controller = harvester.controller
            id_categories = harvester.id_categories

            logger.info("-"*79)
            logger.info(f"run harvester {controller}")

            logger.info(f"        team: {id_teams}")
            logger.info(f"     project: {id_projects}")
            logger.info(f"  controller: {controller}")
            logger.info(f"    category: {id_categories}")

            # skip harvesters which are not fully defined
            if id_teams is None or id_projects is None or id_categories is None:
                continue

            tool = build_harvester_tool(
                db,
                id_teams,
                id_projects,
                controller,
                id_categories,
                year_start=selector.year_start,
                year_end=selector.year_end,
                dry_run=(selector.mode == MODE_DRY_RUN))

            if not tool:
                return inline_alert(T("Error"), T("Select an harvester."))

            tool.process_url(harvester.host, harvester.collections)

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as e:
        # the last log, when it exists, identifies the faulty record
        msg = ""
        if tool is not None and len(tool.logs) > 0:
            log = tool.logs[-1]
            msg = f"<h4>Error on record {log.url} ({log.collection})</h4>"
        msg += T(str(e))

        logger.error(msg)
        logger.info("-"*79)
        return msg

    except Exception:
        msg = "<hr/>"
        msg += CODE(traceback.format_exc()).xml()
        msg += "<hr/>"

        logger.error(msg)
        logger.info("-"*79)
        return msg

    if logger.getEffectiveLevel() <= logging.INFO:
        logger.info("")
        logger.info("end of run all harvesters:")

        for el in collection_logs:
            logger.info(f"  {el.title}: {el.found}")

    # single closing separator
    logger.info("-"*79)

    # filter logs to remove duplicated entries
    logs = filter_logs(logs)

    # tune selector parameters used in the report title
    if query is None:
        selector.id_projects = None

    # delegate rendering to the report view
    response.view = "harvest/layout.%s" % request.extension
    return dict(collection_logs=collection_logs,
                controller="all harvesters",
                logs=logs,
                selector=selector)