""" Harvest Controllers

"""

import traceback
6 7

from gluon import current
8
from gluon.restricted import RestrictedError
9
from harvest_tools import (build_harvester_tool,
10 11
                           CheckAndFix,
                           CheckException,
LE GAC Renaud's avatar
LE GAC Renaud committed
12
                           DRY_RUN,
13 14
                           format_author_fr,
                           family_name_fr,
15
                           search_synonym,
16
                           ToolException)
17
from invenio_tools import (load_record,
18 19
                           OAI_URL,
                           RecordConf,
20 21
                           RecordThesis,
                           REG_INT)
LE GAC Renaud's avatar
LE GAC Renaud committed
22
from plugin_dbui import (inline_alert,
23
                         Selector,
24 25
                         to_formPanel,
                         UNDEF_ID)
26

LE GAC Renaud's avatar
LE GAC Renaud committed
27
# Translated label of the dry-run mode. The controllers compare the "mode"
# value of their selector to it in order to set the dry_run flag of the
# harvester tool.
MODE_DRY_RUN = T(DRY_RUN)

# User messages. They are kept untranslated here and passed through T()
# at the place where they are used.
MSG_NO_AFFILIATION = "Affiliation keys are not defined !!!"
MSG_NO_HARVESTER = "No harvesters for your selection !!!"
MSG_NO_RECORD = "Sorry, the record does not exist."
31

LE GAC Renaud's avatar
LE GAC Renaud committed
32

33
def free_run():
    """Run a free harvester.

    All harvester parameters are defined via the selector.

    Returns:
        The mapping produced by ``tool.report()`` completed with the
        ``selector`` key, rendered by the view ``harvest/layout.html``.
        When something goes wrong, one of the following is returned instead:

            - the result of ``inline_alert`` when the affiliation keys are
              not defined, a selector field is empty or no harvester tool
              can be built,
            - the translated message of a ``ToolException``,
            - the HTML formatted traceback of any other exception.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    table = virtdb.free_harvester_selector

    # selector fields which have to be filled by the user
    fields = ("collections",
              "controller",
              "host",
              "id_projects",
              "id_teams",
              "id_categories",
              "ratio")

    try:
        selector = Selector(
            table,
            exclude_fields=("mode", "year_start", "year_end"))

        for el in fields:
            if not selector[el]:
                msg = T("All fields of the form have to be defined !!!")
                msg += "<br>"
                msg += T("The field '%s' is missing ...") % T(table[el].label)
                return inline_alert(T("Error"), msg)

        tool = build_harvester_tool(
            db,
            selector.id_teams,
            selector.id_projects,
            selector.controller,
            selector.id_categories,
            year_start=selector.year_start,
            year_end=selector.year_end,
            dry_run=(selector.mode == MODE_DRY_RUN),
            debug=False)

        if not tool:
            return inline_alert(T("Error"), T("Select an harvester."))

        tool.process_url(selector.host, selector.collections)

    except ToolException as e:
        return T(str(e))

    # Exception, not BaseException: SystemExit and KeyboardInterrupt are not
    # harvesting errors and must not be turned into an HTML report.
    except Exception:
        msg = "<br><br><hr/>"
        msg += CODE(traceback.format_exc()).xml()
        msg += "<hr/>"
        return msg

    response.view = "harvest/layout.html"
    report = tool.report()
    report["selector"] = selector
    return report
91 92


93 94
def edit_insert():
    """Edit an invenio record and insert it in the database.

    The record is loaded from the selected host and the values extracted
    from it are returned, together with the configuration of the
    publications form.

    Note:
        Recovery procedures are applied to fix basic non-conformity, but
        no checks are run. The user is editing the record to fix problems.

    Returns:
        dict: with the keys ``cfg`` (form configuration built from
        ``db.publications``) and ``values`` (form field name -> value).
        The result of ``inline_alert`` is returned instead when the
        affiliation keys are not defined, the record id is not well formed,
        a selector field is empty or the record does not exist.

    Raises:
        HTTP: with the status 500 when any other exception happens. The
            exception is logged beforehand in the web2py ticket system.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    # selector fields which have to be filled by the user
    fields = (
        "controller",
        "host",
        "id_projects",
        "id_teams",
        "id_categories",
        "record_id")

    table = virtdb.edit_insert_selector

    try:
        # Protection
        #
        # NOTE
        # With plugin_dbui 0.7.1 it is possible to enter decimal value
        # for the record id (e.g by typing 1503,03 in the field)
        #
        # The format is only checked when the request variable is present:
        # REG_INT.match(None) raises a TypeError which would end as an
        # HTTP 500. An absent record id is left to the empty field check
        # below (assumes the selector exposes it as a falsy value).
        raw_record_id = request.vars.Edit_insert_selectorRecord_id
        if raw_record_id is not None and REG_INT.match(raw_record_id) is None:
            msg = T("The <i>record id</i> is not well formed.")
            msg += "<br>"
            msg += T("Use only digit character, no comma, no dot...")
            return inline_alert(T("Error"), msg)

        selector = Selector(table)

        for el in fields:
            if not selector[el]:
                msg = T("All fields of the form have to be defined !!!")
                msg += "<br>"
                msg += T("The field '%s' is missing ...") % T(table[el].label)
                return inline_alert(T("Error"), msg)

        # record
        record = load_record(selector.host, selector.record_id)

        if record is None:
            return inline_alert(T("Error"), T(MSG_NO_RECORD))

        # form configuration
        cfg = to_formPanel(db.publications)

        # tools to extract values to be loaded in the form
        values = {}
        check = CheckAndFix()

        # fix invalid oai
        check.recover_oai(record, selector.host)

        # title, preprint, URL, report number
        values["PublicationsTitle"] = record.title()
        values["PublicationsPreprint"] = record.preprint_number()
        values["PublicationsPublication_url"] = record.paper_url()
        values["PublicationsReport_numbers"] = record.report_number()

        # authors
        # best effort: a failing check is ignored since the user
        # fixes the remaining problems in the form
        try:
            check.authors(record)
            check.format_authors(record, format_author_fr)

            check.my_affiliation(
                record, selector.id_projects, selector.id_teams)

            check.get_my_authors(record, cmpFct=family_name_fr)

        except CheckException:
            pass

        fauthor = record.first_author()
        if isinstance(fauthor, list):
            fauthor = u", ".join(fauthor)

        values["PublicationsFirst_author"] = fauthor
        values["PublicationsAuthors"] = record.authors()
        values["PublicationsAuthors_institute"] = record.my_authors

        # collaboration
        # keep the undefined id when no synonym is found
        recId = UNDEF_ID
        try:
            recId = search_synonym(db.collaborations,
                                   "collaboration",
                                   record.collaboration())
        except ToolException:
            pass

        values["PublicationsId_collaborations"] = int(recId)

        # teams, project, categories
        values["PublicationsId_categories"] = int(selector.id_categories)
        values["PublicationsId_projects"] = int(selector.id_projects)
        values["PublicationsId_teams"] = int(selector.id_teams)

        # origin
        # Note:
        #  - It is always defined
        #  - Use a trivial algorithm to recover it
        oai_url = record.oai_url()
        if not oai_url:
            oai_url = OAI_URL % (selector.host, selector.record_id)

        values["PublicationsOrigin"] = oai_url

        # publishers
        if selector.controller in ("articles", "proceedings"):

            check.clean_erratum(record)
            check.paper_reference(record)
            check.format_editor(record)

            recId = UNDEF_ID
            try:
                recId = search_synonym(db.publishers,
                                       "abbreviation",
                                       record.paper_editor())
            except ToolException:
                pass

            values["PublicationsId_publishers"] = int(recId)
            values["PublicationsVolume"] = record.paper_volume()
            values["PublicationsPages"] = record.paper_pages()

        # conference
        if selector.controller in ("proceedings", "talks"):

            try:
                check.country(record)
                check.conference_date(record, selector.host)

            except CheckException:
                pass

            if isinstance(record, RecordConf):
                values["PublicationsConference_title"] = \
                    record.conference_title()

                values["PublicationsConference_url"] = \
                    record.conference_url()

                values["PublicationsConference_dates"] = \
                    record.conference_dates()

                values["PublicationsConference_town"] = \
                    record.conference_town()

                recId = UNDEF_ID
                try:
                    recId = search_synonym(db.countries,
                                           "country",
                                           record.conference_country())
                except ToolException:
                    pass

                values["PublicationsId_countries"] = int(recId)

                # the first author can be a list: apply the same conversion
                # as for the first author field in order to load a string
                speaker = record.first_author()
                if isinstance(speaker, list):
                    speaker = u", ".join(speaker)

                values["PublicationsConference_speaker"] = speaker

        # thesis
        if selector.controller == "theses":

            if isinstance(record, RecordThesis):
                values["PublicationsUniversities"] = \
                    record.these_universities()

                values["PublicationsDirectors"] = record.these_directors()
                values["PublicationsDefense"] = record.these_defense()

        # submitted date and year
        try:
            check.submitted(record)
            check.year(record)
        except CheckException:
            pass

        values["PublicationsSubmitted"] = ", ".join(record.submitted())

        if record.is_published():
            year = record.paper_year()
        else:
            year = record.year()

        values["PublicationsYear"] = year

    except Exception:

        # log the exception in the web2py ticket system
        ticket = RestrictedError(layer="harvester.py",
                                 code="edit_insert",
                                 output="",
                                 environment=current.globalenv)
        ticket.log(request)

        # inform the user that something went wrong in the server
        raise HTTP(500)

    return dict(cfg=cfg, values=values)


301 302
def insert_marcxml():
    """Insert a MarcXML record in the database.

    The harvester tool is built from the selector values and processes
    the XML string found in the selector.

    Returns:
        The mapping produced by ``tool.report()`` completed with the
        ``selector`` key, rendered by the view ``harvest/layout.html``.
        When something goes wrong, one of the following is returned instead:

            - the result of ``inline_alert`` when the affiliation keys are
              not defined or no harvester tool can be built,
            - the translated message of a ``ToolException``,
            - the HTML formatted traceback of any other exception.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    try:
        # the trailing comma is required: ("mode") is the string "mode",
        # not a tuple containing one field name
        selector = Selector(virtdb.marc12_selector, exclude_fields=("mode",))

        tool = build_harvester_tool(
            db,
            selector.id_teams,
            selector.id_projects,
            selector.controller,
            selector.id_categories,
            year_start=selector.year_start,
            year_end=selector.year_end,
            dry_run=(selector.mode == MODE_DRY_RUN),
            debug=False)

        if not tool:
            return inline_alert(T("Error"), T("Select an harvester."))

        tool.harvester.host = selector.host
        tool.process_xml(selector.xml)

    except ToolException as e:
        return T(str(e))

    # Exception, not BaseException: SystemExit and KeyboardInterrupt are not
    # harvesting errors and must not be turned into an HTML report.
    except Exception:
        msg = "<br><br><hr/>"
        msg += CODE(traceback.format_exc()).xml()
        msg += "<hr/>"
        return msg

    response.view = "harvest/layout.html"
    report = tool.report()
    report["selector"] = selector
    return report
341 342


343
def run():
    """Run an harvester.

    Scan the cds/invenio stores to find articles published during
    a given range of years and for a given team/project.
    Insert them in the database if they don't exist.

    The scanning is steered using the current request arguments as well as
    the harvest parameters associated to this action.

    Search arguments are defined via the harvester selector.

    Returns:
        dict: with the keys ``collection_logs``, ``controller``, ``logs``
        and ``selector``, rendered by the view ``harvest/layout`` having
        the extension of the request. The logs are accumulated over all
        the selected harvesters.
        When something goes wrong, one of the following is returned instead:

            - the result of ``inline_alert`` when the affiliation keys are
              not defined or no harvester tool can be built,
            - the translated message of a ``ToolException``, which is also
              raised here when no harvester matches the selection,
            - the HTML formatted traceback of any other exception.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    try:
        selector = Selector(
            virtdb.harvester_selector,
            exclude_fields=("mode", "year_start", "year_end"))

        # Get hosts and collections
        rows = selector.select(db.harvesters)
        if not rows:
            raise ToolException(T(MSG_NO_HARVESTER))

        collection_logs = []
        logs = []

        # one tool per harvester since the category depends on the harvester
        for row in rows:
            tool = build_harvester_tool(
                db,
                selector.id_teams,
                selector.id_projects,
                selector.controller,
                row.harvesters.id_categories,
                year_start=selector.year_start,
                year_end=selector.year_end,
                dry_run=(selector.mode == MODE_DRY_RUN),
                debug=False)

            if not tool:
                return inline_alert(T("Error"), T("Select an harvester."))

            tool.process_url(row.harvesters.host, row.harvesters.collections)

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as e:
        return T(str(e))

    # Exception, not BaseException: SystemExit and KeyboardInterrupt are not
    # harvesting errors and must not be turned into an HTML report.
    except Exception:
        msg = "<br><br><hr/>"
        msg += CODE(traceback.format_exc()).xml()
        msg += "<hr/>"
        return msg

    # delegate rendering to the report view
    response.view = "harvest/layout.%s" % request.extension
    return dict(collection_logs=collection_logs,
                controller=selector.controller,
                logs=logs,
                selector=selector)
407 408


409 410
def run_all():
    """Run all harvesters in one go.

    The harvesters are those of the team and / or the project defined in
    the selector, or all the harvesters of the database when none of them
    is defined.

    Returns:
        dict: with the keys ``collection_logs``, ``controller`` (always
        ``"all harvesters"``), ``logs`` and ``selector``, rendered by the
        view ``harvest/layout`` having the extension of the request.
        When something goes wrong, one of the following is returned instead:

            - the result of ``inline_alert`` when the affiliation keys are
              not defined, no harvester is found or no harvester tool can
              be built,
            - the translated message of a ``ToolException``,
            - the HTML formatted traceback of any other exception.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    collection_logs = []
    logs = []

    try:
        selector = Selector(
            virtdb.run_all_harvesters_selector,
            exclude_fields=("mode", "year_start", "year_end"))

        # restrict the harvesters to the selected team and / or project;
        # the query stays None when none of them is selected
        query = None
        for fieldname in ("id_teams", "id_projects"):
            if selector[fieldname]:
                q = db.harvesters[fieldname] == selector[fieldname]
                query = (q if query is None else (query) & (q))

        harvesters = db(query).select(db.harvesters.ALL)
        if not harvesters:
            return inline_alert(T("Error"), T(MSG_NO_HARVESTER))

        for harvester in harvesters:

            tool = build_harvester_tool(
                db,
                harvester.id_teams,
                harvester.id_projects,
                harvester.controller,
                harvester.id_categories,
                year_start=selector.year_start,
                year_end=selector.year_end,
                dry_run=(selector.mode == MODE_DRY_RUN),
                debug=False)

            if not tool:
                return inline_alert(T("Error"), T("Select an harvester."))

            tool.process_url(harvester.host, harvester.collections)

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as e:
        return T(str(e))

    # Exception, not BaseException: SystemExit and KeyboardInterrupt are not
    # harvesting errors and must not be turned into an HTML report.
    except Exception:
        msg = "<br><br><hr/>"
        msg += CODE(traceback.format_exc()).xml()
        msg += "<hr/>"
        return msg

    # tune selector parameters used in the report title
    if query is None:
        selector.id_projects = None

    # delegate rendering to the report view
    response.view = "harvest/layout.%s" % request.extension
    return dict(collection_logs=collection_logs,
                controller="all harvesters",
                logs=logs,
                selector=selector)