""" Harvest Controllers

"""

5
import traceback
6 7

from gluon import current
8
from gluon.restricted import RestrictedError
9
from harvest_tools import (build_harvester_tool,
10 11
                           CheckAndFix,
                           CheckException,
LE GAC Renaud's avatar
LE GAC Renaud committed
12
                           DRY_RUN,
13 14
                           format_author_fr,
                           family_name_fr,
15
                           search_synonym,
16
                           ToolException)
17
from invenio_tools import (load_record,
18 19
                           OAI_URL,
                           RecordConf,
20 21
                           RecordThesis,
                           REG_INT)
22
from plugin_dbui import (get_id,
23
                         inline_alert,
24
                         Selector,
25 26
                         to_formPanel,
                         UNDEF_ID)
27

LE GAC Renaud's avatar
LE GAC Renaud committed
28
# untranslated messages for the user; they are passed to T() where they
# are used
MSG_NO_RECORD = "Sorry, the record does not exist."
MSG_NO_HARVESTER = "No harvesters for your selection !!!"
MSG_NO_AFFILIATION = "Affiliation keys are not defined !!!"

# translated DRY_RUN label; the actions compare it with selector.mode
MODE_DRY_RUN = T(DRY_RUN)
32

33
def free_run():
    """Run a free harvester.

    All harvester parameters are defined via the selector
    ``virtdb.free_harvester_selector``.

    Returns:
        One of:

        * the report of the harvester tool, extended with the key
          ``selector`` and rendered by the view ``harvest/layout.html``;
        * the result of ``inline_alert`` when the affiliation keys are
          not defined, when a mandatory field of the selector is empty
          or when no harvester tool can be built;
        * the translated message of a ``ToolException``;
        * an HTML string containing the traceback of any other exception.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    table = virtdb.free_harvester_selector
    fields = ('collections',
              'controller',
              'host',
              'id_projects',
              'id_teams',
              'id_categories',
              'ratio')

    try:
        selector = Selector(table,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        # every field listed above is mandatory
        for el in fields:
            if not selector[el]:
                msg = T('All fields of the form have to be defined !!!')
                msg += "<br>"
                msg += T('The field "%s" is missing ...') % T(table[el].label)
                return inline_alert(T('Error'), msg)

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == MODE_DRY_RUN),
                                    debug=False)
        if not tool:
            return inline_alert(T('Error'), T('Select an harvester.'))

        tool.process_url(selector.host, selector.collections)

    except ToolException as e:
        return T(str(e))

    except Exception:
        # Show the traceback to the user. Exception (not BaseException) is
        # caught so that SystemExit and KeyboardInterrupt still propagate.
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    response.view = 'harvest/layout.html'
    report = tool.report()
    report['selector'] = selector
    return report
88 89


90 91
def edit_insert():
    """Edit an invenio record and insert it in the database.

    Note:
        Recovery procedures are applied to fix basic non-conformity, but
        no checks are run. The user is editing the record to fix problems.

    Returns:
        dict:
            * ``cfg``: result of ``to_formPanel(db.publications)``.
            * ``values``: form field names mapped to the values
              extracted from the record.

        The result of ``inline_alert`` is returned instead when the
        affiliation keys are not defined, when the record id is not well
        formed, when a mandatory field of the selector is empty or when
        the record does not exist.

    Raises:
        HTTP: with status 500 when an unexpected exception occurs.
            The exception is logged in the web2py ticket system first.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    fields = ('controller',
              'host',
              'id_projects',
              'id_teams',
              'id_categories',
              'record_id')

    table = virtdb.edit_insert_selector

    try:
        # Protection
        #
        # NOTE
        # With plugin_dbui 0.7.1 it is possible to enter decimal value
        # for the record id (e.g by typing 1503,03 in the field)
        #
        # The format is only checked when a value is submitted. A missing
        # record id is reported by the mandatory fields check below instead
        # of raising a TypeError in the regular expression.
        #
        record_id = request.vars.Edit_insert_selectorRecord_id
        if record_id is not None and REG_INT.match(record_id) is None:
            msg = T("The <i>record id</i> is not well formed.")
            msg += "<br>"
            msg += T("Use only digit character, no comma, no dot...")
            return inline_alert(T('Error'), msg)

        selector = Selector(table)

        for el in fields:
            if not selector[el]:
                msg = T('All fields of the form have to be defined !!!')
                msg += "<br>"
                msg += T('The field "%s" is missing ...') % T(table[el].label)
                return inline_alert(T('Error'), msg)

        # record
        record = load_record(selector.host, selector.record_id)

        if record is None:
            return inline_alert(T('Error'), T(MSG_NO_RECORD))

        # form configuration
        cfg = to_formPanel(db.publications)

        # tools to extract values to be loaded in the form
        values = {}
        check = CheckAndFix()

        # fix invalid oai
        check.recover_oai(record, selector.host)

        # title, preprint, URL, report number
        values['PublicationsTitle'] = record.title()
        values['PublicationsPreprint'] = record.preprint_number()
        values['PublicationsPublication_url'] = record.paper_url()
        values['PublicationsReport_numbers'] = record.report_number()

        # authors
        # best effort: a failing check does not stop the edition
        try:
            check.authors(record)
            check.format_authors(record, format_author_fr)

            check.my_affiliation(record, selector.id_projects, selector.id_teams)
            check.get_my_authors(record, cmpFct=family_name_fr)

        except CheckException:
            pass

        fauthor = record.first_author()
        if isinstance(fauthor, list):
            fauthor = u", ".join(fauthor)

        values['PublicationsFirst_author'] = fauthor
        values['PublicationsAuthors'] = record.authors()
        values['PublicationsAuthors_institute'] = record.my_authors

        # collaboration
        # keep the undefined identifier when no synonym is found
        recId = UNDEF_ID
        try:
            recId = search_synonym(db.collaborations,
                                   "collaboration",
                                   record.collaboration())
        except ToolException:
            pass

        values['PublicationsId_collaborations'] = int(recId)

        # teams, project, categories
        values['PublicationsId_categories'] = int(selector.id_categories)
        values['PublicationsId_projects'] = int(selector.id_projects)
        values['PublicationsId_teams'] = int(selector.id_teams)

        # origin
        # Note:
        #  - It is always defined
        #  - Use a trivial algorithm to recover it
        oai_url = record.oai_url()
        if not oai_url:
            oai_url = OAI_URL % (selector.host, selector.record_id)

        values['PublicationsOrigin'] = oai_url

        # publishers
        if selector.controller in ('articles', 'proceedings'):

            check.clean_erratum(record)
            check.paper_reference(record)
            check.format_editor(record)

            recId = UNDEF_ID
            try:
                recId = search_synonym(db.publishers,
                                       "abbreviation",
                                       record.paper_editor())
            except ToolException:
                pass

            values['PublicationsId_publishers'] = int(recId)
            values['PublicationsVolume'] = record.paper_volume()
            values['PublicationsPages'] = record.paper_pages()

        # conference
        if selector.controller in ('proceedings', 'talks'):

            try:
                check.country(record)
                check.conference_date(record, selector.host)

            except CheckException:
                pass

            if isinstance(record, RecordConf):
                values['PublicationsConference_title'] = record.conference_title()
                values['PublicationsConference_url'] = record.conference_url()
                values['PublicationsConference_dates'] = record.conference_dates()
                values['PublicationsConference_town'] = record.conference_town()

                recId = UNDEF_ID
                try:
                    recId = search_synonym(db.countries,
                                           "country",
                                           record.conference_country())
                except ToolException:
                    pass

                values['PublicationsId_countries'] = int(recId)

                # NOTE(review): first_author() may be a list (see the
                # handling of PublicationsFirst_author above) but it is
                # not joined here -- confirm that the form copes with it.
                values['PublicationsConference_speaker'] = record.first_author()

        # thesis
        if selector.controller == 'theses':

            if isinstance(record, RecordThesis):
                values['PublicationsUniversities'] = record.these_universities()
                values['PublicationsDirectors'] = record.these_directors()
                values['PublicationsDefense'] = record.these_defense()

        # submitted date and year
        try:
            check.submitted(record)
            check.year(record)
        except CheckException:
            pass

        values['PublicationsSubmitted'] = ', '.join(record.submitted())

        if record.is_published():
            year = record.paper_year()
        else:
            year = record.year()

        values['PublicationsYear'] = year

    except Exception:

        # log the exception in the web2py ticket system
        ticket = RestrictedError(layer='harvester.py',
                                 code='edit_insert',
                                 output='',
                                 environment=current.globalenv)
        ticket.log(request)

        # inform the user that something went wrong in the server
        raise HTTP(500)

    return dict(cfg=cfg, values=values)


284 285
def insert_marcxml():
    """Insert a MarcXML record in the database.

    The harvester parameters and the XML string are read from the
    selector ``virtdb.marc12_selector``.

    Returns:
        One of:

        * the report of the harvester tool, extended with the key
          ``selector`` and rendered by the view ``harvest/layout.html``;
        * the result of ``inline_alert`` when the affiliation keys are
          not defined or when no harvester tool can be built;
        * the translated message of a ``ToolException``;
        * an HTML string containing the traceback of any other exception.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    try:
        # the trailing comma is required: ('mode') is a string, not a tuple
        selector = Selector(virtdb.marc12_selector, exclude_fields=('mode',))

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == MODE_DRY_RUN),
                                    debug=False)
        if not tool:
            return inline_alert(T('Error'), T('Select an harvester.'))

        tool.process_xml(selector.xml)

    except ToolException as e:
        return T(str(e))

    except Exception:
        # Show the traceback to the user. Exception (not BaseException) is
        # caught so that SystemExit and KeyboardInterrupt still propagate.
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    response.view = 'harvest/layout.html'
    report = tool.report()
    report['selector'] = selector
    return report
321 322


323
def run():
    """Run an harvester.

    Scan the cds/invenio stores to find articles published during
    a given range of years and for a given team/project.
    Insert them in the database if they don't exist.

    The scanning is steered using the current request arguments as well as
    the harvest parameters associated to this action.

    Search arguments are defined via the harvester selector.

    Returns:
        One of:

        * a dict with the keys ``collection_logs``, ``controller``,
          ``logs`` and ``selector``, rendered by the view
          ``harvest/layout.<request.extension>``;
        * the result of ``inline_alert`` when the affiliation keys are
          not defined or when no harvester tool can be built;
        * the translated message of a ``ToolException``, including the
          case where no harvester matches the selection;
        * an HTML string containing the traceback of any other exception.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    try:
        selector = Selector(virtdb.harvester_selector,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        # Get hosts and collections
        rows = selector.select(db.harvesters)
        if not rows:
            # raise the untranslated message: it is translated once,
            # by the ToolException handler below
            raise ToolException(MSG_NO_HARVESTER)

        collection_logs = []
        logs = []

        # one tool per harvester, the logs are accumulated
        for row in rows:
            tool = build_harvester_tool(db,
                                        selector.id_teams,
                                        selector.id_projects,
                                        selector.controller,
                                        row.harvesters.id_categories,
                                        year_start=selector.year_start,
                                        year_end=selector.year_end,
                                        dry_run=(selector.mode == MODE_DRY_RUN),
                                        debug=False)
            if not tool:
                return inline_alert(T('Error'), T('Select an harvester.'))

            tool.process_url(row.harvesters.host, row.harvesters.collections)

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as e:
        return T(str(e))

    except Exception:
        # Show the traceback to the user. Exception (not BaseException) is
        # caught so that SystemExit and KeyboardInterrupt still propagate.
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # delegate rendering to the report view
    response.view = 'harvest/layout.%s' % request.extension
    return dict(collection_logs=collection_logs,
                controller=selector.controller,
                logs=logs,
                selector=selector)
384 385


386 387
def run_all():
    """Run all harvesters in one go.

    The harvesters are read from the table ``harvesters``. They are
    filtered by team and / or project when these fields are set in the
    selector ``virtdb.run_all_harvesters_selector``.

    Returns:
        One of:

        * a dict with the keys ``collection_logs``, ``controller``
          (always ``'all harvesters'``), ``logs`` and ``selector``,
          rendered by the view ``harvest/layout.<request.extension>``;
        * the result of ``inline_alert`` when the affiliation keys are
          not defined, when no harvester is found or when no harvester
          tool can be built;
        * the translated message of a ``ToolException``;
        * an HTML string containing the traceback of any other exception.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    collection_logs = []
    logs = []

    try:
        selector = Selector(virtdb.run_all_harvesters_selector,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        # AND the team / project conditions which are set in the selector;
        # the query stays None when neither of them is set
        query = None
        for fieldname in ('id_teams', 'id_projects'):
            if selector[fieldname]:
                q = db.harvesters[fieldname] == selector[fieldname]
                if query is not None:
                    query = (query) & (q)
                else:
                    query = q

        harvesters = db(query).select(db.harvesters.ALL)
        if not harvesters:
            return inline_alert(T('Error'), T(MSG_NO_HARVESTER))

        # one tool per harvester, the logs are accumulated
        for harvester in harvesters:

            tool = build_harvester_tool(db,
                                        harvester.id_teams,
                                        harvester.id_projects,
                                        harvester.controller,
                                        harvester.id_categories,
                                        year_start=selector.year_start,
                                        year_end=selector.year_end,
                                        dry_run=(selector.mode == MODE_DRY_RUN),
                                        debug=False)
            if not tool:
                return inline_alert(T('Error'), T('Select an harvester.'))

            tool.process_url(harvester.host, harvester.collections)

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as e:
        return T(str(e))

    except Exception:
        # Show the traceback to the user. Exception (not BaseException) is
        # caught so that SystemExit and KeyboardInterrupt still propagate.
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # tune selector parameters used in the report title
    if query is None:
        selector.id_projects = None

    # delegate rendering to the report view
    response.view = 'harvest/layout.%s' % request.extension
    return dict(collection_logs=collection_logs,
                controller='all harvesters',
                logs=logs,
                selector=selector)