harvest.py 13.1 KB
Newer Older
1 2 3 4
""" Harvest Controllers

"""

5
import traceback
6 7

from gluon import current
8
from gluon.restricted import RestrictedError
9
from harvest_tools import (Automaton,
10
                           build_harvester_tool,
11 12
                           format_author_fr,
                           family_name_fr,
13
                           ToolException)
LE GAC Renaud's avatar
LE GAC Renaud committed
14
from invenio_tools import (CheckAndFix,
15
                           CheckException,
LE GAC Renaud's avatar
LE GAC Renaud committed
16
                           load_record,
17 18 19
                           OAI_URL,
                           RecordConf,
                           RecordThesis)
20
from plugin_dbui import (get_id,
21
                         INLINE_ALERT,
22
                         Selector,
23 24
                         to_formPanel,
                         UNDEF_ID)
25

26
# Translated label compared with ``selector.mode`` by the controllers below
# to decide whether a harvester is run with ``dry_run`` enabled.
DRY_RUN = T("dry run")

# Translated messages shown to the user by the controllers below.
MSG_NO_REG_INSTITUTE = T("Preference REG_INSTITUTE is not defined.")
MSG_NO_HARVESTER = T("No harvesters for your selection !!!")
29

30

31
def free_run():
    """Run a free harvester.

    Every harvester parameter is read from the free harvester selector.
    All the listed form fields are mandatory.

    Returns an inline alert when the institute preference is missing, when
    a mandatory field is empty or when no tool could be built; the
    translated message of a ``ToolException``; an HTML formatted traceback
    for any other exception; otherwise the tool report, extended with the
    selector, to be rendered by the ``harvest/layout.html`` view.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    table = virtdb.free_harvester_selector
    required = ('collections', 'controller', 'host', 'id_projects',
                'id_teams', 'id_categories', 'ratio')

    try:
        selector = Selector(
            table,
            exclude_fields=('mode', 'year_start', 'year_end'))

        # stop at the first mandatory field left empty in the form
        missing = next((name for name in required if not selector[name]),
                       None)
        if missing is not None:
            text = T('All fields of the form have to be defined !!!') \
                + "<br>" \
                + T('The field "%s" is missing ...') % T(table[missing].label)
            return INLINE_ALERT % (T('Error'), text)

        harvester_tool = build_harvester_tool(
            db,
            selector.id_teams,
            selector.id_projects,
            selector.controller,
            selector.id_categories,
            year_start=selector.year_start,
            year_end=selector.year_end,
            dry_run=(selector.mode == DRY_RUN),
            debug=False)

        if not harvester_tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        harvester_tool.process_url(selector.host, selector.collections)

    except ToolException as err:
        return T(str(err))

    except BaseException:
        # show the traceback to the user between two horizontal rules
        return ('<br><br><hr/>'
                + CODE(traceback.format_exc()).xml()
                + '<hr/>')

    response.view = 'harvest/layout.html'
    result = harvester_tool.report()
    result['selector'] = selector
    return result
86 87


88 89
def edit_insert():
    """Edit an invenio record and insert it in the database.

    The record is loaded from the selected host, and the values extracted
    from it are returned together with the configuration of the
    publications form panel.

    @note: Recovery procedures are applied to fix basic non-conformity, but
    no checks are run. The user is editing the record to fix problems.

    Returns an inline alert when the institute preference is missing or
    when a mandatory selector field is empty, otherwise a dictionary with
    the keys ``cfg`` and ``values``. Any exception is logged as a web2py
    ticket and turned into an HTTP 500 response.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    required = ('controller', 'host', 'id_projects', 'id_teams',
                'id_categories', 'record_id')

    table = virtdb.edit_insert_selector

    try:
        selector = Selector(table)

        # stop at the first mandatory field left empty in the form
        missing = next((name for name in required if not selector[name]),
                       None)
        if missing is not None:
            text = T('All fields of the form have to be defined !!!') \
                + "<br>" \
                + T('The field "%s" is missing ...') % T(table[missing].label)
            return INLINE_ALERT % (T('Error'), text)

        # record
        record = load_record(selector.host, selector.record_id)

        # form configuration
        cfg = to_formPanel(db.publications)

        # tool used to fix the record before extracting its values
        check = CheckAndFix()

        # NOTE
        # the publication tool is only required to extract the list of
        # my authors
        tool = Automaton(db,
                         selector.id_teams,
                         selector.id_projects,
                         selector.controller,
                         selector.id_categories,
                         dry_run=True,
                         debug=False)

        # title, preprint, URL, report number
        values = {
            'PublicationsTitle': record.title(),
            'PublicationsPreprint': record.preprint_number(),
            'PublicationsPublication_url': record.paper_url(),
            'PublicationsReport_numbers': record.report_number(),
        }

        # authors (a failing check is ignored, the user fixes the form)
        try:
            check.authors(record)
            check.format_authors(record, format_author_fr)

            my_authors = tool._my_author_list(record)
            check.my_authors(record,
                             reference=my_authors,
                             cmpFct=family_name_fr)

        except CheckException:
            pass

        values['PublicationsFirst_author'] = record.first_author()
        values['PublicationsAuthors'] = record.authors()
        values['PublicationsAuthors_institute'] = record.my_authors

        # collaboration
        collaboration_id = get_id(db.collaborations,
                                  collaboration=record.collaboration())
        if collaboration_id:
            values['PublicationsId_collaborations'] = int(collaboration_id)
        else:
            values['PublicationsId_collaborations'] = UNDEF_ID

        # teams, project, categories, origin
        values.update(
            PublicationsId_categories=int(selector.id_categories),
            PublicationsId_projects=int(selector.id_projects),
            PublicationsId_teams=int(selector.id_teams),
            PublicationsOrigin=OAI_URL % (selector.host, selector.record_id))

        # publishers
        if selector.controller in ('articles', 'proceedings'):

            check.clean_erratum(record)
            check.format_editor(record)

            publisher_id = get_id(db.publishers,
                                  abbreviation=record.paper_editor())
            if publisher_id:
                values['PublicationsId_publishers'] = int(publisher_id)
            else:
                values['PublicationsId_publishers'] = UNDEF_ID

            values['PublicationsVolume'] = record.paper_volume()
            values['PublicationsPages'] = record.paper_pages()

        # conference
        if selector.controller in ('proceedings', 'talks'):

            try:
                check.conference(record)
            except CheckException:
                pass

            if isinstance(record, RecordConf):
                values['PublicationsConference_title'] = \
                    record.conference_title()
                values['PublicationsConference_url'] = \
                    record.conference_url()
                values['PublicationsConference_dates'] = \
                    record.conference_dates()
                values['PublicationsConference_town'] = \
                    record.conference_town()

                country_id = get_id(db.countries,
                                    country=record.conference_country())
                if country_id is None:
                    country_id = UNDEF_ID
                values['PublicationsId_countries'] = country_id

                values['PublicationsConference_speaker'] = \
                    record.first_author()

        # thesis
        if selector.controller == 'theses' \
                and isinstance(record, RecordThesis):
            values['PublicationsUniversities'] = record.these_universities()
            values['PublicationsDirectors'] = record.these_directors()
            values['PublicationsDefense'] = record.these_defense()

        # submitted date and year
        try:
            check.submitted(record)
            check.year(record)
        except CheckException:
            pass

        values['PublicationsSubmitted'] = ', '.join(record.submitted())
        values['PublicationsYear'] = record.year()

    except Exception:

        # log the exception in the web2py ticket system
        ticket = RestrictedError(layer='harvester.py',
                                 code='edit_insert',
                                 output='',
                                 environment=current.globalenv)
        ticket.log(request)

        # inform the user that something went wrong in the server
        raise HTTP(500)

    return dict(cfg=cfg, values=values)


233 234
def insert_marcxml():
    """Insert a MarcXML record in the database.

    The harvester tool is built from the ``marc12_selector`` values and
    processes the XML string found in ``selector.xml``.

    Returns an inline alert when the institute preference is missing or
    when no tool could be built; the translated message of a
    ``ToolException``; an HTML formatted traceback for any other
    exception; otherwise the tool report, extended with the selector, to
    be rendered by the ``harvest/layout.html`` view.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    try:
        # NOTE: the trailing comma matters. ``('mode')`` is the plain
        # string 'mode', not a one-element tuple, whereas the other
        # controllers of this file pass a real tuple of field names.
        selector = Selector(virtdb.marc12_selector, exclude_fields=('mode',))

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_xml(selector.xml)

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # show the traceback to the user between two horizontal rules
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    response.view = 'harvest/layout.html'
    report = tool.report()
    report['selector'] = selector
    return report
270 271


272
def run():
    """Run an harvester.

    Scan the cds/invenio stores to find articles published during
    a given range of years and for a given team/project.
    Insert them in the database if they don't exist.

    The scanning is steered using the current request arguments as well as
    the harvest parameters associated to this action.

    Search arguments are defined via the harvester selector. The host,
    the collections and the category come from the first harvester row
    matching the selector.

    Returns an inline alert when the institute preference is missing or
    when no tool could be built; the translated message of a
    ``ToolException`` (raised here when no harvester matches); an HTML
    formatted traceback for any other exception; otherwise the tool
    report, extended with the selector.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    try:
        selector = Selector(
            virtdb.harvester_selector,
            exclude_fields=('mode', 'year_start', 'year_end'))

        # Get the host and collections
        first_row = selector.select(db.harvesters).first()
        if not first_row:
            raise ToolException(MSG_NO_HARVESTER)

        harvester_tool = build_harvester_tool(
            db,
            selector.id_teams,
            selector.id_projects,
            selector.controller,
            first_row.harvesters.id_categories,
            year_start=selector.year_start,
            year_end=selector.year_end,
            dry_run=(selector.mode == DRY_RUN),
            debug=False)

        if not harvester_tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        harvester_tool.process_url(first_row.harvesters.host,
                                   first_row.harvesters.collections)

    except ToolException as err:
        return T(str(err))

    except BaseException:
        # show the traceback to the user between two horizontal rules
        return ('<br><br><hr/>'
                + CODE(traceback.format_exc()).xml()
                + '<hr/>')

    # the view is selected according to the extension of the request
    response.view = 'harvest/layout.%s' % request.extension
    result = harvester_tool.report()
    result['selector'] = selector
    return result
324 325


326 327
def run_all():
    """Run all harvesters in one go.

    The harvesters are selected by team and/or project when these
    selector fields are set, otherwise all of them are run. The logs of
    the individual tools are accumulated.

    Returns an inline alert when the institute preference is missing,
    when no harvester is selected or when a tool could not be built; the
    translated message of a ``ToolException``; an HTML formatted traceback
    for any other exception; otherwise a dictionary with the keys
    ``collection_logs``, ``controller``, ``logs`` and ``selector``.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    collection_logs, logs = [], []

    try:
        selector = Selector(
            virtdb.run_all_harvesters_selector,
            exclude_fields=('mode', 'year_start', 'year_end'))

        # one condition per selector field which is set
        conditions = [db.harvesters[name] == selector[name]
                      for name in ('id_teams', 'id_projects')
                      if selector[name]]

        # AND the conditions together, None when there is none
        query = None
        for condition in conditions:
            query = (query) & (condition) if query else condition

        harvesters = db(query).select(db.harvesters.ALL)
        if len(harvesters) == 0:
            return INLINE_ALERT % (T('Error'), MSG_NO_HARVESTER)

        for harvester in harvesters:

            harvester_tool = build_harvester_tool(
                db,
                harvester.id_teams,
                harvester.id_projects,
                harvester.controller,
                harvester.id_categories,
                year_start=selector.year_start,
                year_end=selector.year_end,
                dry_run=(selector.mode == DRY_RUN),
                debug=False)

            if not harvester_tool:
                return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

            harvester_tool.process_url(harvester.host, harvester.collections)

            collection_logs.extend(harvester_tool.collection_logs)
            logs.extend(harvester_tool.logs)

    except ToolException as err:
        return T(str(err))

    except BaseException:
        # show the traceback to the user between two horizontal rules
        return ('<br><br><hr/>'
                + CODE(traceback.format_exc()).xml()
                + '<hr/>')

    # tune selector parameters used in the report title
    if query is None:
        selector.id_projects = None

    # delegate rendering to the report view
    response.view = 'harvest/layout.%s' % request.extension
    return {'collection_logs': collection_logs,
            'controller': 'all harvesters',
            'logs': logs,
            'selector': selector}