harvest.py 13.2 KB
Newer Older
1 2 3 4
""" Harvest Controllers

"""

5
import traceback
6 7

from gluon import current
8
from gluon.restricted import RestrictedError
9 10
from harvest_tools import (format_author_fr,
                           family_name_fr,
11
                           build_harvester_tool,
12
                           PublicationsTool,
13 14 15 16 17
                           ToolException)
from invenio_tools import (CdsException,
                           CheckAndFix,
                           CheckException,
                           Marc12Exception,
18
                           InvenioStore,
19 20
                           Marc12,
                           OAI_URL)
21
from plugin_dbui import (get_id,
22
                         INLINE_ALERT,
23
                         Selector,
24 25
                         to_formPanel,
                         UNDEF_ID)
26

27
# Label of the "dry run" mode, passed through ``T`` (presumably the web2py
# translator -- confirm). The controllers compare the selector's ``mode``
# value with it to set the ``dry_run`` flag of the harvester tool.
DRY_RUN = T("dry run")
28 29
# Messages reported to the user by the controllers of this module:
#   * MSG_NO_REG_INSTITUTE when ``current.app.inspirehep_institute_id``
#     is not set,
#   * MSG_NO_HARVESTER when no harvester matches the selection.
MSG_NO_REG_INSTITUTE = T("Preference REG_INSTITUTE is not defined.")
MSG_NO_HARVESTER = T("No harvesters for your selection !!!")
30

31

32
def free_run():
    """Run a harvester whose parameters all come from the selector.

    The host, the collections, the controller, the team, the project and
    the category are read from the ``free_harvester_selector`` table
    instead of being taken from a harvester row of the database.

    Returns:
        str or dict:
            * the ``INLINE_ALERT`` template filled with an error message
              when the institute identifier is not set, when one of the
              required selector fields is empty or when no harvester
              tool could be built;
            * the message of a ``ToolException`` passed through ``T``;
            * the traceback of any other exception, wrapped in HTML;
            * otherwise the report of the tool, completed with the
              ``selector`` and rendered by ``harvest/layout.html``.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    table = virtdb.free_harvester_selector

    # selector fields which must carry a value
    required = ('collections',
                'controller',
                'host',
                'id_projects',
                'id_teams',
                'id_categories',
                'ratio')

    try:
        selector = Selector(
            table,
            exclude_fields=('mode', 'year_start', 'year_end'))

        # stop at the first required field left empty
        for name in required:
            if selector[name]:
                continue

            msg = (T('All fields of the form have to be defined !!!')
                   + "<br>"
                   + T('The field "%s" is missing ...') % T(table[name].label))
            return INLINE_ALERT % (T('Error'), msg)

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_url(selector.host, selector.collections)

    except ToolException as err:
        return T(str(err))

    except BaseException:
        # show the traceback to the user, framed by horizontal rules
        return ('<br><br><hr/>'
                + CODE(traceback.format_exc()).xml()
                + '<hr/>')

    response.view = 'harvest/layout.html'

    report = tool.report()
    report['selector'] = selector
    return report
87 88


89 90
def edit_insert():
    """Prepare the publication form from an invenio record.

    The record designated by the selector (``host`` and ``record_id``)
    is retrieved from the store and decoded. Its content is mapped onto
    the values of the ``publications`` form, so that the user can edit
    it before it is inserted.

    @note: Recovery procedures are applied to fix basic non-conformity, but
    no checks are run. The user is editing the record to fix problems.

    Returns:
        str or dict:
            * the ``INLINE_ALERT`` template filled with an error message
              when the institute identifier is not set or when one of
              the required selector fields is empty;
            * otherwise a dictionary with the keys ``cfg`` (the form
              configuration) and ``values`` (the values to be loaded in
              the form).

    Raises:
        HTTP: with the status 500 when an exception occurs while the
            record is processed. The exception is logged beforehand
            through a ``RestrictedError`` ticket.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    # selector fields which must carry a value
    required = ('controller',
                'host',
                'id_projects',
                'id_teams',
                'id_categories',
                'record_id')

    table = virtdb.edit_insert_selector

    try:
        selector = Selector(table)

        # stop at the first required field left empty
        for name in required:
            if selector[name]:
                continue

            msg = (T('All fields of the form have to be defined !!!')
                   + "<br>"
                   + T('The field "%s" is missing ...') % T(table[name].label))
            return INLINE_ALERT % (T('Error'), msg)

        # retrieve the record and decode it
        xml = InvenioStore(selector.host).get_record(selector.record_id)
        record = Marc12()(xml)[0]

        # form configuration
        cfg = to_formPanel(db.publications)

        fixer = CheckAndFix()

        # NOTE: the publication tool is only required to extract
        # the list of my authors
        tool = PublicationsTool(db,
                                selector.id_teams,
                                selector.id_projects,
                                selector.controller,
                                selector.id_categories,
                                dry_run=True,
                                debug=False)

        # values to be loaded in the form, starting with
        # the title, the preprint, the URL and the report number
        values = {}
        values['PublicationsTitle'] = record.title()
        values['PublicationsPreprint'] = record.preprint_number()
        values['PublicationsPublication_url'] = record.paper_url()
        values['PublicationsReport_numbers'] = record.report_number()

        # authors: a failing check is ignored on purpose since
        # the user is going to edit the record anyway
        try:
            fixer.authors(record)
            fixer.format_authors(record, format_author_fr)

            my_authors = tool._my_author_list(record)
            fixer.my_authors(record,
                             reference=my_authors,
                             cmpFct=family_name_fr)

        except CheckException:
            pass

        values['PublicationsFirst_author'] = record.first_author()
        values['PublicationsAuthors'] = record.authors()
        values['PublicationsAuthors_institute'] = record.my_authors

        # collaboration
        collaboration_id = get_id(db.collaborations,
                                  collaboration=record.collaboration())
        if collaboration_id:
            values['PublicationsId_collaborations'] = int(collaboration_id)
        else:
            values['PublicationsId_collaborations'] = UNDEF_ID

        # teams, project, categories, origin
        values['PublicationsId_categories'] = int(selector.id_categories)
        values['PublicationsId_projects'] = int(selector.id_projects)
        values['PublicationsId_teams'] = int(selector.id_teams)
        values['PublicationsOrigin'] = \
            OAI_URL % (selector.host, selector.record_id)

        controller = selector.controller

        # publishers
        if controller in ('articles', 'proceedings'):

            fixer.clean_erratum(record)
            fixer.format_editor(record)

            publisher_id = get_id(db.publishers,
                                  abbreviation=record.paper_editor())
            if publisher_id:
                values['PublicationsId_publishers'] = int(publisher_id)
            else:
                values['PublicationsId_publishers'] = UNDEF_ID

            values['PublicationsVolume'] = record.paper_volume()
            values['PublicationsPages'] = record.paper_pages()

        # conference
        if selector.controller in ('proceedings', 'talks'):

            try:
                fixer.conference(record)
            except CheckException:
                pass

            values['PublicationsConference_title'] = record.conference_title()
            values['PublicationsConference_url'] = record.conference_url()
            values['PublicationsConference_dates'] = record.conference_dates()
            values['PublicationsConference_town'] = record.conference_town()

            # the identifier is kept as returned, only None is replaced
            country_id = get_id(db.countries,
                                country=record.conference_country())
            if country_id is None:
                country_id = UNDEF_ID
            values['PublicationsId_countries'] = country_id

            values['PublicationsConference_speaker'] = record.first_author()

        # thesis
        if selector.controller == 'theses':
            values['PublicationsUniversities'] = record.these_universities()
            values['PublicationsDirectors'] = record.these_directors()
            values['PublicationsDefense'] = record.these_defense()

        # submitted date and year
        try:
            fixer.submitted(record)
            fixer.year(record)
        except CheckException:
            pass

        values['PublicationsSubmitted'] = ', '.join(record.submitted())
        values['PublicationsYear'] = record.year()

    except Exception:

        # log the exception in the web2py ticket system
        ticket = RestrictedError(layer='harvester.py',
                                 code='edit_insert',
                                 output='',
                                 environment=current.globalenv)
        ticket.log(request)

        # inform the user that something went wrong in the server
        raise HTTP(500)

    return {'cfg': cfg, 'values': values}


235 236
def insert_marcxml():
    """Insert a MarcXML record in the database.

    The harvester tool is built from the ``marc12_selector`` values and
    is fed with the XML string found in the selector field ``xml``.

    Returns:
        str or dict:
            * the ``INLINE_ALERT`` template filled with an error message
              when the institute identifier is not set or when no
              harvester tool could be built;
            * the message of a ``ToolException`` passed through ``T``;
            * the traceback of any other exception, wrapped in HTML;
            * otherwise the report of the tool, completed with the
              ``selector`` and rendered by ``harvest/layout.html``.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    try:
        # NOTE: the trailing comma matters. ('mode') is the plain string
        # 'mode', not a one-element tuple like the ones used by the
        # other controllers of this module.
        selector = Selector(virtdb.marc12_selector, exclude_fields=('mode',))

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_xml(selector.xml)

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # show the traceback to the user, framed by horizontal rules
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    response.view = 'harvest/layout.html'
    report = tool.report()
    report['selector'] = selector
    return report
272 273


274
def run():
    """Run the harvester matching the selector.

    The harvester row is searched in the ``harvesters`` table using the
    ``harvester_selector`` values. Its host, collections and category,
    together with the team, project, controller, range of years and mode
    of the selector, steer the harvester tool.

    Returns:
        str or dict:
            * the ``INLINE_ALERT`` template filled with an error message
              when the institute identifier is not set or when no
              harvester tool could be built;
            * the message of a ``ToolException`` passed through ``T``,
              in particular when no harvester matches the selection;
            * the traceback of any other exception, wrapped in HTML;
            * otherwise the report of the tool, completed with the
              ``selector`` and rendered by the ``harvest/layout`` view
              for the extension of the current request.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    try:
        selector = Selector(
            virtdb.harvester_selector,
            exclude_fields=('mode', 'year_start', 'year_end'))

        # the host, the collections and the category come from
        # the first harvester row matching the selector
        row = selector.select(db.harvesters).first()
        if not row:
            raise ToolException(MSG_NO_HARVESTER)

        harvester = row.harvesters

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    harvester.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_url(row.harvesters.host, row.harvesters.collections)

    except ToolException as err:
        return T(str(err))

    except BaseException:
        # show the traceback to the user, framed by horizontal rules
        return ('<br><br><hr/>'
                + CODE(traceback.format_exc()).xml()
                + '<hr/>')

    response.view = 'harvest/layout.%s' % request.extension

    report = tool.report()
    report['selector'] = selector
    return report
326 327


328 329
def run_all():
    """Run, one after the other, all the harvesters matching the selector.

    The rows of the ``harvesters`` table are filtered on the team and on
    the project when the selector defines them. A harvester tool is
    built and run for each row, and the logs of all the tools are
    gathered.

    Returns:
        str or dict:
            * the ``INLINE_ALERT`` template filled with an error message
              when the institute identifier is not set, when no
              harvester row is found or when a harvester tool could not
              be built;
            * the message of a ``ToolException`` passed through ``T``;
            * the traceback of any other exception, wrapped in HTML;
            * otherwise a dictionary with the keys ``collection_logs``,
              ``controller``, ``logs`` and ``selector``, rendered by the
              ``harvest/layout`` view for the extension of the current
              request.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    logs = []
    collection_logs = []

    try:
        selector = Selector(
            virtdb.run_all_harvesters_selector,
            exclude_fields=('mode', 'year_start', 'year_end'))

        # restrict the harvesters to the selected team and / or project
        query = None
        for fieldname in ('id_teams', 'id_projects'):
            if not selector[fieldname]:
                continue

            condition = db.harvesters[fieldname] == selector[fieldname]
            query = (query) & (condition) if query else condition

        harvesters = db(query).select(db.harvesters.ALL)
        if len(harvesters) == 0:
            return INLINE_ALERT % (T('Error'), MSG_NO_HARVESTER)

        for harvester in harvesters:

            tool = build_harvester_tool(db,
                                        harvester.id_teams,
                                        harvester.id_projects,
                                        harvester.controller,
                                        harvester.id_categories,
                                        year_start=selector.year_start,
                                        year_end=selector.year_end,
                                        dry_run=(selector.mode == DRY_RUN),
                                        debug=False)
            if not tool:
                return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

            tool.process_url(harvester.host, harvester.collections)

            # accumulate the logs of the successive harvesters
            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as err:
        return T(str(err))

    except BaseException:
        # show the traceback to the user, framed by horizontal rules
        return ('<br><br><hr/>'
                + CODE(traceback.format_exc()).xml()
                + '<hr/>')

    # tune selector parameters used in the report title
    if query is None:
        selector.id_projects = None

    # delegate rendering to the report view
    response.view = 'harvest/layout.%s' % request.extension

    return {'collection_logs': collection_logs,
            'controller': 'all harvesters',
            'logs': logs,
            'selector': selector}