harvest.py 12.6 KB
Newer Older
1 2 3 4
""" Harvest Controllers

"""

5
import traceback
6 7

from gluon import current
8 9
from harvest_tools import (format_author_fr,
                           family_name_fr,
10
                           build_harvester_tool,
11
                           PublicationsTool,
12 13 14 15 16
                           ToolException)
from invenio_tools import (CdsException,
                           CheckAndFix,
                           CheckException,
                           Marc12Exception,
17
                           InvenioStore,
18 19
                           Marc12,
                           OAI_URL)
20
from plugin_dbui import (get_id,
21
                         INLINE_ALERT,
22
                         Selector,
23 24
                         to_formPanel,
                         UNDEF_ID)
25

26
# Label of the "dry run" mode, passed through T(). The controllers compare
# selector.mode against it to set the dry_run flag of the harvester tool.
DRY_RUN = T("dry run")
27 28
# Alert text returned by the controllers when current.app.reg_institute
# is not set.
MSG_NO_REG_INSTITUTE = T("Preference REG_INSTITUTE is not defined.")
# Message used by run() and run_all() when the selection matches no harvester.
MSG_NO_HARVESTER = T("No harvesters for your selection !!!")
29

30

31
def free_run():
    """Run a free harvester.

    Every harvester parameter is read from the free harvester selector and
    all the fields listed in ``required`` have to be filled in the form.

    Returns:
        the object returned by ``tool.report()``, completed with the
        ``selector`` entry and rendered by the ``harvest/layout.html`` view.
        When something goes wrong a string is returned instead: an inline
        alert, the translated tool error or the formatted traceback.

    """
    if not current.app.reg_institute:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    table = virtdb.free_harvester_selector
    required = ('collections',
                'controller',
                'host',
                'id_projects',
                'id_teams',
                'id_categories',
                'ratio')

    try:
        selector = Selector(
            table,
            exclude_fields=('mode', 'year_start', 'year_end'))

        # look for the first form field left empty
        missing = next((el for el in required if not selector[el]), None)
        if missing is not None:
            msg = (T('All fields of the form have to be defined !!!') +
                   "<br>" +
                   T('The field "%s" is missing ...') %
                   T(table[missing].label))
            return INLINE_ALERT % (T('Error'), msg)

        options = dict(year_start=selector.year_start,
                       year_end=selector.year_end,
                       dry_run=(selector.mode == DRY_RUN),
                       debug=False)

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    **options)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_url(selector.host, selector.collections)

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # show the full traceback in the page
        return ('<br><br><hr/>' +
                CODE(traceback.format_exc()).xml() +
                '<hr/>')

    else:
        # the scan went fine: delegate the rendering to the report view
        response.view = 'harvest/layout.html'
        report = tool.report()
        report['selector'] = selector
        return report
86 87


88 89
def edit_insert():
    """Edit an invenio record and insert it in the database.

    The record is fetched from the store given by the selector and decoded.
    Its content is then used to fill the values of the publication form.

    @note: Recovery procedures are applied to fix basic non-conformity, but
    no checks are run. The user is editing the record to fix problems.

    Returns:
        a dictionary with the keys ``cfg`` (form configuration built by
        ``to_formPanel``) and ``values`` (values to be loaded in the form),
        or an inline alert string when something goes wrong.

    """
    if not current.app.reg_institute:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    required = ('controller',
                'host',
                'id_projects',
                'id_teams',
                'id_categories',
                'record_id')

    table = virtdb.edit_insert_selector

    try:
        selector = Selector(table)

        # look for the first form field left empty
        missing = next((el for el in required if not selector[el]), None)
        if missing is not None:
            msg = (T('All fields of the form have to be defined !!!') +
                   "<br>" +
                   T('The field "%s" is missing ...') %
                   T(table[missing].label))
            return INLINE_ALERT % (T('Error'), msg)

        # get the record from the store and decode it
        xml = InvenioStore(selector.host).get_record(selector.record_id)
        record = Marc12()(xml)[0]

        # form configuration
        cfg = to_formPanel(db.publications)

        # tools to extract values to be loaded in the form
        check = CheckAndFix()
        tool = PublicationsTool(db, selector)

        # title, preprint, URL, report number
        values = {
            'PublicationsTitle': record.title(),
            'PublicationsPreprint': record.preprint_number(),
            'PublicationsPublication_url': record.paper_url(),
            'PublicationsReport_numbers': record.report_number()}

        # authors (a failing check is ignored, see the note in the docstring)
        try:
            check.authors(record)
            check.format_authors(record, format_author_fr)
            check.my_authors(record,
                             reference=tool._my_author_list(record),
                             cmpFct=family_name_fr)

        except CheckException:
            pass

        values.update({
            'PublicationsFirst_author': record.first_author(),
            'PublicationsAuthors': record.authors(),
            'PublicationsAuthors_institute': record.my_authors})

        # collaboration
        collaboration_id = get_id(db.collaborations,
                                  collaboration=record.collaboration())
        if collaboration_id:
            values['PublicationsId_collaborations'] = int(collaboration_id)
        else:
            values['PublicationsId_collaborations'] = UNDEF_ID

        # teams, project, categories, origin
        values.update({
            'PublicationsId_categories': int(selector.id_categories),
            'PublicationsId_projects': int(selector.id_projects),
            'PublicationsId_teams': int(selector.id_teams),
            'PublicationsOrigin':
                OAI_URL % (selector.host, selector.record_id)})

        # publishers
        if selector.controller in ('articles', 'proceedings'):

            check.clean_erratum(record)
            check.format_editor(record)

            publisher_id = get_id(db.publishers,
                                  abbreviation=record.paper_editor())
            if publisher_id:
                values['PublicationsId_publishers'] = int(publisher_id)
            else:
                values['PublicationsId_publishers'] = UNDEF_ID

            values['PublicationsVolume'] = record.paper_volume()
            values['PublicationsPages'] = record.paper_pages()

        # conference
        if selector.controller in ('proceedings', 'talks'):

            try:
                check.conference(record)
            except CheckException:
                pass

            values.update({
                'PublicationsConference_title': record.conference_title(),
                'PublicationsConference_url': record.conference_url(),
                'PublicationsConference_dates': record.conference_dates(),
                'PublicationsConference_town': record.conference_town()})

            country_id = get_id(db.countries,
                                country=record.conference_country())
            if country_id is None:
                values['PublicationsId_countries'] = UNDEF_ID
            else:
                values['PublicationsId_countries'] = country_id

            values['PublicationsConference_speaker'] = record.first_author()

        # thesis
        if selector.controller == 'theses':
            values.update({
                'PublicationsUniversities': record.these_universities(),
                'PublicationsDirectors': record.these_directors(),
                'PublicationsDefense': record.these_defense()})

        # submitted date and year
        try:
            check.submitted(record)
            check.year(record)
        except CheckException:
            pass

        values['PublicationsSubmitted'] = ', '.join(record.submitted())
        values['PublicationsYear'] = record.year()

    except (CdsException, Marc12Exception, ToolException) as e:
        return INLINE_ALERT % (T('Error'), T(str(e)))

    except BaseException as e:
        # for debug when web2py is in debug mode
        print(traceback.format_exc())
        return INLINE_ALERT % (T('Error'), T(str(e)))

    return {'cfg': cfg, 'values': values}


221 222
def insert_marcxml():
    """Insert a MarcXML record in the database.

    The harvester tool is configured from the values of the
    ``marc12_selector`` and processes the XML found in ``selector.xml``.

    Returns:
        the object returned by ``tool.report()``, completed with the
        ``selector`` entry and rendered by the ``harvest/layout.html`` view.
        When something goes wrong a string is returned instead: an inline
        alert, the translated tool error or the formatted traceback.

    """
    if not current.app.reg_institute:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    try:
        # exclude_fields has to be a sequence of field names, as in the other
        # controllers: ('mode',) is a one-element tuple whereas ('mode') is
        # only the string 'mode'.
        selector = Selector(virtdb.marc12_selector, exclude_fields=('mode',))

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_xml(selector.xml)

    except ToolException as e:
        return T(str(e))

    except BaseException as e:
        # show the full traceback in the page
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # delegate rendering to the report view
    response.view = 'harvest/layout.html'
    report = tool.report()
    report['selector'] = selector
    return report
258 259


260
def run():
    """Run an harvester.

    Scan the cds/invenio stores to find articles published during
    a given range of years and for a given team/project.
    Insert them in the database if they don't exist.

    The scanning is steered using the current request arguments as well as
    the harvest parameters associated to this action.

    Search arguments are defined via the harvester selector. The host, the
    collections and the category come from the first row of the harvesters
    table matching the selector.

    Returns:
        the object returned by ``tool.report()``, completed with the
        ``selector`` entry and rendered by the ``harvest/layout`` view
        matching the request extension. When something goes wrong a string
        is returned instead: an inline alert, the translated tool error or
        the formatted traceback.

    """
    if not current.app.reg_institute:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    try:
        selector = Selector(
            virtdb.harvester_selector,
            exclude_fields=('mode', 'year_start', 'year_end'))

        # Get the host and collections
        row = selector.select(db.harvesters).first()
        if not row:
            raise ToolException(MSG_NO_HARVESTER)

        harvester = row.harvesters

        options = dict(year_start=selector.year_start,
                       year_end=selector.year_end,
                       dry_run=(selector.mode == DRY_RUN),
                       debug=False)

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    harvester.id_categories,
                                    **options)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_url(harvester.host, harvester.collections)

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # show the full traceback in the page
        return ('<br><br><hr/>' +
                CODE(traceback.format_exc()).xml() +
                '<hr/>')

    else:
        # the scan went fine: delegate the rendering to the report view
        response.view = 'harvest/layout.%s' % request.extension
        report = tool.report()
        report['selector'] = selector
        return report
312 313


314 315
def run_all():
    """Run all harvesters in one go.

    The harvesters are those of the harvesters table matching the team
    and / or the project chosen in the selector, or all of them when none
    of these two criteria is set. The logs of the individual tools are
    accumulated.

    Returns:
        a dictionary with the keys ``collection_logs``, ``controller``,
        ``logs`` and ``selector``, rendered by the ``harvest/layout`` view
        matching the request extension. When something goes wrong a string
        is returned instead: an inline alert, the translated tool error or
        the formatted traceback.

    """
    if not current.app.reg_institute:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    collection_logs, logs = [], []

    try:
        selector = Selector(
            virtdb.run_all_harvesters_selector,
            exclude_fields=('mode', 'year_start', 'year_end'))

        # AND together the criteria defined in the selector
        query = None
        for fieldname in ('id_teams', 'id_projects'):
            value = selector[fieldname]
            if not value:
                continue

            q = db.harvesters[fieldname] == value
            query = ((query) & (q)) if query else q

        harvesters = db(query).select(db.harvesters.ALL)
        if len(harvesters) == 0:
            return INLINE_ALERT % (T('Error'), MSG_NO_HARVESTER)

        for harvester in harvesters:

            options = dict(year_start=selector.year_start,
                           year_end=selector.year_end,
                           dry_run=(selector.mode == DRY_RUN),
                           debug=False)

            tool = build_harvester_tool(db,
                                        harvester.id_teams,
                                        harvester.id_projects,
                                        harvester.controller,
                                        harvester.id_categories,
                                        **options)
            if not tool:
                return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

            tool.process_url(harvester.host, harvester.collections)

            # accumulate the logs of this harvester
            collection_logs += tool.collection_logs
            logs += tool.logs

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # show the full traceback in the page
        return ('<br><br><hr/>' +
                CODE(traceback.format_exc()).xml() +
                '<hr/>')

    # tune selector parameters used in the report title
    if query is None:
        selector.id_projects = None

    # delegate rendering to the report view
    response.view = 'harvest/layout.%s' % request.extension
    return {'collection_logs': collection_logs,
            'controller': 'all harvesters',
            'logs': logs,
            'selector': selector}