harvest.py 12 KB
Newer Older
1 2 3 4
""" Harvest Controllers

"""

5 6 7
import traceback
from harvest_tools import (format_author_fr,
                           family_name_fr,
8
                           build_harvester_tool,
9
                           PublicationsTool,
10 11 12 13 14
                           ToolException)
from invenio_tools import (CdsException,
                           CheckAndFix,
                           CheckException,
                           Marc12Exception,
15
                           InvenioStore,
16 17
                           Marc12,
                           OAI_URL)
18 19
from plugin_dbui import (get_id,
                         Selector,
20 21
                         to_formPanel,
                         UNDEF_ID)
22 23 24 25
# Dummy import to validate code in Ninja IDE
if 0:
    from ninja_hack import (virtdb, db, T, CODE, response, request)

26

27
# Message reported when the selection matches no harvester.
MSG_NO_HARVESTER = T("No harvesters for your selection !!!")

# JavaScript snippet opening an Ext JS alert box.
# The two placeholders are filled with the title and the message of the box.
INLINE_ALERT = "<script>Ext.Msg.alert('%s', '%s');</script>"

# Translated label compared to the selector mode to detect a dry run.
DRY_RUN = T("dry run")

33

34
def free_run():
    """Run a free harvester.
    All harvester parameters are defined via the selector.

    @return:
        - the dictionary returned by C{tool.report()}, completed with the
          key C{selector}, rendered by the view C{harvest/layout.html};
        - an inline alert (JavaScript snippet) when a mandatory field of
          the selector is empty or when no harvester tool is built;
        - the translated message of a L{ToolException};
        - the HTML formatted traceback for any other exception.

    """
    table = virtdb.free_harvester_selector

    # all these selector fields have to be filled by the user
    fields = ('collections',
              'controller',
              'host',
              'id_projects',
              'id_teams',
              'id_categories',
              'ratio')

    try:
        selector = Selector(table,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        # stop at the first empty field and tell the user which one it is
        for el in fields:
            if not selector[el]:
                msg = T('All fields of the form have to be defined !!!')
                msg += "<br>"
                msg += T('The field "%s" is missing ...') % T(table[el].label)
                return INLINE_ALERT % (T('Error'), msg)

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_url(selector.host, selector.collections)

    except ToolException as e:
        return T(str(e))

    # Exception, not BaseException: KeyboardInterrupt and SystemExit
    # must not be converted into an HTML report.
    except Exception:
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    response.view = 'harvest/layout.html'
    report = tool.report()
    report['selector'] = selector
    return report
86 87


88 89
def edit_insert():
    """Edit an invenio record and insert it in the database.

    The record designated by the selector (host and record id) is
    retrieved from the invenio store, decoded, and used to preset the
    values of the publications form.

    @note: Recovery procedures are applied to fix basic non-conformity, but
    no checks are run. The user is editing the record to fix problems.

    @return:
        - a dictionary with the keys C{cfg} (configuration of the form)
          and C{values} (values to be loaded in the form);
        - an inline alert (JavaScript snippet) when a mandatory field of
          the selector is empty or when an exception is raised.

    """
    # all these selector fields have to be filled by the user
    fields = ('controller',
              'host',
              'id_projects',
              'id_teams',
              'id_categories',
              'record_id')

    table = virtdb.edit_insert_selector

    try:
        selector = Selector(table)

        # stop at the first empty field and tell the user which one it is
        for el in fields:
            if not selector[el]:
                msg = T('All fields of the form have to be defined !!!')
                msg += "<br>"
                msg += T('The field "%s" is missing ...') % T(table[el].label)
                return INLINE_ALERT % (T('Error'), msg)

        # record
        store = InvenioStore(selector.host)
        xml = store.get_record(selector.record_id)
        decode = Marc12()
        record = decode(xml)[0]

        # form configuration
        cfg = to_formPanel(db.publications)

        # tools to extract values to be loaded in the form
        values = {}
        check = CheckAndFix()
        tool = PublicationsTool(db, selector)

        # title, preprint, URL, report number
        values['PublicationsTitle'] = record.title()
        values['PublicationsPreprint'] = record.preprint_number()
        values['PublicationsPublication_url'] = record.paper_url()
        values['PublicationsReport_numbers'] = record.report_number()

        # authors
        # A failed check is ignored on purpose: the user fixes the
        # remaining problems in the form.
        try:
            check.authors(record)
            check.format_authors(record, format_author_fr)

            check.my_authors(record,
                             reference=tool._my_author_list(record),
                             cmpFct=family_name_fr)

        except CheckException:
            pass

        values['PublicationsFirst_author'] = record.first_author()
        values['PublicationsAuthors'] = record.authors()
        # NOTE(review): my_authors is read as an attribute, not called --
        # presumably set by check.my_authors; confirm.
        values['PublicationsAuthors_institute'] = record.my_authors

        # collaboration
        recId = get_id(db.collaborations, collaboration=record.collaboration())
        values['PublicationsId_collaborations'] = \
            int(recId) if recId else UNDEF_ID

        # teams, project, categories, origin
        values['PublicationsId_categories'] = int(selector.id_categories)
        values['PublicationsId_projects'] = int(selector.id_projects)
        values['PublicationsId_teams'] = int(selector.id_teams)
        values['PublicationsOrigin'] = \
            OAI_URL % (selector.host, selector.record_id)

        # publishers
        if selector.controller in ('articles', 'proceedings'):

            check.format_editor(record)
            recId = get_id(db.publishers, abbreviation=record.paper_editor())
            values['PublicationsId_publishers'] = \
                int(recId) if recId else UNDEF_ID
            values['PublicationsVolume'] = record.paper_volume()
            values['PublicationsPages'] = record.paper_pages()

        # conference
        if selector.controller in ('proceedings', 'talks'):

            # failure ignored on purpose, see the authors section
            try:
                check.conference(record)
            except CheckException:
                pass

            values['PublicationsConference_title'] = record.conference_title()
            values['PublicationsConference_url'] = record.conference_url()
            values['PublicationsConference_dates'] = record.conference_dates()
            values['PublicationsConference_town'] = record.conference_town()

            # NOTE(review): unlike the collaboration and publisher ids,
            # the country id is tested against None and not converted
            # with int() -- confirm this difference is intended.
            recId = get_id(db.countries, country=record.conference_country())
            values['PublicationsId_countries'] = \
                recId if recId is not None else UNDEF_ID

            values['PublicationsConference_speaker'] = record.first_author()

        # thesis
        if selector.controller == 'theses':

            values['PublicationsUniversities'] = record.these_universities()
            values['PublicationsDirectors'] = record.these_directors()
            values['PublicationsDefense'] = record.these_defense()

        # submitted date and year
        # failure ignored on purpose, see the authors section
        try:
            check.submitted(record)
            check.year(record)
        except CheckException:
            pass

        values['PublicationsSubmitted'] = ', '.join(record.submitted())
        values['PublicationsYear'] = record.year()

    except (CdsException, Marc12Exception, ToolException) as e:
        return INLINE_ALERT % (T('Error'), T(str(e)))

    # Exception, not BaseException: KeyboardInterrupt and SystemExit
    # must not be converted into an alert box.
    except Exception as e:
        # for debug when web2py is in debug mode
        print(traceback.format_exc())
        return INLINE_ALERT % (T('Error'), T(str(e)))

    return dict(cfg=cfg, values=values)


219 220
def insert_marcxml():
    """Insert a MarcXML record in the database.

    The XML string is read from the selector field C{xml} and processed
    by the harvester tool built from the other selector fields.

    @return:
        - the dictionary returned by C{tool.report()}, completed with the
          key C{selector}, rendered by the view C{harvest/layout.html};
        - an inline alert (JavaScript snippet) when no harvester tool is
          built;
        - the translated message of a L{ToolException};
        - the HTML formatted traceback for any other exception.

    """
    try:
        # one-element tuple: the trailing comma is required, ('mode') is
        # the plain string 'mode'
        selector = Selector(virtdb.marc12_selector, exclude_fields=('mode',))

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_xml(selector.xml)

    except ToolException as e:
        return T(str(e))

    # Exception, not BaseException: KeyboardInterrupt and SystemExit
    # must not be converted into an HTML report.
    except Exception:
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    response.view = 'harvest/layout.html'
    report = tool.report()
    report['selector'] = selector
    return report
253 254


255
def run():
    """Run an harvester.

    Scan the cds/invenio stores to find articles published during
    a given range of years and for a given team/project.
    Insert them in the database if they don't exist.

    The scanning is steered using the current request arguments as well as
    the harvest parameters associated to this action.

    Search arguments are defined via the harvester selector.

    @return:
        - the dictionary returned by C{tool.report()}, completed with the
          key C{selector}, rendered by the view C{harvest/layout} with
          the extension of the current request;
        - an inline alert (JavaScript snippet) when no harvester tool is
          built;
        - the translated message of a L{ToolException}, which is also
          raised when no harvester row is found;
        - the HTML formatted traceback for any other exception.

    """
    try:
        selector = Selector(virtdb.harvester_selector,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        # Get the host and collections
        row = selector.select(db.harvesters).first()
        if not row:
            raise ToolException(MSG_NO_HARVESTER)

        # the category comes from the harvester row, not from the selector
        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    row.harvesters.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_url(row.harvesters.host, row.harvesters.collections)

    except ToolException as e:
        return T(str(e))

    # Exception, not BaseException: KeyboardInterrupt and SystemExit
    # must not be converted into an HTML report.
    except Exception:
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    response.view = 'harvest/layout.%s' % request.extension
    report = tool.report()
    report['selector'] = selector
    return report
304 305


306 307
def run_all():
    """Run all harvesters in one go.

    The harvesters are restricted to the team and / or to the project
    defined in the selector, when they are set. Logs of the individual
    harvesters are concatenated.

    @return:
        - a dictionary with the keys C{collection_logs}, C{controller},
          C{logs} and C{selector}, rendered by the view C{harvest/layout}
          with the extension of the current request;
        - an inline alert (JavaScript snippet) when no harvester is found
          or when an harvester tool cannot be built;
        - the translated message of a L{ToolException};
        - the HTML formatted traceback for any other exception.

    """
    collection_logs = []
    logs = []

    try:
        selector = Selector(virtdb.run_all_harvesters_selector,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        # AND the constraints on the team and on the project when defined;
        # query stays None when neither is set
        query = None
        for fieldname in ('id_teams', 'id_projects'):
            if selector[fieldname]:
                q = db.harvesters[fieldname] == selector[fieldname]
                if query is not None:
                    query = (query) & (q)
                else:
                    query = q

        harvesters = db(query).select(db.harvesters.ALL)
        if not len(harvesters):
            return INLINE_ALERT % (T('Error'), MSG_NO_HARVESTER)

        for harvester in harvesters:

            tool = build_harvester_tool(db,
                                        harvester.id_teams,
                                        harvester.id_projects,
                                        harvester.controller,
                                        harvester.id_categories,
                                        year_start=selector.year_start,
                                        year_end=selector.year_end,
                                        dry_run=(selector.mode == DRY_RUN),
                                        debug=False)
            if not tool:
                return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

            tool.process_url(harvester.host, harvester.collections)

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as e:
        return T(str(e))

    # Exception, not BaseException: KeyboardInterrupt and SystemExit
    # must not be converted into an HTML report.
    except Exception:
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # tune selector parameters used in the report title
    if query is None:
        selector.id_projects = None

    # delegate rendering to the report view
    response.view = 'harvest/layout.%s' % request.extension
    return dict(collection_logs=collection_logs,
                controller='all harvesters',
                logs=logs,
                selector=selector)