harvest.py 12.4 KB
Newer Older
1 2 3
""" Harvest Controllers

"""
4
import traceback
5

6
from gluon.storage import Storage
7 8
from harvest_tools import (format_author_fr, 
                           family_name_fr, 
9
                           build_harvester_tool,
10 11 12 13 14 15 16 17 18 19 20 21 22
                           PublicationsTool, 
                           ToolException)
from invenio_tools import (CdsException,
                           CheckAndFix,
                           CheckException,
                           Marc12Exception,
                           InvenioStore, 
                           Marc12,
                           OAI_URL)
from plugin_dbui import (get_id, 
                         Selector, 
                         to_formPanel,
                         UNDEF_ID)
23

24
# Message reported when the selection matches no harvester
# (raised by run(), shown in an alert box by run_all()).
MSG_NO_HARVESTER = T("No harvesters for your selection !!!")

# HTML snippet opening an ExtJS alert box.
# It is filled with a (title, message) pair.
INLINE_ALERT = "<script>Ext.Msg.alert('%s', '%s');</script>"

# Value of the selector "mode" field for which the harvester tools
# are built with dry_run=True.
DRY_RUN = current.T("dry run")
29

30
def free_run():
    """Run a free harvester.
    All harvester parameters are defined via the selector.

    The selector fields listed in C{fields} are mandatory.

    @return:
        - an inline alert (HTML string) when a mandatory field is missing
          or when no harvester tool can be built;
        - the translated error message when a C{ToolException} is raised;
        - the formatted traceback (HTML string) for any other exception;
        - otherwise the report of the tool, with the selector stored under
          the key C{selector}, rendered by the view C{harvest/layout.html}.

    """
    table = virtdb.free_harvester_selector
    fields = ('collections',
              'controller',
              'host',
              'id_projects',
              'id_teams',
              'id_categories',
              'ratio')

    try:
        selector = Selector(table,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        # all the mandatory fields have to be filled by the user
        for el in fields:
            if not selector[el]:
                msg = T('All fields of the form have to be defined !!!')
                msg += "<br>"
                msg += T('The field "%s" is missing ...') % T(table[el].label)
                return INLINE_ALERT % (T('Error'), msg)

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        marc12xmls = tool.process_url(selector.host, selector.collections)

        # the tool is a callable processing one MarcXML string at a time
        for xml in marc12xmls:
            tool(xml)

    except ToolException as e:
        return T(str(e))

    # Exception rather than BaseException: KeyboardInterrupt and SystemExit
    # must not be turned into an HTML traceback.
    except Exception:
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    response.view = 'harvest/layout.html'
    r = tool.report()
    r['selector'] = selector
    return r
85 86


87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
def edit_insert():
    """Edit an invenio record and insert it in the database.

    The record is fetched from the invenio store, decoded and its content
    is used to build the initial values of the publications form.

    @note: Recovery procedures are applied to fix basic non-conformity, but
    no checks are run. The user is editing the record to fix problems.

    @return:
        - an inline alert (HTML string) when a mandatory field of the
          selector is missing or when an exception is raised;
        - otherwise a dictionary with the keys C{cfg}, the configuration
          of the form panel, and C{values}, the values to be loaded in
          the form.

    """
    fields = ('controller',
              'host',
              'id_projects',
              'id_teams',
              'id_categories',
              'record_id')

    table = virtdb.edit_insert_selector

    try:
        selector = Selector(table)

        # all the mandatory fields have to be filled by the user
        for el in fields:
            if not selector[el]:
                msg = T('All fields of the form have to be defined !!!')
                msg += "<br>"
                msg += T('The field "%s" is missing ...') % T(table[el].label)
                return INLINE_ALERT % (T('Error'), msg)

        # record
        store = InvenioStore(selector.host)
        xml = store.get_record(selector.record_id)
        decode = Marc12()
        record = decode(xml)[0]

        # form configuration
        cfg = to_formPanel(db.publications)

        # tools to extract values to be loaded in the form
        values = {}
        check = CheckAndFix()
        tool = PublicationsTool(db, selector)

        # title, preprint, URL, report number
        values['PublicationsTitle'] = record.title()
        values['PublicationsPreprint'] = record.preprint_number()
        values['PublicationsPublication_url'] = record.paper_url()
        values['PublicationsReport_numbers'] = record.report_number()

        # authors
        # best effort: a failing check is ignored since the user
        # will fix the record in the form
        try:
            check.authors(record)
            check.format_authors(record, format_author_fr)

            check.my_authors(record,
                             reference=tool._my_author_list(record),
                             cmpFct=family_name_fr)

        except CheckException:
            pass

        values['PublicationsFirst_author'] = record.first_author()
        values['PublicationsAuthors'] = record.authors()
        values['PublicationsAuthors_institute'] = record.my_authors

        # collaboration
        collaboration_id = get_id(db.collaborations,
                                  collaboration=record.collaboration())
        values['PublicationsId_collaborations'] = \
            (int(collaboration_id) if collaboration_id else UNDEF_ID)

        # teams, project, categories, origin
        values['PublicationsId_categories'] = int(selector.id_categories)
        values['PublicationsId_projects'] = int(selector.id_projects)
        values['PublicationsId_teams'] = int(selector.id_teams)
        values['PublicationsOrigin'] = \
            OAI_URL % (selector.host, selector.record_id)

        # publishers
        if selector.controller in ('articles', 'proceedings'):

            check.format_editor(record)
            publisher_id = get_id(db.publishers,
                                  abbreviation=record.paper_editor())
            values['PublicationsId_publishers'] = \
                (int(publisher_id) if publisher_id else UNDEF_ID)
            values['PublicationsVolume'] = record.paper_volume()
            values['PublicationsPages'] = record.paper_pages()

        # conference
        if selector.controller in ('proceedings', 'talks'):

            # best effort, see the authors section
            try:
                check.conference(record)
            except CheckException:
                pass

            values['PublicationsConference_title'] = record.conference_title()
            values['PublicationsConference_url'] = record.conference_url()
            values['PublicationsConference_dates'] = record.conference_dates()
            values['PublicationsConference_town'] = record.conference_town()

            # NOTE(review): unlike the collaboration and publisher lookups,
            # the identifier is not cast with int() and only None is mapped
            # to UNDEF_ID -- confirm that this difference is intended.
            country_id = get_id(db.countries,
                                country=record.conference_country())
            values['PublicationsId_countries'] = \
                (country_id if country_id is not None else UNDEF_ID)

            values['PublicationsConference_speaker'] = record.first_author()

        # thesis
        if selector.controller == 'theses':

            values['PublicationsUniversities'] = record.these_universities()
            values['PublicationsDirectors'] = record.these_directors()
            values['PublicationsDefense'] = record.these_defense()

        # submitted date and year
        # best effort, see the authors section
        try:
            check.submitted(record)
            check.year(record)
        except CheckException:
            pass

        values['PublicationsSubmitted'] = ', '.join(record.submitted())
        values['PublicationsYear'] = record.year()

    except (CdsException, Marc12Exception, ToolException) as e:
        return INLINE_ALERT % (T('Error'), T(str(e)))

    # Exception rather than BaseException: KeyboardInterrupt and SystemExit
    # must not be turned into an alert box.
    except Exception as e:
        # for debug when web2py is in debug mode
        print(traceback.format_exc())
        return INLINE_ALERT % (T('Error'), T(str(e)))

    return dict(cfg=cfg, values=values)


214 215
def insert_marcxml():
    """Insert a MarcXML record in the database.

    The MarcXML string is read from the field C{xml} of the selector and
    processed by the harvester tool associated to the selected controller.

    @return:
        - an inline alert (HTML string) when no controller is selected
          or when no harvester tool can be built;
        - the translated error message when a C{ToolException} is raised;
        - the formatted traceback (HTML string) for any other exception;
        - otherwise the report of the tool, with the selector stored under
          the key C{selector}, rendered by the view C{harvest/layout.html}.

    """
    try:
        # exclude_fields has to be a tuple, ('mode') is a plain string
        selector = Selector(virtdb.marc12_selector, exclude_fields=('mode',))

        # The controller is tested directly. The former lookup relied on
        # get_harvester_tool, which is not imported in this module;
        # an unknown controller is caught by the "if not tool" test below.
        if not selector.controller:
            return INLINE_ALERT % (T('Error'), T('Select a controller.'))

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool(selector.xml)

    except ToolException as e:
        return T(str(e))

    # Exception rather than BaseException: KeyboardInterrupt and SystemExit
    # must not be turned into an HTML traceback.
    except Exception:
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    response.view = 'harvest/layout.html'
    r = tool.report()
    r['selector'] = selector
    return r
252 253


254
def run():
    """Run an harvester.

    Scan the cds/invenio stores to find articles published during
    a given range of years and for a given team/project.
    Insert them in the database if they don't exist.

    The scanning is steered using the current request arguments as well as
    the harvest parameters associated to this action.

    Search arguments are defined via the harvester selector.

    @return:
        - an inline alert (HTML string) when no harvester tool can be built;
        - the translated error message when a C{ToolException} is raised,
          in particular when no harvester matches the selection;
        - the formatted traceback (HTML string) for any other exception;
        - otherwise the report of the tool, with the selector stored under
          the key C{selector}, rendered by the view C{harvest/layout}
          matching the extension of the request.

    """
    try:
        selector = Selector(virtdb.harvester_selector,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        # Get the host and collections
        row = selector.select(db.harvesters).first()
        if not row:
            raise ToolException(MSG_NO_HARVESTER)

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    row.harvesters.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)
        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        marc12xmls = tool.process_url(row.harvesters.host,
                                      row.harvesters.collections)

        # the tool is a callable processing one MarcXML string at a time
        for xml in marc12xmls:
            tool(xml)

    except ToolException as e:
        return T(str(e))

    # Exception rather than BaseException: KeyboardInterrupt and SystemExit
    # must not be turned into an HTML traceback.
    except Exception:
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    response.view = 'harvest/layout.%s' % request.extension
    r = tool.report()
    r['selector'] = selector
    return r
306 307


308 309 310 311
def run_all():
    """Run all harvesters in one go.

    The harvesters are those of the table C{db.harvesters}, optionally
    restricted to the team and / or the project chosen in the selector.
    The logs of the individual tools are concatenated.

    @return:
        - an inline alert (HTML string) when no harvester matches the
          selection or when a harvester tool cannot be built;
        - the translated error message when a C{ToolException} is raised;
        - the formatted traceback (HTML string) for any other exception;
        - otherwise a dictionary with the keys C{collection_logs},
          C{controller}, C{logs} and C{selector}, rendered by the view
          C{harvest/layout} matching the extension of the request.

    """
    collection_logs = []
    logs = []

    try:
        selector = Selector(virtdb.run_all_harvesters_selector,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        # restrict the harvesters to the selected team and / or project;
        # the query stays None when neither of them is selected
        query = None
        for fieldname in ('id_teams', 'id_projects'):
            if selector[fieldname]:
                q = db.harvesters[fieldname] == selector[fieldname]
                if query is None:
                    query = q
                else:
                    query = (query) & (q)

        harvesters = db(query).select(db.harvesters.ALL)
        if not len(harvesters):
            return INLINE_ALERT % (T('Error'), MSG_NO_HARVESTER)

        for harvester in harvesters:

            tool = build_harvester_tool(db,
                                        harvester.id_teams,
                                        harvester.id_projects,
                                        harvester.controller,
                                        harvester.id_categories,
                                        year_start=selector.year_start,
                                        year_end=selector.year_end,
                                        dry_run=(selector.mode == DRY_RUN),
                                        debug=False)
            if not tool:
                return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

            marc12xmls = tool.process_url(harvester.host,
                                          harvester.collections)

            # the tool is a callable processing one MarcXML string at a time
            for xml in marc12xmls:
                tool(xml)

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as e:
        return T(str(e))

    # Exception rather than BaseException: KeyboardInterrupt and SystemExit
    # must not be turned into an HTML traceback.
    except Exception:
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # tune selector parameters used in the report title
    if query is None:
        selector.id_projects = None

    # delegate rendering to the report view
    response.view = 'harvest/layout.%s' % request.extension
    return dict(collection_logs=collection_logs,
                controller='all harvesters',
                logs=logs,
                selector=selector)