harvest.py 12.3 KB
Newer Older
1 2 3
""" Harvest Controllers

"""
4
import traceback
5

6
from gluon.storage import Storage
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
from harvest_tools import (format_author_fr, 
                           family_name_fr, 
                           get_harvester_tool,
                           PublicationsTool, 
                           ToolException)
from invenio_tools import (CdsException,
                           CheckAndFix,
                           CheckException,
                           Marc12Exception,
                           InvenioStore, 
                           Marc12,
                           OAI_URL)
from plugin_dbui import (get_id, 
                         Selector, 
                         to_formPanel,
                         UNDEF_ID)
23

24
# Translated message used when no harvester matches the selection:
# raised as a ToolException by run() and shown in an alert by run_all().
MSG_NO_HARVESTER = T("No harvesters for your selection !!!")
25

26 27
# HTML snippet returned by the actions to pop up a message box through
# Ext.Msg.alert. The two placeholders are filled by the actions of this
# file with T('Error') and with the text of the message, in that order.
# NOTE(review): both values are substituted verbatim inside single-quoted
# JavaScript literals, so a text containing a single quote would presumably
# break the script -- confirm whether the messages need to be escaped.
INLINE_ALERT = "<script>Ext.Msg.alert('%s', '%s');</script>"

28
# Translated label compared with selector.mode by the actions of this file
# in order to set the dry_run argument of the harvester tools.
DRY_RUN = current.T("dry run")
29

30
def free_run():
    """Run a free harvester.

    Every parameter of the harvester is taken from the free harvester
    selector. An inline alert is returned as soon as one of the mandatory
    fields of the selector is empty.

    On success the view is delegated to harvest/layout.html and the report
    of the tool is returned, completed with the selector. A string is
    returned when an exception is raised: the translated message for a
    ToolException, the HTML formatted traceback otherwise.

    """
    table = virtdb.free_harvester_selector
    mandatory = ('collections',
                 'controller',
                 'host',
                 'id_projects',
                 'id_teams',
                 'id_categories',
                 'ratio')

    try:
        selector = Selector(
            table,
            exclude_fields=('mode', 'year_start', 'year_end'))

        # the user has to fill all the fields of the form
        for name in mandatory:
            if selector[name]:
                continue

            alert = T('All fields of the form have to be defined !!!')
            alert += "<br>"
            alert += T('The field "%s" is missing ...') % T(table[name].label)
            return INLINE_ALERT % (T('Error'), alert)

        # instantiate the tool associated to the controller and run it
        harvester_cls = get_harvester_tool(selector.controller)
        options = dict(host=selector.host,
                       collections=selector.collections,
                       year_start=selector.year_start,
                       year_end=selector.year_end,
                       dry_run=(selector.mode == DRY_RUN),
                       debug=False)

        tool = harvester_cls(db,
                             selector.id_teams,
                             selector.id_projects,
                             selector.controller,
                             selector.id_categories,
                             **options)
        tool()

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # any other failure is reported with its traceback
        return '<br><br><hr/>' + CODE(traceback.format_exc()).xml() + '<hr/>'

    response.view = 'harvest/layout.html'
    report = tool.report()
    report['selector'] = selector
    return report
85 86


87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
def edit_insert():
    """Edit an invenio record and insert it in the database.

    The record designated by the selector (host and record_id) is
    retrieved from the invenio store and decoded. Its content is used to
    prepare the values to be loaded in the publications form.

    @note: Recovery procedures are applied to fix basic non-conformity, but
    no checks are run. The user is editing the record to fix problems.

    @return: a dictionary with the keys C{cfg}, the configuration of the
    form built from db.publications, and C{values}, the values to be loaded
    in the form. An inline alert (string) is returned instead when a field
    of the selector is empty or when an exception is raised.

    """
    fields = ('controller',
              'host',
              'id_projects',
              'id_teams',
              'id_categories',
              'record_id')

    table = virtdb.edit_insert_selector

    try:
        selector = Selector(table)

        # the user has to fill all the fields of the form
        for el in fields:
            if not selector[el]:
                msg = T('All fields of the form have to be defined !!!')
                msg += "<br>"
                msg += T('The field "%s" is missing ...') % T(table[el].label)
                return INLINE_ALERT % (T('Error'), msg)

        # record
        store = InvenioStore(selector.host)
        xml = store.get_record(selector.record_id)
        decode = Marc12()
        record = decode(xml)[0]

        # form configuration
        cfg = to_formPanel(db.publications)

        # tools to extract values to be loaded in the form
        values = {}
        check = CheckAndFix()
        tool = PublicationsTool(db, selector)

        # title, preprint, URL, report number
        values['PublicationsTitle'] = record.title()
        values['PublicationsPreprint'] = record.preprint_number()
        values['PublicationsPublication_url'] = record.paper_url()
        values['PublicationsReport_numbers'] = record.report_number()

        # authors
        # a failing check is ignored on purpose since the user is going
        # to fix the record by hand in the form
        try:
            check.authors(record)
            check.format_authors(record, format_author_fr)

            check.my_authors(record,
                             reference=tool._my_author_list(record),
                             cmpFct=family_name_fr)

        except CheckException:
            pass

        values['PublicationsFirst_author'] = record.first_author()
        values['PublicationsAuthors'] = record.authors()
        values['PublicationsAuthors_institute'] = record.my_authors

        # collaboration
        # (the local name rec_id avoids shadowing the builtin id)
        rec_id = get_id(db.collaborations,
                        collaboration=record.collaboration())
        values['PublicationsId_collaborations'] = \
            (int(rec_id) if rec_id else UNDEF_ID)

        # teams, project, categories, origin
        values['PublicationsId_categories'] = int(selector.id_categories)
        values['PublicationsId_projects'] = int(selector.id_projects)
        values['PublicationsId_teams'] = int(selector.id_teams)
        values['PublicationsOrigin'] = \
            OAI_URL % (selector.host, selector.record_id)

        # publishers
        if selector.controller in ('articles', 'proceedings'):

            check.format_editor(record)
            rec_id = get_id(db.publishers,
                            abbreviation=record.paper_editor())
            values['PublicationsId_publishers'] = \
                (int(rec_id) if rec_id else UNDEF_ID)
            values['PublicationsVolume'] = record.paper_volume()
            values['PublicationsPages'] = record.paper_pages()

        # conference
        if selector.controller in ('proceedings', 'talks'):

            try:
                check.conference(record)
            except CheckException:
                pass

            values['PublicationsConference_title'] = record.conference_title()
            values['PublicationsConference_url'] = record.conference_url()
            values['PublicationsConference_dates'] = record.conference_dates()
            values['PublicationsConference_town'] = record.conference_town()

            # None is tested by identity rather than with the != operator
            rec_id = get_id(db.countries, country=record.conference_country())
            values['PublicationsId_countries'] = \
                (rec_id if rec_id is not None else UNDEF_ID)

            values['PublicationsConference_speaker'] = record.first_author()

        # thesis
        if selector.controller == 'theses':

            values['PublicationsUniversities'] = record.these_universities()
            values['PublicationsDirectors'] = record.these_directors()
            values['PublicationsDefense'] = record.these_defense()

        # submitted date and year
        try:
            check.submitted(record)
            check.year(record)
        except CheckException:
            pass

        values['PublicationsSubmitted'] = ', '.join(record.submitted())
        values['PublicationsYear'] = record.year()

    except (CdsException, Marc12Exception, ToolException) as e:
        return INLINE_ALERT % (T('Error'), T(str(e)))

    except BaseException as e:
        # for debug when web2py is in debug mode
        # (a single parenthesised argument prints the same text with the
        # print statement and with the print function)
        print(traceback.format_exc())
        return INLINE_ALERT % (T('Error'), T(str(e)))

    return dict(cfg=cfg, values=values)


214 215
def insert_marcxml():
    """Insert a MarcXML record in the database.

    The harvester tool associated to the selected controller is run on
    the C{xml} field of the marc12 selector.

    @return: the report of the tool completed with the selector and
    rendered by the view harvest/layout.html. A string is returned instead
    when no tool is associated to the controller (inline alert) or when an
    exception is raised: the translated message for a ToolException, the
    HTML formatted traceback otherwise.

    """
    try:
        # the trailing comma is required to get a tuple with one element:
        # ('mode') is a mere string equal to 'mode'
        selector = Selector(virtdb.marc12_selector, exclude_fields=('mode',))

        tool_class = get_harvester_tool(selector.controller)
        if not tool_class:
            return INLINE_ALERT % (T('Error'), T('Select a controller.'))

        tool = tool_class(db,
                          selector.id_teams,
                          selector.id_projects,
                          selector.controller,
                          selector.id_categories,
                          year_start=selector.year_start,
                          year_end=selector.year_end,
                          dry_run=(selector.mode == DRY_RUN),
                          debug=False)
        tool(selector.xml)

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # any other failure is reported with its traceback
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    response.view = 'harvest/layout.html'
    r = tool.report()
    r['selector'] = selector
    return r
249 250


251
def run():
    """Run a harvester.

    Scan the cds/invenio stores to find the articles published during
    a given range of years and for a given team/project.
    Insert them in the database if they don't exist.

    The scan is steered by the harvester selector. The host, the
    collections and the category are read from the first row of the
    harvesters table matching the selector.

    The report of the tool, completed with the selector, is returned and
    rendered by the view harvest/layout for the extension of the request.
    A string is returned when no tool is associated to the controller
    (inline alert) or when an exception is raised: the translated message
    for a ToolException, the HTML formatted traceback otherwise.

    """
    try:
        selector = Selector(
            virtdb.harvester_selector,
            exclude_fields=('mode', 'year_start', 'year_end'))

        harvester_cls = get_harvester_tool(selector.controller)
        if not harvester_cls:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        # the harvester defines the host, the collections and the category
        row = selector.select(db.harvesters).first()
        if not row:
            raise ToolException(MSG_NO_HARVESTER)

        harvester = row.harvesters
        options = dict(host=harvester.host,
                       collections=harvester.collections,
                       year_start=selector.year_start,
                       year_end=selector.year_end,
                       dry_run=(selector.mode == DRY_RUN),
                       debug=False)

        tool = harvester_cls(db,
                             selector.id_teams,
                             selector.id_projects,
                             selector.controller,
                             harvester.id_categories,
                             **options)
        tool()

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # any other failure is reported with its traceback
        return '<br><br><hr/>' + CODE(traceback.format_exc()).xml() + '<hr/>'

    response.view = 'harvest/layout.%s' % request.extension
    report = tool.report()
    report['selector'] = selector
    return report
303 304


305 306 307 308
def run_all():
    """Run all harvesters in one go.

    The harvesters are read from the harvesters table. They are restricted
    to the team and / or to the project of the selector when these fields
    are defined. Each harvester is run in turn and the logs of the tools
    are accumulated.

    @return: a dictionary with the keys C{collection_logs}, C{controller},
    C{logs} and C{selector}, rendered by the view harvest/layout for the
    extension of the request. A string is returned instead when no
    harvester is found (inline alert) or when an exception is raised:
    the translated message for a ToolException, the HTML formatted
    traceback otherwise.

    """
    collection_logs = []
    logs = []

    try:
        selector = Selector(virtdb.run_all_harvesters_selector,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        # Restrict the harvesters to the selected team and / or project.
        # The query stays None when none of them is selected.
        query = None
        for fieldname in ('id_teams', 'id_projects'):
            if not selector[fieldname]:
                continue

            q = db.harvesters[fieldname] == selector[fieldname]

            # None is tested by identity: the DAL overloads the comparison
            # operators (== builds the query above), so neither the truth
            # value nor the == operator are relied on for a query object
            if query is None:
                query = q
            else:
                query = (query) & (q)

        harvesters = db(query).select(db.harvesters.ALL)
        if not len(harvesters):
            return INLINE_ALERT % (T('Error'), MSG_NO_HARVESTER)

        for harvester in harvesters:

            tool_class = get_harvester_tool(harvester.controller)
            tool = tool_class(db,
                              harvester.id_teams,
                              harvester.id_projects,
                              harvester.controller,
                              harvester.id_categories,
                              host=harvester.host,
                              collections=harvester.collections,
                              year_start=selector.year_start,
                              year_end=selector.year_end,
                              dry_run=(selector.mode == DRY_RUN),
                              debug=False)
            tool()

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # any other failure is reported with its traceback
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # tune selector parameters used in the report title
    if query is None:
        selector.id_projects = None

    # delegate rendering to the report view
    response.view = 'harvest/layout.%s' % request.extension
    return dict(collection_logs=collection_logs,
                controller='all harvesters',
                logs=logs,
                selector=selector)