harvest.py 12.2 KB
Newer Older
1 2 3
""" Harvest Controllers

"""
4
import traceback
5

6
from gluon.storage import Storage
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
from harvest_tools import (format_author_fr, 
                           family_name_fr, 
                           get_harvester_tool,
                           PublicationsTool, 
                           ToolException)
from invenio_tools import (CdsException,
                           CheckAndFix,
                           CheckException,
                           Marc12Exception,
                           InvenioStore, 
                           Marc12,
                           OAI_URL)
from plugin_dbui import (get_id, 
                         Selector, 
                         to_formPanel,
                         UNDEF_ID)
23

24
# Translated message reported when no harvester matches the user selection.
MSG_NO_HARVESTER = T("No harvesters for your selection !!!")
25

26 27
# HTML snippet calling Ext.Msg.alert; the two %s placeholders are filled,
# in that order, with the title and the message of the alert.
INLINE_ALERT = "<script>Ext.Msg.alert('%s', '%s');</script>"

28
# Translated label of the "dry run" mode. The actions compare it with
# selector.mode to compute the dry_run flag given to the harvester tool.
DRY_RUN = current.T("dry run")
29

30
def free_run():
    """Run a free harvester.

    Nothing is read from the harvesters table: the controller, host,
    collections, team, project and category all come from the selector.
    Every field listed in C{required} has to be filled by the user.

    @return:
        - the report of the harvester tool (C{tool.report()}) with the
          selector added under the key C{selector};
        - an inline C{Ext.Msg.alert} script when a required field is empty;
        - the translated message of a C{ToolException};
        - the formatted traceback, as HTML, for any other exception.

    """
    table = virtdb.free_harvester_selector
    required = ('collections',
                'controller',
                'host',
                'id_projects',
                'id_teams',
                'id_categories',
                'ratio')

    try:
        selector = Selector(table,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        # stop at the first required field left empty by the user
        for name in required:
            if selector[name]:
                continue

            alert = T('All fields of the form have to be defined !!!')
            alert += "<br>"
            alert += T('The field "%s" is missing ...') % T(table[name].label)
            return INLINE_ALERT % (T('Error'), alert)

        harvester_class = get_harvester_tool(selector.controller)
        tool = harvester_class(db,
                               selector.id_teams,
                               selector.id_projects,
                               selector.controller,
                               selector.id_categories,
                               host=selector.host,
                               collections=selector.collections,
                               year_start=selector.year_start,
                               year_end=selector.year_end,
                               dry_run=(selector.mode == DRY_RUN),
                               debug=False)
        tool()

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # unexpected failure: send the traceback back to the browser
        return '<br><br><hr/>' + CODE(traceback.format_exc()).xml() + '<hr/>'

    else:
        # delegate the rendering of the report to the layout view
        response.view = 'harvest/layout.html'
        report = tool.report()
        report['selector'] = selector
        return report
82 83


84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
def edit_insert():
    """Edit an invenio record and insert it in the database.

    The record identified by the selector (C{host}, C{record_id}) is read
    from the invenio store and decoded. Its content is used to fill the
    dictionary C{values}, which is returned together with the configuration
    of the publications form.

    @note: Recovery procedures are applied to fix basic non-conformity, but
    no checks are run. The user is editing the record to fix problems.

    @return:
        - C{dict(cfg=cfg, values=values)} on success;
        - an inline C{Ext.Msg.alert} script when a selector field is empty
          or when an exception is raised.

    """
    fields = ('controller',
              'host',
              'id_projects',
              'id_teams',
              'id_categories',
              'record_id')

    table = virtdb.edit_insert_selector

    try:
        selector = Selector(table)

        # all the selector fields are mandatory
        for el in fields:
            if not selector[el]:
                msg = T('All fields of the form have to be defined !!!')
                msg += "<br>"
                msg += T('The field "%s" is missing ...') % T(table[el].label)
                return INLINE_ALERT % (T('Error'), msg)

        # record
        store = InvenioStore(selector.host)
        xml = store.get_record(selector.record_id)
        decode = Marc12()
        record = decode(xml)[0]

        # form configuration
        cfg = to_formPanel(db.publications)

        # tools to extract values to be loaded in the form
        values = {}
        check = CheckAndFix()
        tool = PublicationsTool(db, selector)

        # title, preprint, URL, report number
        values['PublicationsTitle'] = record.title()
        values['PublicationsPreprint'] = record.preprint_number()
        values['PublicationsPublication_url'] = record.paper_url()
        values['PublicationsReport_numbers'] = record.report_number()

        # authors
        # A failed check is ignored on purpose: the user fixes the
        # remaining problems in the form (see the note of the docstring).
        try:
            check.authors(record)
            check.format_authors(record, format_author_fr)

            check.my_authors(record,
                             reference=tool._my_author_list(record),
                             cmpFct=family_name_fr)

        except CheckException:
            pass

        values['PublicationsFirst_author'] = record.first_author()
        values['PublicationsAuthors'] = record.authors()
        # NOTE(review): my_authors is read as an attribute, not called --
        # presumably it is set by check.my_authors; to be confirmed.
        values['PublicationsAuthors_institute'] = record.my_authors

        # collaboration
        collaboration_id = get_id(db.collaborations,
                                  collaboration=record.collaboration())
        values['PublicationsId_collaborations'] = \
            (int(collaboration_id) if collaboration_id else UNDEF_ID)

        # teams, project, categories, origin
        values['PublicationsId_categories'] = int(selector.id_categories)
        values['PublicationsId_projects'] = int(selector.id_projects)
        values['PublicationsId_teams'] = int(selector.id_teams)
        values['PublicationsOrigin'] = \
            OAI_URL % (selector.host, selector.record_id)

        # publishers
        if selector.controller in ('articles', 'proceedings'):

            check.format_editor(record)
            publisher_id = get_id(db.publishers,
                                  abbreviation=record.paper_editor())
            values['PublicationsId_publishers'] = \
                (int(publisher_id) if publisher_id else UNDEF_ID)
            values['PublicationsVolume'] = record.paper_volume()
            values['PublicationsPages'] = record.paper_pages()

        # conference
        if selector.controller in ('proceedings', 'talks'):

            # best effort, as for the authors
            try:
                check.conference(record)
            except CheckException:
                pass

            values['PublicationsConference_title'] = record.conference_title()
            values['PublicationsConference_url'] = record.conference_url()
            values['PublicationsConference_dates'] = record.conference_dates()
            values['PublicationsConference_town'] = record.conference_town()

            # same conversion as for the collaboration and publisher ids
            country_id = get_id(db.countries,
                                country=record.conference_country())
            values['PublicationsId_countries'] = \
                (int(country_id) if country_id else UNDEF_ID)

            values['PublicationsConference_speaker'] = record.first_author()

        # thesis
        if selector.controller == 'theses':

            values['PublicationsUniversities'] = record.these_universities()
            values['PublicationsDirectors'] = record.these_directors()
            values['PublicationsDefense'] = record.these_defense()

        # submitted date and year (best effort, as for the authors)
        try:
            check.submitted(record)
            check.year(record)
        except CheckException:
            pass

        values['PublicationsSubmitted'] = ', '.join(record.submitted())
        values['PublicationsYear'] = record.year()

    except (CdsException, Marc12Exception, ToolException) as e:
        return INLINE_ALERT % (T('Error'), T(str(e)))

    except BaseException as e:
        # for debug when web2py is in debug mode
        print(traceback.format_exc())
        return INLINE_ALERT % (T('Error'), T(str(e)))

    return dict(cfg=cfg, values=values)


211 212
def insert_marcxml():
    """Insert a MarcXML record in the database.

    The harvester tool associated to the selected controller is run on
    the XML string found in the selector (C{selector.xml}).

    @return:
        - the report of the harvester tool (C{tool.report()}) with the
          selector added under the key C{selector};
        - an inline C{Ext.Msg.alert} script when no tool is found for the
          selected controller;
        - the translated message of a C{ToolException};
        - the formatted traceback, as HTML, for any other exception.

    """
    try:
        # exclude_fields has to be a sequence of field names: the trailing
        # comma is required to get a tuple with a single element, since
        # ('mode') is nothing but the string 'mode'.
        selector = Selector(virtdb.marc12_selector, exclude_fields=('mode',))

        tool_class = get_harvester_tool(selector.controller)
        if not tool_class:
            return INLINE_ALERT % (T('Error'), T('Select a controller.'))

        tool = tool_class(db,
                          selector.id_teams,
                          selector.id_projects,
                          selector.controller,
                          selector.id_categories,
                          xml=selector.xml,
                          year_start=selector.year_start,
                          year_end=selector.year_end,
                          dry_run=(selector.mode == DRY_RUN),
                          debug=False)
        tool()

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # unexpected failure: send the traceback back to the browser
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # delegate the rendering of the report to the layout view
    response.view = 'harvest/layout.html'
    r = tool.report()
    r['selector'] = selector
    return r
247 248


249
def run():
    """Run a harvester.

    Scan the cds/invenio stores to find articles published during
    a given range of years and for a given team/project.
    Insert them in the database if they don't exist.

    The search arguments (team, project, controller, years, mode) come
    from the harvester selector. The host, the collections and the
    category come from the first row of the harvesters table matching
    the selector.

    @return:
        - the report of the harvester tool (C{tool.report()}) with the
          selector added under the key C{selector};
        - an inline C{Ext.Msg.alert} script when no tool is found for the
          selected controller;
        - the translated message of a C{ToolException}, which is also
          raised when no harvester matches the selection;
        - the formatted traceback, as HTML, for any other exception.

    """
    try:
        selector = Selector(virtdb.harvester_selector,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        harvester_class = get_harvester_tool(selector.controller)
        if not harvester_class:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        # the harvester definition provides the host and the collections
        first_row = selector.select(db.harvesters).first()
        if not first_row:
            raise ToolException(MSG_NO_HARVESTER)

        definition = first_row.harvesters
        tool = harvester_class(db,
                               selector.id_teams,
                               selector.id_projects,
                               selector.controller,
                               definition.id_categories,
                               host=definition.host,
                               collections=definition.collections,
                               year_start=selector.year_start,
                               year_end=selector.year_end,
                               dry_run=(selector.mode == DRY_RUN),
                               debug=False)
        tool()

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # unexpected failure: send the traceback back to the browser
        return '<br><br><hr/>' + CODE(traceback.format_exc()).xml() + '<hr/>'

    else:
        # the view depends on the extension of the current request
        response.view = 'harvest/layout.%s' % request.extension
        report = tool.report()
        report['selector'] = selector
        return report
301 302


303 304 305 306
def run_all():
    """Run all harvesters in one go.

    The rows of the harvesters table are filtered with the team and / or
    the project of the selector, when they are set. The harvester tool
    associated to each row is run and the logs of all the tools are
    accumulated.

    @return:
        - a dictionary with the keys C{collection_logs}, C{controller},
          C{logs} and C{selector}, rendered by the layout view;
        - an inline C{Ext.Msg.alert} script when no harvester is selected;
        - the translated message of a C{ToolException};
        - the formatted traceback, as HTML, for any other exception.

    """
    collection_logs = []
    logs = []

    try:
        selector = Selector(virtdb.run_all_harvesters_selector,
                            exclude_fields=('mode', 'year_start', 'year_end'))

        # Restrict the harvesters to the selected team and / or project.
        # The query is compared to None by identity since DAL objects
        # overload the comparison operators.
        query = None
        for fieldname in ('id_teams', 'id_projects'):
            if not selector[fieldname]:
                continue

            q = db.harvesters[fieldname] == selector[fieldname]
            query = q if query is None else (query) & (q)

        harvesters = db(query).select(db.harvesters.ALL)
        if not len(harvesters):
            return INLINE_ALERT % (T('Error'), MSG_NO_HARVESTER)

        for harvester in harvesters:

            tool_class = get_harvester_tool(harvester.controller)
            tool = tool_class(db,
                              harvester.id_teams,
                              harvester.id_projects,
                              harvester.controller,
                              harvester.id_categories,
                              host=harvester.host,
                              collections=harvester.collections,
                              year_start=selector.year_start,
                              year_end=selector.year_end,
                              dry_run=(selector.mode == DRY_RUN),
                              debug=False)
            tool()

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as e:
        return T(str(e))

    except BaseException:
        # unexpected failure: send the traceback back to the browser
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # tune selector parameters used in the report title
    if query is None:
        selector.id_projects = None

    # delegate rendering to the report view
    response.view = 'harvest/layout.%s' % request.extension
    return dict(collection_logs=collection_logs,
                controller='all harvesters',
                logs=logs,
                selector=selector)