""" Harvest Controllers """ import traceback from gluon import current from gluon.restricted import RestrictedError from harvest_tools import (Automaton, build_harvester_tool, format_author_fr, family_name_fr, ToolException) from invenio_tools import (CheckAndFix, CheckException, load_record, OAI_URL, RecordConf, RecordThesis) from plugin_dbui import (get_id, INLINE_ALERT, Selector, to_formPanel, UNDEF_ID) DRY_RUN = T("dry run") MSG_NO_REG_INSTITUTE = T("Preference REG_INSTITUTE is not defined.") MSG_NO_HARVESTER = T("No harvesters for your selection !!!") def free_run(): """Run a free harvester. All harvester parameters are defined via the selector. """ if not current.app.inspirehep_institute_id: return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE) table = virtdb.free_harvester_selector fields = ('collections', 'controller', 'host', 'id_projects', 'id_teams', 'id_categories', 'ratio') try: selector = Selector(table, exclude_fields=('mode', 'year_start', 'year_end')) for el in fields: if not selector[el]: msg = T('All fields of the form have to be defined !!!') msg += "
" msg += T('The field "%s" is missing ...') % T(table[el].label) return INLINE_ALERT % (T('Error'), msg) tool = build_harvester_tool(db, selector.id_teams, selector.id_projects, selector.controller, selector.id_categories, year_start=selector.year_start, year_end=selector.year_end, dry_run=(selector.mode == DRY_RUN), debug=False) if not tool: return INLINE_ALERT % (T('Error'), T('Select an harvester.')) tool.process_url(selector.host, selector.collections) except ToolException as e: return T(str(e)) except BaseException as e: msg = '


' msg += CODE(traceback.format_exc()).xml() msg += '
' return msg response.view = 'harvest/layout.html' report = tool.report() report['selector'] = selector return report def edit_insert(): """Edit an invenio record and insert it in the database. @note: Recovery procedures are applied to fix basic non-conformity, but no checks are run. The user is editing the record to fix problems. """ if not current.app.inspirehep_institute_id: return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE) fields = ('controller', 'host', 'id_projects', 'id_teams', 'id_categories', 'record_id') table = virtdb.edit_insert_selector try: selector = Selector(table) for el in fields: if not selector[el]: msg = T('All fields of the form have to be defined !!!') msg += "
" msg += T('The field "%s" is missing ...') % T(table[el].label) return INLINE_ALERT % (T('Error'), msg) # record record = load_record(selector.host, selector.record_id) # form configuration cfg = to_formPanel(db.publications) # tools to extract values to be loaded in the form values = {} check = CheckAndFix() # NOTE # publication tool is only require to extract the list of my authors tool = Automaton(db, selector.id_teams, selector.id_projects, selector.controller, selector.id_categories, dry_run=True, debug=False) # title, preprint, URL, report number values['PublicationsTitle'] = record.title() values['PublicationsPreprint'] = record.preprint_number() values['PublicationsPublication_url'] = record.paper_url() values['PublicationsReport_numbers'] = record.report_number() # authors try: check.authors(record) check.format_authors(record, format_author_fr) check.my_authors(record, reference=tool._my_author_list(record), cmpFct=family_name_fr) except CheckException: pass values['PublicationsFirst_author'] = record.first_author() values['PublicationsAuthors'] = record.authors() values['PublicationsAuthors_institute'] = record.my_authors # collaboration recId = get_id(db.collaborations, collaboration=record.collaboration()) values['PublicationsId_collaborations'] = int(recId) if recId else UNDEF_ID # teams, project, categories, origin values['PublicationsId_categories'] = int(selector.id_categories) values['PublicationsId_projects'] = int(selector.id_projects) values['PublicationsId_teams'] = int(selector.id_teams) values['PublicationsOrigin'] = OAI_URL % (selector.host, selector.record_id) # publishers if selector.controller in ('articles', 'proceedings'): check.clean_erratum(record) check.format_editor(record) recId = get_id(db.publishers, abbreviation=record.paper_editor()) values['PublicationsId_publishers'] = int(recId) if recId else UNDEF_ID values['PublicationsVolume'] = record.paper_volume() values['PublicationsPages'] = record.paper_pages() # conference if selector.controller in ('proceedings', 'talks'): try: check.conference(record) except CheckException: pass if isinstance(record, RecordConf): values['PublicationsConference_title'] = record.conference_title() values['PublicationsConference_url'] = record.conference_url() values['PublicationsConference_dates'] = record.conference_dates() values['PublicationsConference_town'] = record.conference_town() recId = get_id(db.countries, country=record.conference_country()) values['PublicationsId_countries'] = \ recId if recId is not None else UNDEF_ID values['PublicationsConference_speaker'] = record.first_author() # thesis if selector.controller == 'theses': if isinstance(record, RecordThesis): values['PublicationsUniversities'] = record.these_universities() values['PublicationsDirectors'] = record.these_directors() values['PublicationsDefense'] = record.these_defense() # submitted date and year try: check.submitted(record) check.year(record) except CheckException: pass values['PublicationsSubmitted'] = ', '.join(record.submitted()) values['PublicationsYear'] = record.year() except Exception: # log the exception in the web2py ticker system ticket = RestrictedError(layer='harvester.py', code='edit_insert', output='', environment=current.globalenv) ticket.log(request) # inform the user that something went wrong in the server raise HTTP(500) return dict(cfg=cfg, values=values) def insert_marcxml(): """Insert a MarcXML record in the database. """ if not current.app.inspirehep_institute_id: return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE) try: selector = Selector(virtdb.marc12_selector, exclude_fields=('mode')) tool = build_harvester_tool(db, selector.id_teams, selector.id_projects, selector.controller, selector.id_categories, year_start=selector.year_start, year_end=selector.year_end, dry_run=(selector.mode == DRY_RUN), debug=False) if not tool: return INLINE_ALERT % (T('Error'), T('Select an harvester.')) tool.process_xml(selector.xml) except ToolException as e: return T(str(e)) except BaseException as e: msg = '


' msg += CODE(traceback.format_exc()).xml() msg += '
' return msg response.view = 'harvest/layout.html' report = tool.report() report['selector'] = selector return report def run(): """Run an harvester. Scan the cds/invenio stores to find articles published during a given range of years and for a given team/project. Insert them in the database if they don't exist. The scanning is steered using the current request arguments as well as the harvest parameters associated to this action. Search arguments are defined via the harvester selector. """ if not current.app.inspirehep_institute_id: return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE) try: selector = Selector(virtdb.harvester_selector, exclude_fields=('mode', 'year_start', 'year_end')) # Get the host and collections row = selector.select(db.harvesters).first() if not row: raise ToolException(MSG_NO_HARVESTER) tool = build_harvester_tool(db, selector.id_teams, selector.id_projects, selector.controller, row.harvesters.id_categories, year_start=selector.year_start, year_end=selector.year_end, dry_run=(selector.mode == DRY_RUN), debug=False) if not tool: return INLINE_ALERT % (T('Error'), T('Select an harvester.')) tool.process_url(row.harvesters.host, row.harvesters.collections) except ToolException as e: return T(str(e)) except BaseException as e: msg = '


' msg += CODE(traceback.format_exc()).xml() msg += '
' return msg response.view = 'harvest/layout.%s' % request.extension report = tool.report() report['selector'] = selector return report def run_all(): """Run all harvesters in one go. """ if not current.app.inspirehep_institute_id: return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE) collection_logs = [] logs = [] try: selector = Selector(virtdb.run_all_harvesters_selector, exclude_fields=('mode', 'year_start', 'year_end')) query = None for fieldname in ('id_teams', 'id_projects'): if selector[fieldname]: q = db.harvesters[fieldname] == selector[fieldname] if query: query = (query) & (q) else: query = q harvesters = db(query).select(db.harvesters.ALL) if not len(harvesters): return INLINE_ALERT % (T('Error'), MSG_NO_HARVESTER) for harvester in harvesters: tool = build_harvester_tool(db, harvester.id_teams, harvester.id_projects, harvester.controller, harvester.id_categories, year_start=selector.year_start, year_end=selector.year_end, dry_run=(selector.mode == DRY_RUN), debug=False) if not tool: return INLINE_ALERT % (T('Error'), T('Select an harvester.')) tool.process_url(harvester.host, harvester.collections) collection_logs.extend(tool.collection_logs) logs.extend(tool.logs) except ToolException as e: return T(str(e)) except BaseException as e: msg = '


' msg += CODE(traceback.format_exc()).xml() msg += '
' return msg # tune selector parameters used in the report title if query is None: selector.id_projects = None # delegate rendering to the report view response.view = 'harvest/layout.%s' % request.extension return dict(collection_logs=collection_logs, controller='all harvesters', logs=logs, selector=selector)