""" Harvest Controllers
"""
import traceback
from gluon import current
from gluon.restricted import RestrictedError
from harvest_tools import (Automaton,
build_harvester_tool,
format_author_fr,
family_name_fr,
ToolException)
from invenio_tools import (CheckAndFix,
CheckException,
load_record,
OAI_URL,
RecordConf,
RecordThesis)
from plugin_dbui import (get_id,
INLINE_ALERT,
Selector,
to_formPanel,
UNDEF_ID)
# Value of the selector field "mode" requesting a dry run. The controllers
# compare it with selector.mode to set the dry_run flag of the harvester tool.
DRY_RUN = T("dry run")

# Alert shown by every controller when current.app.inspirehep_institute_id
# is not set.
MSG_NO_REG_INSTITUTE = T("Preference REG_INSTITUTE is not defined.")

# Message used when the selection matches no entry of db.harvesters.
MSG_NO_HARVESTER = T("No harvesters for your selection !!!")
def free_run():
    """Run a free harvester.

    All harvester parameters are defined via the selector
    ``virtdb.free_harvester_selector``. Every field listed in ``fields``
    has to be filled in by the user.

    Returns:
        * an ``INLINE_ALERT`` string when the institute preference is not
          set, when a required field is empty or when no harvester tool
          could be built;
        * the translated message of a ``ToolException``;
        * a string embedding the formatted traceback for any other error;
        * otherwise the object returned by ``tool.report()`` in which the
          selector is stored under the key ``'selector'``. It is rendered
          by the view ``harvest/layout.html``.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    table = virtdb.free_harvester_selector

    # selector fields which must all be defined to run the harvester
    fields = ('collections',
              'controller',
              'host',
              'id_projects',
              'id_teams',
              'id_categories',
              'ratio')

    try:
        selector = Selector(
            table,
            exclude_fields=('mode', 'year_start', 'year_end'))

        for el in fields:
            if not selector[el]:
                # NOTE(review): the separator is a bare line break in the
                # source; markup may have been lost -- confirm.
                msg = T('All fields of the form have to be defined !!!')
                msg += "\n"
                msg += T('The field "%s" is missing ...') % T(table[el].label)
                return INLINE_ALERT % (T('Error'), msg)

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)

        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_url(selector.host, selector.collections)

    except ToolException as e:
        return T(str(e))

    # Exception rather than BaseException: SystemExit and KeyboardInterrupt
    # must not be turned into a report for the user.
    except Exception:
        # NOTE(review): the text wrapped around the traceback is a bare line
        # break in the source; markup may have been lost -- confirm.
        msg = '\n'
        msg += CODE(traceback.format_exc()).xml()
        msg += '\n'
        return msg

    response.view = 'harvest/layout.html'
    report = tool.report()
    report['selector'] = selector

    return report
def edit_insert():
    """Edit an invenio record and insert it in the database.

    The record is loaded from the selected host, and the values needed to
    pre-fill the ``db.publications`` form are extracted from it.

    @note: Recovery procedures are applied to fix basic non-conformity, but
    no checks are run. The user is editing the record to fix problems.
    This is why every ``CheckException`` is deliberately ignored.

    Returns:
        * an ``INLINE_ALERT`` string when the institute preference is not
          set or when a required selector field is empty;
        * otherwise ``dict(cfg=..., values=...)`` where ``cfg`` is the form
          configuration built by ``to_formPanel`` and ``values`` maps the
          form field names to the values extracted from the record.

    Raises:
        HTTP: status 500 when an unexpected error occurs. The error is
            logged beforehand as a web2py ticket.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    # selector fields which must all be defined
    fields = ('controller',
              'host',
              'id_projects',
              'id_teams',
              'id_categories',
              'record_id')

    table = virtdb.edit_insert_selector

    try:
        selector = Selector(table)

        for el in fields:
            if not selector[el]:
                # NOTE(review): the separator is a bare line break in the
                # source; markup may have been lost -- confirm.
                msg = T('All fields of the form have to be defined !!!')
                msg += "\n"
                msg += T('The field "%s" is missing ...') % T(table[el].label)
                return INLINE_ALERT % (T('Error'), msg)

        # record
        record = load_record(selector.host, selector.record_id)

        # form configuration
        cfg = to_formPanel(db.publications)

        # tools to extract values to be loaded in the form
        values = {}
        check = CheckAndFix()

        # NOTE
        # the publication tool is only required to extract the list
        # of my authors
        tool = Automaton(db,
                         selector.id_teams,
                         selector.id_projects,
                         selector.controller,
                         selector.id_categories,
                         dry_run=True,
                         debug=False)

        # title, preprint, URL, report number
        values['PublicationsTitle'] = record.title()
        values['PublicationsPreprint'] = record.preprint_number()
        values['PublicationsPublication_url'] = record.paper_url()
        values['PublicationsReport_numbers'] = record.report_number()

        # authors
        try:
            check.authors(record)
            check.format_authors(record, format_author_fr)
            check.my_authors(record,
                             reference=tool._my_author_list(record),
                             cmpFct=family_name_fr)

        except CheckException:
            # best effort: the user fixes the remaining issues in the form
            pass

        values['PublicationsFirst_author'] = record.first_author()
        values['PublicationsAuthors'] = record.authors()
        values['PublicationsAuthors_institute'] = record.my_authors

        # collaboration
        rec_id = get_id(db.collaborations,
                        collaboration=record.collaboration())
        values['PublicationsId_collaborations'] = \
            int(rec_id) if rec_id else UNDEF_ID

        # teams, project, categories, origin
        values['PublicationsId_categories'] = int(selector.id_categories)
        values['PublicationsId_projects'] = int(selector.id_projects)
        values['PublicationsId_teams'] = int(selector.id_teams)
        values['PublicationsOrigin'] = \
            OAI_URL % (selector.host, selector.record_id)

        # publishers
        if selector.controller in ('articles', 'proceedings'):
            check.clean_erratum(record)
            check.format_editor(record)

            rec_id = get_id(db.publishers,
                            abbreviation=record.paper_editor())
            values['PublicationsId_publishers'] = \
                int(rec_id) if rec_id else UNDEF_ID
            values['PublicationsVolume'] = record.paper_volume()
            values['PublicationsPages'] = record.paper_pages()

        # conference
        if selector.controller in ('proceedings', 'talks'):
            try:
                check.conference(record)

            except CheckException:
                # best effort: the user fixes the remaining issues
                pass

            if isinstance(record, RecordConf):
                values['PublicationsConference_title'] = \
                    record.conference_title()
                values['PublicationsConference_url'] = \
                    record.conference_url()
                values['PublicationsConference_dates'] = \
                    record.conference_dates()
                values['PublicationsConference_town'] = \
                    record.conference_town()

                # same conversion as for collaborations and publishers
                rec_id = get_id(db.countries,
                                country=record.conference_country())
                values['PublicationsId_countries'] = \
                    int(rec_id) if rec_id else UNDEF_ID

                values['PublicationsConference_speaker'] = \
                    record.first_author()

        # thesis
        if selector.controller == 'theses':
            if isinstance(record, RecordThesis):
                values['PublicationsUniversities'] = \
                    record.these_universities()
                values['PublicationsDirectors'] = record.these_directors()
                values['PublicationsDefense'] = record.these_defense()

        # submitted date and year
        try:
            check.submitted(record)
            check.year(record)

        except CheckException:
            # best effort: the user fixes the remaining issues in the form
            pass

        values['PublicationsSubmitted'] = ', '.join(record.submitted())
        values['PublicationsYear'] = record.year()

    except Exception:
        # log the exception in the web2py ticket system
        ticket = RestrictedError(layer='harvester.py',
                                 code='edit_insert',
                                 output='',
                                 environment=current.globalenv)
        ticket.log(request)

        # inform the user that something went wrong in the server
        raise HTTP(500)

    return dict(cfg=cfg, values=values)
def insert_marcxml():
    """Insert a MarcXML record in the database.

    The harvester parameters and the XML string are read from the selector
    ``virtdb.marc12_selector``.

    Returns:
        * an ``INLINE_ALERT`` string when the institute preference is not
          set or when no harvester tool could be built;
        * the translated message of a ``ToolException``;
        * a string embedding the formatted traceback for any other error;
        * otherwise the object returned by ``tool.report()`` in which the
          selector is stored under the key ``'selector'``. It is rendered
          by the view ``harvest/layout.html``.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    try:
        # exclude_fields has to be a tuple: ('mode') is the plain string
        # 'mode', hence the trailing comma.
        selector = Selector(virtdb.marc12_selector, exclude_fields=('mode',))

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    selector.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)

        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_xml(selector.xml)

    except ToolException as e:
        return T(str(e))

    # Exception rather than BaseException: SystemExit and KeyboardInterrupt
    # must not be turned into a report for the user.
    except Exception:
        # NOTE(review): the text wrapped around the traceback is a bare line
        # break in the source; markup may have been lost -- confirm.
        msg = '\n'
        msg += CODE(traceback.format_exc()).xml()
        msg += '\n'
        return msg

    response.view = 'harvest/layout.html'
    report = tool.report()
    report['selector'] = selector

    return report
def run():
    """Run an harvester.

    Scan the cds/invenio stores to find articles published during
    a given range of years and for a given team/project.
    Insert them in the database if they don't exist.

    The scanning is steered using the current request arguments as well as
    the harvest parameters associated to this action.

    Search arguments are defined via the harvester selector. The host, the
    collections and the category are read from the first row of
    ``db.harvesters`` matching the selector.

    Returns:
        * an ``INLINE_ALERT`` string when the institute preference is not
          set or when no harvester tool could be built;
        * the translated message of a ``ToolException``, including the one
          raised here when no harvester matches the selection;
        * a string embedding the formatted traceback for any other error;
        * otherwise the object returned by ``tool.report()`` in which the
          selector is stored under the key ``'selector'``. It is rendered
          by the view ``harvest/layout.<request.extension>``.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    try:
        selector = Selector(
            virtdb.harvester_selector,
            exclude_fields=('mode', 'year_start', 'year_end'))

        # Get the host and collections
        row = selector.select(db.harvesters).first()
        if not row:
            raise ToolException(MSG_NO_HARVESTER)

        tool = build_harvester_tool(db,
                                    selector.id_teams,
                                    selector.id_projects,
                                    selector.controller,
                                    row.harvesters.id_categories,
                                    year_start=selector.year_start,
                                    year_end=selector.year_end,
                                    dry_run=(selector.mode == DRY_RUN),
                                    debug=False)

        if not tool:
            return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

        tool.process_url(row.harvesters.host, row.harvesters.collections)

    except ToolException as e:
        return T(str(e))

    # Exception rather than BaseException: SystemExit and KeyboardInterrupt
    # must not be turned into a report for the user.
    except Exception:
        # NOTE(review): the text wrapped around the traceback is a bare line
        # break in the source; markup may have been lost -- confirm.
        msg = '\n'
        msg += CODE(traceback.format_exc()).xml()
        msg += '\n'
        return msg

    response.view = 'harvest/layout.%s' % request.extension
    report = tool.report()
    report['selector'] = selector

    return report
def run_all():
    """Run all harvesters in one go.

    The harvesters are the rows of ``db.harvesters``, optionally restricted
    to the team and/or the project chosen in the selector
    ``virtdb.run_all_harvesters_selector``. The logs of the individual
    harvester tools are accumulated.

    Returns:
        * an ``INLINE_ALERT`` string when the institute preference is not
          set, when no harvester matches the selection or when a harvester
          tool could not be built;
        * the translated message of a ``ToolException``;
        * a string embedding the formatted traceback for any other error;
        * otherwise a dictionary with the keys ``collection_logs``,
          ``controller``, ``logs`` and ``selector``. It is rendered by the
          view ``harvest/layout.<request.extension>``.

    """
    if not current.app.inspirehep_institute_id:
        return INLINE_ALERT % (T("Error"), MSG_NO_REG_INSTITUTE)

    collection_logs = []
    logs = []

    try:
        selector = Selector(
            virtdb.run_all_harvesters_selector,
            exclude_fields=('mode', 'year_start', 'year_end'))

        # AND together the filters on the fields set in the selector.
        # The query stays None when neither of them is set.
        query = None
        for fieldname in ('id_teams', 'id_projects'):
            if selector[fieldname]:
                q = db.harvesters[fieldname] == selector[fieldname]
                query = q if query is None else (query) & (q)

        harvesters = db(query).select(db.harvesters.ALL)
        if len(harvesters) == 0:
            return INLINE_ALERT % (T('Error'), MSG_NO_HARVESTER)

        for harvester in harvesters:
            tool = build_harvester_tool(db,
                                        harvester.id_teams,
                                        harvester.id_projects,
                                        harvester.controller,
                                        harvester.id_categories,
                                        year_start=selector.year_start,
                                        year_end=selector.year_end,
                                        dry_run=(selector.mode == DRY_RUN),
                                        debug=False)

            if not tool:
                return INLINE_ALERT % (T('Error'), T('Select an harvester.'))

            tool.process_url(harvester.host, harvester.collections)

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as e:
        return T(str(e))

    # Exception rather than BaseException: SystemExit and KeyboardInterrupt
    # must not be turned into a report for the user.
    except Exception:
        # NOTE(review): the text wrapped around the traceback is a bare line
        # break in the source; markup may have been lost -- confirm.
        msg = '\n'
        msg += CODE(traceback.format_exc()).xml()
        msg += '\n'
        return msg

    # tune selector parameters used in the report title
    if query is None:
        selector.id_projects = None

    # delegate rendering to the report view
    response.view = 'harvest/layout.%s' % request.extension

    return dict(collection_logs=collection_logs,
                controller='all harvesters',
                logs=logs,
                selector=selector)