""" Harvest Controllers
"""
import traceback
from gluon import current
from gluon.restricted import RestrictedError
from harvest_tools import (build_harvester_tool,
CheckAndFix,
CheckException,
DRY_RUN,
format_author_fr,
family_name_fr,
search_synonym,
ToolException)
from invenio_tools import (load_record,
OAI_URL,
RecordConf,
RecordThesis,
REG_INT)
from plugin_dbui import (inline_alert,
Selector,
to_formPanel,
UNDEF_ID)
# Translated label of the dry-run mode. The controllers below compare it
# with selector.mode to decide whether a harvester runs in dry-run mode.
MODE_DRY_RUN = T(DRY_RUN)
# Untranslated user messages; they are passed through T() where they are used.
MSG_NO_AFFILIATION = "Affiliation keys are not defined !!!"
MSG_NO_HARVESTER = "No harvesters for your selection !!!"
MSG_NO_RECORD = "Sorry, the record does not exist."
def free_run():
    """Run a free harvester.

    All harvester parameters are defined via the selector.

    Returns:
        * an inline alert when the affiliation keys are not defined,
          when a mandatory selector field is empty or when no harvester
          tool can be built;
        * the translated message of the ``ToolException`` raised while
          harvesting;
        * the formatted traceback for any other exception;
        * otherwise the report of the tool, completed with the selector
          and rendered by the view ``harvest/layout.html``.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    table = virtdb.free_harvester_selector

    # selector fields which have to be filled by the user
    fields = ("collections",
              "controller",
              "host",
              "id_projects",
              "id_teams",
              "id_categories",
              "ratio")

    try:
        selector = Selector(
            table,
            exclude_fields=("mode", "year_start", "year_end"))

        for el in fields:
            if not selector[el]:
                msg = T("All fields of the form have to be defined !!!")
                # NOTE(review): the separator is a bare line break in the
                # source; confirm whether an HTML break was intended.
                msg += "\n"
                msg += T("The field '%s' is missing ...") % T(table[el].label)
                return inline_alert(T("Error"), msg)

        tool = build_harvester_tool(
            db,
            selector.id_teams,
            selector.id_projects,
            selector.controller,
            selector.id_categories,
            year_start=selector.year_start,
            year_end=selector.year_end,
            dry_run=(selector.mode == MODE_DRY_RUN),
            debug=False)

        if not tool:
            return inline_alert(T("Error"), T("Select an harvester."))

        tool.process_url(selector.host, selector.collections)

    except ToolException as e:
        return T(str(e))

    # Exception rather than BaseException: KeyboardInterrupt and SystemExit
    # must not be turned into a page content.
    except Exception:
        msg = "\n"
        msg += CODE(traceback.format_exc()).xml()
        msg += "\n"
        return msg

    # delegate rendering to the report view
    response.view = "harvest/layout.html"
    report = tool.report()
    report["selector"] = selector

    return report
def edit_insert():
    """Edit an invenio record and insert it in the database.

    Note:
        Recovery procedures are applied to fix basic non-conformity, but
        no checks are run. The user is editing the record to fix problems.

    Returns:
        * an inline alert when the affiliation keys are not defined, when
          the record identifier is missing or malformed, when a mandatory
          selector field is empty or when the record does not exist;
        * otherwise a dictionary with the keys ``cfg`` (configuration of
          the publications form) and ``values`` (values to be loaded in
          the form).

    Raises:
        HTTP: status 500 when an unexpected exception occurs. The exception
            is logged in the web2py ticket system beforehand.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    # selector fields which have to be filled by the user
    fields = (
        "controller",
        "host",
        "id_projects",
        "id_teams",
        "id_categories",
        "record_id")

    table = virtdb.edit_insert_selector

    try:
        # Protection
        #
        # NOTE
        # With plugin_dbui 0.7.1 it is possible to enter decimal value
        # for the record id (e.g by typing 1503,03 in the field)
        #
        # A missing identifier is reported like a malformed one, instead
        # of letting the regular expression fail on None.
        #
        raw_id = request.vars.Edit_insert_selectorRecord_id
        if raw_id is None or REG_INT.match(raw_id) is None:
            msg = T("The record id is not well formed.")
            # NOTE(review): the separator is a bare line break in the
            # source; confirm whether an HTML break was intended.
            msg += "\n"
            msg += T("Use only digit character, no comma, no dot...")
            return inline_alert(T("Error"), msg)

        selector = Selector(table)

        for el in fields:
            if not selector[el]:
                msg = T("All fields of the form have to be defined !!!")
                msg += "\n"
                msg += T("The field '%s' is missing ...") % T(table[el].label)
                return inline_alert(T("Error"), msg)

        # record
        record = load_record(selector.host, selector.record_id)
        if record is None:
            return inline_alert(T("Error"), T(MSG_NO_RECORD))

        # form configuration
        cfg = to_formPanel(db.publications)

        # tools to extract values to be loaded in the form
        values = {}
        check = CheckAndFix()

        # fix invalid oai
        check.recover_oai(record, selector.host)

        # title, preprint, URL, report number
        values["PublicationsTitle"] = record.title()
        values["PublicationsPreprint"] = record.preprint_number()
        values["PublicationsPublication_url"] = record.paper_url()
        values["PublicationsReport_numbers"] = record.report_number()

        # authors
        # best effort: the user fixes the remaining problems in the form
        try:
            check.authors(record)
            check.format_authors(record, format_author_fr)
            check.my_affiliation(
                record, selector.id_projects, selector.id_teams)
            check.get_my_authors(record, cmpFct=family_name_fr)

        except CheckException:
            pass

        fauthor = record.first_author()
        if isinstance(fauthor, list):
            fauthor = u", ".join(fauthor)

        values["PublicationsFirst_author"] = fauthor
        values["PublicationsAuthors"] = record.authors()
        values["PublicationsAuthors_institute"] = record.my_authors

        # collaboration
        recId = UNDEF_ID
        try:
            recId = search_synonym(db.collaborations,
                                   "collaboration",
                                   record.collaboration())

        except ToolException:
            pass

        values["PublicationsId_collaborations"] = int(recId)

        # teams, project, categories
        values["PublicationsId_categories"] = int(selector.id_categories)
        values["PublicationsId_projects"] = int(selector.id_projects)
        values["PublicationsId_teams"] = int(selector.id_teams)

        # origin
        # Note:
        #  - It is always defined
        #  - Use a trivial algorithm to recover it
        oai_url = record.oai_url()
        if not oai_url:
            oai_url = OAI_URL % (selector.host, selector.record_id)

        values["PublicationsOrigin"] = oai_url

        # publishers
        if selector.controller in ("articles", "proceedings"):

            check.clean_erratum(record)
            check.paper_reference(record)
            check.format_editor(record)

            recId = UNDEF_ID
            try:
                recId = search_synonym(db.publishers,
                                       "abbreviation",
                                       record.paper_editor())

            except ToolException:
                pass

            values["PublicationsId_publishers"] = int(recId)
            values["PublicationsVolume"] = record.paper_volume()
            values["PublicationsPages"] = record.paper_pages()

        # conference
        if selector.controller in ("proceedings", "talks"):

            try:
                check.country(record)
                check.conference_date(record, selector.host)

            except CheckException:
                pass

            if isinstance(record, RecordConf):
                values["PublicationsConference_title"] = \
                    record.conference_title()

                values["PublicationsConference_url"] = \
                    record.conference_url()

                values["PublicationsConference_dates"] = \
                    record.conference_dates()

                values["PublicationsConference_town"] = \
                    record.conference_town()

                recId = UNDEF_ID
                try:
                    recId = search_synonym(db.countries,
                                           "country",
                                           record.conference_country())

                except ToolException:
                    pass

                values["PublicationsId_countries"] = int(recId)

                # same normalisation as for the first author field:
                # several first authors are merged in a single string
                speaker = record.first_author()
                if isinstance(speaker, list):
                    speaker = u", ".join(speaker)

                values["PublicationsConference_speaker"] = speaker

        # thesis
        if selector.controller == "theses":

            if isinstance(record, RecordThesis):
                values["PublicationsUniversities"] = \
                    record.these_universities()

                values["PublicationsDirectors"] = record.these_directors()
                values["PublicationsDefense"] = record.these_defense()

        # submitted date and year
        try:
            check.submitted(record)
            check.year(record)

        except CheckException:
            pass

        values["PublicationsSubmitted"] = ", ".join(record.submitted())

        if record.is_published():
            year = record.paper_year()

        else:
            year = record.year()

        values["PublicationsYear"] = year

    except Exception:

        # log the exception in the web2py ticker system
        ticket = RestrictedError(layer="harvester.py",
                                 code="edit_insert",
                                 output="",
                                 environment=current.globalenv)

        ticket.log(request)

        # inform the user that something went wrong in the server
        raise HTTP(500)

    return dict(cfg=cfg, values=values)
def insert_marcxml():
    """Insert a MarcXML record in the database.

    The harvester tool is built from the selector parameters and it
    processes the XML string found in the selector field ``xml``.

    Returns:
        * an inline alert when the affiliation keys are not defined or
          when no harvester tool can be built;
        * the translated message of the ``ToolException`` raised while
          processing the record;
        * the formatted traceback for any other exception;
        * otherwise the report of the tool, completed with the selector
          and rendered by the view ``harvest/layout.html``.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    try:
        # one-element tuple: the trailing comma is required, otherwise
        # the argument is the plain string "mode"
        selector = Selector(virtdb.marc12_selector, exclude_fields=("mode",))

        tool = build_harvester_tool(
            db,
            selector.id_teams,
            selector.id_projects,
            selector.controller,
            selector.id_categories,
            year_start=selector.year_start,
            year_end=selector.year_end,
            dry_run=(selector.mode == MODE_DRY_RUN),
            debug=False)

        if not tool:
            return inline_alert(T("Error"), T("Select an harvester."))

        tool.harvester.host = selector.host
        tool.process_xml(selector.xml)

    except ToolException as e:
        return T(str(e))

    # Exception rather than BaseException: KeyboardInterrupt and SystemExit
    # must not be turned into a page content.
    except Exception:
        # NOTE(review): the separators are bare line breaks in the source;
        # confirm whether HTML breaks were intended.
        msg = "\n"
        msg += CODE(traceback.format_exc()).xml()
        msg += "\n"
        return msg

    # delegate rendering to the report view
    response.view = "harvest/layout.html"
    report = tool.report()
    report["selector"] = selector

    return report
def run():
    """Run an harvester.

    Scan the cds/invenio stores to find articles published during
    a given range of years and for a given team/project.
    Insert them in the database if they don't exist.

    The scanning is steered using the current request arguments as well as
    the harvest parameters associated to this action.

    Search arguments are defined via the harvester selector.

    Returns:
        * an inline alert when the affiliation keys are not defined or
          when no harvester tool can be built;
        * the translated message of a ``ToolException``, including the
          one raised when no harvester matches the selection;
        * the formatted traceback for any other exception;
        * otherwise a dictionary with the keys ``collection_logs``,
          ``controller``, ``logs`` and ``selector`` rendered by the view
          ``harvest/layout.<extension>``.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    try:
        selector = Selector(
            virtdb.harvester_selector,
            exclude_fields=("mode", "year_start", "year_end"))

        # Get hosts and collections
        rows = selector.select(db.harvesters)
        if not rows:
            raise ToolException(T(MSG_NO_HARVESTER))

        collection_logs = []
        logs = []

        # one tool per harvester, the logs are accumulated
        for row in rows:
            tool = build_harvester_tool(
                db,
                selector.id_teams,
                selector.id_projects,
                selector.controller,
                row.harvesters.id_categories,
                year_start=selector.year_start,
                year_end=selector.year_end,
                dry_run=(selector.mode == MODE_DRY_RUN),
                debug=False)

            if not tool:
                return inline_alert(T("Error"), T("Select an harvester."))

            tool.process_url(row.harvesters.host, row.harvesters.collections)

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as e:
        return T(str(e))

    # Exception rather than BaseException: KeyboardInterrupt and SystemExit
    # must not be turned into a page content.
    except Exception:
        # NOTE(review): the separators are bare line breaks in the source;
        # confirm whether HTML breaks were intended.
        msg = "\n"
        msg += CODE(traceback.format_exc()).xml()
        msg += "\n"
        return msg

    # delegate rendering to the report view
    response.view = "harvest/layout.%s" % request.extension

    return dict(collection_logs=collection_logs,
                controller=selector.controller,
                logs=logs,
                selector=selector)
def run_all():
    """Run all harvesters in one go.

    The harvesters are those of the table ``harvesters``, optionally
    restricted to the team and / or the project chosen in the selector.

    Returns:
        * an inline alert when the affiliation keys are not defined,
          when no harvester matches the selection or when no harvester
          tool can be built;
        * the translated message of the ``ToolException`` raised while
          harvesting;
        * the formatted traceback for any other exception;
        * otherwise a dictionary with the keys ``collection_logs``,
          ``controller``, ``logs`` and ``selector`` rendered by the view
          ``harvest/layout.<extension>``.

    """
    if db(db.affiliation_keys.id > 0).count() == 0:
        return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))

    collection_logs = []
    logs = []

    try:
        selector = Selector(
            virtdb.run_all_harvesters_selector,
            exclude_fields=("mode", "year_start", "year_end"))

        # restrict the harvesters to the selected team and / or project;
        # the query stays None when none of them is selected
        query = None
        for fieldname in ("id_teams", "id_projects"):
            if selector[fieldname]:
                q = db.harvesters[fieldname] == selector[fieldname]
                query = (q if query is None else (query) & (q))

        harvesters = db(query).select(db.harvesters.ALL)
        if not harvesters:
            return inline_alert(T("Error"), T(MSG_NO_HARVESTER))

        # one tool per harvester, the logs are accumulated
        for harvester in harvesters:
            tool = build_harvester_tool(
                db,
                harvester.id_teams,
                harvester.id_projects,
                harvester.controller,
                harvester.id_categories,
                year_start=selector.year_start,
                year_end=selector.year_end,
                dry_run=(selector.mode == MODE_DRY_RUN),
                debug=False)

            if not tool:
                return inline_alert(T("Error"), T("Select an harvester."))

            tool.process_url(harvester.host, harvester.collections)

            collection_logs.extend(tool.collection_logs)
            logs.extend(tool.logs)

    except ToolException as e:
        return T(str(e))

    # Exception rather than BaseException: KeyboardInterrupt and SystemExit
    # must not be turned into a page content.
    except Exception:
        # NOTE(review): the separators are bare line breaks in the source;
        # confirm whether HTML breaks were intended.
        msg = "\n"
        msg += CODE(traceback.format_exc()).xml()
        msg += "\n"
        return msg

    # tune selector parameters used in the report title
    if query is None:
        selector.id_projects = None

    # delegate rendering to the report view
    response.view = "harvest/layout.%s" % request.extension

    return dict(collection_logs=collection_logs,
                controller="all harvesters",
                logs=logs,
                selector=selector)