Commit 63074034 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Merge branch '77-marc-to-json-py37' into 'master'

Resolve "Migrate from MARC to JSON format"

Closes #77

See merge request !99
parents 57c5f384 cc2dc8aa
......@@ -2,7 +2,7 @@
"""
from gluon import current
from auth import ADMIN, USER
from authentication import ADMIN, USER
@auth.requires(True, requires_login=not request.is_local)
......
......@@ -14,7 +14,7 @@ from graph_tools import (FROM_TO,
mplstyle,
savefig,
stackchart)
from model_selector import YEAR_SUBMIT
from models.selector import YEAR_SUBMIT
def dashboard():
......
""" Harvest Controllers
"""
import json
import logging
import traceback
from gluon import current
......@@ -10,19 +11,22 @@ from harvest_tools import (build_harvester_tool,
CheckAndFix,
CheckException,
DRY_RUN,
MsgCollection,
search_synonym,
ToolException)
from invenio_tools import (load_record,
OAI_URL,
RecordConf,
RecordThesis,
REG_INT)
RecordThesis)
from plugin_dbui import (inline_alert,
Selector,
to_formPanel,
UNDEF_ID)
from requests.exceptions import RequestException
MODE_DRY_RUN = T(DRY_RUN)
MSG_GREMLIN = "Oops a gremlin..."
MSG_LOST_CONNECTION = "Lost HTTP connection (timeout or site unavailable)"
MSG_NO_AFFILIATION = "Affiliation keys are not defined !!!"
MSG_NO_HARVESTER = "No harvesters for your selection !!!"
MSG_NO_RECORD = "Sorry, the record does not exist."
......@@ -65,8 +69,7 @@ def free_run():
selector.id_categories,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == MODE_DRY_RUN),
debug=False)
dry_run=(selector.mode == MODE_DRY_RUN))
if not tool:
return inline_alert(T("Error"), T("Select an harvester."))
......@@ -96,6 +99,9 @@ def edit_insert():
no checks are run. The user is editing the record to fix problems.
"""
logger.debug("-"*72)
logger.debug("start controller edit_insert...")
if db(db.affiliation_keys.id > 0).count() == 0:
return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))
......@@ -109,6 +115,10 @@ def edit_insert():
table = virtdb.edit_insert_selector
# ------------------------------------------------------------------------
#
# Get the record
#
try:
# Protection
#
......@@ -116,7 +126,7 @@ def edit_insert():
# With plugin_dbui 0.7.1 it is possible to enter a decimal value
# for the record id (e.g. by typing 1503,03 in the field)
#
if REG_INT.match(request.vars.Edit_insert_selectorRecord_id) is None:
if not request.vars.Edit_insert_selectorRecord_id.isalnum():
msg = T("The <i>record id</i> is not well formed.")
msg += "<br>"
msg += T("Use only digit character, no comma, no dot...")
......@@ -132,179 +142,200 @@ def edit_insert():
return inline_alert(T("Error"), msg)
# record
logger.debug("load the record...")
record = load_record(selector.host, selector.record_id)
if record is None:
return inline_alert(T("Error"), T(MSG_NO_RECORD))
# form configuration
cfg = to_formPanel(db.publications)
return inline_alert(T(MSG_GREMLIN), T(MSG_NO_RECORD))
# tools to extract values to be loaded in the form
values = {}
check = CheckAndFix()
except Exception as e:
logger.error(str(e))
# fix invalid oai
check.recover_oai(record, selector.host)
# log the exception in the web2py ticket system
ticket = RestrictedError(layer="harvester.py",
code="edit_insert",
output="",
environment=current.globalenv)
ticket.log(request)
# title, preprint, URL, report number
values["PublicationsTitle"] = record.title()
values["PublicationsPreprint"] = record.preprint_number()
values["PublicationsPublication_url"] = record.paper_url()
values["PublicationsReport_numbers"] = record.report_number()
# inform the user that something went wrong in the server
raise HTTP(500, T(str(e)))
# authors
try:
check.authors(record)
check.format_authors(record, fmt="F. Last")
# ------------------------------------------------------------------------
#
# Prepare the form
#
cfg = to_formPanel(db.publications)
values = {
"PublicationsTitle": record.title(),
"PublicationsPreprint": record.preprint_number(),
"PublicationsPublication_url": record.paper_url(),
"PublicationsReport_numbers": record.report_number()}
# ------------------------------------------------------------------------
#
# CheckAndFix (general)
#
check = CheckAndFix()
# authors
try:
check.authors(record)
check.format_authors(record, fmt="F. Last")
check.my_affiliation(
record, selector.id_projects, selector.id_teams)
check.my_affiliation(
record, selector.id_projects, selector.id_teams)
check.get_my_authors(record, sort=True)
check.get_my_authors(record, sort=True)
except CheckException:
pass
except CheckException as e:
logging.debug(str(e))
pass
fauthor = record.first_author()
if isinstance(fauthor, list):
fauthor = ", ".join(fauthor)
fauthor = record.first_author()
if isinstance(fauthor, list):
fauthor = ", ".join(fauthor)
values["PublicationsFirst_author"] = fauthor
values["PublicationsAuthors"] = record.authors()
values["PublicationsAuthors_institute"] = record.my_authors
values["PublicationsFirst_author"] = fauthor
values["PublicationsAuthors"] = record.authors()
values["PublicationsAuthors_institute"] = record.my_authors
# collaboration
recId = UNDEF_ID
try:
recId = search_synonym(db.collaborations,
"collaboration",
record.collaboration())
except ToolException:
pass
# collaboration
recId = UNDEF_ID
try:
recId = search_synonym(db.collaborations,
"collaboration",
record.collaboration())
except ToolException as e:
logging.debug(str(e))
pass
values["PublicationsId_collaborations"] = int(recId)
values["PublicationsId_collaborations"] = int(recId)
# teams, project, categories
values["PublicationsId_categories"] = int(selector.id_categories)
values["PublicationsId_projects"] = int(selector.id_projects)
values["PublicationsId_teams"] = int(selector.id_teams)
# teams, project, categories
values["PublicationsId_categories"] = int(selector.id_categories)
values["PublicationsId_projects"] = int(selector.id_projects)
values["PublicationsId_teams"] = int(selector.id_teams)
# origin
# Note:
# - It is always defined
# - Use a trivial algorithm to recover it
oai_url = record.oai_url()
if not oai_url:
oai_url = OAI_URL % (selector.host, selector.record_id)
# origin
# Note:
# - It is always defined
# - Use a trivial algorithm to recover it
oai_url = record.oai_url()
if not oai_url:
oai_url = OAI_URL % (selector.host, selector.record_id)
values["PublicationsOrigin"] = oai_url
values["PublicationsOrigin"] = oai_url
# publishers
if selector.controller in ("articles", "proceedings"):
# ------------------------------------------------------------------------
#
# CheckAndFix (article)
#
if selector.controller in ("articles", "proceedings"):
check.clean_erratum(record)
check.paper_reference(record)
check.format_editor(record)
check.paper_reference(record)
check.format_editor(record)
recId = UNDEF_ID
try:
recId = search_synonym(db.publishers,
"abbreviation",
record.paper_editor())
except ToolException:
pass
recId = UNDEF_ID
try:
recId = search_synonym(db.publishers,
"abbreviation",
record.paper_editor())
values["PublicationsId_publishers"] = int(recId)
values["PublicationsVolume"] = record.paper_volume()
values["PublicationsPages"] = record.paper_pages()
except ToolException as e:
logging.debug(str(e))
pass
# conference
if selector.controller in ("proceedings", "talks"):
values["PublicationsId_publishers"] = int(recId)
values["PublicationsVolume"] = record.paper_volume()
values["PublicationsPages"] = record.paper_pages()
try:
check.country(record)
check.conference_date(record, selector.host)
# ------------------------------------------------------------------------
#
# CheckAndFix (conference)
#
if selector.controller in ("proceedings", "talks"):
except CheckException:
pass
try:
check.country(record)
check.conference_date(record)
if isinstance(record, RecordConf):
values["PublicationsConference_title"] = \
record.conference_title()
except CheckException as e:
logging.debug(str(e))
pass
values["PublicationsConference_url"] = \
record.conference_url()
if isinstance(record, RecordConf):
values["PublicationsConference_title"] = \
record.conference_title()
values["PublicationsConference_dates"] = \
record.conference_dates()
values["PublicationsConference_url"] = \
record.conference_url()
values["PublicationsConference_town"] = \
record.conference_town()
values["PublicationsConference_dates"] = \
record.conference_dates()
recId = UNDEF_ID
try:
recId = search_synonym(db.countries,
"country",
record.conference_country())
except ToolException:
pass
values["PublicationsConference_town"] = \
record.conference_town()
values["PublicationsId_countries"] = int(recId)
recId = UNDEF_ID
try:
recId = search_synonym(db.countries,
"country",
record.conference_country())
values["PublicationsConference_speaker"] = \
record.first_author()
except ToolException as e:
logging.debug(str(e))
pass
# thesis
if selector.controller == "theses":
values["PublicationsId_countries"] = int(recId)
if isinstance(record, RecordThesis):
values["PublicationsUniversities"] = \
record.these_universities()
values["PublicationsConference_speaker"] = \
record.first_author()
values["PublicationsDirectors"] = record.these_directors()
values["PublicationsDefense"] = record.these_defense()
# ------------------------------------------------------------------------
#
# CheckAndFix (theses)
#
if selector.controller == "theses":
# submitted date and year
try:
check.submitted(record)
check.year(record)
except CheckException:
pass
if isinstance(record, RecordThesis):
values["PublicationsUniversities"] = \
record.these_universities()
values["PublicationsSubmitted"] = ", ".join(record.submitted())
values["PublicationsDirectors"] = record.these_directors()
values["PublicationsDefense"] = record.these_defense()
if record.is_published():
year = record.paper_year()
else:
year = record.year()
# submitted date and year
try:
check.submitted(record)
values["PublicationsYear"] = year
except CheckException as e:
logger.debug(str(e))
pass
except Exception as e:
values["PublicationsSubmitted"] = record.submitted()
# log the exception in the web2py ticket system
ticket = RestrictedError(layer="harvester.py",
code="edit_insert",
output="",
environment=current.globalenv)
ticket.log(request)
if record.is_published():
year = record.paper_year()
else:
year = record.submitted()[0:4]
# inform the user that something went wrong in the server
raise HTTP(500, T(str(e)))
values["PublicationsYear"] = year
logger.debug("-"*72)
return dict(cfg=cfg, values=values)
def insert_marcxml():
"""Insert a MarcXML record in the database.
def insert_recjson():
"""Insert a recjson record in the database.
"""
if db(db.affiliation_keys.id > 0).count() == 0:
return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))
try:
selector = Selector(virtdb.marc12_selector, exclude_fields=("mode"))
selector = Selector(virtdb.recjson_selector, exclude_fields=("mode"))
tool = build_harvester_tool(
db,
......@@ -314,14 +345,22 @@ def insert_marcxml():
selector.id_categories,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == MODE_DRY_RUN),
debug=False)
dry_run=(selector.mode == MODE_DRY_RUN))
if not tool:
return inline_alert(T("Error"), T("Select an harvester."))
ctitle = "%s / %s " % (db.projects[selector.id_projects].project,
selector.controller)
tool.collection_logs = [MsgCollection(title=ctitle)]
tool.harvester.host = selector.host
tool.process_xml(selector.xml)
tool.logs = []
recjson = json.loads(selector.recjson)
recjson = (recjson[0] if isinstance(recjson, list) else recjson)
tool.process_recjson(recjson)
except ToolException as e:
return T(str(e))
......@@ -341,7 +380,7 @@ def insert_marcxml():
def run():
"""Run a harvester.
Scan the cds/invenio stores to find articles published during
Scan the cds/invenio stores to find publications during
a given range of years and for a given team/project.
Insert them in the database if they don't exist.
......@@ -351,6 +390,9 @@ def run():
Search arguments are defined via the harvester selector.
"""
logger.info("-"*79)
logger.info(f"run harvester {request.vars.Harvester_selectorController}")
if db(db.affiliation_keys.id > 0).count() == 0:
return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))
......@@ -376,8 +418,7 @@ def run():
row.harvesters.id_categories,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == MODE_DRY_RUN),
debug=False)
dry_run=(selector.mode == MODE_DRY_RUN))
if not tool:
return inline_alert(T("Error"), T("Select an harvester."))
......@@ -387,15 +428,33 @@ def run():
collection_logs.extend(tool.collection_logs)
logs.extend(tool.logs)
except RequestException as e:
logger.error(MSG_LOST_CONNECTION)
return inline_alert(T(MSG_GREMLIN), T(MSG_LOST_CONNECTION))
except ToolException as e:
return T(str(e))
log = tool.logs[-1]
msg = "<h4>Error on record %s (%s)</h4>" % (log.url, log.collection)
msg += T(str(e))
logger.error(f"{msg.strip('<h4>')}")
return msg
except BaseException as e:
msg = "<br><br><hr/>"
except Exception as e:
logger.error(f"{str(e)}")
msg = "<hr/>"
msg += CODE(traceback.format_exc()).xml()
msg += "<hr/>"
return msg
if logger.getEffectiveLevel() <= logging.INFO:
logger.info("")
logger.info(f"end of run harvester {selector.controller}:")
for el in collection_logs:
logger.info(f" {el.title}: {el.found}")
logger.info("-"*79)
# delegate rendering to the report view
response.view = "harvest/layout.%s" % request.extension
return dict(collection_logs=collection_logs,
......@@ -408,6 +467,8 @@ def run_all():
"""Run all harvesters in one go.
"""
logger.info(f"run all harvesters")
if db(db.affiliation_keys.id > 0).count() == 0:
return inline_alert(T("Error"), T(MSG_NO_AFFILIATION))
......@@ -431,6 +492,9 @@ def run_all():
for harvester in harvesters:
logger.info("-"*79)
logger.info(f"run harvester {harvester.controller}")
tool = build_harvester_tool(
db,
harvester.id_teams,
......@@ -439,8 +503,7 @@ def run_all():
harvester.id_categories,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == MODE_DRY_RUN),
debug=False)
dry_run=(selector.mode == MODE_DRY_RUN))
if not tool:
return inline_alert(T("Error"), T("Select an harvester."))
......@@ -451,14 +514,28 @@ def run_all():
logs.extend(tool.logs)
except ToolException as e:
return T(str(e))
log = tool.logs[-1]
msg = "<h4>Error on record %s (%s)</h4>" % (log.url, log.collection)
msg += T(str(e))
return msg
except BaseException as e:
msg = "<br><br><hr/>"
except Exception as e:
msg = "<hr/>"
msg += CODE(traceback.format_exc()).xml()
msg += "<hr/>"
return msg
if logger.getEffectiveLevel() <= logging.INFO:
logger.info("")
logger.info(f"end of run all harvesters:")
for el in collection_logs:
logger.info(f" {el.title}: {el.found}")
logger.info("-"*79)
logger.info("-"*79)
# tune selector parameters used in the report title
if query is None:
selector.id_projects = None
......
......@@ -8,7 +8,7 @@ import re
from check_tools import check_publication
from gluon.storage import Storage
from harvest_tools import DRY_RUN
from invenio_tools import CdsException, load_record, Marc12Exception
from invenio_tools import CdsException, load_record
from plugin_dbui import (CALLBACK_ERRORS,
get_foreign_field,
get_id,
......@@ -21,14 +21,17 @@ from plugin_dbui import (CALLBACK_ERRORS,
MODE_DRY_RUN = T(DRY_RUN)
MSG_NO_AUTHORS = "<br><br>Removing affiliation failed.<br>"\
"Use INSPIRES instead with the tool 'insert MARCXML'"
"Use INSPIRES instead with the tool 'insert RECJSON'"
MSG_EXISTING_KEY = "Keys already exist!"
MSG_NO_AFFILIATION = "Affiliation is not defined for the selected author."
MSG_NO_AUTHOR = "Author not found!"
MSG_NO_KEYS = "Affiliation keys are not defined!"
MSG_NO_INSTITUTE = "Institute not found in the inspirehep database!"
MSG_NO_PUBLICATION = "Publication not found!"
MSG_NO_SERVER = "Server is not reachable or respond badly!"
MSG_TO_MANY_AFFILIATION = "More than one affiliation for the selected author!"
MSG_TO0_MANY_AFFILIATION = "More than one affiliation for the selected author!"
MSG_TO0_MANY_AUTHOR = "More than one author found!"
def affiliation_institute():
......@@ -42,16 +45,28 @@ def affiliation_institute():
try:
record = load_record("inspirehep.net", institute_id)
except (CdsException, Marc12Exception):
except CdsException:
raise HTTP(500, T(MSG_NO_SERVER))
if record is None:
raise HTTP(500, T(MSG_NO_INSTITUTE))
# extract keys defining the affiliation
# u and v are the main keys used in inspirehep and cds
# b is used by some notes in Atlas
keys = (record["110"][k] for k in ("u", "t", "b") if k in record["110"])
# subfields are identifier and futur_identifier
# they are not part of the standard JSON record but are added by the factory
if "corporate_note" not in record:
raise HTTP(500, T(MSG_NO_KEYS))
di = record["corporate_note"]
keys = [di