Commit 8280655d authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Add the class CheckAndFixSvc to validate record before processing.

parent a464865d
......@@ -387,12 +387,15 @@
'Registration key': 'Registration key',
'Registration successful': 'Registration successful',
'Reject': 'Rejeter',
'Reject article is not published': "Rejeter l'article n'est pas publié",
'Reject article is not published': "Rejeté l'article n'est pas publié",
'Reject incomplete paper reference': 'Rejeté la référence du papier est incomplète',
'Reject no author(s)': "Rejeté pas d'auteur(s)",
'Reject no authors': "Rejeté pas d'auteurs",
'Reject no conference information': "Rejeté pas d'information sur la conférence",
'Reject no CPPM authors': "Rejeté pas d'auteurs du CPPM",
'Reject no OAI identifier': "Rejeté pas d'identifiant OAI",
'Reject no preprint number nor submission date': 'Rejeté pas de numéro de preprint ou de date de soumission',
'Reject no submission date': 'Rejeté pas de date de soumission',
'Reject not a thesis record': "Rejeté cet enregistrement n'est pas une thèse",
'Reject preprint is a conference': 'Rejeté ce preprint est une conférence',
'Reject preprint is a published paper': 'Rejeté ce preprint est un article publié',
......
......@@ -14,6 +14,10 @@ import re
from gluon import current
from gluon.storage import Storage
from invenio_tools import (OAI_URL,
CdsSvc,
CheckAndFixSvc,
Marc12Svc)
from plugin_dbui import (UNDEF_ID,
UNKNOWN,
get_create_id,
......@@ -24,19 +28,14 @@ DRY_RUN = current.T("dry run")
# explain message
MSG_DELETE_TALK = current.T("Delete the associated talk", lazy=False)
MSG_FIX_ORIGIN = current.T("Fixed the origin field", lazy=False)
MSG_FIX_PAGE = current.T("Fixed the page field", lazy=False)
MSG_IN_DB = current.T("Already in the database", lazy=False)
MSG_LOAD = current.T("Load in the database", lazy=False)
MSG_MATCH = current.T("Reject the talk match a proceeding", lazy=False)
MSG_NO_AUTHOR = current.T("Reject no authors", lazy=False)
MSG_NO_CAT = current.T('Select a "category" !!!', lazy=False)
MSG_NO_CPPM_AUTHOR = current.T("Reject no CPPM authors", lazy=False)
MSG_NO_CONF_ID = current.T('Missing record for conference identified by id', lazy=False)
MSG_NO_CONF_KEY = current.T('Missing record for conference identified by key', lazy=False)
MSG_NO_EDITOR = current.T("Reject article is not published", lazy=False)
MSG_NO_HARVESTER = current.T('Harvester parameters not defined in the database.', lazy=False)
MSG_NO_LINK = current.T('Record not link to a conference', lazy=False)
MSG_NO_OAI = current.T("Reject no OAI identifier", lazy=False)
MSG_NO_PROJECT = current.T('Select a "project" !!!', lazy=False)
MSG_NO_TEAM = current.T('Select a "team" !!!', lazy=False)
MSG_NO_THESIS = current.T("Reject not a thesis record", lazy=False)
......@@ -44,7 +43,6 @@ MSG_PREPRINT_IS_PAPER = current.T("Reject preprint is a published paper", lazy=
MSG_PREPRINT_IS_CONFERENCE = current.T("Reject preprint is a conference", lazy=False)
MSG_PREPRINT_IS_THESIS = current.T("Reject preprint is a thesis", lazy=False)
MSG_PREPRINT_NO_NUMBER = current.T("Reject no preprint number nor submission date", lazy=False)
MSG_SERVER_ERROR = current.T("Error !!!", lazy=False)
MSG_TRANSFORM_PREPRINT = current.T("Transform the preprint into an article", lazy=False)
MSG_TRANSFORM_TALK = current.T("Transform the talk into a proceeding", lazy=False)
......@@ -202,78 +200,6 @@ def format_author_fr(name):
return r
def get_conference_record(cds, marc12, record):
    """Get the conference information associated to the talk
    or to the proceeding.

    The conference is searched first by its record identifier and,
    when that fails, by its conference key.

    @type cds: L{CdsSvc}
    @param cds: service to interrogate invenio store

    @type marc12: L{Marc12Svc}
    @param marc12: service to decode record encoded with the MARC format.

    @type record: L{Record}
    @param record: The record defining the talk or the proceeding.

    @rtype: L{Record} or unicode
    @return: The conference record or a string with an error message.

    """
    msg = ''
    conf_id = record.reference_conference_id()
    key = record.reference_conference_key()

    # search the conference by id, the preferred method
    if conf_id:
        try:
            xml = cds.get_record(conf_id)
        except invenio_tools.CdsSvcException as error:
            return "%s. %s" % (MSG_NO_CONF_ID, error)

        for el in marc12.process(xml):
            if el.id() == conf_id:
                return el

        msg = MSG_NO_CONF_ID

    # no key to fall back on: report the failed search by id when there
    # was one, otherwise the record is simply not linked to a conference
    if not key:
        return msg or MSG_NO_LINK

    # search the conference by key since the previous method failed.
    # the method depends on the store.
    if cds._host.startswith('inspirehep'):
        key = key.replace('/', '-')
        # the normalised key is used as a whole: splitting it and filling
        # a fixed three-field template failed for any other number of parts
        ids = cds.get_ids(cc='Conferences', p='111__g:%s' % key)
    else:
        ids = cds.get_ids(p=key)

    for candidate_id in ids or ():
        try:
            xml = cds.get_record(candidate_id)
        except invenio_tools.CdsSvcException as error:
            return "%s. %s" % (MSG_NO_CONF_KEY, error)

        for el in marc12.process(xml):
            if el.conference_key() == key:
                return el

    # nothing matched: chain the messages without a leading separator
    if msg:
        return "%s. %s" % (msg, MSG_NO_CONF_KEY)
    return MSG_NO_CONF_KEY
def get_harvester_tool(controller):
"""Get the harvester tool associated to the controller
or None if .
......@@ -321,22 +247,6 @@ def get_harvester_tool(controller):
return Tool
def is_published_paper(record):
    """Tell whether the record describes a published paper.

    A paper is considered published when the record provides both
    an editor (the review) and a volume.

    @type record: L{Record}
    @param record: the record to be tested.

    @rtype: bool
    @return: C{True} when both the paper editor and the paper volume
    are defined, C{False} otherwise.

    """
    return bool(record.paper_editor() and record.paper_volume())
def learn_cppm_authors(db, authors=None,
id_project=None,
id_team=None,
......@@ -432,7 +342,8 @@ class Msg(Storage):
"""
self.action = 'idle'
self.txt = txt
self._set_txt(txt)
def load(self, txt):
"""Set the action as C{load} and the message as C{txt}.
......@@ -442,7 +353,8 @@ class Msg(Storage):
"""
self.action = 'load'
self.txt = txt
self._set_txt(txt)
def modify(self, txt):
"""Set the action as C{modify} and the message as C{txt}.
......@@ -452,7 +364,8 @@ class Msg(Storage):
"""
self.action = 'modify'
self.txt = txt
self._set_txt(txt)
def reject(self, txt):
"""Set the action as C{reject} set the message as C{txt}.
......@@ -462,7 +375,13 @@ class Msg(Storage):
"""
self.action = 'reject'
self.txt = txt
self._set_txt(txt)
def _set_txt(self, value):
if isinstance(value, unicode):
value = value.encode("utf-8")
self.txt = value
class MsgCollection(Storage):
......@@ -523,8 +442,8 @@ class PublicationsTool(object):
self.harvester = None
self.logs = []
self.marc12 = invenio_tools.Marc12Svc()
self.marc12.set_format_author_name(format_author_fr)
self.check_me = CheckAndFixSvc()
self.marc12 = Marc12Svc()
self.selector = selector
......@@ -800,7 +719,7 @@ class PublicationsTool(object):
if self.dbg:
print "process URL search"
cds = invenio_tools.CdsSvc(host=self.harvester.host)
cds = CdsSvc(host=self.harvester.host)
# list of collections
collections = self.harvester.collections
......@@ -830,7 +749,7 @@ class PublicationsTool(object):
try:
ids = cds.get_ids(**kwargs)
except invenio_tools.CdsSvcException as error:
except Exception as error:
self.collection_logs[-1].url = cds.last_search_url()
self.collection_logs[-1].error = error
continue
......@@ -851,15 +770,15 @@ class PublicationsTool(object):
try:
xml = cds.get_record(id)
self.process_xml(xml)
except invenio_tools.CdsSvcException as error:
url = invenio_tools.OAI_URL % (self.harvester.host, id)
except Exception as error:
url = OAI_URL % (self.harvester.host, id)
self.logs.append(Msg(url=url))
self.logs[-1].title = url
self.logs[-1].reject(error)
return
self.process_xml(xml)
def process_xml(self, xml):
......@@ -873,22 +792,27 @@ class PublicationsTool(object):
print "process xml record"
li = self.marc12.process(xml)
for record in li:
if self.dbg:
print "record decoded"
oai_url = record.oai_url()
self.logs.append(Msg(url=oai_url))
url = OAI_URL % (self.harvester.host, record.id())
self.logs.append(Msg(url=url))
self.logs[-1].title = record.title()
self.logs[-1].collection = self.collection_logs[-1].title
self.logs[-1].year = record.year()
if not oai_url:
self.logs[-1].reject(MSG_NO_OAI)
self.check_me(record, format_author_fr)
if record.is_valid:
self.logs[-1].year = record.year()
else:
self.logs[-1].year = record.year()
self.logs[-1].reject(record.msg)
continue
if not self.select_record(record):
continue
......@@ -1192,7 +1116,7 @@ class Articles(PublicationsTool):
@rtype: bool
"""
if not is_published_paper(record):
if not record.is_published():
self.logs[-1].reject(MSG_NO_EDITOR)
return False
......@@ -1343,7 +1267,7 @@ class Preprints(PublicationsTool):
"""
if is_published_paper(record):
if record.is_published():
self.logs[-1].reject(MSG_PREPRINT_IS_PAPER)
return False
......@@ -1411,7 +1335,7 @@ class Proceedings(PublicationsTool):
db = self.db
origin_proc = oai_url
origin_talk = invenio_tools.OAI_URL % (host, reference_talk)
origin_talk = OAI_URL % (host, reference_talk)
id_proc = get_id(db.publications, origin=origin_proc)
id_talk = get_id(db.publications, origin=origin_talk)
......@@ -1600,10 +1524,6 @@ class Proceedings(PublicationsTool):
if isinstance(pages, list) and len(pages) == 1:
pages = pages[0]
# try to recover missing year using the submission field
if not year:
year = submitted[0:4]
# check the publisher
id_publisher = self.check_publisher(editor)
......@@ -1625,22 +1545,10 @@ class Proceedings(PublicationsTool):
if id:
return status
# get the conference record
cds = invenio_tools.CdsSvc(host=host)
conference = get_conference_record(cds, self.marc12, record)
# conference is either a Record or an error msg
if isinstance(conference, (str, unicode)):
msg = conference
if isinstance(msg, unicode):
msg = msg.encode('utf-8')
self.logs[-1].reject(msg)
return 0
# alias
conference_dates = conference.conference_dates()
conference_title = conference.conference_title()
country = conference.conference_country()
# alias for the conference information
conference_dates = record.conference_dates()
conference_title = record.conference_title()
country = record.conference_country()
first_author = record.first_author()
# check conference country
......@@ -1679,8 +1587,8 @@ class Proceedings(PublicationsTool):
conference_dates=conference_dates,
conference_speaker=first_author,
conference_title=conference_title,
conference_town=conference.conference_town(),
conference_url=conference.conference_url(),
conference_town=record.conference_town(),
conference_url=record.conference_url(),
first_author=first_author,
id_categories=self.harvester.id_categories,
id_collaborations=id_collaboration,
......@@ -1716,7 +1624,7 @@ class Reports(PublicationsTool):
@type record: L{Record}
@param record:
@rtype: int
@rtype: int
@return: one when the record is inserted / updated in the database
zero otherwise.
......@@ -1732,19 +1640,11 @@ class Reports(PublicationsTool):
title = record.title()
year = record.year()
# protection against authors not defined
if not authors:
authors = UNKNOWN
id_status = get_id(db.status, code=UNKNOWN)
# protection against cppm authors not defined
if not authors_cppm:
authors_cppm = UNKNOWN
id_status = get_id(db.status, code=UNKNOWN)
if not first_author:
first_author = UNKNOWN
id_status = get_id(db.status, code=UNKNOWN)
# check the collaboration
id_collaboration = self.check_collaboration(record.collaboration())
......@@ -1823,7 +1723,7 @@ class Talks(PublicationsTool):
db = self.db
origin_proc = invenio_tools.OAI_URL % (host, reference_proceeding)
origin_proc = OAI_URL % (host, reference_proceeding)
origin_talk = oai_url
id_proc = get_id(db.publications, origin=origin_proc)
......@@ -1947,22 +1847,10 @@ class Talks(PublicationsTool):
if id:
return status
# get the conference record
cds = invenio_tools.CdsSvc(host=host)
conference = get_conference_record(cds, self.marc12, record)
# conference is either a Record or an error msg
if isinstance(conference, (str, unicode)):
msg = conference
if isinstance(msg, unicode):
msg = msg.encode('utf-8')
self.logs[-1].reject(msg)
return 0
# conference alias
conference_dates = conference.conference_dates()
conference_title = conference.conference_title()
country = conference.conference_country()
# alias for the conference information
conference_dates = record.conference_dates()
conference_title = record.conference_title()
country = record.conference_country()
first_author = record.first_author()
submitted = record.submitted()
title = record.title()
......@@ -1991,8 +1879,8 @@ class Talks(PublicationsTool):
conference_dates=conference_dates,
conference_speaker=first_author,
conference_title=conference_title,
conference_town=conference.conference_town(),
conference_url=conference.conference_url(),
conference_town=record.conference_town(),
conference_url=record.conference_url(),
first_author=first_author,
id_categories=self.harvester.id_categories,
id_collaborations=id_collaboration,
......
This diff is collapsed.
......@@ -2,6 +2,10 @@
HEAD
- Modify the logic of the harvester by introducing the class CheckAndFixSvc.
Validation and correction of each record are performed in a single place.
This should improve code stability and maintainability.
0.8.7.2 (Sep 2014)
- Migrate to plugin_dbui 0.6.1.7.
- More robust harvester algorithms.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment