Commit 517343f6 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Revisit the naming of the classes in invenio and harvester tools.

Polish documentation.
parent 8280655d
......@@ -43,7 +43,7 @@ def free_run():
for el in fields:
tool.harvester[el] = selector[el]
tool.process()
tool()
except ToolException, e:
return T(str(e))
......@@ -70,7 +70,7 @@ def insert_marcxml():
return INLINE_ALERT % (T('Error'), T('Select a controller.'))
tool = tool_class(db, selector, debug=False)
tool.process()
tool()
except ToolException, e:
return T(str(e))
......@@ -107,7 +107,7 @@ def run():
return INLINE_ALERT % (T('Error'), T('Select an harvester.'))
tool = tool_class(db, selector, debug=False)
tool.process()
tool()
except ToolException, e:
return T(str(e))
......@@ -157,7 +157,7 @@ def run_all():
tool_class = get_harvester_tool(selector.controller)
tool = tool_class(db, selector, debug=False)
tool.process()
tool()
collection_logs.extend(tool.collection_logs)
logs.extend(tool.logs)
......
......@@ -15,9 +15,9 @@ import re
from gluon import current
from gluon.storage import Storage
from invenio_tools import (OAI_URL,
CdsSvc,
CheckAndFixSvc,
Marc12Svc)
CheckAndFix,
InvenioStore,
Marc12)
from plugin_dbui import (UNDEF_ID,
UNKNOWN,
get_create_id,
......@@ -31,7 +31,6 @@ MSG_FIX_ORIGIN = current.T("Fixed the origin field", lazy=False)
MSG_IN_DB = current.T("Already in the database", lazy=False)
MSG_LOAD = current.T("Load in the database", lazy=False)
MSG_MATCH = current.T("Reject the talk match a proceeding", lazy=False)
MSG_NO_AUTHOR = current.T("Reject no authors", lazy=False)
MSG_NO_CAT = current.T('Select a "category" !!!', lazy=False)
MSG_NO_CPPM_AUTHOR = current.T("Reject no CPPM authors", lazy=False)
MSG_NO_EDITOR = current.T("Reject article is not published", lazy=False)
......@@ -442,8 +441,8 @@ class PublicationsTool(object):
self.harvester = None
self.logs = []
self.check_me = CheckAndFixSvc()
self.marc12 = Marc12Svc()
self.check = CheckAndFix()
self.marc12 = Marc12()
self.selector = selector
......@@ -460,7 +459,7 @@ class PublicationsTool(object):
@rtype: dict
@return: the keys are a subset of those defined in
L{invenio_tools.CdsSvc.get_ids}.
L{invenio_tools.InvenioStore.get_ids}.
"""
selector = self.selector
......@@ -659,67 +658,17 @@ class PublicationsTool(object):
return 0
def process(self):
"""Launch the search in the invenio store according to search
criteria and load publications in the database.
@raise ToolException: when projet, team or category identifier
are not defined.
"""
selector = self.selector
if self.dbg:
print "start processing", self.__class__.__name__
print "decode request"
# protection team, project and/or category have to be defined
if not selector.id_projects:
raise ToolException(MSG_NO_PROJECT)
if not selector.id_teams:
raise ToolException(MSG_NO_TEAM)
if selector.xml and not selector.id_categories:
raise ToolException(MSG_NO_CAT)
if self.dbg:
print "get harvest parameters"
# process an XML request
if selector.xml:
self.harvester = Storage(controller=selector.controller,
id_categories=selector.id_categories,
id_projects=selector.id_projects,
id_teams=selector.id_teams)
self.collection_logs.append(MsgCollection(found=1))
self.process_xml(selector.xml)
return
# retrieve the harvester parameter in the database
# if not yet defined (free run)
if not self.harvester:
row = selector.select(self.db.harvesters).first()
if not row:
raise ToolException(MSG_NO_HARVESTER)
self.harvester = row.harvesters
# retrieve records in the store and load them in the database
self.process_url()
def process_url(self):
"""Retrieve the MARC XML records from the store
and load them in the database.
"""Retrieve the MARC XML string and launch its decoding.
@raise Exception: depending on what happens, can be StoreException,
Marc12Exception, ...
"""
if self.dbg:
print "process URL search"
cds = CdsSvc(host=self.harvester.host)
store = InvenioStore(self.harvester.host)
# list of collections
collections = self.harvester.collections
......@@ -747,14 +696,14 @@ class PublicationsTool(object):
kwargs = self._search_parameters(collection)
try:
ids = cds.get_ids(**kwargs)
ids = store.get_ids(**kwargs)
except Exception as error:
self.collection_logs[-1].url = cds.last_search_url()
self.collection_logs[-1].url = store.last_search_url()
self.collection_logs[-1].error = error
continue
self.collection_logs[-1].url = cds.last_search_url()
self.collection_logs[-1].url = store.last_search_url()
self.collection_logs[-1].found = len(ids)
if not ids:
......@@ -769,7 +718,7 @@ class PublicationsTool(object):
print "\nprocessing record", id
try:
xml = cds.get_record(id)
xml = store.get_record(id)
self.process_xml(xml)
except Exception as error:
......@@ -779,7 +728,6 @@ class PublicationsTool(object):
self.logs[-1].reject(error)
return
def process_xml(self, xml):
"""Decode the MARC XML string and load records in the database.
......@@ -791,7 +739,7 @@ class PublicationsTool(object):
if self.dbg:
print "process xml record"
li = self.marc12.process(xml)
li = self.marc12(xml)
for record in li:
......@@ -803,7 +751,7 @@ class PublicationsTool(object):
self.logs[-1].title = record.title()
self.logs[-1].collection = self.collection_logs[-1].title
self.check_me(record, format_author_fr)
self.check(record, format_author_fr)
if record.is_valid:
self.logs[-1].year = record.year()
......@@ -845,6 +793,64 @@ class PublicationsTool(object):
selector=self.selector)
def __call__(self):
"""Search publication in the invenio store according to criteria
and load them in the database.
@raise Exception: the type of exception depends on what happens:
- L{ToolException} when the project, team or category identifier
is not defined.
- C{StoreException} when something goes wrong interrogating the store.
- C{Marc12Exception} when something goes wrong decoding the XML
string returned by the store.
- C{CheckException} if the L{Record} is not valid.
- C{Exception} if the python code crashes.
"""
selector = self.selector
if self.dbg:
print "start processing", self.__class__.__name__
print "decode request"
# protection team, project and/or category have to be defined
if not selector.id_projects:
raise ToolException(MSG_NO_PROJECT)
if not selector.id_teams:
raise ToolException(MSG_NO_TEAM)
if selector.xml and not selector.id_categories:
raise ToolException(MSG_NO_CAT)
if self.dbg:
print "get harvest parameters"
# process an XML request
if selector.xml:
self.harvester = Storage(controller=selector.controller,
id_categories=selector.id_categories,
id_projects=selector.id_projects,
id_teams=selector.id_teams)
self.collection_logs.append(MsgCollection(found=1))
self.process_xml(selector.xml)
return
# retrieve the harvester parameter in the database
# if not yet defined (free run)
if not self.harvester:
row = selector.select(self.db.harvesters).first()
if not row:
raise ToolException(MSG_NO_HARVESTER)
self.harvester = row.harvesters
# retrieve records in the store and load them in the database
self.process_url()
class Articles(PublicationsTool):
"""Publications tool for articles.
......@@ -1213,11 +1219,6 @@ class Preprints(PublicationsTool):
# check the collaboration
id_collaboration = self.check_collaboration(record.collaboration())
# Protection to only keep preprints with authors
if not first_author:
self.logs[-1].reject(MSG_NO_AUTHOR)
return 0
# check against preprint or article already published
id, status = self.check_by_origin(oai_url=oai_url)
if id:
......@@ -1271,11 +1272,11 @@ class Preprints(PublicationsTool):
self.logs[-1].reject(MSG_PREPRINT_IS_PAPER)
return False
if record.is_conference_record():
if record.is_conference_data():
self.logs[-1].reject(MSG_PREPRINT_IS_CONFERENCE)
return False
if record.is_thesis_record():
if record.is_thesis():
self.logs[-1].reject(MSG_PREPRINT_IS_THESIS)
return False
......@@ -1971,7 +1972,7 @@ class Thesis(PublicationsTool):
@param record:
"""
if not record.is_thesis_record():
if not record.is_thesis():
self.logs[-1].reject(MSG_NO_THESIS)
return False
......
......@@ -32,8 +32,8 @@ CDS_SEARCH_KEYS = ('req', 'cc', 'c', 'ec', 'p', 'f', 'rg', 'sf', 'so', 'sp',
'verbose', 'ap', 'ln', 'ec')
# Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_SUBMITTED_MMM = re.compile(r"(\d{1,2}) +([A-Za-z]{3}) +(\d{4})")
DECODE_SUBMITTED_MM = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) +([A-Za-z]{3}) +(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")
# Decode publication reference:
# Phys. Rev. Lett. 113, 032001 (2014)
......@@ -121,12 +121,13 @@ def get_origin_data(xml):
return (m.group(1), m.group(2))
class CheckAndFixSvc(object):
"""Service to check and fix the Marc12 record:
- Check if the record is valid and mark invalid one.
- Detailed information on conference are added.
- Fix as far as possible inconsistencies.
class CheckAndFix(object):
"""Tool to check and fix the content of the Marc12
record:
- Check the validity of the record.
- Conference information is added for talks and proceedings.
- Fix as far as possible inconsistencies and stupid mistakes.
"""
def _check_authors(self, record):
......@@ -217,6 +218,10 @@ class CheckAndFixSvc(object):
@raise CheckException:
"""
# no submitted date for thesis only a defence date
if record.is_thesis():
return
if "269" not in record:
raise CheckException(MSG_NO_DATE)
......@@ -233,12 +238,12 @@ class CheckAndFixSvc(object):
date = dates[i]["c"]
m = DECODE_SUBMITTED_MMM.match(date)
m = DECODE_DD_MMM_YYYY.match(date)
if m:
dates[i]["c"] = '%s-%s-%02i' % (m.group(3), MONTHS[m.group(2)], int(m.group(1)))
continue
m = DECODE_SUBMITTED_MM.match(date)
m = DECODE_DD_MM_YYYY.match(date)
if m:
dates[i]["c"] = '%s-%02i-%02i' % (m.group(3), int(m.group(2)), int(m.group(1)))
continue
......@@ -416,7 +421,7 @@ class CheckAndFixSvc(object):
else:
raise CheckException(MSG_NO_REF)
def _format_authors(self, record, func):
"""Format the author names using the function func.
......@@ -500,7 +505,7 @@ class CheckAndFixSvc(object):
@param host:
@type id: unicode
@parapm id:
@param id:
@type key: unicode
@param key:
......@@ -511,13 +516,13 @@ class CheckAndFixSvc(object):
@raise CheckException:
"""
cds = CdsSvc(host)
marc12 = Marc12Svc()
cds = InvenioStore(host)
marc12 = Marc12()
# search the conference by id the preferred method
if id:
xml = cds.get_record(id)
for conference in marc12.process(xml):
for conference in marc12(xml):
if conference.id() == id:
return conference
......@@ -536,7 +541,7 @@ class CheckAndFixSvc(object):
for id in ids:
xml = cds.get_record(id)
for conference in marc12.process(xml):
for conference in marc12(xml):
if conference.conference_key() == key:
return conference
......@@ -546,10 +551,10 @@ class CheckAndFixSvc(object):
def _oldest_year(self, li):
"""Helper function.
@type li: list or string
@type li: list or str
@param li: list of years
@rtype: string
@rtype: str
@return: the oldest year or empty string when not defined
"""
......@@ -602,14 +607,17 @@ class CheckAndFixSvc(object):
record.is_valid = False
record.msg = "Crash %s" % e
print traceback.format_exc()
print traceback.format_stack()
class CdsSvc(object):
"""Service to interrogate U{invenio<http://invenio-software.org/>} store.
class InvenioStore(object):
"""Class to dialogue with U{invenio<http://invenio-software.org/>} store.
In the dialogue between CdsSvc and the invenio store, the request is provided by
an URL while the response is an XML string
compliant with the U{MARC<http://www.loc.gov/marc/>} standard.
In the dialogue:
- the request is an URL
- the response is an XML string which is compliant with the
U{MARC<http://www.loc.gov/marc/>} standard.
"""
......@@ -784,7 +792,7 @@ class CdsSvc(object):
@rtype: unicode
@return: the XML string is compliant with
the U{MARC<http://www.loc.gov/marc/>} standard.
Use L{Marc12Svc.process} to decode it.
Use L{Marc12.__call__} to decode it.
@raise CdsException: when the server returns an HTTP error.
......@@ -1074,11 +1082,11 @@ class CdsSvc(object):
so=so)
class Marc12Svc(object):
"""Service to decode record string encoded with the
class Marc12(object):
"""Decode the XML string encoded with the
U{MARC<http://www.loc.gov/marc>} format.
The main method L{process} analyses the XML string
The main method L{__call__} analyses the XML string
which has the following structure::
<?xml version="1.0" encoding="UTF-8"?>
......@@ -1256,10 +1264,8 @@ class Marc12Svc(object):
return True
def process(self, xml, filter=None, func=None):
"""Transform the I{<record>} nodes of the XML string
into a list of L{Record}.
def __call__(self, xml, filter=None, func=None):
"""Transform the XML string into a list of L{Record}.
@type xml: unicode
@param xml: the XML string has the following structure::
......@@ -1747,8 +1753,8 @@ class Record(dict):
return True
def is_thesis_record(self):
"""C{True} when the record is a thesis.
def is_thesis(self):
"""C{True} when the record corresponds to a thesis.
@rtype: bool
@return:
......@@ -2182,8 +2188,8 @@ def print_talk(record):
if __name__ == "__main__":
csv = CdsSvc()
msv = Marc12Svc()
csv = InvenioStore()
msv = Marc12()
# papers
xml = csv.search_year('LHCb Papers', '2010', rg=100)
......
......@@ -110,7 +110,7 @@ def highlight_cppm_speaker(value, template, record):
def remove_undef(value, template, record):
"""Remove the L{plugin_dbui.UNDEF} string.
"""Remove the C{UNDEF} string.
@type value: unicode
@param value: the current string representing the record
......
......@@ -2,9 +2,11 @@
HEAD
- Modify the logic of the harvester by introducing the class CheckAndFixSvc.
- Consolidate harvesters software.
- Modify the logic of the harvester by introducing the class CheckAndFix.
Validation and correction of each record are performed in only one place.
Should improve code stability and maintenance.
- Review class naming of the invenio and harvester tools modules.
0.8.7.2 (Sep 2014)
- Migrate to plugin_dbui 0.6.1.7.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment