Commit 869c3d54 authored by MEESSEN Christophe
Browse files

Refactoring of PublicationsTool

- Make parameters for PublicationsTool explicit
- Defined a factory for PublicationsTool
- PublicationsTool.process_url returns a list of marc12xmls
- PublicationsTool.__call__ receives the marc12xml as argument

Conflicts:
	controllers/harvest.py
	modules/harvest_tools.py
parent 1ef89c74
......@@ -6,7 +6,7 @@ import traceback
from gluon.storage import Storage
from harvest_tools import (format_author_fr,
family_name_fr,
get_harvester_tool,
build_harvester_tool,
PublicationsTool,
ToolException)
from invenio_tools import (CdsException,
......@@ -52,19 +52,22 @@ def free_run():
msg += T('The field "%s" is missing ...') % T(table[el].label)
return INLINE_ALERT % (T('Error'), msg)
tool_class = get_harvester_tool(selector.controller)
tool = tool_class(db,
tool = build_harvester_tool(db,
selector.id_teams,
selector.id_projects,
selector.controller,
selector.id_categories,
host=selector.host,
collections=selector.collections,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == DRY_RUN),
debug=False)
tool()
if not tool:
return INLINE_ALERT % (T('Error'), T('Select an harvester.'))
marc12xmls = tool.process_url(selector.host, selector.collections)
for xml in marc12xmls:
tool(xml)
except ToolException, e:
return T(str(e))
......@@ -219,17 +222,19 @@ def insert_marcxml():
if not tool_class:
return INLINE_ALERT % (T('Error'), T('Select a controller.'))
tool = tool_class(db,
tool = build_harvester_tool(db,
selector.id_teams,
selector.id_projects,
selector.controller,
selector.id_categories,
xml=selector.xml,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == DRY_RUN),
debug=False)
tool()
if not tool:
return INLINE_ALERT % (T('Error'), T('Select an harvester.'))
tool(selector.xml)
except ToolException, e:
return T(str(e))
......@@ -263,27 +268,27 @@ def run():
selector = Selector(virtdb.harvester_selector,
exclude_fields=('mode', 'year_start', 'year_end'))
tool_class = get_harvester_tool(selector.controller)
if not tool_class:
return INLINE_ALERT % (T('Error'), T('Select an harvester.'))
# Get the host and collections
row = selector.select(db.harvesters).first()
if not row:
raise ToolException(MSG_NO_HARVESTER)
tool = tool_class(db,
tool = build_harvester_tool(db,
selector.id_teams,
selector.id_projects,
selector.controller,
row.harvesters.id_categories,
host=row.harvesters.host,
collections=row.harvesters.collections,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == DRY_RUN),
debug=False)
tool()
if not tool:
return INLINE_ALERT % (T('Error'), T('Select an harvester.'))
marc12xmls = tool.process_url(row.harvesters.host, row.harvesters.collections)
for xml in marc12xmls:
tool(xml)
except ToolException, e:
return T(str(e))
......@@ -326,19 +331,22 @@ def run_all():
for harvester in harvesters:
tool_class = get_harvester_tool(harvester.controller)
tool = tool_class(db,
tool = build_harvester_tool(db,
harvester.id_teams,
harvester.id_projects,
harvester.controller,
harvester.id_categories,
host=harvester.host,
collections=harvester.collections,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == DRY_RUN),
debug=False)
tool()
if not tool:
return INLINE_ALERT % (T('Error'), T('Select an harvester.'))
marc12xmls = tool.process_url(harvester.host, harvester.collections)
for xml in marc12xmls:
tool(xml)
collection_logs.extend(tool.collection_logs)
logs.extend(tool.logs)
......
......@@ -175,9 +175,50 @@ def get_harvester_tool(controller):
return Tool
def learn_my_authors(db, authors=None,
id_project=None,
id_team=None,
def build_harvester_tool(db, id_team, id_project, controller, id_category,
                         year_start=None, year_end=None, dry_run=True, debug=False):
    """
    Harvester tool factory function.

    Looks up the tool class associated with C{controller} through
    C{get_harvester_tool} and instantiates it with the given parameters.
    Returns C{None} when the lookup yields C{None}, so callers must check
    the result before using it.

    @type db: gluon.dal.DAL
    @param db:

    @type id_team: int
    @param id_team: Identifier of the team in the db

    @type id_project: int
    @param id_project: Identifier of the project in the db

    @type controller: unicode
    @param controller: Type of publication (e.g. 'article', 'proceedings', ...)

    @type id_category: int
    @param id_category: Identifier of the category of publication (e.g. ACL, ACTI, ...)

    @type year_start: int
    @keyword year_start: Start year of the search (e.g. 2014)

    @type year_end: int
    @keyword year_end: End year of the search (e.g. 2015)

    @type dry_run: bool
    @keyword dry_run: True if no record is to be written to the db

    @type debug: bool
    @keyword debug: activate the debug mode

    @return: an instance of the tool class found for C{controller},
        or C{None} if there is none.
    """
    factory = get_harvester_tool(controller)

    if factory is not None:
        # Arguments are forwarded positionally, in the order declared by
        # the tool constructor.
        return factory(db, id_team, id_project, controller, id_category,
                       year_start, year_end, dry_run, debug)

    return None
def learn_my_authors(db, authors=None,
id_project=None,
id_team=None,
year=None):
"""Train the rescue list of the authors of my institute,
stored in the database, using the list C{authors} provided in argument.
......@@ -406,10 +447,13 @@ class PublicationsTool(object):
are defined by the current request.
"""
def __init__(self, db, id_team, id_project, controller, id_category, host=None, collections=None,
xml=None, year_start=None, year_end=None, dry_run=True, debug=False):
def __init__(self, db, id_team, id_project, controller, id_category,
year_start=None, year_end=None, dry_run=True, debug=False):
"""
@note see C{build_harvester_tool} factory function building C{PublicationsTools}
@type db: gluon.dal.DAL
@param db:
......@@ -425,16 +469,7 @@ class PublicationsTool(object):
@type id_category: int
@param id_category: Identifier of the category of publication (i.e. ACL, ACTI, ...)
@type host: unicode
@keyword host: Web host name to query for publication
@type collections: unicode
@keyword collections: Request string to send to the host to get the publications
@type xml: unicode
@keyword xml: marc12 xml encoding of the publication record
@type year_start: int
@keyword year_start: Start year of search (i.e. '2014')
......@@ -446,9 +481,7 @@ class PublicationsTool(object):
@type debug: bool
@param debug: activate the debug mode
The constructor expect that host and collections is provided or xml is provided instead.
"""
self.collection_logs = []
self.db = db
......@@ -456,11 +489,9 @@ class PublicationsTool(object):
self.id_project = id_project
self.controller = controller
self.id_category = id_category
self.host = host
self.collections = collections
self.xml = xml
self.year_start = year_start
self.year_end = year_end
self.dry_run = dry_run
self.dbg = debug
self.logs = []
......@@ -478,23 +509,11 @@ class PublicationsTool(object):
if not self.id_category:
raise ToolException(MSG_NO_CAT)
# xml or (host and collections) keyargs must be provided
if not ((xml and not host and not collections) or (not xml and host and collections)):
raise ToolException(MSG_MISSING_PARAMETER)
# Construct harvester Storage needed for the log
if host and collections:
self.harvester = Storage(id_teams=self.id_team,
id_projects=self.id_project,
controller=self.controller,
id_categories=self.id_category,
host=self.host,
collections=self.collections)
else:
self.harvester = Storage(id_teams=self.id_team,
id_projects=self.id_project,
controller=self.controller,
id_categories=self.id_category)
self.harvester = Storage(id_teams=self.id_team,
id_projects=self.id_project,
controller=self.controller,
id_categories=self.id_category)
# private cache for my_author rescue list
self.__par = None
......@@ -783,16 +802,34 @@ class PublicationsTool(object):
"""
return 0
def process_url(self):
def process_url(self, host, collections):
"""Retrieve the MARC XML string and launch its decoding.
@raise Exception: depending on what happen, can be StoreException,
Marc12ZException, ...
@type host: unicode
@keyword host: Web host name to query for publication
@type collections: unicode
@keyword collections: Request string to send to the host to get the publications
@rtype: int
@return: one when the record is inserted / updated in the database
zero otherwise.
"""
if self.dbg:
print "process URL search"
self.host = host
self.collections = collections
marc12xmls = []
# extend harvester for logs
self.harvester.host = host
self.harvester.collections = collections
store = InvenioStore(self.host)
......@@ -845,7 +882,7 @@ class PublicationsTool(object):
try:
xml = store.get_record(id)
self.process_xml(xml)
marc12xmls.append(xml)
except BaseException as e:
print traceback.format_exc()
......@@ -856,6 +893,8 @@ class PublicationsTool(object):
title=url))
self.logs[-1].reject(e)
return marc12xmls
def process_xml(self, xml):
"""Decode the MARC XML string and load records in the database.
......@@ -916,7 +955,7 @@ class PublicationsTool(object):
logs=self.logs)
def __call__(self, xml=None):
def __call__(self, xml):
"""Search publication in the invenio store according to criteria
and load them in the database.
......@@ -930,7 +969,7 @@ class PublicationsTool(object):
- C{Exception} if the python code crash
@type xml: unicode
@param xml: MARC XML string
@keyword xml: marc12 xml encoding of the publication record
"""
if self.dbg:
......@@ -941,17 +980,8 @@ class PublicationsTool(object):
print "get harvest parameters"
# process an XML request
if self.xml:
self.collection_logs.append(MsgCollection(found=1))
self.process_xml(self.xml)
return
# retrieve the harvester parameter in the database
# if not yet defined (free run)
# retrieve records in the store and load them in the database
self.process_url()
self.collection_logs.append(MsgCollection(found=1))
self.process_xml(xml)
class Articles(PublicationsTool):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment