Commit 5cc72345 authored by LE GAC Renaud
Browse files

Update Automaton.process_url to disable multiple collections.

parent b732782b
......@@ -483,23 +483,31 @@ class Automaton(object):
self.collection_logs.append(MsgCollection(found=1))
self.decode_xml(xml)
def process_url(self, host, collections):
def process_url(self, host, collection):
"""Retrieve the XML string from the invenio store and
insert corresponding records in the database.
Args:
host (unicode): host name to query for publications, either
host (str):
host name to query for publications, either
``cds.cern.ch`` or ``inspirehep.net``.
collections (unicode): list of collections to be interrogated.
collection (str):
the collection to be interrogated.
Raises:
StoreException: when something goes wrong interrogating the
store.
Marc12Exception: when something goes wrong decoding the XML
string returned by the store.
CheckException: when the record has non-conformities.
Exception: when the python code crashes.
StoreException:
when something goes wrong interrogating the store.
Marc12Exception:
when something goes wrong decoding the XML string
returned by the store.
CheckException:
when the record has non-conformities.
Exception:
when the python code crashes.
"""
if self.dbg:
......@@ -507,13 +515,10 @@ class Automaton(object):
# extend harvester for logs
self.harvester.host = host
self.harvester.collections = collections
self.harvester.collection = collection
store = InvenioStore(host)
# list of collections
collections = re.sub(" *, *", ",", collections).split(",")
# alias
collection_logs = self.collection_logs
controller = self.controller
......@@ -522,63 +527,61 @@ class Automaton(object):
logs = self.logs
project = self.db.projects[self.id_project].project
# extract the list of publications from the store for each collection
# extract the list of publications from the store for the collection
# the search is performed on a range of creation dates
# if not defined, all elements are returned
#
# The method used here minimises the memory usage
# on the server as well as on the client side
for collection in collections:
# log collection information
# A collection is identified as "Project Controller collection"
title = "%s / %s / %s" % (project, controller, collection)
collection_logs.append(MsgCollection(title=title))
# log collection information
# A collection is identified as "Project Controller collection"
title = "%s / %s / %s" % (project, controller, collection)
collection_logs.append(MsgCollection(title=title))
# search record in the harvester repository
kwargs = self._search_parameters(collection)
# search record in the harvester repository
kwargs = self._search_parameters(collection)
try:
rec_ids = store.get_ids(**kwargs)
try:
rec_ids = store.get_ids(**kwargs)
except Exception as error:
collection_logs[-1].url = store.last_search_url()
collection_logs[-1].error = error
return
except Exception as error:
collection_logs[-1].url = store.last_search_url()
collection_logs[-1].error = error
continue
collection_logs[-1].url = store.last_search_url()
collection_logs[-1].found = len(rec_ids)
collection_logs[-1].url = store.last_search_url()
collection_logs[-1].found = len(rec_ids)
if not rec_ids:
return
if not rec_ids:
continue
if self.dbg:
print "%i records found in %s" % (len(rec_ids), collection)
if self.dbg:
print "%i records found in %s" % (len(rec_ids), collection)
for rec_id in rec_ids:
for rec_id in rec_ids:
if self.dbg:
print "\nprocessing record", rec_id
if self.dbg:
print "\nprocessing record", rec_id
try:
db_id = is_record_in_db(title, host=host, rec_id=rec_id)
if db_id:
if self.dbg:
print "record in db", rec_id, "->", db_id
continue
xml = store.get_record(rec_id)
decode_xml(xml)
except Exception as e:
print traceback.format_exc()
url = OAI_URL % (host, rec_id)
logs.append(Msg(harvester=self.harvester,
collection=title,
record_id=rec_id,
title=url))
logs[-1].reject(e)
try:
db_id = is_record_in_db(title, host=host, rec_id=rec_id)
if db_id:
if self.dbg:
print "record in db", rec_id, "->", db_id
return
xml = store.get_record(rec_id)
decode_xml(xml)
except Exception as e:
print traceback.format_exc()
url = OAI_URL % (host, rec_id)
logs.append(Msg(harvester=self.harvester,
collection=title,
record_id=rec_id,
title=url))
logs[-1].reject(e)
def decode_xml(self, xml):
"""Decode the MARC XML string and insert records in the database.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment