Commit 357f0db5 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Merge branch '92-harvesters.collections' into 'master'

Resolve "Disable multiple collections in the field harvesters.collections"

Closes #92

See merge request !90
parents b732782b 8878da32
......@@ -483,23 +483,31 @@ class Automaton(object):
self.collection_logs.append(MsgCollection(found=1))
self.decode_xml(xml)
def process_url(self, host, collections):
def process_url(self, host, collection):
"""Retrieve the XML string from the invenio store and
insert corresponding records in the database.
Args:
host (unicode): host name to query for publications, either
host (str):
host name to query for publications, either
``cds.cern.ch`` or ``inspirehep.net``.
collections (unicode): list of collection to be interrogated.
collection (str):
the collection to be interrogated.
Raises:
StoreException: when something goes wrong interrogating the
store.
Marc12Exception: when something goes wrong decoding the XML
string return by the store.
CheckException: when the record has non-conformities.
Exception: when the python code crashes.
StoreException:
when something goes wrong interrogating the store.
Marc12Exception:
when something goes wrong decoding the XML string
returned by the store.
CheckException:
when the record has non-conformities.
Exception:
when the python code crashes.
"""
if self.dbg:
......@@ -507,13 +515,10 @@ class Automaton(object):
# extend harvester for logs
self.harvester.host = host
self.harvester.collections = collections
self.harvester.collection = collection
store = InvenioStore(host)
# list of collections
collections = re.sub(" *, *", ",", collections).split(",")
# alias
collection_logs = self.collection_logs
controller = self.controller
......@@ -522,63 +527,61 @@ class Automaton(object):
logs = self.logs
project = self.db.projects[self.id_project].project
# extract the list of publications from the store for each collection
# extract the list of publications from the store for the collection
# the search is performed on a range of creation dates
# if not defined all elements are returned
#
# The method used here minimises the memory usage
# on the server as well as on the client side
for collection in collections:
# log collection information
# A collection is identified as "Project Controller collection"
title = "%s / %s / %s" % (project, controller, collection)
collection_logs.append(MsgCollection(title=title))
# log collection information
# A collection is identified as "Project Controller collection"
title = "%s / %s / %s" % (project, controller, collection)
collection_logs.append(MsgCollection(title=title))
# search record in the harvester repository
kwargs = self._search_parameters(collection)
# search record in the harvester repository
kwargs = self._search_parameters(collection)
try:
rec_ids = store.get_ids(**kwargs)
try:
rec_ids = store.get_ids(**kwargs)
except Exception as error:
collection_logs[-1].url = store.last_search_url()
collection_logs[-1].error = error
return
except Exception as error:
collection_logs[-1].url = store.last_search_url()
collection_logs[-1].error = error
continue
collection_logs[-1].url = store.last_search_url()
collection_logs[-1].found = len(rec_ids)
collection_logs[-1].url = store.last_search_url()
collection_logs[-1].found = len(rec_ids)
if not rec_ids:
return
if not rec_ids:
continue
if self.dbg:
print "%i records found in %s" % (len(rec_ids), collection)
if self.dbg:
print "%i records found in %s" % (len(rec_ids), collection)
for rec_id in rec_ids:
for rec_id in rec_ids:
if self.dbg:
print "\nprocessing record", rec_id
if self.dbg:
print "\nprocessing record", rec_id
try:
db_id = is_record_in_db(title, host=host, rec_id=rec_id)
if db_id:
if self.dbg:
print "record in db", rec_id, "->", db_id
continue
xml = store.get_record(rec_id)
decode_xml(xml)
except Exception as e:
print traceback.format_exc()
url = OAI_URL % (host, rec_id)
logs.append(Msg(harvester=self.harvester,
collection=title,
record_id=rec_id,
title=url))
logs[-1].reject(e)
try:
db_id = is_record_in_db(title, host=host, rec_id=rec_id)
if db_id:
if self.dbg:
print "record in db", rec_id, "->", db_id
return
xml = store.get_record(rec_id)
decode_xml(xml)
except Exception as e:
print traceback.format_exc()
url = OAI_URL % (host, rec_id)
logs.append(Msg(harvester=self.harvester,
collection=title,
record_id=rec_id,
title=url))
logs[-1].reject(e)
def decode_xml(self, xml):
"""Decode the MARC XML string and insert records in the database.
......
......@@ -150,8 +150,8 @@ class Harvester(object):
"""
tp_collections = \
T("List of collections separated by comma: "
"LHCb Papers, LHCb Talks")
T("the collection to be interrogated, e.g. "
"'LHCb Papers' or 'find cn lhcb and tc p and not tc c'.")
tp_host = \
T("Address of the invenio store where the search is performed.")
......
# -*- coding: utf-8 -*-
""" NAME
fix_harvesters_collections
SYNOPSIS
fix_harvesters_collections [options]
DESCRIPTION
Up to version 0.9.6.5, the field collections could contain a list
of collections separated by commas. This rule conflicts with
defining a collection using an author name, since an author name can
also contain a comma.
In version 0.9.6.6, it has been decided that the collections field
contains the definition of only one collection.
This script migrates the database from the 0.9.6.5 to the 0.9.6.6 approach.
OPTIONS
EXAMPLE
> cd ...limbra/scripts
> run -S test_limbra script fix_harvesters_collections.py
> run loop fix_harvesters_collections.py
> ...
AUTHOR
R. Le Gac -- Oct 2017
"""
if __name__ == "__main__":
import sys
# scan the harvesters table
for row in db(db.harvesters).iterselect():
collections = row.collections
if row.host == "cds.cern.ch" and "," in collections:
print "\n\tsplit:", collections
data = row.as_dict()
del data["id"]
# insert new harvesters
for elt in collections.split(","):
print "\t\tinsert →", elt.strip()
data["collections"] = elt.strip()
db.harvesters.insert(**data)
# delete old harvesters
print "\t\tdelete →", row.id
del db.harvesters[row.id]
# commit change
rep = raw_input("Commit database changes [y/N]: ")
if rep == 'y':
db.commit()
# exit gently
sys.exit(0)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment