Commit a132584f authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Propagate the new definition of the oai_url in the Automaton class.

parent c88a8953
......@@ -172,41 +172,65 @@ class Automaton(object):
self.logs[-1].reject(dbe.message, log_year)
return 0
def _is_record_in_db(self, rec_id, title):
def _is_record_in_db(self, title, host=None, rec_id=None, oai_url=None):
"""Return ``True`` when the record is already in the database.
The search is based on the origin field.
The search is based on the origin field and on the primary OAI.
Note:
A new log entry is created when a record is found.
Args:
rec_id (int): record identifier
title (unicode): title of the collection
title (unicode): the title of the publication.
Keyword Args:
host (unicode): the store. possible values are ``cds.cern.ch`` or
``inspirehep.net``. To be used with *rec_id*.
rec_id (int): the record identifier.
oai_url (unicode): the URL of the record in the store.
Note:
Either use *host* and *rec_id* or *oai_url*
Returns:
bool: ``True`` when a record is found, ``False`` otherwise.
Raises:
ValueError: when keyword arguments are not defined properly.
"""
db = self.db
harvester = self.harvester
# check
url = OAI_URL % (harvester.host, rec_id)
db_id = get_id(db.publications, origin=url)
# build the OAI URL
if host is not None and rec_id is not None and oai_url is None:
url = OAI_URL % (host, rec_id)
elif host is None and rec_id is None and oai_url is not None:
url = oai_url
else:
raise ValueError
# check the OAI
query = db.publications.origin.contains(url)
setrows = db(query)
if db_id is None:
if setrows.count() == 0:
return False
publication = db.publications[db_id]
# one record found
columns = [db.publications.id_categories,
db.publications.title,
db.publications.year]
publication = setrows.select(*columns).first()
# same category for the publication and the harvester
# keep the record if it is not the case
# this is required to transform a preprint into article
# Note:
# The category for the publication and the harvester have to be equal.
# However, keep the record if it is not the case.
# This is required to transform a preprint into article
if publication.id_categories != harvester.id_categories:
return False
# log
self.logs.append(Msg(harvester=self.harvester,
self.logs.append(Msg(harvester=harvester,
collection=title,
record_id=rec_id,
title=publication.title))
......@@ -323,7 +347,7 @@ class Automaton(object):
"""Get database record matching fields values defined
in the keyword arguments.
Args:
Keyword Args:
oai_url (unicode): *e.g* ``"http://cds.cern.ch/record/123456"``
year (int): the year of the publication.
......@@ -343,7 +367,9 @@ class Automaton(object):
if self.dbg:
print "get existing record by fields"
# alias
db = self.db
logs = self.logs
# origin can't be used for the search
oai_url = kwargs["oai_url"]
......@@ -355,16 +381,16 @@ class Automaton(object):
return (None, 0)
# fix origin field
ok = db.publications[rec_id].origin and \
db.publications[rec_id].origin == oai_url
publication = db.publications[rec_id]
ok = publication.origin and publication.origin == oai_url
if not ok:
if not self.dry_run:
db.publications[rec_id] = dict(origin=oai_url)
publication = dict(origin=oai_url)
self.logs[-1].modify(MSG_FIX_ORIGIN, kwargs["year"])
logs[-1].modify(MSG_FIX_ORIGIN, kwargs["year"])
return (rec_id, 1)
self.logs[-1].idle(MSG_IN_DB, kwargs["year"])
logs[-1].idle(MSG_IN_DB, kwargs["year"])
return (rec_id, 0)
def insert_record(self, record):
......@@ -447,14 +473,18 @@ class Automaton(object):
collections = re.sub(' *, *', ',', collections).split(',')
# alias
collection_logs = self.collection_logs
controller = self.controller
decode_xml = self.decode_xml
is_record_in_db = self._is_record_in_db
logs = self.logs
project = self.db.projects[self.id_project].project
# extract the list of publications from the store for each collection
# the search is performed on a range of creation dates
# if not defined, all elements are returned
#
# The method use here minimize the memory usage
# The method used here minimises the memory usage
# on the server as well as on the client side
for collection in collections:
......@@ -462,7 +492,7 @@ class Automaton(object):
# log collection information
# A collection is identified as "Project Controller collection"
title = "%s / %s / %s" % (project, controller, collection)
self.collection_logs.append(MsgCollection(title=title))
collection_logs.append(MsgCollection(title=title))
# search record in the harvester repository
kwargs = self._search_parameters(collection)
......@@ -471,12 +501,12 @@ class Automaton(object):
rec_ids = store.get_ids(**kwargs)
except Exception as error:
self.collection_logs[-1].url = store.last_search_url()
self.collection_logs[-1].error = error
collection_logs[-1].url = store.last_search_url()
collection_logs[-1].error = error
continue
self.collection_logs[-1].url = store.last_search_url()
self.collection_logs[-1].found = len(rec_ids)
collection_logs[-1].url = store.last_search_url()
collection_logs[-1].found = len(rec_ids)
if not rec_ids:
continue
......@@ -490,20 +520,20 @@ class Automaton(object):
print "\nprocessing record", rec_id
try:
if self._is_record_in_db(rec_id, title):
if is_record_in_db(title, host=host, rec_id=rec_id):
continue
xml = store.get_record(rec_id)
self.decode_xml(xml)
decode_xml(xml)
except Exception as e:
print traceback.format_exc()
url = OAI_URL % (host, rec_id)
self.logs.append(Msg(harvester=self.harvester,
collection=title,
record_id=rec_id,
title=url))
self.logs[-1].reject(e)
logs.append(Msg(harvester=self.harvester,
collection=title,
record_id=rec_id,
title=url))
logs[-1].reject(e)
def decode_xml(self, xml):
"""Decode the MARC XML string and insert records in the database.
......@@ -512,9 +542,16 @@ class Automaton(object):
xml (unicode): MARC XML string
"""
if self.dbg:
print "process xml record"
# alias
is_record_in_db = self._is_record_in_db
check_record = self.check_record
insert_record = self.insert_record
logs = self.logs
# NOTE
# BaseException and inherited class
# are caught by the previous stage
......@@ -526,25 +563,34 @@ class Automaton(object):
if self.dbg:
print "record decoded"
# reject the record using the secondary OAI
# required to cover the case:
# - scan store A. Only OAI_A is defined.
# - later scan store B. OAI_B and OAI_A are found.
ok = is_record_in_db(record.title(),
oai_url=record.secondary_oai_url())
if ok:
continue
# start the log for the record
self.logs.append(Msg(harvester=self.harvester,
collection=self.collection_logs[-1].title,
record_id=record.id(),
title=record.title()))
logs.append(Msg(harvester=self.harvester,
collection=self.collection_logs[-1].title,
record_id=record.id(),
title=record.title()))
# check that the record is well formed
# repair non-conformity as far as possible
if not self.check_record(record):
if not check_record(record):
continue
if self.dbg:
print "insert record in the database"
# insert the record in the database
self.insert_record(record)
insert_record(record)
if self.dbg:
print self.logs[-1].action.upper(), self.logs[-1].txt
print logs[-1].action.upper(), logs[-1].txt
def report(self):
"""Build the processing report.
......
# -*- coding: utf-8 -*-
"""test_reject_on_oai
"""
from gluon import current
from harvest_tools import Automaton
def test_is_record_in_db():
    """Check that ``_is_record_in_db`` finds a record by either of its OAIs.

    The records cds 1389907 and ins 939619 describe the same LHCb paper.
    It is in the database and was loaded from cds.cern.ch. The test expects
    the origin field to list the cds.cern.ch URL first, followed by the
    inspirehep.net URL.

    """
    cds_url = "http://cds.cern.ch/record/1389907"
    ins_url = "http://inspirehep.net/record/939619"
    title = "Luminosity measurement"

    db = current.globalenv['db']

    # precondition: the record is in the database with both OAIs.
    # Fail with an explicit message rather than an AttributeError on None
    # when the fixture is missing.
    query = db.publications.origin.contains(cds_url)
    row = db(query).select().first()
    assert row is not None, "no publication with origin %s" % cds_url
    assert row.origin == "%s, %s" % (cds_url, ins_url)

    # NOTE(review): the meaning of the positional arguments (7, 8,
    # u"articles", 2) is defined by Automaton.__init__, which is not
    # visible here -- confirm they match the test database.
    atm = Automaton(db, 7, 8, u"articles", 2)

    # lookup via the primary OAI, built from the host and record identifier
    assert atm._is_record_in_db(title, host="cds.cern.ch", rec_id=1389907)

    # lookup via the secondary OAI, given as a full URL
    assert atm._is_record_in_db(title, oai_url=ins_url)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment