Commit df4943f4 authored by LE GAC Renaud
Browse files

Do not reject a record with a bad OAI, recover it.

parent 6409d1e6
...@@ -6,7 +6,7 @@ import re ...@@ -6,7 +6,7 @@ import re
import traceback import traceback
from base import MSG_FIX_ORIGIN, MSG_IN_DB, ToolException from base import MSG_FIX_ORIGIN, MSG_IN_DB, recover_oai, ToolException
from gluon.storage import Storage from gluon.storage import Storage
from invenio_tools import (CheckAndFix, from invenio_tools import (CheckAndFix,
InvenioStore, InvenioStore,
...@@ -392,17 +392,17 @@ class Automaton(object): ...@@ -392,17 +392,17 @@ class Automaton(object):
"""Decode the xml and load it in the database. """Decode the xml and load it in the database.
@raise Exception: the type of exception depends on what happen: @raise Exception: the type of exception depends on what happen:
- L{ToolException} when projet, team or category identifier - L{ToolException} when project, team or category identifier
are not defined. are not defined.
- C{StoreException} when somethings goes wrong interrogating the - C{StoreException} when something goes wrong interrogating the
store. store.
- C{Marc12Exception} when somethings goes wrong decoding the XML - C{Marc12Exception} when something goes wrong decoding the XML
string return by the store string return by the store
- C{CheckException} if the L{Record} is not valid - C{CheckException} if the L{Record} is not valid
- C{Exception} if the python code crash - C{Exception} if the python code crash
@type xml: unicode @type xml: unicode
@keyword xml: marc12 xml encoding of the publication record @keyword xml: marc12 XML encoding of the publication record
""" """
if self.dbg: if self.dbg:
...@@ -417,7 +417,7 @@ class Automaton(object): ...@@ -417,7 +417,7 @@ class Automaton(object):
self.decode_xml(xml) self.decode_xml(xml)
def process_url(self, host, collections): def process_url(self, host, collections):
"""Retrieve the xml from the invenio store and load it in the database """Retrieve the XML from the invenio store and load it in the database
@raise Exception: depending on what happen, can be StoreException, @raise Exception: depending on what happen, can be StoreException,
Marc12ZException, ... Marc12ZException, ...
...@@ -528,17 +528,14 @@ class Automaton(object): ...@@ -528,17 +528,14 @@ class Automaton(object):
record_id=record.id(), record_id=record.id(),
title=record.title())) title=record.title()))
# reject record with undefined OAI field # the OAI is not defined -- recover it
oai = record.oai() oai = record.oai()
if oai is None: if oai is None:
self.logs[-1].reject(MSG_NO_OAI, record.year()) recover_oai(record, self.harvester.host)
continue
# reject the record when the OAI is not well # the OAI is not well --recover it
match = REG_OAI.match(oai) if not REG_OAI.match(oai):
if not match: recover_oai(record, self.harvester.host)
self.logs[-1].reject(MSG_WELL_FORM_OAI, record.year())
continue
# check that the record is well formed # check that the record is well formed
# repair non-conformity as far as possible # repair non-conformity as far as possible
......
...@@ -14,6 +14,10 @@ MSG_FIX_ORIGIN = "Fixed the origin field" ...@@ -14,6 +14,10 @@ MSG_FIX_ORIGIN = "Fixed the origin field"
MSG_IN_DB = "Already in the database" MSG_IN_DB = "Already in the database"
MSG_LOAD = "Load in the database" MSG_LOAD = "Load in the database"
# Message of the ValueError raised by recover_oai for an unsupported host
MSG_INVALID_HOST = "Invalid host"
# Template of the rebuilt OAI identifier, filled with (host, record id)
OAI_INVENIO = "oai:%s:%s"
def family_name_fr(full_name): def family_name_fr(full_name):
"""Extract the family name when the full name is encoded as C{J. Doe}. """Extract the family name when the full name is encoded as C{J. Doe}.
...@@ -141,5 +145,28 @@ def learn_my_authors(db, ...@@ -141,5 +145,28 @@ def learn_my_authors(db,
db.my_authors[row.id] = dict(authors=', '.join(database_authors)) db.my_authors[row.id] = dict(authors=', '.join(database_authors))
def recover_oai(record, host):
    """Rebuild the OAI identifier of a record and store it in the record.

    To be used when the OAI is not defined or not well formed.
    The identifier is built from the template C{OAI_INVENIO} using the
    host and the record identifier. It is written in the field / subfield
    associated to the host, the field being created when it is missing.

    @type record: Record
    @param record: the record to be repaired. It is modified in place.

    @type host: string
    @param host: the store host, either C{cds.cern.ch}
    or C{inspirehep.net}.

    @raise ValueError: when the host is not one of the supported stores.

    """
    # (host, field, subfield) where the OAI is stored for each known store
    locations = (("cds.cern.ch", u"0248", "a"),
                 ("inspirehep.net", u"909CO", "o"))

    for known_host, field, subfield in locations:
        if host == known_host:
            break
    else:
        # none of the known stores matches
        raise ValueError(MSG_INVALID_HOST)

    if field not in record:
        record[field] = {}

    record[field][subfield] = OAI_INVENIO % (host, record.id())
class ToolException(Exception): pass class ToolException(Exception): pass
...@@ -21,18 +21,19 @@ from recordconf import RecordConf ...@@ -21,18 +21,19 @@ from recordconf import RecordConf
from recordthesis import RecordThesis from recordthesis import RecordThesis
DECODE_ARXIV = re.compile("arXiv:(\d{2})(\d{2})\.") DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")
# Decode submitted date: DD MMM YYYY or DD MM YYY # Decode submitted date: DD MMM YYYY or DD MM YYY
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})") DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})") DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")
DECODE_YYYY = re.compile("^(\d{4})$") DECODE_YYYY = re.compile(r"^(\d{4})$")
# Decode publication reference: # Decode publication reference:
# Phys. Rev. Lett. 113, 032001 (2014) # Phys. Rev. Lett. 113, 032001 (2014)
# Eur. Phys. J. C (2014) 74:2883 # Eur. Phys. J. C (2014) 74:2883
DECODE_REF = [re.compile("(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"), _ref1 = r"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
re.compile("(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)")] _ref2 = r"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
DECODE_REF = [re.compile(_ref1), re.compile(_ref2)]
MONTHS = {u'Jan':'01', MONTHS = {u'Jan':'01',
u'Feb':'02', u'Feb':'02',
...@@ -70,11 +71,12 @@ MSG_WELL_FORMED_DATE = "Reject submission date is not well formed" ...@@ -70,11 +71,12 @@ MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
MSG_WELL_FORMED_EDITOR = "Reject editor is not well formed" MSG_WELL_FORMED_EDITOR = "Reject editor is not well formed"
OAI_INVENIO = "oai:%s:%s"
REG_COLLABORATION = re.compile(regex.REG_COLLABORATION) REG_COLLABORATION = re.compile(regex.REG_COLLABORATION)
REG_CONF_DATES_1 = re.compile("(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})") REG_CONF_DATES_1 = re.compile("(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES_2 = re.compile("(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES_2 = \
re.compile("(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES = re.compile(regex.REG_CONF_DATES) REG_CONF_DATES = re.compile(regex.REG_CONF_DATES)
REG_SUBMITTED = re.compile(regex.REG_SUBMITTED) REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)
...@@ -483,7 +485,7 @@ class CheckAndFix(object): ...@@ -483,7 +485,7 @@ class CheckAndFix(object):
- INVENIO: Phys. Lett. B + volume 673 - INVENIO: Phys. Lett. B + volume 673
- INSPIREHEP: Phys.Lett + volume B673 - INSPIREHEP: Phys.Lett + volume B673
Standardise the answer as Phys Lett B Standardize the answer as Phys Lett B
@note: It is recommended to call this method when erratum are removed. @note: It is recommended to call this method when erratum are removed.
...@@ -595,7 +597,8 @@ class CheckAndFix(object): ...@@ -595,7 +597,8 @@ class CheckAndFix(object):
def get_my_authors(self, record, cmpFct=None): def get_my_authors(self, record, cmpFct=None):
"""Get authors of my institutes signing the record. """Get authors of my institutes signing the record.
The information is append to the L{Record} via the attribute C{my_authors}. The information is append to the L{Record} via the attribute
C{my_authors}.
@type record: L{Record} @type record: L{Record}
@param record: @param record:
...@@ -764,13 +767,15 @@ class CheckAndFix(object): ...@@ -764,13 +767,15 @@ class CheckAndFix(object):
# 22 Mar 2011 # 22 Mar 2011
m = DECODE_DD_MMM_YYYY.match(dates[i]) m = DECODE_DD_MMM_YYYY.match(dates[i])
if m: if m:
dates[i] = '%s-%s-%02i' % (m.group(3), MONTHS[m.group(2)], int(m.group(1))) data = (m.group(3), MONTHS[m.group(2)], int(m.group(1)))
dates[i] = '%s-%s-%02i' % data
continue continue
# 22 03 2011 # 22 03 2011
m = DECODE_DD_MM_YYYY.match(dates[i]) m = DECODE_DD_MM_YYYY.match(dates[i])
if m: if m:
dates[i] = '%s-%02i-%02i' % (m.group(3), int(m.group(2)), int(m.group(1))) data = (m.group(3), int(m.group(2)), int(m.group(1)))
dates[i] = '%s-%02i-%02i' % data
continue continue
# 2011 # 2011
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment