Commit df4943f4 authored by LE GAC Renaud
Browse files

Do not reject a record with a bad OAI, recover it.

parent 6409d1e6
......@@ -6,7 +6,7 @@ import re
import traceback
from base import MSG_FIX_ORIGIN, MSG_IN_DB, ToolException
from base import MSG_FIX_ORIGIN, MSG_IN_DB, recover_oai, ToolException
from import Storage
from invenio_tools import (CheckAndFix,
......@@ -392,17 +392,17 @@ class Automaton(object):
"""Decode the xml and load it in the database.
@raise Exception: the type of exception depends on what happen:
- L{ToolException} when projet, team or category identifier
- L{ToolException} when project, team or category identifier
are not defined.
- C{StoreException} when somethings goes wrong interrogating the
- C{Marc12Exception} when somethings goes wrong decoding the XML
- C{StoreException} when something goes wrong interrogating the
- C{Marc12Exception} when something goes wrong decoding the XML
string return by the store
- C{CheckException} if the L{Record} is not valid
- C{Exception} if the python code crash
@type xml: unicode
@keyword xml: marc12 xml encoding of the publication record
@keyword xml: marc12 XML encoding of the publication record
if self.dbg:
......@@ -417,7 +417,7 @@ class Automaton(object):
def process_url(self, host, collections):
"""Retrieve the xml from the invenio store and load it in the database
"""Retrieve the XML from the invenio store and load it in the database
@raise Exception: depending on what happen, can be StoreException,
Marc12ZException, ...
......@@ -528,17 +528,14 @@ class Automaton(object):,
# reject record with undefined OAI field
# the OAI is not defined -- recover it
oai = record.oai()
if oai is None:
self.logs[-1].reject(MSG_NO_OAI, record.year())
# reject the record when the OAI is not well
match = REG_OAI.match(oai)
if not match:
self.logs[-1].reject(MSG_WELL_FORM_OAI, record.year())
# the OAI is not well --recover it
if not REG_OAI.match(oai):
# check that the record is well formed
# repair non-conformity as far as possible
......@@ -14,6 +14,10 @@ MSG_FIX_ORIGIN = "Fixed the origin field"
MSG_IN_DB = "Already in the database"
MSG_LOAD = "Load in the database"
MSG_INVALID_HOST = "Invalid host"
OAI_INVENIO = "oai:%s:%s"
def family_name_fr(full_name):
"""Extract the family name when the full name is encoded as C{J. Doe}.
......@@ -141,5 +145,28 @@ def learn_my_authors(db,
db.my_authors[] = dict(authors=', '.join(database_authors))
def recover_oai(record, host):
"""Helper function to recover the OAI identifier when it is not defined
or not well form.
@type record: Record
@param record:
if host == "":
field, subfield = u"0248", "a"
elif host == "":
field, subfield = u"909CO", "o"
raise ValueError(MSG_INVALID_HOST)
if field not in record:
record[field] = dict()
record[field][subfield] = OAI_INVENIO % (host,
class ToolException(Exception): pass
......@@ -21,18 +21,19 @@ from recordconf import RecordConf
from recordthesis import RecordThesis
DECODE_ARXIV = re.compile("arXiv:(\d{2})(\d{2})\.")
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")
# Decode submitted date: DD MMM YYYY or DD MM YYY
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")
DECODE_YYYY = re.compile("^(\d{4})$")
DECODE_YYYY = re.compile(r"^(\d{4})$")
# Decode publication reference:
# Phys. Rev. Lett. 113, 032001 (2014)
# Eur. Phys. J. C (2014) 74:2883
DECODE_REF = [re.compile("(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"),
re.compile("(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)")]
_ref1 = r"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
_ref2 = r"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
DECODE_REF = [re.compile(_ref1), re.compile(_ref2)]
MONTHS = {u'Jan':'01',
......@@ -70,11 +71,12 @@ MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
MSG_WELL_FORMED_EDITOR = "Reject editor is not well formed"
OAI_INVENIO = "oai:%s:%s"
REG_CONF_DATES_1 = re.compile("(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES_2 = re.compile("(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
re.compile("(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES = re.compile(regex.REG_CONF_DATES)
......@@ -483,7 +485,7 @@ class CheckAndFix(object):
- INVENIO: Phys. Lett. B + volume 673
- INSPIREHEP: Phys.Lett + volume B673
Standardise the answer as Phys Lett B
Standardize the answer as Phys Lett B
@note: It is recommended to call this method when erratum are removed.
......@@ -595,7 +597,8 @@ class CheckAndFix(object):
def get_my_authors(self, record, cmpFct=None):
"""Get authors of my institutes signing the record.
The information is append to the L{Record} via the attribute C{my_authors}.
The information is append to the L{Record} via the attribute
@type record: L{Record}
@param record:
......@@ -764,13 +767,15 @@ class CheckAndFix(object):
# 22 Mar 2011
m = DECODE_DD_MMM_YYYY.match(dates[i])
if m:
dates[i] = '%s-%s-%02i' % (, MONTHS[], int(
data = (, MONTHS[], int(
dates[i] = '%s-%s-%02i' % data
# 22 03 2011
m = DECODE_DD_MM_YYYY.match(dates[i])
if m:
dates[i] = '%s-%02i-%02i' % (, int(, int(
data (, int(, int(
dates[i] = '%s-%02i-%02i' % data
# 2011
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment