Commit d4754c1c authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Merge branch '30-conference-decoding' into 'master'

30 conference decoding

* Major redesign of the Marc12 decoding section.
* Introduce the iterator `IterRecord`.
* The Marc12 service returns `RecordConf`, `RecordInst`, `RecordPubli` or `RecordThesis`, depending on the content of the record. It can describe an institute, a publication, a conference talk/proceeding or a thesis.
* Improved method `Record.oai()`.
* `CheckAndFix` as well as tests have been adapted to the new class type.
* Close #30.

See merge request !24
parents 9ffd73ff f65f458f
......@@ -6,7 +6,7 @@ import traceback
from base import family_name_fr, MSG_CRASH, MSG_LOAD
from invenio_tools import CheckException
from invenio_tools import CheckException, RecordConf, RecordThesis
from publicationstool import PublicationsTool
from plugin_dbui import UNDEF_ID
......@@ -100,11 +100,11 @@ class Preprints(PublicationsTool):
self.logs[-1].reject(MSG_PREPRINT_IS_PAPER, record.year())
return False
if record.is_conference_data():
if isinstance(record, RecordConf):
self.logs[-1].reject(MSG_PREPRINT_IS_CONFERENCE, record.year())
return False
if record.is_thesis():
if isinstance(record, RecordThesis):
self.logs[-1].reject(MSG_PREPRINT_IS_THESIS, record.year())
return False
......
......@@ -7,7 +7,7 @@ import traceback
from base import family_name_fr, MSG_CRASH, MSG_LOAD
from invenio_tools import CheckException
from invenio_tools import CheckException, RecordThesis
from publicationstool import PublicationsTool
from plugin_dbui import get_id, UNDEF_ID
......@@ -113,7 +113,7 @@ class Thesis(PublicationsTool):
if self.dbg:
print "select thesis record"
if record.is_thesis():
if isinstance(record, RecordThesis):
return True
self.logs[-1].reject(MSG_NO_THESIS, record.year())
......
......@@ -3,19 +3,29 @@
@note: details on the invenio API at U{http://invenio-software.org/}
"""
from base import (is_institute,
from base import (ARXIV,
ARXIV_PDF,
is_conference,
is_institute,
is_thesis,
OAI_URL,
REG_ARXIV_NUMBER,
REG_OAI,
REG_YEAR)
REG_YEAR,
THESIS_DIR)
from exception import (CdsException,
CheckException,
InstituteException,
Marc12Exception,
RecordException,
XmlException)
from checkandfix import CheckAndFix, load_record
from institute import Institute
from inveniostore import InvenioStore
from iterrecord import IterRecord
from marc12 import Marc12
from record import Record
from recordconf import RecordConf
from recordinst import RecordInst
from recordpubli import RecordPubli
from recordthesis import RecordThesis
......@@ -4,14 +4,45 @@
"""
import re
ARXIV = "arXiv"
ARXIV_PDF = "http://arxiv.org/pdf/"
OAI_URL = "http://%s/record/%s"
REG_ARXIV_NUMBER = re.compile("\d+\.\d+")
REG_OAI = re.compile('oai:([a-z\.]+):([\d]+)')
REG_YEAR = re.compile("(\d{4})")
THESIS_DIR = u"dir."
def is_conference(record):
    """Tell whether the record is a publication related to a conference.

    Args:
        record (Record): the MARC record to be tested. It has to provide
            the ``in`` operator on MARC fields as well as the methods
            ``host()`` and ``_get(field, subfield)``.

    Return:
        bool: true when the record contains the MARC field 111 or,
            failing that, a non-empty conference key.

    """
    # the conference block (111) is embedded in the record
    if u"111" in record:
        return True

    # otherwise rely on the conference key.
    # Its location depends on the store:
    # cds.cern.ch (962, n) and inspirehep.net (773, w).
    if record.host().startswith("cds"):
        location = (u"962", "n")
    else:
        location = (u"773", "w")

    return len(record._get(*location)) > 0
def is_institute(record):
""" True when the record describe an institute.
"""True when the record describes an institute.
Args:
record (Record):
......@@ -37,3 +68,18 @@ def is_institute(record):
return True
return False
def is_thesis(record):
    """Tell whether the record describes a thesis.

    Args:
        record (Record): the MARC record to be tested. It has to provide
            the method ``_get(field, subfield, force_list=True)``.

    Return:
        bool: true when the string 'THESIS' appears in the values of the
            MARC field 980, subfield a.

    """
    # the values are glued together in order to scan them in one go
    return 'THESIS' in ", ".join(record._get(u"980", "a", force_list=True))
......@@ -12,6 +12,8 @@ from gluon import current
from inveniostore import InvenioStore
from marc12 import Marc12
from plugin_dbui import get_id
from recordconf import RecordConf
from recordthesis import RecordThesis
DECODE_ARXIV = re.compile("arXiv:(\d{2})(\d{2})\.")
......@@ -44,7 +46,6 @@ MONTHS = {u'Jan':'01',
u'Dec':'12'}
MSG_NO_AUTHOR = "Reject no author(s)"
MSG_NO_CONF = "Reject no conference information"
MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_DATE = "Reject no submission date"
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
......@@ -68,7 +69,6 @@ MSG_WELL_FORMED_OAI = "Reject OAI is not well formed"
OAI_INVENIO = "oai:%s:%s"
REG_COLLABORATION = re.compile(regex.REG_COLLABORATION)
REG_CONF = re.compile("^C\d+-\d+-\d+(\.\d+)?$")
REG_CONF_DATES_1 = re.compile("(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES_2 = re.compile("(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES = re.compile(regex.REG_CONF_DATES)
......@@ -106,56 +106,6 @@ class CheckAndFix(object):
Most of the method raise the CheckException when something went wrong.
"""
def _get_conference(self, host, id, key):
"""Get the conference data associated to the record.
The conference is searched in the invenio store, first by its id
and then, when that search is not successful, by its key.
@type host: unicode
@param host: address of the invenio store, given to L{InvenioStore}.
@type id: unicode
@param id: record identifier of the conference in the store.
It is the preferred way to find the conference and it can be empty.
@type key: unicode
@param key: the conference key. It is used when the search by id
did not return the conference and it can be empty.
@rtype: L{Record}
@return: The conference record
@raise CheckException: no conference corresponds to the id or to
the key (C{MSG_NO_CONF}).
"""
cds = InvenioStore(host)
marc12 = Marc12()
# search the conference by id, which is the preferred method.
# keep the record only when its identifier is the requested one.
if id:
xml = cds.get_record(id)
for conference in marc12(xml):
if conference.id() == id:
return conference
# search the conference by key if the previous method failed.
# the method depends on the store:
# - for inspirehep the '/' of the key are replaced by '-'
# - a key matching REG_CONF is searched in the collection 'Conferences'
#   using the MARC field 111__g
# NOTE(review): the indentation of this block is not visible here,
# the nesting of the REG_CONF test and of the else has to be confirmed.
if key:
ids = []
if cds._host.startswith('inspirehep'):
key = key.replace('/', '-')
if REG_CONF.match(key):
ids = cds.get_ids(cc='Conferences', p='111__g:%s' % key)
else:
ids = cds.get_ids(p=key)
# scan the candidates and keep the one with the requested key
# (the loop variable re-uses the name of the argument id)
for id in ids:
xml = cds.get_record(id)
for conference in marc12(xml):
if conference.conference_key() == key:
return conference
# both searches failed
raise CheckException(MSG_NO_CONF)
def _recover_submitted(self, record):
"""Recover submitted date using conference, preprint or thesis
information.
......@@ -168,7 +118,7 @@ class CheckAndFix(object):
"""
val = u''
if record.is_conference_data():
if isinstance(record, RecordConf):
# INSPIREHEP start date encoded as 2014-12-31
if "x" in record["111"]:
......@@ -179,7 +129,7 @@ class CheckAndFix(object):
val = record["111"]["z"]
val = "%s-%s-%s" % (val[0:4], val[4:6], val[6:8])
elif record.is_thesis():
elif isinstance(record, RecordThesis):
val = record.these_defense()
else:
......@@ -338,10 +288,7 @@ class CheckAndFix(object):
raise CheckException(MSG_WELL_FORMED_COLLABORATION)
def conference(self, record):
"""Get the conference data associated to a talk/proceeding and
push them in the record.
@note: the conference is looks by its key.
"""check country and conference date
@type record: L{Record}
@param record:
......@@ -350,31 +297,9 @@ class CheckAndFix(object):
"""
# conference information are available, i.e proceeding
if "111" in record:
if not isinstance(record, RecordConf):
return
# alias
host = record.host()
key = record.reference_conference_key()
# for talk or proceeding a key is always defined
if not key:
raise CheckException(MSG_NO_CONF)
# get conference information
id = record.reference_conference_id()
conference = self._get_conference(host, id, key)
# protection id can be a reference to other object like book
if "111" not in conference:
return
# copy conference information in the current record
# the conference URL is in 8564u
record[u"111"] = conference["111"]
if "8564" in conference:
record[u"8564"] = conference["8564"]
# check country information (all valid countries have been enter once)
db = current.globalenv['db']
id = get_id(db.countries, country=record.conference_country())
......@@ -495,7 +420,7 @@ class CheckAndFix(object):
"""
# protection
if not record.is_thesis():
if not isinstance(record, RecordThesis):
return
# CPPM: fix the name of Aix-Marseille university
......@@ -600,57 +525,33 @@ class CheckAndFix(object):
@raise CheckException:
"""
# the location of the OAI information depends on the store
# CDS: (248, a) or INSPIREHEP: (909CO, o)
if "0248" in record:
field, subfield = "0248", "a"
elif "909CO" in record:
field, subfield = "909CO", "o"
else:
# check that the OAI is defined
value = record.oai()
if not value:
raise CheckException(MSG_NO_OAI)
myid = record.id()
# Clean OAI information.
# in some case OAI is a list,e.g when two records were entered
# for the same entry but one deleted.
# Select the OAI corresponding to the current ID.
if isinstance(record[field], list):
val = ''
for di in record[field]:
if di[subfield].endswith(myid):
val = di
break
if val:
record[field] = val
else:
raise CheckException(MSG_NO_OAI)
# check that the OAI is well formed
m = REG_OAI.match(record[field][subfield])
if not m:
match = REG_OAI.match(value)
if not match:
raise CheckException(MSG_WELL_FORMED_OAI)
# The id in the OAI field might be different from the record id.
# In INVENIO there is a mechanism to redirect to the correct one
#
# The fix depend on the content of the database
if m.group(2) != myid:
if match.group(2) != myid:
db = current.globalenv['db']
# The record OAI is already used in the database. Do nothing
oai_url = OAI_URL % (m.group(1), m.group(2))
oai_url = OAI_URL % (match.group(1), match.group(2))
if get_id(db.publications, origin=oai_url):
return
# The OAI based on the record id is already used in the database.
# Modify the record OAI
oai_url = OAI_URL % (m.group(1), myid)
oai_url = OAI_URL % (match.group(1), myid)
if get_id(db.publications, origin=oai_url):
record[field][subfield] = OAI_INVENIO % (m.group(1), myid)
record[field][subfield] = OAI_INVENIO % (match.group(1), myid)
def paper_reference(self, record):
"""Check that editor, page, volume and paper year are defined
......
......@@ -16,7 +16,6 @@ class ExceptionUTF8(Exception):
# Exceptions of the invenio_tools package, all deriving from ExceptionUTF8.
# NOTE(review): presumably raised when accessing the invenio store -- confirm.
class CdsException(ExceptionUTF8): pass
# Raised by the CheckAndFix methods when a record does not pass a check.
class CheckException(ExceptionUTF8): pass
# NOTE(review): presumably related to the Institute class -- confirm.
class InstituteException(ExceptionUTF8): pass
# Raised by IterRecord when the XML string is not well formed.
class Marc12Exception(ExceptionUTF8): pass
# NOTE(review): presumably raised by the Record class -- confirm.
class RecordException(ExceptionUTF8): pass
# NOTE(review): presumably raised when the XML cannot be handled -- confirm.
class XmlException(ExceptionUTF8): pass
# -*- coding: utf-8 -*-
""" invenio_tools.iterrecord
"""
import re
from exception import Marc12Exception
from record import Record
from xml.dom.minidom import parseString
MSG_WELL_FORMED_XML = "Reject XML is not well formed"
REG_INT = re.compile("^\d+$")
class IterRecord(object):
"""Iterator to decode the XML string and to iterate on C{Record}.
The XML string is encoded with the MARC format
U{MARC<http://www.loc.gov/marc>}.
The XML string has the following structure::
<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
<record>
<controlfield tag="001">1540265</controlfield>
<controlfield tag="005">20130410235250.0</controlfield>
<datafield tag="024" ind1="8" ind2=" ">
<subfield code="a">oai:cds.cern.ch:1540265</subfield>
<subfield code="p">cerncds:FULLTEXT</subfield>
<subfield code="p">cerncds:CERN:FULLTEXT</subfield>
<subfield code="p">cerncds:CERN</subfield>
</datafield>
...
</record>
<record>
...
</record>
</collection>
The iterator finds each record block and decode it.
"""
def __init__(self, xml):
"""
@type xml: str
@param xml:
@raise Marc12Exception: not well formed XML.
"""
if self._is_not_xml(xml):
raise Marc12Exception(MSG_WELL_FORMED_XML)
dom = parseString(xml)
root = dom.documentElement
nodes = root.getElementsByTagName('record')
self.i = 0
self.length = len(nodes)
self.nodes = nodes
def _clean_record(self, record):
"""Internal tool to clean the record.
concatenate the following dictionary::
record[field] = [dict(subfield1=val1), dict(subfield2=val2), dict(subfield3=val3),...]
record[field] = [dict(subfield1=val1), dict(subfield2=val2, subfield3=val3),...]
into a single one::
record[field] = dict1(subfield1=val1, subfield2=val2, subfield3=val3)
@type record: Record
@param record:
"""
for field in record:
if not isinstance(record[field], list):
continue
nkeys = [len(di) for di in record[field]]
# several dictionary with more than one nkeys
# don't know how to treat that case
if max(nkeys) > 1 and nkeys.count(max(nkeys)) > 1:
continue
# merge single entity dict in one big dict
# works when all the nkeys are different
# otherwise don't know what to do
if max(nkeys) == 1:
keys = []
for di in record[field]:
keys.extend(di.iterkeys())
# in a set duplicate entries are removed
# the next statement is true when all keys are different
if len(keys) == len(set(keys)):
di = record[field][0]
for i in range(1, len(record[field])):
for (k, v) in record[field][i].iteritems():
di[k] = v
record[field] = di
# merge a single entity one dict into an existing big one
# works when key don't exist in the big one
# otherwise don't known what to do
#
# Example 1: the following list is kept unchanged
# [{'a': u'LHCB-PAPER-2014-047'},
# {'a': u'CERN-PH-EP-2014-221'},
# {'9': u'arXiv', 'a': u'arXiv:1410.0149', 'c': u'hep-ex'}]
#
else:
index = nkeys.index(max(nkeys))
di, ko = record[field][index], False
# check that key do not exist in the big one
keys = di.keys()
for i in range(len(record[field])):
if i == index:
continue
for k in record[field][i].iterkeys():
if k in di:
ko = True
break
else:
keys.append(k)
if ko:
continue
# copy keys
for i in range(len(record[field])):
if i == index:
continue
for (k, v) in record[field][i].iteritems():
di[k] = v
record[field] = di
def _decode_record(self, node):
    """Transform the XML node I{<record>} into a L{Record}.

    @type node: xml.dom.minidom.Element
    @param node: the I{<record>} node has the following structure::

        <record>
            <controlfield tag="001">1540265</controlfield>
            <controlfield tag="005">20130410235250.0</controlfield>
            <datafield tag="024" ind1="8" ind2=" ">
                <subfield code="a">oai:cds.cern.ch:1540265</subfield>
                <subfield code="p">cerncds:FULLTEXT</subfield>
                <subfield code="p">cerncds:CERN:FULLTEXT</subfield>
                <subfield code="p">cerncds:CERN</subfield>
            </datafield>
            ...
        </record>

    @rtype: Record
    @return: the keys of the record are the I{controlfield tag} and,
    for a datafield, the concatenation of the I{tag}, I{ind1} and
    I{ind2} attributes without space, e.g. C{0248}. The value of a
    datafield is a dictionary or a list of dictionaries when the key
    appears several times.

    """
    record = Record()

    # controlfield
    for controlfield in node.getElementsByTagName('controlfield'):
        children = controlfield.childNodes
        # an empty element has no child node: store an empty string,
        # as it is done for an empty subfield, instead of crashing
        value = children[0].nodeValue if children else ''
        record[controlfield.getAttribute('tag')] = value

    # datafield
    for datafield in node.getElementsByTagName('datafield'):
        tag = datafield.getAttribute('tag')

        # In almost all cases the tag is an integer
        # but from time to time it is equal to "FFT" (inspirehep) !!
        # Skip the latter before decoding the subfields.
        if not REG_INT.match(tag):
            continue

        # build the key by concatenating all the attributes
        ind1 = datafield.getAttribute('ind1').replace(' ', '')
        ind2 = datafield.getAttribute('ind2').replace(' ', '')
        key = "%s%s%s" % (tag, ind1, ind2)

        di = self._decode_datafield(datafield)

        # one occurrence of the key
        if key not in record:
            record[key] = di

        # several occurrences of the key - build a list of dictionaries
        elif isinstance(record[key], list):
            record[key].append(di)

        else:
            record[key] = [record[key], di]

    return record
def _decode_datafield(self, node):
"""Transform the XML node I{<datafiled>} into a dictionary.
@type node: unicode
@param node: the I{<datafiled>} node has the following structure::
<datafield tag="024" ind1="8" ind2=" ">
<subfield code="a">oai:cds.cern.ch:1540265</subfield>
<subfield code="p">cerncds:FULLTEXT</subfield>
<subfield code="p">cerncds:CERN:FULLTEXT</subfield>
<subfield code="p">cerncds:CERN</subfield>
</datafield>
@rtype: dict
@return: the keys correspond to the I{subfield code} while the values
are a string of a list of strings.
"""
di = {}
for subfield in node.getElementsByTagName('subfield'):
code = str(subfield.getAttribute('code'))
value = ''
if subfield.childNodes:
value = subfield.childNodes[0].nodeValue
if code not in di:
di[code] = value
elif isinstance(di[code], list):
di[code].append(value)
else:
di[code] = [di[code], value]
return di