Commit 420d003a authored by LE GAC Renaud
Browse files

Add the IterRecord iterator to decode the XML string.

parent 9ffd73ff
......@@ -3,7 +3,8 @@
@note: details on the invenio API at U{http://invenio-software.org/}
"""
from base import (is_institute,
from base import (is_conference,
is_institute,
OAI_URL,
REG_OAI,
REG_YEAR)
......@@ -17,5 +18,6 @@ from exception import (CdsException,
from checkandfix import CheckAndFix, load_record
from institute import Institute
from inveniostore import InvenioStore
from iterrecord import IterRecord
from marc12 import Marc12
from record import Record
# -*- coding: utf-8 -*-
""" invenio_tools.iterrecord
"""
import re
from exception import Marc12Exception
from record import Record
from xml.dom.minidom import parseString
# Message carried by the Marc12Exception raised on a rejected XML string.
MSG_WELL_FORMED_XML = "Reject XML is not well formed"

# Matches a string made only of digits.
# Raw string: "\d" is not a valid escape sequence in a plain string literal.
REG_INT = re.compile(r"^\d+$")
class IterRecord(object):
    """Iterator to decode the XML string and to iterate on C{Record}.

    The XML string is encoded with the MARC format
    U{MARC<http://www.loc.gov/marc>}.

    The XML string has the following structure::

        <?xml version="1.0" encoding="UTF-8"?>
        <collection xmlns="http://www.loc.gov/MARC21/slim">
            <record>
                <controlfield tag="001">1540265</controlfield>
                <controlfield tag="005">20130410235250.0</controlfield>
                <datafield tag="024" ind1="8" ind2=" ">
                    <subfield code="a">oai:cds.cern.ch:1540265</subfield>
                    <subfield code="p">cerncds:FULLTEXT</subfield>
                    <subfield code="p">cerncds:CERN:FULLTEXT</subfield>
                    <subfield code="p">cerncds:CERN</subfield>
                </datafield>
                ...
            </record>
            <record>
                ...
            </record>
        </collection>

    The iterator finds each record block and decodes it.

    """

    def __init__(self, xml):
        """Parse the XML string and locate the I{<record>} nodes.

        @type xml: str
        @param xml: the XML string, starting with the XML declaration.

        @raise Marc12Exception: the string does not start with I{<?xml}.

        @note: the parser errors raised by C{parseString} on malformed
            content (C{xml.parsers.expat.ExpatError}) are not converted
            and propagate to the caller.

        """
        if self._is_not_xml(xml):
            raise Marc12Exception(MSG_WELL_FORMED_XML)

        dom = parseString(xml)
        root = dom.documentElement
        nodes = root.getElementsByTagName('record')

        # cursor on the next node to be decoded
        self.i = 0
        self.length = len(nodes)
        self.nodes = nodes

    def _clean_record(self, record):
        """Internal tool to clean the record, in place.

        A field found several times in the XML is stored as a list of
        dictionaries. When it is unambiguous, concatenate lists like::

            record[field] = [dict(subfield1=val1), dict(subfield2=val2), ...]
            record[field] = [dict(subfield1=val1), dict(subfield2=val2, subfield3=val3), ...]

        into a single dictionary::

            record[field] = dict(subfield1=val1, subfield2=val2, subfield3=val3)

        The list is left unchanged when:

            - several dictionaries share the largest size (greater than one);
            - a key appears in more than one dictionary, since merging
              would silently drop one of the values.

        Example, the following list is kept unchanged::

            [{'a': u'LHCB-PAPER-2014-047'},
             {'a': u'CERN-PH-EP-2014-221'},
             {'9': u'arXiv', 'a': u'arXiv:1410.0149', 'c': u'hep-ex'}]

        @type record: Record
        @param record: the record to be cleaned.

        """
        # only values of existing keys are replaced in the loop,
        # the set of keys is never modified while iterating
        for field in record:
            dicts = record[field]
            if not isinstance(dicts, list):
                continue

            nkeys = [len(di) for di in dicts]
            biggest = max(nkeys)

            # several dictionaries with more than one key
            # don't know how to treat that case
            if biggest > 1 and nkeys.count(biggest) > 1:
                continue

            # a key found in more than one dictionary can not be merged
            # without losing a value (in a set duplicate entries are removed)
            all_keys = [key for di in dicts for key in di]
            if len(all_keys) != len(set(all_keys)):
                continue

            # merge into the first dictionary when they all are single
            # entity, otherwise into the unique big one
            index = 0 if biggest == 1 else nkeys.index(biggest)
            target = dicts[index]

            for i, di in enumerate(dicts):
                if i != index:
                    target.update(di)

            record[field] = target

    def _decode_record(self, node):
        """Transform the XML node I{<record>} into a L{Record}.

        @type node: xml.dom.minidom.Element
        @param node: the I{<record>} node has the following structure::

            <record>
                <controlfield tag="001">1540265</controlfield>
                <controlfield tag="005">20130410235250.0</controlfield>
                <datafield tag="024" ind1="8" ind2=" ">
                    <subfield code="a">oai:cds.cern.ch:1540265</subfield>
                    <subfield code="p">cerncds:FULLTEXT</subfield>
                    <subfield code="p">cerncds:CERN:FULLTEXT</subfield>
                    <subfield code="p">cerncds:CERN</subfield>
                </datafield>
                ...
            </record>

        @rtype: Record
        @return: the keys of the record are the I{controlfield tag} and
            the I{datafield tag} concatenated with I{ind1} and I{ind2}.
            A datafield key found several times maps to a list of
            dictionaries.

        """
        record = Record()

        # controlfield
        for controlfield in node.getElementsByTagName('controlfield'):
            key = controlfield.getAttribute('tag')

            # an empty element has no child node
            value = ''
            if controlfield.childNodes:
                value = controlfield.childNodes[0].nodeValue

            record[key] = value

        # datafield
        for datafield in node.getElementsByTagName('datafield'):
            key = datafield.getAttribute('tag')

            # In almost all case the tag is an integer
            # but from time to time it is equal to "FFT" (inspirehep) !!
            if not REG_INT.match(key):
                continue

            di = self._decode_datafield(datafield)
            ind1 = datafield.getAttribute('ind1').replace(' ', '')
            ind2 = datafield.getAttribute('ind2').replace(' ', '')

            # build the key by concatenating all attributes
            key = "%s%s%s" % (key, ind1, ind2)

            # one occurrence of the key
            if key not in record:
                record[key] = di

            # several occurrences of the key - build a list of dictionaries
            elif isinstance(record[key], list):
                record[key].append(di)

            else:
                record[key] = [record[key], di]

        return record

    def _decode_datafield(self, node):
        """Transform the XML node I{<datafield>} into a dictionary.

        @type node: xml.dom.minidom.Element
        @param node: the I{<datafield>} node has the following structure::

            <datafield tag="024" ind1="8" ind2=" ">
                <subfield code="a">oai:cds.cern.ch:1540265</subfield>
                <subfield code="p">cerncds:FULLTEXT</subfield>
                <subfield code="p">cerncds:CERN:FULLTEXT</subfield>
                <subfield code="p">cerncds:CERN</subfield>
            </datafield>

        @rtype: dict
        @return: the keys correspond to the I{subfield code} while the
            values are a string, or a list of strings when the code is
            found several times. An empty subfield gives an empty string.

        """
        di = {}

        for subfield in node.getElementsByTagName('subfield'):
            code = str(subfield.getAttribute('code'))

            # an empty element has no child node
            value = ''
            if subfield.childNodes:
                value = subfield.childNodes[0].nodeValue

            if code not in di:
                di[code] = value

            elif isinstance(di[code], list):
                di[code].append(value)

            else:
                di[code] = [di[code], value]

        return di

    def _is_not_xml(self, xml):
        """C{True} when the C{xml} string does not start with the
        XML declaration I{<?xml}.

        Only the prefix is checked, the string is not parsed.

        @type xml: unicode
        @param xml: the string to be checked.

        @rtype: bool

        """
        return not xml.startswith("<?xml")

    def __iter__(self):
        """
        @rtype: IterRecord
        @return: the iterator itself.

        """
        return self

    def next(self):
        """Decode and clean the next I{<record>} node.

        @rtype: Record

        @raise StopIteration: all the nodes have been processed.

        """
        if self.i >= self.length:
            raise StopIteration()

        node = self.nodes.item(self.i)
        record = self._decode_record(node)
        self._clean_record(record)

        # move the cursor only when the record has been decoded
        self.i += 1
        return record

    # the iterator protocol uses the name __next__ on python 3
    __next__ = next
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment