Commit 6ae3ac92 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Clean module invenio_tools.

parent 656ecd86
......@@ -13,7 +13,6 @@ from .checkandfix import CheckAndFix
from gluon.storage import Storage
from invenio_tools import (CdsException,
InvenioStore,
Marc12,
OAI_URL)
from invenio_tools.factory import build_record
from .msg import Msg
......
......@@ -33,7 +33,7 @@ from .marc12 import Marc12
from .record import Record
from .recordconf import RecordConf
from .recordinst import RecordInst
from .recordpubli import DECODE_REF, RecordPubli
from .recordpubli import RecordPubli
from .recordthesis import RecordThesis
......
""" invenio_tools.iterrecord
"""
import re
from .exception import Marc12Exception
from .record import Record
from xml.dom.minidom import parseString
# Message of the Marc12Exception raised when the input is rejected as not XML.
MSG_WELL_FORMED_XML = "Reject XML is not well formed"

# Matches a datafield tag made only of digits (e.g. "024").
# Raw string: "\d" is an invalid escape sequence in a plain string literal.
REG_INT = re.compile(r"^\d+$")
class IterRecord(object):
    """Iterator decoding an XML string and iterating on its Record.

    The XML string is encoded using the
    `MARC <http://www.loc.gov/marc>`_ format.
    The XML string has the following structure::

        <?xml version="1.0" encoding="UTF-8"?>
        <collection xmlns="http://www.loc.gov/MARC21/slim">
            <record>
                <controlfield tag="001">1540265</controlfield>
                <controlfield tag="005">20130410235250.0</controlfield>
                <datafield tag="024" ind1="8" ind2=" ">
                    <subfield code="a">oai:cds.cern.ch:1540265</subfield>
                    <subfield code="p">cerncds:FULLTEXT</subfield>
                    <subfield code="p">cerncds:CERN:FULLTEXT</subfield>
                    <subfield code="p">cerncds:CERN</subfield>
                </datafield>
                ...
            </record>
            <record>
                ...
            </record>
        </collection>

    The iterator finds each *<record>* block and decodes it.

    """

    def __init__(self, xml):
        """
        Args:
            xml (str): the XML string, starting with the XML declaration.

        Raises:
            Marc12Exception: the string does not start with ``<?xml``.

        Note:
            Errors raised by ``parseString`` for a malformed document
            are propagated unchanged.

        """
        if self._is_not_xml(xml):
            raise Marc12Exception(MSG_WELL_FORMED_XML)

        dom = parseString(xml)
        nodes = dom.documentElement.getElementsByTagName("record")

        # iteration state: index of the next node to decode
        self.i = 0
        self.length = len(nodes)
        self.nodes = nodes

    def _clean_record(self, record):
        """Internal tool to clean the record, in place.

        Concatenate the following dictionary::

            record[field] = [
                dict(subfield1=val1),
                dict(subfield2=val2),
                dict(subfield3=val3),...
            ]

            record[field] = [
                dict(subfield1=val1),
                dict(subfield2=val2,
                     subfield3=val3),...
            ]

        into a single one::

            record[field] = dict1(subfield1=val1,
                                  subfield2=val2,
                                  subfield3=val3)

        The list is kept unchanged whenever the merge is ambiguous.

        Args:
            record (Record): dictionary-like object, modified in place.

        """
        for field in record:
            entries = record[field]

            # only lists of dictionaries are candidates; nothing to merge
            # in an empty list
            if not isinstance(entries, list) or not entries:
                continue

            sizes = [len(entry) for entry in entries]
            largest = max(sizes)

            # several dictionaries with more than one key:
            # don't know how to treat that case
            if largest > 1 and sizes.count(largest) > 1:
                continue

            if largest == 1:
                # merge single entity dicts in one big dict
                # works only when all the keys are different
                # (in a set duplicate entries are removed)
                keys = [key for entry in entries for key in entry]
                if len(keys) != len(set(keys)):
                    continue

                merged, others = entries[0], entries[1:]

            else:
                # merge the single entity dicts into the existing big one
                # works when their keys don't exist in the big one,
                # otherwise don't know what to do
                #
                # Example: the following list is kept unchanged
                #    [{"a": u"LHCB-PAPER-2014-047"},
                #     {"a": u"CERN-PH-EP-2014-221"},
                #     {"9": u"arXiv", "a": u"arXiv:1410.0149", "c": u"hep-ex"}]
                #
                # NOTE(review): a key shared by two small dictionaries,
                # but absent from the big one, is silently overwritten by
                # the last occurrence -- confirm that this is intended.
                index = sizes.index(largest)
                merged = entries[index]
                others = entries[:index] + entries[index + 1:]

                if any(key in merged for other in others for key in other):
                    continue

            for other in others:
                merged.update(other)

            record[field] = merged

    def _decode_record(self, node):
        """Transform the XML node *<record>* into a Record.

        Args:
            node (xml.dom.minidom.Element): the *<record>* node has
                the following structure::

                    <record>
                        <controlfield tag="001">1540265</controlfield>
                        <controlfield tag="005">20130410235250.0</controlfield>
                        <datafield tag="024" ind1="8" ind2=" ">
                            <subfield code="a">oai:cds.cern.ch:1540265</subfield>
                            <subfield code="p">cerncds:FULLTEXT</subfield>
                            <subfield code="p">cerncds:CERN:FULLTEXT</subfield>
                            <subfield code="p">cerncds:CERN</subfield>
                        </datafield>
                        ...
                    </record>

        Returns:
            Record: the keys of the record are the *controlfield tag* and
            the *datafield tag* concatenated with its indicators
            *ind1* and *ind2* (spaces removed).

        """
        record = Record()

        # controlfield
        for controlfield in node.getElementsByTagName("controlfield"):
            tag = controlfield.getAttribute("tag")
            record[tag] = self._node_text(controlfield)

        # datafield
        for datafield in node.getElementsByTagName("datafield"):
            tag = datafield.getAttribute("tag")

            # In almost all case the tag is an integer
            # but from time to time it is equal to "FFT" (inspirehep) !!
            if not REG_INT.match(tag):
                continue

            # build the key by concatenating all attributes
            ind1 = datafield.getAttribute("ind1").replace(" ", "")
            ind2 = datafield.getAttribute("ind2").replace(" ", "")
            key = "%s%s%s" % (tag, ind1, ind2)

            di = self._decode_datafield(datafield)

            # one occurrence of the key
            if key not in record:
                record[key] = di

            # several occurrences of the key - build a list of dictionary
            elif isinstance(record[key], list):
                record[key].append(di)

            else:
                record[key] = [record[key], di]

        return record

    def _decode_datafield(self, node):
        """Transform the XML node *<datafield>* into a dictionary.

        Args:
            node (xml.dom.minidom.Element): the *<datafield>* node has
                the following structure::

                    <datafield tag="024" ind1="8" ind2=" ">
                        <subfield code="a">oai:cds.cern.ch:1540265</subfield>
                        <subfield code="p">cerncds:FULLTEXT</subfield>
                        <subfield code="p">cerncds:CERN:FULLTEXT</subfield>
                        <subfield code="p">cerncds:CERN</subfield>
                    </datafield>

        Returns:
            dict: the keys correspond to the *subfield code* while the values
            are a string or a list of strings when the code is repeated.

        """
        di = {}

        for subfield in node.getElementsByTagName("subfield"):
            code = str(subfield.getAttribute("code"))
            value = self._node_text(subfield)

            # first occurrence of the code
            if code not in di:
                di[code] = value

            # repeated code - collect the values in a list
            elif isinstance(di[code], list):
                di[code].append(value)

            else:
                di[code] = [di[code], value]

        return di

    def _is_not_xml(self, xml):
        """``True`` when the ``xml`` string does not start with
        the XML declaration ``<?xml``.

        Args:
            xml (str):

        Returns:
            bool:

        """
        return not xml.startswith("<?xml")

    @staticmethod
    def _node_text(node):
        """Return the value of the first child of the node.

        Args:
            node (xml.dom.minidom.Element):

        Returns:
            str: empty string when the node has no child,
            e.g. ``<subfield code="a"/>``.

        """
        children = node.childNodes
        return children[0].nodeValue if children else ""

    def __iter__(self):
        return self

    def __next__(self):
        """
        Returns:
            Record: the next decoded record.

        Raises:
            StopIteration: when there is no more record.

        """
        if self.i >= self.length:
            raise StopIteration()

        record = self._decode_record(self.nodes.item(self.i))
        self._clean_record(record)

        # move forward only once the record is successfully decoded
        self.i += 1
        return record
""" invenio_tools.marc12
"""
import re
from .base import (is_conference,
is_institute,
is_thesis,
MSG_NO_CONF,
MSG_NO_HOST,
REG_OAI)
from .exception import Marc12Exception
from .inveniostore import InvenioStore
from .iterrecord import IterRecord
from .recordconf import RecordConf
from .recordinst import RecordInst
from .recordpubli import RecordPubli
from .recordthesis import RecordThesis
# Message of the Marc12Exception raised when a deleted record cannot be
# replaced by a new one.
MSG_DECODING_FAILED = "Record decoding failed."

# Matches a conference key such as "C12-03-05" or "C12-03-05.1".
# Raw string: "\d" and "\." are invalid escapes in a plain string literal.
REG_CONF = re.compile(r"^C\d+-\d+-\d+(\.\d+)?$")
class Marc12(object):
    """Service to decode MARC12 records embedded in the XML string.

    The main methods are :meth:`.records` which returns a list
    of :class:`.Record` object and :meth:`.iterrecords`.
    Each record behaves like a dictionary::

        record[field][subfield] = value(s)

    where the ``field`` corresponds to the *datafield tag* and the
    ``subfield`` to the *subfield code*.

    Note:
        The record is upcasted to :class:`.RecordInst` when it describes
        an institute, :class:`.RecordThesis` for a thesis,
        :class:`.RecordConf` for a conference talk or proceeding
        and :class:`.RecordPubli` otherwise.

    Note:
        The conference information are added for a talk or a proceeding.

    """

    def _add_conference_data(self, record):
        """Add the conference data to the record.

        Args:
            record (Record): record describing a conference.

        Raises:
            Marc12Exception: when the record has no host or
                no conference key, or when the conference is not found.

        """
        # reference to host
        host = record.host()
        if host in ("", None):
            raise Marc12Exception(MSG_NO_HOST)

        # for talk or proceeding a key is always defined
        key = record.reference_conference_key()
        if not key:
            raise Marc12Exception(MSG_NO_CONF)

        # get conference information
        id_conf = record.reference_conference_id()
        conference = self._get_conference(host, id_conf, key)

        # protection id can be a reference to other object like book
        if "111" not in conference:
            return

        # copy conference information in the current record
        # the conference URL is in 8564u
        record["111"] = conference["111"]
        if "8564" in conference:
            record["8564"] = conference["8564"]

    def __call__(self, xml, **kwargs):
        """
        Note:
            * Allow the syntax ``Marc12()(xml)``.
            * Keep for backward compatibility.
            * Prefer the method :meth:`.records`.

        Args:
            xml (str): the XML string with the publication contents.

        Keyword Args:
            filter_func (reference): a function to eliminate records
                which don't satisfy functions criteria. The argument of the
                function is a Record while the return value is a boolean.

            func (reference): a function applied to each surviving record.
                The argument of the function is a Record.
                It can be used to polish the record content.

        Returns:
            list: list of :class:`.Record`.

        """
        return self.records(xml, **kwargs)

    def _get_conference(self, host, conf_id, key):
        """Get the conference data associated to the record.
        The conference is identified by its id or key.

        Args:
            host (str): possible values are ``cds.cern.ch`` or
                ``inspirehep.net``.
            conf_id (str): the conference identifier in the store.
            key (str): the conference key in the store.

        Returns:
            Record: The conference record

        Raises:
            Marc12Exception: when the conference is not found.

        """
        cds = InvenioStore(host)

        # search the conference by id, the preferred method
        if conf_id:
            xml = cds.get_record(conf_id)
            for conference in IterRecord(xml):
                if conference.id() == conf_id:
                    return conference

        # search the conference by key if the previous method failed.
        # the method depends on the store.
        if key:
            ids = []

            if cds._host.startswith("inspirehep"):
                # the key is searched with "-" instead of "/" and only
                # when it looks like a conference key
                key = key.replace("/", "-")
                if REG_CONF.match(key):
                    ids = cds.get_ids(cc="Conferences", p="111__g:%s" % key)

            else:
                ids = cds.get_ids(p=key)

            # keep the first candidate whose 111g field is the key
            for candidate_id in ids:
                xml = cds.get_record(candidate_id)
                for conference in IterRecord(xml):
                    if conference._get("111", "g") == key:
                        return conference

        raise Marc12Exception(MSG_NO_CONF)

    def _recover_deleted_record(self, record):
        """Recover a deleted record.

        From time to time a record is deleted and replaced by a new one.
        In that case the record looks like::

            {
                "0248_": {"a": "oai:cds.cern.ch:1366561"},
                "001": "1366561",
                "980": {"c": "DELETED"},
                "970": {"d": "1366710"}
            }

        The method replaces the old record by the new one, by using
        the host found in the field 0248_ and the identifier of
        the new record found in the field 970.

        Note:
            It might happen that a record is deleted and not replaced
            by a new one. In that case the Marc12Exception is raised.

        Args:
            record (Record): the record to be checked

        Returns:
            Record: the input record or the new one.

        Raises:
            Marc12Exception: when the record is deleted and
                not replaced by a new one.

        """
        is_deleted = (
            "980" in record
            and "c" in record["980"]
            and record["980"]["c"] == "DELETED"
        )

        if not is_deleted:
            return record

        is_replaced = (
            "970" in record
            and "d" in record["970"]
            and "0248_" in record
            and "a" in record["0248_"]
        )

        if not is_replaced:
            raise Marc12Exception(MSG_DECODING_FAILED)

        match = REG_OAI.match(record["0248_"]["a"])
        if not match:
            raise Marc12Exception(MSG_DECODING_FAILED)

        cds = InvenioStore(match.group(1))
        xml = cds.get_record(record["970"]["d"])

        # a reply without any record must not leak StopIteration
        # to the caller
        new_record = next(IterRecord(xml), None)
        if new_record is None:
            raise Marc12Exception(MSG_DECODING_FAILED)

        return new_record

    def iterrecords(self, xml):
        """Return an iterator on the embedded records.

        Args:
            xml (str): the XML string with the publication contents.

        Returns:
            IterRecord:

        """
        return IterRecord(xml)

    def records(self, xml, filter_func=None, func=None):
        """Transform the XML string into a list of Record.

        Args:
            xml (str): the XML string with the publication contents.
                It has the following structure:

                .. code-block:: xml

                    <?xml version="1.0" encoding="UTF-8"?>
                    <collection xmlns="http://www.loc.gov/MARC21/slim">
                        <record>
                            <controlfield tag="001">1540265</controlfield>
                            <controlfield tag="005">20130410235250.0</controlfield>
                            <datafield tag="024" ind1="8" ind2=" ">
                                <subfield code="a">oai:cds.cern.ch:1540265</subfield>
                                <subfield code="p">cerncds:FULLTEXT</subfield>
                                <subfield code="p">cerncds:CERN:FULLTEXT</subfield>
                                <subfield code="p">cerncds:CERN</subfield>
                            </datafield>
                            ...
                        </record>
                        <record>
                            ...
                        </record>
                    </collection>

            filter_func (reference): a function to eliminate records
                which don't satisfy functions criteria. The argument of the
                function is a Record while the return value is a boolean.

            func (reference): a function applied to each surviving record.
                The argument of the function is a Record.
                It can be used to polish the record content.

        Returns:
            list: list of :class:`.Record`.

        Raises:
            Marc12Exception: not well formed XML.

        """
        li = []

        for record in IterRecord(xml):
            record = self._recover_deleted_record(record)

            # upcast the record according to what it describes
            if is_conference(record):
                upcast_record = RecordConf(record)
                self._add_conference_data(upcast_record)

            elif is_institute(record):
                upcast_record = RecordInst(record)

            elif is_thesis(record):
                upcast_record = RecordThesis(record)

            else:
                upcast_record = RecordPubli(record)

            # drop the record rejected by the filter
            if filter_func and not filter_func(upcast_record):
                continue

            if func:
                func(upcast_record)

            li.append(upcast_record)

        return li
......@@ -12,7 +12,6 @@ from record import Record
>>>>>>> Migrate RecordInst.
# NOTE(review): the preceding line (">>>>>>> Migrate RecordInst.") looks
# like a leftover merge-conflict marker -- confirm and remove it.
#
# Error messages; their call sites are not visible in this excerpt,
# presumably used when validating an institute record -- TODO confirm.
MSG_INVALID_ARG = "Invalid argument record"
MSG_INVALID_HOST = "Invalid record host"
MSG_INVALID_RECORD = "Invalid record, it is not describing an institute"
......
......@@ -25,13 +25,6 @@ AUTHOR_FORMATS = [
"Last, First",
"Last F."]
# Patterns decoding a publication reference. Two layouts are handled:
#     Phys. Rev. Lett. 113, 032001 (2014)    -> _ref1
#     Eur. Phys. J. C (2014) 74:2883         -> _ref2
#
# Both patterns define the same named groups:
#     p: letters, dots and spaces before the numbers
#        (the journal name in the examples above)
#     v: a number -- 113 and 74 above, presumably the volume
#     c: digits and dashes -- 032001 and 2883 above, presumably the pages
#     y: the number between parentheses -- 2014 above, presumably the year
_ref1 = r"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
_ref2 = r"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
# compiled patterns, listed in the same order as the layouts above
DECODE_REF = [re.compile(_ref1), re.compile(_ref2)]
# Error message; its call site is not visible in this excerpt.
MSG_INVALID_FMT = "Invalid format for author"
# the keys containing paper reference
......