Commit e92f798b authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Conference information is added by the Marc12 service.

parent 420d003a
......@@ -10,8 +10,22 @@ REG_OAI = re.compile('oai:([a-z\.]+):([\d]+)')
REG_YEAR = re.compile("(\d{4})")
def is_conference(record):
    """True when the record describes a publication related to a conference.

    A record qualifies when it contains the field ``111`` or when
    ``record.reference_conference_key()`` returns a truthy value.

    Args:
        record (Record): the record to test.

    Returns:
        bool: true when the MARC record describes a publication related
            to a conference.

    """
    # bool() enforces the documented return type: the bare ``or`` would
    # otherwise hand back the conference key itself (or None).
    return bool(u"111" in record or record.reference_conference_key())
def is_institute(record):
""" True when the record describe an institute.
""" True when the record describes an institute.
Args:
record (Record):
......
......@@ -5,7 +5,7 @@
import re
import regex
from base import OAI_URL, REG_OAI, REG_YEAR
from base import is_conference, OAI_URL, REG_OAI, REG_YEAR
from exception import CheckException
from filters import CLEAN_REVIEW
from gluon import current
......@@ -44,7 +44,6 @@ MONTHS = {u'Jan':'01',
u'Dec':'12'}
MSG_NO_AUTHOR = "Reject no author(s)"
MSG_NO_CONF = "Reject no conference information"
MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_DATE = "Reject no submission date"
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
......@@ -68,7 +67,6 @@ MSG_WELL_FORMED_OAI = "Reject OAI is not well formed"
OAI_INVENIO = "oai:%s:%s"
REG_COLLABORATION = re.compile(regex.REG_COLLABORATION)
REG_CONF = re.compile("^C\d+-\d+-\d+(\.\d+)?$")
REG_CONF_DATES_1 = re.compile("(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES_2 = re.compile("(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES = re.compile(regex.REG_CONF_DATES)
......@@ -106,56 +104,6 @@ class CheckAndFix(object):
Most of the methods raise a CheckException when something goes wrong.
"""
def _get_conference(self, host, id, key):
"""Get the conference data associated to the record.
The conference is looked up by its id first and, when that does not
return a matching record, by its key.
@type host: unicode
@param host: passed to L{InvenioStore} to select the store to query.
@type id: unicode
@param id: identifier of the conference record; the lookup by id is
skipped when it is empty.
@type key: unicode
@param key: conference key; the lookup by key is skipped when it is empty.
@rtype: L{Record}
@return: The conference record
@raise CheckException: with C{MSG_NO_CONF} when neither lookup returns a
matching record.
"""
cds = InvenioStore(host)
marc12 = Marc12()
# preferred method: fetch the record by id and keep the decoded record
# whose id matches
if id:
xml = cds.get_record(id)
for conference in marc12(xml):
if conference.id() == id:
return conference
# fall back to a search by key when the lookup by id returned nothing.
# the query depends on the store.
# NOTE(review): indentation is lost in this view, so the nesting of the
# if / else below cannot be confirmed from here.
if key:
ids = []
if cds._host.startswith('inspirehep'):
# for hosts starting with 'inspirehep' the '/' in the key is replaced by '-'
key = key.replace('/', '-')
if REG_CONF.match(key):
ids = cds.get_ids(cc='Conferences', p='111__g:%s' % key)
else:
ids = cds.get_ids(p=key)
# NOTE: the loop variable rebinds the parameter id (and shadows the builtin)
for id in ids:
xml = cds.get_record(id)
for conference in marc12(xml):
if conference.conference_key() == key:
return conference
# nothing matched, neither by id nor by key
raise CheckException(MSG_NO_CONF)
def _recover_submitted(self, record):
"""Recover submitted date using conference, preprint or thesis
information.
......@@ -350,31 +298,9 @@ class CheckAndFix(object):
"""
# conference information are available, i.e proceeding
if "111" in record:
return
# alias
host = record.host()
key = record.reference_conference_key()
# for talk or proceeding a key is always defined
if not key:
raise CheckException(MSG_NO_CONF)
# get conference information
id = record.reference_conference_id()
conference = self._get_conference(host, id, key)
# protection id can be a reference to other object like book
if "111" not in conference:
if not is_conference(record):
return
# copy conference information in the current record
# the conference URL is in 8564u
record[u"111"] = conference["111"]
if "8564" in conference:
record[u"8564"] = conference["8564"]
# check country information (all valid countries have been entered once)
db = current.globalenv['db']
id = get_id(db.countries, country=record.conference_country())
......
......@@ -5,243 +5,108 @@
import re
from base import is_institute
from base import is_conference, is_institute
from exception import Marc12Exception
from institute import Institute
from record import Record
from xml.dom.minidom import parseString
from inveniostore import InvenioStore
from iterrecord import IterRecord
MSG_WELL_FORMED_XML = "Reject XML is not well formed"
REG_INT = re.compile("^\d+$")
MSG_NO_CONF = "Reject no conference information"
REG_CONF = re.compile("^C\d+-\d+-\d+(\.\d+)?$")
class Marc12(object):
"""Decode the XML string encoded with the
U{MARC<http://www.loc.gov/marc>} format.
The main method L{__call__} analyses the XML string
which has the follwing structure::
<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
<record>
<controlfield tag="001">1540265</controlfield>
<controlfield tag="005">20130410235250.0</controlfield>
<datafield tag="024" ind1="8" ind2=" ">
<subfield code="a">oai:cds.cern.ch:1540265</subfield>
<subfield code="p">cerncds:FULLTEXT</subfield>
<subfield code="p">cerncds:CERN:FULLTEXT</subfield>
<subfield code="p">cerncds:CERN</subfield>
</datafield>
...
</record>
<record>
...
</record>
</collection>
It returns a L{Record} object which behave like a dictionary::
"""Decode the MARC12 records embedded in the XML string.
The main method L{__call__} returns a list of L{Record} objects,
each of which behaves like a dictionary::
record[field][subfield] = value(s)
where the C{field} corresponds to the I{datafield tag} and the
C{subfield} to the I{subfield code}.
"""
def _clean_record(self, record):
"""Internal tool to clean the record.
concatenate the following dictionary::
When the record describes an institute, it is upcasted to C{Institute}
record[field] = [dict(subfield1=val1), dict(subfield2=val2), dict(subfield3=val3),...]
record[field] = [dict(subfield1=val1), dict(subfield2=val2, subfield3=val3),...]
Conference information is added for a talk or a proceeding.
into a single one::
record[field] = dict1(subfield1=val1, subfield2=val2, subfield3=val3)
"""
def _add_conference_data(self, record):
"""Add the conference data to the record.
@type record: Record
@param record:
"""
for field in record:
if not isinstance(record[field], list):
continue
nkeys = [len(di) for di in record[field]]
# several dictionary with more than one nkeys
# don't know how to treat that case
if max(nkeys) > 1 and nkeys.count(max(nkeys)) > 1:
continue
# merge single entity dict in one big dict
# works when all the nkeys are different
# otherwise don't know what to do
if max(nkeys) == 1:
keys = []
for di in record[field]:
keys.extend(di.iterkeys())
# in a set duplicate entries are removed
# the next statement is true when all keys are different
if len(keys) == len(set(keys)):
di = record[field][0]
for i in range(1, len(record[field])):
for (k, v) in record[field][i].iteritems():
di[k] = v
record[field] = di
# merge a single entity one dict into an existing big one
# works when key don't exist in the big one
# otherwise don't known what to do
#
# Example 1: the following list is kept unchanged
# [{'a': u'LHCB-PAPER-2014-047'},
# {'a': u'CERN-PH-EP-2014-221'},
# {'9': u'arXiv', 'a': u'arXiv:1410.0149', 'c': u'hep-ex'}]
#
else:
index = nkeys.index(max(nkeys))
di, ko = record[field][index], False
# check that key do not exist in the big one
keys = di.keys()
for i in range(len(record[field])):
if i == index:
continue
for k in record[field][i].iterkeys():
if k in di:
ko = True
break
else:
keys.append(k)
if ko:
continue
# copy keys
for i in range(len(record[field])):
if i == index:
continue
for (k, v) in record[field][i].iteritems():
di[k] = v
record[field] = di
def _decode_record(self, node):
"""Transform the XML node I{<record>} into a L{Record}.
@type node: unicode
@param node: the I{<record>} node has the following structure::
<record>
<controlfield tag="001">1540265</controlfield>
<controlfield tag="005">20130410235250.0</controlfield>
<datafield tag="024" ind1="8" ind2=" ">
<subfield key="a">oai:cds.cern.ch:1540265</subfield>
<subfield key="p">cerncds:FULLTEXT</subfield>
<subfield key="p">cerncds:CERN:FULLTEXT</subfield>
<subfield key="p">cerncds:CERN</subfield>
</datafield>
...
</record>
@rtype: Record
@return: the keys of the record correspond to the I{datafield tag}.
# alias
host = record.host()
key = record.reference_conference_key()
"""
record = Record()
# controlfield
for controlfield in node.getElementsByTagName('controlfield'):
key = controlfield.getAttribute('tag')
value = controlfield.childNodes[0].nodeValue
record[key] = value
# datafield
for datafield in node.getElementsByTagName('datafield'):
di = self._decode_datafield(datafield)
key = datafield.getAttribute('tag')
ind1 = datafield.getAttribute('ind1').replace(' ', '')
ind2 = datafield.getAttribute('ind2').replace(' ', '')
# In almost all case the tag is an integer
# but from time to time it is equal to "FFT" (inspirehep) !!
if not REG_INT.match(key):
continue
# for talk or proceeding a key is always defined
if not key:
raise Marc12Exception(MSG_NO_CONF)
# build the key by concataining all attributes
key = "%s%s%s" % (key, ind1, ind2)
# get conference information
id = record.reference_conference_id()
conference = self._get_conference(host, id, key)
# one occurrence of the key
if key not in record:
record[key] = di
# protection id can be a reference to other object like book
if u"111" not in conference:
return
# several occurrence of the key - transform a list of dictionary
elif isinstance(record[key], list):
record[key].append(di)
# copy conference information in the current record
# the conference URL is in 8564u
record[u"111"] = conference[u"111"]
if "8564" in conference:
record[u"8564"] = conference[u"8564"]
else:
record[key] = [record[key], di]
def _get_conference(self, host, conf_id, key):
"""Get the conference data associated to the record.
The conference is identified by its id or key.
return record
@type host: unicode
@param host:
def _decode_datafield(self, node):
"""Transform the XML node I{<datafiled>} into a dictionary.
@type id: unicode
@param id:
@type node: unicode
@param node: the I{<datafiled>} node has the following structure::
@type key: unicode
@param key:
<datafield tag="024" ind1="8" ind2=" ">
<subfield code="a">oai:cds.cern.ch:1540265</subfield>
<subfield code="p">cerncds:FULLTEXT</subfield>
<subfield code="p">cerncds:CERN:FULLTEXT</subfield>
<subfield code="p">cerncds:CERN</subfield>
</datafield>
@rtype: L{Record}
@return: The conference record
@rtype: dict
@return: the keys correspond to the I{subfield code} while the values
are a string of a list of strings.
@raise CheckException:
"""
di = {}
for subfield in node.getElementsByTagName('subfield'):
code = str(subfield.getAttribute('code'))
value = ''
if subfield.childNodes:
value = subfield.childNodes[0].nodeValue
if code not in di:
di[code] = value
elif isinstance(di[code], list):
di[code].append(value)
cds = InvenioStore(host)
# search the conference by id the preferred method
if conf_id:
xml = cds.get_record(conf_id)
for conference in IterRecord(xml):
if conference.id() == conf_id:
return conference
# search the conference by key if the previous method failed.
# the method depends on the store.
if key:
ids = []
if cds._host.startswith('inspirehep'):
key = key.replace('/', '-')
if REG_CONF.match(key):
ids = cds.get_ids(cc='Conferences', p='111__g:%s' % key)
else:
di[code] = [di[code], value]
return di
ids = cds.get_ids(p=key)
def _is_not_xml(self, xml):
"""C{True} when the C{xml} sting is well formed.
for conf_id in ids:
xml = cds.get_record(conf_id)
for conference in IterRecord(xml):
if conference.conference_key() == key:
return conference
@type xml: unicode
@param xml:
@rtype: bool
"""
if xml.startswith("<?xml"):
return False
return True
raise Marc12Exception(MSG_NO_CONF)
def __call__(self, xml, filter=None, func=None):
"""Transform the the XML string into a list of L{Record}
......@@ -284,20 +149,15 @@ class Marc12(object):
"""
li = []
if self._is_not_xml(xml):
raise Marc12Exception(MSG_WELL_FORMED_XML)
dom = parseString(xml)
root = dom.documentElement
for node in root.getElementsByTagName('record'):
record = self._decode_record(node)
self._clean_record(record)
for record in IterRecord(xml):
if is_institute(record):
record = Institute(record)
elif is_conference(record):
self._add_conference_data(record)
if filter and not filter(record):
continue
......
# *-* coding: utf-8 *-*
""" A collections of regular expression defining rules to be applied
on field contents.
""" A collection of regular expressions defining rules that validate
field contents.
@author: R. Le Gac
"""
# Collaboration field:
# Valid Collaboration field:
# - CMS Collaboration
# - CMS and LHCb Collaborations
# - ATLAS Collaboration, CMS Collaboration
# - Heavy Flavour Averaging Group
# - Heavy Flavour Averaging Group
# - CTA Consortium
# - any mixture of the above separated by a comma
#
REG_COLLABORATION = r'^[A-Za-z0-9\-/, ]+([Cc]ollaboration|[Cc]onsortium|[Gg]roup)[s]?$'
# Conference dates
# Valid Conference dates
# - 3 Dec 2012
# - 10-14 Dec 2012
# - 28 Jun - 4 Jul 2012
......@@ -25,13 +25,13 @@ reg2 = r'\d{1,2}-\d{1,2} [A-Z][a-z]{2} \d{4}'
reg3 = r'\d{1,2} [A-Z][a-z]{2} - \d{1,2} [A-Z][a-z]{2} \d{4}'
REG_CONF_DATES = r'%s|%s|%s' % (reg1, reg2, reg3)
# Defence date
# Valid Defence date
# - 30 Dec 2012
#
REG_DEFENSE = r'\d{2} [A-Z][a-z]{2} \d{4}'
# Submitted field
# Valid Submitted field
# - 2012-12
# - 2012-12-31
#
REG_SUBMITTED = r'\d{4}-\d{2}(-\d{2})?'
\ No newline at end of file
REG_SUBMITTED = r'\d{4}-\d{2}(-\d{2})?'
......@@ -14,6 +14,7 @@ Allow to test the brute force decoding with its mistakes.
Note:
* Only the first author is defined
* The submitted date is 05 Jan 2012
* conference date 6 - 11 Dec 2010
"""
import copy
......@@ -26,20 +27,17 @@ from invenio_tools import CheckAndFix, load_record
@pytest.fixture(scope="module")
def record():
rec = load_record('cds.cern.ch', 1411352)
svc = CheckAndFix()
svc.conference(rec)
return rec
@pytest.fixture(scope="module")
def recordfix(record):
svc = CheckAndFix()
rec = copy.deepcopy(record)
svc = CheckAndFix()
svc.authors(rec)
svc.conference(rec)
svc.format_authors(rec, format_author_fr)
svc.format_editor(rec)
svc.my_authors(rec)
......@@ -54,40 +52,9 @@ def test_authors(record, recordfix):
assert recordfix.authors() == "O. Leroy"
def test_collaboration(record):
assert record.collaboration() == ""
def test_conference_country(record):
assert record.conference_country() == "Italy"
def test_conference_dates(record):
assert record.conference_dates() == "6-11 Dec 2010"
def test_conference_key(record):
assert record.conference_key() == "rome20101206"
def test_conference_location(record):
assert record.conference_location() == "Rome, Italy"
def test_conference_title(record):
assert record.conference_title() == "Symposium on Prospects in the Physics of Discrete Symmetries"
def test_conference_town(record):
assert record.conference_town() == "Rome"
def test_conference_url(record):
assert record.conference_url() == "http://www.roma1.infn.it/discrete10"
def test_conference_year(record):
assert record.conference_year() == "2010"
def test_conference_dates(record, recordfix):
assert record.conference_dates() == "6 - 11 Dec 2010"
assert recordfix.conference_dates() == "6-11 Dec 2010"
def test_first_author(record, recordfix):
......@@ -100,14 +67,6 @@ def test_first_institutes(record, recordfix):
assert recordfix.first_author_institutes() == "Marseille, CPPM"
def test_host(record):
assert record.host() == "cds.cern.ch"
def test_id(record):
assert record.id() == "1411352"
def test_institutes(record, recordfix):
assert record.institutes() == []
assert record.is_institute_defined() == False
......@@ -116,61 +75,6 @@ def test_institutes(record, recordfix):
assert recordfix.is_institute_defined() == True
def test_is_proceeding(record):
assert record.is_conference_data() == True
assert record.is_published() == True
assert record.is_thesis() == False
def test_oai(record):
assert record.oai() == "oai:cds.cern.ch:1411352"
assert record.oai_url() == "http://cds.cern.ch/record/1411352"