Commit d2a8a4fb authored by LE GAC Renaud
Browse files

Refactor Record into the RecordConf, RecordInst, RecordPubli and RecordThesis classes.

parent 0ec8ce83
...@@ -3,21 +3,29 @@ ...@@ -3,21 +3,29 @@
@note: details on the invenio API at U{http://invenio-software.org/} @note: details on the invenio API at U{http://invenio-software.org/}
""" """
from base import (is_conference, from base import (ARXIV,
ARXIV_PDF,
is_conference,
is_institute, is_institute,
is_thesis,
OAI_URL, OAI_URL,
REG_ARXIV_NUMBER,
REG_OAI, REG_OAI,
REG_YEAR) REG_YEAR,
THESIS_DIR)
from exception import (CdsException, from exception import (CdsException,
CheckException, CheckException,
InstituteException,
Marc12Exception, Marc12Exception,
RecordException, RecordException,
XmlException) XmlException)
from checkandfix import CheckAndFix, load_record from checkandfix import CheckAndFix, load_record
from institute import Institute
from inveniostore import InvenioStore from inveniostore import InvenioStore
from iterrecord import IterRecord from iterrecord import IterRecord
from marc12 import Marc12 from marc12 import Marc12
from record import Record from record import Record
from recordconf import RecordConf
from recordinst import RecordInst
from recordpubli import RecordPubli
from recordthesis import RecordThesis
...@@ -4,11 +4,17 @@ ...@@ -4,11 +4,17 @@
""" """
import re import re
ARXIV = "arXiv"
ARXIV_PDF = "http://arxiv.org/pdf/"
OAI_URL = "http://%s/record/%s" OAI_URL = "http://%s/record/%s"
REG_ARXIV_NUMBER = re.compile("\d+\.\d+")
REG_OAI = re.compile('oai:([a-z\.]+):([\d]+)') REG_OAI = re.compile('oai:([a-z\.]+):([\d]+)')
REG_YEAR = re.compile("(\d{4})") REG_YEAR = re.compile("(\d{4})")
THESIS_DIR = u"dir."
def is_conference(record): def is_conference(record):
"""True when the record describes a publication related to a conference. """True when the record describes a publication related to a conference.
...@@ -21,11 +27,22 @@ def is_conference(record): ...@@ -21,11 +27,22 @@ def is_conference(record):
to a conference. to a conference.
""" """
return u"111" in record or record.reference_conference_key() if u"111" in record:
return True
# try with the conference key
# the location of this values depends on the store
# cds.cern.ch (962, n) and inspirehep.net (773,w).
if record.host().startswith("cds"):
field, subfield = u"962", "n"
else:
field, subfield = u"773", "w"
return len(record._get(field, subfield)) > 0
def is_institute(record): def is_institute(record):
""" True when the record describes an institute. """True when the record describes an institute.
Args: Args:
record (Record): record (Record):
...@@ -51,3 +68,18 @@ def is_institute(record): ...@@ -51,3 +68,18 @@ def is_institute(record):
return True return True
return False return False
def is_thesis(record):
    """True when the record describes a thesis.

    The decision is based on the values stored in the MARC field
    C{980 a}: the record is a thesis as soon as one of them contains
    the text C{THESIS}.

    Args:
        record (Record):

    Return:
        bool: true when the MARC record describes a thesis
    """
    collections = record._get(u"980", "a", force_list=True)
    return any('THESIS' in value for value in collections)
...@@ -16,7 +16,6 @@ class ExceptionUTF8(Exception): ...@@ -16,7 +16,6 @@ class ExceptionUTF8(Exception):
class CdsException(ExceptionUTF8): pass class CdsException(ExceptionUTF8): pass
class CheckException(ExceptionUTF8): pass class CheckException(ExceptionUTF8): pass
class InstituteException(ExceptionUTF8): pass
class Marc12Exception(ExceptionUTF8): pass class Marc12Exception(ExceptionUTF8): pass
class RecordException(ExceptionUTF8): pass class RecordException(ExceptionUTF8): pass
class XmlException(ExceptionUTF8): pass class XmlException(ExceptionUTF8): pass
...@@ -5,11 +5,14 @@ ...@@ -5,11 +5,14 @@
import re import re
from base import is_conference, is_institute from base import is_conference, is_institute, is_thesis
from exception import Marc12Exception from exception import Marc12Exception
from institute import Institute
from inveniostore import InvenioStore from inveniostore import InvenioStore
from iterrecord import IterRecord from iterrecord import IterRecord
from recordconf import RecordConf
from recordinst import RecordInst
from recordpubli import RecordPubli
from recordthesis import RecordThesis
MSG_NO_CONF = "Reject no conference information" MSG_NO_CONF = "Reject no conference information"
REG_CONF = re.compile("^C\d+-\d+-\d+(\.\d+)?$") REG_CONF = re.compile("^C\d+-\d+-\d+(\.\d+)?$")
...@@ -103,14 +106,13 @@ class Marc12(object): ...@@ -103,14 +106,13 @@ class Marc12(object):
for conf_id in ids: for conf_id in ids:
xml = cds.get_record(conf_id) xml = cds.get_record(conf_id)
for conference in IterRecord(xml): for conference in IterRecord(xml):
if conference.conference_key() == key: if conference._get(u"111", "g") == key:
return conference return conference
raise Marc12Exception(MSG_NO_CONF) raise Marc12Exception(MSG_NO_CONF)
def __call__(self, xml, filter=None, func=None): def __call__(self, xml, filter=None, func=None):
"""Transform the the XML string into a list of L{Record} """Transform the the XML string into a list of L{Record}.
or L{Institute}
@type xml: unicode @type xml: unicode
@param xml: the XML string has the following structure:: @param xml: the XML string has the following structure::
...@@ -143,7 +145,7 @@ class Marc12(object): ...@@ -143,7 +145,7 @@ class Marc12(object):
The argument of the function is a Record. The argument of the function is a Record.
It can be used to polish the record content. It can be used to polish the record content.
@rtype: list of L{Record} or L{Institute} @rtype: list of L{Record}
@raise Marc12Exception: not well formed XML. @raise Marc12Exception: not well formed XML.
...@@ -152,18 +154,25 @@ class Marc12(object): ...@@ -152,18 +154,25 @@ class Marc12(object):
for record in IterRecord(xml): for record in IterRecord(xml):
if is_institute(record): if is_conference(record):
record = Institute(record) upcast_record = RecordConf(record)
self._add_conference_data(upcast_record)
elif is_conference(record): elif is_institute(record):
self._add_conference_data(record) upcast_record = RecordInst(record)
if filter and not filter(record): elif is_thesis(record):
upcast_record = RecordThesis(record)
else:
upcast_record = RecordPubli(record)
if filter and not filter(upcast_record):
continue continue
if func: if func:
func(record) func(upcast_record)
li.append(record) li.append(upcast_record)
return li return li
This diff is collapsed.
# -*- coding: utf-8 -*-
""" invenio_tools.recordconf
"""
import re
from base import REG_YEAR
from plugin_dbui import CLEAN_SPACES
from recordpubli import RecordPubli
class RecordConf(RecordPubli):
    """MARC describing a conference talk or a proceeding.

    The relation between methods and MARC fields are the following::

                              | CDS     | INSPIREHEP
        ----------------------+---------+------------
        conference date       | 111 d   |
        conference end        | 111 z   | None
        conference key        | 111 g   |
        conference location   | 111 c   |
        conference title      | 111 a   |
        conference start      | None    | 111 x
        conference URL        | 8564 u  |
        conference year       | 111 f   |
        ref. conf. id         | 962 b   |
        ref. conf. key        | 962 n   | 773 w
        ref. conf. proceeding | 7870 w  |
        ref. conf. talk       | 7870 w  |
        ----------------------+---------+------------

    """
    def conference_dates(self):
        """The dates of the conference (MARC C{111 d}).

        @rtype: unicode
        @return:
            - The value looks like '6-5 March'.
            - The format is not standardized: it can vary between
              records and between stores.
            - The value is not a standardized C{date}.

        """
        return self._get(u"111", "d")

    def conference_country(self):
        """The country where the conference took place.

        It is the last comma-separated item of the conference location.

        @rtype: unicode
        @return:
            - Empty string when the location is not defined.
            - The filter L{CLEAN_SPACES} is applied.

        """
        location = self.conference_location()
        if not location:
            return ''

        country = location.split(',')[-1]
        return CLEAN_SPACES(country)

    def conference_key(self):
        """The conference key used in the store (MARC C{111 g}).

        @rtype: unicode
        @return:

        """
        return self._get(u"111", "g")

    def conference_location(self):
        """The conference location (MARC C{111 c}).

        @rtype: unicode
        @return:
            - The format is C{'town, country'}.
            - The filter L{CLEAN_SPACES} is applied.

        """
        raw = self._get(u"111", "c")

        # Some records carry two values, the first one not being a place:
        # [u'NOW 2012', u'Conca Specchiulla, Otranto, Lecce, Italy'].
        # In that case only the second value is kept.
        is_pair = isinstance(raw, list) and len(raw) == 2
        return CLEAN_SPACES(raw[1] if is_pair else raw)

    def conference_title(self):
        """The title of the conference (MARC C{111 a}).

        @rtype: unicode
        @return:
            - The filter L{CLEAN_SPACES} is applied.

        """
        title = self._get(u"111", "a")
        return CLEAN_SPACES(title)

    def conference_town(self):
        """The town where the conference took place.

        It is the first comma-separated item of the conference location.

        @rtype: unicode
        @return:
            - An empty string when the location is not defined.
            - The filter L{CLEAN_SPACES} is applied.

        """
        location = self.conference_location()
        if not location:
            return ''

        town = location.split(',')[0]
        return CLEAN_SPACES(town)

    def conference_url(self):
        """The URL of the conference home page (MARC C{8564 u}).

        @rtype: unicode
        @return:
            - The first URL is selected, arbitrarily, when more than
              one is found.
            - Empty string when not defined.

        """
        urls = self._get(u"8564", "u", force_list=True)

        # From time to time this field references the pdf file of the
        # paper instead of the conference home page: ignore those entries.
        candidates = [url for url in urls if not url.endswith('pdf')]

        return candidates[0] if candidates else u''

    def conference_year(self):
        """The year of the conference (MARC C{111 f}).

        When the year is not defined, it is extracted from the
        conference dates.

        @rtype: unicode
        @return:
            - Empty string when the year cannot be determined.

        """
        year = self._get(u"111", "f")

        if not year:
            # recovery from the conference dates
            found = REG_YEAR.search(self.conference_dates())
            year = found.group(1) if found else u''

        return year

    def reference_conference_id(self):
        """The C{id} of the conference when the record is a proceeding
        or a conference talk (MARC C{962 b}).

        @rtype: unicode
        @return:
            - Empty string when not defined.

        """
        return self._get(u"962", "b")

    def reference_conference_key(self):
        """The conference C{key} when the record is a proceeding
        or a conference talk.

        The location of the value depends on the store:
        cds.cern.ch (C{962 n}) and inspirehep.net (C{773 w}).

        @rtype: unicode
        @return:
            - Empty string when not defined.

        """
        # inspirehep.net: the 773 field is either a dictionary or a list
        # of dictionaries. Two kinds of entries exist, one describing the
        # proceeding and the other one containing the conference key.
        # All topologies are met: proc, conf and proc + conf.
        if self.host().startswith("inspirehep") and u"773" in self:
            field = self[u"773"]

            if isinstance(field, dict):
                return field["w"] if "w" in field else ''

            key = ''
            if isinstance(field, list):
                # the last entry containing a key wins
                for entry in field:
                    if "w" in entry:
                        key = entry["w"]
            return key

        # other stores
        if u"962" in self and "n" in self[u"962"]:
            return self[u"962"]["n"]

        return ''

    def reference_conference_proceeding(self):
        """The id of the proceeding when the record is a conference talk
        (MARC C{7870 w}).

        @rtype: unicode
        @return: record id

        """
        return self._get(u"7870", "w")

    def reference_conference_talk(self):
        """The id of the conference talk when the record is a proceeding
        (MARC C{7870 w}).

        @rtype: unicode
        @return: record id

        """
        return self._get(u"7870", "w")
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" invenio_tools.institute """ invenio_tools.recordinst
""" """
from base import is_institute from base import is_institute
from exception import InstituteException from exception import RecordException
from record import Record from record import Record
...@@ -12,11 +12,9 @@ MSG_INVALID_HOST = "Invalid record host" ...@@ -12,11 +12,9 @@ MSG_INVALID_HOST = "Invalid record host"
MSG_INVALID_RECORD = "Invalid record, it is not describing an institute" MSG_INVALID_RECORD = "Invalid record, it is not describing an institute"
class Institute(dict): class RecordInst(Record):
"""MARC record representing an institute. More information on MARC """MARC record describing an institute.
standard at U{http://www.loc.gov/marc/bibliographic/}). The relation between methods and MARC fields are the following::
The relation between methods and MARC field is the following::
| INSPIREHEP | | INSPIREHEP |
----------------------+-------------+ ----------------------+-------------+
...@@ -35,15 +33,15 @@ class Institute(dict): ...@@ -35,15 +33,15 @@ class Institute(dict):
""" """
if not isinstance(record, Record): if not isinstance(record, Record):
raise InstituteException(MSG_INVALID_ARG) raise RecordException(MSG_INVALID_ARG)
if not is_institute(record): if not is_institute(record):
raise InstituteException(MSG_INVALID_RECORD) raise RecordException(MSG_INVALID_RECORD)
if record.host() != 'inspirehep.net': if record.host() != 'inspirehep.net':
raise InstituteException(MSG_INVALID_INSTITUTE) raise RecordException(MSG_INVALID_INSTITUTE)
dict.__init__(self, record) Record.__init__(self, record)
def future_id(self): def future_id(self):
""" """
...@@ -53,14 +51,6 @@ class Institute(dict): ...@@ -53,14 +51,6 @@ class Institute(dict):
""" """
return self[u"110"]["t"] return self[u"110"]["t"]
def id(self):
"""
Returns:
unicode: the inspirehep id.
"""
return self[u"110"]["u"]
def name(self): def name(self):
""" """
Returns: Returns:
......
# -*- coding: utf-8 -*-
""" invenio_tools.recordpubli
"""
import re
from base import ARXIV, ARXIV_PDF, REG_ARXIV_NUMBER, REG_YEAR, THESIS_DIR
from filters import CLEAN_COLLABORATION
from plugin_dbui import CLEAN_SPACES
from record import Record
class RecordPubli(Record):
"""MARC record describing a publication.
The relation between methods and MARC fields are the following::
| CDS | INSPIREHEP
----------------------+---------+----------
authors | 700 a |
collaboration | 710 g |
first author | 100 a |
institutes | 700 u |
paper editor | 773 p |
paper pages | 773 c |
paper reference | 773 o |
paper URL | 8564 u |
paper volume | 773 v |
paper year | 773 y |
preprint number | 037 a |
report number | 088 a | 037a
submitted | 269 c |
title | 245 a |
year | 260 c |
----------------------+---------+----------
"""
def authors(self, cmpFct=None):
    """The author(s) signing the publication.

    @type cmpFct: reference to a function or None
    @param cmpFct: Sort key for the author names.
        Despite its name, the function is handed to the sort as its
        C{key} argument: it receives a single author name and returns
        the value used to order it. A comparison function taking two
        names and returning -1, 0 or 1 is not supported.

    @rtype: unicode
    @return:
        - Author names are separated by ", ".
        - Authors are sorted according to the key C{cmpFct},
          when it is defined. Otherwise the order of the record is kept.
        - The string is empty when there is no authors.

    """
    names = self.authors_as_list()

    if cmpFct:
        # sort a copy: the list handed over by authors_as_list is untouched
        names = sorted(names, key=cmpFct)

    return u', '.join(names)
def authors_as_list(self):
    """The list of author(s) signing the publication.

    The names are read from the subfield C{a} of the MARC field C{700}.
    The field is either a dictionary (one entry) or a list of
    dictionaries (several entries).

    @rtype: list
    @return:
        - The list is empty when authors are not defined.
        - Entries flagged as thesis director (subfield C{e} equal to
          C{THESIS_DIR}) are skipped.
        - Entries without the subfield C{a} are skipped.

    """
    if u"700" not in self:
        return []

    field = self[u"700"]

    # normalise the two topologies into a list of entries
    if isinstance(field, dict):
        entries = [field]
    elif isinstance(field, list):
        entries = field
    else:
        return []

    authors = []
    for entry in entries:

        # NOTE: the content of the 700 field depends on the record type.
        # For a thesis it also contains the name of the director.
        if "e" in entry and entry["e"] == THESIS_DIR:
            continue

        # protection against an entry without a name
        if "a" in entry:
            authors.append(entry["a"])

    return authors
def collaboration(self):
"""The collaboration(s) signing the publication.
@rtype: unicode