Commit d2a8a4fb authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Refactor Record in RecordConf, RecordInst, RecordPubli, RecordThesis classes.

parent 0ec8ce83
......@@ -3,21 +3,29 @@
@note: details on the invenio API at U{http://invenio-software.org/}
"""
from base import (is_conference,
from base import (ARXIV,
ARXIV_PDF,
is_conference,
is_institute,
is_thesis,
OAI_URL,
REG_ARXIV_NUMBER,
REG_OAI,
REG_YEAR)
REG_YEAR,
THESIS_DIR)
from exception import (CdsException,
CheckException,
InstituteException,
Marc12Exception,
RecordException,
XmlException)
from checkandfix import CheckAndFix, load_record
from institute import Institute
from inveniostore import InvenioStore
from iterrecord import IterRecord
from marc12 import Marc12
from record import Record
from recordconf import RecordConf
from recordinst import RecordInst
from recordpubli import RecordPubli
from recordthesis import RecordThesis
......@@ -4,11 +4,17 @@
"""
import re
# Labels and URL pieces shared by the record classes.
ARXIV = "arXiv"
ARXIV_PDF = "http://arxiv.org/pdf/"

# URL template with two string placeholders.
OAI_URL = "http://%s/record/%s"

# The patterns are written as raw strings so that the backslashes reach the
# regular expression engine untouched instead of relying on Python leaving
# unknown string escapes (such as "\d") alone.
REG_ARXIV_NUMBER = re.compile(r"\d+\.\d+")
REG_OAI = re.compile(r'oai:([a-z\.]+):([\d]+)')
REG_YEAR = re.compile(r"(\d{4})")

# Value of the subfield "e" of the field 700 marking a thesis director.
THESIS_DIR = u"dir."
def is_conference(record):
"""True when the record describes a publication related to a conference.
......@@ -21,11 +27,22 @@ def is_conference(record):
to a conference.
"""
return u"111" in record or record.reference_conference_key()
if u"111" in record:
return True
# try with the conference key
# the location of this value depends on the store
# cds.cern.ch (962, n) and inspirehep.net (773,w).
if record.host().startswith("cds"):
field, subfield = u"962", "n"
else:
field, subfield = u"773", "w"
return len(record._get(field, subfield)) > 0
def is_institute(record):
""" True when the record describes an institute.
"""True when the record describes an institute.
Args:
record (Record):
......@@ -51,3 +68,18 @@ def is_institute(record):
return True
return False
def is_thesis(record):
    """Tell whether the record describes a thesis.

    The decision is taken on the MARC field 980, subfield a: all its
    values are joined in a single string which is searched for the
    word 'THESIS'.

    Args:
        record (Record):

    Return:
        bool: true when 'THESIS' appears in the joined values.

    """
    collections = ", ".join(record._get(u"980", "a", force_list=True))
    return 'THESIS' in collections
......@@ -16,7 +16,6 @@ class ExceptionUTF8(Exception):
# Exception types of the package. Each one only subclasses ExceptionUTF8
# and adds no attribute or method of its own, so they differ by name only.
class CdsException(ExceptionUTF8): pass
class CheckException(ExceptionUTF8): pass
class InstituteException(ExceptionUTF8): pass
class Marc12Exception(ExceptionUTF8): pass
class RecordException(ExceptionUTF8): pass
class XmlException(ExceptionUTF8): pass
......@@ -5,11 +5,14 @@
import re
from base import is_conference, is_institute
from base import is_conference, is_institute, is_thesis
from exception import Marc12Exception
from institute import Institute
from inveniostore import InvenioStore
from iterrecord import IterRecord
from recordconf import RecordConf
from recordinst import RecordInst
from recordpubli import RecordPubli
from recordthesis import RecordThesis
# Message carried by the Marc12Exception raised when no conference matches.
MSG_NO_CONF = "Reject no conference information"

# Raw string so that "\d" and "\." are regular expression escapes and not
# (invalid) string escapes.
REG_CONF = re.compile(r"^C\d+-\d+-\d+(\.\d+)?$")
......@@ -103,14 +106,13 @@ class Marc12(object):
for conf_id in ids:
xml = cds.get_record(conf_id)
for conference in IterRecord(xml):
if conference.conference_key() == key:
if conference._get(u"111", "g") == key:
return conference
raise Marc12Exception(MSG_NO_CONF)
def __call__(self, xml, filter=None, func=None):
"""Transform the the XML string into a list of L{Record}
or L{Institute}
"""Transform the the XML string into a list of L{Record}.
@type xml: unicode
@param xml: the XML string has the following structure::
......@@ -143,7 +145,7 @@ class Marc12(object):
The argument of the function is a Record.
It can be used to polish the record content.
@rtype: list of L{Record} or L{Institute}
@rtype: list of L{Record}
@raise Marc12Exception: not well formed XML.
......@@ -152,18 +154,25 @@ class Marc12(object):
for record in IterRecord(xml):
if is_institute(record):
record = Institute(record)
if is_conference(record):
upcast_record = RecordConf(record)
self._add_conference_data(upcast_record)
elif is_conference(record):
self._add_conference_data(record)
elif is_institute(record):
upcast_record = RecordInst(record)
if filter and not filter(record):
elif is_thesis(record):
upcast_record = RecordThesis(record)
else:
upcast_record = RecordPubli(record)
if filter and not filter(upcast_record):
continue
if func:
func(record)
func(upcast_record)
li.append(record)
li.append(upcast_record)
return li
......@@ -2,21 +2,10 @@
""" invenio_tools.record
"""
import re
import pprint
from base import OAI_URL, REG_YEAR, REG_OAI
from filters import CLEAN_COLLABORATION, CLEAN_THESIS_DEFENSE
from plugin_dbui import CLEAN_SPACES
ARXIV = "arXiv"
ARXIV_PDF = "http://arxiv.org/pdf/"
REG_ARXIV_NUMBER = re.compile("\d+\.\d+")
THESIS_DIR = u"dir."
from base import OAI_URL, REG_OAI
class Record(dict):
......@@ -32,55 +21,24 @@ class Record(dict):
dict2(subfield1=..., subfield2=...), ...]
In the MARC standard, the C{field} is a string containing at least three digit
while the C{subfield} is a letter. The type of the C{field} and C{subfield}
is string.
while the C{subfield} is a letter. The type of the C{field} is unicode
and C{subfield} is string.
The class comes with a collection of methods to extract the record
information ignoring the C{field} and the C{subfield} codification.
information masking the C{field} and the C{subfield} codification.
The realtion between methods and MARC field is the following::
The relation between methods and MARC fields is the following::
| CDS | INSPIREP
----------------------+---------+----------
authors | 700 a |
collaboration | 710 g |
conference date | 111 d |
conference end | 111 z | None
conference key | 111 g |
conference location | 111 c |
conference title | 111 a |
conference start | None | 111 x
conference URL | 8564 u |
conference year | 111 f |
first author | 100 a |
id | 001 |
institutes | 700 u |
oai | 0248 a | 909CO o
paper editor | 773 p |
paper pages | 773 c |
paper reference | 773 o |
paper URL | 8564 u |
paper volume | 773 v |
paper year | 773 y |
preprint number | 037 a |
ref. conf. id | 962 b |
ref. conf. key | 962 n | 773 w
ref. conf. proceeding | 7870 w |
ref. conf. talk | 7870 w |
report number | 088 a | 037a
submitted | 269 c |
these defense | 500 a |
these level | 502 a |
these director | 700 a |
these universities | 502 b |
title | 245 a |
year | 260 c |
----------------------+---------+----------
"""
def __init__(self):
def __init__(self, *args):
dict.__init__(self)
dict.__init__(self, *args)
# private cache
self.__host = None
......@@ -93,8 +51,8 @@ class Record(dict):
def _get(self, field, subfield, force_list=False):
"""Get the value associated to the key C{field} and C{subfield}.
@type field: str
@param field: typical values are "001", "700", "909CO", ....
@type field: unicode
@param field: typical values are u"001", u"700", u"909CO", ....
@type subfield: str
@param subfield: typical values are "a", "b", ....
......@@ -125,290 +83,12 @@ class Record(dict):
return val
def authors(self, cmpFct=None):
    """The author(s) signing the publication.

    @type cmpFct: reference to a function or None
    @param cmpFct: when defined, it is handed to the sort as its C{key}
        argument: it is called with one author name and returns the
        value used to order the names.

    @rtype: unicode
    @return:
        - Author names are separated by ", ".
        - Names are ordered with C{cmpFct} when it is defined, otherwise
          the order of L{authors_as_list} is kept.
        - The string is empty when there are no authors.

    """
    names = self.authors_as_list()
    if cmpFct:
        names = sorted(names, key=cmpFct)
    return u', '.join(names)
def authors_as_list(self):
    """The list of author(s) signing the publication.

    The names are read from the subfield "a" of the field 700, which is
    either a single dictionary or a list of dictionaries. Entries whose
    subfield "e" is equal to L{THESIS_DIR} are skipped: for a thesis
    the field 700 also contains the name of the director.

    @rtype: list
    @return:
        - The list is empty when the field 700 is missing or is neither
          a dictionary nor a list.

    """
    if "700" not in self:
        return []

    entries = self["700"]

    # a single author is stored as a bare dictionary
    if isinstance(entries, dict):
        entries = [entries]
    elif not isinstance(entries, list):
        return []

    return [entry["a"] for entry in entries
            if not ("e" in entry and entry["e"] == THESIS_DIR)]
def collaboration(self):
    """The collaboration(s) signing the publication.

    The names are read from the field 710, subfield g.

    @rtype: unicode
    @return:
        - Collaboration names are joined with ", ".
        - The joined string is passed through L{CLEAN_COLLABORATION}.

    """
    names = self._get("710", 'g', force_list=True)
    joined = ', '.join(names)
    return CLEAN_COLLABORATION(joined)
def conference_dates(self):
    """The dates of the conference.

    The value is the content of the field 111, subfield d, returned
    as it is.

    @rtype: unicode
    @return:
        - A typical value is '6-5 March'.
        - The format is not standardized and can vary between records
          and between stores.
        - The value is not a standardized C{date}.

    """
    dates = self._get("111", 'd')
    return dates
def conference_country(self):
    """The country where the conference took place.

    The country is the text found after the last comma of the
    conference location.

    @rtype: unicode
    @return:
        - Empty string when the location is not defined.
        - The filter L{CLEAN_SPACES} is applied.

    """
    location = self.conference_location()
    if not location:
        return ''

    country = location.split(',')[-1]
    return CLEAN_SPACES(country)
def conference_key(self):
    """The conference key used in the store.

    @rtype: unicode
    @return: the content of the field 111, subfield g.

    """
    key = self._get("111", 'g')
    return key
def conference_location(self):
    """The conference location.

    The value is read from the field 111, subfield c, and passed
    through L{CLEAN_SPACES}.

    @rtype: unicode
    @return:
        - The format is C{'town, country'}

    """
    value = self._get("111", 'c')

    # the subfield sometimes holds two values, e.g.
    # [u'NOW 2012', u'Conca Specchiulla, Otranto, Lecce, Italy']
    # in that case only the second one is kept
    is_pair = isinstance(value, list) and len(value) == 2
    return CLEAN_SPACES(value[1] if is_pair else value)
def conference_title(self):
    """The title of the conference.

    @rtype: unicode
    @return: the content of the field 111, subfield a, after applying
        the filter L{CLEAN_SPACES}.

    """
    title = self._get("111", 'a')
    return CLEAN_SPACES(title)
def conference_town(self):
    """The town where the conference took place.

    The town is the text found before the first comma of the
    conference location.

    @rtype: unicode
    @return:
        - An empty string when the location is not defined.
        - The filter L{CLEAN_SPACES} is applied.

    """
    location = self.conference_location()
    if not location:
        return ''

    town = location.split(',')[0]
    return CLEAN_SPACES(town)
def conference_url(self):
    """The URL of the conference home page.

    The candidates are the values of the field 8564, subfield u.

    @rtype: unicode
    @return:
        - The first URL is selected arbitrarily when more than one
          is found.
        - Empty string when not defined.

    """
    links = self._get("8564", 'u', force_list=True)

    # from time to time the field contains the reference to the pdf file,
    # those entries are not the conference home page
    candidates = [link for link in links if not link.endswith('pdf')]

    return candidates[0] if candidates else u''
def conference_year(self):
    """The year of the conference.

    The year is read from the field 111, subfield f. When that value is
    empty, the conference dates are searched with L{REG_YEAR}.

    @rtype: unicode
    @return:
        - Empty string when the year is found in neither place.

    """
    explicit = self._get("111", 'f')
    if explicit:
        return explicit

    # recovery from the conference dates
    found = REG_YEAR.search(self.conference_dates())
    return found.group(1) if found else u''
def debug(self):
    """Pretty-print the record structure on the standard output.

    """
    printer = pprint.PrettyPrinter()
    printer.pprint(self)
def find_authors(self, pattern):
    """Find the authors matching the regular expression C{pattern}.

    The expression is searched in each name returned by
    L{authors_as_list}.

    @type pattern: unicode
    @param pattern: regular expression defining the author names.

    @rtype: unicode
    @return:
        - Matching names are separated by ", " and keep the order
          of L{authors_as_list}.
        - The string is empty when nothing is found.

    """
    matcher = re.compile(pattern)
    hits = [name for name in self.authors_as_list() if matcher.search(name)]
    return u', '.join(hits)
def find_authors_by_institute(self, pattern, cmpFct=None):
    """Find the authors belonging to the institute(s) defined by a regular
    expression.

    The authors and their affiliations are read from the field 700,
    subfields a and u respectively. When an author has several
    affiliations they are joined with ", " before the search.

    @type pattern: unicode
    @param pattern: regular expression defining the institute name(s)

    @type cmpFct: reference to a function
    @param cmpFct: when defined, it is handed to the sort as its C{key}
        argument. It is only used when the field 700 is a list.

    @rtype: unicode or None
    @return:
        - C{None} when neither the field 100 nor the field 700 exists,
          or when the field 700 is missing or is neither a dictionary
          nor a list.
        - Field 700 is a dictionary (single author): the author name when
          its affiliation matches, C{None} when it does not match or when
          the affiliation is missing.
        - Field 700 is a list: the matching names separated by ", " and
          sorted with C{cmpFct} when defined. The string is empty when
          nobody matches. C{None} as soon as one entry has no affiliation.

    """
    # authors not defined
    if not ("100" in self or "700" in self):
        return None

    # compile the searching criteria
    regex = re.compile(pattern)

    if "700" not in self:
        return None

    entries = self["700"]

    # single author in the author list
    if isinstance(entries, dict):
        if 'u' not in entries:
            return None

        affiliations = entries['u']
        if isinstance(affiliations, list):
            affiliations = ', '.join(affiliations)

        return entries["a"] if regex.search(affiliations) else None

    if not isinstance(entries, list):
        return None

    # list of authors
    matched = []
    for entry in entries:
        if 'u' not in entry:
            return None

        affiliations = entry['u']
        if isinstance(affiliations, list):
            affiliations = ', '.join(affiliations)

        if regex.search(affiliations):
            matched.append(entry['a'])

    if cmpFct:
        matched.sort(key=cmpFct)

    return u', '.join(matched)
def first_author(self):
    """The name of the first author.

    @rtype: unicode
    @return: the content of the field 100, subfield a.

    """
    name = self._get("100", 'a')
    return name
def first_author_institutes(self):
    """The institute(s) associated to the first author.

    The institutes are the values of the field 100, subfield u.

    @rtype: unicode
    @return:
        - Names are separated by ", ".
        - The string is empty when institutes are not defined.

    """
    return u', '.join(self._get("100", 'u', force_list=True))
def host(self):
"""The host housing the record.
......@@ -449,114 +129,13 @@ class Record(dict):
"""The id of the record in the store.
@rtype: unicode
@return: the unique id of the record in the store
"""
return self["001"]
def institutes(self):
    """The list of institutes signing the publication.

    The institutes are collected from the field 700, subfield u.

    @rtype: list
    @return:
        - Each institute appears once.
        - The list is sorted in alphabetic order.

    """
    unique = set()

    # each entry can be a string or a list when the author has
    # several affiliations
    for affiliation in self._get("700", 'u', force_list=True):
        if isinstance(affiliation, list):
            unique.update(affiliation)
        else:
            unique.add(affiliation)

    return sorted(unique)
def is_conference_data(self):
    """C{True} when the record contains conference information.

    @rtype: bool
    @return: C{True} when the field 111 is a key of the record.

    """
    found = "111" in self
    return found
def is_institute_defined(self):
"""C{True} when institutes are defined for all authors.
@rtype: bool
@return:
"""
if "700" not in self:
return False
# dict case
if isinstance(self["700"], dict):
return "u" in self["700"]
# list case
elif isinstance(self["700"], list):
for el in self["700"]:
if isinstance(el, dict):
if 'u' in el:
continue