# -*- coding: utf-8 -*-
""" invenio_tools.recordpubli

"""
import re

from functools import cmp_to_key

from base import (ARXIV,
                  ARXIV_PDF,
                  REG_ARXIV_NUMBER,
                  REG_AUTHOR,
                  REG_YEAR,
                  THESIS_DIR)
from filters import CLEAN_COLLABORATION
from plugin_dbui import CLEAN_SPACES
from record import Record


class RecordPubli(Record):
    """MARC record describing a publication.
    The relation between methods and MARC fields are the following::

                              | CDS     | INSPIREHEP
        ----------------------+---------+------------
        authors               | 700 a   |
        collaboration         | 710 g   |
        first author          | 100 a   |
        institutes            | 700 u   |
        paper editor          | 773 p   |
        paper pages           | 773 c   |
        paper reference       | 773 o   |
        paper URL             | 8564 u  |
        paper volume          | 773 v   |
        paper year            | 773 y   |
        preprint number       | 037 a   |
        report number         | 088 a   | 037a
        submitted             | 269 c   |
        title                 | 245 a   |
        year                  | 260 c   |
        ----------------------+---------+------------

    """

    def authors(self, cmpFct=None):
        """The author(s) signing the publication.

        @type cmpFct: reference to a function or None
        @param cmpFct: Compare author names.
        The comparison function takes two items and returns -1, 0, or 1
        depending on whether the first argument is considered smaller than,
        equal to, or larger than the second one.

        @rtype: unicode
        @return:
            - Author names are separated by ", ".
            - Author are sorted according to the function C{cmpFct}.
            - The string is empty when there is no authors.

        """
        li = self.authors_as_list()

        if cmpFct:
            # cmpFct is a two-argument comparison function, not a key
            # function, so it has to be adapted before being given to sort.
            li.sort(key=cmp_to_key(cmpFct))

        return u', '.join(li)

    def authors_as_list(self):
        """The list of author(s) signing the publication.

        @rtype: list
        @return:
            - The list is empty when authors are not defined.

        """
        authors = []
        first_author = self.first_author()

        # a single author
        if u"700" in self and isinstance(self[u"700"], dict):
            if "a" in self[u"700"]:
                authors.append(self[u"700"]["a"])

        # a list of authors
        elif u"700" in self and isinstance(self[u"700"], list):
            for di in self[u"700"]:
                if "a" not in di:
                    continue

                author = di["a"]

                # PROTECTION
                # in most of the case the author is a string
                # but it can be a list, e.g inspirehep.net/138663:
                # [u'Zuniga, J.', u'(the A.N.T.ARES. Collaboration)']
                if isinstance(author, unicode):
                    authors.append(author)

                elif isinstance(author, list):
                    # keep only the first element looking like an author
                    for elt in author:
                        if REG_AUTHOR.match(elt):
                            authors.append(elt)
                            break

        # the first author is defined not the other one
        elif first_author:
            authors.append(first_author)

        # sometime the first author is missing from the list.
        # Guarded so that an empty list does not raise IndexError and
        # an undefined (empty) first author is never inserted.
        if first_author and (not authors or first_author != authors[0]):
            authors.insert(0, first_author)

        return authors

    def collaboration(self):
        """The collaboration(s) signing the publication.

        @rtype: unicode
        @return:
            - Collaboration names are separated by ", ".
            - The filter L{CLEAN_COLLABORATION} is applied.

        """
        li = self._get(u"710", 'g', force_list=True)
        return CLEAN_COLLABORATION(', '.join(li))

    def find_authors(self, pattern):
        """Find authors matching the regular expression C{pattern}.

        @type pattern: unicode
        @param pattern: regular expression defining the author names.

        @rtype: unicode
        @return:
            - Author names are separated by ", ".
            - The string is empty when nothing is found.

        """
        regex = re.compile(pattern)
        li = [el for el in self.authors_as_list() if regex.search(el)]
        return u', '.join(li)

    def find_authors_by_institute(self, pattern, cmpFct=None):
        """Find authors belonging to a given institute(s) defined by a
        regular expression.

        @type pattern: unicode
        @param pattern: regular expression defining the institute name(s)

        @type cmpFct: reference to a function
        @param cmpFct: Compare author names.
        The comparison function takes two items and returns -1, 0, or 1
        depending on whether the first argument is considered smaller than,
        equal to, or larger than the second one.

        @rtype: unicode
        @return:
            - Author names are separated by ", ".
            - Author are sorted according to the function C{cmpFct}.
            - Empty string when authors are not found.

        """
        # authors not defined
        if not self.is_authors():
            return u""

        authors = []
        regex = re.compile(pattern)

        # standard case
        data_authors = (self[u"700"] if u"700" in self else [])
        if isinstance(data_authors, dict):
            data_authors = [data_authors]

        # to cover the case in which the first author is not in self[u"700"]
        data_first_author = (self[u"100"] if u"100" in self else [])
        if isinstance(data_first_author, dict):
            data_first_author = [data_first_author]

        # scan
        for elements in (data_authors, data_first_author):
            for di in elements:

                # no affiliation for this author: skip it, but keep
                # scanning the others which might match the pattern
                if 'u' not in di:
                    continue

                affiliations = di['u']
                if isinstance(affiliations, list):
                    affiliations = u", ".join(affiliations)

                # affiliation match
                if regex.search(affiliations) and "a" in di:
                    authors.append(di["a"])

        # remove duplicate entries and sort
        authors = list(set(authors))
        if cmpFct:
            # cmpFct is a two-argument comparison function, not a key
            authors.sort(key=cmp_to_key(cmpFct))

        return u", ".join(authors)

    def first_author(self):
        """The name of the first author.

        @rtype: unicode
        @return:
            - Empty string when the first author is not defined.

        """
        # standard case
        value = self._get(u"100", "a")
        if value:
            # PROTECTION
            # It happens that the first author is duplicate, remove it
            if isinstance(value, list):
                value = ", ".join(set(value))
            return value

        # sometime it is only defined in the authors list
        if u"700" in self:
            data = self[u"700"]

            if isinstance(data, dict) and "a" in data:
                return data["a"]

            # the emptiness test protects the access to the first element
            elif isinstance(data, list) and data:
                if "a" in data[0]:
                    return data[0]["a"]

        return u""

    def first_author_institutes(self):
        """The institute(s) associated to the first author.

        @rtype: unicode
        @return:
            - names are separated by ", ".
            - The string is empty when institutes are not defined.

        """
        # standard case
        # PROTECTION
        # sometime the first author is duplicate -- remove duplicate
        li = self._get(u"100", "u", force_list=True)
        if li:
            return u", ".join(set(li))

        # sometime it is only defined in the authors list
        if u"700" in self:
            data = self[u"700"]

            # a list of authors: the first author is the first entry
            if isinstance(data, list):
                data = (data[0] if data else None)

            if isinstance(data, dict) and "u" in data:
                institutes = data["u"]
                if isinstance(institutes, unicode):
                    return institutes
                elif isinstance(institutes, list):
                    return u", ".join(institutes)

        return u""

    def institutes(self):
        """The list of institute signing the publication.

        @rtype: list
        @return:
            - The list is sort in alphabetic order.

        """
        li = []

        # each entry can be a string or a list when the author has
        # several affiliations
        for field in (u"100", u"700"):
            for el in self._get(field, "u", force_list=True):
                if isinstance(el, list):
                    li.extend(el)
                else:
                    li.append(el)

        # remove duplicate entries and sort in alphabetic order
        return sorted(set(li))

    def is_affiliations(self):
        """C{True} when affiliations are defined for authors.
        This is a fast algorithm checking only first and last authors.
        To check that the affiliation is defined for all authors,
        uses the method is_affiliation_for_all.

        @rtype: bool
        @return:

        """
        for field in (u"100", u"700"):
            if field not in self:
                continue

            data = self[field]

            if isinstance(data, dict):
                if "u" not in data:
                    return False

            elif isinstance(data, list):
                # first and last entries only.
                # Slices are used in order to cope with empty list and
                # with list containing a single entry.
                for di in data[:1] + data[-1:]:
                    if "u" not in di:
                        return False

        return True

    def is_affiliation_for_all(self):
        """C{True} when affiliation are defined for all authors.

        @rtype: bool
        @return:

        """
        if u"700" not in self and u"100" not in self:
            return False

        for field in (u"100", u"700"):
            if field not in self:
                continue

            dictionaries = self[field]
            if isinstance(dictionaries, dict):
                dictionaries = [dictionaries]

            # each entry has to be a dictionary with the subfield u
            for di in dictionaries:
                if not isinstance(di, dict) or "u" not in di:
                    return False

        return True

    def is_authors(self):
        """C{True} when authors are defined.

        @rtype: bool
        @return:

        """
        return u"100" in self or u"700" in self

    def is_published(self):
        """C{True} when the record is published.

        @rtype: bool
        @return:

        """
        if u"773" not in self:
            return False

        # Should have the field "p", "v", "y" and "c"
        # It is in the state inprint when the dict contains only the o field.
        di = self[u"773"]
        if isinstance(di, dict):
            return set(("c", "p", "v", "y")).issubset(di)

        return False

    def is_with_erratum(self):
        """C{True} when the record contains erratum data.

        @rtype: bool
        @return:

        """
        # record with erratum contains a list of editor
        return u"773" in self and isinstance(self[u"773"], list)

    def paper_editor(self):
        """The abbreviated version of the review, I{i.e.} Phys Lett B.

        @rtype: unicode or list
        @return:
            - A list when there are erratum.
            - empty string when not defined

        """
        return self._get(u"773", "p")

    def paper_pages(self):
        """The page number / range when the record is published in a review.

        @rtype: unicode or list
        @return:
            - The format is "45-67" or "234"
            - Empty string when not defined

        """
        return self._get(u"773", "c")

    def paper_reference(self):
        """The full reference for a publication published in a review.

        @rtype: unicode
        @return:
            - The format is "Phys Lett B 456 2010 5-6"
            - The string is empty when the publication is not
              published in a review.

        """
        if u"773" not in self:
            return u""

        # NOTE: the subfields are looked for directly in self[u"773"].
        # When that field is a list (erratum) none of them is found
        # and the reference is empty.
        di = self[u"773"]
        li = [di[k] for k in ("p", "v", "y", "c") if k in di]

        return u' '.join(li)

    def paper_url(self):
        """The URL of the preprint.

        @note: Many others URL exists mainly those related to open access.

        @rtype: unicode
        @return:
            - the string is empty when no URLs are found.

        """
        pdf = "%s.pdf" % self.preprint_number()

        if u"8564" in self and isinstance(self[u"8564"], list):
            for el in self[u"8564"]:

                # an entry without URL can not be used
                if "u" not in el:
                    continue

                url = el["u"]

                # protection see http://cds.cern.ch/record/2014733
                # the URL is a list, rebuild it from the preprint number
                if isinstance(url, list):
                    m = REG_ARXIV_NUMBER.search(pdf)
                    if m:
                        return "%s%s" % (ARXIV_PDF, m.group())
                    continue

                # cds.cern.ch
                if "y" in el:
                    if el["y"] == u"Preprint":
                        return url

                # inspirehep.net
                elif url.endswith(pdf):
                    return url

        # nothing found, whatever the structure of the field 8564
        return u""

    def paper_volume(self):
        """The volume number when the record is published in a review.

        @rtype: unicode or list
        @return:
            - A list when there are erratum.
            - Empty string when nothing is found.

        """
        return self._get(u"773", "v")

    def paper_year(self):
        """The year of the publication.

        @rtype: unicode or list
        @return:
            - A list when there are erratum.
            - Empty string if the year is not defined.

        """
        rep = self._get(u"773", "y")

        # protection
        # in record http://cds.cern.ch:record/1951625 the entry 773y
        # is duplicate but there is no erratum
        if isinstance(rep, list) and not isinstance(self["773"], list):
            rep = list(set(rep))
            if len(rep) == 1:
                rep = rep[0]

        return rep

    def preprint_number(self):
        """The ArXiv preprint number.

        @rtype: unicode
        @return:
            - Empty string when not defined

        """
        # for both CDS and INSPIREHEP preprint data in 37 a
        # for CDS preprint information are also store in 88 a
        for k in (u"037", u"088"):
            for val in self._get(k, "a", force_list=True):
                if ARXIV in val:
                    return val

        return u''

    def report_number(self):
        """The report number(s) associated to the publication.

        @rtype: unicode
        @return:
            - Numbers are separated by ", "
            - Number are sorted in alphabetic order.
            - Empty string when not defined.

        """
        li = []

        # cds.cern.ch
        # report number can be in 37a, 88a and 88 9
        # entry can be the preprint number arXiv:xxx
        if self.host().startswith("cds"):
            for elt in self._get(u"088", "a", force_list=True):
                if not elt.startswith(ARXIV):
                    li.append(elt)

            # if empty have a look to "088" "9"
            # logic to avoid version number in 88/9
            # 88/a = LHCB-PAPER-2015-016 while 88/9 = LHCB-PAPER-2015-016-003
            if not li:
                for elt in self._get(u"088", "9", force_list=True):
                    if not elt.startswith(ARXIV):
                        li.append(elt)

        # inspirehep.net / cds.cern.ch -- example of MARC structure:
        #    037__ $$aLHCB-PAPER-2014-047
        #    037__ $$aCERN-PH-EP-2014-221
        #    037__ $$9arXiv$$aarXiv:1410.0149$$chep-ex
        if u"037" in self:
            entries = self[u"037"]

            # a single entry is processed as a list with one element,
            # any other type is ignored
            if isinstance(entries, dict):
                entries = [entries]
            elif not isinstance(entries, list):
                entries = []

            for di in entries:
                # entry tagged as a preprint
                if "9" in di and di["9"] == ARXIV:
                    continue

                if "a" in di and not di["a"].startswith(ARXIV):
                    li.append(di["a"])

        li.sort()
        return ', '.join(li)

    def submitted(self):
        """The date of submission.

        @rtype: list
        @return:
            - The format is "YYYY-MM" or "YYYY-MM-DD"
            - A list when there are erratum.
            - Empty list when not defined.

        """
        return self._get(u"269", "c", force_list=True)

    def title(self):
        """The title of the publication.

        @rtype: unicode or list
        @return:
            - A list when there are erratum.
            - Empty string when not defined.
            - The filter L{CLEAN_SPACES} is applied.

        """
        val = self._get(u"245", "a")

        if isinstance(val, (unicode, str)):
            return CLEAN_SPACES(val)

        if isinstance(val, list):
            # clean each title in place and return the list
            for i, el in enumerate(val):
                val[i] = CLEAN_SPACES(el)

        return val

    def year(self):
        """The year of the publication.

        @rtype: unicode or list
        @return: Empty string when not defined

        """
        val = self._get(u"260", "c")

        # several values: keep the first one in sorted order
        if isinstance(val, list):
            if val:
                val.sort()
                val = val[0]
            else:
                val = u""

        # several form are possible 2014, 2014-12 or 2014-12-31
        if val:
            match = REG_YEAR.search(val)
            if match:
                val = match.group(1)

        return val