""" invenio_tools.recordpubli

Tools to decode the MARC record describing a publication.

"""
import re

from .base import (ARXIV,
                   ARXIV_PDF,
                   REG_ARXIV_NUMBER,
                   REG_YEAR)
from .exception import RecordException
from filters import CLEAN_COLLABORATION
# "NaN" is only an alias of "nan" and was removed in NumPy 2.0;
# importing "nan" under the historical name works with every release.
from numpy import nan as NaN
from pandas import concat, DataFrame
from plugin_dbui import as_list, CLEAN_SPACES
from .record import Record


AUTHOR_FORMATS = [
    "First, Last",
    "F. Last",
    "Last",
    "Last, First",
    "Last F."]

# Decode publication reference:
#     Phys. Rev. Lett. 113, 032001 (2014)
#     Eur. Phys. J. C (2014) 74:2883
#
# The named groups follow the MARC 773 sub-fields:
#     p: review, v: volume, c: pages, y: year
_ref1 = r"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
_ref2 = r"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
DECODE_REF = [re.compile(_ref1), re.compile(_ref2)]

MSG_INVALID_FMT = "Invalid format for author"

# The MARC12 keys containing paper reference
PAPER_REFERENCE_KEYS = set(["c", "p", "v", "y"])

# Limit the number of first name to two (others will be ignored)
REG_INITIAL = initial = r"^(\w+)\.?(\-)* *(\w+)*\.?"


def to_initial(x, y, z):
    """Help function to extract initial from a first name split in x, y and z:

        Albert (x="Albert", y="", z="")
        Antonio Augusto (x="Antonio", y="", z="Augusto")
        Jean-Pierre (x="Jean", y="-", z="Pierre")

    Args:
        x (str): first part
        y (str): separator
        z (str): second part

    Returns:
        str: the initial(s), *e.g.* "A.", "A. A." or "J.-P."

    """
    # single first name
    if z == "":
        return "%s." % x[0:1]

    # two first names separated by spaces
    if y == "":
        return "%s. %s." % (x[0:1], z[0:1])

    # two first names joined by a separator (hyphen)
    return "%s.%s%s." % (x[0:1], y[0:1], z[0:1])


def to_str(x):
    """Convert a list of strings into a single string separated by ``|``.

    Args:
        x (list or any): the value to be converted.

    Returns:
        the joined string when ``x`` is a list, otherwise ``x`` unchanged.

    """
    return ("|".join(x) if isinstance(x, list) else x)


class RecordPubli(Record):
    """The MARC record describing a publication.
    Usual publications are article, preprint, proceeding, report and talk.
    The relation between methods and MARC fields are the following::

        +-----------------------+---------+----------+
        |                       | CDS     | INSPIREP |
        +-----------------------+---------+----------+
        | authors               | 700 a   |          |
        | collaboration         | 710 g   |          |
        | first author          | 100 a   |          |
        | institutes            | 700 u   |          |
        | paper editor          | 773 p   |          |
        | paper pages           | 773 c   |          |
        | paper reference       | 773 o   |          |
        | paper URL             | 8564 u  |          |
        | paper volume          | 773 v   |          |
        | paper year            | 773 y   |          |
        | preprint number       | 037 a   |          |
        | report number         | 088 a   | 037a     |
        | submitted             | 269 c   |          |
        | title                 | 245 a   |          |
        | year                  | 260 c   |          |
        +-----------------------+---------+----------+

    """
    def __init__(self, *args):
        """Build the record and convert the author fields into DataFrames.

        Args:
            *args: forwarded unchanged to ``Record.__init__``.

        """
        # the format currently applied to the column fmt_name
        self._last_fmt_author = "Last, First"

        Record.__init__(self, *args)
        self._process_authors()

    def _process_authors(self):
        """Convert authors information into DataFrame:

            * Keep the subfield "a", "u" and "e" (phd thesis)
            * Convert list of affiliation in string separated by "|"

        Authors and their affiliations are defined in the fields 100 and 700.
        The method deals with cases where:

            * the first author is defined in 100 but it is not in 700
            * first author is not defined in 100 but in 700
            * thesis in which 700 contains names of director

        Authors and their affiliations are stored in DataFrame with the
        following structure:

            +------------+---------------------------+
            | column     |                           |
            +------------+---------------------------+
            | a          | author name (Last, First) |
            | u          | affiliation(s)            |
            | first_name | first name                |
            | last_name  | family name               |
            | fmt_name   | formatted name            |
            +------------+---------------------------+

        """
        columns4names = ["last_name", "first_name"]

        # ....................................................................
        #
        # Instantiate DataFrame for field 100 and 700
        #
        di = {"100": None, "700": None}

        # values of existing keys are replaced while iterating,
        # the size of the dictionary never changes
        for key in di:

            if key not in self:
                continue

            data = self[key]
            data = (data if isinstance(data, list) else [data])

            df = DataFrame(data)
            columns = df.columns

            # keep columns:
            #    - "a": author name
            #    - "e": phd director (equal to "dir.")
            #    - "u": affiliation(s)
            df = df.drop(columns.difference(["a", "e", "u"]), axis="columns")

            # add columns first_name, last_name and fmt_name
            # protection -- split create 1, 2 and more than 2 columns
            # former happens when the author name is 'ATLAS collaboration'
            df1 = df.a.str.split(",", expand=True)
            if df1.shape[1] < 2:
                continue

            df[columns4names] = df1[[0, 1]]
            df["fmt_name"] = df.a

            df.first_name = df.first_name.str.strip()
            df.last_name = df.last_name.str.strip()

            # protection -- affiliation not defined
            if "a" in columns and "u" not in columns:
                dfu = DataFrame([""]*len(df), columns=["u"])
                df = concat([df, dfu], axis="columns")

            # protection -- missing affiliation
            df.u = df.u.fillna("")

            # convert list of affiliation to string separated by |
            df.u = df.u.apply(to_str)

            di[key] = df

        # alias
        d100, d700 = di["100"], di["700"]

        # ....................................................................
        #
        # Protection -- more than one first author
        #
        # treat the case with duplicate author name
        # by building the affiliation string
        #
        if d100 is not None and len(d100) > 1:
            grouped = d100.groupby(["a"], sort=False)

            if len(grouped) == 1:
                for name, group in grouped:
                    last_name, first_name = name.split(",")

                    # "el == el" is False only for NaN; a membership test
                    # against NaN relies on object identity and is fragile
                    affiliations = [el for el in group.u
                                    if el not in ("", None) and el == el]

                    di = {"a": [name],
                          "first_name": [first_name.strip()],
                          "fmt_name": [name],
                          "last_name": [last_name.strip()],
                          "u": ["|".join(affiliations)]}

                    d100 = DataFrame(di)

        # NOTE
        # The case with more than one first author is rare
        # It will be detected by the CheckAndFix procedure when it is
        # not fixed by the above protection

        # ....................................................................
        #
        # the authors are spread over the 100 and 700 field.
        # deal with cases where the first author is defined in 100
        # but not in 700, first author is defined in 100 and in 700
        # or no author in 100
        #
        if d100 is not None and d700 is not None:
            if d100.a.iloc[0] != d700.a.iloc[0]:
                if len(d100) == 1:
                    d700 = concat([d100, d700], ignore_index=True)

        elif d100 is None and d700 is not None:
            d100 = DataFrame(d700.iloc[0]).transpose()

        elif d700 is None and d100 is not None:
            d700 = d100

        else:
            # no author at all: a single empty row keeps the schema
            d100 = d700 = DataFrame({
                "a": [""],
                "first_name": [""],
                "fmt_name": [""],
                "last_name": [""],
                "u": [""]})

        # ....................................................................
        #
        # Update
        #
        self["100"] = d100
        self["700"] = d700

    def authors(self, sep=", ", sort=False):
        """The author(s) signing the publication.

        Args:
            sep (unicode):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            unicode:
                * Author names are separated by the ``sep`` argument.
                * The string is empty when there is no authors.

        """
        li = self.authors_as_list(sort=sort)
        return sep.join(li)

    def authors_as_list(self, sort=False):
        """The list of author(s) signing the publication.

        Args:
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            list: the list is empty when authors are not defined.

        """
        if sort:
            li = (self["700"][["last_name", "fmt_name"]]
                  .sort_values(by="last_name")
                  .fmt_name
                  .tolist())

        else:
            li = (self["700"].fmt_name
                  .sort_index()
                  .tolist())

        # a single empty name is the marker for "no author"
        if len(li) == 1 and li[0] == "":
            li = []

        return li

    def collaboration(self):
        """The collaboration(s) signing the publication.

        Returns:
            unicode:
                * names of collaboration are separated by a comma.
                * The filter CLEAN_COLLABORATION is applied.

        """
        li = self._get("710", "g", force_list=True)
        return CLEAN_COLLABORATION(", ".join(li))

    def find_affiliation(self, pattern):
        """Find affiliation matching the regular expression *pattern*.

        Args:
            pattern (unicode):
                regular expression defining the affiliation keys.
                It has to be built for an exact match namely containing
                start and end of string. This is required to separate
                `Ecole Polytechnique` from `Ecole Polytechnique, Lausanne`.

        Returns:
            unicode:
                - the affiliation or the first one when several are found.
                - empty string when nothing is found.

        """
        df = self["700"]

        query = df.u.str.contains(pattern)
        data = (df[query].u.unique())

        return (data[0] if len(data) > 0 else "")

    def find_authors(self, pattern, sep=", ", sort=False):
        """Find authors containing the regular expression *pattern*.
        The search is performed on the formatted name.

        Args:
            pattern (unicode):
                regular expression defining the author name(s).

            sep (unicode):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            unicode:
                * Author names are separated by ``sep`` argument.
                * The string is empty when nothing is found.

        """
        df = self["700"]

        query = df.fmt_name.str.contains(pattern)

        if sort:
            data = (df.loc[query, ["last_name", "fmt_name"]]
                    .sort_values(by="last_name")
                    .fmt_name)

        else:
            data = (df.loc[query, ["fmt_name"]]
                    .sort_index()
                    .fmt_name)

        return ("" if len(data) == 0 else sep.join(data))

    def find_authors_by_affiliation(self, pattern, sep=", ", sort=False):
        """Find authors belonging to a given institute(s) defined by a regular
        expression.

        Args:
            pattern (unicode):
                regular expression defining the affiliation keys
                for the institute(s).

            sep (unicode):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            unicode:
                * Author names are separated by the ``sep`` argument.
                * Authors are sorted by family name when ``sort`` is true.
                * Empty string when authors are not found.

        """
        df = self["700"]

        query = df.u.str.contains(pattern)

        if sort:
            data = (df.loc[query, ["last_name", "fmt_name"]]
                    .sort_values(by="last_name")
                    .fmt_name)

        else:
            data = (df.loc[query, ["fmt_name"]]
                    .sort_index()
                    .fmt_name)

        return (sep.join(data) if len(data) > 0 else "")

    def first_author(self):
        """The name of the first author.

        Returns:
            unicode:
                - Empty string when the first author is not defined.

        """
        return self["700"].fmt_name.iloc[0]

    def first_author_institutes(self):
        """The institute(s) associated to the first author.

        Note:
            Search is performed via the affiliation defined by the "u" key
            of the author field.

        Returns:
            unicode:
                - names are separated by ``|``.
                - The string is empty when institutes are not defined.

        """
        val = self["700"].u.iloc[0]

        # NaN is the only value different from itself;
        # a test like "val == NaN" is always False
        return ("" if val is None or val != val else val)

    def institutes(self):
        """The list of institute signing the publication.

        Note:
            Name of institute are given by the affiliation defined by
            the "u" key of the author field.

        Returns:
            list: the list is sorted in alphabetic order.

        """
        # expand multi-affiliation (one per column)
        df = self["700"].u.str.split("|", expand=True)

        # merge all columns into a single one,
        # sort and remove duplicate entries
        li = [df[el].dropna() for el in df.columns]
        df = (concat(li, ignore_index=True)
              .sort_values()
              .unique())

        return df.tolist()

    def is_affiliations(self):
        """``True`` when affiliations are defined for authors.

        Note:
            This is a fast algorithm checking that the ``u`` field exists.
            To check that the affiliation is defined for all authors,
            uses the method :func:`is_affiliation_for_all`.

        Returns:
            bool:

        """
        df = self["700"]

        if "u" not in df.columns:
            return False

        # a single row with an empty affiliation means "not defined"
        if len(df) == 1 and df.u.iloc[0] == "":
            return False

        return True

    def is_affiliation_for_all(self):
        """``True`` when affiliation are defined for all authors.

        Return:
            bool:

        """
        df = self["700"]
        query = df.u.isin(["", NaN])

        return df.u[query].size == 0

    def is_authors(self):
        """``True`` when authors are defined.

        Returns:
            bool:

        """
        df = self["700"]

        if "a" not in df.columns:
            return False

        # a single row with an empty name means "no author"
        if len(df) == 1 and df.a.iloc[0] == "":
            return False

        return True

    def is_published(self):
        """``True`` if the record is published.

        Returns:
            bool:

        """
        if "773" not in self:
            return False

        # record can contains erratum
        for di in as_list(self["773"]):

            # the reference field is complete and contains, at least,
            # the keys "p", "v", "y" and "c"
            if PAPER_REFERENCE_KEYS.issubset(set(di.keys())):
                return True

            # paper reference may be incomplete or even wrong
            # the recovery procedure will use the 773o
            # check that 773o contains the paper reference:
            #    Eur. Phys. J. C (2014) 74:2883
            #    Phys. Rev. Lett. 113, 032001 (2014)
            if "o" in di:
                value = di["o"]
                for reg in DECODE_REF:
                    if reg.match(value):
                        return True

        return False

    def is_with_erratum(self):
        """``True`` when the record contains erratum data.

        Returns:
            bool

        """
        # record with erratum contains a list of editor
        return "773" in self and isinstance(self["773"], list)

    def paper_editor(self):
        """The abbreviated version of the review, *e.g* Phys Lett B.

        Returns:
            unicode or list:
                * A list when there are erratum.
                * Empty string when not defined.

        """
        return self._get("773", "p")

    def paper_pages(self):
        """The page number / range when the record is published in a review.

        Returns:
            unicode or list:
                * The format is "45-67" or "234".
                * A list when there are erratum.
                * Empty string when not defined.

        """
        return self._get("773", "c")

    def paper_reference(self):
        """The full reference for a publication published in a review.

        Returns:
            unicode or list:
                * The format is "Phys Lett B 456 2010 5-6".
                * The string is empty when the publication is not
                  published in a review.

        """
        if "773" not in self:
            return ""

        li = []
        for k in ("p", "v", "y", "c"):
            if k in self["773"]:
                li.append(self["773"][k])

        return " ".join(li)

    def paper_url(self):
        """The URL of the preprint.

        Note:
            Many others URL exists mainly those related to open access.

        Returns:
            unicode: the string is empty when no URLs are found.

        """
        pdf = "%s.pdf" % self.preprint_number()

        if "8564" not in self or not isinstance(self["8564"], list):
            return ""

        for el in self["8564"]:

            # protection -- entry without URL
            if "u" not in el:
                continue

            url = el["u"]

            # protection see http://cds.cern.ch/record/2014733
            # several URLs for a single entry: rebuild the arXiv one
            if isinstance(url, list):
                m = REG_ARXIV_NUMBER.search(pdf)
                if m:
                    return "%s%s" % (ARXIV_PDF, m.group())

            # cds.cern.ch
            if "y" in el:
                if el["y"] == "Preprint":
                    return url

            # inspirehep.net
            elif not isinstance(url, list) and url.endswith(pdf):
                return url

        # nothing found after scanning all the entries
        return ""

    def paper_volume(self):
        """The volume number when the record is published in a review.

        Returns:
            unicode or list:
                - A list when there are erratum.
                - Empty string when nothing is found.

        """
        return self._get("773", "v")

    def paper_year(self):
        """The year of the publication.

        Returns:
            unicode or list:
                - A list when there are erratum.
                - Empty string if the year is not defined.

        """
        rep = self._get("773", "y")

        # protection
        # in record http://cds.cern.ch:record/1951625 the entry 773y
        # is duplicate but there is no erratum
        if isinstance(rep, list) and not isinstance(self["773"], list):
            rep = list(set(rep))
            if len(rep) == 1:
                rep = rep[0]

        return rep

    def preprint_number(self):
        """The ArXiv preprint number.

        Returns:
            unicode: empty string when it is not defined.

        """
        # for both CDS and INSPIREHEP preprint data in 37 a
        # for CDS preprint information are also store in 88 a
        for k in ("037", "088"):
            for val in self._get(k, "a", force_list=True):
                if ARXIV in val:
                    return val

        return ""

    def reformat_authors(self, fmt="Last, First"):
        """Reformat names of authors. The default formatting for
        cds/invenio record is ``Last, First``.

        The column ``fmt_name`` of the DataFrames stored under the
        keys 100 and 700 is updated in place.

        Args:
            fmt (str):
                define the new format for author names.
                Possible values are "First, Last", "F. Last", "Last",
                "Last, First" and "Last F."

        Raises:
            RecordException: if fmt is not valid.

        """
        if fmt not in AUTHOR_FORMATS:
            raise RecordException(MSG_INVALID_FMT)

        # nothing to do when the requested format is already applied
        if fmt == self._last_fmt_author:
            return

        self._last_fmt_author = fmt

        # alias
        d100, d700 = self["100"], self["700"]

        with_initial = fmt in ("F. Last", "Last F.")

        # ....................................................................
        #
        # Compute initial for the first name
        #
        if with_initial:
            for df in (d100, d700):
                dfm = (df.first_name.str.extract(REG_INITIAL, expand=True)
                       .fillna(""))

                df["initial"] = dfm.apply(
                    lambda x: to_initial(x[0], x[1], x[2]), axis="columns")

        # ....................................................................
        #
        # Format
        #
        if fmt == "Last, First":
            d100["fmt_name"] = d100.a
            d700["fmt_name"] = d700.a

        elif fmt == "First, Last":
            # NOTE(review): the separator differs between the field 100
            # (", ") and the field 700 (" "). Historical behaviour is
            # preserved -- confirm which one is intended.
            d100["fmt_name"] = d100.first_name + ", " + d100.last_name
            d700["fmt_name"] = d700.first_name + " " + d700.last_name

        elif fmt == "F. Last":
            d100["fmt_name"] = d100.initial + " " + d100.last_name
            d700["fmt_name"] = d700.initial + " " + d700.last_name

        elif fmt == "Last":
            d100["fmt_name"] = d100.last_name
            d700["fmt_name"] = d700.last_name

        elif fmt == "Last F.":
            d100["fmt_name"] = d100.last_name + " " + d100.initial
            d700["fmt_name"] = d700.last_name + " " + d700.initial

        # ....................................................................
        #
        # Clean initial column
        #
        # drop in place: rebinding the local aliases would leave the
        # temporary column in the DataFrames stored in the record.
        # The test on the column protects the case where the fields
        # 100 and 700 share the same DataFrame.
        #
        if with_initial:
            for df in (d100, d700):
                if "initial" in df.columns:
                    df.drop("initial", axis="columns", inplace=True)

    def report_number(self):
        """The report number(s) associated to the publication.

        Returns:
            unicode:
                - Numbers are separated by a comma
                - Number are sorted in alphabetic order.
                - Empty string when not defined.

        """
        li = []

        # cds.cern.ch
        # report number can be in 37a, 88a and 88 9
        # entry can be the preprint number arXiv:xxx
        if self.host().startswith("cds"):

            for elt in self._get("088", "a", force_list=True):
                if not elt.startswith(ARXIV):
                    li.append(elt)

            # if empty have a look to "088" "9"
            # logic to avoid version number in 88/9
            # 88/a = LHCB-PAPER-2015-016 while 88/9 = LHCB-PAPER-2015-016-003
            if not li:
                for elt in self._get("088", "9", force_list=True):
                    if not elt.startswith(ARXIV):
                        li.append(elt)

        # inspirehep.net / cds.cern.ch -- example of MARC structure:
        #
        #    037__ $$aLHCB-PAPER-2014-047
        #    037__ $$aCERN-PH-EP-2014-221
        #    037__ $$9arXiv$$aarXiv:1410.0149$$chep-ex
        #
        if "037" in self:

            if isinstance(self["037"], dict):
                if "9" in self["037"] and self["037"]["9"] == ARXIV:
                    pass

                elif "a" in self["037"]:
                    if not self["037"]["a"].startswith(ARXIV):
                        li.append(self["037"]["a"])

            elif isinstance(self["037"], list):
                for di in self["037"]:
                    if "9" in di and di["9"] == ARXIV:
                        continue

                    if "a" in di:
                        if not di["a"].startswith(ARXIV):
                            li.append(di["a"])

        li.sort()
        return ", ".join(li)

    def submitted(self):
        """The date of submission.

        Returns:
            list:
                * The format of a date is "YYYY-MM" or "YYYY-MM-DD"
                * Several dates when there are erratum.
                * Empty list when not defined.

        """
        return self._get("269", "c", force_list=True)

    def title(self):
        """The title of the publication.

        Returns:
            unicode or list:
                * A list when there are erratum.
                * Empty string when not defined.
                * The filter CLEAN_SPACES is applied.

        """
        val = self._get("245", "a")

        if isinstance(val, (unicode, str)):
            return CLEAN_SPACES(val)

        # one title per erratum: clean each of them and return the list
        if isinstance(val, list):
            return [CLEAN_SPACES(el) for el in val]

        return val

    def year(self):
        """The year of the publication.

        Returns:
            unicode or list:
                * The earliest entry is used when several are found.
                * Empty string when it is not defined.

        """
        val = self._get("260", "c")

        # keep the earliest entry without reordering the list
        # stored in the record
        if isinstance(val, list):
            val = (min(val) if val else "")

        # several form are possible 2014, 2014-12 or 2014-12-31
        if val:
            match = REG_YEAR.search(val)
            if match:
                val = match.group(1)

        return val