""" invenio_tools.recordpubli

"""
import re

from .base import (ARXIV,
                   ARXIV_PDF,
                   REG_ARXIV_NUMBER,
                   REG_YEAR)
from .exception import RecordException
from filters import CLEAN_COLLABORATION
# numpy.NaN was removed in NumPy 2.0 while numpy.nan exists in all releases.
# The historical name is kept since the module uses it.
from numpy import nan as NaN
from pandas import concat, DataFrame, isnull
from plugin_dbui import as_list, CLEAN_SPACES
from .record import Record


AUTHOR_FORMATS = [
    "First, Last",
    "F. Last",
    "Last",
    "Last, First",
    "Last F."]

# decode publication reference:
#    Phys. Rev. Lett. 113, 032001 (2014)
#    Eur. Phys. J. C (2014) 74:2883
#
# NOTE(review): the group names were stripped from the copy of the file
# under review. They are restored as p (publisher), v (volume),
# c (pagination) and y (year) -- confirm against the code using DECODE_REF.
_ref1 = r"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
_ref2 = r"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
DECODE_REF = [re.compile(_ref1), re.compile(_ref2)]

MSG_INVALID_FMT = "Invalid format for author"

# the keys containing paper reference
PAPER_REFERENCE_KEYS = {"pagination", "title", "volume", "year"}

# extract initial of a first name
REG_INITIAL = initial = r"^(\w+)\.?(\-)* *(\w+)*\.?$"


def to_initial(x, y, z):
    """Help function to extract initial from a first name split in x, y and z:

        Albert (x="Albert", y="", z="")
        Antonio Augusto (x="Antonio", y="", z="Augusto")
        Jean-Pierre (x="Jean", y="-", z="Pierre")

    Args:
        x (str): first part
        y (str): separator
        z (str): second part

    Returns:
        str

    """
    if z == "":
        return "%s." % x[0:1]

    if y == "":
        return "%s. %s." % (x[0:1], z[0:1])

    return "%s.%s%s." % (x[0:1], y[0:1], z[0:1])


def to_str(x):
    """Join a list of strings with ``|``; any other value is returned as is.

    Args:
        x (list or object):

    Returns:
        str or object

    """
    return ("|".join(x) if isinstance(x, list) else x)


def _join_years(x):
    """Collapse a list of years into a single string.

    Duplicates are removed and the remaining values are sorted in order to
    get a reproducible result. Elements are converted to string since the
    store might deliver numbers.

    Args:
        x (list or object):

    Returns:
        str or object: the argument itself when it is not a list.

    """
    if not isinstance(x, list):
        return x

    return ", ".join(sorted({"%s" % el for el in x}))


class RecordPubli(Record):
    """The record describes an article, preprint, proceeding, report and talk.

    The main ``field`` and ``subfield`` are::

        +---------------------------------+----------------------------------+
        | field                           | subfield                         |
        +---------------------------------+----------------------------------+
        | FIXME_OAI (inspire)             | id                               |
        | abstract                        |                                  |
        | accelerator_experiment          |                                  |
        | agency_code (cds)               |                                  |
        | authors                         | INSPIRE_number, affiliation,     |
        |                                 | control_number, first_name,      |
        |                                 | full_name, last_name,            |
        |                                 | relator_name (phd director)      |
        | base (cds)                      |                                  |
        | collection                      |                                  |
        | comment                         |                                  |
        | copyright_status (cds)          |                                  |
        | corporate_name                  | collaboration                    |
        | creation_date                   |                                  |
        | doi                             |                                  |
        | email_message (cds)             |                                  |
        | filenames                       |                                  |
        | files                           | comment, description, eformat,   |
        |                                 | full_name, full_path, magic,     |
        |                                 | name, path, size, status,        |
        |                                 | subformat, superformat, type,    |
        |                                 | url, version                     |
        | filetypes                       |                                  |
        | imprint                         |                                  |
        | keywords                        |                                  |
        | language (cds)                  |                                  |
        | license                         |                                  |
        | number_of_authors               |                                  |
        | number_of_citations             |                                  |
        | number_of_comments              |                                  |
        | number_of_reviews               |                                  |
        | oai (cds)                       | value                            |
        | other_report_number (cds)       |                                  |
        | persistent_identifiers_keys     |                                  |
        | physical_description            |                                  |
        | prepublication                  | date, publisher_name, place      |
        | primary_report_number           |                                  |
        | publication_info                | pagination, title, volume, year  |
        | recid                           | none                             |
        | reference (inspire)             |                                  |
        | report_number (cds)             | internal, report_number          |
        | source_of_acquisition (inspire) |                                  |
        | status_week (cds)               |                                  |
        | subject                         |                                  |
        | system_control_number           | institute, value or canceled     |
        | thesaurus_terms                 |                                  |
        | title                           | title                            |
        | title_additional (inspire)      |                                  |
        | url (cds)                       | description, url                 |
        | version_id                      |                                  |
        +---------------------------------+----------------------------------+

    """
    def __init__(self, *args):

        # format currently applied to the column fmt_name
        self._last_fmt_author = "Last, First"

        Record.__init__(self, *args)
        self._process_authors()
        self._process_publication_info()

    def _process_authors(self):
        """Convert authors information into DataFrame:

        Authors and their affiliations are stored in DataFrame with the
        following structure:

            +---------------+--------------------------------+
            | column        |                                |
            +---------------+--------------------------------+
            | affiliation   | value separated by "|"         |
            | first_name    | first name                     |
            | fmt_name      | formatted name                 |
            | full_name     | Last, First                    |
            | last_name     | family name                    |
            | relator_name  | equal to dir. for phd director |
            +---------------+--------------------------------+

        Note:
            After running this method, the field ``authors`` is always
            defined. It contains one entry with empty strings when the field
            does not exist.

        """
        if u"authors" not in self:
            cols = ["affiliation",
                    "first_name",
                    "fmt_name",
                    "full_name",
                    "last_name"]
            self[u"authors"] = DataFrame([[""] * len(cols)], columns=cols)
            return

        data = self[u"authors"]
        data = (data if isinstance(data, list) else [data])
        df = DataFrame(data)

        # drop useless columns
        refcols = ["affiliation",
                   "first_name",
                   "full_name",
                   "last_name",
                   "relator_name"]
        columns = df.columns
        df = df.drop(columns.difference(refcols), axis="columns")

        # protection -- affiliation not defined
        if "affiliation" not in columns:
            df["affiliation"] = ""

        # convert list of affiliation to string separated by |
        df["affiliation"] = df.affiliation.fillna("").apply(to_str)

        # add the column fmt_name
        df["fmt_name"] = df.full_name

        # replace
        self[u"authors"] = df

    def _process_publication_info(self):
        """Convert publication_info into DataFrame:

        Note:
            * the field is a list when there are erratum
            * in some case the subfield year is a list (cds 1951625)

        publication information are stored in DataFrame with the
        following structure:

            +------------+--------------------------------+
            | column     |                                |
            +------------+--------------------------------+
            | title      | abbreviation of the publisher  |
            | volume     | volume                         |
            | year       | year of publication            |
            | pagination | page number or ranges          |
            +------------+--------------------------------+

        Note:
            * After running this method, the field ``publication_info``
              is always defined. It contains one entry with empty strings
              when the field does not exist.
            * In order to deal with erratum, entries are sorted by year
              and volume.
            * A missing subfield is replaced by an empty string.

        """
        cols = ["title", "volume", "year", "pagination"]

        if u"publication_info" not in self:
            self[u"publication_info"] = \
                DataFrame([[""] * len(cols)], columns=cols)
            return

        data = self[u"publication_info"]
        data = (data if isinstance(data, list) else [data])
        df = DataFrame(data)

        # protection -- subfield missing in all entries
        for col in cols:
            if col not in df.columns:
                df[col] = ""

        # protection -- list of year, e.g. [2014, 2014] (cds 1951625)
        df["year"] = df.year.apply(_join_years)

        # erratum -- sort by year and volume
        df = df.sort_values(["year", "volume"])

        # protection -- subfield missing in some entries
        # applied after the sort to keep incomplete entries at the end
        df[cols] = df[cols].fillna("")

        # replace
        self[u"publication_info"] = df

    def authors(self, sep=", ", sort=False):
        """The author(s) signing the publication.

        Args:
            sep (str):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            str:
                * Author names are separated by the ``sep`` argument.
                * The string is empty when there is no authors.

        """
        li = self.authors_as_list(sort=sort)
        return sep.join(li)

    def authors_as_list(self, sort=False):
        """The list of author(s) signing the publication.

        Args:
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            list: the list is empty when authors are not defined.

        """
        df = self[u"authors"]

        if sort:
            li = (df[["last_name", "fmt_name"]]
                  .sort_values(by="last_name")
                  .fmt_name
                  .tolist())

        else:
            li = (df.fmt_name
                  .sort_index()
                  .tolist())

        # the single empty entry is the marker for "no authors"
        if len(li) == 1 and li[0] == "":
            li = []

        return li

    def collaboration(self):
        """The collaboration(s) signing the publication.

        Returns:
            str:
                * names of collaboration are separated by a comma.
                * The filter CLEAN_COLLABORATION is applied.

        """
        li = self._get(u"corporate_name", u"collaboration", force_list=True)
        return CLEAN_COLLABORATION(", ".join(li))

    def find_affiliation(self, pattern):
        """Find affiliation matching the regular expression *pattern*.

        Args:
            pattern (str):
                regular expression defining the affiliation keys.
                It has to be built for an exact match namely containing
                start and end of string. This is required to separate
                `Ecole Polytechnique` from
                `Ecole Polytechnique, Lausanne`.

        Returns:
            str:
                - the affiliation or the first one when several are found.
                - empty string when nothing is found.

        """
        df = self[u"authors"]

        # modify the pattern to capture group
        pattern = "(%s)" % pattern

        # the first column is the outer group, i.e. the whole match, even
        # when the pattern of the caller contains its own groups
        data = (df.affiliation
                .str.extract(pattern, expand=True)
                .iloc[:, 0]
                .dropna())

        # positional access: after dropna the label 0 exists only when the
        # first author matches
        return (data.iloc[0] if len(data) > 0 else "")

    def find_authors(self, pattern, sep=", ", sort=False):
        """Find authors containing the regular expression *pattern*.
        The search is performed on the formatted name.

        Args:
            pattern (str):
                regular expression defining the author name(s).

            sep (unicode):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            str:
                * Author names are separated by ``sep`` argument.
                * The string is empty when nothing is found.

        """
        df = self[u"authors"]

        # na=False: a missing name never matches
        query = df.fmt_name.str.contains(pattern, na=False)

        if sort:
            data = (df.loc[query, ["last_name", "fmt_name"]]
                    .sort_values(by="last_name")
                    .fmt_name)

        else:
            data = (df.loc[query, ["fmt_name"]]
                    .sort_index()
                    .fmt_name)

        return ("" if len(data) == 0 else sep.join(data))

    def find_authors_by_affiliation(self, pattern, sep=", ", sort=False):
        """Find authors belonging to a given institute(s) defined by a regular
        expression.

        Args:
            pattern (str):
                regular expression defining the affiliation keys
                for the institute(s).

            sep (unicode):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            str:
                * Author names are separated by the ``sep`` argument.
                * Authors are sorted by family name when ``sort`` is true.
                * Empty string when authors are not found.

        """
        df = self[u"authors"]

        # na=False: a missing affiliation never matches
        query = df.affiliation.str.contains(pattern, na=False)

        if sort:
            data = (df.loc[query, ["last_name", "fmt_name"]]
                    .sort_values(by="last_name")
                    .fmt_name)

        else:
            data = (df.loc[query, ["fmt_name"]]
                    .sort_index()
                    .fmt_name)

        return (sep.join(data) if len(data) > 0 else "")

    def first_author(self):
        """The name of the first author.

        Returns:
            str:
                - Empty string when the first author is not defined.

        """
        return self[u"authors"].fmt_name.iloc[0]

    def first_author_institutes(self):
        """The institute(s) associated to the first author.

        Note:
            Search is performed via the affiliation defined by the "u" key
            of the author field.

        Returns:
            str:
                - names are separated by ``|``.
                - The string is empty when institutes are not defined.

        """
        val = self[u"authors"].affiliation.iloc[0]

        # NaN is never equal to itself, thus "val == NaN" cannot detect it
        return ("" if isnull(val) else val)

    def institutes(self):
        """The list of institute signing the publication.

        Note:
            Name of institute are given by the affiliation defined by
            the "affiliation" key of the author field.

        Returns:
            list: the list is sorted in alphabetic order.

        """
        df = self[u"authors"]

        # expand multi-affiliation (one per column)
        df = df.affiliation.str.split("|", expand=True)

        # merge all columns into a single one,
        # sort and remove duplicate entries
        li = [df[el].dropna() for el in df.columns]
        df = (concat(li, ignore_index=True)
              .sort_values()
              .unique())

        return df.tolist()

    def is_affiliations(self):
        """``True`` when affiliations are defined for authors.

        Note:
            This is a fast algorithm checking that the ``affiliation``
            field exists. To check that the affiliation is defined for all
            authors, uses the method :func:`is_affiliation_for_all`.

        Returns:
            bool:

        """
        df = self[u"authors"]

        if len(df) == 1 and df.affiliation.iloc[0] == "":
            return False

        return True

    def is_affiliation_for_all(self):
        """``True`` when affiliation are defined for all authors.

        Return:
            bool:

        """
        df = self[u"authors"]
        query = df.affiliation.isin(["", NaN])

        return df.affiliation[query].size == 0

    def is_authors(self):
        """``True`` when authors are defined.

        Returns:
            bool:

        """
        df = self[u"authors"]

        cols = {"first_name", "full_name", "last_name"}
        if len(df.columns.intersection(cols)) != 3:
            return False

        if len(df) == 1 and df.full_name.iloc[0] == "":
            return False

        return True

    def is_published(self):
        """``True`` if the record is published and contains a complete set
        of publication information (title, volume, year and pagination).

        Returns:
            bool:

        """
        df = self[u"publication_info"]

        query = \
            (df.title.str.len() > 0) \
            & (df.volume.str.len() > 0) \
            & (df.year.str.len() > 0) \
            & (df.pagination.str.len() > 0)

        return len(df[query]) > 0

    def is_with_erratum(self):
        """``True`` when the record contains erratum data.

        Returns:
            bool

        """
        df = self[u"publication_info"]
        return len(df) > 1

    def paper_editor(self):
        """The abbreviated version of the review, *e.g* Phys Lett B.

        Returns:
            unicode:
                * Empty string when not defined.

        """
        df = self[u"publication_info"]
        return df.title.iloc[0]

    def paper_pages(self):
        """The page number / range when the record is published in a review.

        Returns:
            unicode:
                * The format is "45-67" or "234".
                * Empty string when not defined.

        """
        df = self[u"publication_info"]
        return df.pagination.iloc[0]

    def paper_reference(self):
        """The full reference for a publication published in a review.

        Returns:
            unicode:
                * The format is "Phys Lett B 456 2010 5-6".
                * The string is empty when the publication is not
                  published in a review.

        """
        paper = self[u"publication_info"].iloc[0]
        li = [paper.title, paper.volume, paper.year, paper.pagination]
        return u" ".join(li).strip()

    def paper_url(self):
        """The URL of the preprint.

        Note:
            * Many others URL exists mainly those related to open access.
            * The list of files is scanned only when the record has a
              preprint number.

        Returns:
            unicode: the string is empty when no URLs are found.

        """
        # depends on the store
        # start with CDS looking for the field `url`
        if u"url" in self:
            data = self[u"url"]
            li = (data if isinstance(data, list) else [data])
            li = [di[u"url"] for di in li
                  if di.get(u"description") == u"Preprint"]

            if len(li) == 1:
                return li[0]

        # scan the list of files
        # work for both stores.
        number = self.preprint_number()

        # without a number the suffix would be ".pdf" which selects any file
        if not number:
            return u""

        pdf = "%s.pdf" % number

        li = self._get(u"files", u"url", force_list=True)
        li = [el for el in li if el.endswith(pdf)]

        if len(li) == 1:
            return li[0]

        return u""

    def paper_volume(self):
        """The volume number when the record is published in a review.

        Returns:
            unicode:
                - Empty string when nothing is found.

        """
        df = self[u"publication_info"]
        return df.volume.iloc[0]

    def paper_year(self):
        """The year of the publication.

        Returns:
            unicode:
                - Empty string if the year is not defined.

        """
        df = self[u"publication_info"]
        return df.year.iloc[0]

    def preprint_number(self):
        """The ArXiv preprint number.

        Returns:
            str: empty string when it is not defined.

        """
        if u"primary_report_number" not in self:
            return u""

        data = self[u"primary_report_number"]
        data = (data if isinstance(data, list) else [data])

        li = [el for el in data if el.startswith(ARXIV)]

        if len(li) == 1:
            return li[0]

        return u""

    def reformat_authors(self, fmt="Last, First"):
        """Reformat names of authors.

        The default formatting for cds/invenio record is ``Last, First``.
        Nothing is done when the record has no authors.

        Args:
            fmt (str):
                define the new format for author names.
                Possible values are "First, Last", "F. Last", "Last",
                "Last, First" and "Last F."

        Raises:
            RecordException:
                * the argument ``fmt`` is not valid.

        """
        if fmt not in AUTHOR_FORMATS:
            raise RecordException(MSG_INVALID_FMT)

        if fmt == self._last_fmt_author:
            return

        self._last_fmt_author = fmt

        # formatting the empty entry standing for "no authors" would
        # produce names like ", " or ". "
        if not self.is_authors():
            return

        df = self[u"authors"]

        # ....................................................................
        #
        # Compute initial for the first name
        # They are kept in a local series: the DataFrame stored in the record
        # is modified in place and must not keep a scratch column.
        #
        initials = None
        if fmt in ("F. Last", "Last F."):
            dfm = (df.first_name.str.extract(REG_INITIAL, expand=True)
                   .fillna(""))

            initials = dfm.apply(
                lambda x: to_initial(x[0], x[1], x[2]), axis="columns")

        # ....................................................................
        #
        # Format
        #
        if fmt == "Last, First":
            df["fmt_name"] = df.last_name + ", " + df.first_name

        elif fmt == "First, Last":
            df["fmt_name"] = df.first_name + ", " + df.last_name

        elif fmt == "F. Last":
            df["fmt_name"] = initials + " " + df.last_name

        elif fmt == "Last":
            df["fmt_name"] = df.last_name

        elif fmt == "Last F.":
            df["fmt_name"] = df.last_name + " " + initials

    def report_number(self):
        """The report number(s) associated to the publication.

        Returns:
            str:
                - Numbers are separated by a comma
                - Number are sorted in alphabetic order.
                - Empty string when not defined.

        """
        # CDS
        if u"report_number" in self:

            data = self[u"report_number"]
            data = (data if isinstance(data, list) else [data])

            # values() is available with python 2 and 3
            li = [val for di in data for val in di.values()]
            return ", ".join(sorted(li))

        # INSPIRE
        if u"primary_report_number" in self:

            data = self[u"primary_report_number"]
            data = (data if isinstance(data, list) else [data])

            li = [el for el in data if not el.startswith(ARXIV)]
            return ", ".join(sorted(li))

        return u""

    def submitted(self):
        """The date of submission.

        Returns:
            unicode:
                * format are "YYYY-MM", "YYYY-MM-DD", "DD MMM YYYY", *etc.*
                * Empty string when not defined.

        """
        return self._get(u"prepublication", u"date")

    def title(self):
        """The title of the publication.

        Returns:
            unicode:
                * Empty string when not defined.
                * The filter CLEAN_SPACES is applied.

        """
        return CLEAN_SPACES(self._get(u"title", u"title"))