""" invenio_tools.recordpubli
"""
import re
from .base import (ARXIV,
ARXIV_PDF,
REG_ARXIV_NUMBER,
REG_YEAR)
from .exception import RecordException
from filters import CLEAN_COLLABORATION
from numpy import NaN
from pandas import concat, DataFrame
from plugin_dbui import as_list, CLEAN_SPACES
from .record import Record
AUTHOR_FORMATS = [
"First, Last",
"F. Last",
"Last",
"Last, First",
"Last F."]
# decode publication reference:
# Phys. Rev. Lett. 113, 032001 (2014)
# Eur. Phys. J. C (2014) 74:2883
_ref1 = r"(?P
[A-Za-z\. ]+) +(?P\d+),? +(?P[\d-]+) +\((?P[\d]+)\)"
_ref2 = r"(?P[A-Za-z\. ]+) +\((?P\d+)\) +(?P[\d]+):(?P[\d-]+)"
DECODE_REF = [re.compile(_ref1), re.compile(_ref2)]
MSG_INVALID_FMT = "Invalid format for author"
# the keys containing paper reference
PAPER_REFERENCE_KEYS = {"pagination", "title", "volume", "year"}
# extract initial of a first name
REG_INITIAL = initial = r"^(\w+)\.?(\-)* *(\w+)*\.?$"
def to_initial(x, y, z):
"""Help function to extract initial from a first name split in x, y and z:
Albert (x="Albert", y="", z="")
Antonio Augusto (x="Antonio", y="", z="Augusto")
Jean-Pierre (x="Jean", y="-", z="Pierre")
Args:
x (str): first part
y (str): separator
z (str): second part
Returns:
str
"""
if z == "":
return "%s." % x[0:1]
if y == "":
return "%s. %s." % (x[0:1], z[0:1])
else:
return "%s.%s%s." % (x[0:1], y[0:1], z[0:1])
def to_str(x):
return ("|".join(x) if isinstance(x, list) else x)
class RecordPubli(Record):
"""The record describes an article, preprint, proceeding, report and talk.
The main ``field`` and ``subfield`` are::
+---------------------------------+----------------------------------+
| field | subfield |
+---------------------------------+----------------------------------+
| FIXME_OAI (inspire) | id |
| abstract | |
| accelerator_experiment | |
| agency_code (cds) | |
| authors | INSPIRE_number, affiliation, |
| | control_number, first_name, |
| | full_name, last_name, |
| | relator_name (phd director) |
| base (cds) | |
| collection | |
| comment | |
| copyright_status (cds) | |
| corporate_name | collaboration |
| creation_date | |
| doi | |
| email_message (cds) | |
| filenames | |
| files | comment, description, eformat, |
| | full_name, full_path, magic, |
| | name, path, size, status, |
| | subformat, superformat, type, |
| | url, version |
| filetypes | |
| imprint | |
| keywords | |
| language (cds) | |
| license | |
| number_of_authors | |
| number_of_citations | |
| number_of_comments | |
| number_of_reviews | |
| oai (cds) | value |
| other_report_number (cds) | |
| persistent_identifiers_keys | |
| physical_description | |
| prepublication | date, publisher_name, place |
| primary_report_number | |
| publication_info | pagination, title, volume, year |
| recid | none |
| reference (inspire) | |
| report_number (cds) | internal, report_number |
| source_of_acquisition (inspire) | |
| status_week (cds) | |
| subject | |
| system_control_number | institute, value or canceled |
| thesaurus_terms | |
| title | title |
| title_additional (inspire) | |
| url (cds) | description, url |
| version_id | |
+---------------------------------+----------------------------------+
"""
def __init__(self, *args):
self._last_fmt_author = "Last, First"
Record.__init__(self, *args)
self._process_authors()
self._process_publication_info()
def _process_authors(self):
"""Convert authors information into DataFrame:
Authors and their affiliations are stored in DataFrame with the
following structure:
+---------------+--------------------------------+
| column | |
+---------------+--------------------------------+
| affiliation | value separated by "|" |
| first_name | first name |
| fmt_name | formated name |
| full_name | Last, First |
| last_name | family name |
| relator_name | equal to dir. for phd director |
+---------------+--------------------------------+
Note:
After running this method, the field ``authors`` is always defined.
It contains one entry with empty strings when the field does not
exist.
"""
if u"authors" not in self:
cols = ["affiliation",
"first_name",
"fmt_name",
"full_name",
"last_name"]
self[u"authors"] = DataFrame([[""]*len(cols)], columns=cols)
return
data = self[u"authors"]
data = (data if isinstance(data, list) else [data])
df = DataFrame(data)
# drop useless columns
refcols = ["affiliation",
"first_name",
"full_name",
"last_name",
"relator_name"]
columns = df.columns
df = df.drop(columns.difference(refcols), axis="columns")
# protection -- affiliation not defined
if "affiliation" not in columns:
dfa = DataFrame([""]*len(df), columns=["affiliation"])
df = concat([df, dfa], axis="columns")
# convert list of affiliation to string separated by |
df.affiliation = (df.affiliation
.fillna("")
.apply(lambda x: to_str(x)))
# add the column fmt_name
df["fmt_name"] = df.full_name
# replace
self[u"authors"] = df
def _process_publication_info(self):
"""Convert publication_info into DataFrame:
Note:
* the field is a list when there are erratum
* in some case the subfield year is a list (cds 1951625)
publication information are stored in DataFrame with the
following structure:
+------------+--------------------------------+
| column | |
+------------+--------------------------------+
| title | abbreviation of the publisher |
| volume | volume |
| year | year of publication |
| pagination | page number or ranges |
+------------+--------------------------------+
Note:
* After running this method, the field ``publication_info``
is always defined. It contains one entry with empty strings
when the field does not exist.
* In order to deal with erratum entry are sorter by year
and volume.
"""
if u"publication_info" not in self:
cols = ["title",
"volume",
"year",
"pagination"]
self[u"publication_info"] = \
DataFrame([[""]*len(cols)], columns=cols)
return
data = self[u"publication_info"]
data = (data if isinstance(data, list) else [data])
df = DataFrame(data)
# protection -- list of year, e.g. [2014, 2014] (cds 1951625)
df["year"] = \
df.year.apply(
lambda x: (", ".join(set(x)) if isinstance(x, list) else x))
# erratum -- sort by year and volume
columns = df.columns
if set(["year", "volume"]).issubset(columns):
df = df.sort_values(["year", "volume"])
elif "year" in columns:
df = df.sort_values("year")
# replace
self[u"publication_info"] = df
def authors(self, sep=", ", sort=False):
"""The author(s) signing the publication.
Args:
sep (str):
string separating author names. The default is the comma.
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
str:
* Author names are separated by the ``sep`` argument.
* The string is empty when there is no authors.
"""
li = self.authors_as_list(sort=sort)
return sep.join(li)
def authors_as_list(self, sort=False):
"""The list of author(s) signing the publication.
Args:
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
list:
the list is empty when authors are not defined.
"""
df = self[u"authors"]
if sort:
li = (df[["last_name", "fmt_name"]]
.sort_values(by="last_name")
.fmt_name
.tolist())
else:
li = (df.fmt_name
.sort_index()
.tolist())
if len(li) == 1 and li[0] == "":
li = []
return li
def collaboration(self):
"""The collaboration(s) signing the publication.
Returns:
str:
* names of collaboration are separated by a comma.
* The filter CLEAN_COLLABORATION is applied.
"""
li = self._get(u"corporate_name", u"collaboration", force_list=True)
return CLEAN_COLLABORATION(", ".join(li))
def find_affiliation(self, pattern):
"""Find affiliation matching the regular expression *pattern*.
Args:
pattern (str):
regular expression defining the affiliation keys.
It has to be build for an exact match namely containing
start and end of string. This is reuqired to separate
`Ecole Plolytechnique` from `Ecole Polytechnique, Lausanne`.
Returns:
str:
- the affiliation or the first one when several are found.
- empty string when nothing is found.
"""
df = self[u"authors"]
# modify the pattern to capture group
pattern = "(%s)" % pattern
data = (df.affiliation.str.extract(pattern, expand=False)
.dropna())
return (data[0] if len(data) > 0 else "")
def find_authors(self, pattern, sep=", ", sort=False):
"""Find authors containing the regular expression *pattern*.
The search is performed on the formatted name.
Args:
pattern (str):
regular expression defining the author name(s).
sep (unicode):
string separating author names. The default is the comma.
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
str:
* Author names are separated by ``sep`` argument.
* The string is empty when nothing is found.
"""
df = self[u"authors"]
query = df.fmt_name.str.contains(pattern)
if sort:
data = (df.loc[query, ["last_name", "fmt_name"]]
.sort_values(by="last_name")
.fmt_name)
else:
data = (df.loc[query, ["fmt_name"]]
.sort_index()
.fmt_name)
return ("" if len(data) == 0 else sep.join(data))
def find_authors_by_affiliation(self, pattern, sep=", ", sort=False):
"""Find authors belonging to a given institute(s) defined by a regular
expression.
Args:
pattern (str):
regular expression defining the affiliation keys
for the institute(s).
sep (unicode):
string separating author names. The default is the comma.
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
str:
* Author names are separated by the ``sep`` argument.
* Author are sorted according to their family name.
* Empty string when authors are not found.
"""
df = self[u"authors"]
query = df.affiliation.str.contains(pattern)
if sort:
data = (df.loc[query, ["last_name", "fmt_name"]]
.sort_values(by="last_name")
.fmt_name)
else:
data = (df.loc[query, ["fmt_name"]]
.sort_index()
.fmt_name)
return (sep.join(data) if len(data) > 0 else "")
def first_author(self):
"""The name of the first author.
Returns:
str:
- Empty string when the first author is not defined.
"""
return self[u"authors"].fmt_name.iloc[0]
def first_author_institutes(self):
"""The institute(s) associated to the first author.
Note:
Search is performed via the affiliation defined by the "u" key
of the author field.
Returns:
str:
- names are separated by ``|``.
- The string is empty when institutes are not defined.
"""
val = self[u"authors"].affiliation.iloc[0]
return ("" if val == NaN else val)
def institutes(self):
"""The list of institute signing the publication.
Note:
Name of institute are given by the affiliation defined by
the "affiliation" key of the author field.
Returns:
list:
the list is sort in alphabetic order.
"""
df = self[u"authors"]
# expand multi-affiliation (one per column)
df = df.affiliation.str.split("|", expand=True)
# merge all columns into a single one,
# sort and remove duplicate entries
li = [df[el].dropna() for el in df.columns]
df = (concat(li, ignore_index=True)
.sort_values()
.unique())
return df.tolist()
def is_affiliations(self):
"""``True`` when affiliations are defined for authors.
Note:
This is a fast algorithm checking that the ``affiliation`` field
exists. To check that the affiliation is defined for all authors,
uses the method :func:`is_affiliation_for_all`.
Returns:
bool:
"""
df = self[u"authors"]
if len(df) == 1 and df.affiliation.iloc[0] == "":
return False
return True
def is_affiliation_for_all(self):
"""``True`` when affiliation are defined for all authors.
Return:
bool:
"""
df = self[u"authors"]
query = df.affiliation.isin(["", NaN])
return df.affiliation[query].size == 0
def is_authors(self):
"""``True`` when authors are defined.
Returns:
bool:
"""
df = self[u"authors"]
cols = {"first_name", "full_name", "last_name"}
if len(df.columns.intersection(cols)) != 3:
return False
if len(df) == 1 and df.full_name.iloc[0] == "":
return False
return True
def is_published(self):
"""``True`` is the record is published and contains a complet set
of publication infromation (title, volume, year and pagination).
Returns:
bool:
"""
df = self[u"publication_info"]
query = \
(df.title.str.len() > 0) \
& (df.volume.str.len() > 0) \
& (df.year.str.len() > 0) \
& (df.pagination.str.len() > 0)
return len(df[query]) > 0
def is_with_erratum(self):
"""``True`` when the record contains erratum data.
Returns:
bool
"""
df = self[u"publication_info"]
return len(df) > 1
def paper_editor(self):
"""The abbreviated version of the review, *e.g* Phys Lett B.
Returns:
unicode:
* Empty string when not defined.
"""
df = self[u"publication_info"]
return df.title.iloc[0]
def paper_pages(self):
"""The page number / range when the record is published in a review.
Returns:
unicode:
* The format is "45-67" or "234".
* Empty string when not defined.
"""
df = self[u"publication_info"]
return df.pagination.iloc[0]
def paper_reference(self):
"""The full reference for a publication published in a review.
Returns:
unicode:
* The format is "Phys Lett B 456 2010 5-6".
* The string is empty when the publication is not
published in a review.
"""
paper = self[u"publication_info"].iloc[0]
li = [paper.title,
paper.volume,
paper.year,
paper.pagination]
return u" ".join(li).strip()
def paper_url(self):
"""The URL of the preprint.
Note:
Many others URL exists mainly those related to open access.
Returns:
unicode:
the string is empty when no URLs are found.
"""
# depends on the store
# start with CDS looking for the field `url`
if u"url" in self:
data = self[u"url"]
li = (data if isinstance(data, list) else [data])
li = [di[u"url"] for di in li if di[u"description"] == u"Preprint"]
if len(li) == 1:
return li[0]
# scan the list of files
# work for both stores.
pdf = "%s.pdf" % self.preprint_number()
li = self._get(u"files", u"url", force_list=True)
li = [el for el in li if el.endswith(pdf)]
if len(li) == 1:
return li[0]
return u""
def paper_volume(self):
"""The volume number when the record is published in a review.
Returns:
unicode:
- Empty string when nothing is found.
"""
df = self[u"publication_info"]
return df.volume.iloc[0]
def paper_year(self):
"""The year of the publication.
Returns:
unicode:
- Empty string if the year is not defined.
"""
df = self[u"publication_info"]
return df.year.iloc[0]
def preprint_number(self):
"""The ArXiv preprint number.
Returns:
str: empty string when it is not defined.
"""
if u"primary_report_number" not in self:
return
data = self[u"primary_report_number"]
data = (data if isinstance(data, list) else [data])
li = [el for el in data if el.startswith(ARXIV)]
if len(li) == 1:
return li[0]
return u""
def reformat_authors(self, fmt="Last, First"):
"""Reformat names of authors.
The default formatting for cds/invenio record is ``Last, First``.
Args:
fmt (str):
define the new format for author names.
Possible values are "First, Last", "F. Last", "Last",
"Last, First" and "Last F."
Raises:
RecordException:
* the argument ``fmt`` is not valid.
"""
if fmt not in AUTHOR_FORMATS:
raise RecordException(MSG_INVALID_FMT)
if fmt == self._last_fmt_author:
return
self._last_fmt_author = fmt
df = self[u"authors"]
# ....................................................................
#
# Compute initial for the first name
#
if fmt in ("F. Last", "Last F."):
dfm = (df.first_name.str.extract(REG_INITIAL, expand=True)
.fillna(""))
df["initial"] = dfm.apply(
lambda x: to_initial(x[0], x[1], x[2]), axis="columns")
# ....................................................................
#
# Format
#
if fmt == "Last, First":
df["fmt_name"] = df.last_name + ", " + df.first_name
elif fmt == "First, Last":
df["fmt_name"] = df.first_name + ", " + df.last_name
elif fmt == "F. Last":
df["fmt_name"] = df.initial + " " + df.last_name
elif fmt == "Last":
df["fmt_name"] = df.last_name
elif fmt == "Last F.":
df["fmt_name"] = df.last_name + " " + df.initial
# ....................................................................
#
# Clean initial column
#
if fmt in ("F. Last", "Last F."):
df = df.drop("initial", axis="columns")
def report_number(self):
"""The report number(s) associated to the publication.
Returns:
str:
- Numbers are separated by a comma
- Number are sorted in alphabetic order.
- Empty string when not defined.
"""
# CDS
if u"report_number" in self:
data = self[u"report_number"]
data = (data if isinstance(data, list) else [data])
li = []
[li.extend(di.itervalues()) for di in data]
return ", ".join(sorted(li))
# INSPIRE
if u"primary_report_number" in self:
data = self[u"primary_report_number"]
data = (data if isinstance(data, list) else [data])
li = [el for el in data if not el.startswith(ARXIV)]
return ", ".join(sorted(li))
return u""
def submitted(self):
"""The date of submission.
Returns:
unicode:
* format are"YYYY-MM", "YYYY-MM-DD", "DD MMM YYYY", *etc.*
* Empty sring when not defined.
"""
return self._get(u"prepublication", u"date")
def title(self):
"""The title of the publication.
Returns:
unicode:
* Empty string when not defined.
* The filter CLEAN_SPACES is applied.
"""
return CLEAN_SPACES(self._get(u"title", u"title"))