""" invenio_tools.recordpubli
"""
import re
from .base import (ARXIV,
ARXIV_PDF,
REG_ARXIV_NUMBER,
REG_YEAR)
from .exception import RecordException
from filters import CLEAN_COLLABORATION
from numpy import NaN
from pandas import concat, DataFrame
from plugin_dbui import as_list, CLEAN_SPACES
from .record import Record
# Formats understood by RecordPubli.reformat_authors
AUTHOR_FORMATS = [
    "First, Last",
    "F. Last",
    "Last",
    "Last, First",
    "Last F."]

# Decode publication reference:
#     Phys. Rev. Lett. 113, 032001 (2014)
#     Eur. Phys. J. C (2014) 74:2883
#
# The named groups follow the MARC 773 subfields:
#     p (review), v (volume), c (pages) and y (year)
_ref1 = r"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
_ref2 = r"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
DECODE_REF = [re.compile(_ref1), re.compile(_ref2)]

MSG_INVALID_FMT = "Invalid format for author"

# The MARC12 keys containing paper reference
PAPER_REFERENCE_KEYS = set(["c", "p", "v", "y"])

# Limit the number of first name to two (others will be ignored)
# The three groups are: first part, optional hyphen, optional second part
REG_INITIAL = initial = r"^(\w+)\.?(\-)* *(\w+)*\.?"
def to_initial(x, y, z):
    """Build the initial(s) of a first name already split in three parts.

    Examples of decomposition::

        Albert           (x="Albert",  y="",  z="")        -> "A."
        Antonio Augusto  (x="Antonio", y="",  z="Augusto") -> "A. A."
        Jean-Pierre      (x="Jean",    y="-", z="Pierre")  -> "J.-P."

    Args:
        x (str): first part of the first name
        y (str): separator between the two parts (empty or hyphen)
        z (str): second part of the first name (empty when single)

    Returns:
        str: the initial(s), each one followed by a dot.

    """
    # slicing (instead of indexing) tolerates empty strings
    head = x[0:1]

    # single first name
    if z == "":
        return "%s." % head

    tail = z[0:1]

    # two first names separated by spaces
    if y == "":
        return "%s. %s." % (head, tail)

    # composed first name, keep the separator between initials
    return "%s.%s%s." % (head, y[0:1], tail)
def to_str(x):
    """Flatten a list of strings into a single string.

    Args:
        x (list or object): value to be converted.

    Returns:
        the elements joined by ``|`` when *x* is a list,
        otherwise *x* itself, unchanged.

    """
    if isinstance(x, list):
        return "|".join(x)

    return x
class RecordPubli(Record):
    """The MARC record describing a publication.

    Usual publications are article, preprint, proceeding, report and talk.
    The relation between methods and MARC fields are the following::

        +-----------------------+---------+------------+
        |                       | CDS     | INSPIREHEP |
        +-----------------------+---------+------------+
        | authors               | 700 a   |            |
        | collaboration         | 710 g   |            |
        | first author          | 100 a   |            |
        | institutes            | 700 u   |            |
        | paper editor          | 773 p   |            |
        | paper pages           | 773 c   |            |
        | paper reference       | 773 o   |            |
        | paper URL             | 8564 u  |            |
        | paper volume          | 773 v   |            |
        | paper year            | 773 y   |            |
        | preprint number       | 037 a   |            |
        | report number         | 088 a   | 037 a      |
        | submitted             | 269 c   |            |
        | title                 | 245 a   |            |
        | year                  | 260 c   |            |
        +-----------------------+---------+------------+

    At construction, the fields 100 and 700 are replaced by DataFrame
    (see :meth:`_process_authors`). Most of the author related methods
    rely on that structure.

    """
    def __init__(self, *args):
        """Build the record and convert the author fields into DataFrame.

        Args:
            *args: positional arguments forwarded unchanged
                to the constructor of ``Record``.

        """
        # format currently applied to the column fmt_name
        self._last_fmt_author = "Last, First"

        Record.__init__(self, *args)
        self._process_authors()

    def _authors_frame(self, key):
        """Build the author DataFrame for the MARC field *key*.

        * Keep the subfield "a", "u" and "e" (phd thesis)
        * Add the columns last_name, first_name and fmt_name
        * Convert list of affiliation in string separated by "|"

        Args:
            key (str): the MARC field, "100" or "700".

        Returns:
            DataFrame or None:
                ``None`` when the field does not exist, when it does not
                contain author name or when none of the names can be
                split in last and first name.

        """
        if key not in self:
            return None

        data = self[key]
        if not isinstance(data, list):
            data = [data]

        df = DataFrame(data)
        columns = df.columns

        # protection -- author name is missing
        if "a" not in columns:
            return None

        # keep columns:
        #    - "a": author name
        #    - "e": phd director (equal to "dir.")
        #    - "u": affiliation(s)
        df = df.drop(columns.difference(["a", "e", "u"]), axis="columns")

        # protection -- split create 1, 2 and more than 2 columns
        # the former happens when the author name is 'ATLAS collaboration'
        parts = df.a.str.split(",", expand=True)
        if parts.shape[1] < 2:
            return None

        # only the first two parts of the name are used
        df["last_name"] = parts[0].str.strip()
        df["first_name"] = parts[1].str.strip()
        df["fmt_name"] = df.a

        # protection -- affiliation not defined
        if "u" not in columns:
            df["u"] = ""

        # protection -- missing affiliation
        # and convert list of affiliation to string separated by |
        df["u"] = df.u.fillna("").apply(to_str)

        return df

    def _process_authors(self):
        """Convert authors information into DataFrame:

            * Keep the subfield "a", "u" and "e" (phd thesis)
            * Convert list of affiliation in string separated by "|"

        Authors and their affiliations are defined in the fields 100 and 700.
        The method deals with cases where:

            * the first author is defined in 100 but it is not in 700
            * first author is not defined in 100 but in 700
            * thesis in which 700 contains names of director

        Authors and their affiliations are stored in DataFrame with the
        following structure:

            +------------+---------------------------+
            | column     |                           |
            +------------+---------------------------+
            | a          | author name (Last, First) |
            | u          | affiliation(s)            |
            | first_name | first name                |
            | last_name  | family name               |
            | fmt_name   | formatted name            |
            +------------+---------------------------+

        At the end, the fields 100 and 700 of the record are both DataFrame.
        When no author can be extracted, they contain a single row
        of empty strings.

        """
        # ....................................................................
        #
        # Instantiate DataFrame for field 100 and 700
        #
        d100 = self._authors_frame("100")
        d700 = self._authors_frame("700")

        # ....................................................................
        #
        # Protection -- more than one first author
        #
        # treat the case with duplicate author name
        # by building the affiliation string
        #
        if d100 is not None and len(d100) > 1:

            # group by a scalar key (not a list) in order to get
            # scalar group names whatever the version of pandas
            grouped = d100.groupby("a", sort=False)

            if len(grouped) == 1:
                name, group = next(iter(grouped))

                affiliations = \
                    [el for el in group.u if el not in ("", NaN, None)]

                # last and first names are already split and stripped
                # reuse them to support names with more than one comma
                merged = {"a": [name],
                          "first_name": [group.first_name.iloc[0]],
                          "fmt_name": [name],
                          "last_name": [group.last_name.iloc[0]],
                          "u": ["|".join(affiliations)]}

                d100 = DataFrame(merged)

        # NOTE
        # The case with more than one first author is rare
        # It will be detected by the CheckAndFix procedure when it is
        # not fixed by the above protection

        # ....................................................................
        #
        # the authors are spread over the 100 and 700 field.
        # deal with cases where the first author is defined in 100
        # but not in 700, first author is defined in 100 and in 700
        # or no author in 100
        #
        if d100 is not None and d700 is not None:
            if d100.a.iloc[0] != d700.a.iloc[0] and len(d100) == 1:
                d700 = concat([d100, d700], ignore_index=True)

        elif d100 is None and d700 is not None:
            d100 = DataFrame(d700.iloc[0]).transpose()

        elif d700 is None and d100 is not None:
            d700 = d100

        else:
            # no author at all, both fields share the same empty frame
            d100 = d700 = DataFrame({
                "a": [""],
                "first_name": [""],
                "fmt_name": [""],
                "last_name": [""],
                "u": [""]})

        # ....................................................................
        #
        # Update
        #
        self["100"] = d100
        self["700"] = d700

    def _fmt_names(self, query=None, sort=False):
        """Select formatted names of authors from the field 700.

        Args:
            query (Series):
                boolean mask selecting the authors.
                All authors are kept when it is ``None``.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            Series: the selected values of the column ``fmt_name``.

        """
        df = self["700"]
        if query is not None:
            df = df.loc[query]

        if sort:
            return df.sort_values(by="last_name").fmt_name

        return df.fmt_name.sort_index()

    def authors(self, sep=", ", sort=False):
        """The author(s) signing the publication.

        Args:
            sep (unicode):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            unicode:
                * Author names are separated by the ``sep`` argument.
                * The string is empty when there is no authors.

        """
        return sep.join(self.authors_as_list(sort=sort))

    def authors_as_list(self, sort=False):
        """The list of author(s) signing the publication.

        Args:
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            list: the list is empty when authors are not defined.

        """
        li = self._fmt_names(sort=sort).tolist()

        # a single empty name means that authors are not defined
        if len(li) == 1 and li[0] == "":
            li = []

        return li

    def collaboration(self):
        """The collaboration(s) signing the publication.

        Returns:
            unicode:
                * names of collaboration are separated by a comma.
                * The filter CLEAN_COLLABORATION is applied.

        """
        li = self._get("710", "g", force_list=True)
        return CLEAN_COLLABORATION(", ".join(li))

    def find_affiliation(self, pattern):
        """Find affiliation matching the regular expression *pattern*.

        Args:
            pattern (unicode):
                regular expression defining the affiliation keys.
                It has to be built for an exact match namely containing
                start and end of string. This is required to separate
                `Ecole Polytechnique` from `Ecole Polytechnique, Lausanne`.

        Returns:
            unicode:
                - the affiliation or the first one when several are found.
                - empty string when nothing is found.

        """
        df = self["700"]

        # na=False: a missing affiliation never matches
        query = df.u.str.contains(pattern, na=False)
        data = df[query].u.unique()

        return (data[0] if len(data) > 0 else "")

    def find_authors(self, pattern, sep=", ", sort=False):
        """Find authors containing the regular expression *pattern*.
        The search is performed on the formatted name.

        Args:
            pattern (unicode):
                regular expression defining the author name(s).

            sep (unicode):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            unicode:
                * Author names are separated by ``sep`` argument.
                * The string is empty when nothing is found.

        """
        # na=False: a missing name never matches
        query = self["700"].fmt_name.str.contains(pattern, na=False)
        return sep.join(self._fmt_names(query, sort))

    def find_authors_by_affiliation(self, pattern, sep=", ", sort=False):
        """Find authors belonging to a given institute(s) defined by a regular
        expression.

        Args:
            pattern (unicode):
                regular expression defining the affiliation keys
                for the institute(s).

            sep (unicode):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            unicode:
                * Author names are separated by the ``sep`` argument.
                * Authors are sorted according to their family name
                  when ``sort`` is true.
                * Empty string when authors are not found.

        """
        # na=False: a missing affiliation never matches
        query = self["700"].u.str.contains(pattern, na=False)
        return sep.join(self._fmt_names(query, sort))

    def first_author(self):
        """The name of the first author.

        Returns:
            unicode:
                - the formatted name found in the first row of field 700.
                - Empty string when the first author is not defined.

        """
        return self["700"].fmt_name.iloc[0]

    def first_author_institutes(self):
        """The institute(s) associated to the first author.

        Note:
            Search is performed via the affiliation defined by the "u" key
            of the author field.

        Returns:
            unicode:
                - names are separated by ``|``.
                - The string is empty when institutes are not defined.

        """
        from pandas import isnull

        val = self["700"].u.iloc[0]

        # NaN is never equal to itself, the test has to go through isnull
        if val is None or (isinstance(val, float) and isnull(val)):
            return ""

        return val

    def institutes(self):
        """The list of institute signing the publication.

        Note:
            Name of institute are given by the affiliation defined by
            the "u" key of the author field.

        Returns:
            list: the list is sorted in alphabetic order
                and does not contain duplicate entries.

        """
        # expand multi-affiliation (one per column)
        expanded = self["700"].u.str.split("|", expand=True)

        # merge all columns into a single one,
        # sort and remove duplicate entries
        li = [expanded[el].dropna() for el in expanded.columns]
        names = (concat(li, ignore_index=True)
                 .sort_values()
                 .unique())

        return names.tolist()

    def is_affiliations(self):
        """``True`` when affiliations are defined for authors.

        Note:
            This is a fast algorithm checking that the ``u`` field exists.
            To check that the affiliation is defined for all authors,
            uses the method :func:`is_affiliation_for_all`.

        Returns:
            bool:

        """
        df = self["700"]

        if "u" not in df.columns:
            return False

        # a single row with an empty affiliation
        if len(df) == 1 and df.u.iloc[0] == "":
            return False

        return True

    def is_affiliation_for_all(self):
        """``True`` when affiliation are defined for all authors.

        Return:
            bool:

        """
        df = self["700"]
        query = df.u.isin(["", NaN])

        return df.u[query].size == 0

    def is_authors(self):
        """``True`` when authors are defined.

        Returns:
            bool:

        """
        df = self["700"]

        if "a" not in df.columns:
            return False

        # a single row with an empty name
        if len(df) == 1 and df.a.iloc[0] == "":
            return False

        return True

    def is_published(self):
        """``True`` is the record is published.

        The record is published when one entry of the field 773:

            * contains the keys "p", "v", "y" and "c"
            * or contains the key "o" with a value matching
              one of the regular expressions of DECODE_REF.

        Returns:
            bool:

        """
        if "773" not in self:
            return False

        # record can contains erratum
        for di in as_list(self["773"]):

            # the reference field is complete and contains, at least,
            # the keys "p", "v", "y" and "c"
            if PAPER_REFERENCE_KEYS.issubset(set(di.keys())):
                return True

            # paper reference may be incomplete or even wrong
            # the recovery procedure will use the 773o
            # check that 773o contains the paper reference:
            #     Eur. Phys. J. C (2014) 74:2883
            #     Phys. Rev. Lett. 113, 032001 (2014)
            if "o" in di:
                value = di["o"]
                if any(reg.match(value) for reg in DECODE_REF):
                    return True

        return False

    def is_with_erratum(self):
        """``True`` when the record contains erratum data.

        Returns:
            bool

        """
        # record with erratum contains a list of editor
        return "773" in self and isinstance(self["773"], list)

    def paper_editor(self):
        """The abbreviated version of the review, *e.g* Phys Lett B.

        Returns:
            unicode or list:
                * A list when there are erratum.
                * Empty string when not defined.

        """
        return self._get("773", "p")

    def paper_pages(self):
        """The page number / range when the record is published in a review.

        Returns:
            unicode or list:
                * The format is "45-67" or "234".
                * A list when there are erratum.
                * Empty string when not defined.

        """
        return self._get("773", "c")

    def paper_reference(self):
        """The full reference for a publication published in a review.

        Returns:
            unicode:
                * The format is "Phys Lett B 456 2010 5-6".
                * The string is empty when the publication is not
                  published in a review.

        Note:
            The keys are searched directly in the field 773. When that
            field is a list (erratum) none of them is found and
            the string is empty.

        """
        if "773" not in self:
            return ""

        field = self["773"]

        # order is review, volume, year and pages
        li = [field[k] for k in ("p", "v", "y", "c") if k in field]

        return " ".join(li)

    def paper_url(self):
        """The URL of the preprint.

        Note:
            Many others URL exists mainly those related to open access.
            Only the case where the field 8564 is a list is scanned.

        Returns:
            unicode: the string is empty when no URLs are found.

        """
        pdf = "%s.pdf" % self.preprint_number()

        if "8564" not in self or not isinstance(self["8564"], list):
            return ""

        for el in self["8564"]:

            # protection -- entry without URL
            if "u" not in el:
                continue

            url = el["u"]

            # protection see http://cds.cern.ch/record/2014733
            # several URLs in the same entry,
            # build the URL from the arXiv number
            if isinstance(url, list):
                m = REG_ARXIV_NUMBER.search(pdf)
                if m:
                    return "%s%s" % (ARXIV_PDF, m.group())

                # nothing allows to select one of the URLs
                continue

            # cds.cern.ch
            if "y" in el:
                if el["y"] == "Preprint":
                    return url

            # inspirehep.net
            elif url.endswith(pdf):
                return url

        # nothing found in the list
        return ""

    def paper_volume(self):
        """The volume number when the record is published in a review.

        Returns:
            unicode or list:
                - A list when there are erratum.
                - Empty string when nothing is found.

        """
        return self._get("773", "v")

    def paper_year(self):
        """The year of the publication.

        Returns:
            unicode or list:
                - A list when there are erratum.
                - Empty string if the year is not defined.

        """
        rep = self._get("773", "y")

        # protection
        # in record http://cds.cern.ch/record/1951625 the entry 773y
        # is duplicated but there is no erratum
        if isinstance(rep, list) and not isinstance(self["773"], list):
            rep = list(set(rep))
            if len(rep) == 1:
                rep = rep[0]

        return rep

    def preprint_number(self):
        """The ArXiv preprint number.

        Returns:
            unicode: empty string when it is not defined.

        """
        # for both CDS and INSPIREHEP preprint data in 37 a
        # for CDS preprint information are also stored in 88 a
        for k in ("037", "088"):
            for val in self._get(k, "a", force_list=True):
                if ARXIV in val:
                    return val

        return ""

    def reformat_authors(self, fmt="Last, First"):
        """Reformat names of authors.

        The default formatting for cds/invenio record is ``Last, First``.
        The column ``fmt_name`` of the fields 100 and 700 is updated
        in place. Nothing is done when *fmt* is the current format.

        Args:
            fmt (str):
                define the new format for author names.
                Possible values are "First, Last", "F. Last", "Last",
                "Last, First" and "Last F."

        Raises:
            RecordException: if fmt is not valid.

        """
        if fmt not in AUTHOR_FORMATS:
            raise RecordException(MSG_INVALID_FMT)

        if fmt == self._last_fmt_author:
            return

        self._last_fmt_author = fmt

        # alias
        d100, d700 = self["100"], self["700"]

        # both fields can share the very same DataFrame,
        # process it only once in that case
        frames = ([d100] if d100 is d700 else [d100, d700])

        for df in frames:

            # ................................................................
            #
            # Compute initial for the first name
            # It is kept in a local variable in order to avoid
            # a temporary column in the stored DataFrame
            #
            initial = None
            if fmt in ("F. Last", "Last F."):
                dfm = (df.first_name.str.extract(REG_INITIAL, expand=True)
                       .fillna(""))

                initial = dfm.apply(
                    lambda x: to_initial(x[0], x[1], x[2]), axis="columns")

            # ................................................................
            #
            # Format -- the same rule is applied to field 100 and 700
            #
            if fmt == "Last, First":
                df["fmt_name"] = df.a

            elif fmt == "First, Last":
                df["fmt_name"] = df.first_name + ", " + df.last_name

            elif fmt == "F. Last":
                df["fmt_name"] = initial + " " + df.last_name

            elif fmt == "Last":
                df["fmt_name"] = df.last_name

            elif fmt == "Last F.":
                df["fmt_name"] = df.last_name + " " + initial

    def report_number(self):
        """The report number(s) associated to the publication.

        Returns:
            unicode:
                - Numbers are separated by a comma
                - Number are sorted in alphabetic order.
                - Empty string when not defined.

        """
        li = []

        # cds.cern.ch
        # report number can be in 37a, 88a and 88 9
        # entry can be the preprint number arXiv:xxx
        if self.host().startswith("cds"):
            for elt in self._get("088", "a", force_list=True):
                if not elt.startswith(ARXIV):
                    li.append(elt)

            # if empty have a look to "088" "9"
            # logic to avoid version number in 88/9
            # 88/a = LHCB-PAPER-2015-016 while 88/9 = LHCB-PAPER-2015-016-003
            if not li:
                for elt in self._get("088", "9", force_list=True):
                    if not elt.startswith(ARXIV):
                        li.append(elt)

        # inspirehep.net / cds.cern.ch -- example of MARC structure:
        #    037__ $$aLHCB-PAPER-2014-047
        #    037__ $$aCERN-PH-EP-2014-221
        #    037__ $$9arXiv$$aarXiv:1410.0149$$chep-ex
        if "037" in self:
            field = self["037"]

            # the field is either a single entry or a list of entries
            if isinstance(field, dict):
                entries = [field]
            elif isinstance(field, list):
                entries = field
            else:
                entries = []

            for di in entries:
                # skip the entry describing the arXiv preprint
                if "9" in di and di["9"] == ARXIV:
                    continue

                if "a" in di and not di["a"].startswith(ARXIV):
                    li.append(di["a"])

        li.sort()
        return ", ".join(li)

    def submitted(self):
        """The date of submission.

        Returns:
            list:
                * The format of a date is "YYYY-MM" or "YYYY-MM-DD"
                * Several dates when there are erratum.
                * Empty list when not defined.

        """
        return self._get("269", "c", force_list=True)

    def title(self):
        """The title of the publication.

        Returns:
            unicode or list:
                * A list when there are erratum.
                * Empty string when not defined.
                * The filter CLEAN_SPACES is applied.

        """
        val = self._get("245", "a")

        if isinstance(val, (unicode, str)):
            return CLEAN_SPACES(val)

        # build a new list, the content of the record is left untouched
        if isinstance(val, list):
            return [CLEAN_SPACES(el) for el in val]

        return val

    def year(self):
        """The year of the publication.

        Returns:
            unicode:
                * The earliest date is used when there are several.
                * Empty string when it is not defined.

        """
        val = self._get("260", "c")

        # keep the earliest date
        # min is used instead of sort to leave the record untouched
        if isinstance(val, list):
            val = (min(val) if val else "")

        # several form are possible 2014, 2014-12 or 2014-12-31
        if val:
            match = REG_YEAR.search(val)
            if match:
                val = match.group(1)

        return val