Commit b7ca0e1b authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Update RecordPubli and RecordThesis in order to include author formatting/sorting.

parent 8a8147e4
......@@ -9,12 +9,22 @@ from base import (ARXIV,
ARXIV_PDF,
REG_ARXIV_NUMBER,
REG_YEAR)
from exception import RecordException
from filters import CLEAN_COLLABORATION
from numpy import NaN
from pandas import concat, DataFrame
from plugin_dbui import as_list, CLEAN_SPACES
from record import Record
# Layouts accepted for an author name.
# reformat_authors() raises a RecordException for any value which is not
# in this list.
AUTHOR_FORMATS = [
"First, Last",
"F. Last",
"Last",
"Last, First",
"Last F."]
# Decode publication reference:
# Phys. Rev. Lett. 113, 032001 (2014)
# Eur. Phys. J. C (2014) 74:2883
......@@ -22,9 +32,38 @@ _ref1 = r"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
# second layout of a publication reference: Eur. Phys. J. C (2014) 74:2883
_ref2 = r"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
# compiled patterns, one per supported layout of the publication reference
DECODE_REF = [re.compile(_ref1), re.compile(_ref2)]
# message of the RecordException raised when the author format is unknown
MSG_INVALID_FMT = "Invalid format for author"
# The MARC12 keys containing paper reference
PAPER_REFERENCE_KEYS = set(["c", "p", "v", "y"])
# Split a first name in three groups: first part, optional hyphen and
# optional second part. They are the x, y and z arguments of to_initial().
# NOTE(review): the chained assignment also binds a module-level name
# ``initial`` -- presumably unintended, confirm before removing it.
REG_INITIAL = initial = r"^(\w+)\.?(\-)* *(\w+)*\.?$"
def to_initial(x, y, z):
    """Build the initial(s) of a first name split in x, y and z.

    Examples::

        Albert           (x="Albert",  y="",  z="")        -> "A."
        Antonio Augusto  (x="Antonio", y="",  z="Augusto") -> "A. A."
        Jean-Pierre      (x="Jean",    y="-", z="Pierre")  -> "J.-P."

    Args:
        x (str): first part
        y (str): separator, empty or the hyphen
        z (str): second part, empty for a simple first name

    Returns:
        str:
            the initial(s) of the first name. The string is empty when
            the first part is empty, e.g. when the first name could not
            be split.

    """
    # nothing to abbreviate: do not return a lone "."
    if not x:
        return ""

    first = x[0:1]

    # simple first name
    if not z:
        return "%s." % first

    # two first names separated by space(s)
    if not y:
        return "%s. %s." % (first, z[0:1])

    # compound first name, keep the separator
    return "%s.%s%s." % (first, y[0:1], z[0:1])
def to_str(x):
    """Flatten a list of strings into a single ``|`` separated string.

    Args:
        x (list or object): the value to be converted.

    Returns:
        the elements of the list joined by ``|``. A value which is not
        a list is returned unchanged.

    """
    if not isinstance(x, list):
        return x

    return "|".join(x)
......@@ -58,6 +97,8 @@ class RecordPubli(Record):
"""
def __init__(self, *args):
self._last_fmt_author = "Last, First"
Record.__init__(self, *args)
self._process_authors()
......@@ -67,14 +108,29 @@ class RecordPubli(Record):
* Keep the subfield "a", "u" and "e" (phd thesis)
* Convert list of affiliation in string separated by "|"
The author are spread over the 100 and 700 field.
Authors and their affiliations are defined in the fields 100 and 700.
The method deals with cases where:
* the first author is defined in 100 but it is not in 700
* first author is not defined in 100 but in 700
* thesis in which 700 contains names of director
Authors and their affiliations are stored in DataFrame with the
following structure:
+------------+---------------------------+
| column | |
+------------+---------------------------+
| a | author name (Last, First) |
| u | affiliation(s) |
| first_name | first name |
| last_name | family name |
| fmt_name | formatted name |
+------------+---------------------------+
"""
columns4names = ["last_name", "first_name"]
# ....................................................................
#
# Instantiate DataFrame for field 100 and 700
......@@ -91,12 +147,20 @@ class RecordPubli(Record):
df = DataFrame(data)
columns = df.columns
# only keep columns:
# keep columns:
# - "a": author name
# - "e": phd director (equal to "dir.")
# - "u": affiliation(s)
df = df.drop(columns.difference(["a", "e", "u"]), axis="columns")
# add columns first_name, last_name and fmt_name
# warning: in some cases split creates more than 2 columns
df[columns4names] = df.a.str.split(u",", expand=True)[[0, 1]]
df["fmt_name"] = df.a
df.first_name = df.first_name.str.strip()
df.last_name = df.last_name.str.strip()
# protection -- affiliation not defined
if "a" in columns and "u" not in columns:
dfu = DataFrame([""]*len(df), columns=["u"])
......@@ -105,8 +169,7 @@ class RecordPubli(Record):
# protection -- missing affiliation
df.u = df.u.fillna("")
# convert list of affiliation to string
# in which values are separated by |
# convert list of affiliation to string separated by |
df.u = df.u.apply(lambda x: to_str(x))
di[key] = df
......@@ -116,15 +179,33 @@ class RecordPubli(Record):
# ....................................................................
#
# protection -- more than one first author
# the case with duplicate author name (build affiliation)
# Protection -- more than one first author
#
# treat the case with duplicate author name
# by building the affiliation string
#
if d100 is not None and len(d100) > 1:
grouped = d100.groupby(["a"], sort=False)
if len(grouped) == 1:
for name, group in grouped:
li = [el for el in group.u if el not in ("", NaN, None)]
d100 = DataFrame({"a": [name], "u": ["|".join(li)]})
last_name, first_name = name.split(u",")
affiliations = \
[el for el in group.u if el not in ("", NaN, None)]
di = {"a": [name],
"first_name": [first_name.strip()],
"fmt_name": [name],
"last_name": [last_name.strip()],
"u": ["|".join(affiliations)]}
d100 = DataFrame(di)
# NOTE
# The case with more than one first author is rare.
# It will be detected by the CheckAndFix procedure when it is
# not fixed by the above protection.
# ....................................................................
#
......@@ -132,6 +213,7 @@ class RecordPubli(Record):
# deal with cases where the first author is defined in 100
# but not in 700, first author is defined in 100 and in 700
# or no author in 100
#
if d100 is not None and d700 is not None:
if d100.a.iloc[0] != d700.a.iloc[0]:
if len(d100) == 1:
......@@ -146,39 +228,54 @@ class RecordPubli(Record):
else:
d100 = d700 = DataFrame({"a": [""], "u": [""]})
# ....................................................................
#
# Update
#
self[u"100"] = d100
self[u"700"] = d700
def authors(self, sep=u", ", sort=False):
    """The author(s) signing the publication.

    Args:
        sep (unicode):
            string separating author names. The default is the comma.

        sort (bool):
            sort authors by family name when true otherwise use the
            order of authors at the creation of the record

    Returns:
        unicode:
            * Author names are separated by the ``sep`` argument.
            * The string is empty when there is no authors.

    """
    # formatting and ordering are delegated to authors_as_list
    return sep.join(self.authors_as_list(sort=sort))
def authors_as_list(self):
def authors_as_list(self, sort=False):
"""The list of author(s) signing the publication.
Args:
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
list: the list is empty when authors are not defined.
"""
li = self[u"700"].a.tolist()
if sort:
li = (self[u"700"][["last_name", "fmt_name"]]
.sort_values(by="last_name")
.fmt_name
.tolist())
else:
li = (self[u"700"].fmt_name
.sort_index()
.tolist())
if len(li) == 1 and li[0] == "":
li = []
......@@ -201,8 +298,9 @@ class RecordPubli(Record):
"""Find affiliation matching the regular expression *pattern*.
Args:
pattern (unicode): regular expression defining the
affiliation keys.
pattern (unicode):
regular expression defining the affiliation keys.
It should not contains groups.
Returns:
unicode:
......@@ -213,39 +311,63 @@ class RecordPubli(Record):
# modify the pattern to capture group
pattern = "(%s)" % pattern
series = self[u"700"].u.str.extract(pattern, expand=False).dropna()
return (series.iloc[0] if len(series) > 0 else u"")
data = (self[u"700"].u.str.extract(pattern, expand=False)
.dropna())
def find_authors(self, pattern):
return (data.iloc[0] if len(data) > 0 else u"")
def find_authors(self, pattern, sep=u", ", sort=False):
    """Find authors containing the regular expression *pattern*.
    The search is performed on the formatted name.

    Args:
        pattern (unicode):
            regular expression defining the author name(s).

        sep (unicode):
            string separating author names. The default is the comma.

        sort (bool):
            sort authors by family name when true otherwise use the
            order of authors at the creation of the record

    Returns:
        unicode:
            * Author names are separated by ``sep`` argument.
            * The string is empty when nothing is found.

    """
    df = self[u"700"]

    # a record without author holds a placeholder frame limited to the
    # columns a and u: nothing can be found
    if "fmt_name" not in df.columns:
        return u""

    # na=False: a missing formatted name never matches and the query
    # remains a pure boolean mask
    query = df.fmt_name.str.contains(pattern, na=False)

    if sort:
        data = (df.loc[query, ["last_name", "fmt_name"]]
                  .sort_values(by="last_name")
                  .fmt_name)

    else:
        data = (df.loc[query, ["fmt_name"]]
                  .sort_index()
                  .fmt_name)

    return (u"" if len(data) == 0 else sep.join(data))
def find_authors_by_affiliation(self, pattern, sep=u", ", sort=False):
    """Find authors belonging to a given institute(s) defined by a regular
    expression.

    Args:
        pattern (unicode):
            regular expression defining the affiliation keys
            for the institute(s).

        sep (unicode):
            string separating author names. The default is the comma.

        sort (bool):
            sort authors by family name when true otherwise use the
            order of authors at the creation of the record

    Returns:
        unicode:
            * Author names are separated by the ``sep`` argument.
            * Author are sorted according to their family name
              when ``sort`` is true.
            * Empty string when authors are not found.

    """
    df = self[u"700"]

    # a record without author holds a placeholder frame limited to the
    # columns a and u: nothing can be found
    if "fmt_name" not in df.columns:
        return u""

    # na=False: an author without affiliation is skipped
    query = df.u.str.contains(pattern, na=False)

    if sort:
        data = (df.loc[query, ["last_name", "fmt_name"]]
                  .sort_values(by="last_name")
                  .fmt_name)

    else:
        data = (df.loc[query, ["fmt_name"]]
                  .sort_index()
                  .fmt_name)

    # names are joined as they are, like in find_authors: the former
    # utf-8 encode / decode round trip is useless and it fails on the
    # empty branch when the string type has no decode method
    return (u"" if len(data) == 0 else sep.join(data))
def first_author(self):
    """The name of the first author.

    Returns:
        unicode:
            - The formatted name of the first author.
            - Empty string when the first author is not defined.

    """
    df = self[u"700"]

    if len(df) == 0:
        return u""

    # a record without author holds a placeholder frame limited to the
    # columns a and u: fall back to the raw name, an empty string
    names = (df.fmt_name if "fmt_name" in df.columns else df.a)

    return names.iloc[0]
def first_author_institutes(self):
"""The institute(s) associated to the first author.
......@@ -312,7 +444,7 @@ class RecordPubli(Record):
"""``True`` when affiliations are defined for authors.
Note:
This is a fast algorithm checking only first and last authors.
This is a fast algorithm checking that the ``u`` field exists.
To check that the affiliation is defined for all authors,
uses the method :func:`is_affiliation_for_all`.
......@@ -524,6 +656,76 @@ class RecordPubli(Record):
return val
return u""
def reformat_authors(self, fmt="Last, First"):
    """Reformat names of authors.

    The default formatting for cds/invenio record is ``Last, First``.
    The formatted names are stored in the column ``fmt_name`` of the
    fields 100 and 700.

    Args:
        fmt (str):
            define the new format for author names.
            Possible values are "First, Last", "F. Last", "Last",
            "Last, First" and "Last F."

    Raises:
        RecordException: if fmt is not valid.

    """
    if fmt not in AUTHOR_FORMATS:
        raise RecordException(MSG_INVALID_FMT)

    # nothing to do, names are already formatted in that way
    if fmt == self._last_fmt_author:
        return

    self._last_fmt_author = fmt

    # the same rule is applied to both fields, so that the first author
    # is always formatted like the other ones
    for df in (self[u"100"], self[u"700"]):

        # a record without author holds a placeholder frame limited to
        # the columns a and u: there is no name to reformat
        no_names = ("first_name" not in df.columns or
                    "last_name" not in df.columns)

        if fmt == "Last, First" or no_names:
            df["fmt_name"] = df.a

        elif fmt == "First, Last":
            df["fmt_name"] = df.first_name + ", " + df.last_name

        elif fmt == "Last":
            df["fmt_name"] = df.last_name

        else:
            # "F. Last" or "Last F.": the initials are kept in a local
            # series, no temporary column is added to the record
            parts = (df.first_name.str.extract(REG_INITIAL, expand=True)
                       .fillna(""))

            initial = parts.apply(
                lambda x: to_initial(x[0], x[1], x[2]), axis="columns")

            if fmt == "F. Last":
                df["fmt_name"] = initial + " " + df.last_name
            else:
                df["fmt_name"] = df.last_name + " " + initial
def report_number(self):
"""The report number(s) associated to the publication.
......
......@@ -30,9 +30,12 @@ class RecordThesis(RecordPubli):
+-----------------------+---------+----------+
"""
def authors_as_list(self):
def authors_as_list(self, sort=False):
"""The list of author(s) signing the publication.
Args:
sort (bool): sort authors by family name when true.
Returns:
list: the list is empty when authors are not defined.
......@@ -42,9 +45,17 @@ class RecordThesis(RecordPubli):
df = self[u"700"]
query = df.e != THESIS_DIR
df = df.loc[query]
li = df.a.tolist()
if sort:
li = (df.loc[query, ["last_name", "fmt_name"]]
.sort_values(by="last_name")
.fmt_name
.tolist())
else:
li = (df.loc[query].fmt_name
.sort_index()
.tolist())
if len(li) == 1 and li[0] == "":
li = []
......@@ -78,12 +89,12 @@ class RecordThesis(RecordPubli):
"""
return self._get(u"502", "a")
def these_directors(self):
def these_directors(self, sep=u", "):
"""The list of director(s)
Returns:
unicode:
* Names are separated by ``|``.
* Names are separated by the ``sep`` argument.
* Empty string when it is not defined.
"""
......@@ -94,7 +105,7 @@ class RecordThesis(RecordPubli):
query = df.e == THESIS_DIR
df = df.loc[query]
return (u"|".join(df.a) if len(df) > 0 else u"")
return (sep.join(df.fmt_name) if len(df) > 0 else u"")
def these_town(self):
"""The town where the thesis took place.
......
......@@ -65,12 +65,12 @@ def test_find_affiliation(record):
def test_find_authors(record):
    """Matching names are joined with the default separator ``", "``."""
    assert record.find_authors("Leo") == u"Beaucourt, Leo, Kravchuk, Leonid, Leo, Sabato"
def test_find_authors_by_affiliation(record):
pattern = "CPPM, Marseille|Marseille, CPPM"
authors = record.find_authors_by_affiliation(pattern)
authors = record.find_authors_by_affiliation(pattern, sep=u"|")
assert authors == u"Akar, Simon|Aslanides, Elie|Cogan, Julien|" \
u"Kanso, Walaa|Le Gac, Renaud|Leroy, Olivier|" \
......
......@@ -104,7 +104,7 @@ def test_these_level(record):
def test_these_directors(record):
    """Director names are joined with the default separator ``", "``."""
    assert record.these_directors() == "He, Mao, Monnier, Emmanuel, Zhu, Chengguang"
def test_these_town(record):
......
......@@ -20,7 +20,7 @@ def test_protection_find_authors_by_affiliation():
"""The affiliation is not defined for one author -- skip it."""
record = load_record('cds.cern.ch', 2012165)
pattern = "Marseille, CPPM|CPPM, Marseille"
authors = record.find_authors_by_affiliation(pattern)
authors = record.find_authors_by_affiliation(pattern, sep=u"|")
assert authors == u"Akar, Simon|Aslanides, Elie|Cogan, Julien|" \
u"Kanso, Walaa|Le Gac, Renaud|Leroy, Olivier|" \
u"Mancinelli, Giampiero|Mordà, Alessandro|" \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment