Commit 0a40e705 authored by LE GAC Renaud
Browse files

App store_tools.PluginAuthors and PluginPublicationInfo

parent eb81fae3
......@@ -17,7 +17,7 @@ from store_tools import (MSG_NO_CONF,
REG_OAI,
REG_YEAR)
from store_tools.recordpubli import PAPER_REFERENCE_KEYS
from store_tools.pluginpublicationinfo import PAPER_REFERENCE_KEYS
from plugin_dbui import CLEAN_SPACES, get_id, UNDEF_ID
......
......@@ -140,35 +140,3 @@ def is_thesis(recjson):
return True
return False
def to_initial(name):
    """Helper function extracting the initial(s) from a first name:

    +-------------------+----------+
    | name              | initial  |
    +-------------------+----------+
    | Albert            | A.       |
    | Antonio Augusto   | A. A.    |
    | Kristof Antoon M  | K. A. M. |
    | Jean-Pierre       | J.-P.    |
    | Marie-Hélène      | M.-H.    |
    | Jean-Pierre Marie | J.-P. M. |
    +-------------------+----------+

    Args:
        name (str):
            the first name(s). Words are separated by spaces and / or
            hyphens.

    Returns:
        str:
            the initials, or an empty string when ``name`` is empty
            or ``None``.

    """
    if not name:
        return ""

    # one token per word (reduced to its first letter) or per hyphen
    tokens = [
        tok if tok == "-" else "%s." % tok[0]
        for tok in re.findall(r"\w+|-", name, re.UNICODE)
    ]

    # Initials are separated by a space, except around a hyphen which
    # sticks to its neighbours. Joining everything with "" as soon as one
    # hyphen is present would also glue unrelated initials together
    # ("Jean-Pierre Marie" -> "J.-P.M.").
    return re.sub(r"\s*-\s*", "-", " ".join(tokens))
"""recordauthorplugin.py
"""
import re
from .exception import RecordException
from numpy import NaN
from pandas import concat
# Layouts accepted for the formatted author name
AUTHOR_FORMATS = ["First, Last", "F. Last", "Last", "Last, First", "Last F."]

# Message of the exception raised for a layout which is not in AUTHOR_FORMATS
MSG_INVALID_FMT = "Invalid format for author"
def to_initial(name):
    """Helper function extracting the initial(s) from a first name:

    +-------------------+----------+
    | name              | initial  |
    +-------------------+----------+
    | Albert            | A.       |
    | Antonio Augusto   | A. A.    |
    | Kristof Antoon M  | K. A. M. |
    | Jean-Pierre       | J.-P.    |
    | Marie-Hélène      | M.-H.    |
    | Jean-Pierre Marie | J.-P. M. |
    +-------------------+----------+

    Args:
        name (str):
            the first name(s). Words are separated by spaces and / or
            hyphens.

    Returns:
        str:
            the initials, or an empty string when ``name`` is empty
            or ``None``.

    """
    if not name:
        return ""

    # one token per word (reduced to its first letter) or per hyphen
    tokens = [
        tok if tok == "-" else "%s." % tok[0]
        for tok in re.findall(r"\w+|-", name, re.UNICODE)
    ]

    # Initials are separated by a space, except around a hyphen which
    # sticks to its neighbours. Joining everything with "" as soon as one
    # hyphen is present would also glue unrelated initials together
    # ("Jean-Pierre Marie" -> "J.-P.M.").
    return re.sub(r"\s*-\s*", "-", " ".join(tokens))
class PluginAuthors(object):
    """Plugin to handle authors in RecordPubli and RecordHepPubli

    Authors and their affiliations are stored in a DataFrame with the
    following structure:

    +---------------+--------------------------------+
    | column        |                                |
    +===============+================================+
    | affiliation   | value separated by "|"         |
    +---------------+--------------------------------+
    | first_name    | first name                     |
    +---------------+--------------------------------+
    | fmt_name      | formatted name                 |
    +---------------+--------------------------------+
    | full_name     | Last, First                    |
    +---------------+--------------------------------+
    | last_name     | family name                    |
    +---------------+--------------------------------+

    The DataFrame has to be built in the parent.
    It is retrieved via self["authors"].

    """

    def authors(self, sep=", ", sort=False):
        """The author(s) signing the publication.

        Args:
            sep (str):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            str:
                * Author names are separated by the ``sep`` argument.
                * The string is empty when there is no authors.

        """
        return sep.join(self.authors_as_list(sort=sort))

    def authors_as_list(self, sort=False):
        """The list of author(s) signing the publication.

        Args:
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            list:
                the formatted names. The list is empty when authors
                are not defined.

        """
        df = self["authors"]

        if sort:
            names = (df[["last_name", "fmt_name"]]
                     .sort_values(by="last_name")
                     .fmt_name
                     .tolist())
        else:
            names = df.fmt_name.sort_index().tolist()

        # a lone empty name means that authors are not defined
        if names == [""]:
            return []

        return names

    def find_affiliation(self, pattern):
        """Find affiliation matching the regular expression *pattern*.

        Args:
            pattern (str):
                regular expression defining the affiliation keys.
                It has to be built for an exact match namely containing
                start and end of string. This is required to separate
                `Ecole Polytechnique` from `Ecole Polytechnique, Lausanne`.

        Returns:
            str:
                - the affiliation or the first one when several are found.
                - empty string when nothing is found.

        """
        df = self["authors"]

        # na=False: a missing affiliation never matches. Without it the
        # mask contains NaN and cannot be used to select rows.
        query = df.affiliation.str.match(pattern, na=False)
        found = df.loc[query, "affiliation"].unique()

        return found[0] if len(found) > 0 else ""

    def find_authors(self, pattern, sep=", ", sort=False):
        """Find authors containing the regular expression *pattern*.
        The search is performed on the formatted name.

        Args:
            pattern (str):
                regular expression defining the author name(s).

            sep (str):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            str:
                * Author names are separated by ``sep`` argument.
                * The string is empty when nothing is found.

        """
        df = self["authors"]

        # na=False: a missing formatted name never matches
        query = df.fmt_name.str.contains(pattern, na=False)

        if sort:
            data = (df.loc[query, ["last_name", "fmt_name"]]
                    .sort_values(by="last_name")
                    .fmt_name)
        else:
            data = (df.loc[query, ["fmt_name"]]
                    .sort_index()
                    .fmt_name)

        return sep.join(data) if len(data) > 0 else ""

    def find_authors_by_affiliation(self, pattern, sep=", ", sort=False):
        """Find authors belonging to a given institute(s) defined by a regular
        expression.

        Args:
            pattern (str):
                regular expression defining the affiliation keys
                for the institute(s).

            sep (str):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            str:
                * Author names are separated by the ``sep`` argument.
                * Empty string when authors are not found.

        """
        df = self["authors"]

        # na=False: authors without affiliation are never selected
        query = df.affiliation.str.contains(pattern, na=False)

        if sort:
            data = (df.loc[query, ["last_name", "fmt_name"]]
                    .sort_values(by="last_name")
                    .fmt_name)
        else:
            data = (df.loc[query, ["fmt_name"]]
                    .sort_index()
                    .fmt_name)

        return sep.join(data) if len(data) > 0 else ""

    def first_author(self):
        """The name of the first author.

        Returns:
            str:
                empty string when the first author is not defined.

        """
        names = self["authors"].fmt_name
        return names.iloc[0] if len(names) > 0 else ""

    def first_author_institutes(self):
        """The institute(s) associated to the first author.

        Returns:
            str:
                - names are separated by ``|``.
                - The string is empty when institutes are not defined.

        """
        affiliations = self["authors"].affiliation

        # NaN never compares equal to itself, therefore a missing value
        # has to be detected with isna() and not with ``==``
        if len(affiliations) == 0 or affiliations.isna().iloc[0]:
            return ""

        return affiliations.iloc[0]

    def institutes(self):
        """The list of institute signing the publication.

        Returns:
            list:
                the list is sorted in alphabetic order.

        """
        df = self["authors"]

        # expand multi-affiliation (one per column)
        df = df.affiliation.str.split("|", expand=True)

        # merge all columns into a single one,
        # sort and remove duplicate entries
        li = [df[el].dropna() for el in df.columns]

        # concat raises ValueError when there is nothing to concatenate
        if not li:
            return []

        return (concat(li, ignore_index=True)
                .sort_values()
                .unique()
                .tolist())

    def is_affiliations(self):
        """``True`` when affiliations are defined for authors.

        Note:
            This is a fast algorithm checking that the ``affiliation`` field
            exists. To check that the affiliation is defined for all authors,
            uses the method :func:`is_affiliation_for_all`.

        Returns:
            bool:

        """
        df = self["authors"]
        return not (len(df) == 1 and df.affiliation.iloc[0] == "")

    def is_affiliation_for_all(self):
        """``True`` when affiliation are defined for all authors.

        Return:
            bool:

        """
        affiliations = self["authors"].affiliation
        undefined = affiliations.isna() | (affiliations == "")
        return not bool(undefined.any())

    def is_authors(self):
        """``True`` when authors are defined.

        Returns:
            bool:

        """
        df = self["authors"]

        if not {"first_name", "full_name", "last_name"}.issubset(df.columns):
            return False

        return not (len(df) == 1 and df.full_name.iloc[0] == "")

    def reformat_authors(self, fmt="Last, First"):
        """Reformat names of authors.

        The default formatting for cds/invenio record is ``Last, First``.
        The column ``fmt_name`` of the authors DataFrame is updated in place.

        Args:
            fmt (str):
                define the new format for author names.
                Possible values are ``First, Last``, ``F. Last``, ``Last``,
                ``Last, First`` and ``Last F.``.

        Raises:
            RecordException:
                the argument ``fmt`` is not valid.

        """
        if fmt not in AUTHOR_FORMATS:
            raise RecordException(MSG_INVALID_FMT)

        # nothing to do when the requested format is already applied.
        # getattr copes with a parent which did not initialise the marker.
        if fmt == getattr(self, "_last_fmt_author", None):
            return

        df = self["authors"]

        if fmt in ("F. Last", "Last F."):
            # The initials are kept in a local Series. Storing them in a
            # temporary column and dropping it afterwards does not work:
            # DataFrame.drop returns a new frame, so the column would stay
            # in the frame owned by the parent.
            initial = df.first_name.fillna("").apply(to_initial)

        if fmt == "Last, First":
            df["fmt_name"] = df.last_name + ", " + df.first_name

        elif fmt == "First, Last":
            df["fmt_name"] = df.first_name + ", " + df.last_name

        elif fmt == "F. Last":
            df["fmt_name"] = initial + " " + df.last_name

        elif fmt == "Last":
            df["fmt_name"] = df.last_name

        elif fmt == "Last F.":
            df["fmt_name"] = df.last_name + " " + initial

        # remember the format only once it has been applied successfully
        self._last_fmt_author = fmt
"""pluginpublicationinfo.py
"""
import numpy as np
# Names of the fields describing a paper reference
PAPER_REFERENCE_KEYS = {
    "pagination",
    "title",
    "volume",
    "year",
}
class PluginPublicationInfo(object):
    """Plugin to handle publication_info in RecordPubli and RecordHepPubli

    Publication information are stored in a DataFrame with the
    following structure:

    +------------+--------------------------------+
    | column     |                                |
    +============+================================+
    | title      | abbreviation of the publisher  |
    +------------+--------------------------------+
    | volume     | volume                         |
    +------------+--------------------------------+
    | year       | year of publication            |
    +------------+--------------------------------+
    | pagination | page number or ranges          |
    +------------+--------------------------------+

    The DataFrame has to be built in the parent.
    It is retrieved via self["publication_info"].

    """

    def is_published(self):
        """``True`` if the record is published and contains a full set
        of publication information (title, volume, year and pagination).

        Returns:
            bool:

        """
        # NOTE
        # * df.columns are title, volume, year and pagination
        # * df can contain one or more rows due to erratum.
        # * assume that the first row is the oldest one and corresponds to
        #   the first publication
        # * the row contains empty string when the record is not published.
        # * iloc[0] returns a series where the index are the column's name
        #
        df = self["publication_info"]

        # no row at all: nothing is published
        if len(df) == 0:
            return False

        filled = (df.iloc[0]
                  .replace("", np.nan)
                  .dropna()
                  .index)

        # every reference key has to be filled
        return PAPER_REFERENCE_KEYS.issubset(filled)

    def is_with_erratum(self):
        """``True`` when the record contains erratum data.

        Returns:
            bool

        """
        return len(self["publication_info"]) > 1

    def paper_editor(self):
        """The abbreviated version of the review, *e.g* Phys Lett B.

        Returns:
            str:
                empty string when not defined.

        """
        df = self["publication_info"]

        if "title" not in df or len(df) == 0:
            return ""

        return df["title"].iloc[0]

    def paper_pages(self):
        """The page number / range when the record is published in a review.

        Returns:
            str:
                * The format is "45-67" or "234".
                * Empty string when not defined.

        """
        df = self["publication_info"]

        if "pagination" not in df or len(df) == 0:
            return ""

        return df["pagination"].iloc[0]

    def paper_reference(self):
        """The full reference for a publication published in a review.

        Returns:
            str:
                * The format is "Phys Lett B 456 2010 5-6".
                * The string is empty when the publication is not
                  published in a review.

        """
        # the order of the keys defines the layout of the reference
        keys = ["title", "volume", "year", "pagination"]

        df = self["publication_info"]
        if len(df) == 0 or not set(keys).issubset(df.columns):
            return ""

        paper = df.iloc[0][keys]

        # str.join only accepts strings: convert values stored as number
        # (e.g. the year) and replace missing values by an empty string
        parts = ["" if missing else str(value)
                 for value, missing in zip(paper, paper.isna())]

        return " ".join(parts).strip()

    def paper_volume(self):
        """The volume number when the record is published in a review.

        Returns:
            str:
                empty string when nothing is found.

        """
        df = self["publication_info"]

        if "volume" not in df or len(df) == 0:
            return ""

        return df["volume"].iloc[0]

    def paper_year(self):
        """The year of the publication.

        Returns:
            str:
                - Empty string if the year is not defined.

        """
        df = self["publication_info"]

        if "year" not in df or len(df) == 0:
            return ""

        return df["year"].iloc[0]
......@@ -7,14 +7,14 @@
"""
from store_tools.base import (is_conference,
is_institute,
is_thesis,
to_initial)
is_thesis)
from store_tools.factory import (add_conference_data,
build_record,
build_store,
get_conference_data)
from store_tools.pluginauthors import to_initial
from store_tools.recordconf import RecordConf
from store_tools.recordinst import RecordInst
from store_tools.recordpubli import RecordPubli
......@@ -248,4 +248,3 @@ def test_to_initial_02014():
assert to_initial("Jean-Pierre") == "J.-P."
assert to_initial("Marie-Hélène") == "M.-H."
assert to_initial("Marie - Pierre") == "M.-P."
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment