Commit 14d0602f authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Redesign RecordPubli by using PluginAuthors and PluginPublicationInfo

parent 0a40e705
""" store_tools.recordpubli """ store_tools.recordpubli
""" """
import numpy as np
import re
from .base import to_initial
from .exception import RecordException
from filters import CLEAN_COLLABORATION from filters import CLEAN_COLLABORATION
from numpy import NaN
from pandas import concat, DataFrame from pandas import concat, DataFrame
from plugin_dbui import as_list, CLEAN_SPACES from plugin_dbui import CLEAN_SPACES
from .record import Record from .record import Record
from store_tools import (ARXIV, from store_tools import ARXIV
ARXIV_PDF, from store_tools.pluginauthors import PluginAuthors
REG_ARXIV_NUMBER, from store_tools.pluginpublicationinfo import PluginPublicationInfo
REG_YEAR)
# Author name formats accepted by ``reformat_authors``; any other value
# makes that method raise a ``RecordException``.
AUTHOR_FORMATS = [
"First, Last",
"F. Last",
"Last",
"Last, First",
"Last F."]
# Message carried by the RecordException raised for an unknown author format.
MSG_INVALID_FMT = "Invalid format for author"
# The keys containing the paper reference. ``is_published`` requires all
# four of them to be filled in the first row of ``publication_info``.
PAPER_REFERENCE_KEYS = {"pagination", "title", "volume", "year"}
def to_str(x):
    """Join a list into a ``|`` separated string.

    Args:
        x: a list of strings, or any other value.

    Returns:
        the joined string when ``x`` is a list, otherwise ``x`` unchanged.

    """
    if isinstance(x, list):
        return "|".join(x)
    return x
class RecordPubli(Record): class RecordPubli(Record, PluginAuthors, PluginPublicationInfo):
"""Article, preprint, proceeding, report and talk from cds.cern.ch or """Article, preprint, proceeding, report and talk from cds.cern.ch or
old.inspirehep.net. old.inspirehep.net.
...@@ -283,58 +264,6 @@ class RecordPubli(Record): ...@@ -283,58 +264,6 @@ class RecordPubli(Record):
# replace # replace
self["publication_info"] = df self["publication_info"] = df
def authors(self, sep=", ", sort=False):
    """Return the author(s) signing the publication as a single string.

    Args:
        sep (str):
            string inserted between two author names.
            The default is the comma followed by a space.

        sort (bool):
            forwarded to :func:`authors_as_list`: sort authors by family
            name when true, otherwise keep the order of the record.

    Returns:
        str:
            * Author names joined with the ``sep`` argument.
            * Empty string when the list of authors is empty.

    """
    return sep.join(self.authors_as_list(sort=sort))
def authors_as_list(self, sort=False):
    """Return the formatted names of the author(s) as a list.

    Args:
        sort (bool):
            order the names by the ``last_name`` column when true,
            otherwise order them by the index of the ``authors`` table.

    Returns:
        list:
            * the content of the ``fmt_name`` column.
            * the list is empty when the table contains a single row
              whose formatted name is the empty string.

    """
    table = self["authors"]

    if sort:
        names = table[["last_name", "fmt_name"]].sort_values(by="last_name")
        names = names.fmt_name
    else:
        names = table.fmt_name.sort_index()

    result = names.tolist()

    # a lone empty name is the placeholder for "no authors"
    if len(result) == 1 and result[0] == "":
        return []

    return result
def collaboration(self): def collaboration(self):
"""The collaboration(s) signing the publication. """The collaboration(s) signing the publication.
...@@ -347,286 +276,6 @@ class RecordPubli(Record): ...@@ -347,286 +276,6 @@ class RecordPubli(Record):
li = self._get("corporate_name", "collaboration", force_list=True) li = self._get("corporate_name", "collaboration", force_list=True)
return CLEAN_COLLABORATION(", ".join(li)) return CLEAN_COLLABORATION(", ".join(li))
def find_affiliation(self, pattern):
    """Find the affiliation matching the regular expression *pattern*.

    Args:
        pattern (str):
            regular expression defining the affiliation keys.
            It has to be built for an exact match, namely containing
            start and end of string. This is required to separate
            `Ecole Polytechnique` from `Ecole Polytechnique, Lausanne`.

    Returns:
        str:
            - the affiliation or the first one when several are found.
            - empty string when nothing is found.

    """
    affiliations = self["authors"].affiliation

    # na=False: authors without affiliation (NaN) count as "no match".
    # Without it the mask contains NaN and cannot be used for selection.
    mask = affiliations.str.match(pattern, na=False)

    found = affiliations[mask].unique()
    return found[0] if len(found) > 0 else ""
def find_authors(self, pattern, sep=", ", sort=False):
    """Find the authors whose formatted name contains *pattern*.

    Args:
        pattern (str):
            regular expression searched in the ``fmt_name`` column.

        sep (str):
            string inserted between two author names.
            The default is the comma followed by a space.

        sort (bool):
            order the names by the ``last_name`` column when true,
            otherwise order them by the index of the ``authors`` table.

    Returns:
        str:
            * The matching names joined with the ``sep`` argument.
            * Empty string when nothing is found.

    """
    table = self["authors"]
    mask = table.fmt_name.str.contains(pattern)

    if sort:
        selected = table.loc[mask, ["last_name", "fmt_name"]]
        selected = selected.sort_values(by="last_name")
    else:
        selected = table.loc[mask, ["fmt_name"]]
        selected = selected.sort_index()

    names = selected.fmt_name
    if len(names) == 0:
        return ""

    return sep.join(names)
def find_authors_by_affiliation(self, pattern, sep=", ", sort=False):
    """Find the authors belonging to the institute(s) defined by a
    regular expression.

    Args:
        pattern (str):
            regular expression defining the affiliation keys
            for the institute(s).

        sep (str):
            string inserted between two author names.
            The default is the comma followed by a space.

        sort (bool):
            order the names by the ``last_name`` column when true,
            otherwise order them by the index of the ``authors`` table.

    Returns:
        str:
            * Author names joined with the ``sep`` argument.
            * Empty string when authors are not found.

    """
    df = self["authors"]

    # na=False: authors without affiliation (NaN) are never selected.
    # Without it the mask contains NaN and ``df.loc`` rejects it.
    query = df.affiliation.str.contains(pattern, na=False)

    if sort:
        data = (df.loc[query, ["last_name", "fmt_name"]]
                .sort_values(by="last_name")
                .fmt_name)
    else:
        data = (df.loc[query, ["fmt_name"]]
                .sort_index()
                .fmt_name)

    return sep.join(data) if len(data) > 0 else ""
def first_author(self):
    """Return the formatted name of the first author.

    Returns:
        str:
            the ``fmt_name`` value found in the first row of the
            ``authors`` table.

    """
    formatted_names = self["authors"].fmt_name
    return formatted_names.iloc[0]
def first_author_institutes(self):
    """The institute(s) associated to the first author.

    Returns:
        str:
            - the ``affiliation`` value of the first row of the
              ``authors`` table (names are separated by ``|``).
            - The string is empty when that value is missing (NaN / None).

    """
    # local import keeps the block self-contained
    from pandas import isna

    val = self["authors"].affiliation.iloc[0]

    # NaN never compares equal to itself, so ``val == NaN`` cannot detect
    # a missing affiliation; ``isna`` does.
    return "" if isna(val) else val
def institutes(self):
    """Return the list of institutes signing the publication.

    Returns:
        list:
            unique institute names sorted in ascending order.

    """
    # one column per affiliation: "A|B" becomes two cells
    expanded = self["authors"].affiliation.str.split("|", expand=True)

    # stack the columns on top of each other, ignoring the empty cells
    pieces = [column.dropna() for _, column in expanded.items()]
    merged = concat(pieces, ignore_index=True)

    # sort first, then drop the duplicates
    ordered = merged.sort_values()
    return ordered.unique().tolist()
def is_affiliations(self):
    """``True`` when affiliations are defined for authors.

    Note:
        This is a quick check: it only detects the case where the
        ``authors`` table has a single row with an empty ``affiliation``.
        To check that the affiliation is defined for all authors,
        use the method :func:`is_affiliation_for_all`.

    Returns:
        bool:

    """
    table = self["authors"]
    is_placeholder = len(table) == 1 and table.affiliation.iloc[0] == ""
    return not is_placeholder
def is_affiliation_for_all(self):
    """``True`` when the affiliation is defined for all authors.

    An affiliation is considered undefined when it is the empty string
    or NaN.

    Returns:
        bool:

    """
    affiliations = self["authors"].affiliation
    undefined = affiliations.isin(["", NaN])
    return not undefined.any()
def is_authors(self):
    """``True`` when authors are defined.

    Authors are defined when the ``authors`` table has the columns
    ``first_name``, ``full_name`` and ``last_name`` and is not reduced
    to a single row with an empty ``full_name``.

    Returns:
        bool:

    """
    table = self["authors"]

    required = {"first_name", "full_name", "last_name"}
    has_all_columns = len(table.columns.intersection(required)) == 3
    if not has_all_columns:
        return False

    is_placeholder = len(table) == 1 and table.full_name.iloc[0] == ""
    return not is_placeholder
def is_published(self):
    """``True`` when the record is published and contains a full set
    of publication information (title, volume, year and pagination).

    Returns:
        bool:

    """
    # NOTE
    #   * the table can contain one or more rows due to erratum.
    #   * assume that the first row is the oldest one and corresponds to
    #     the first publication.
    #   * a cell contains the empty string when the value is not defined.
    #   * iloc[0] returns a series indexed by the column names.
    first_row = self["publication_info"].iloc[0]

    # keep only the keys with a non-empty value
    filled = first_row.replace("", np.nan).dropna()

    found = filled.index.intersection(PAPER_REFERENCE_KEYS)
    return len(found) == 4
def is_with_erratum(self):
    """``True`` when the record contains erratum data, namely when the
    ``publication_info`` table has more than one row.

    Returns:
        bool

    """
    number_of_rows = len(self["publication_info"])
    return number_of_rows > 1
def paper_editor(self):
    """The abbreviated version of the review, *e.g* Phys Lett B.

    Returns:
        str:
            * the ``title`` found in the first row of the
              ``publication_info`` table.
            * empty string when the table has no ``title`` column.

    """
    info = self["publication_info"]
    if "title" not in info:
        return ""
    return info["title"].iloc[0]
def paper_pages(self):
    """The page number / range when the record is published in a review.

    Returns:
        str:
            * the ``pagination`` found in the first row of the
              ``publication_info`` table, *e.g.* "45-67" or "234".
            * empty string when the table has no ``pagination`` column.

    """
    info = self["publication_info"]
    if "pagination" not in info:
        return ""
    return info["pagination"].iloc[0]
def paper_reference(self):
    """The full reference for a publication published in a review.

    Returns:
        str:
            * title, volume, year and pagination of the first row of
              the ``publication_info`` table separated by a space,
              *e.g.* "Phys Lett B 456 2010 5-6".
            * empty string when one of these four columns is missing.

    """
    info = self["publication_info"]

    required = {"title", "volume", "year", "pagination"}
    if len(info.columns.intersection(required)) != 4:
        return ""

    first_row = info.iloc[0]
    parts = [first_row[key]
             for key in ("title", "volume", "year", "pagination")]

    return " ".join(parts).strip()
def paper_url(self): def paper_url(self):
"""The URL of the preprint. """The URL of the preprint.
...@@ -651,28 +300,6 @@ class RecordPubli(Record): ...@@ -651,28 +300,6 @@ class RecordPubli(Record):
return "" return ""
def paper_volume(self):
    """The volume number when the record is published in a review.

    Returns:
        str:
            * the ``volume`` found in the first row of the
              ``publication_info`` table.
            * empty string when the table has no ``volume`` column.

    """
    info = self["publication_info"]
    if "volume" not in info:
        return ""
    return info["volume"].iloc[0]
def paper_year(self):
    """The year of the publication.

    Returns:
        str:
            * the ``year`` found in the first row of the
              ``publication_info`` table.
            * empty string when the table has no ``year`` column.

    """
    info = self["publication_info"]
    if "year" not in info:
        return ""
    return info["year"].iloc[0]
def preprint_number(self): def preprint_number(self):
"""The ArXiv preprint number. """The ArXiv preprint number.
...@@ -693,68 +320,6 @@ class RecordPubli(Record): ...@@ -693,68 +320,6 @@ class RecordPubli(Record):
return "" return ""
def reformat_authors(self, fmt="Last, First"):
    """Reformat names of authors.

    The formatted names are written in the ``fmt_name`` column of the
    ``authors`` table. The default formatting is ``Last, First``.

    Args:
        fmt (str):
            define the new format for author names.
            Possible values are ``First, Last``, ``F. Last``, ``Last``,
            ``Last, First`` and ``Last F.``.

    Raises:
        RecordException:
            the argument ``fmt`` is not valid.

    """
    if fmt not in AUTHOR_FORMATS:
        raise RecordException(MSG_INVALID_FMT)

    # nothing to do when the requested format is already applied
    if fmt == self._last_fmt_author:
        return

    self._last_fmt_author = fmt
    df = self["authors"]

    # ....................................................................
    #
    # Compute initial for the first name
    #
    # The initials are kept in a local series. Storing them in a temporary
    # column and removing it with ``df = df.drop(...)`` only rebinds the
    # local name, so the column would stay in the stored table.
    #
    if fmt in ("F. Last", "Last F."):
        initial = (df.first_name
                   .fillna("")
                   .apply(to_initial))

    # ....................................................................
    #
    # Format
    #
    if fmt == "Last, First":
        df["fmt_name"] = df.last_name + ", " + df.first_name

    elif fmt == "First, Last":
        df["fmt_name"] = df.first_name + ", " + df.last_name

    elif fmt == "F. Last":
        df["fmt_name"] = initial + " " + df.last_name

    elif fmt == "Last":
        df["fmt_name"] = df.last_name

    elif fmt == "Last F.":
        df["fmt_name"] = df.last_name + " " + initial
def report_number(self): def report_number(self):
"""The report number(s) associated to the publication. """The report number(s) associated to the publication.
...@@ -776,7 +341,7 @@ class RecordPubli(Record): ...@@ -776,7 +341,7 @@ class RecordPubli(Record):
return ", ".join(sorted(li)) return ", ".join(sorted(li))
# INSPIRE # OLD.INSPIRE
if "primary_report_number" in self: if "primary_report_number" in self:
data = self["primary_report_number"] data = self["primary_report_number"]
......
...@@ -23,6 +23,7 @@ import pandas as pd ...@@ -23,6 +23,7 @@ import pandas as pd
import pytest import pytest
from store_tools import load_record from store_tools import load_record
from store_tools.recordpubli import RecordPubli
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
...@@ -30,7 +31,11 @@ def record(): ...@@ -30,7 +31,11 @@ def record():
return load_record("cds.cern.ch", 1951625) return load_record("cds.cern.ch", 1951625)
def test_constructor_cds_04001(record): def test_upcast_cds_04001(record):
assert isinstance(record, RecordPubli)
def test_constructor_cds_04002(record):
"""test the method _process_authors and _process_publication_info. """test the method _process_authors and _process_publication_info.
""" """
...@@ -63,11 +68,11 @@ def test_constructor_cds_04001(record): ...@@ -63,11 +68,11 @@ def test_constructor_cds_04001(record):
# #
# Section devoted to authors # Section devoted to authors
# #
def test_is_authors_cds_04002(record): def test_is_authors_cds_04010(record):
assert record.is_authors() assert record.is_authors()
def test_authors_as_list_cds_04003(record): def test_authors_as_list_cds_04011(record):
authors = record.authors_as_list() authors = record.authors_as_list()
assert len(authors) == record["number_of_authors"] assert len(authors) == record["number_of_authors"]
...@@ -77,16 +82,16 @@ def test_authors_as_list_cds_04003(record): ...@@ -77,16 +82,16 @@ def test_authors_as_list_cds_04003(record):
assert authors[-1] == "Zvyagin, Alexander" assert authors[-1] == "Zvyagin, Alexander"
def test_first_author_cds_04004(record): def test_first_author_cds_04012(record):
assert record.first_author() == "Aaij, Roel" assert record.first_author() == "Aaij, Roel"
def test_find_authors_cds_04005(record): def test_find_authors_cds_04013(record):
assert record.find_authors("Leo") == \ assert record.find_authors("Leo") == \
"Beaucourt, Leo, Kravchuk, Leonid, Leo, Sabato" "Beaucourt, Leo, Kravchuk, Leonid, Leo, Sabato"
def test_reformat_author_cds_04006(record): def test_reformat_author_cds_04014(record):
record.reformat_authors("F. Last") record.reformat_authors("F. Last")
authors = record.authors_as_list() authors = record.authors_as_list()
...@@ -105,12 +110,12 @@ def test_reformat_author_cds_04006(record): ...@@ -105,12 +110,12 @@ def test_reformat_author_cds_04006(record):
# #
# Section devoted to affiliation # Section devoted to affiliation
# #
def test_is_affiliations_cds_04007(record): def test_is_affiliations_cds_04020(record):
assert record.is_affiliations() assert record.is_affiliations()
assert record.is_affiliation_for_all() assert record.is_affiliation_for_all()
def test_institutes_cds_04008(record): def test_institutes_cds_04021(record):
institutes = record.institutes() institutes = record.institutes()
...@@ -119,7 +124,7 @@ def test_institutes_cds_04008(record): ...@@ -119,7 +124,7 @@ def test_institutes_cds_04008(record):