Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit 38150847 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Refactor recordpubli and recordthesis to process phd director's name.

parent 5db30cca
......@@ -40,6 +40,69 @@ PAPER_REFERENCE_KEYS = set(["c", "p", "v", "y"])
REG_INITIAL = initial = r"^(\w+)\.?(\-)* *(\w+)*\.?"
def format_names(df, fmt):
    """Helper function to format author and PhD director names.

    The formatted name is stored in the column ``fmt_name`` of ``df``,
    which is modified in place and returned. No temporary column is
    added to ``df`` while computing the initials.

    Args:
        df (pandas.DataFrame):

            +------------+---------------------------+
            | column     |                           |
            +------------+---------------------------+
            | a          | author name (Last, First) |
            | u          | affiliation(s)            |
            | first_name | first name                |
            | last_name  | family name               |
            +------------+---------------------------+

        fmt (str):
            define the new format for author names.
            Possible values are "First, Last", "F. Last", "Last",
            "Last, First" and "Last F.".
            Note that "First, Last" joins the first name and the family
            name with a single space (no comma).
            Any other value leaves ``df`` untouched.

    Returns:
        pandas.DataFrame:

            +------------+---------------------------+
            | column     |                           |
            +------------+---------------------------+
            | a          | author name (Last, First) |
            | u          | affiliation(s)            |
            | first_name | first name                |
            | last_name  | family name               |
            | fmt_name   | formatted name            |
            +------------+---------------------------+

    """
    # Compute initial for the first name.
    # Kept in a local Series rather than in a scratch column of df, so the
    # caller's frame never carries a leftover "initial" column.
    initial = None
    if fmt in ("F. Last", "Last F."):
        from pandas import Series

        parts = (df.first_name.str.extract(REG_INITIAL, expand=True)
                 .fillna(""))

        # Row-wise loop instead of DataFrame.apply: apply does not return
        # a Series when the frame is empty.
        initial = Series(
            [to_initial(x, y, z)
             for x, y, z in parts.itertuples(index=False, name=None)],
            index=df.index,
            dtype=object)

    # Format
    if fmt == "Last, First":
        df["fmt_name"] = df.a

    elif fmt == "First, Last":
        df["fmt_name"] = df.first_name + " " + df.last_name

    elif fmt == "F. Last":
        df["fmt_name"] = initial + " " + df.last_name

    elif fmt == "Last":
        df["fmt_name"] = df.last_name

    elif fmt == "Last F.":
        df["fmt_name"] = df.last_name + " " + initial

    return df
def to_initial(x, y, z):
"""Help function to extract initial from a first name split in x, y and z:
......@@ -678,7 +741,8 @@ class RecordPubli(Record):
"Last, First" and "Last F."
Raises:
RecordException: if fmt is not valid.
RecordException:
when fmt is not valid.
"""
if fmt not in AUTHOR_FORMATS:
......@@ -689,52 +753,8 @@ class RecordPubli(Record):
self._last_fmt_author = fmt
# alias
d100, d700 = self["100"], self["700"]
# ....................................................................
#
# Compute initial for the first name
#
if fmt in ("F. Last", "Last F."):
for df in (d100, d700):
dfm = (df.first_name.str.extract(REG_INITIAL, expand=True)
.fillna(""))
df["initial"] = dfm.apply(
lambda x: to_initial(x[0], x[1], x[2]), axis="columns")
# ....................................................................
#
# Format
#
if fmt == "Last, First":
d100["fmt_name"] = d100.a
d700["fmt_name"] = d700.a
elif fmt == "First, Last":
d100["fmt_name"] = d100.first_name + ", " + d100.last_name
d700["fmt_name"] = d700.first_name + " " + d700.last_name
elif fmt == "F. Last":
d100["fmt_name"] = d100.initial + " " + d100.last_name
d700["fmt_name"] = d700.initial + " " + d700.last_name
elif fmt == "Last":
d100["fmt_name"] = d100.last_name
d700["fmt_name"] = d700.last_name
elif fmt == "Last F.":
d100["fmt_name"] = d100.last_name + " " + d100.initial
d700["fmt_name"] = d700.last_name + " " + d700.initial
# ....................................................................
#
# Clean initial column
#
if fmt in ("F. Last", "Last F."):
d100 = d100.drop("initial", axis="columns")
d700 = d700.drop("initial", axis="columns")
self["100"] = format_names(self["100"], fmt)
self["700"] = format_names(self["700"], fmt)
def report_number(self):
"""The report number(s) associated to the publication.
......
......@@ -3,7 +3,7 @@
"""
from .base import THESIS_DIR
from filters import CLEAN_THESIS_DEFENSE
from .recordpubli import RecordPubli
from .recordpubli import format_names, RecordPubli
from pandas import DataFrame
......@@ -30,6 +30,26 @@ class RecordThesis(RecordPubli):
+-----------------------+---------+----------+
"""
def _process_authors(self):
    """Process author and PhD director names.

    Authors are processed by ``RecordPubli._process_authors``.
    When the record contains the field ``701`` (PhD directors), that
    field is converted into a DataFrame and completed with the columns
    ``last_name``, ``first_name`` and ``fmt_name``. The latter is
    initialised with the raw name found in the column ``a``.

    """
    RecordPubli._process_authors(self)

    # PhD directors
    if "701" not in self:
        return

    df = DataFrame(self["701"])

    # Split the raw name at the commas. When no name contains a comma the
    # split yields a single column, so reindex to guarantee that both the
    # family-name (0) and first-name (1) columns exist; a missing part
    # stays null. The object dtype keeps the .str accessor usable on a
    # column containing only nulls.
    names = (df.a.str.split(",", expand=True)
             .reindex(columns=[0, 1])
             .astype(object))

    df["last_name"] = names[0].str.strip()
    df["first_name"] = names[1].str.strip()

    df["fmt_name"] = df.a

    self["701"] = df
def authors_as_list(self, sort=False):
"""The list of author(s) signing the publication.
......@@ -63,6 +83,28 @@ class RecordThesis(RecordPubli):
return li
def reformat_authors(self, fmt="Last, First"):
    """Apply a new name format to the authors and to the PhD directors.

    Records coming from cds/invenio use ``Last, First`` by default.
    The authors are delegated to ``RecordPubli.reformat_authors``; the
    directors stored in the field ``701``, when present, are then
    reformatted with ``format_names``.

    Args:
        fmt (str):
            the requested format for the names, one of
            "First, Last", "F. Last", "Last", "Last, First"
            or "Last F."

    Raises:
        RecordException:
            when fmt is not valid.

    """
    RecordPubli.reformat_authors(self, fmt)

    # directors are optional: only touch the field when it exists
    if "701" in self:
        directors = self["701"]
        self["701"] = format_names(directors, fmt)
def these_defense(self):
"""The defence date for a master/phd thesis.
......@@ -101,18 +143,13 @@ class RecordThesis(RecordPubli):
"""
# for a thesis, the field 700 contains the names of the directors
# as well as the names of the authors
df = self["700"]
if "e" in df.columns:
query = df.e == THESIS_DIR
df = df.loc[query]
# discovered in 2019 that the director's name moved to the field 701
elif "701" in self:
df = DataFrame(self["701"])
if "e" in df.columns:
query = df.e == THESIS_DIR
df = (df.loc[query]
.assign(fmt_name=lambda x: x.a))
for df in (self.get("700"), self.get("701")):
if (df is None) or ("e" not in df.columns):
continue
df = df.loc[df.e == THESIS_DIR]
break
return (sep.join(df.fmt_name) if len(df) > 0 else "")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment