Commit fe4859a0 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Update RecordPubli and RecordThesis to implement DataFrame for author/affiliation.

parent 28d07ec0
......@@ -15,6 +15,8 @@ from iterauthors import (iter_author_affiliations,
iter_author_items,
iter_author_names)
from itertools import ifilter, imap
from numpy import NaN
from pandas import concat, DataFrame, merge
from plugin_dbui import as_list, CLEAN_SPACES
from record import Record
......@@ -29,6 +31,10 @@ DECODE_REF = [re.compile(_ref1), re.compile(_ref2)]
PAPER_REFERENCE_KEYS = set(["c", "p", "v", "y"])
def to_str(x):
    """Flatten a list of strings into a single ``|``-separated string.

    Args:
        x: a list of strings, or any other value.

    Returns:
        The items of *x* joined by ``|`` when *x* is a list,
        otherwise *x* itself, unchanged.

    """
    if not isinstance(x, list):
        return x

    return "|".join(x)
class RecordPubli(Record):
"""The MARC record describing a publication.
Usual publications are article, preprint, proceeding, report and talk.
......@@ -55,6 +61,99 @@ class RecordPubli(Record):
+-----------------------+---------+----------+
"""
def __init__(self, *args):
    """Build the record and convert its author fields.

    Args:
        *args: positional arguments handed over, unchanged,
            to ``Record.__init__``.

    """
    # construction is delegated to the base class ...
    Record.__init__(self, *args)

    # ... then the author fields 100 and 700 are turned into DataFrame
    self._process_authors()
def _process_authors(self):
    """Convert the author fields 100 and 700 into DataFrame.

    For each field:

        * Keep only the subfields "a" (author name), "e" (phd director,
          equal to "dir.") and "u" (affiliation).
        * Make sure that the columns "a" and "u" always exist.
        * Replace missing affiliations by an empty string.
        * Convert a list of affiliations into a string in which values
          are separated by "|".

    The authors are spread over the 100 and 700 fields.
    The method deals with cases where:

        * the first author is defined in 100 but not in 700
        * the first author is not defined in 100 but in 700
        * a thesis in which 700 contains the names of the directors
        * no author is defined at all: both fields are then set to a
          one-row DataFrame with an empty name and an empty affiliation.

    On return ``self[u"100"]`` and ``self[u"700"]`` are both DataFrame.

    """
    # ....................................................................
    #
    # Instantiate DataFrame for field 100 and 700
    #
    frames = {u"100": None, u"700": None}

    # iterate on the dictionary itself: works with python 2 and 3
    for key in frames:

        if key not in self:
            continue

        data = self[key]
        if not isinstance(data, list):
            data = [data]

        df = DataFrame(data)

        # protection -- empty field is handled as a missing one
        if len(df) == 0:
            continue

        # only keep columns:
        #    - "a": author name
        #    - "e": phd director (equal to "dir.")
        #    - "u": affiliation(s)
        unwanted = df.columns.difference(["a", "e", "u"])
        df = df.drop(unwanted, axis="columns")

        # protection -- author name and / or affiliation not defined
        # both columns are read later on, they have to exist
        for column in ("a", "u"):
            if column not in df.columns:
                df[column] = ""

        # protection -- missing affiliation
        # convert list of affiliation to string
        # in which values are separated by |
        df["u"] = df["u"].fillna("").apply(to_str)

        frames[key] = df

    # alias
    d100, d700 = frames[u"100"], frames[u"700"]

    # ....................................................................
    #
    # protection -- more than one first author
    # the case with duplicate author name (build affiliation)
    #
    if d100 is not None and len(d100) > 1:

        # group on the column label, not on a list of labels,
        # in order to get the name as a scalar and not as a tuple
        grouped = d100.groupby("a", sort=False)

        if grouped.ngroups == 1:
            name, group = next(iter(grouped))

            # skip undefined affiliation
            # NaN is the only value which is not equal to itself
            affiliations = [el for el in group["u"]
                            if el not in ("", None) and el == el]

            d100 = DataFrame({"a": [name],
                              "u": ["|".join(affiliations)]})

    # ....................................................................
    #
    # the authors are spread over the 100 and 700 field.
    # deal with cases where the first author is defined in 100
    # but not in 700, first author is defined in 100 and in 700
    # or no author in 100
    #
    if d100 is not None and d700 is not None:
        # first author only defined in 100, prepend it to the 700 list
        if len(d100) == 1 and d100["a"].iloc[0] != d700["a"].iloc[0]:
            d700 = concat([d100, d700], ignore_index=True)

    elif d700 is not None:
        # no 100 field: the first author is the first row of 700
        d100 = DataFrame(d700.iloc[0]).transpose()

    elif d100 is not None:
        # no 700 field: the author list is reduced to the first author
        d700 = d100

    else:
        # authors are not defined
        d100 = d700 = DataFrame({"a": [""], "u": [""]})

    self[u"100"] = d100
    self[u"700"] = d700
def authors(self, cmpFct=None):
"""The author(s) signing the publication.
......@@ -72,6 +171,7 @@ class RecordPubli(Record):
"""
li = self.authors_as_list()
if cmpFct:
li.sort(key=cmpFct)
return u", ".join(li)
......@@ -83,7 +183,12 @@ class RecordPubli(Record):
list: the list is empty when authors are not defined.
"""
return list(iter_author_names(self))
li = self[u"700"].a.tolist()
if len(li) == 1 and li[0] == "":
li = []
return li
def collaboration(self):
"""The collaboration(s) signing the publication.
......@@ -100,9 +205,6 @@ class RecordPubli(Record):
def find_affiliation(self, pattern):
"""Find affiliation matching the regular expression *pattern*.
Affiliation keys are obtained by concatenating the "u" and "v"
keys of the author field 100 and 700.
Args:
pattern (unicode): regular expression defining the
affiliation keys.
......@@ -113,74 +215,53 @@ class RecordPubli(Record):
- empty string when nothing is found.
"""
regex = re.compile(pattern)
# modify the pattern to capture group
pattern = "(%s)" % pattern
# NOTE: an author can have several affiliations
for iter_key in iter_author_affiliation_keys(self):
li = list(ifilter(regex.match, iter_key))
if len(li) > 0:
return li[0]
return u""
series = self[u"700"].u.str.extract(pattern, expand=False).dropna()
return (series.iloc[0] if len(series) > 0 else u"")
def find_authors(self, pattern):
"""Find authors matching the regular expression *pattern*.
"""Find authors containing the regular expression *pattern*.
Args:
pattern (unicode): regular expression defining the author name(s).
Returns:
unicode:
* Author names are separated by a comma.
* Author names are separated by ``|``.
* The string is empty when nothing is found.
"""
regex = re.compile(pattern)
return u", ".join(ifilter(regex.search, iter_author_names(self)))
df = self[u"700"]
def find_authors_by_affiliation(self, pattern, cmpFct=None):
"""Find authors belonging to a given institute(s) defined by a regular
expression. The search is performed on the affiliation keys.
query = df.a.str.contains(pattern)
df = df.loc[query, ["a"]]
return (u"" if len(df) == 0 else u"|".join(df.a))
Affiliation keys are obtained by concatenating the "u" and "v" keys
of the author field 100 and 700.
def find_authors_by_affiliation(self, pattern):
"""Find authors belonging to a given institute(s) defined by a regular
expression.
Args:
pattern (unicode): regular expression defining the affiliation keys
for the institute(s).
cmpFct (reference): function to compare author names.
The comparison function takes two items and returns -1, 0, or 1
depending on whether the first argument is considered smaller
than, equal to, or larger than the second one.
Returns:
unicode:
* Author names are separated by a comma.
* Author are sorted according to the function *cmpFct*.
* Author names are separated by ``|``.
* Author are sorted according to their family name.
* Empty string when authors are not found.
"""
# authors not defined
if not self.is_authors():
return u""
# filter the list using affiliation key(s)
regex = re.compile(pattern)
df = self[u"700"]
# extract author name from the item
authors = list(imap(
lambda x: x[0],
ifilter(
lambda x: len(list(ifilter(regex.search, x[1]))) > 0,
iter_author_items(self))))
# sort the list
if cmpFct:
authors.sort(key=cmpFct)
query = df.u.str.contains(pattern)
df = df.loc[query, ["a"]]
return u", ".join(authors)
df.a = df.a.str.encode("utf-8").sort_values()
return ("|".join(df.a) if len(df) > 0 else "").decode("utf-8")
def first_author(self):
"""The name of the first author.
......@@ -191,7 +272,7 @@ class RecordPubli(Record):
- List of name when there is more than one.
"""
return iter_author_names(self).next()
return self[u"700"].a.iloc[0]
def first_author_institutes(self):
"""The institute(s) associated to the first author.
......@@ -202,11 +283,12 @@ class RecordPubli(Record):
Returns:
unicode:
- names are separated by a comma.
- names are separated by ``|``.
- The string is empty when institutes are not defined.
"""
return u", ".join(iter_author_affiliations(self).next())
val = self[u"700"].u.iloc[0]
return ("" if val == NaN else val)
def institutes(self):
"""The list of institute signing the publication.
......@@ -219,16 +301,17 @@ class RecordPubli(Record):
list: the list is sorted in alphabetical order.
"""
myset = set()
# expand multi-affiliation (one per column)
df = self[u"700"].u.str.split("|", expand=True)
for elt in iter_author_affiliations(self):
myset.update(elt)
# merge all columns into a single one,
# sort and remove duplicate entries
li = [df[el].dropna() for el in df.columns]
df = (concat(li, ignore_index=True)
.sort_values()
.unique())
# sort institute in alphabetic order
myli = list(myset)
myli.sort()
return myli
return df.tolist()
def is_affiliations(self):
"""``True`` when affiliations are defined for authors.
......@@ -243,16 +326,13 @@ class RecordPubli(Record):
bool:
"""
for field in (u"100", u"700"):
if field in self:
if isinstance(self[field], dict):
if "u" not in self[field]:
return False
df = self[u"700"]
if "u" not in df.columns:
return False
elif isinstance(self[field], list):
for i in (1, -1):
if "u" not in self[field][i]:
return False
if len(df) == 1 and df.u.iloc[0] == "":
return False
return True
......@@ -263,8 +343,10 @@ class RecordPubli(Record):
bool:
"""
return len(list(ifilter(
lambda x: len(x) == 0, iter_author_affiliations(self)))) == 0
df = self[u"700"]
query = df.u.isin(["", NaN])
return df.u[query].size == 0
def is_authors(self):
"""``True`` when authors are defined.
......@@ -273,7 +355,15 @@ class RecordPubli(Record):
bool:
"""
return u"100" in self or u"700" in self
df = self[u"700"]
if "a" not in df.columns:
return False
if len(df) == 1 and df.a.iloc[0] == "":
return False
return True
def is_published(self):
"""``True`` if the record is published.
......
......@@ -39,11 +39,19 @@ class RecordThesis(RecordPubli):
list: the list is empty when authors are not defined.
"""
# for a thesis, the author field 700 contains names of director
# which have to be removed.
# for a thesis, the author field 700 contains names of author
# as well as directors. The latter have to be removed.
df = self[u"700"]
iter_filter = ifilterfalse(is_thesis_dir, iter_author_fields(self))
return list(imap(author_name, iter_filter))
query = df.e != THESIS_DIR
df = df.loc[query]
li = df.a.tolist()
if len(li) == 1 and li[0] == "":
li = []
return li
def these_defense(self):
"""The defence date for a master/phd thesis.
......@@ -77,15 +85,18 @@ class RecordThesis(RecordPubli):
Returns:
unicode:
* Names are separated by a comma.
* Names are separated by ``|``.
* Empty string when it is not defined.
"""
# for a thesis, the author field 700 field contains
# names of the director as well as the name of authors
df = self[u"700"]
query = df.e == THESIS_DIR
df = df.loc[query]
iter_filter = ifilter(is_thesis_dir, iter_author_fields(self))
return u", ".join(list(imap(author_name, iter_filter)))
return (u"|".join(df.a) if len(df) > 0 else u"")
def these_town(self):
"""The town where the thesis took place.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment