Commit 54b04413 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Merge branch '86-dataframe-authors' into 'master'

Resolve "Use pandas.DataFrame in record for author and their affiliation"

Closes #86

See merge request !86
parents 28d07ec0 123eed1b
......@@ -10,8 +10,6 @@ from harvest_tools import (build_harvester_tool,
CheckAndFix,
CheckException,
DRY_RUN,
format_author_fr,
family_name_fr,
search_synonym,
ToolException)
from invenio_tools import (load_record,
......@@ -158,12 +156,12 @@ def edit_insert():
# authors
try:
check.authors(record)
check.format_authors(record, format_author_fr)
check.format_authors(record, fmt="F. Last")
check.my_affiliation(
record, selector.id_projects, selector.id_teams)
check.get_my_authors(record, cmpFct=family_name_fr)
check.get_my_authors(record, sort=True)
except CheckException:
pass
......
harvest_tools.base.format_author_fr
===================================
.. currentmodule:: harvest_tools.base
.. autofunction:: format_author_fr
\ No newline at end of file
......@@ -50,7 +50,6 @@ Helper functions
:toctree: generated/
~base.family_name_fr
~base.format_author_fr
~base.learn_my_authors
~base.search_synonym
......
......@@ -11,7 +11,6 @@ from base import (DRY_RUN,
MSG_NO_ENTRY,
MSG_TOOMANY_SYNONYM,
family_name_fr,
format_author_fr,
search_synonym)
from automaton import Automaton
......
......@@ -6,9 +6,7 @@ import traceback
from automaton import Automaton
from base import (family_name_fr,
format_author_fr,
learn_my_authors,
from base import (learn_my_authors,
MSG_CRASH,
MSG_FIX_ORIGIN,
MSG_IN_DB,
......@@ -63,8 +61,8 @@ class Articles(Automaton):
self.check.submitted(record)
self.check.year(record)
self.check.format_authors(record, format_author_fr)
self.check.get_my_authors(record, family_name_fr)
self.check.format_authors(record, fmt="F. Last")
self.check.get_my_authors(record, sort=True)
except CheckException as e:
self.logs[-1].reject(e, record=record)
......
......@@ -3,7 +3,6 @@
"""
from exception import ToolException
from invenio_tools import REG_AUTHOR
from plugin_dbui import get_id, UNDEF_ID
......@@ -31,59 +30,6 @@ def family_name_fr(full_name):
return full_name[full_name.find(' ') + 1:]
def format_author_fr(name):
    """Rewrite an author name as ``J.-P. Doe`` (French typographic rules).

    The name is parsed with ``REG_AUTHOR``: group 1 is the family name,
    group 2 the first name and group 3 the optional second part of the
    first name. Only the first character of each first-name part is kept.

    Note:
        * ``None`` and the empty string are returned as they are.
        * When the name does not match ``REG_AUTHOR`` it is not rearranged,
          but it is still title-cased like any other result.

    Args:
        name (unicode): full name. Possible patterns are:

            * ``Family, L``
            * ``Family, L.``
            * ``Family, P L``
            * ``Family, P.L.``
            * ``Family, P.-L.``
            * ``Family, M -H``
            * ``Family Name, J``
            * ``Family-Name, J``
            * ``Family, F Name``
            * ``Family, First``
            * ...

    Returns:
        unicode: the author name encoded as ``J.-P. Doe``.

    """
    # nothing to format
    if name is None or name == '':
        return name

    found = REG_AUTHOR.match(name)

    if not found:
        # unknown pattern: keep the name as it is
        formatted = name

    elif found.group(3):
        # compound first name: J.-P. Family
        initials = (found.group(2)[0], found.group(3)[0])
        formatted = '%s.-%s. %s' % (initials + (found.group(1),))

    else:
        # simple first name: J. Family
        formatted = '%s. %s' % (found.group(2)[0], found.group(1))

    # avoid author name in upper case (R. LE FOO --> R. Le Foo)
    return formatted.title()
def learn_my_authors(db,
authors=None,
id_project=None,
......
......@@ -5,7 +5,7 @@
import re
import regex
from base import format_author_fr, search_synonym, ToolException
from base import search_synonym, ToolException
from exception import CheckException
from gluon import current
from invenio_tools import (DECODE_REF,
......@@ -14,11 +14,9 @@ from invenio_tools import (DECODE_REF,
OAI_URL,
RecordConf,
RecordThesis,
REG_AUTHOR,
REG_OAI,
REG_YEAR)
from itertools import imap
from pandas import DataFrame
from plugin_dbui import CLEAN_SPACES, get_id
......@@ -94,13 +92,17 @@ class CheckAndFix(object):
self.__reference = None
# private cache for my authors list
self.__my_authors = {}
self._my_authors = {}
def _get_reg_institute(self):
"""
"""Get the regular expression defining the affiliation of my institute.
It is obtained by concatenating the affiliation keys.
Affiliation keys can contain characters like ``(``, ``)`` or ``&``.
They are replaced by ``\(`` *etc*.
Returns:
unicode: the regular expression defining the affiliation
of my institute.
unicode:
"""
# alias
......@@ -117,6 +119,15 @@ class CheckAndFix(object):
u"|".join(imap(
lambda row: u"^%(key_u)s%(key_v)s" % row, iterselect))
# protect special character
reg_institute = (reg_institute
.replace("(", "\\(")
.replace(")", "\\)")
.replace("&", "\\&")
.replace("$", "\\$")
.replace("+", "\\+")
.replace("?", "\\?"))
return reg_institute
def _get_author_rescue_list(self, record, id_project, id_team):
......@@ -315,7 +326,7 @@ class CheckAndFix(object):
if not record.is_authors():
raise CheckException(MSG_NO_AUTHOR)
if isinstance(record.first_author(), list):
if len(record[u"100"]) > 1:
raise CheckException(MSG_TO_MANY_FAUTHOR)
def clean_erratum(self, record):
......@@ -459,44 +470,18 @@ class CheckAndFix(object):
return False
def format_authors(self, record, func):
"""Format the author names using the function func.
def format_authors(self, record, fmt="Last, First"):
"""Format the author names.
Args:
record (RecordPubli): record describing a publication.
func (reference): function used to format the author names.
fmt (str):
define the format for author names.
Possible values are "First, Last", "F. Last", "Last",
"Last, First" and "Last F."
"""
for key in (u"100", u"700"):
if key in record:
if isinstance(record[key], list):
for i in xrange(len(record[key])):
if "a" in record[key][i]:
# PROTECTION
# see RecordPubli.author_as_list
value = record[key][i]["a"]
if isinstance(value, unicode):
record[key][i]["a"] = func(value)
elif isinstance(value, list):
for elt in value:
if REG_AUTHOR.match(elt):
record[key][i]["a"] = func(elt)
else:
if "a" in record[key]:
value = record[key]["a"]
# PROTECTION
# see RecordPubli.authors_as_list
if isinstance(value, unicode):
record[key]["a"] = func(value)
elif isinstance(value, list):
for elt in value:
if REG_AUTHOR.match(elt):
record[key][i]["a"] = func(elt)
record.reformat_authors(fmt)
def format_editor(self, record):
"""Format the editor abbreviation. The encoding
......@@ -619,19 +604,21 @@ class CheckAndFix(object):
value = value.replace('U.', university)
record[u'502']['b'][i] = value
def get_my_authors(self, record, cmpFct=None):
def get_my_authors(self, record, sep=u", ", sort=False):
"""Get authors of my institutes signing the record.
The information is appended to the Record object via the attribute
``my_authors``.
Args:
record (RecordPubli): record describing a publication.
cmpFct (reference): extract the family name from the full name.
It is used to sort my author list according to the
author family name.
sep (unicode):
string separating author names. The default is the comma.
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
unicode: the list of authors separated by comma
unicode: the list of authors separated by the ``sep`` argument.
Raises:
CheckException: when the list is empty
......@@ -639,17 +626,17 @@ class CheckAndFix(object):
"""
# might have been computed when affiliation is checked
rec_id = record.id()
if rec_id in self.__my_authors:
li = self.__my_authors[rec_id]
li.sort(key=cmpFct)
value = u', '.join(li)
if rec_id in self._my_authors:
li = self._my_authors[rec_id]
value = sep.join(li)
# find authors of my institute signing the record
else:
reg_institute = self.reg_institute
value = record.find_authors_by_affiliation(reg_institute, cmpFct)
value = \
record.find_authors_by_affiliation(reg_institute, sep, sort)
if not value:
if len(value) == 0:
raise CheckException(MSG_NO_MY_AUTHOR)
record.my_authors = value
......@@ -680,11 +667,13 @@ class CheckAndFix(object):
if not isinstance(record, RecordThesis):
raise CheckException(MSG_NO_THESIS)
def my_affiliation(self,
record,
id_project,
id_team,
func=format_author_fr):
def my_affiliation(
self,
record,
id_project,
id_team,
fmt_rescue="F. Last",
sort=False):
"""Check that authors of my institute are signatories.
Launch a recovery procedure when affiliations are not defined.
......@@ -694,14 +683,26 @@ class CheckAndFix(object):
record (RecordPubli): record describing a publication.
id_project (int): identifier of the project in the database
id_team (int): identifier of the team in the database
func (reference): function used to format the author names.
fmt_rescue (str):
the format for the authors used in the rescue list
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
str:
* the found affiliation
* an empty string when the rescue list is used.
Raises:
CheckException: when there is no authors from my institute.
CheckException:
when the rescue list is required but empty
or because the intersection between the rescue list
and the author is null.
"""
value = record.find_affiliation(self.reg_institute)
if value:
if len(value) > 0:
return value
# affiliation is not defined
......@@ -711,20 +712,31 @@ class CheckAndFix(object):
raise CheckException(MSG_NO_MY_AUTHOR)
# format the author in the same way as the rescue list
# compute the intersection between the authors and the rescue list
df = (DataFrame(record.authors_as_list(), columns=["raw_author"])
.assign(format_author=lambda x:
x.raw_author.apply(lambda y: func(y)))
.set_index("format_author"))
fmt_ref = record._last_fmt_author
record.reformat_authors(fmt_rescue)
if sort:
authors = (record[u"700"][["last_name", "fmt_name"]]
.sort_values(by="last_name")
.fmt_name)
else:
authors = (record[u"700"].fmt_name
.sort_index())
# go back to the origin formatting
record.reformat_authors(fmt_ref)
rescue_list = [el.decode("utf-8") for el in rescue_list]
intersection = df.index & rescue_list
# compute the intersection between the authors and the rescue list
intersection = set(authors) & set(rescue_list)
if intersection.size == 0:
if len(intersection) == 0:
raise CheckException(MSG_NO_MY_AUTHOR)
# cache the result for a later use
self.__my_authors[record.id()] = intersection.values.tolist()
self._my_authors[record.id()] = list(intersection)
return u""
def paper_reference(self, record):
"""Check that editor, page, volume and paper year are defined
......
......@@ -6,7 +6,7 @@ import traceback
from automaton import Automaton
from base import family_name_fr, format_author_fr, MSG_CRASH, MSG_LOAD
from base import MSG_CRASH, MSG_LOAD
from checkandfix import CheckException
from plugin_dbui import UNDEF_ID
......@@ -36,8 +36,8 @@ class Notes(Automaton):
self.check.submitted(record)
self.check.year(record)
self.check.format_authors(record, format_author_fr)
self.check.get_my_authors(record, family_name_fr)
self.check.format_authors(record, fmt="F. Last")
self.check.get_my_authors(record, sort=True)
except CheckException as e:
self.logs[-1].reject(e, record=record)
......
......@@ -6,7 +6,7 @@ import traceback
from automaton import Automaton
from base import family_name_fr, format_author_fr, MSG_CRASH, MSG_LOAD
from base import MSG_CRASH, MSG_LOAD
from checkandfix import CheckException
from invenio_tools import RecordConf, RecordThesis
from plugin_dbui import UNDEF_ID
......@@ -59,8 +59,8 @@ class Preprints(Automaton):
self.check.submitted(record)
self.check.year(record)
self.check.format_authors(record, format_author_fr)
self.check.get_my_authors(record, family_name_fr)
self.check.format_authors(record, fmt="F. Last")
self.check.get_my_authors(record, sort=True)
except CheckException as e:
self.logs[-1].reject(e, record=record)
......
......@@ -6,7 +6,7 @@ import traceback
from automaton import Automaton
from base import family_name_fr, format_author_fr, MSG_CRASH, MSG_LOAD
from base import MSG_CRASH, MSG_LOAD
from checkandfix import CheckException
from plugin_dbui import UNDEF_ID
......@@ -45,8 +45,8 @@ class Proceedings(Automaton):
self.check.publisher(record)
self.check.paper_reference(record)
self.check.format_authors(record, format_author_fr)
self.check.get_my_authors(record, family_name_fr)
self.check.format_authors(record, fmt="F. Last")
self.check.get_my_authors(record, sort=True)
except CheckException as e:
self.logs[-1].reject(e, record=record)
......
......@@ -6,7 +6,7 @@ import traceback
from automaton import Automaton
from base import family_name_fr, format_author_fr, MSG_CRASH, MSG_LOAD
from base import MSG_CRASH, MSG_LOAD
from checkandfix import CheckException
from plugin_dbui import get_id, UNDEF_ID, UNKNOWN
......@@ -43,8 +43,8 @@ class Reports(Automaton):
self.check.submitted(record)
self.check.year(record)
self.check.format_authors(record, format_author_fr)
self.check.get_my_authors(record, family_name_fr)
self.check.format_authors(record, fmt="F. Last")
self.check.get_my_authors(record, sort=True)
except CheckException as e:
self.logs[-1].reject(e, record=record)
......@@ -80,9 +80,7 @@ class Reports(Automaton):
# allow undefined institute authors
try:
self.check.my_authors(record,
reference=self._my_author_list(record),
cmpFct=family_name_fr)
self.check.get_my_authors(record, sort=True)
authors_institute = record.my_authors
except CheckException:
......
......@@ -6,7 +6,7 @@ import traceback
from automaton import Automaton
from base import family_name_fr, format_author_fr, MSG_CRASH, MSG_LOAD
from base import MSG_CRASH, MSG_LOAD
from checkandfix import CheckException
from plugin_dbui import UNDEF_ID
......@@ -40,8 +40,8 @@ class Talks(Automaton):
self.check.submitted(record)
self.check.year(record)
self.check.format_authors(record, format_author_fr)
self.check.get_my_authors(record, family_name_fr)
self.check.format_authors(record, fmt="F. Last")
self.check.get_my_authors(record, sort=True)
except CheckException as e:
self.logs[-1].reject(e, record=record)
......
......@@ -7,7 +7,7 @@ import traceback
from automaton import Automaton
from base import family_name_fr, format_author_fr, MSG_CRASH, MSG_LOAD
from base import MSG_CRASH, MSG_LOAD
from checkandfix import CheckException
from invenio_tools import RecordThesis
from plugin_dbui import get_id, UNDEF_ID
......@@ -40,8 +40,8 @@ class Thesis(Automaton):
self.check.year(record)
self.check.format_universities(record)
self.check.format_authors(record, format_author_fr)
self.check.get_my_authors(record, family_name_fr)
self.check.format_authors(record, fmt="F. Last")
self.check.get_my_authors(record, sort=True)
except CheckException as e:
self.logs[-1].reject(e, record=record)
......
......@@ -28,11 +28,6 @@ from exception import (CdsException,
XmlException)
from inveniostore import InvenioStore
from iterauthors import (iter_author_affiliations,
iter_author_affiliation_keys,
iter_author_fields,
iter_author_items,
iter_author_names)
from iterrecord import IterRecord, REG_INT
from marc12 import Marc12
from record import Record
......
# -*- coding: utf-8 -*-
""" invenio_tools.iterauthors
"""
from base import REG_AUTHOR
from itertools import chain, imap, izip_longest
def to_list(x):
    """Wrap ``x`` in a list unless it already is one.

    Args:
        x: any object.

    Returns:
        list: ``x`` itself when it is a ``list`` instance,
        otherwise the one-element list ``[x]``.

    """
    if isinstance(x, list):
        return x
    return [x]
def affiliation_keys(field):
    """Build the affiliation key(s) of an author field.

    Each key is the concatenation of one value of the "u" subfield with
    the value of the "v" subfield found at the same position. Both
    subfields may hold a single value or a list of values. When "v" is
    missing or shorter than "u", the empty string is used in its place.

    Note:
        An author can have several affiliations.

    Args:
        field (dict): author field

    Returns:
        itertools.imap: iterator over the affiliation keys, or an empty
        tuple when the field has no "u" subfield.

    """
    if "u" in field:
        u_values = to_list(field["u"])
        v_values = to_list(field["v"]) if "v" in field else []

        # [u1, u2, u3], [v1, v2] --> (u1, v1), (u2, v2), (u3, "")
        pairs = izip_longest(u_values, v_values, fillvalue="")

        # (u1, v1) --> u1v1
        return imap(lambda pair: u"%s%s" % pair, pairs)

    return ()
def author_item(field):
    """Build the author item for an author field.

    The item pairs the content of the "a" subfield (the author name)
    with the affiliation keys returned by ``affiliation_keys``.

    Args:
        field (dict): author field

    Returns:
        tuple: (author name, affiliation keys iterator), or
        ``(u"", u"")`` when the field has no "a" subfield.

    """
    if "a" in field:
        name = field["a"]
        return (name, affiliation_keys(field))

    return (u"", u"")
def author_name(field):
"""Extract the name from the author field.
Note:
In most cases the author is a string
but it can be a list, e.g inspirehep.net/1386663:
[u"Zuniga, J.", u"(the A.N.T.ARES. Collaboration)"]