Commit e22f1a62 authored by LE GAC Renaud
Browse files

Update CheckAndFix and its tests.

parent b7ca0e1b
......@@ -5,7 +5,7 @@
import re
import regex
from base import format_author_fr, search_synonym, ToolException
from base import search_synonym, ToolException
from exception import CheckException
from gluon import current
from invenio_tools import (DECODE_REF,
......@@ -14,11 +14,9 @@ from invenio_tools import (DECODE_REF,
OAI_URL,
RecordConf,
RecordThesis,
REG_AUTHOR,
REG_OAI,
REG_YEAR)
from itertools import imap
from pandas import DataFrame
from plugin_dbui import CLEAN_SPACES, get_id
......@@ -94,13 +92,17 @@ class CheckAndFix(object):
self.__reference = None
# private cache for my authors list
self.__my_authors = {}
self._my_authors = {}
def _get_reg_institute(self):
"""
"""Get the regular expression defining the affiliation of my institute.
It is obtained by concatenating the affiliation keys.
Affiliation keys can contain characters like ``(``, ``)`` or ``&``.
They are replaced by ``\(`` *etc*.
Returns:
unicode: the regular expression defining the affiliation
of my institute.
unicode:
"""
# alias
......@@ -117,6 +119,15 @@ class CheckAndFix(object):
u"|".join(imap(
lambda row: u"^%(key_u)s%(key_v)s" % row, iterselect))
# protect special character
reg_institute = (reg_institute
.replace("(", "\\(")
.replace(")", "\\)")
.replace("&", "\\&")
.replace("$", "\\$")
.replace("+", "\\+")
.replace("?", "\\?"))
return reg_institute
def _get_author_rescue_list(self, record, id_project, id_team):
......@@ -315,7 +326,7 @@ class CheckAndFix(object):
if not record.is_authors():
raise CheckException(MSG_NO_AUTHOR)
if isinstance(record.first_author(), list):
if len(record[u"100"]) > 1:
raise CheckException(MSG_TO_MANY_FAUTHOR)
def clean_erratum(self, record):
......@@ -459,44 +470,18 @@ class CheckAndFix(object):
return False
def format_authors(self, record, func):
"""Format the author names using the function func.
def format_authors(self, record, fmt="Last, First"):
"""Format the author names.
Args:
record (RecordPubli): record describing a publication.
func (reference): function used to format the author names.
fmt (str):
define the format for author names.
Possible values are "First, Last", "F. Last", "Last",
"Last, First" and "Last F."
"""
for key in (u"100", u"700"):
if key in record:
if isinstance(record[key], list):
for i in xrange(len(record[key])):
if "a" in record[key][i]:
# PROTECTION
# see RecordPubli.author_as_list
value = record[key][i]["a"]
if isinstance(value, unicode):
record[key][i]["a"] = func(value)
elif isinstance(value, list):
for elt in value:
if REG_AUTHOR.match(elt):
record[key][i]["a"] = func(elt)
else:
if "a" in record[key]:
value = record[key]["a"]
# PROTECTION
# see RecordPubli.authors_as_list
if isinstance(value, unicode):
record[key]["a"] = func(value)
elif isinstance(value, list):
for elt in value:
if REG_AUTHOR.match(elt):
record[key][i]["a"] = func(elt)
record.reformat_authors(fmt)
def format_editor(self, record):
"""Format the editor abbreviation. The encoding
......@@ -619,19 +604,21 @@ class CheckAndFix(object):
value = value.replace('U.', university)
record[u'502']['b'][i] = value
def get_my_authors(self, record, cmpFct=None):
def get_my_authors(self, record, sep=u", ", sort=False):
"""Get authors of my institutes signing the record.
The information is appended to the Record object via the attribute
``my_authors``.
Args:
record (RecordPubli): record describing a publication.
cmpFct (reference): extract the family name from the full name.
It is used to sort my author list according to the
author family name.
sep (unicode):
string separating author names. The default is the comma.
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
unicode: the list of authors separated by comma
unicode: the list of authors separated by the ``sep`` argument.
Raises:
CheckException: when the list is empty
......@@ -639,17 +626,17 @@ class CheckAndFix(object):
"""
# might have been computed when affiliation is checked
rec_id = record.id()
if rec_id in self.__my_authors:
li = self.__my_authors[rec_id]
li.sort(key=cmpFct)
value = u', '.join(li)
if rec_id in self._my_authors:
li = self._my_authors[rec_id]
value = sep.join(li)
# find authors of my institute signing the record
else:
reg_institute = self.reg_institute
value = record.find_authors_by_affiliation(reg_institute, cmpFct)
value = \
record.find_authors_by_affiliation(reg_institute, sep, sort)
if not value:
if len(value) == 0:
raise CheckException(MSG_NO_MY_AUTHOR)
record.my_authors = value
......@@ -680,11 +667,13 @@ class CheckAndFix(object):
if not isinstance(record, RecordThesis):
raise CheckException(MSG_NO_THESIS)
def my_affiliation(self,
record,
id_project,
id_team,
func=format_author_fr):
def my_affiliation(
self,
record,
id_project,
id_team,
fmt_rescue="F. Last",
sort=False):
"""Check that authors of my institute are signatories.
Launch a recovery procedure when affiliations are not defined.
......@@ -694,14 +683,26 @@ class CheckAndFix(object):
record (RecordPubli): record describing a publication.
id_project (int): identifier of the project in the database
id_team (int): identifier of the team in the database
func (reference): function used to format the author names.
fmt_rescue (str):
the format for the authors used in the rescue list
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
str:
* the found affiliation
* an empty string when the rescue list is used.
Raises:
CheckException: when there is no authors from my institute.
CheckException:
when the rescue list is required but empty,
or when the intersection between the rescue list
and the authors is empty.
"""
value = record.find_affiliation(self.reg_institute)
if value:
if len(value) > 0:
return value
# affiliation is not defined
......@@ -711,20 +712,31 @@ class CheckAndFix(object):
raise CheckException(MSG_NO_MY_AUTHOR)
# format the author in the same way as the rescue list
# compute the intersection between the authors and the rescue list
df = (DataFrame(record.authors_as_list(), columns=["raw_author"])
.assign(format_author=lambda x:
x.raw_author.apply(lambda y: func(y)))
.set_index("format_author"))
fmt_ref = record._last_fmt_author
record.reformat_authors(fmt_rescue)
if sort:
authors = (record[u"700"][["last_name", "fmt_name"]]
.sort_values(by="last_name")
.fmt_name)
else:
authors = (record[u"700"].fmt_name
.sort_index())
# go back to the origin formatting
record.reformat_authors(fmt_ref)
rescue_list = [el.decode("utf-8") for el in rescue_list]
intersection = df.index & rescue_list
# compute the intersection between the authors and the rescue list
intersection = set(authors) & set(rescue_list)
if intersection.size == 0:
if len(intersection) == 0:
raise CheckException(MSG_NO_MY_AUTHOR)
# cache the result for later use
self.__my_authors[record.id()] = intersection.values.tolist()
self._my_authors[record.id()] = list(intersection)
return u""
def paper_reference(self, record):
"""Check that editor, page, volume and paper year are defined
......
......@@ -21,9 +21,12 @@ Note:
import copy
import pytest
from gluon import current
from harvest_tools import CheckAndFix, family_name_fr, format_author_fr
from harvest_tools import CheckAndFix
from invenio_tools import load_record
from plugin_dbui import get_id
CPPM_AUTHORS = [u"S. Akar",
u"E. Aslanides",
......@@ -49,9 +52,9 @@ def recordfix(record):
svc = CheckAndFix()
svc.authors(rec)
svc.format_authors(rec, format_author_fr)
svc.format_authors(rec, fmt="F. Last")
svc.format_editor(rec)
svc.get_my_authors(rec, cmpFct=family_name_fr)
svc.get_my_authors(rec, sort=True)
svc.submitted(rec)
svc.year(rec)
......@@ -62,14 +65,15 @@ def test_find_authors_by_affiliation(recordfix):
svc = CheckAndFix()
rex = svc._get_reg_institute()
references = set(["^CPPM, Marseille",
"^Centre de Physique des Particules de Marseille (CPPM)",
"^Marseille, CPPM"])
references = set([
"^CPPM, Marseille",
"^Centre de Physique des Particules de Marseille \\(CPPM\\)",
"^Marseille, CPPM"])
values = set(rex.split("|"))
assert values == references
authors = recordfix.find_authors_by_affiliation(rex, family_name_fr)
authors = recordfix.find_authors_by_affiliation(rex)
assert authors.split(', ') == CPPM_AUTHORS
......@@ -78,13 +82,56 @@ def test_first_author(record, recordfix):
assert recordfix.first_author() == "R. Aaij"
def test_my_affiliation(record):
    """Check ``CheckAndFix.my_affiliation`` with and without affiliations.

    First the affiliation is expected to be found directly in the record,
    leaving the private author cache empty. Then the affiliation column of
    the record is blanked so that the rescue list has to be used; in that
    case an empty string is returned and the matching authors are cached.

    The test is skipped when the database holds no rescue list for the
    LHCb project / team and the year of the record.

    Args:
        record: fixture providing the reference publication record.

    """
    db = current.db
    rec = copy.deepcopy(record)

    # the rescue-list branch can only be exercised when a rescue list exists
    id_project = get_id(db.projects, project="LHCb")
    id_team = get_id(db.teams, team="LHCb")
    year = rec.year()

    id_rec = get_id(
        db.my_authors,
        id_projects=id_project,
        id_teams=id_team,
        year=year)

    if id_rec is None:
        # report the test as skipped instead of silently passing
        pytest.skip("no rescue list in the database for LHCb")

    svc = CheckAndFix()
    value = svc.my_affiliation(rec, id_project, id_team, "F. Last")

    # the affiliation is found in the record, nothing is cached
    assert value == "Marseille, CPPM"
    assert svc._my_authors == {}

    # delete the affiliation
    # and check that the affiliation is performed via the rescue list
    rec[u"700"].u = ""
    value = svc.my_affiliation(rec, id_project, id_team, "F. Last")

    assert value == ""

    # the cached list is built from a set intersection, therefore its order
    # is not guaranteed: compare the content regardless of the order
    expected = [
        'R. Le Gac',
        'M. Perrin-Terrin',
        'E. Aslanides',
        'J. Cogan',
        'J. Serrano',
        'W. Kanso',
        'S. Akar',
        'O. Leroy',
        'G. Mancinelli']

    assert sorted(svc._my_authors[record.id()]) == sorted(expected)
def test_my_authors(recordfix):
    """Check that the authors of my institute match the reference list.

    Args:
        recordfix: fixture providing the record whose ``my_authors``
            attribute holds the author names separated by ``', '``.

    """
    found = recordfix.my_authors.split(', ')
    assert found == CPPM_AUTHORS
def test_paper_editor(record, recordfix):
assert record.paper_editor() == "J. Instrum."
assert recordfix.paper_editor() == "J. Instrum."
assert record.paper_editor() == "JINST"
assert recordfix.paper_editor() == "JINST"
def test_submitted(record, recordfix):
......
......@@ -15,7 +15,7 @@ import copy
import pytest
from gluon import current
from harvest_tools import CheckAndFix, family_name_fr, format_author_fr
from harvest_tools import CheckAndFix
from invenio_tools import load_record
@pytest.fixture(scope="module")
......@@ -29,11 +29,11 @@ def recordfix(record):
svc = CheckAndFix()
svc.authors(rec)
svc.format_authors(rec, format_author_fr)
svc.format_authors(rec, fmt="F. Last")
return rec
def test_first_author(record, recordfix):
assert record.first_author() == "Lees, J.P."
assert recordfix.first_author() == "J.-P. Lees"
assert recordfix.first_author() == "J. P. Lees"
......@@ -20,7 +20,7 @@ Note:
import copy
import pytest
from harvest_tools import CheckAndFix, family_name_fr, format_author_fr
from harvest_tools import CheckAndFix
from invenio_tools import load_record
......@@ -38,9 +38,9 @@ def recordfix(record):
svc.authors(rec)
svc.country(rec)
svc.conference_date(rec, "cds.cern.ch")
svc.format_authors(rec, format_author_fr)
svc.format_authors(rec, "F. Last")
svc.format_editor(rec)
svc.get_my_authors(rec, family_name_fr)
svc.get_my_authors(rec,sort=True)
svc.submitted(rec)
svc.year(rec)
......
......@@ -23,9 +23,7 @@ import copy
import pytest
from harvest_tools import (CheckAndFix,
CheckException,
format_author_fr)
from harvest_tools import CheckAndFix, CheckException
from invenio_tools import load_record
......@@ -43,7 +41,7 @@ def recordfix(record):
svc.authors(rec)
svc.country(rec)
svc.conference_date(rec, "inspirehep.net")
svc.format_authors(rec, format_author_fr)
svc.format_authors(rec, "F. Last")
svc.format_editor(rec)
svc.submitted(rec)
svc.year(rec)
......@@ -64,8 +62,7 @@ def test_authors(record, recordfix):
def test_my_authors_exception(recordfix):
svc = CheckAndFix()
with pytest.raises(CheckException):
svc.get_my_authors(recordfix)
svc.get_my_authors(recordfix)
def test_submitted(record, recordfix):
......@@ -74,6 +71,6 @@ def test_submitted(record, recordfix):
def test_year(record, recordfix):
assert record.year() == ""
assert record.year() == "2013"
assert recordfix.year() == "2013"
......@@ -2,7 +2,7 @@
"""CheckAndFix non conformities
"""
from harvest_tools import CheckAndFix, format_author_fr
from harvest_tools import CheckAndFix
from invenio_tools import load_record
......@@ -11,11 +11,12 @@ def test_protection_format_authors():
record = load_record('inspirehep.net', 1386663)
svc = CheckAndFix()
svc.format_authors(record, format_author_fr)
svc.format_authors(record, "F. Last")
authors = record[u"700"]
assert len(authors) == 139
assert authors[128]["a"] == u"J. Zúñiga"
assert len(authors) == 140
assert authors.iloc[0].fmt_name == u"S. Adrián-Martínez"
assert authors.iloc[128].fmt_name == u"J. D. Zornoza"
assert authors.iloc[139].fmt_name == u"D. M. Coward"
......@@ -16,7 +16,7 @@ Note:
import copy
import pytest
from harvest_tools import CheckAndFix, format_author_fr
from harvest_tools import CheckAndFix
from invenio_tools import load_record
......@@ -33,7 +33,7 @@ def recordfix(record):
svc.authors(rec)
svc.country(rec)
svc.conference_date(rec, "cds.cern.ch")
svc.format_authors(rec, format_author_fr)
svc.format_authors(rec, "F. Last")
svc.format_editor(rec)
svc.submitted(rec)
svc.year(rec)
......
......@@ -18,7 +18,7 @@ Note:
import pytest
from harvest_tools import CheckAndFix, family_name_fr, format_author_fr
from harvest_tools import CheckAndFix
from invenio_tools import load_record, RecordThesis
......@@ -28,8 +28,8 @@ def record():
svc = CheckAndFix()
svc.authors(rec)
svc.format_authors(rec, format_author_fr)
svc.get_my_authors(rec, family_name_fr)
svc.format_authors(rec, "F. Last")
svc.get_my_authors(rec, sort=True)
svc.submitted(rec)
svc.year(rec)
......
......@@ -20,7 +20,7 @@ import copy
import pytest
from harvest_tools import CheckAndFix, family_name_fr, format_author_fr
from harvest_tools import CheckAndFix
from invenio_tools import load_record
......@@ -34,8 +34,8 @@ def recordfix(record):
svc = CheckAndFix()
svc.authors(rec)
svc.format_authors(rec, format_author_fr)
svc.get_my_authors(rec, family_name_fr)
svc.format_authors(rec, "F. Last")
svc.get_my_authors(rec, sort=True)
svc.submitted(rec)
svc.year(rec)
......
......@@ -18,7 +18,7 @@ Note:
import pytest
from harvest_tools import CheckAndFix, family_name_fr, format_author_fr
from harvest_tools import CheckAndFix
from invenio_tools import load_record, RecordThesis
......@@ -28,8 +28,8 @@ def record():
svc = CheckAndFix()
svc.authors(rec)
svc.format_authors(rec, format_author_fr)
svc.get_my_authors(rec, family_name_fr)
svc.format_authors(rec, "F. Last")
svc.get_my_authors(rec, sort=True)
svc.submitted(rec)
svc.year(rec)
......
......@@ -9,7 +9,7 @@
import pytest
from gluon import current
from harvest_tools import CheckAndFix, format_author_fr
from harvest_tools import CheckAndFix
from invenio_tools import load_record
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment