Commit e8db7234 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Modify the regular expression REG_AUTHOR to treat more cases.

parent 6d8ae032
......@@ -41,12 +41,16 @@ def format_author_fr(name):
name (unicode): full name. Possible patterns are:
* ``Family, L``
* ``Family, L.``
* ``Family, P L``
* ``Family, P.L.``
* ``Family, P.-L.``
* ``Family, M -H``
* ``Family Name, J``
* ``Family-Name, J``
* ``Family, F Name``
* ``Family, First``
* ...
Returns:
unicode: the author name encoded as ``J.-P. Doe``.
......@@ -56,28 +60,18 @@ def format_author_fr(name):
if name == '' or name is None:
return name
# name are encoded Family, L
# Family, P L
# Family, M -H
# Family Name, J
# Family-Name, J
# Family, F Name
# Family, First
# To avoid to deal with unicode character
# look for non empty string \S
# decode the name Family, First + many variants
# group(1) family name
# group(2) first name
# group(3) second part of the first name
match = REG_AUTHOR.match(name)
# reformat the name as L. Family
# or keep it as it is
if match:
if match.group(3) and match.group(4):
result = '%s.%s%s. %s' % (match.group(2)[0], match.group(3)[0],
match.group(4)[0], match.group(1))
elif "-" in match.group(2):
li = [el[0] for el in match.group(2).split("-")]
li.append(match.group(1))
result = "%s.-%s. %s" % tuple(li)
if match.group(3):
tpl = (match.group(2)[0], match.group(3)[0], match.group(1))
result = '%s.-%s. %s' % tpl
else:
result = '%s. %s' % (match.group(2)[0], match.group(1))
......
......@@ -18,16 +18,12 @@ OAI_URL = u"http://%s/record/%s"
REG_ARXIV_NUMBER = re.compile("\d+\.\d+")
# name are encoded Family, L
# Family, P L
# Family, M -H
# Family Name, J
# Family-Name, J
# Family, F Name
# Family, First
# To avoid to deal with unicode character
# look for non empty string \S
REG_AUTHOR = re.compile(r"(.+), (\S+)( |\-)*(\S+)*")
# Names are encoded as "Family, First" where First can be "First-Second".
# Many variants are possible: initials, trailing dots, space or dash
# separators, ...
#   group(1) is the family name
#   group(2) is the part of the first name before the separator (" ", "-")
#   group(3) is the part of the first name after the separator (" ", "-"),
#            or None when the first name has a single part
# The second part is optional, hence "(\w+)?". Do not write "(\w+)*": a
# quantifier wrapped around "\w+" makes the engine backtrack exponentially
# on names that do not match (e.g. a long first name followed by "!").
REG_AUTHOR = re.compile(r"^([\w\- ]+), (\w+)\.?[\- ]*(\w+)?\.?$", re.UNICODE)
REG_OAI = re.compile(r"oai:([a-z\.]+):([\d]+)")
REG_YEAR = re.compile(r"(\d{4})")
......
# -*- coding: utf-8 -*-
"""ARTICLE
http://inspirehep.net/record/1278588.
The CheckAndFix corrections are applied.
Only the changes are checked
Note:
* The first author is duplicated
* authors are formatted as Lees, J.P.
"""
import copy
import pytest
from gluon import current
from harvest_tools import CheckAndFix, family_name_fr, format_author_fr
from invenio_tools import load_record
@pytest.fixture(scope="module")
def record():
    """Return what ``load_record`` gives for record 1278588 on inspirehep.net.

    The fixture is module-scoped.
    """
    loaded = load_record('inspirehep.net', 1278588)
    return loaded
@pytest.fixture(scope="module")
def recordfix(record):
    """Return a deep copy of *record* after the CheckAndFix author passes.

    The passes run on the copy, not on the object supplied by the ``record``
    fixture.
    """
    fixed = copy.deepcopy(record)

    checker = CheckAndFix()
    checker.authors(fixed)
    checker.format_authors(fixed, format_author_fr)

    return fixed
def test_first_author(record, recordfix):
    """Check the first author before and after the CheckAndFix passes."""
    raw_author = record.first_author()
    assert raw_author == "Lees, J.P."

    fixed_author = recordfix.first_author()
    assert fixed_author == "J.-P. Lees"
......@@ -2,16 +2,50 @@
"""test basic harvester functions
"""
import locale
import pytest
from gluon import current
from harvest_tools import format_author_fr, search_synonym, ToolException
from invenio_tools import load_record
def test_format_author():
    """Check ``format_author_fr`` on a few ``Family, First`` names."""
    cases = (
        ("Aaij, Roel", "R. Aaij"),
        ("Le Gac, Renaud", "R. Le Gac"),
        ("Bettler, Marc-Olivier", "M.-O. Bettler"),
    )
    for full_name, formatted in cases:
        assert format_author_fr(full_name) == formatted
def test_locale():
    """Check that the current locale is French with the UTF-8 encoding."""
    expected = ('fr_FR', 'UTF-8')
    assert locale.getlocale() == expected
def test_author_type():
    """Check that the author accessors of a record return ``unicode``."""
    rec = load_record('inspirehep.net', 1278588)

    first = rec.first_author()
    assert type(first) == unicode

    for name in rec.authors_as_list():
        assert type(name) == unicode
@pytest.mark.parametrize("value, expected", [
    # single first name, written in full or as an initial
    (u"Aaij, Roel", u"R. Aaij"),
    (u"Le Gac, Renaud", u"R. Le Gac"),
    (u"Le Gac, R.", u"R. Le Gac"),
    (u"Le Gac, R", u"R. Le Gac"),
    # hyphenated family name
    (u"Perrin-Terrin, Mathieu", u"M. Perrin-Terrin"),
    (u"Perrin-Terrin, M.", u"M. Perrin-Terrin"),
    (u"Perrin-Terrin, M", u"M. Perrin-Terrin"),
    # compound first name with the different separators, dots and accents
    (u"Schune, Marie-Hélène", u"M.-H. Schune"),
    (u"Schune, Marie-Helene", u"M.-H. Schune"),
    (u"Schune, Marie Helene", u"M.-H. Schune"),
    (u"Schune, M.H.", u"M.-H. Schune"),
    (u"Schune, M.-H.", u"M.-H. Schune"),
    (u"Schune, M. -H.", u"M.-H. Schune"),
    (u"Schune, M-H", u"M.-H. Schune"),
    (u"Schune, M -H", u"M.-H. Schune")
])
def test_format_author(value, expected):
    """Check that ``format_author_fr(value)`` equals *expected*.

    Args:
        value (unicode): author name written as ``Family, First``.
        expected (unicode): the formatted name, e.g. ``M.-H. Schune``.
    """
    # echo the input being checked (Python 2 print statement)
    print value
    # the parametrized inputs must be unicode strings
    assert type(value) == unicode
    assert format_author_fr(value) == expected
def test_search_synonym():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment