Commit b0613eb4 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Add protection when finding authors (triggered by ins1386663).

parent 02a7f39d
......@@ -5,6 +5,7 @@
import re
from gluon import current
from invenio_tools import REG_AUTHOR
DRY_RUN = "dry run"
......@@ -49,7 +50,7 @@ def format_author_fr(name):
# Family, First
# To avoid dealing with unicode characters,
# look for non-empty strings (\S)
match = re.match(r'(.+), (\S+)( |\-)*(\S+)*', name)
match = REG_AUTHOR.match(name)
# reformat the name as L. Family
# or keep it as it is
......
......@@ -10,6 +10,7 @@ from base import (ARXIV,
is_thesis,
OAI_URL,
REG_ARXIV_NUMBER,
REG_AUTHOR,
REG_OAI,
REG_YEAR,
THESIS_DIR)
......
......@@ -13,8 +13,20 @@ MSG_NO_THESIS = "Reject no thesis information"
# URL template with two "%s" placeholders
OAI_URL = "http://%s/record/%s"

# Regular expressions are written as raw strings so that the backslash
# sequences (\d, \., \S, ...) reach the re module untouched and do not
# trigger "invalid escape sequence" warnings.
REG_ARXIV_NUMBER = re.compile(r"\d+\.\d+")

# Author names are encoded as:
#   Family, L
#   Family, P L
#   Family, M -H
#   Family Name, J
#   Family-Name, J
#   Family, F Name
#   Family, First
# To avoid dealing with unicode characters,
# look for non-empty strings (\S)
REG_AUTHOR = re.compile(r"(.+), (\S+)( |\-)*(\S+)*")

REG_OAI = re.compile(r"oai:([a-z\.]+):([\d]+)")
REG_YEAR = re.compile(r"(\d{4})")

THESIS_DIR = u"dir."
......
......@@ -5,7 +5,12 @@
import re
import regex
from base import MSG_NO_CONF, MSG_NO_THESIS, OAI_URL, REG_OAI, REG_YEAR
from base import (MSG_NO_CONF,
MSG_NO_THESIS,
OAI_URL,
REG_AUTHOR,
REG_OAI,
REG_YEAR)
from exception import CheckException
from filters import CLEAN_REVIEW
from gluon import current
......@@ -339,13 +344,33 @@ class CheckAndFix(object):
# Apply func to the "a" entry of record[u"100"] and record[u"700"].
# Each of these fields is either a list of dictionaries or a single
# dictionary; both shapes are handled by the same inner loop.
for key in (u"100", u"700"):
    if key not in record:
        continue

    if isinstance(record[key], list):
        fields = record[key]
    else:
        # single entry: wrap it so that it is updated in place below
        # (the previous code indexed it with a stale/undefined "i")
        fields = [record[key]]

    for field in fields:
        if "a" not in field:
            continue

        value = field["a"]

        # PROTECTION
        # see RecordPubli.authors_as_list
        # In most cases the author is a string, but it can be a list,
        # e.g. [u'Zuniga, J.', u'(the A.N.T.ARES. Collaboration)'].
        if isinstance(value, unicode):
            field["a"] = func(value)

        elif isinstance(value, list):
            # keep the first element which looks like an author name,
            # as RecordPubli.authors_as_list does
            for elt in value:
                if REG_AUTHOR.match(elt):
                    field["a"] = func(elt)
                    break
def format_editor(self, record):
"""Format the editor abbreviation since the encoding
......
......@@ -5,7 +5,12 @@
import re
from base import ARXIV, ARXIV_PDF, REG_ARXIV_NUMBER, REG_YEAR, THESIS_DIR
from base import (ARXIV,
ARXIV_PDF,
REG_ARXIV_NUMBER,
REG_AUTHOR,
REG_YEAR,
THESIS_DIR)
from filters import CLEAN_COLLABORATION
from plugin_dbui import CLEAN_SPACES
from record import Record
......@@ -66,7 +71,8 @@ class RecordPubli(Record):
"""
authors = []
# NOTE: the content of the 700 field depends on the record type.
# NOTE
# the content of the 700 field depends on the record type.
# For a thesis it also contains the name of the director
if u"700" in self and isinstance(self[u"700"], dict):
if not ("e" in self[u"700"] and self[u"700"]["e"] == THESIS_DIR):
......@@ -78,7 +84,20 @@ class RecordPubli(Record):
continue
if "a" in di:
authors.append(di["a"])
author = di["a"]
# PROTECTION
# in most cases the author is a string
# but it can be a list, e.g. inspirehep.net/1386663:
# [u'Zuniga, J.', u'(the A.N.T.ARES. Collaboration)']
if isinstance(author, unicode):
authors.append(di["a"])
elif isinstance(author, list):
for elt in author:
if REG_AUTHOR.match(elt):
authors.append(elt)
break
return authors
......
# -*- coding: utf-8 -*-
"""CheckAndFix non conformities
"""
from invenio_tools import CheckAndFix, load_record
from harvest_tools import format_author_fr
def test_protection_format_authors():
    """Check format_authors when an author value is a list.

    Uses record inspirehep.net/1386663, which contains the author value
    [u'Zuniga, J.', u'(the A.N.T.ARES. Collaboration)'].
    """
    rec = load_record('inspirehep.net', 1386663)
    CheckAndFix().format_authors(rec, format_author_fr)

    fields_700 = rec[u"700"]
    assert len(fields_700) == 139
    assert fields_700[128]["a"] == u"J. Zuniga"
......@@ -8,6 +8,14 @@ Protections are added in the record methods to correct them
from invenio_tools import load_record
def test_protection_authors_as_list():
    """Check authors_as_list when an author value is a list.

    Uses record inspirehep.net/1386663, which contains the author value
    [u'Zuniga, J.', u'(the A.N.T.ARES. Collaboration)'].
    """
    rec = load_record('inspirehep.net', 1386663)
    names = rec.authors_as_list()

    assert len(names) == 139
    assert names[128] == u"Zuniga, J."
def test_protection_oai():
"""['oai:cds.cern.ch:1513204', 'oai:cds.cern.ch:1512766']"""
record = load_record('cds.cern.ch', 1513204)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment