Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit b0613eb4 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Add protection when finding authors (triggered by ins1386663).

parent 02a7f39d
......@@ -5,6 +5,7 @@
import re
from gluon import current
from invenio_tools import REG_AUTHOR
DRY_RUN = "dry run"
......@@ -49,7 +50,7 @@ def format_author_fr(name):
# Family, First
# To avoid to deal with unicode character
# look for non empty string \S
match = re.match(r'(.+), (\S+)( |\-)*(\S+)*', name)
match = REG_AUTHOR.match(name)
# reformat the name as L. Family
# or keep it as it is
......
......@@ -10,6 +10,7 @@ from base import (ARXIV,
is_thesis,
OAI_URL,
REG_ARXIV_NUMBER,
REG_AUTHOR,
REG_OAI,
REG_YEAR,
THESIS_DIR)
......
......@@ -13,8 +13,20 @@ MSG_NO_THESIS = "Reject no thesis information"
OAI_URL = "http://%s/record/%s"
REG_ARXIV_NUMBER = re.compile("\d+\.\d+")
REG_OAI = re.compile('oai:([a-z\.]+):([\d]+)')
REG_YEAR = re.compile("(\d{4})")
# names are encoded as Family, L
# Family, P L
# Family, M -H
# Family Name, J
# Family-Name, J
# Family, F Name
# Family, First
# To avoid dealing with unicode characters,
# look for non-empty strings \S
REG_AUTHOR = re.compile(r"(.+), (\S+)( |\-)*(\S+)*")
REG_OAI = re.compile(r"oai:([a-z\.]+):([\d]+)")
REG_YEAR = re.compile(r"(\d{4})")
THESIS_DIR = u"dir."
......
......@@ -5,7 +5,12 @@
import re
import regex
from base import MSG_NO_CONF, MSG_NO_THESIS, OAI_URL, REG_OAI, REG_YEAR
from base import (MSG_NO_CONF,
MSG_NO_THESIS,
OAI_URL,
REG_AUTHOR,
REG_OAI,
REG_YEAR)
from exception import CheckException
from filters import CLEAN_REVIEW
from gluon import current
......@@ -339,13 +344,33 @@ class CheckAndFix(object):
for key in (u"100", u"700"):
if key in record:
if isinstance(record[key], list):
for i in range(len(record[key])):
for i in xrange(len(record[key])):
if "a" in record[key][i]:
record[key][i]["a"] = func(record[key][i]["a"])
# PROTECTION
# see RecordPubli.authors_as_list
value = record[key][i]["a"]
if isinstance(value, unicode):
record[key][i]["a"] = func(value)
elif isinstance(value, list):
for elt in value:
if REG_AUTHOR.match(elt):
record[key][i]["a"] = func(elt)
else:
if "a" in record[key]:
record[key]["a"] = func(record[key]["a"])
value = record[key]["a"]
# PROTECTION
# see RecordPubli.authors_as_list
if isinstance(value, unicode):
record[key]["a"] = func(value)
elif isinstance(value, list):
for elt in value:
if REG_AUTHOR.match(elt):
record[key][i]["a"] = func(elt)
def format_editor(self, record):
"""Format the editor abbreviation since the encoding
......
......@@ -5,7 +5,12 @@
import re
from base import ARXIV, ARXIV_PDF, REG_ARXIV_NUMBER, REG_YEAR, THESIS_DIR
from base import (ARXIV,
ARXIV_PDF,
REG_ARXIV_NUMBER,
REG_AUTHOR,
REG_YEAR,
THESIS_DIR)
from filters import CLEAN_COLLABORATION
from plugin_dbui import CLEAN_SPACES
from record import Record
......@@ -66,7 +71,8 @@ class RecordPubli(Record):
"""
authors = []
# NOTE: the content of the 700 field depend on the record type.
# NOTE
# the content of the 700 field depend on the record type.
# For thesis it also contains the name of the director
if u"700" in self and isinstance(self[u"700"], dict):
if not ("e" in self[u"700"] and self[u"700"]["e"] == THESIS_DIR):
......@@ -78,7 +84,20 @@ class RecordPubli(Record):
continue
if "a" in di:
authors.append(di["a"])
author = di["a"]
# PROTECTION
# in most cases the author is a string
# but it can be a list, e.g. inspirehep.net/1386663:
# [u'Zuniga, J.', u'(the A.N.T.ARES. Collaboration)']
if isinstance(author, unicode):
authors.append(di["a"])
elif isinstance(author, list):
for elt in author:
if REG_AUTHOR.match(elt):
authors.append(elt)
break
return authors
......
# -*- coding: utf-8 -*-
"""CheckAndFix non conformities
"""
from invenio_tools import CheckAndFix, load_record
from harvest_tools import format_author_fr
def test_protection_format_authors():
    """Check format_authors on a record holding a list-valued author.

    According to the original note, one author of this record is encoded as
    [u'Zuniga, J.', u'(the A.N.T.ARES. Collaboration)'] instead of a
    plain string.
    """
    rec = load_record('inspirehep.net', 1386663)

    # reformat the author names in place
    CheckAndFix().format_authors(rec, format_author_fr)

    field_700 = rec[u"700"]
    assert len(field_700) == 139
    assert field_700[128]["a"] == u"J. Zuniga"
......@@ -8,6 +8,14 @@ Protection are add in the record method to correct them
from invenio_tools import load_record
def test_protection_authors_as_list():
    """Check authors_as_list on a record holding a list-valued author.

    According to the original note, one author of this record is encoded as
    [u'Zuniga, J.', u'(the A.N.T.ARES. Collaboration)'] instead of a
    plain string.
    """
    names = load_record('inspirehep.net', 1386663).authors_as_list()

    assert len(names) == 139
    assert names[128] == u"Zuniga, J."
def test_protection_oai():
"""['oai:cds.cern.ch:1513204', 'oai:cds.cern.ch:1512766']"""
record = load_record('cds.cern.ch', 1513204)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment