Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
b0613eb4
Commit
b0613eb4
authored
Sep 21, 2015
by
LE GAC Renaud
Browse files
Add protection when finding authors (triggered by ins1386663).
parent
02a7f39d
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
97 additions
and
10 deletions
+97
-10
modules/harvest_tools/base.py
modules/harvest_tools/base.py
+2
-1
modules/invenio_tools/__init__.py
modules/invenio_tools/__init__.py
+1
-0
modules/invenio_tools/base.py
modules/invenio_tools/base.py
+14
-2
modules/invenio_tools/checkandfix.py
modules/invenio_tools/checkandfix.py
+29
-4
modules/invenio_tools/recordpubli.py
modules/invenio_tools/recordpubli.py
+22
-3
tests/harvester/CheckAndFix/test_checkandfix_non_conformities.py
...arvester/CheckAndFix/test_checkandfix_non_conformities.py
+21
-0
tests/harvester/Record/test_record_non_conformities.py
tests/harvester/Record/test_record_non_conformities.py
+8
-0
No files found.
modules/harvest_tools/base.py
View file @
b0613eb4
...
...
@@ -5,6 +5,7 @@
import
re
from
gluon
import
current
from
invenio_tools
import
REG_AUTHOR
DRY_RUN
=
"dry run"
...
...
@@ -49,7 +50,7 @@ def format_author_fr(name):
# Family, First
# To avoid dealing with unicode characters,
# look for a non-empty string \S
match
=
re
.
match
(
r
'(.+), (\S+)( |\-)*(\S+)*'
,
name
)
match
=
REG_AUTHOR
.
match
(
name
)
# reformat the name as L. Family
# or keep it as it is
...
...
modules/invenio_tools/__init__.py
View file @
b0613eb4
...
...
@@ -10,6 +10,7 @@ from base import (ARXIV,
is_thesis
,
OAI_URL
,
REG_ARXIV_NUMBER
,
REG_AUTHOR
,
REG_OAI
,
REG_YEAR
,
THESIS_DIR
)
...
...
modules/invenio_tools/base.py
View file @
b0613eb4
...
...
@@ -13,8 +13,20 @@ MSG_NO_THESIS = "Reject no thesis information"
OAI_URL
=
"http://%s/record/%s"
REG_ARXIV_NUMBER
=
re
.
compile
(
"\d+\.\d+"
)
REG_OAI
=
re
.
compile
(
'oai:([a-z\.]+):([\d]+)'
)
REG_YEAR
=
re
.
compile
(
"(\d{4})"
)
# names are encoded as Family, L
# Family, P L
# Family, M -H
# Family Name, J
# Family-Name, J
# Family, F Name
# Family, First
# To avoid dealing with unicode characters,
# look for a non-empty string \S
REG_AUTHOR
=
re
.
compile
(
r
"(.+), (\S+)( |\-)*(\S+)*"
)
REG_OAI
=
re
.
compile
(
r
"oai:([a-z\.]+):([\d]+)"
)
REG_YEAR
=
re
.
compile
(
r
"(\d{4})"
)
THESIS_DIR
=
u
"dir."
...
...
modules/invenio_tools/checkandfix.py
View file @
b0613eb4
...
...
@@ -5,7 +5,12 @@
import
re
import
regex
from
base
import
MSG_NO_CONF
,
MSG_NO_THESIS
,
OAI_URL
,
REG_OAI
,
REG_YEAR
from
base
import
(
MSG_NO_CONF
,
MSG_NO_THESIS
,
OAI_URL
,
REG_AUTHOR
,
REG_OAI
,
REG_YEAR
)
from
exception
import
CheckException
from
filters
import
CLEAN_REVIEW
from
gluon
import
current
...
...
@@ -339,13 +344,33 @@ class CheckAndFix(object):
for
key
in
(
u
"100"
,
u
"700"
):
if
key
in
record
:
if
isinstance
(
record
[
key
],
list
):
for
i
in
range
(
len
(
record
[
key
])):
for
i
in
x
range
(
len
(
record
[
key
])):
if
"a"
in
record
[
key
][
i
]:
record
[
key
][
i
][
"a"
]
=
func
(
record
[
key
][
i
][
"a"
])
# PROTECTION
# see RecordPubli.authors_as_list
value
=
record
[
key
][
i
][
"a"
]
if
isinstance
(
value
,
unicode
):
record
[
key
][
i
][
"a"
]
=
func
(
value
)
elif
isinstance
(
value
,
list
):
for
elt
in
value
:
if
REG_AUTHOR
.
match
(
elt
):
record
[
key
][
i
][
"a"
]
=
func
(
elt
)
else
:
if
"a"
in
record
[
key
]:
record
[
key
][
"a"
]
=
func
(
record
[
key
][
"a"
])
value
=
record
[
key
][
"a"
]
# PROTECTION
# see RecordPubli.authors_as_list
if
isinstance
(
value
,
unicode
):
record
[
key
][
"a"
]
=
func
(
value
)
elif
isinstance
(
value
,
list
):
for
elt
in
value
:
if
REG_AUTHOR
.
match
(
elt
):
record
[
key
][
i
][
"a"
]
=
func
(
elt
)
def
format_editor
(
self
,
record
):
"""Format the editor abbreviation since the encoding
...
...
modules/invenio_tools/recordpubli.py
View file @
b0613eb4
...
...
@@ -5,7 +5,12 @@
import
re
from
base
import
ARXIV
,
ARXIV_PDF
,
REG_ARXIV_NUMBER
,
REG_YEAR
,
THESIS_DIR
from
base
import
(
ARXIV
,
ARXIV_PDF
,
REG_ARXIV_NUMBER
,
REG_AUTHOR
,
REG_YEAR
,
THESIS_DIR
)
from
filters
import
CLEAN_COLLABORATION
from
plugin_dbui
import
CLEAN_SPACES
from
record
import
Record
...
...
@@ -66,7 +71,8 @@ class RecordPubli(Record):
"""
authors
=
[]
# NOTE: the content of the 700 field depend on the record type.
# NOTE
# the content of the 700 field depends on the record type.
# For thesis it also contains the name of the director
if
u
"700"
in
self
and
isinstance
(
self
[
u
"700"
],
dict
):
if
not
(
"e"
in
self
[
u
"700"
]
and
self
[
u
"700"
][
"e"
]
==
THESIS_DIR
):
...
...
@@ -78,7 +84,20 @@ class RecordPubli(Record):
continue
if
"a"
in
di
:
authors
.
append
(
di
[
"a"
])
author
=
di
[
"a"
]
# PROTECTION
# in most cases the author is a string
# but it can be a list, e.g. inspirehep.net/1386663:
# [u'Zuniga, J.', u'(the A.N.T.ARES. Collaboration)']
if
isinstance
(
author
,
unicode
):
authors
.
append
(
di
[
"a"
])
elif
isinstance
(
author
,
list
):
for
elt
in
author
:
if
REG_AUTHOR
.
match
(
elt
):
authors
.
append
(
elt
)
break
return
authors
...
...
tests/harvester/CheckAndFix/test_checkandfix_non_conformities.py
0 → 100644
View file @
b0613eb4
# -*- coding: utf-8 -*-
"""CheckAndFix non conformities
"""
from
invenio_tools
import
CheckAndFix
,
load_record
from
harvest_tools
import
format_author_fr
def test_protection_format_authors():
    """Check format_authors on a record whose author subfield is a list.

    The record inspirehep.net/1386663 is reported to encode one author as
    [u'Zuniga, J.', u'(the A.N.T.ARES. Collaboration)'].
    After formatting, the entry at index 128 of the 700 field must hold
    the single formatted name.
    """
    rec = load_record('inspirehep.net', 1386663)

    # format every author in place with the "L. Family" formatter
    CheckAndFix().format_authors(rec, format_author_fr)

    field_700 = rec[u"700"]
    assert len(field_700) == 139
    assert field_700[128]["a"] == u"J. Zuniga"
tests/harvester/Record/test_record_non_conformities.py
View file @
b0613eb4
...
...
@@ -8,6 +8,14 @@ Protection are add in the record method to correct them
from
invenio_tools
import
load_record
def test_protection_authors_as_list():
    """Check authors_as_list on a record whose author subfield is a list.

    The record inspirehep.net/1386663 is reported to encode one author as
    [u'Zuniga, J.', u'(the A.N.T.ARES. Collaboration)'].
    Only the author name is expected at index 128 of the returned list.
    """
    names = load_record('inspirehep.net', 1386663).authors_as_list()

    assert len(names) == 139
    assert names[128] == u"Zuniga, J."
def
test_protection_oai
():
"""['oai:cds.cern.ch:1513204', 'oai:cds.cern.ch:1512766']"""
record
=
load_record
(
'cds.cern.ch'
,
1513204
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment