Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
e22f1a62
Commit
e22f1a62
authored
Jun 07, 2017
by
LE GAC Renaud
Browse files
Update CheckAndFix and its tests.
parent
b7ca0e1b
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
165 additions
and
108 deletions
+165
-108
modules/harvest_tools/checkandfix.py
modules/harvest_tools/checkandfix.py
+81
-69
tests/harvest_tools/CheckAndFix/test_acl_cds1951625_fix.py
tests/harvest_tools/CheckAndFix/test_acl_cds1951625_fix.py
+56
-9
tests/harvest_tools/CheckAndFix/test_acl_ins1278588_fix.py
tests/harvest_tools/CheckAndFix/test_acl_ins1278588_fix.py
+3
-3
tests/harvest_tools/CheckAndFix/test_acti_cds1411352_fix.py
tests/harvest_tools/CheckAndFix/test_acti_cds1411352_fix.py
+3
-3
tests/harvest_tools/CheckAndFix/test_acti_ins1276938_fix.py
tests/harvest_tools/CheckAndFix/test_acti_ins1276938_fix.py
+4
-7
tests/harvest_tools/CheckAndFix/test_checkandfix_non_conformities.py
...st_tools/CheckAndFix/test_checkandfix_non_conformities.py
+6
-5
tests/harvest_tools/CheckAndFix/test_com_cds1550918_fix.py
tests/harvest_tools/CheckAndFix/test_com_cds1550918_fix.py
+2
-2
tests/harvest_tools/CheckAndFix/test_phd_cds1394605_fix.py
tests/harvest_tools/CheckAndFix/test_phd_cds1394605_fix.py
+3
-3
tests/harvest_tools/CheckAndFix/test_phd_cds1632177_fix.py
tests/harvest_tools/CheckAndFix/test_phd_cds1632177_fix.py
+3
-3
tests/harvest_tools/CheckAndFix/test_phd_cds1642541_fix.py
tests/harvest_tools/CheckAndFix/test_phd_cds1642541_fix.py
+3
-3
tests/harvest_tools/CheckAndFix/test_phd_cds2015250_fix.py
tests/harvest_tools/CheckAndFix/test_phd_cds2015250_fix.py
+1
-1
No files found.
modules/harvest_tools/checkandfix.py
View file @
e22f1a62
...
...
@@ -5,7 +5,7 @@
import
re
import
regex
from
base
import
format_author_fr
,
search_synonym
,
ToolException
from
base
import
search_synonym
,
ToolException
from
exception
import
CheckException
from
gluon
import
current
from
invenio_tools
import
(
DECODE_REF
,
...
...
@@ -14,11 +14,9 @@ from invenio_tools import (DECODE_REF,
OAI_URL
,
RecordConf
,
RecordThesis
,
REG_AUTHOR
,
REG_OAI
,
REG_YEAR
)
from
itertools
import
imap
from
pandas
import
DataFrame
from
plugin_dbui
import
CLEAN_SPACES
,
get_id
...
...
@@ -94,13 +92,17 @@ class CheckAndFix(object):
self
.
__reference
=
None
# private cache for my authors list
self
.
_
_
my_authors
=
{}
self
.
_my_authors
=
{}
def
_get_reg_institute
(
self
):
"""
"""Get the regular expression defining the affiliation of my institute.
It is obtained by concatenating the affiliation keys.
Affiliation key can contains character like ``(``, ``)`` or ``&``.
They are replaced by ``\(`` *etc*.
Returns:
unicode: the regular expression defining the affiliation
of my institute.
unicode:
"""
# alias
...
...
@@ -117,6 +119,15 @@ class CheckAndFix(object):
u
"|"
.
join
(
imap
(
lambda
row
:
u
"^%(key_u)s%(key_v)s"
%
row
,
iterselect
))
# protect special character
reg_institute
=
(
reg_institute
.
replace
(
"("
,
"
\\
("
)
.
replace
(
")"
,
"
\\
)"
)
.
replace
(
"&"
,
"
\\
&"
)
.
replace
(
"$"
,
"
\\
$"
)
.
replace
(
"+"
,
"
\\
+"
)
.
replace
(
"?"
,
"
\\
?"
))
return
reg_institute
def
_get_author_rescue_list
(
self
,
record
,
id_project
,
id_team
):
...
...
@@ -315,7 +326,7 @@ class CheckAndFix(object):
if
not
record
.
is_authors
():
raise
CheckException
(
MSG_NO_AUTHOR
)
if
isinstance
(
record
.
first_author
(),
list
)
:
if
len
(
record
[
u
"100"
])
>
1
:
raise
CheckException
(
MSG_TO_MANY_FAUTHOR
)
def
clean_erratum
(
self
,
record
):
...
...
@@ -459,44 +470,18 @@ class CheckAndFix(object):
return
False
def
format_authors
(
self
,
record
,
f
unc
):
"""Format the author names
using the function func
.
def
format_authors
(
self
,
record
,
f
mt
=
"Last, First"
):
"""Format the author names.
Args:
record (RecordPubli): record describing a publication.
func (reference): function used to format the author names.
fmt (str):
define the format for author names.
Possible values are "First, Last", "F. Last", "Last",
"Last, First" and "Last F."
"""
for
key
in
(
u
"100"
,
u
"700"
):
if
key
in
record
:
if
isinstance
(
record
[
key
],
list
):
for
i
in
xrange
(
len
(
record
[
key
])):
if
"a"
in
record
[
key
][
i
]:
# PROTECTION
# see RecordPubli.author_as_list
value
=
record
[
key
][
i
][
"a"
]
if
isinstance
(
value
,
unicode
):
record
[
key
][
i
][
"a"
]
=
func
(
value
)
elif
isinstance
(
value
,
list
):
for
elt
in
value
:
if
REG_AUTHOR
.
match
(
elt
):
record
[
key
][
i
][
"a"
]
=
func
(
elt
)
else
:
if
"a"
in
record
[
key
]:
value
=
record
[
key
][
"a"
]
# PROTECTION
# see RecordPubli.authors_as_list
if
isinstance
(
value
,
unicode
):
record
[
key
][
"a"
]
=
func
(
value
)
elif
isinstance
(
value
,
list
):
for
elt
in
value
:
if
REG_AUTHOR
.
match
(
elt
):
record
[
key
][
i
][
"a"
]
=
func
(
elt
)
record
.
reformat_authors
(
fmt
)
def
format_editor
(
self
,
record
):
"""Format the editor abbreviation. The encoding
...
...
@@ -619,19 +604,21 @@ class CheckAndFix(object):
value
=
value
.
replace
(
'U.'
,
university
)
record
[
u
'502'
][
'b'
][
i
]
=
value
def
get_my_authors
(
self
,
record
,
cmpFct
=
Non
e
):
def
get_my_authors
(
self
,
record
,
sep
=
u
", "
,
sort
=
Fals
e
):
"""Get authors of my institutes signing the record.
The information is append to the Record object via the attribute
``my_authors``.
Args:
record (RecordPubli): record describing a publication.
cmpFct (reference): extract the family name from the full name.
It is used to sort my author list according to the
author family name.
sep (unicode):
string separating author names. The default is the comma.
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
unicode: the list of authors separated by
comma
unicode: the list of authors separated by
the ``sep`` argument.
Raises:
CheckException: when the list is empty
...
...
@@ -639,17 +626,17 @@ class CheckAndFix(object):
"""
# might have been computed when affiliation is checked
rec_id
=
record
.
id
()
if
rec_id
in
self
.
__my_authors
:
li
=
self
.
__my_authors
[
rec_id
]
li
.
sort
(
key
=
cmpFct
)
value
=
u
', '
.
join
(
li
)
if
rec_id
in
self
.
_my_authors
:
li
=
self
.
_my_authors
[
rec_id
]
value
=
sep
.
join
(
li
)
# find authors of my institute signing the record
else
:
reg_institute
=
self
.
reg_institute
value
=
record
.
find_authors_by_affiliation
(
reg_institute
,
cmpFct
)
value
=
\
record
.
find_authors_by_affiliation
(
reg_institute
,
sep
,
sort
)
if
not
value
:
if
len
(
value
)
==
0
:
raise
CheckException
(
MSG_NO_MY_AUTHOR
)
record
.
my_authors
=
value
...
...
@@ -680,11 +667,13 @@ class CheckAndFix(object):
if
not
isinstance
(
record
,
RecordThesis
):
raise
CheckException
(
MSG_NO_THESIS
)
def
my_affiliation
(
self
,
record
,
id_project
,
id_team
,
func
=
format_author_fr
):
def
my_affiliation
(
self
,
record
,
id_project
,
id_team
,
fmt_rescue
=
"F. Last"
,
sort
=
False
):
"""Check that authors of my institute are signatories.
Launch a recovery procedure when affiliations are not defined.
...
...
@@ -694,14 +683,26 @@ class CheckAndFix(object):
record (RecordPubli): record describing a publication.
id_project (int): identifier of the project in the database
id_team (int): identifier of the team in the database
func (reference): function used to format the author names.
fmt_rescue (str):
the format for the authors used in the rescue list
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Return
str:
* the found affiliation
* an empty string when the rescue list is used.
Raises:
CheckException: when there is no authors from my institute.
CheckException:
when the rescue list is required but empty
or because the intersection between the rescue list
and the author is null.
"""
value
=
record
.
find_affiliation
(
self
.
reg_institute
)
if
value
:
if
len
(
value
)
>
0
:
return
value
# affiliation is not defined
...
...
@@ -711,20 +712,31 @@ class CheckAndFix(object):
raise
CheckException
(
MSG_NO_MY_AUTHOR
)
# format the author in the same way as the rescue list
# compute the intersection between the authors and the rescue list
df
=
(
DataFrame
(
record
.
authors_as_list
(),
columns
=
[
"raw_author"
])
.
assign
(
format_author
=
lambda
x
:
x
.
raw_author
.
apply
(
lambda
y
:
func
(
y
)))
.
set_index
(
"format_author"
))
fmt_ref
=
record
.
_last_fmt_author
record
.
reformat_authors
(
fmt_rescue
)
if
sort
:
authors
=
(
record
[
u
"700"
][[
"last_name"
,
"fmt_name"
]]
.
sort_values
(
by
=
"last_name"
)
.
fmt_name
)
else
:
authors
=
(
record
[
u
"700"
].
fmt_name
.
sort_index
())
# go back to the origin formatting
record
.
reformat_authors
(
fmt_ref
)
rescue_list
=
[
el
.
decode
(
"utf-8"
)
for
el
in
rescue
_
list
]
intersection
=
df
.
index
&
rescue_list
# compute the intersection between the authors and the
rescue
list
intersection
=
set
(
authors
)
&
set
(
rescue_list
)
if
intersection
.
size
==
0
:
if
len
(
intersection
)
==
0
:
raise
CheckException
(
MSG_NO_MY_AUTHOR
)
# cache the result for a latter use
self
.
__my_authors
[
record
.
id
()]
=
intersection
.
values
.
tolist
()
self
.
_my_authors
[
record
.
id
()]
=
list
(
intersection
)
return
u
""
def
paper_reference
(
self
,
record
):
"""Check that editor, page, volume and paper year are defined
...
...
tests/harvest_tools/CheckAndFix/test_acl_cds1951625_fix.py
View file @
e22f1a62
...
...
@@ -21,9 +21,12 @@ Note:
import
copy
import
pytest
from
gluon
import
current
from
harvest_tools
import
CheckAndFix
,
family_name_fr
,
format_author_fr
from
harvest_tools
import
CheckAndFix
from
invenio_tools
import
load_record
from
plugin_dbui
import
get_id
CPPM_AUTHORS
=
[
u
"S. Akar"
,
u
"E. Aslanides"
,
...
...
@@ -49,9 +52,9 @@ def recordfix(record):
svc
=
CheckAndFix
()
svc
.
authors
(
rec
)
svc
.
format_authors
(
rec
,
f
ormat_author_fr
)
svc
.
format_authors
(
rec
,
f
mt
=
"F. Last"
)
svc
.
format_editor
(
rec
)
svc
.
get_my_authors
(
rec
,
cmpFct
=
family_name_fr
)
svc
.
get_my_authors
(
rec
,
sort
=
True
)
svc
.
submitted
(
rec
)
svc
.
year
(
rec
)
...
...
@@ -62,14 +65,15 @@ def test_find_authors_by_affiliation(recordfix):
svc
=
CheckAndFix
()
rex
=
svc
.
_get_reg_institute
()
references
=
set
([
"^CPPM, Marseille"
,
"^Centre de Physique des Particules de Marseille (CPPM)"
,
"^Marseille, CPPM"
])
references
=
set
([
"^CPPM, Marseille"
,
"^Centre de Physique des Particules de Marseille
\\
(CPPM
\\
)"
,
"^Marseille, CPPM"
])
values
=
set
(
rex
.
split
(
"|"
))
assert
values
==
references
authors
=
recordfix
.
find_authors_by_affiliation
(
rex
,
family_name_fr
)
authors
=
recordfix
.
find_authors_by_affiliation
(
rex
)
assert
authors
.
split
(
', '
)
==
CPPM_AUTHORS
...
...
@@ -78,13 +82,56 @@ def test_first_author(record, recordfix):
assert
recordfix
.
first_author
()
==
"R. Aaij"
def
test_my_affiliation
(
record
):
db
=
current
.
db
rec
=
copy
.
deepcopy
(
record
)
# test is useful when the rescue list exists
id_project
=
get_id
(
db
.
projects
,
project
=
"LHCb"
)
id_team
=
get_id
(
db
.
teams
,
team
=
"LHCb"
)
year
=
rec
.
year
()
id_rec
=
get_id
(
db
.
my_authors
,
id_projects
=
id_project
,
id_teams
=
id_team
,
year
=
year
)
if
id_rec
is
None
:
return
svc
=
CheckAndFix
()
value
=
svc
.
my_affiliation
(
rec
,
id_project
,
id_team
,
"F. Last"
)
# test that the affiliation is found in the record
assert
value
==
"Marseille, CPPM"
assert
svc
.
_my_authors
==
{}
# delete the affiliation
# and check that the affiliation is performed via the rescue list
rec
[
u
"700"
].
u
=
""
value
=
svc
.
my_affiliation
(
rec
,
id_project
,
id_team
,
"F. Last"
)
assert
value
==
""
assert
svc
.
_my_authors
[
record
.
id
()]
==
[
'R. Le Gac'
,
'M. Perrin-Terrin'
,
'E. Aslanides'
,
'J. Cogan'
,
'J. Serrano'
,
'W. Kanso'
,
'S. Akar'
,
'O. Leroy'
,
'G. Mancinelli'
]
def
test_my_authors
(
recordfix
):
assert
recordfix
.
my_authors
.
split
(
', '
)
==
CPPM_AUTHORS
def
test_paper_editor
(
record
,
recordfix
):
assert
record
.
paper_editor
()
==
"J
. Instrum.
"
assert
recordfix
.
paper_editor
()
==
"J
. Instrum.
"
assert
record
.
paper_editor
()
==
"J
INST
"
assert
recordfix
.
paper_editor
()
==
"J
INST
"
def
test_submitted
(
record
,
recordfix
):
...
...
tests/harvest_tools/CheckAndFix/test_acl_ins1278588_fix.py
View file @
e22f1a62
...
...
@@ -15,7 +15,7 @@ import copy
import
pytest
from
gluon
import
current
from
harvest_tools
import
CheckAndFix
,
family_name_fr
,
format_author_fr
from
harvest_tools
import
CheckAndFix
from
invenio_tools
import
load_record
@
pytest
.
fixture
(
scope
=
"module"
)
...
...
@@ -29,11 +29,11 @@ def recordfix(record):
svc
=
CheckAndFix
()
svc
.
authors
(
rec
)
svc
.
format_authors
(
rec
,
f
ormat_author_fr
)
svc
.
format_authors
(
rec
,
f
mt
=
"F. Last"
)
return
rec
def
test_first_author
(
record
,
recordfix
):
assert
record
.
first_author
()
==
"Lees, J.P."
assert
recordfix
.
first_author
()
==
"J.
-
P. Lees"
assert
recordfix
.
first_author
()
==
"J.
P. Lees"
tests/harvest_tools/CheckAndFix/test_acti_cds1411352_fix.py
View file @
e22f1a62
...
...
@@ -20,7 +20,7 @@ Note:
import
copy
import
pytest
from
harvest_tools
import
CheckAndFix
,
family_name_fr
,
format_author_fr
from
harvest_tools
import
CheckAndFix
from
invenio_tools
import
load_record
...
...
@@ -38,9 +38,9 @@ def recordfix(record):
svc
.
authors
(
rec
)
svc
.
country
(
rec
)
svc
.
conference_date
(
rec
,
"cds.cern.ch"
)
svc
.
format_authors
(
rec
,
format_author_fr
)
svc
.
format_authors
(
rec
,
"F. Last"
)
svc
.
format_editor
(
rec
)
svc
.
get_my_authors
(
rec
,
family_name_fr
)
svc
.
get_my_authors
(
rec
,
sort
=
True
)
svc
.
submitted
(
rec
)
svc
.
year
(
rec
)
...
...
tests/harvest_tools/CheckAndFix/test_acti_ins1276938_fix.py
View file @
e22f1a62
...
...
@@ -23,9 +23,7 @@ import copy
import
pytest
from
harvest_tools
import
(
CheckAndFix
,
CheckException
,
format_author_fr
)
from
harvest_tools
import
CheckAndFix
,
CheckException
from
invenio_tools
import
load_record
...
...
@@ -43,7 +41,7 @@ def recordfix(record):
svc
.
authors
(
rec
)
svc
.
country
(
rec
)
svc
.
conference_date
(
rec
,
"inspirehep.net"
)
svc
.
format_authors
(
rec
,
format_author_fr
)
svc
.
format_authors
(
rec
,
"F. Last"
)
svc
.
format_editor
(
rec
)
svc
.
submitted
(
rec
)
svc
.
year
(
rec
)
...
...
@@ -64,8 +62,7 @@ def test_authors(record, recordfix):
def
test_my_authors_exception
(
recordfix
):
svc
=
CheckAndFix
()
with
pytest
.
raises
(
CheckException
):
svc
.
get_my_authors
(
recordfix
)
svc
.
get_my_authors
(
recordfix
)
def
test_submitted
(
record
,
recordfix
):
...
...
@@ -74,6 +71,6 @@ def test_submitted(record, recordfix):
def
test_year
(
record
,
recordfix
):
assert
record
.
year
()
==
""
assert
record
.
year
()
==
"
2013
"
assert
recordfix
.
year
()
==
"2013"
tests/harvest_tools/CheckAndFix/test_checkandfix_non_conformities.py
View file @
e22f1a62
...
...
@@ -2,7 +2,7 @@
"""CheckAndFix non conformities
"""
from
harvest_tools
import
CheckAndFix
,
format_author_fr
from
harvest_tools
import
CheckAndFix
from
invenio_tools
import
load_record
...
...
@@ -11,11 +11,12 @@ def test_protection_format_authors():
record
=
load_record
(
'inspirehep.net'
,
1386663
)
svc
=
CheckAndFix
()
svc
.
format_authors
(
record
,
format_author_fr
)
svc
.
format_authors
(
record
,
"F. Last"
)
authors
=
record
[
u
"700"
]
assert
len
(
authors
)
==
139
assert
authors
[
128
][
"a"
]
==
u
"J. Zúñiga"
assert
len
(
authors
)
==
140
assert
authors
.
iloc
[
0
].
fmt_name
==
u
"S. Adrián-Martínez"
assert
authors
.
iloc
[
128
].
fmt_name
==
u
"J. D. Zornoza"
assert
authors
.
iloc
[
139
].
fmt_name
==
u
"D. M. Coward"
tests/harvest_tools/CheckAndFix/test_com_cds1550918_fix.py
View file @
e22f1a62
...
...
@@ -16,7 +16,7 @@ Note:
import
copy
import
pytest
from
harvest_tools
import
CheckAndFix
,
format_author_fr
from
harvest_tools
import
CheckAndFix
from
invenio_tools
import
load_record
...
...
@@ -33,7 +33,7 @@ def recordfix(record):
svc
.
authors
(
rec
)
svc
.
country
(
rec
)
svc
.
conference_date
(
rec
,
"cds.cern.ch"
)
svc
.
format_authors
(
rec
,
format_author_fr
)
svc
.
format_authors
(
rec
,
"F. Last"
)
svc
.
format_editor
(
rec
)
svc
.
submitted
(
rec
)
svc
.
year
(
rec
)
...
...
tests/harvest_tools/CheckAndFix/test_phd_cds1394605_fix.py
View file @
e22f1a62
...
...
@@ -18,7 +18,7 @@ Note:
import
pytest
from
harvest_tools
import
CheckAndFix
,
family_name_fr
,
format_author_fr
from
harvest_tools
import
CheckAndFix
from
invenio_tools
import
load_record
,
RecordThesis
...
...
@@ -28,8 +28,8 @@ def record():
svc
=
CheckAndFix
()
svc
.
authors
(
rec
)
svc
.
format_authors
(
rec
,
format_author_fr
)
svc
.
get_my_authors
(
rec
,
family_name_fr
)
svc
.
format_authors
(
rec
,
"F. Last"
)
svc
.
get_my_authors
(
rec
,
sort
=
True
)
svc
.
submitted
(
rec
)
svc
.
year
(
rec
)
...
...
tests/harvest_tools/CheckAndFix/test_phd_cds1632177_fix.py
View file @
e22f1a62
...
...
@@ -20,7 +20,7 @@ import copy
import
pytest
from
harvest_tools
import
CheckAndFix
,
family_name_fr
,
format_author_fr
from
harvest_tools
import
CheckAndFix
from
invenio_tools
import
load_record
...
...
@@ -34,8 +34,8 @@ def recordfix(record):
svc
=
CheckAndFix
()
svc
.
authors
(
rec
)
svc
.
format_authors
(
rec
,
format_author_fr
)
svc
.
get_my_authors
(
rec
,
family_name_fr
)
svc
.
format_authors
(
rec
,
"F. Last"
)
svc
.
get_my_authors
(
rec
,
sort
=
True
)
svc
.
submitted
(
rec
)
svc
.
year
(
rec
)
...
...
tests/harvest_tools/CheckAndFix/test_phd_cds1642541_fix.py
View file @
e22f1a62
...
...
@@ -18,7 +18,7 @@ Note:
import
pytest
from
harvest_tools
import
CheckAndFix
,
family_name_fr
,
format_author_fr
from
harvest_tools
import
CheckAndFix
from
invenio_tools
import
load_record
,
RecordThesis
...
...
@@ -28,8 +28,8 @@ def record():
svc
=
CheckAndFix
()
svc
.
authors
(
rec
)
svc
.
format_authors
(
rec
,
format_author_fr
)
svc
.
get_my_authors
(
rec
,
family_name_fr
)
svc
.
format_authors
(
rec
,
"F. Last"
)
svc
.
get_my_authors
(
rec
,
sort
=
True
)
svc
.
submitted
(
rec
)
svc
.
year
(
rec
)
...
...
tests/harvest_tools/CheckAndFix/test_phd_cds2015250_fix.py
View file @
e22f1a62
...
...
@@ -9,7 +9,7 @@
import
pytest
from
gluon
import
current
from
harvest_tools
import
CheckAndFix
,
format_author_fr
from
harvest_tools
import
CheckAndFix
from
invenio_tools
import
load_record
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment