Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
eb80c771
Commit
eb80c771
authored
Jun 26, 2017
by
LE GAC Renaud
Browse files
Migrate CheckAndFix: the base methods required by the class Automaton.
parent
a4170897
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
148 additions
and
39 deletions
+148
-39
modules/harvest_tools/checkandfix.py
modules/harvest_tools/checkandfix.py
+56
-38
modules/invenio_tools/recordpubli.py
modules/invenio_tools/recordpubli.py
+1
-1
tests/basis/test_08_CheckAndFix_base.py
tests/basis/test_08_CheckAndFix_base.py
+91
-0
No files found.
modules/harvest_tools/checkandfix.py
View file @
eb80c771
...
...
@@ -138,41 +138,42 @@ class CheckAndFix(object):
"""Get the rescue list for my authors.
Args:
record (RecordPubli): record describing a publication.
id_project (int): identifier of the project in the database.
id_team (int): identifier of the team in the database.
record (RecordPubli):
record describing a publication.
id_project (int):
identifier of the project in the database.
id_team (int):
identifier of the team in the database.
Returns:
list: empty when not defined
list:
empty when not defined
"""
year
=
record
.
year
()
year
=
record
.
submitted
()
# try to recover year when not defined
if
not
year
:
# published article, proceeding
if
"773"
in
record
and
"y"
in
record
[
"773"
]
:
year
=
record
[
"773"
][
"y"
]
if
record
[
u
"publication_info"
].
year
.
iloc
[
0
]
!=
""
:
year
=
record
[
u
"publication_info"
].
year
.
iloc
[
0
]
# start date of a conference
elif
"111"
in
record
and
"x"
in
record
[
"111"
]
:
year
=
record
[
"111"
][
"x"
]
elif
record
.
_get
(
u
"meeting_name"
,
u
"opening_date"
)
!=
u
""
:
year
=
record
.
_get
(
u
"meeting_name"
,
u
"opening_date"
)
# end date of a conference
elif
"111"
in
record
and
"z"
in
record
[
"111"
]:
year
=
record
[
"111"
][
"z"
]
# submitted date
elif
"269"
in
record
and
"c"
in
record
[
"269"
]:
year
=
record
[
"269"
][
"c"
]
elif
record
.
_get
(
u
"meeting_name"
,
u
"closing_date"
)
!=
u
""
:
year
=
record
.
_get
(
u
"meeting_name"
,
u
"closing_date"
)
else
:
return
[]
#
# NOTE
# keep in mind that the CheckAndfix mechanism is not yet run
# therefore year can be a list due to erratum, ...
# protection
# submitted and paper year are protect against erratum, but ...
#
if
isinstance
(
year
,
list
):
year
.
sort
()
...
...
@@ -197,7 +198,7 @@ class CheckAndFix(object):
id_teams
=
id_team
)
if
row
:
self
.
__reference
=
row
[
'authors'
].
split
(
', '
)
self
.
__reference
=
row
[
'authors'
].
strip
(
"
\n
"
).
split
(
', '
)
else
:
self
.
__reference
=
[]
...
...
@@ -315,24 +316,23 @@ class CheckAndFix(object):
else
:
raise
CheckException
(
MSG_NO_REF
)
def
authors
(
self
,
record
):
@
staticmethod
def
authors
(
record
):
"""Check that author fields are defined.
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
Raises:
CheckException:
when there is no authors or more than
one *first
author
*
.
CheckException:
when there is no
author
s
.
"""
if
not
record
.
is_authors
():
raise
CheckException
(
MSG_NO_AUTHOR
)
if
len
(
record
[
"100"
])
>
1
:
raise
CheckException
(
MSG_TO_MANY_FAUTHOR
)
def
clean_erratum
(
self
,
record
):
"""Clean record with erratum by removing them.
...
...
@@ -361,10 +361,12 @@ class CheckAndFix(object):
Have a look to the synonyms when the collaboration is not well formed.
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
Raises:
CheckException: when the collaboration value is defined
CheckException:
when the collaboration value is neither defined
nor entered as a synonym.
"""
...
...
@@ -376,6 +378,7 @@ class CheckAndFix(object):
try
:
search_synonym
(
db
.
collaborations
,
"collaboration"
,
val
)
except
ToolException
as
e
:
raise
CheckException
(
*
e
.
args
)
...
...
@@ -454,10 +457,12 @@ class CheckAndFix(object):
to new one.
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
Returns:
bool: ``True`` when a record is found in the database with
bool:
``True`` when a record is found in the database with
the bad OAI.
"""
...
...
@@ -684,11 +689,18 @@ class CheckAndFix(object):
It is based on the author rescue list stored in the database.
Args:
record (RecordPubli): record describing a publication.
id_project (int): identifier of the project in the database
id_team (int): identifier of the team in the database
record (RecordPubli):
record describing a publication.
id_project (int):
identifier of the project in the database
id_team (int):
identifier of the team in the database
fmt_rescue (str):
the format for the authors used in the rescue list
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
...
...
@@ -720,12 +732,12 @@ class CheckAndFix(object):
record
.
reformat_authors
(
fmt_rescue
)
if
sort
:
authors
=
(
record
[
"700
"
][[
"last_name"
,
"fmt_name"
]]
authors
=
(
record
[
u
"authors
"
][[
"last_name"
,
"fmt_name"
]]
.
sort_values
(
by
=
"last_name"
)
.
fmt_name
)
else
:
authors
=
(
record
[
"700
"
].
fmt_name
authors
=
(
record
[
u
"authors
"
].
fmt_name
.
sort_index
())
# go back to the origin formatting
...
...
@@ -926,9 +938,15 @@ class CheckAndFix(object):
CheckException: when the record is marked temporary
"""
# found on INSPIREHEP (see record 1317573)
if
"500"
in
record
and
"a"
in
record
[
"500"
]:
if
record
[
"500"
][
"a"
]
==
"*Temporary record*"
:
# INSPIREHEP
# Can be found by using the XML syntax:
# http://inspirehep.net/search?500__a="*Temporary record*"
#
# or the corresponding JSON field:
# http://inspirehep.net/comment="*Temporary record*"
#
if
u
"comment"
in
record
:
if
record
[
u
"comment"
]
==
u
"*Temporary record*"
:
raise
CheckException
(
MSG_TEMPORARY_RECORD
)
def
year
(
self
,
record
):
...
...
modules/invenio_tools/recordpubli.py
View file @
eb80c771
...
...
@@ -204,7 +204,7 @@ class RecordPubli(Record):
"""Convert publication_info into DataFrame:
Note:
* the field is a list when there are eratum
* the field is a list when there are er
r
atum
* in some case the subfield year is a list (cds 1951625)
publication information are stored in DataFrame with the
...
...
tests/basis/test_08_CheckAndFix_base.py
0 → 100644
View file @
eb80c771
# -*- coding: utf-8 -*-
"""test_08_CheckAndFix_base
* Test CheckAndFix methods required by the Automaton base class.
- constructor
- is_bad_oai
- temporary_record
- authors
- my_affiliation
- collaboration
"""
import
pytest
import
requests
from
harvest_tools.checkandfix
import
CheckAndFix
,
CheckException
from
invenio_tools
import
load_record
@pytest.fixture(scope="module")
def reccds():
    """Return the record ``1951625`` loaded from ``cds.cern.ch``.

    The fixture has module scope, so the record is built once per test
    module and reused by every test requesting it.

    """
    host, record_id = "cds.cern.ch", 1951625
    return load_record(host, record_id)
@pytest.fixture(scope="module")
def svc():
    """Return a ``CheckAndFix`` instance built without arguments.

    The fixture has module scope, so a single instance is shared by all
    the tests of the module.

    """
    service = CheckAndFix()
    return service
def test_constructor(svc):
    """Check the value of ``reg_institute`` once the service is built.

    Args:
        svc (CheckAndFix):
            the service under test, provided by the ``svc`` fixture.

    """
    # The last fragment is a raw string: "\(" is an invalid escape sequence
    # in a regular string literal (deprecated since Python 3.6). The value
    # compared is unchanged, a backslash followed by a parenthesis.
    expected = (
        "^Marseille, CPPM|"
        "^CPPM, Marseille|"
        r"^Centre de Physique des Particules de Marseille \(CPPM\)"
    )

    assert svc.reg_institute == expected
def test_is_bad_oai(svc, reccds):
    """``is_bad_oai_used`` must be falsy for the reference CDS record.

    Args:
        svc (CheckAndFix):
            the service under test, provided by the ``svc`` fixture.
        reccds:
            the record provided by the ``reccds`` fixture.

    """
    flagged = svc.is_bad_oai_used(reccds)
    assert not flagged
def test_temporary_record(svc, reccds):
    """Check ``temporary_record`` on a regular and on a temporary record.

    The call must return ``None`` for the reference CDS record and must
    raise ``CheckException`` for a record selected on inspirehep.net with
    the query ``500__a:'*Temporary record*'``.

    Args:
        svc (CheckAndFix):
            the service under test, provided by the ``svc`` fixture.
        reccds:
            the record provided by the ``reccds`` fixture.

    """
    assert svc.temporary_record(reccds) is None

    # look for some temporary records in inspirehep.net
    # using the MarcXML syntax
    # get a list of recids
    payload = {"p": "500__a:'*Temporary record*'", "of": "id", "rg": 10}

    # bounded wait: an unresponsive server must not hang the test session
    r = requests.get("http://inspirehep.net/search",
                     params=payload,
                     timeout=60)

    # report an HTTP failure as such, not as an obscure JSON decoding error
    r.raise_for_status()
    li = r.json()
    assert li, "no temporary record returned by inspirehep.net"

    # try with the oldest one to avoid issue with missing information, etc.
    # NOTE(review): assumes the last identifier of the list is the oldest
    # record -- confirm against the ordering returned by the search.
    #
    # The record is loaded outside the ``raises`` block so that only the
    # call under test can satisfy the expected CheckException.
    recins = load_record("inspirehep.net", li[-1])

    with pytest.raises(CheckException):
        svc.temporary_record(recins)
def test_authors(svc, reccds):
    """``authors`` must return ``None`` for the reference CDS record.

    Args:
        svc (CheckAndFix):
            the service under test, provided by the ``svc`` fixture.
        reccds:
            the record provided by the ``reccds`` fixture.

    """
    outcome = svc.authors(reccds)
    assert outcome is None
def test__get_author_rescue_list(svc, reccds):
    """Compare the author rescue list with the expected names.

    ``_get_author_rescue_list`` is called with the reference CDS record
    followed by the positional values ``8`` and ``7``.

    Args:
        svc (CheckAndFix):
            the service under test, provided by the ``svc`` fixture.
        reccds:
            the record provided by the ``reccds`` fixture.

    """
    # NOTE(review): 8 and 7 are presumably the project and team identifiers
    # (see the keyword arguments used in test_my_affiliation) -- confirm.
    expected = [
        "C. Adrover",
        "S. Akar",
        "E. Aslanides",
        "J. Cogan",
        "W. Kanso",
        "R. Le Gac",
        "O. Leroy",
        "G. Mancinelli",
        "E. Maurice",
        "A. Morda",
        "A. Mordà",
        "M. Perrin-Terrin",
        "M. Sapunov",
        "J. Serrano",
        "A. Tsaregorodtsev",
    ]

    rescue_list = svc._get_author_rescue_list(reccds, 8, 7)
    assert rescue_list == expected
def test_my_affiliation(svc, reccds):
    """Check ``my_affiliation`` on two CDS records.

    The reference record must give ``"Marseille, CPPM"`` while the record
    ``1434415`` must raise ``CheckException``.

    Args:
        svc (CheckAndFix):
            the service under test, provided by the ``svc`` fixture.
        reccds:
            the record provided by the ``reccds`` fixture.

    """
    affiliation = svc.my_affiliation(reccds, 8, 7)
    assert affiliation == "Marseille, CPPM"

    # a paper from NA62 -- no CPPM author
    host, record_id = "cds.cern.ch", 1434415
    recna62 = load_record(host, record_id)

    with pytest.raises(CheckException):
        svc.my_affiliation(recna62, id_project=8, id_team=7)
def test_collaboration(svc, reccds):
    """``collaboration`` must return ``None`` for the reference CDS record.

    Args:
        svc (CheckAndFix):
            the service under test, provided by the ``svc`` fixture.
        reccds:
            the record provided by the ``reccds`` fixture.

    """
    outcome = svc.collaboration(reccds)
    assert outcome is None
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment