Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
eb80c771
Commit
eb80c771
authored
Jun 26, 2017
by
LE GAC Renaud
Browse files
Migrate CheckAndFix: the base methods required by the class Automaton.
parent
a4170897
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
148 additions
and
39 deletions
+148
-39
modules/harvest_tools/checkandfix.py
modules/harvest_tools/checkandfix.py
+56
-38
modules/invenio_tools/recordpubli.py
modules/invenio_tools/recordpubli.py
+1
-1
tests/basis/test_08_CheckAndFix_base.py
tests/basis/test_08_CheckAndFix_base.py
+91
-0
No files found.
modules/harvest_tools/checkandfix.py
View file @
eb80c771
...
@@ -138,41 +138,42 @@ class CheckAndFix(object):
...
@@ -138,41 +138,42 @@ class CheckAndFix(object):
"""Get the rescue list for my authors.
"""Get the rescue list for my authors.
Args:
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
id_project (int): identifier of the project in the database.
record describing a publication.
id_team (int): identifier of the team in the database.
id_project (int):
identifier of the project in the database.
id_team (int):
identifier of the team in the database.
Returns:
Returns:
list: empty when not defined
list:
empty when not defined
"""
"""
year
=
record
.
year
()
year
=
record
.
submitted
()
# try to recover year when not defined
# try to recover year when not defined
if
not
year
:
if
not
year
:
# published article, proceeding
# published article, proceeding
if
"773"
in
record
and
"y"
in
record
[
"773"
]
:
if
record
[
u
"publication_info"
].
year
.
iloc
[
0
]
!=
""
:
year
=
record
[
"773"
][
"y"
]
year
=
record
[
u
"publication_info"
].
year
.
iloc
[
0
]
# start date of a conference
# start date of a conference
elif
"111"
in
record
and
"x"
in
record
[
"111"
]
:
elif
record
.
_get
(
u
"meeting_name"
,
u
"opening_date"
)
!=
u
""
:
year
=
record
[
"111"
][
"x"
]
year
=
record
.
_get
(
u
"meeting_name"
,
u
"opening_date"
)
# end date of a conference
# end date of a conference
elif
"111"
in
record
and
"z"
in
record
[
"111"
]:
elif
record
.
_get
(
u
"meeting_name"
,
u
"closing_date"
)
!=
u
""
:
year
=
record
[
"111"
][
"z"
]
year
=
record
.
_get
(
u
"meeting_name"
,
u
"closing_date"
)
# submitted date
elif
"269"
in
record
and
"c"
in
record
[
"269"
]:
year
=
record
[
"269"
][
"c"
]
else
:
else
:
return
[]
return
[]
#
#
# NOTE
# protection
# keep in mind that the CheckAndfix mechanism is not yet run
# submitted and paper year are protected against erratum, but ...
# therefore year can be a list due to erratum, ...
#
#
if
isinstance
(
year
,
list
):
if
isinstance
(
year
,
list
):
year
.
sort
()
year
.
sort
()
...
@@ -197,7 +198,7 @@ class CheckAndFix(object):
...
@@ -197,7 +198,7 @@ class CheckAndFix(object):
id_teams
=
id_team
)
id_teams
=
id_team
)
if
row
:
if
row
:
self
.
__reference
=
row
[
'authors'
].
split
(
', '
)
self
.
__reference
=
row
[
'authors'
].
strip
(
"
\n
"
).
split
(
', '
)
else
:
else
:
self
.
__reference
=
[]
self
.
__reference
=
[]
...
@@ -315,24 +316,23 @@ class CheckAndFix(object):
...
@@ -315,24 +316,23 @@ class CheckAndFix(object):
else
:
else
:
raise
CheckException
(
MSG_NO_REF
)
raise
CheckException
(
MSG_NO_REF
)
def
authors
(
self
,
record
):
@
staticmethod
def
authors
(
record
):
"""Check that author fields are defined.
"""Check that author fields are defined.
Args:
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
Raises:
Raises:
CheckException:
when there is no authors or more than
CheckException:
one *first
author
*
.
when there is no
author
s
.
"""
"""
if
not
record
.
is_authors
():
if
not
record
.
is_authors
():
raise
CheckException
(
MSG_NO_AUTHOR
)
raise
CheckException
(
MSG_NO_AUTHOR
)
if
len
(
record
[
"100"
])
>
1
:
raise
CheckException
(
MSG_TO_MANY_FAUTHOR
)
def
clean_erratum
(
self
,
record
):
def
clean_erratum
(
self
,
record
):
"""Clean record with erratum by removing them.
"""Clean record with erratum by removing them.
...
@@ -361,10 +361,12 @@ class CheckAndFix(object):
...
@@ -361,10 +361,12 @@ class CheckAndFix(object):
Have a look at the synonyms when the collaboration is not well formed.
Have a look at the synonyms when the collaboration is not well formed.
Args:
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
Raises:
Raises:
CheckException: when the collaboration value is defined
CheckException:
when the collaboration value is defined
nor entered as a synonym.
nor entered as a synonym.
"""
"""
...
@@ -376,6 +378,7 @@ class CheckAndFix(object):
...
@@ -376,6 +378,7 @@ class CheckAndFix(object):
try
:
try
:
search_synonym
(
db
.
collaborations
,
"collaboration"
,
val
)
search_synonym
(
db
.
collaborations
,
"collaboration"
,
val
)
except
ToolException
as
e
:
except
ToolException
as
e
:
raise
CheckException
(
*
e
.
args
)
raise
CheckException
(
*
e
.
args
)
...
@@ -454,10 +457,12 @@ class CheckAndFix(object):
...
@@ -454,10 +457,12 @@ class CheckAndFix(object):
to new one.
to new one.
Args:
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
Returns:
Returns:
bool: ``True`` when a record is found in the database with
bool:
``True`` when a record is found in the database with
the bad OAI.
the bad OAI.
"""
"""
...
@@ -684,11 +689,18 @@ class CheckAndFix(object):
...
@@ -684,11 +689,18 @@ class CheckAndFix(object):
It is based on the author rescue list stored in the database.
It is based on the author rescue list stored in the database.
Args:
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
id_project (int): identifier of the project in the database
record describing a publication.
id_team (int): identifier of the team in the database
id_project (int):
identifier of the project in the database
id_team (int):
identifier of the team in the database
fmt_rescue (str):
fmt_rescue (str):
the format for the authors used in the rescue list
the format for the authors used in the rescue list
sort (bool):
sort (bool):
sort authors by family name when true otherwise use the
sort authors by family name when true otherwise use the
order of authors at the creation of the record
order of authors at the creation of the record
...
@@ -720,12 +732,12 @@ class CheckAndFix(object):
...
@@ -720,12 +732,12 @@ class CheckAndFix(object):
record
.
reformat_authors
(
fmt_rescue
)
record
.
reformat_authors
(
fmt_rescue
)
if
sort
:
if
sort
:
authors
=
(
record
[
"700
"
][[
"last_name"
,
"fmt_name"
]]
authors
=
(
record
[
u
"authors
"
][[
"last_name"
,
"fmt_name"
]]
.
sort_values
(
by
=
"last_name"
)
.
sort_values
(
by
=
"last_name"
)
.
fmt_name
)
.
fmt_name
)
else
:
else
:
authors
=
(
record
[
"700
"
].
fmt_name
authors
=
(
record
[
u
"authors
"
].
fmt_name
.
sort_index
())
.
sort_index
())
# go back to the origin formatting
# go back to the origin formatting
...
@@ -926,9 +938,15 @@ class CheckAndFix(object):
...
@@ -926,9 +938,15 @@ class CheckAndFix(object):
CheckException: when the record is marked temporary
CheckException: when the record is marked temporary
"""
"""
# found on INSPIREHEP (see record 1317573)
# INSPIREHEP
if
"500"
in
record
and
"a"
in
record
[
"500"
]:
# Can be found by using the XML syntax:
if
record
[
"500"
][
"a"
]
==
"*Temporary record*"
:
# http://inspirehep.net/search?500__a="*Temporary record*"
#
# or the corresponding JSON field:
# http://inspirehep.net/comment="*Temporary record*"
#
if
u
"comment"
in
record
:
if
record
[
u
"comment"
]
==
u
"*Temporary record*"
:
raise
CheckException
(
MSG_TEMPORARY_RECORD
)
raise
CheckException
(
MSG_TEMPORARY_RECORD
)
def
year
(
self
,
record
):
def
year
(
self
,
record
):
...
...
modules/invenio_tools/recordpubli.py
View file @
eb80c771
...
@@ -204,7 +204,7 @@ class RecordPubli(Record):
...
@@ -204,7 +204,7 @@ class RecordPubli(Record):
"""Convert publication_info into DataFrame:
"""Convert publication_info into DataFrame:
Note:
Note:
* the field is a list when there are eratum
* the field is a list when there are erratum
* in some case the subfield year is a list (cds 1951625)
* in some case the subfield year is a list (cds 1951625)
publication information are stored in DataFrame with the
publication information are stored in DataFrame with the
...
...
tests/basis/test_08_CheckAndFix_base.py
0 → 100644
View file @
eb80c771
# -*- coding: utf-8 -*-
"""test_08_CheckAndFix_base
* Test CheckAndFix methods required by the Automaton base class.
- constructor
- is_bad_oai_used
- temporary_record
- authors
- my_affiliation
- collaboration
"""
import
pytest
import
requests
from
harvest_tools.checkandfix
import
CheckAndFix
,
CheckException
from
invenio_tools
import
load_record
@pytest.fixture(scope="module")
def reccds():
    """Provide the record 1951625 of cds.cern.ch.

    The fixture has module scope, so the record is loaded once and
    shared by every test of this module.

    """
    record = load_record("cds.cern.ch", 1951625)
    return record
@pytest.fixture(scope="module")
def svc():
    """Provide a ``CheckAndFix`` instance built without arguments.

    The fixture has module scope, so the same instance is shared by
    every test of this module.

    """
    service = CheckAndFix()
    return service
def test_constructor(svc):
    """Check the institute regular expression set by the constructor.

    Args:
        svc (CheckAndFix):
            the instance provided by the ``svc`` fixture.

    """
    # The last fragment is a raw string: "\(" and "\)" are not valid
    # escape sequences in a plain literal. The resulting value is the same.
    expected = (
        "^Marseille, CPPM|"
        "^CPPM, Marseille|"
        r"^Centre de Physique des Particules de Marseille \(CPPM\)"
    )
    assert svc.reg_institute == expected
def test_is_bad_oai(svc, reccds):
    """Check that ``is_bad_oai_used`` is falsy for the CDS record.

    Args:
        svc (CheckAndFix):
            the instance provided by the ``svc`` fixture.
        reccds:
            the record provided by the ``reccds`` fixture.

    """
    bad_oai = svc.is_bad_oai_used(reccds)
    assert not bad_oai
def test_temporary_record(svc, reccds):
    """Check ``temporary_record`` on a regular and on a temporary record.

    The CDS record must pass the check (the method returns ``None``).
    A record flagged ``*Temporary record*`` on inspirehep.net must raise
    ``CheckException``.

    Args:
        svc (CheckAndFix):
            the instance provided by the ``svc`` fixture.
        reccds:
            the record provided by the ``reccds`` fixture.

    """
    assert svc.temporary_record(reccds) is None

    # look for some temporary records in inspirehep.net
    # using the MarcXML syntax
    # get a list of recids
    payload = {"p": "500__a:'*Temporary record*'", "of": "id", "rg": 10}

    # the timeout prevents the test from hanging when the server
    # does not answer
    r = requests.get("http://inspirehep.net/search",
                     params=payload,
                     timeout=30)
    li = r.json()

    # try with the oldest one to avoid issue with missing information, etc.
    # The record is loaded outside the pytest.raises block so that only
    # an exception raised by temporary_record satisfies the test.
    recins = load_record("inspirehep.net", li[-1])
    with pytest.raises(CheckException):
        svc.temporary_record(recins)
def test_authors(svc, reccds):
    """Check that ``authors`` returns ``None`` for the CDS record.

    Args:
        svc (CheckAndFix):
            the instance provided by the ``svc`` fixture.
        reccds:
            the record provided by the ``reccds`` fixture.

    """
    outcome = svc.authors(reccds)
    assert outcome is None
def test__get_author_rescue_list(svc, reccds):
    """Check the rescue list returned for the CDS record.

    The list is requested with the positional arguments ``8`` and ``7``
    after the record.

    Args:
        svc (CheckAndFix):
            the instance provided by the ``svc`` fixture.
        reccds:
            the record provided by the ``reccds`` fixture.

    """
    expected = [
        "C. Adrover",
        "S. Akar",
        "E. Aslanides",
        "J. Cogan",
        "W. Kanso",
        "R. Le Gac",
        "O. Leroy",
        "G. Mancinelli",
        "E. Maurice",
        "A. Morda",
        "A. Mordà",
        "M. Perrin-Terrin",
        "M. Sapunov",
        "J. Serrano",
        "A. Tsaregorodtsev",
    ]
    rescue_list = svc._get_author_rescue_list(reccds, 8, 7)
    assert rescue_list == expected
def test_my_affiliation(svc, reccds):
    """Check ``my_affiliation`` on two CDS records.

    The record of the ``reccds`` fixture must give ``"Marseille, CPPM"``.
    The record 1434415 must raise ``CheckException``.

    Args:
        svc (CheckAndFix):
            the instance provided by the ``svc`` fixture.
        reccds:
            the record provided by the ``reccds`` fixture.

    """
    affiliation = svc.my_affiliation(reccds, 8, 7)
    assert affiliation == "Marseille, CPPM"

    # a paper from NA62 -- no CPPM author
    recna62 = load_record("cds.cern.ch", 1434415)

    with pytest.raises(CheckException):
        svc.my_affiliation(recna62, id_project=8, id_team=7)
def test_collaboration(svc, reccds):
    """Check that ``collaboration`` returns ``None`` for the CDS record.

    Args:
        svc (CheckAndFix):
            the instance provided by the ``svc`` fixture.
        reccds:
            the record provided by the ``reccds`` fixture.

    """
    outcome = svc.collaboration(reccds)
    assert outcome is None
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment