Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Docker-in-Docker (DinD) capabilities of public runners deactivated.
More info
Open sidebar
limbra
limbra
Commits
e98db28b
Commit
e98db28b
authored
Jan 15, 2021
by
LE GAC Renaud
Browse files
Migrate check and fix method to RecordCdsConf and RecordHepConf
parent
a0dd169b
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
462 additions
and
246 deletions
+462
-246
modules/harvest_tools/articles.py
modules/harvest_tools/articles.py
+8
-2
modules/harvest_tools/proceedings.py
modules/harvest_tools/proceedings.py
+22
-14
modules/store_tools/base.py
modules/store_tools/base.py
+2
-0
modules/store_tools/confmixin.py
modules/store_tools/confmixin.py
+71
-11
modules/store_tools/recordcdsconfpaper.py
modules/store_tools/recordcdsconfpaper.py
+51
-11
modules/store_tools/recordcdspubli.py
modules/store_tools/recordcdspubli.py
+5
-2
modules/store_tools/recordhepconfpaper.py
modules/store_tools/recordhepconfpaper.py
+38
-8
modules/store_tools/recordheppubli.py
modules/store_tools/recordheppubli.py
+9
-8
tests/basis/test_13_check_and_fix_article_cds.py
tests/basis/test_13_check_and_fix_article_cds.py
+1
-89
tests/basis/test_15_CheckAndFix_proceeding.py
tests/basis/test_15_CheckAndFix_proceeding.py
+0
-98
tests/basis/test_15_check_and_fix_proceeding_cds.py
tests/basis/test_15_check_and_fix_proceeding_cds.py
+125
-0
tests/basis/test_16_check_and_fix_proceeding_ins.py
tests/basis/test_16_check_and_fix_proceeding_ins.py
+127
-0
tests/basis/test_17_check_and_fix_thesis.py
tests/basis/test_17_check_and_fix_thesis.py
+3
-3
No files found.
modules/harvest_tools/articles.py
View file @
e98db28b
...
...
@@ -3,10 +3,12 @@
"""
from
.automaton
import
Automaton
from
.base
import
(
learn_my_authors
,
MSG_CRASH
,
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
MSG_LOAD
)
from
plugin_dbui
import
get_id
,
UNDEF_ID
from
store_tools
import
CheckException
MSG_NO_EDITOR
=
"Reject article is not published"
MSG_NOT_ARTICLE
=
"Reject publication is not and article"
...
...
@@ -52,7 +54,7 @@ class Articles(Automaton):
can not be corrected.
"""
self
.
logger
.
debug
(
f
"
{
T4
}
check record (article)"
)
self
.
logger
.
debug
(
f
"
{
T4
}
check
and fix
record (article)"
)
if
record
.
subtype
()
==
"article"
:
self
.
logs
[
-
1
].
reject
(
MSG_NOT_ARTICLE
,
record
)
...
...
@@ -74,10 +76,14 @@ class Articles(Automaton):
record
.
check_publisher
(
self
.
db
)
record
.
check_paper_reference
()
except
Exception
as
e
:
except
Check
Exception
as
e
:
self
.
logs
[
-
1
].
reject
(
e
,
record
=
record
)
return
False
except
Exception
as
e
:
self
.
logs
[
-
1
].
reject
(
MSG_CRASH
%
e
,
record
=
record
,
translate
=
False
)
return
False
return
True
def
get_record_by_fields
(
self
,
...
...
modules/harvest_tools/proceedings.py
View file @
e98db28b
...
...
@@ -6,6 +6,8 @@ from .base import MSG_CRASH, MSG_LOAD
from
.checkandfix
import
CheckException
from
plugin_dbui
import
get_id
,
UNDEF_ID
MSG_NOT_PROCEEDING
=
"Reject publication is not a proceeding"
T4
=
" "
*
4
...
...
@@ -13,6 +15,7 @@ class Proceedings(Automaton):
"""Automaton for conference proceedings.
"""
def
check_record
(
self
,
record
):
"""Check the content of the proceeding in order to fix non conformities.
...
...
@@ -26,24 +29,29 @@ class Proceedings(Automaton):
corrected.
"""
if
not
Automaton
.
check_record
(
self
,
record
):
return
False
self
.
logger
.
debug
(
f
"
{
T4
}
check nd fix record (proceeding)"
)
self
.
logger
.
debug
(
f
"
{
T4
}
check record (proceeding)"
)
if
record
.
subtype
()
==
"proceeding"
:
self
.
logs
[
-
1
].
reject
(
MSG_NOT_PROCEEDING
,
record
)
return
False
try
:
self
.
check
.
is_conference
(
record
)
self
.
check
.
country
(
record
)
self
.
check
.
conference_date
(
record
)
self
.
check
.
submitted
(
record
)
self
.
check
.
format_editor
(
record
)
self
.
check
.
publisher
(
record
)
self
.
check
.
paper_reference
(
record
)
self
.
check
.
format_authors
(
record
,
fmt
=
"F. Last"
)
self
.
check
.
get_my_authors
(
record
,
sort
=
True
)
# is with authors form my institute
# standardise name of collaboration
# format authors according to my format
# extract authors form my institute signing the publication
# is submitted date well formed
record
.
check_and_fix
(
self
.
rex_institute
,
fmt_author
=
"F. Last"
,
sep_author
=
", "
,
sort_author
=
True
)
record
.
check_country
()
record
.
check_conference_date
()
record
.
format_editor
()
record
.
check_publisher
(
self
.
db
)
record
.
check_paper_reference
()
except
CheckException
as
e
:
self
.
logs
[
-
1
].
reject
(
e
,
record
=
record
)
...
...
modules/store_tools/base.py
View file @
e98db28b
...
...
@@ -15,6 +15,7 @@ INS = ("inspirehep", "inspirehep.net")
MSG_INV_CONF
=
"Reject invalid conference information"
MSG_INV_CONF_KEY
=
"Reject invalid conference key"
MSG_NO_CONF
=
"Reject no conference information"
MSG_NO_CONF_DATE
=
"Reject no conference date"
MSG_NO_CONF_ID_KEY
=
"Reject no conference identifier and key"
MSG_NO_COUNTRY
=
"Reject invalid country"
MSG_NO_ENTRY
=
"Reject %s is not defined"
...
...
@@ -25,6 +26,7 @@ MSG_NO_SHELF = "No shelf %s for store %s"
MSG_NO_THESIS
=
"Reject no thesis information"
MSG_TOOMANY_SYNONYM
=
"Reject too many %s synonyms"
MSG_UNKNOWN_COLLABORATION
=
"Reject collaboration is unknown."
MSG_UNKNOWN_COUNTRY
=
"Reject country is unknown."
MSG_WELL_FORMED_COLLABORATION
=
"Reject collaboration is not well formed"
MSG_WELL_FORMED_DATE
=
"Reject submission date is not well formed"
...
...
modules/store_tools/confmixin.py
View file @
e98db28b
"""confmixin.py
"""
from
.base
import
(
MSG_NO_CONF_DATE
,
MSG_UNKNOWN_COUNTRY
,
REG_DATE
,
search_synonym
,
T6
)
from
.exception
import
CheckException
from
datetime
import
datetime
from
plugin_dbui
import
CLEAN_SPACES
from
plugin_dbui
import
CLEAN_SPACES
,
UNDEF_ID
class
ConfMixin
(
object
):
"""Mixin to handle conference data.
The parent class must have the attribute ``conference``.
It is a dictionary with at least the following keys:
The parent class must have the attribute ``conference``:
* addresses: [{cities: [], country: str, ...}, ...]
* cnum: str
* control_number: int
* closing_date: str
* opening_date: str
* titles: [{value: str}, ...]
* urls: [{value: str}, ...]
+----------------+----------------------------------------+
| key | value |
+----------------+----------------------------------------+
| addresses | [{cities: [], country: str, ...}, ...] |
| cnum | str |
| control_number | int |
| closing_date | str |
| opening_date | str |
| titles | [{title: str}, ...] |
| urls | [{value: str}, ...] |
+----------------+----------------------------------------+
"""
def
check_conference_date
(
self
):
"""Check conference date exists.
Note:
Date is well formed by construction (conference_date)
Raises:
CheckException:
dates are not found.
"""
self
.
logger
.
debug
(
f
"
{
T6
}
check conference date"
)
val
=
self
.
conference_dates
()
if
len
(
val
)
==
0
:
raise
CheckException
(
MSG_NO_CONF_DATE
)
def
check_country
(
self
,
db
=
None
):
"""Check synonyms for conference country by using by the proper value.
Args:
db (pydal.DAL):
database connection
Raises:
CheckException:
* the country is unknown (neither country nor synonym)
* more than one synonym found.
"""
if
db
is
None
:
self
.
logger
.
debug
(
f
"
{
T6
}
skip check country -- db is None"
)
return
self
.
logger
.
debug
(
f
"
{
T6
}
check country"
)
val
=
self
.
conference_country
()
if
len
(
val
)
==
0
:
raise
CheckException
(
MSG_UNKNOWN_COUNTRY
)
dbid
=
search_synonym
(
db
.
countries
,
"country"
,
val
)
if
dbid
==
UNDEF_ID
:
raise
CheckException
(
MSG_UNKNOWN_COUNTRY
)
country
=
db
.
countries
[
dbid
].
country
if
country
!=
val
:
self
.
conference
[
"addresses"
][
0
][
"country"
]
=
country
def
conference_country
(
self
):
"""The country where the conference took place.
...
...
@@ -57,7 +114,10 @@ class ConfMixin(object):
opening
=
conference
.
get
(
"opening_date"
,
None
)
closing
=
conference
.
get
(
"closing_date"
,
None
)
if
opening
is
None
or
closing
is
None
:
if
opening
is
None
or
REG_DATE
.
match
(
opening
)
is
None
:
return
""
if
closing
is
None
or
REG_DATE
.
match
(
closing
)
is
None
:
return
""
ds
=
datetime
.
strptime
(
opening
,
"%Y-%m-%d"
)
...
...
modules/store_tools/recordcdsconfpaper.py
View file @
e98db28b
...
...
@@ -3,9 +3,13 @@
"""
import
re
from
.base
import
T4
,
T6
from
.base
import
(
MSG_WELL_FORMED_DATE
,
REG_DATE_YYYYMM
,
T4
,
T6
)
from
.cdsstore
import
CdsStore
from
.confmixin
import
ConfMixin
from
.exception
import
CheckException
from
.recordcdspubli
import
RecordCdsPubli
REX_DATE8
=
re
.
compile
(
r
"(\d{4})(\d{2})(\d{2})"
)
...
...
@@ -17,14 +21,19 @@ class RecordCdsConfPaper(RecordCdsPubli, ConfMixin):
Attributes:
conference (dict or None):
the conference metadata:
* addresses: [{cities: [], country: str, ...}, ...]
* cnum: str
* control_number: int
* closing_date: str
* opening_date: str
* titles: [{value: str}, ...]
* urls: [{value: str}, ...]
* year
+----------------+----------------------------------------+
| key | value |
+----------------+----------------------------------------+
| addresses | [{cities: [], country: str, ...}, ...] |
| cnum | str |
| control_number | int |
| closing_date | str |
| opening_date | str |
| titles | [{title: str}, ...] |
| urls | [{value: str}, ...] |
| year | str |
+----------------+----------------------------------------+
"""
...
...
@@ -61,7 +70,7 @@ class RecordCdsConfPaper(RecordCdsPubli, ConfMixin):
store
=
CdsStore
(
"cds.cern.ch"
)
if
conf_id
is
not
None
:
logger
.
debug
(
f
"
{
T6
}
search
by
conference by id
{
conf_id
}
"
)
logger
.
debug
(
f
"
{
T6
}
search conference by id
{
conf_id
}
"
)
recjson
=
store
.
get_record
(
conf_id
)
if
recjson
[
"recid"
]
!=
int
(
conf_id
):
...
...
@@ -72,7 +81,7 @@ class RecordCdsConfPaper(RecordCdsPubli, ConfMixin):
return
elif
conf_key
is
not
None
:
logger
.
debug
(
f
"
{
T6
}
search
by
conference by key
{
conf_key
}
"
)
logger
.
debug
(
f
"
{
T6
}
search conference by key
{
conf_key
}
"
)
ids
=
store
.
get_ids
(
p
=
conf_key
)
mtch
=
False
...
...
@@ -132,3 +141,34 @@ class RecordCdsConfPaper(RecordCdsPubli, ConfMixin):
#
# Append conference data
self
.
conference
=
dct
def
check_submitted_date
(
self
):
"""Check that submitted date is either ``YYYY-MM`` or ``YYYY-MM-DD``.
Raises:
CheckException::
* the date is not well formed
"""
self
.
logger
.
debug
(
f
"
{
T6
}
check submitted date"
)
xdate
=
self
.
submitted
()
if
REG_DATE_YYYYMM
.
match
(
xdate
):
return
# recover by using the opening date of the conference
val
=
self
.
conference
.
get
(
"opening_date"
,
None
)
if
val
is
not
None
:
if
"prepublication"
in
self
:
prepublication
=
self
[
"prepublication"
]
if
isinstance
(
prepublication
,
list
):
prepublication
[
0
][
"date"
]
=
val
else
:
prepublication
[
"date"
]
=
val
else
:
self
[
"prepublication"
]
=
{
"date"
:
val
}
else
:
raise
CheckException
(
MSG_WELL_FORMED_DATE
)
modules/store_tools/recordcdspubli.py
View file @
e98db28b
...
...
@@ -833,7 +833,7 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
Returns:
str:
* "articles", "preprint", "note" or "report"
* "articles", "preprint",
"proceeding",
"note" or "report"
* empty string when it is not defined
"""
...
...
@@ -843,7 +843,10 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
lst
=
[
dct
.
get
(
"primary"
,
""
).
lower
()
for
dct
in
collection
]
# order matter since note can have preprint+note
if
"conferencepaper"
in
lst
:
return
"proceeding"
# order matter since we have (preprint+note)
for
val
in
(
"article"
,
"note"
,
"report"
,
"preprint"
):
if
val
in
lst
:
return
val
...
...
modules/store_tools/recordhepconfpaper.py
View file @
e98db28b
...
...
@@ -3,8 +3,12 @@
"""
import
requests
from
.base
import
T4
,
T6
from
.base
import
(
MSG_WELL_FORMED_DATE
,
REG_DATE_YYYYMM
,
T4
,
T6
)
from
.confmixin
import
ConfMixin
from
.exception
import
CheckException
from
.recordheppubli
import
RecordHepPubli
...
...
@@ -21,13 +25,17 @@ class RecordHepConfPaper(RecordHepPubli, ConfMixin):
https://inspire-schemas.readthedocs.io/en/latest/schemas/
Main information are:
* addresses: [{cities: [], country: str, ...}, ...]
* cnum: str
* control_number: int
* closing_date: str
* opening_date: str
* titles: [{value: str}, ...]
* urls: [{value: str}, ...]
+----------------+----------------------------------------+
| key | value |
+----------------+----------------------------------------+
| addresses | [{cities: [], country: str, ...}, ...] |
| cnum | str |
| control_number | int |
| closing_date | str |
| opening_date | str |
| titles | [{title: str}, ...] |
| urls | [{value: str}, ...] |
+----------------+----------------------------------------+
"""
...
...
@@ -72,3 +80,25 @@ class RecordHepConfPaper(RecordHepPubli, ConfMixin):
# append conference data
self
.
conference
=
obj
.
get
(
"metadata"
,
None
)
def
check_submitted_date
(
self
):
"""Check that submitted date is either ``YYYY-MM`` or ``YYYY-MM-DD``.
Raises:
CheckException::
* the date is not well formed
"""
self
.
logger
.
debug
(
f
"
{
T6
}
check submitted date"
)
xdate
=
self
.
submitted
()
if
REG_DATE_YYYYMM
.
match
(
xdate
):
return
# recover by using the opening date of the conference
val
=
self
.
conference
.
get
(
"opening_date"
,
None
)
if
val
is
not
None
:
self
[
"preprint_date"
]
=
val
else
:
raise
CheckException
(
MSG_WELL_FORMED_DATE
)
modules/store_tools/recordheppubli.py
View file @
e98db28b
...
...
@@ -275,7 +275,7 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
"""
def
check_submitted_date
(
self
):
"""Check that submitted date
as
``YYYY-MM`` or ``YYYY-MM-DD``.
"""Check that submitted date
is either
``YYYY-MM`` or ``YYYY-MM-DD``.
Raises:
CheckException::
...
...
@@ -530,7 +530,7 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
Returns:
str:
* "articles", "preprint", "note" or "report"
* "articles", "preprint",
"proceeding",
"note" or "report"
* empty string when it is not defined
"""
...
...
@@ -540,13 +540,14 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
doctype
=
(
doctype
[
0
]
if
isinstance
(
doctype
,
list
)
else
doctype
)
if
doctype
!=
"article"
:
return
doctype
if
doctype
==
"article"
:
pubinfo
=
self
.
get
(
"publication_info"
,
None
)
return
(
"preprint"
if
pubinfo
is
None
else
"article"
)
# separate article from preprint
# in the latter case publication_info is missing
pubinfo
=
self
.
get
(
"publication_info"
,
None
)
return
(
"preprint"
if
pubinfo
is
None
else
"article"
)
elif
doctype
==
"conference paper"
:
return
(
"proceeding"
if
self
.
is_published
()
else
""
)
return
doctype
def
title
(
self
):
"""The title of the publication.
...
...
tests/basis/test_13_check_and_fix_article_cds.py
View file @
e98db28b
...
...
@@ -29,7 +29,7 @@ def record():
return
load_record
(
"cds.cern.ch"
,
2242641
)
def
test_subtype_
ins_
13001
(
record
):
def
test_subtype_13001
(
record
):
assert
record
.
subtype
()
==
"article"
...
...
@@ -120,91 +120,3 @@ def test_check_paper_reference_13022(record):
# check_paper_reference is a dummy method to preserve interface
assert
record
.
paper_reference
()
==
"Phys. Rev. D 95 2017 052005"
assert
record
.
check_paper_reference
()
is
None
# def test_format_editor_cds_13001(svc, reccds):
#
# # cds
# assert reccds.paper_editor() == "Phys. Rev. D"
# assert reccds.paper_volume() == "95"
#
# svc.format_editor(reccds)
#
# assert reccds.paper_editor() == "Phys. Rev. D"
# assert reccds.paper_volume() == "95"
#
#
# def test_format_editor_ins_13002(svc, recins):
# # inspire
# assert recins.paper_editor() == "Phys. Rev. D"
# assert recins.paper_volume() == "95"
#
# svc.format_editor(recins)
#
# assert recins.paper_editor() == "Phys. Rev. D"
# assert recins.paper_volume() == "95"
#
#
# def test_publisher_cds_13003(svc, reccds):
# assert svc.publisher(reccds) is None
#
#
# def test_paper_reference_cds_13004(svc, reccds):
#
# # check recovery procedure using DOI
# # remove the publisher and volume information
# paper_ref = reccds.paper_reference()
#
# reccds.df_info.loc[0, ["title", "volume"]] = ["", ""]
# svc.paper_reference(reccds)
#
# assert reccds.paper_reference() == paper_ref
#
#
# def test_submitted_cds_13005(svc, reccds):
#
# assert reccds.submitted() == "19 Jan 2017"
# svc.submitted(reccds)
# assert reccds.submitted() == "2017-01-19"
#
# # test the case 19 01 2017
# reccds["prepublication"]["date"] = "19 01 2017"
# svc.submitted(reccds)
# assert reccds.submitted() == "2017-01-19"
#
# # test the case 2017
# reccds["prepublication"]["date"] = "2017"
# svc.submitted(reccds)
# assert reccds.submitted() == "2017-01"
#
#
# def test_format_authors_cds_13007(svc, reccds):
#
# authors = reccds.authors_as_list()
#
# assert len(authors) == reccds["number_of_authors"]
# assert authors[0] == "Aaij, Roel"
# assert authors[1] == "Adeva, Bernardo"
# assert authors[344] == "Koopman, Rose"
# assert authors[-1] == "Zucchelli, Stefano"
#
# svc.format_authors(reccds, fmt="F. Last")
# authors = reccds.authors_as_list()
#
# assert authors[0] == "R. Aaij"
# assert authors[1] == "B. Adeva"
# assert authors[344] == "R. Koopman"
# assert authors[-1] == "S. Zucchelli"
#
#
# def test_get_my_authors_cds_13008(svc, reccds):
#
# svc.format_authors(reccds, fmt="F. Last")
# assert svc.get_my_authors(reccds, sep="|", sort=True) is None
#
# my_authors = reccds.my_authors
#
# assert my_authors == "J. Arnau Romeu|E. Aslanides|J. Cogan|" \
# "K. De Bruyn|R. Le Gac|O. Leroy|" \
# "G. Mancinelli|M. Martin|A. Mordà|" \
# "J. Serrano|A. Tayduganov|A. Tsaregorodtsev"
tests/basis/test_15_CheckAndFix_proceeding.py
deleted
100644 → 0
View file @
a0dd169b
"""test_15_CheckAndFix_proceeding
* Test CheckAndFix methods for proceeding.
Use the same proceeding in cds.cern.ch and inspirehep.net
- is_conference
- country
- conference_date
- submitted
- format_editor (already test with article)
- publisher (already test with article)
- paper_reference (already test with article)
- format_authors (already test with article)
- get_my_authors (already test with article)
"""
import
pytest
from
harvest_tools.checkandfix
import
CheckAndFix
from
harvest_tools.exception
import
CheckException
from
store_tools
import
load_record
@
pytest
.
fixture
(
scope
=
"module"
)
def
reccds
():
return
load_record
(
"cds.cern.ch"
,
1411352
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
recins
():
return
load_record
(
"inspirehep.net"
,
1089237
,
shelf
=
"literature"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
svc
():
return
CheckAndFix
()
def
test_is_conference_cds_15001
(
svc
,
reccds
):
assert
svc
.
is_conference
(
reccds
)
is
None
# test exception
# the publication cds2242595 is a talk without conference data
#
reccds2
=
load_record
(
"cds.cern.ch"
,
2242595
)
with
pytest
.
raises
(
CheckException
):
svc
.
is_conference
(
reccds2
)
def
test_is_conference_ins_15002
(
svc
,
recins
):
assert
svc
.
is_conference
(
recins
)
is
None
def
test_country_cds_15003
(
svc
,
reccds
):
assert
svc
.
country
(
reccds
)
is
None
def
test_country_ins_15004
(
svc
,
recins
):
assert
svc
.
country
(
recins
)
is
None
def
test_conference_date_cds_15005
(
svc
,
reccds
):
assert
reccds
.
conference_dates
()
==
"6 - 11 Dec 2010"
svc
.
conference_date
(
reccds
)
assert
reccds
.
conference_dates
()
==
"6-11 Dec 2010"
def
test_conference_date_cds_15006
(
svc
):
reccds
=
load_record
(
"cds.cern.ch"
,
2688580
)
assert
reccds
.
conference_dates
()
==
"04-06 Sept 2019"
svc
.
conference_date
(
reccds
)