Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
b47b5b7e
Commit
b47b5b7e
authored
Jan 08, 2021
by
LE GAC Renaud
Browse files
Restore Automaton, move Automaton._search_parameters to store
parent
14512643
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
181 additions
and
109 deletions
+181
-109
modules/harvest_tools/automaton.py
modules/harvest_tools/automaton.py
+13
-85
modules/store_tools/__init__.py
modules/store_tools/__init__.py
+1
-0
modules/store_tools/factory.py
modules/store_tools/factory.py
+5
-7
modules/store_tools/inspirehepstore.py
modules/store_tools/inspirehepstore.py
+34
-0
modules/store_tools/inveniostore.py
modules/store_tools/inveniostore.py
+36
-0
modules/store_tools/recordheppubli.py
modules/store_tools/recordheppubli.py
+3
-1
tests/basis/test_01_store.py
tests/basis/test_01_store.py
+1
-7
tests/basis/test_12_Automaton.py
tests/basis/test_12_Automaton.py
+88
-9
No files found.
modules/harvest_tools/automaton.py
View file @
b47b5b7e
...
...
@@ -3,7 +3,6 @@
"""
import
logging
import
re
import
traceback
from
.base
import
(
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
...
...
@@ -14,7 +13,7 @@ from .msg import Msg
from
.msgcollection
import
MsgCollection
from
plugin_dbui
import
CALLBACK_ERRORS
,
get_id
from
store_tools
import
(
CdsException
,
InvenioS
tore
,
build_s
tore
,
OAI_URL
)
from
store_tools.factory
import
build_record
...
...
@@ -26,10 +25,6 @@ MSG_INSERT_FAIL = "Fail to insert the new record in the database."
OAI
=
"oai:%s:%i"
# search collection when using inspirehep
# require for "Hal Hidden"
REG_COLLECTION
=
re
.
compile
(
r
"cc([A-Za-z ]+)(and|$)"
)
T2
=
" "
*
2
T4
=
" "
*
4
T6
=
" "
*
6
...
...
@@ -153,6 +148,9 @@ class Automaton(object):
self
.
_id_preprint
=
get_id
(
db
.
categories
,
code
=
"PRE"
)
self
.
_id_article
=
get_id
(
db
.
categories
,
code
=
"ACL"
)
# Keep track of the shelf for inspirehep.net
self
.
shelf
=
None
def
_insert_in_db
(
self
,
log_year
=
""
,
**
fields
):
"""Insert the record in the database, handling database exception.
...
...
@@ -291,79 +289,6 @@ class Automaton(object):
return
publication
.
id
def _search_parameters(self, collection):
    """Build the keywords to steer the URL search in an invenio store.

    The search is driven by the collection and by the range of years
    (``self.year_start``, ``self.year_end``) defined in the selector.

    Args:
        collection (str):
            string defining the collection in the store.
            The syntax depends on the invenio store:

                * ``"find cn d0 and tc p and not tc c"``
                * ``"LHCb Papers"``.

            A string starting with ``find`` selects the inspirehep
            branch, anything else the CERN invenio branch.

    Returns:
        dict:
            the keys are a sub-set of those defined in
            :meth:`store_tools.InvenioStore.get_ids`.

    """
    year_start = self.year_start
    year_end = self.year_end

    # INSPIREHEP store
    if collection.startswith("find"):
        query = collection

        if year_start and year_end:
            sdates = " or ".join(
                f"date {year}" for year in range(year_start, year_end + 1))
            query += f" and ({sdates})"

        elif year_start or year_end:
            # only one bound is defined: search for that single year
            query += f" and date {year_start or year_end}"

        dic = dict(p=query,    # query in SPIRES syntax
                   rg=1000,    # maximum number of records returned
                   sf="year",  # sort by date
                   so="d")     # descending order

        # handle the cc keyword (true inspirehep collection):
        # it is moved from the query to its own key
        match = REG_COLLECTION.search(query)
        if match:
            dic["cc"] = match.group(1).strip()
            remainder = REG_COLLECTION.sub("", query)
            # collapse the blanks left behind by the removed cc clause
            dic["p"] = " ".join(remainder.split())

        # a bare "find" carries no search criteria
        if dic["p"] == "find":
            del dic["p"]

        return dic

    # CERN INVENIO store
    # The regular expression stays empty when no year is defined.
    # It was previously unbound in that case (UnboundLocalError).
    rex = ""

    if year_start and year_end:
        rex = "|".join(
            str(year) for year in range(year_start, year_end + 1))

    elif year_start or year_end:
        rex = year_start or year_end

    return dict(cc=collection,  # collection
                f1="year",      # search on year
                m1="r",         # use regular expression
                p1=rex,         # regular expression defining year
                sf="year",      # sort by date
                so="d")         # descending order
def
check_record
(
self
,
record
):
"""Check the content of the record in order to fix non-conformities.
Return ``False`` when non-conformities are found and can not be
...
...
@@ -519,7 +444,11 @@ class Automaton(object):
collection_logs
.
append
(
MsgCollection
(
title
=
ctitle
))
# get search parameters for the collection including user criteria
kwargs
=
self
.
_search_parameters
(
collection
)
kwargs
=
store
.
search_parameters
(
collection
,
year_start
=
self
.
year_start
,
year_end
=
self
.
year_end
)
logger
.
debug
(
f
"search parameters
{
kwargs
}
"
)
# get the list of record identifier matching the search criteria
try
:
...
...
@@ -563,15 +492,14 @@ class Automaton(object):
"""
logger
=
self
.
logger
logger
.
info
(
f
"
{
T4
}
process record
{
recjson
[
'recid'
]
}
(process_recjson)"
)
logger
.
info
(
f
"
{
T4
}
process record (process_recjson)"
)
collection_logs
=
self
.
collection_logs
harvester
=
self
.
harvester
logs
=
self
.
logs
# instantiate the record
record
=
build_record
(
recjson
)
record
=
build_record
(
recjson
,
shelf
=
self
.
shelf
)
logger
.
debug
(
f
"
{
T4
}{
record
.
title
()[:
72
]
}
"
)
# start the log for the record
...
...
@@ -669,8 +597,8 @@ class Automaton(object):
self
.
harvester
.
collections
=
collections
# instantiate the store
shelf
=
(
"literature"
if
host
==
"inspirehep.net"
else
None
)
self
.
store
=
InvenioS
tore
(
host
,
shelf
=
shelf
)
self
.
shelf
=
(
"literature"
if
host
==
"inspirehep.net"
else
None
)
self
.
store
=
build_s
tore
(
host
,
shelf
=
self
.
shelf
)
# list of collections
collections
=
re
.
sub
(
" *, *"
,
","
,
collections
).
split
(
","
)
...
...
modules/store_tools/__init__.py
View file @
b47b5b7e
...
...
@@ -25,6 +25,7 @@ from .exception import (CdsException,
RecordException
)
from
.factory
import
build_record
,
build_store
from
.inspirehepstore
import
InspirehepStore
from
.inveniostore
import
InvenioStore
from
.record
import
Record
from
.recordconf
import
RecordConf
...
...
modules/store_tools/factory.py
View file @
b47b5b7e
...
...
@@ -239,30 +239,28 @@ def build_store(host=None, shelf=None):
shelf (str):
section of the store containing records. It depends on the host.
Possible values are ``None``, ``literature``, ``conferences``
and ``institutions``
Possible values are ``literature``, ``conferences`` and
``institutions``.
The correlation between host and shelf is in the table:
+----------------+--------------+-----------------------------+
| host | shelf | base API |
+----------------+--------------+-----------------------------+
| cds.cern.ch | None | https://cds.cern.ch/ |
+----------------+--------------+-----------------------------+
| inspirehep.net | None | https://old.inspirehep.net/ |
| inspirehep.net | literature | https://inspirehep.net/ |
| inspirehep.net | conferences | https://inspirehep.net/ |
| inspirehep.net | institutions | https://inspirehep.net/ |
+----------------+--------------+-----------------------------+
Returns:
InvenioStore
InvenioStore
or InspirehepStore
"""
if
host
in
CDS
:
store
=
InvenioStore
(
host
=
"cds.cern.ch"
)
elif
host
in
INS
and
shelf
is
None
:
store
=
InvenioStore
(
host
=
"old.inspirehep.net"
)
elif
host
in
INS
and
shelf
in
SHELFS
:
store
=
InspirehepStore
(
host
=
host
,
shelf
=
shelf
)
...
...
modules/store_tools/inspirehepstore.py
View file @
b47b5b7e
...
...
@@ -207,3 +207,37 @@ class InspirehepStore(BaseStore):
raise
CdsException
(
MSG_INVALID_RESPONSE
)
return
records
def search_parameters(self, collection, year_start=None, year_end=None):
    """Build the (key, value) pairs to steer the search for a collection.

    Args:
        collection (str):
            the query selecting the records in the store, *e.g.*:

                * ``find cn d0 and tc p and not tc c``
                * ``find cc HAL Hidden a simpson, g and not tc c``
                * any other syntax accepted by the inspirehep.net
                  search engine

        year_start (str or int):
            first year of the range. Ignored when empty or ``None``.

        year_end (str or int):
            last year of the range. Ignored when empty or ``None``.

    Returns:
        dict:
            * ``q``: the query completed with the criteria on the date
            * ``size``: the number of records per page

    Raises:
        ValueError:
            both years are defined and one of them cannot be
            converted to an integer.

    """
    query = collection

    if year_start and year_end:
        # range() only accepts integers whereas years may come as strings
        first, last = int(year_start), int(year_end)
        sdates = " or ".join(
            f"date {year}" for year in range(first, last + 1))
        query += f" and ({sdates})"

    elif year_start or year_end:
        # only one bound is defined: search for that single year
        query += f" and date {year_start or year_end}"

    # get 100 records per page
    return dict(q=query, size=100)
modules/store_tools/inveniostore.py
View file @
b47b5b7e
...
...
@@ -191,3 +191,39 @@ class InvenioStore(BaseStore):
return
obj
[
0
]
raise
CdsException
(
MSG_INVALID_RESPONSE
)
def search_parameters(self, collection, year_start=None, year_end=None):
    """Build the (key, value) pairs to steer the search for a collection.

    Args:
        collection (str):
            the collection in the store:

                * ``LHCb Papers``
                * any other syntax accepted by the cds.cern.ch
                  search engine

        year_start (str or int):
            first year of the range. Ignored when empty or ``None``.

        year_end (str or int):
            last year of the range. Ignored when empty or ``None``.

    Returns:
        dict:
            the keys are ``cc``, ``f1``, ``m1``, ``p1``, ``sf``
            and ``so``. The regular expression ``p1`` is empty when
            no year is defined.

    Raises:
        ValueError:
            both years are defined and one of them cannot be
            converted to an integer.

    """
    rex = ""

    if year_start and year_end:
        # range() only accepts integers whereas years may come as strings
        first, last = int(year_start), int(year_end)
        rex = "|".join(str(year) for year in range(first, last + 1))

    elif year_start or year_end:
        # only one bound is defined: search for that single year
        rex = year_start or year_end

    return dict(cc=collection,  # collection
                f1="year",      # search on year
                m1="r",         # use regular expression
                p1=rex,         # regular expression defining year
                sf="year",      # sort by date
                so="d")         # descending order
modules/store_tools/recordheppubli.py
View file @
b47b5b7e
...
...
@@ -75,7 +75,9 @@ class RecordHepPubli(RecordHep, PluginAuthors, PluginPublicationInfo):
(
author
[
"inspire_roles"
]
if
"inspire_roles"
in
author
else
[])
full_name
=
author
[
"full_name"
]
last_name
,
first_name
=
full_name
.
split
(
","
)
idx
=
full_name
.
find
(
","
)
last_name
=
full_name
[:
idx
]
first_name
=
full_name
[
idx
+
1
:].
strip
()
dct
=
{
"affiliation"
:
"|"
.
join
(
affiliations
),
"first_name"
:
first_name
.
strip
(),
...
...
tests/basis/test_01_store.py
View file @
b47b5b7e
...
...
@@ -75,13 +75,7 @@ def test_get_record_cds_01020():
recjson
=
store
.
get_record
(
1951625
)
assert
isinstance
(
recjson
,
dict
)
def test_get_record_ins_old_01021():
    """Get a record via the old inspirehep interface (no shelf)."""
    legacy_store = build_store("inspirehep.net", shelf=None)
    payload = legacy_store.get_record(1319638)

    assert isinstance(payload, dict)

    # the record is expected to have no (or a null) "$schema" entry
    assert payload.get("$schema") is None
# v1.4.0: removed the obsolete test_get_record_ins_old_01021()
def
test_get_record_ins_literature_01022
():
...
...
tests/basis/test_12_Automaton.py
View file @
b47b5b7e
...
...
@@ -60,7 +60,11 @@ def test__is_record_in_db_12001(svc):
assert
rec_id_1
==
rec_id_2
def
test_process_recid_12002
(
svc
):
# ............................................................................
#
# Process a record, collection and URL by using cds.cern.ch
#
def
test_process_recid_cds_12010
(
svc
):
"""Test the deepest method to retrieve a record.
"""
...
...
@@ -78,10 +82,10 @@ def test_process_recid_12002(svc):
ctitle
=
"LHCb / article / %s"
%
collection
svc
.
collection_logs
.
append
(
MsgCollection
(
title
=
ctitle
))
# get a list of id
s
kwargs
=
svc
.
_
search_parameters
(
collection
)
# get a list of id
entifier
kwargs
=
svc
.
store
.
search_parameters
(
collection
,
year_start
=
"2010"
)
recids
=
svc
.
store
.
get_ids
(
**
kwargs
)
assert
len
(
recids
)
>
0
assert
len
(
recids
)
==
2
# try with the oldest one
recid
=
recids
[
-
1
]
...
...
@@ -99,7 +103,7 @@ def test_process_recid_12002(svc):
del
svc
.
store
def
test_process_collection_120
03
(
svc
):
def
test_process_collection_
cds_
120
11
(
svc
):
# mimic the previous stage process_url
collection
=
"LHCb Papers"
...
...
@@ -119,10 +123,85 @@ def test_process_collection_12003(svc):
del
svc
.
store
def
test_process_url_cds_120
04
(
svc
):
def test_process_url_cds_12012(svc):
    """Process the host cds.cern.ch with the collection LHCb Papers."""
    outcome = svc.process_url("cds.cern.ch", "LHCb Papers")
    assert outcome is None
def test_process_url_ins_12005(svc):
    """Process the host inspirehep.net with a ``find`` query."""
    outcome = svc.process_url(
        "inspirehep.net",
        "find cn lhcb and tc p and not tc c")

    assert outcome is None
# ............................................................................
#
# Process a record, collection and URL by using inspirehep.net
#
def test_process_recid_ins_12020(svc):
    """Test the deepest method to retrieve a record, on inspirehep.net.

    The stages normally performed by process_url and process_collection
    are reproduced by hand before calling process_recid.

    """
    # start from empty logs
    svc.collection_logs = []
    svc.logs = []

    # what process_url and process_collection would have configured
    query = "find cn LHCb and tc p and not tc c"
    hostname = "inspirehep.net"

    svc.harvester.host = hostname
    svc.harvester.collections = query

    svc.shelf = "literature"
    svc.store = build_store(hostname, shelf=svc.shelf)

    title = "LHCb / article / %s" % query
    svc.collection_logs.append(MsgCollection(title=title))

    # identifiers matching the query
    criteria = svc.store.search_parameters(query, year_start="2010")
    recids = svc.store.get_ids(**criteria)
    assert len(recids) == 3

    # process the first identifier returned by the store
    assert svc.process_recid(recids[0]) is None

    print(svc.logs)
    assert len(svc.logs) == 1

    last_log = svc.logs[-1]
    assert last_log.action is None
    assert last_log.txt is None

    # leave the service as it was found
    svc.collection_logs = []
    svc.logs = []

    del svc.harvester.host
    del svc.harvester.collections
    del svc.store
def test_process_collection_ins_12021(svc):
    """Process a collection on inspirehep.net.

    The configuration normally done by process_url is reproduced by hand.

    """
    query = "find cn LHCb and tc p and not tc c"
    hostname = "inspirehep.net"

    # what process_url would have configured
    svc.harvester.host = hostname
    svc.harvester.collections = query

    svc.shelf = "literature"
    svc.store = build_store(hostname, shelf=svc.shelf)

    # run the stage under test
    outcome = svc.process_collection(query)
    assert outcome is None

    # leave the service as it was found
    svc.collection_logs = []
    svc.logs = []

    del svc.harvester.host
    del svc.harvester.collections
    del svc.store
def test_process_url_cds_12022(svc):
    """Process the host inspirehep.net with a ``find`` query.

    NOTE(review): the name says "cds" but the host is inspirehep.net;
    the name was probably meant to be test_process_url_ins_12022.

    """
    outcome = svc.process_url(
        "inspirehep.net",
        "find cn LHCb and tc p and not tc c")

    assert outcome is None
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment