Commit 0146901a authored by LE GAC Renaud
Browse files

Complete InspirehepStore

parent 56f65a7e
......@@ -33,7 +33,7 @@ class InspirehepStore(BaseStore):
shelf=None):
super().__init__(api_record=f"https://{host}/api/{shelf}",
api_search=f"https://{host}/api/{shelf}/?q=",
api_search=f"https://{host}/api/{shelf}",
host=host,
max_retries=max_retries,
shelf=self)
......@@ -50,8 +50,7 @@ class InspirehepStore(BaseStore):
name of the field in the JSON record.
Returns:
* int
* str
* value
* None when the field is not found
Raises:
......@@ -60,17 +59,35 @@ class InspirehepStore(BaseStore):
* JSON object could not be decoded.
"""
kwargs = {
"q": f"recid={rec_id}",
"fields": fieldname}
try:
obj = self.search(**kwargs)
return obj[0]["metadata"][fieldname]
except (KeyError, TypeError):
raise CdsException(MSG_INVALID_RESPONSE)
def get_ids(self, **kwargs):
"""Return a list of *record id* matching search criteria.
Keyword Args:
The keyword arguments are those of the inspirehep search engine
More information in
The keyword arguments are those of the inspirehep search engine:
https://inspirehep.net/help/knowledge-base/inspire-paper-search/
q (str):
query string (inspire or elastic search style)
size (int):
number of records per page [10]
sort (str):
sort order. The value depends on the shelf, *e.g.* ``mostrecent``
or ``mostcited`` for ``literature``
Returns:
list:
* A list of numbers.
......@@ -84,6 +101,10 @@ class InspirehepStore(BaseStore):
* not well formed list of ids.
"""
# request only the field id
kwargs["fields"] = "recid"
lst = self.search(**kwargs)
return [int(dct["id"]) for dct in lst]
def get_record(self, rec_id):
"""Retrieve a record defined by its *record id*.
......@@ -94,7 +115,8 @@ class InspirehepStore(BaseStore):
Returns:
dict:
the record data (recjson).
the record data in JSON format:
https://inspire-schemas.readthedocs.io/en/latest/schemas/
Raises:
CdsException:
......@@ -103,54 +125,47 @@ class InspirehepStore(BaseStore):
* more than one record
"""
url = f"{self._api_record}/{rec_id}"
rep = self.interrogate(url, timeout=30)
def interrogate(self, url, timeout=10, **kwargs):
"""Interrogate the store using the *URL*.
It retries several times when the service is not available.
Args:
url (str):
# the response is a dict with 5 keys:
# 'id', 'created', 'updated', 'links' and 'metadata'
# More in https://github.com/inspirehep/rest-api-doc
try:
obj = rep.json()
return obj["metadata"]
timeout (float):
timeout for the HTTP request
except ValueError:
raise CdsException(MSG_HTTP_DECODE)
Keyword Args:
except (KeyError, TypeError):
raise CdsException(MSG_INVALID_RESPONSE)
The keyword arguments are those of the inspirehep search engine.
def search(self, **kwargs):
"""Return a list of *JSON record* matching search criteria.
More information in
Keyword Args:
The keyword arguments are those of the inspirehep search engine:
https://inspirehep.net/help/knowledge-base/inspire-paper-search/
Returns:
requests.Response:
q (str):
query string (inspire or elastic search style)
Raises:
RequestException:
something went wrong within the HTTP dialog
fields (str):
fields in metadata to be returned, *e.g.*
'titles,authors.full_name,authors.affiliations.record'
"""
size (int):
number of records per page [10]
def last_search_url(self):
"""
Returns:
str:
the URL used in the last search.
"""
return self._url
def search(self, query, **kwargs):
"""Return a list of *JSON record* matching search criteria.
Args:
query (str):
Use the syntax of the web interface or elasticsearch one.
Keyword Args:
elasticsearch keywords
sort (str):
sort order. The value depends on the shelf, *e.g.* ``mostrecent``
or ``mostcited`` for ``literature``
Returns:
* list of JSON records
The record format is defined in:
https://inspire-schemas.readthedocs.io/en/latest/schemas/
Raises:
CdsException:
......@@ -158,20 +173,30 @@ class InspirehepStore(BaseStore):
* JSON object could not be decoded.
"""
url = f"{self._api_search}{query}"
rep = self.interrogate(url, timeout=30, **kwargs)
try:
obj = rep.json()
except ValueError:
raise CdsException(MSG_HTTP_DECODE)
# the response is a dict with 3 keys: 'hits', 'links', 'sort_options'
# the hits section is a dict with 2 keys: hits (list), total (int)
try:
return obj["hits"]["hits"]
except (KeyError, TypeError):
raise CdsException(MSG_INVALID_RESPONSE)
records = []
url = self._api_search
# scan the store
# the response is a dict with 2 keys: 'hits' and 'links'
# the hits section is a list of dicts with the id key
#
# responses are paginated with a default of 10 records per page
# the keyword argument size changes the number of records per page
# scan all pages by using links.next
#
# More in https://github.com/inspirehep/rest-api-doc
while url is not None:
rep = self.interrogate(url, timeout=30, **kwargs)
try:
obj = rep.json()
records.extend(obj["hits"]["hits"])
url = obj["links"].get("next", None)
except ValueError:
raise CdsException(MSG_HTTP_DECODE)
except (KeyError, TypeError):
raise CdsException(MSG_INVALID_RESPONSE)
return records
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment