inveniostore.py 11.2 KB
Newer Older
1
""" store_tools.inveniostore
2 3

"""
4
import requests
5

6
from .exception import CdsException
7
from requests.adapters import HTTPAdapter
8
from builtins import getattr
9

LE GAC Renaud's avatar
LE GAC Renaud committed
10 11 12 13 14 15
# Keyword arguments accepted by the invenio search engine.
# Each key is listed once ("ec" used to appear twice).
CDS_SEARCH_KEYS = ("req", "cc", "c", "ec", "p", "f", "rg", "sf", "so", "sp",
                   "rm", "of", "ot", "as", "p1", "f1", "m1", "op1", "p2", "f2",
                   "m2", "op2", "p3", "f3", "m3", "sc", "jrec", "recid",
                   "recidb", "sysno", "id", "idb", "sysnb", "action", "d1",
                   "d1y", "d1m", "d1d", "d2", "d2y", "d2m", "d2d", "dt",
                   "verbose", "ap", "ln")

# aliases accepted for the host argument of InvenioStore
CDS = ("cds", "cds.cern.ch")
INS = ("inspirehep", "inspirehep.net")

# messages used when raising CdsException
MSG_HTTP_DECODE = "Fail to decode HTTP response"
MSG_HTTP_ERROR = "HTTP Error"
MSG_INVALID_RESPONSE = "Invalid response"
MSG_NO_IDS = "Invalid list of record identifiers"
MSG_NO_SHELF = "No shelf %s for store %s"
MSG_NOT_IMPLEMENTED = "Method '%s' not implemented for store %s and shelf %s"
MSG_WRONG_KEYWORD = "Invalid keyword argument"

# maximum number of identifiers to be collected at once.
# The value of 200 is the maximum value authorised using cds.cern.ch
N_IDS = 200


class InvenioStore(object):
    """Class to dialogue with `invenio <http://invenio-software.org/>`_ store.

    The class provides methods to request:
        * a list of identifier satisfying search criteria.
        * a record identified by its id.

    Args:
        host (str):
            possible values are ``cds``, ``cds.cern.ch``, ``inspirehep``
            or ``inspirehep.net``

        shelf (str):
            section of the store containing records. It depends on the host.
            Possible values are ``None``, ``literature``, ``conferences``
            and ``institutions``. The shelf is not checked for the
            ``cds`` hosts.

             +----------------+--------------+-----------------------------+
             | host           | shelf        | base API                    |
             +----------------+--------------+-----------------------------+
             | cds.cern.ch    | None         | https://cds.cern.ch/        |
             +----------------+--------------+-----------------------------+
             | inspirehep.net | None         | https://old.inspirehep.net/ |
             | inspirehep.net | literature   | https://old.inspirehep.net/ |
             | inspirehep.net | conferences  | https://inspirehep.net/     |
             | inspirehep.net | institutions | https://old.inspirehep.net/ |
             +----------------+--------------+-----------------------------+

    Raises:
        CdsException:
            the host, shelf pair is not supported.

    """

    def __init__(self, host="cds", shelf=None):

        self._shelf = shelf
        self._url = None

        # base url for the API
        if host in CDS:
            api_search = "https://cds.cern.ch/search"
            api_record = "https://cds.cern.ch/record"
            host = "cds.cern.ch"

        elif host in INS and shelf in (None, "literature", "institutions"):
            api_search = "https://old.inspirehep.net/search"
            api_record = "https://old.inspirehep.net/record"
            host = "old.inspirehep.net"

        elif host in INS and shelf in ("conferences",):
            api_search = "https://inspirehep.net/api/conferences/?q="
            api_record = "https://inspirehep.net/api/conferences"
            host = "inspirehep.net"

        else:
            raise CdsException(MSG_NO_SHELF % (shelf, host))

        # start a session, a persistent connection with the server
        # let the session handle the number of retry
        session = requests.Session()
        session.mount(f"https://{host}", HTTPAdapter(max_retries=3))

        self._api_search = api_search
        self._api_record = api_record
        self._host = host
        self._session = session

    def __del__(self):
        # close the session
        # (the attribute is missing when __init__ raised before creating it)
        if getattr(self, "_session", None) is not None:
            self._session.close()

    def get_field(self, rec_id, fieldname):
        """Retrieve the field value for the record identified by
        its *record id*.

        Note:
            The method is implemented for store, shelf pairs
            relying on ``cds.cern.ch`` and ``old.inspirehep.net``.

        Args:
            rec_id (int):
                record identifier in the store.

            fieldname (str):
                name of the field in the JSON record.

        Returns:
            * the field value as decoded from the JSON response
            * None when the field is not found or when the response
              is not a list containing exactly one record

        Raises:
            CdsException:
                * method is not implemented for all store, shelf pairs.
                  It works for those relying on cds.cern.ch and
                  old.inspirehep.net
                * JSON object could not be decoded.

            requests.RequestException:
                propagated from :meth:`interrogate` when the HTTP dialog
                failed or the server returned an HTTP error.

        """
        host = self._host
        if host not in ("cds.cern.ch", "old.inspirehep.net"):
            msg = MSG_NOT_IMPLEMENTED % ("get_field", host, self._shelf)
            raise CdsException(msg)

        url = f"{self._api_record}/{rec_id}"
        rep = self.interrogate(url, timeout=60, of="recjson", ot=fieldname)

        try:
            obj = rep.json()

        except ValueError as err:
            raise CdsException(MSG_HTTP_DECODE) from err

        # a missing field yields None instead of a KeyError
        if isinstance(obj, list) and len(obj) == 1:
            record = obj[0]
            if isinstance(record, dict):
                return record.get(fieldname)

        return None

    def get_ids(self, **kwargs):
        """Return a list of *record id* matching search criteria.

        Note:
            The method is implemented for store, shelf pairs
            relying on ``cds.cern.ch`` and ``old.inspirehep.net``.

        Keyword Args:

            The keyword arguments are those of the invenio search engine and
            they depend on the version of invenio. The keys ``of``, ``rg``
            and ``jrec`` are overwritten since they steer the scan.

            See https://gitlab.in2p3.fr/limbra/limbra/-/blob/master/modules/store_tools/README.md
            for more information.

        Returns:
            list:
                * A list of numbers, each identifier appearing once.
                * The list is empty when the server returns no identifier.

        Raises:
            CdsException:

                * Method not implemented for all store, shelf pairs.
                  It works for those relying on cds.cern.ch and
                  old.inspirehep.net
                * keyword argument is invalid;
                * JSON object can't be decoded;
                * not well formed list of ids.

            requests.RequestException:
                propagated from :meth:`interrogate` when the HTTP dialog
                failed or the server returned an HTTP error.

        """
        host = self._host
        if host not in ("cds.cern.ch", "old.inspirehep.net"):
            msg = MSG_NOT_IMPLEMENTED % ("get_ids", host, self._shelf)
            raise CdsException(msg)

        for k in kwargs:
            if k not in CDS_SEARCH_KEYS:
                raise CdsException(MSG_WRONG_KEYWORD, k)

        # NOTE: the number of ids which can be collected at once is limited
        # to 10 on cds.cern.ch since the invenio version 1.1.3.1106 (Jun 2014)
        # Therefore to get the complete list of ids we have to scan them
        # by block of 200. The latter is the maximum value allowed by cds.
        # We use the parameter rg and jrec to steer the scan.
        # They have no effect on inspirehep.net.
        kwargs["of"] = "id"
        kwargs["rg"] = N_IDS
        kwargs["jrec"] = 0

        ids = []
        seen = set()

        while True:
            rep = self.interrogate(self._api_search, timeout=30, **kwargs)

            # trigger when the JSON object cannot be decoded
            try:
                page = rep.json()

            except ValueError as err:
                raise CdsException(MSG_HTTP_DECODE) from err

            # check that the list of identifier is well formed
            # [1291068, 1352722, 1376692] or [1493820] or []
            if not isinstance(page, list):
                raise CdsException(MSG_NO_IDS)

            if any(not isinstance(el, int) for el in page):
                raise CdsException(MSG_NO_IDS)

            # keep each identifier once, whatever the overlap between blocks
            n_before = len(ids)
            for el in page:
                if el not in seen:
                    seen.add(el)
                    ids.append(el)

            # the scan is over when the block is not full.
            # It is also stopped when a full block brings nothing new,
            # which happens when the server ignores rg and jrec; otherwise
            # the loop would never end.
            if len(page) != N_IDS or len(ids) == n_before:
                break

            kwargs["jrec"] += N_IDS

        return ids

    def get_record(self, rec_id):
        """Retrieve a record defined by its *record id*.

        Args:
            rec_id (int):
                record identifier in the store.

        Returns:
            dict:
                the record data (recjson).

        Raises:
            CdsException:

                * JSON object could not be decoded.
                * the response is neither a dictionary nor a list
                  containing exactly one record.

            requests.RequestException:
                propagated from :meth:`interrogate` when the HTTP dialog
                failed or the server returned an HTTP error.

        """
        url = f"{self._api_record}/{rec_id}"

        # the output format is only requested on the invenio legacy hosts
        kwargs = {}
        if self._host in ("cds.cern.ch", "old.inspirehep.net"):
            kwargs = {"of": "recjson"}

        rep = self.interrogate(url, timeout=30, **kwargs)

        try:
            obj = rep.json()

        except ValueError as err:
            raise CdsException(MSG_HTTP_DECODE) from err

        if isinstance(obj, dict):
            return obj

        if isinstance(obj, list) and len(obj) == 1:
            return obj[0]

        raise CdsException(MSG_INVALID_RESPONSE)

    def interrogate(self, url, timeout=10, **kwargs):
        """Interrogate the store using the *URL*.
        The number of retries is handled by the session.

        Args:
            url (str):
                the URL string depends on the store and on the invenio
                version which is running, *e.g.*:

                    * ``https://cds.cern.ch/record/123456``
                    * ``https://cds.cern.ch/search``

            timeout (float):
                timeout for the HTTP request

        Keyword Args:

            The keyword arguments are those of the invenio search engine and
            they depend on the version of invenio. They are sent as the
            query parameters of the request, *e.g.* ``of="recjson"``.

            See https://gitlab.in2p3.fr/limbra/limbra/-/blob/master/modules/store_tools/README.md
            for more information.

        Returns:
            requests.Response:

        Raises:
            RequestException:
                something went wrong within the HTTP dialog or
                the server returned an HTTP error status.

        """
        # keep the URL (without the query parameters) for last_search_url
        self._url = url

        r = self._session.get(url, timeout=timeout, params=kwargs)
        r.raise_for_status()

        return r

    def last_search_url(self):
        """
        Returns:
            str:
                the URL passed to the last call of :meth:`interrogate`,
                without the query parameters. None before the first call.

        """
        return self._url

    def search(self, query, **kwargs):
        """Return a list of *JSON record* matching search criteria.

        Note:
            The method is implemented for store, shelf pairs
            relying on ``inspirehep.net``.

        Args:
            query (str):
                query for the inspirehep store.
                Use the syntax of the web interface or elasticsearch one.
                It is appended as it is to the search URL.

        Keyword Args:

            elasticsearch keywords

        Returns:
            * list of JSON records

        Raises:
            CdsException:

                * method is not implemented for all store, shelf pairs.
                  It works for those relying on inspirehep.net
                * JSON object could not be decoded.
                * the response does not contain the hits section.

            requests.RequestException:
                propagated from :meth:`interrogate` when the HTTP dialog
                failed or the server returned an HTTP error.

        """
        # the search URL of the other hosts does not end with the query
        # prefix, thus appending the query would build a wrong URL
        host = self._host
        if host != "inspirehep.net":
            msg = MSG_NOT_IMPLEMENTED % ("search", host, self._shelf)
            raise CdsException(msg)

        url = f"{self._api_search}{query}"

        rep = self.interrogate(url, timeout=30, **kwargs)

        try:
            obj = rep.json()

        except ValueError as err:
            raise CdsException(MSG_HTTP_DECODE) from err

        # the response is a dict with 3 keys: 'hits', 'links', 'sort_options'
        # the hits section is a dict with 2 keys: hits (list), total (int)
        try:
            return obj["hits"]["hits"]

        except (KeyError, TypeError) as err:
            raise CdsException(MSG_INVALID_RESPONSE) from err