inveniostore.py 10.5 KB
Newer Older
1
""" store_tools.inveniostore

"""
4
import requests
5

6
from .exception import CdsException
7
from requests.adapters import HTTPAdapter
8
from builtins import getattr
9

LE GAC Renaud's avatar
LE GAC Renaud committed
10 11 12 13 14 15
# Keyword arguments accepted by InvenioStore.get_ids; any other keyword
# is rejected. Each key is listed once.
CDS_SEARCH_KEYS = ("req", "cc", "c", "ec", "p", "f", "rg", "sf", "so", "sp",
                   "rm", "of", "ot", "as", "p1", "f1", "m1", "op1", "p2", "f2",
                   "m2", "op2", "p3", "f3", "m3", "sc", "jrec", "recid",
                   "recidb", "sysno", "id", "idb", "sysnb", "action", "d1",
                   "d1y", "d1m", "d1d", "d2", "d2y", "d2m", "d2d", "dt",
                   "verbose", "ap", "ln")

# Messages carried by the CdsException raised in this module.
MSG_HTTP_DECODE = "Fail to decode HTTP response"
MSG_HTTP_ERROR = "HTTP Error"
MSG_INVALID_RESPONSE = "Invalid response"
MSG_NO_IDS = "Invalid list of record identifiers"
# placeholders: method name, store host, shelf
MSG_NOT_IMPLEMENTED = "Method '%s' not implemented for store %s and shelf %s"
MSG_WRONG_KEYWORD = "Invalid keyword argument"

# maximum number of identifiers to be collected at once.
# The value of 200 is the maximum value authorised using cds.cern.ch
N_IDS = 200


class InvenioStore:
    """Class to dialogue with `invenio <http://invenio-software.org/>`_ store.

    The class provides methods to request:
        * a list of identifiers satisfying search criteria.
        * a record identified by its id.
        * a single field of a record.
        * a list of JSON records matching a query.

    Args:
        api_record (str):
            base URL used to retrieve a record. The record identifier
            is appended to it as ``{api_record}/{rec_id}``.

        api_search (str):
            URL of the search service. It is used as it is by
            :meth:`get_ids` and as a prefix of the query by :meth:`search`.

        host (str):
            host name of the store. The retry policy is attached to
            ``https://{host}``. The methods :meth:`get_field` and
            :meth:`get_ids` only accept ``cds.cern.ch`` and
            ``old.inspirehep.net``.

        max_retries (int):
            number of retries handled by the HTTP session for requests
            sent to ``https://{host}``.

        shelf (str):
            section of the store containing records. It depends on the host.
            Possible values are ``None``, ``literature``, ``conferences``
            and ``institutions``

             +----------------+--------------+-----------------------------+
             | host           | shelf        | base API                    |
             +----------------+--------------+-----------------------------+
             | cds.cern.ch    | None         | https://cds.cern.ch/        |
             +----------------+--------------+-----------------------------+
             | inspirehep.net | None         | https://old.inspirehep.net/ |
             | inspirehep.net | literature   | https://old.inspirehep.net/ |
             | inspirehep.net | conferences  | https://inspirehep.net/     |
             | inspirehep.net | institutions | https://old.inspirehep.net/ |
             +----------------+--------------+-----------------------------+

    """

    # hosts for which get_field and get_ids are implemented and for which
    # get_record asks for the ``recjson`` output format
    _LEGACY_HOSTS = ("cds.cern.ch", "old.inspirehep.net")

    def __init__(self,
                 api_record=None,
                 api_search=None,
                 host=None,
                 max_retries=3,
                 shelf=None):

        self._api_search = api_search
        self._api_record = api_record
        self._host = host
        self._shelf = shelf
        self._url = None

        # start a session, a persistent connection with the server
        # let the session handle the number of retries
        session = requests.Session()
        session.mount(f"https://{host}", HTTPAdapter(max_retries=max_retries))
        self._session = session

    def __del__(self):
        # close the session
        # the attribute is missing when __init__ failed before creating it
        session = getattr(self, "_session", None)
        if session is not None:
            session.close()

    def _check_legacy_host(self, method):
        """Raise CdsException when *method* is not implemented for the host."""
        if self._host not in self._LEGACY_HOSTS:
            msg = MSG_NOT_IMPLEMENTED % (method, self._host, self._shelf)
            raise CdsException(msg)

    def get_field(self, rec_id, fieldname):
        """Retrieve the field value for the record identified by
        its *record id*.

        Note:
            The method is implemented for store, shelf pairs
            relying on ``cds.cern.ch`` and ``old.inspirehep.net``.

        Args:
            rec_id (int):
                record identifier in the store.

            fieldname (str):
                name of the field in the JSON record.

        Returns:
            * the value stored under *fieldname* in the record
            * None when the field is not found or when the response
              is not a list containing a single record

        Raises:
            CdsException:
                * method is not implemented for all store, shelf pairs.
                  It works for those relying on cds.cern.ch and
                  old.inspirehep.net
                * JSON object could not be decoded.

            requests.RequestException:
                something went wrong within the HTTP dialog,
                see :meth:`interrogate`.

        """
        self._check_legacy_host("get_field")

        url = f"{self._api_record}/{rec_id}"
        rep = self.interrogate(url, timeout=60, of="recjson", ot=fieldname)

        try:
            obj = rep.json()

        except ValueError as err:
            raise CdsException(MSG_HTTP_DECODE) from err

        # the value is extracted from a list containing a single record.
        # A missing field yields None instead of a KeyError.
        if isinstance(obj, list) and len(obj) == 1 \
                and isinstance(obj[0], dict):
            return obj[0].get(fieldname)

        return None

    def get_ids(self, **kwargs):
        """Return a list of *record id* matching search criteria.

        Note:
            The method is implemented for store, shelf pairs
            relying on ``cds.cern.ch`` and ``old.inspirehep.net``.

        Keyword Args:

            The keyword arguments are those of the invenio search engine and
            they depend on the version of invenio. Only the keys listed in
            ``CDS_SEARCH_KEYS`` are accepted. The values of ``of``, ``rg``
            and ``jrec`` are always overwritten by the method.

            See https://gitlab.in2p3.fr/limbra/limbra/-/blob/master/modules/store_tools/README.md
            for more information.

        Returns:
            list:
                * A list of numbers.
                * The list is empty when the server returns no identifier.

        Raises:
            CdsException:
                * Method not implemented for all store, shelf pairs.
                  It works for those relying on cds.cern.ch and
                  old.inspirehep.net
                * keyword argument is invalid;
                * JSON object can't be decoded;
                * not well formed list of ids.

            requests.RequestException:
                something went wrong within the HTTP dialog,
                see :meth:`interrogate`.

        """
        self._check_legacy_host("get_ids")

        for k in kwargs:
            if k not in CDS_SEARCH_KEYS:
                raise CdsException(MSG_WRONG_KEYWORD, k)

        # NOTE: the number of ids which can be collected at once is limited
        # to 10 on cds.cern.ch since the invenio version 1.1.3.1106 (Jun 2014)
        # Therefore to get the complete list of ids we have to scan them
        # by block of 200. The latter is the maximum value allowed by cds.
        # We use the parameter rg and jrec to steer the scan.
        # They have no effect on inspirehep.net.
        #
        # NOTE(review): the requests are sent with jrec=0, 200, 400, ...
        # If the server counts the records from 1, two consecutive blocks
        # may overlap by one identifier -- to be confirmed against the
        # invenio search API.
        kwargs["of"] = "id"
        kwargs["rg"] = N_IDS

        ids = []
        jrec = 0

        while True:
            kwargs["jrec"] = jrec
            rep = self.interrogate(self._api_search, timeout=30, **kwargs)

            try:
                li = rep.json()

            # trigger when the JSON object cannot be decoded
            except ValueError as err:
                raise CdsException(MSG_HTTP_DECODE) from err

            # check that the list of identifiers is well formed
            # [1291068, 1352722, 1376692] or [1493820] or []
            if not isinstance(li, list) \
                    or not all(isinstance(x, int) for x in li):
                raise CdsException(MSG_NO_IDS)

            ids.extend(li)

            # an incomplete block means that the scan is over
            if len(li) != N_IDS:
                break

            jrec += N_IDS

        return ids

    def get_record(self, rec_id):
        """Retrieve a record defined by its *record id*.

        Args:
            rec_id (int):
                record identifier in the store.

        Returns:
            dict:
                the record data (recjson). It is either the decoded
                response when it is a dictionary or the single element
                of the decoded list.

        Raises:
            CdsException:
                * JSON object could not be decoded.
                * the response is neither a dictionary nor a list
                  containing exactly one record.

            requests.RequestException:
                something went wrong within the HTTP dialog,
                see :meth:`interrogate`.

        """
        url = f"{self._api_record}/{rec_id}"

        # the recjson output format is only requested for these hosts
        kwargs = {}
        if self._host in self._LEGACY_HOSTS:
            kwargs["of"] = "recjson"

        rep = self.interrogate(url, timeout=30, **kwargs)

        try:
            obj = rep.json()

        except ValueError as err:
            raise CdsException(MSG_HTTP_DECODE) from err

        if isinstance(obj, dict):
            return obj

        if isinstance(obj, list) and len(obj) == 1:
            return obj[0]

        raise CdsException(MSG_INVALID_RESPONSE)

    def interrogate(self, url, timeout=10, **kwargs):
        """Interrogate the store using the *URL*.
        The request is sent through the session created by the constructor,
        which is configured with a maximum number of retries.

        Args:
            url (str):
                the URL string depends on the store and on the invenio
                version which is running, *e.g.*:

                    * ``https://cds.cern.ch/record/123456``
                    * ``https://cds.cern.ch/search``

                It is kept and can be retrieved later on with
                :meth:`last_search_url`.

            timeout (float):
                timeout for the HTTP request

        Keyword Args:

            The keyword arguments are sent as parameters of the HTTP
            request, *e.g.* ``of="recjson"``. They are those of the invenio
            search engine and they depend on the version of invenio.

            See https://gitlab.in2p3.fr/limbra/limbra/-/blob/master/modules/store_tools/README.md
            for more information.

        Returns:
            requests.Response:

        Raises:
            RequestException:
                something went wrong within the HTTP dialog.
                It includes the case where the server returns
                an HTTP error status.

        """
        # remember the URL before sending the request,
        # so that it is available even when the request fails
        self._url = url

        r = self._session.get(url, timeout=timeout, params=kwargs)
        r.raise_for_status()

        return r

    def last_search_url(self):
        """
        Returns:
            str:
                the URL given to the last call of :meth:`interrogate`,
                without the request parameters. It is None when the store
                has not been interrogated yet.

        """
        return self._url

    def search(self, query, **kwargs):
        """Return a list of *JSON record* matching search criteria.

        Note:
            The method is meant for store, shelf pairs
            relying on ``inspirehep.net``. The host is not checked.

        Args:
            query (str):
                query for the inspirehep store.
                Use the syntax of the web interface or elasticsearch one.
                It is appended as it is to the search URL.

        Keyword Args:

            elasticsearch keywords

        Returns:
            * list of JSON records

        Raises:
            CdsException:
                * JSON object could not be decoded.
                * the response does not contain the ``hits`` section.

            requests.RequestException:
                something went wrong within the HTTP dialog,
                see :meth:`interrogate`.

        """
        url = f"{self._api_search}{query}"

        rep = self.interrogate(url, timeout=30, **kwargs)

        try:
            obj = rep.json()

        except ValueError as err:
            raise CdsException(MSG_HTTP_DECODE) from err

        # the response is expected to be a dict with 3 keys:
        # 'hits', 'links', 'sort_options'
        # the hits section is a dict with 2 keys: hits (list), total (int)
        try:
            return obj["hits"]["hits"]

        except (KeyError, TypeError) as err:
            raise CdsException(MSG_INVALID_RESPONSE) from err