inveniostore.py 7.1 KB
Newer Older
1 2 3 4 5
""" invenio_tools.inveniostore

"""
import json
import re
6
import requests
7 8
import time

9
from .exception import CdsException
10
from requests.adapters import HTTPAdapter
11 12


LE GAC Renaud's avatar
LE GAC Renaud committed
13 14 15 16 17 18
# Keyword arguments accepted by InvenioStore.get_ids; any other keyword is
# rejected before a request is sent. Each key is listed once (the original
# tuple listed "ec" twice).
CDS_SEARCH_KEYS = ("req", "cc", "c", "ec", "p", "f", "rg", "sf", "so", "sp",
                   "rm", "of", "ot", "as", "p1", "f1", "m1", "op1", "p2", "f2",
                   "m2", "op2", "p3", "f3", "m3", "sc", "jrec", "recid",
                   "recidb", "sysno", "id", "idb", "sysnb", "action", "d1",
                   "d1y", "d1m", "d1d", "d2", "d2y", "d2m", "d2d", "dt",
                   "verbose", "ap", "ln")
19

20 21 22
# Messages passed to CdsException.
# NOTE(review): MSG_HTTP_ERROR is not referenced by the code visible in this
# module; it may be used by importers -- confirm before removing.
MSG_HTTP_ERROR = "HTTP Error"
# Used by get_ids when the server reply is not a list of integer identifiers.
MSG_NO_IDS = "Invalid list of record identifiers"
# Used by get_ids when a keyword argument is not in CDS_SEARCH_KEYS.
MSG_WRONG_KEYWORD = "Invalid keyword argument"
23

24 25
# Maximum number of identifiers requested from the store in a single call.
# 200 is the largest value authorised by cds.cern.ch.
N_IDS = 200


class InvenioStore(object):
    """Class to dialogue with an `invenio <http://invenio-software.org/>`_ store.

    The class provides methods to request:
        * a list of identifiers satisfying search criteria.
        * a record identified by its id.

    """

    def __init__(self, host="cds.cern.ch"):
        """
        Args:
            host (str): possible values are ``cds.cern.ch``
                or ``inspirehep.net``.

        """
        self._host = host
        self._url = None

        # Start a session, i.e. a persistent connection with the server,
        # and let its HTTP adapter handle the number of retries.
        session = requests.Session()
        session.mount(f"http://{host}", HTTPAdapter(max_retries=3))

        self._session = session

    def __del__(self):
        # Close the session. The attribute does not exist when __init__
        # failed before creating it; a finalizer must not raise in that case.
        session = getattr(self, "_session", None)
        if session is not None:
            session.close()

    def interogate(self, url, timeout=10, params=None):
        """Interrogate the store using the *URL*.

        Retries are delegated to the HTTP adapter mounted on the session
        in the constructor.

        Args:
            url (str): URL string
            params (dict): parameters to be sent with the URL

            timeout (float):
                timeout for the HTTP request

        Returns:
            str: the body of the HTTP response, decoded as UTF-8

        Raises:
            RequestException:
                something went wrong within the HTTP dialog, including
                an HTTP error status returned by the server

        """
        # Remembered for last_search_url(); query parameters are not included.
        self._url = url

        r = self._session.get(url, timeout=timeout, params=params)
        r.raise_for_status()

        # Force the decoding instead of relying on the guessed encoding.
        r.encoding = "utf-8"
        return r.text

    def get_ids(self, **kwargs):
        """Return a list of *record id* matching search criteria.

        Search criteria are defined by the keywords arguments:

        Keyword Args:
            cc (str): current collection (e.g. "ATLAS Papers").
                The collection the user started to search/browse from.

            p (str): pattern to search for (e.g. "ellis and muon or kaon").

            f (str): field to search within (e.g. "author").

            p1 (str): first pattern to search for in the advanced search
                interface.  Much like **p**.

            f1 (str): first field to search within in the advanced search
                interface.  Much like **f**.

            m1 (str): first matching type in the advanced search interface.
                ("a" all of the words, "o" any of the words, "e" exact
                phrase, "p" partial phrase, "r" regular expression).

            op1 (str): first operator, to join the first and the second unit
                in the advanced search interface.  ("a" add, "o" or, "n" not).

            p2 (str): second pattern to search for in the advanced search
                interface.  Much like **p**.

            f2 (str): second field to search within in the advanced search
                interface.  Much like **f**.

            m2 (str): second matching type in the advanced search interface.
                ("a" all of the words, "o" any of the words, "e" exact
                phrase, "p" partial phrase, "r" regular expression).

            op2 (str): second operator, to join the second and the third unit
                in the advanced search interface.  ("a" add, "o" or, "n" not).

            p3 (str): third pattern to search for in the advanced search
                interface.  Much like **p**.

            f3 (str): third field to search within in the advanced search
                interface.  Much like **f**.

            m3 (str): third matching type in the advanced search interface.
                ("a" all of the words, "o" any of the words, "e" exact
                phrase, "p" partial phrase, "r" regular expression).

        The complete list of keyword arguments can be found at
        http://invenio-demo.cern.ch/help/hacking/search-engine-api.

        The keywords ``of``, ``rg`` and ``jrec`` are always overwritten
        since they steer the scan of the identifiers.

        Returns:
            list: a list of numbers.
                The list is empty when the server returns no identifier.

        Raises:
            CdsException:
                * keyword argument is invalid;
                * JSON object can't be decoded;
                * not well formed list of ids.
            RequestException:
                propagated from :meth:`interogate`, e.g. when the server
                returns an HTTP error.

        """
        for k in kwargs:
            if k not in CDS_SEARCH_KEYS:
                raise CdsException(MSG_WRONG_KEYWORD, k)

        ids = []

        # NOTE: the number of ids which can be collected at once is limited
        # on cds.cern.ch since the invenio version 1.1.3.1106 (Jun 2014).
        # Therefore to get the complete list of ids we have to scan them
        # by blocks of N_IDS, the maximum value allowed by cds.
        # We use the parameters rg and jrec to steer the scan.
        # They have no effect on inspirehep.net.
        #
        # NOTE(review): jrec takes the values 0, N_IDS, 2 * N_IDS, ... exactly
        # as before; confirm that consecutive windows do not overlap on the
        # target server.
        kwargs["of"] = "id"
        kwargs["rg"] = N_IDS
        kwargs["jrec"] = 0

        url = f"http://{self._host}/search"

        while True:
            rep = self.interogate(url, params=kwargs)

            # interogate returns the body as text, decode it here
            try:
                li = json.loads(rep)

            # triggered when the JSON object cannot be decoded
            except ValueError as e:
                raise CdsException(e) from e

            # check that the list of identifiers is well formed
            # [1291068, 1352722, 1376692] or [1493820] or []
            if not isinstance(li, list):
                raise CdsException(MSG_NO_IDS)

            if any(not isinstance(x, int) for x in li):
                raise CdsException(MSG_NO_IDS)

            ids.extend(li)

            # an incomplete block means that the scan reached the end
            if len(li) != N_IDS:
                break

            kwargs["jrec"] += N_IDS

        return ids

    def get_record(self, rec_id):
        """Retrieve a record defined by its *record id*.

        Args:
            rec_id (int): record identifier in the store.

        Returns:
            dict:
                the record data (MarcJSON), i.e. the first element of
                the JSON list returned by the server.

        Raises:
            CdsException:
                no JSON object could be decoded.
            RequestException:
                propagated from :meth:`interogate`, e.g. when the server
                returns an HTTP error.

        """
        # Kept from the original implementation; not read in this class.
        self._try = 0

        url = f"http://{self._host}/record/{rec_id}"
        rep = self.interogate(url, params={"of": "recjson"})

        # interogate returns the body as text, decode it here
        try:
            li = json.loads(rep)

        except ValueError as e:
            raise CdsException(e) from e

        # NOTE(review): assumes the reply is a non-empty JSON list -- confirm
        # how the server answers for an unknown identifier.
        return li[0]

    def last_search_url(self):
        """
        Returns:
            str: the URL passed to the last call of :meth:`interogate`
                (without the query parameters), or ``None`` when no
                request has been sent yet.

        """
        return self._url