""" invenio_tools.inveniostore

Thin HTTP client used to query an Invenio store (search for record
identifiers, fetch a single record).

"""
import json
import re
import time

import requests
from requests.adapters import HTTPAdapter

from .exception import CdsException


CDS_SEARCH_KEYS = ("req", "cc", "c", "ec", "p", "f", "rg", "sf", "so", "sp",
                   "rm", "of", "ot", "as", "p1", "f1", "m1", "op1", "p2",
                   "f2", "m2", "op2", "p3", "f3", "m3", "sc", "jrec", "recid",
                   "recidb", "sysno", "id", "idb", "sysnb", "action", "d1",
                   "d1y", "d1m", "d1d", "d2", "d2y", "d2m", "d2d", "dt",
                   "verbose", "ap", "ln", "ec")

MSG_HTTP_ERROR = "HTTP Error"
MSG_NO_IDS = "Invalid list of record identifiers"
MSG_WRONG_KEYWORD = "Invalid keyword argument"

# Maximum number of identifiers to be collected at once.
# The value of 200 is the maximum value authorised using cds.cern.ch
N_IDS = 200


class InvenioStore(object):
    """Class to dialogue with an Invenio store.

    The class provides methods to request:

        * a list of identifiers satisfying search criteria.
        * a record identified by its id.

    """

    def __init__(self, host="cds.cern.ch"):
        """
        Args:
            host (str): possible values are ``cds.cern.ch`` or
                ``inspirehep.net``.

        """
        self._host = host
        self._url = None

        # Start a session, i.e. a persistent connection with the server,
        # and let the session handle the number of retries.
        session = requests.Session()
        session.mount(f"http://{host}", HTTPAdapter(max_retries=3))

        self._session = session

    def __del__(self):
        # Close the session. The attribute may be missing when __init__
        # failed before the session was created, hence the guarded lookup.
        session = getattr(self, "_session", None)
        if session is not None:
            session.close()

    def interogate(self, url, timeout=10, params=None):
        """Interrogate the store using the *URL*.

        Retries are handled by the HTTP adapter mounted on the session.

        Args:
            url (str): URL string
            timeout (float): timeout for the HTTP request
            params (dict): parameters to be sent with the URL

        Returns:
            str: the body of the HTTP response decoded as UTF-8.

        Raises:
            RequestException: something went wrong within the HTTP dialog

        """
        # Remember the URL so that last_search_url can report it.
        self._url = url

        r = self._session.get(url, timeout=timeout, params=params)
        r.raise_for_status()

        r.encoding = "utf-8"
        return r.text

    def get_ids(self, **kwargs):
        """Return a list of *record id* matching search criteria.

        Search criteria are defined by the keyword arguments:

        Keyword Args:
            cc (str): current collection (e.g. "ATLAS Papers").
                The collection the user started to search/browse from.
            p (str): pattern to search for (e.g. "ellis and muon or kaon").
            f (str): field to search within (e.g. "author").
            p1 (str): first pattern to search for in the advanced search
                interface. Much like **p**.
            f1 (str): first field to search within in the advanced search
                interface. Much like **f**.
            m1 (str): first matching type in the advanced search interface.
                ("a" all of the words, "o" any of the words,
                "e" exact phrase, "p" partial phrase,
                "r" regular expression).
            op1 (str): first operator, to join the first and the second
                unit in the advanced search interface.
                ("a" add, "o" or, "n" not).
            p2 (str): second pattern to search for in the advanced search
                interface. Much like **p**.
            f2 (str): second field to search within in the advanced search
                interface. Much like **f**.
            m2 (str): second matching type in the advanced search
                interface. ("a" all of the words, "o" any of the words,
                "e" exact phrase, "p" partial phrase,
                "r" regular expression).
            op2 (str): second operator, to join the second and the third
                unit in the advanced search interface.
                ("a" add, "o" or, "n" not).
            p3 (str): third pattern to search for in the advanced search
                interface. Much like **p**.
            f3 (str): third field to search within in the advanced search
                interface. Much like **f**.
            m3 (str): third matching type in the advanced search
                interface. ("a" all of the words, "o" any of the words,
                "e" exact phrase, "p" partial phrase,
                "r" regular expression).

        The complete list of keyword arguments can be found at
        http://invenio-demo.cern.ch/help/hacking/search-engine-api.

        Note:
            The keywords ``of``, ``rg`` and ``jrec`` are overwritten by
            this method in order to steer the scan.

        Returns:
            list: a list of numbers. The list is empty when the server
            returns no identifier.

        Raises:
            CdsException:
                * keyword argument is invalid;
                * JSON object can't be decoded
                * not well formed list of ids.
            RequestException: the HTTP dialog failed (propagated unchanged
                from :meth:`interogate`).

        """
        for k in kwargs:
            if k not in CDS_SEARCH_KEYS:
                raise CdsException(MSG_WRONG_KEYWORD, k)

        ids = []

        # NOTE: the number of ids which can be collected at once is limited
        # to 10 on cds.cern.ch since the invenio version 1.1.3.1106
        # (Jun 2014). Therefore to get the complete list of ids we have to
        # scan them by block of 200. The latter is the maximum value
        # allowed by cds. We use the parameters rg and jrec to steer the
        # scan. They have no effect on inspirehep.net.
        #
        # NOTE(review): jrec takes the values 0, 200, 400, ... If the server
        # treats jrec as a 1-based index, consecutive blocks may overlap by
        # one record -- confirm against the search engine API.
        kwargs["of"] = "id"
        kwargs["rg"] = N_IDS
        kwargs["jrec"] = -N_IDS

        url = "http://%s/search" % self._host

        while True:
            kwargs["jrec"] += N_IDS
            rep = self.interogate(url, params=kwargs)

            # interogate returns the response body as text, so the JSON
            # payload has to be decoded here.
            try:
                li = json.loads(rep)
            except ValueError as e:
                # triggered when the JSON object cannot be decoded
                raise CdsException(e) from e

            # check that the list of identifiers is well formed:
            # [1291068, 1352722, 1376692] or [1493820] or []
            if not isinstance(li, list) or \
                    any(not isinstance(x, int) for x in li):
                raise CdsException(MSG_NO_IDS)

            ids.extend(li)

            # a block which is not full is the last one
            if len(li) != N_IDS:
                break

        return ids

    def get_record(self, rec_id):
        """Retrieve a record defined by its *record id*.

        Args:
            rec_id (int): record identifier in the store.

        Returns:
            dict: the record data (MarcJSON), i.e. the first element of
            the JSON list returned by the server.

        Raises:
            CdsException: no JSON object could be decoded.
            RequestException: the HTTP dialog failed (propagated unchanged
                from :meth:`interogate`).

        """
        # Attribute kept for backward compatibility; it is not read
        # anywhere in this class.
        self._try = 0

        url = "http://%s/record/%s" % (self._host, rec_id)
        rep = self.interogate(url, params={"of": "recjson"})

        # interogate returns the response body as text, so the JSON
        # payload has to be decoded here.
        try:
            li = json.loads(rep)
        except ValueError as e:
            raise CdsException(e) from e

        return li[0]

    def last_search_url(self):
        """
        Returns:
            str: the URL passed to the last call of :meth:`interogate`,
            or ``None`` when no request has been issued yet.

        """
        return self._url