factory.py 11.8 KB
Newer Older
1
""" store_tools.factory
2 3

"""
4
import re
LE GAC Renaud's avatar
LE GAC Renaud committed
5

6 7 8
from .base import (CDS,
                   INS,
                   is_conference,
9 10 11 12 13
                   is_institute,
                   is_thesis,
                   MSG_INV_CONF,
                   MSG_INV_CONF_KEY,
                   MSG_NO_CONF,
14
                   MSG_NO_SHELF,
15
                   REG_CONF)
16

17
from datetime import datetime
18
from .exception import CdsException
19
from .inveniostore import InvenioStore
20 21 22 23
from .recordconf import RecordConf
from .recordinst import RecordInst
from .recordpubli import RecordPubli
from .recordthesis import RecordThesis
24

25 26 27
# Patterns capturing the value following the ``$$t`` and ``$$u`` markers
# in the text returned for the field 110 of an institute record.
# Raw strings are used since ``\$`` and ``\w`` are not valid escape
# sequences in an ordinary string literal.
REX_T = r"\$\$t([\w, ]+)"
REX_U = r"\$\$u([\w, ]+)"

28

LE GAC Renaud's avatar
LE GAC Renaud committed
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
def add_affiliation_keys(recjson):
    """Add the affiliation keys to the record describing an institute:

        * The XML record contains the affiliation keys used by inspirehep.net.
          They are located in the field 110__u and 110__t (future).

        * The JSON record does not contain this information.

        * This tool adds the affiliation keys to the JSON record.
          They are located:

             +----------------+-------------------------------+
             | field (limbra) | subfield                      |
             +----------------+-------------------------------+
             | corporate_note | identifier, future_identifier |
             +----------------+-------------------------------+

    Args:
        recjson (dict):
            record data (MarcJSON). It has to contain the key ``recid``
            and it is modified in place.

    """
    inspire = InvenioStore("inspirehep", shelf="institutions")
    record_url = f"https://old.inspirehep.net/record/{recjson['recid']}"

    # text of the field 110 of the institute record
    field_110 = inspire.interrogate(record_url, ot="110", of="txt").text

    # NOTE(review): re.search returns None when a subfield is missing,
    # in which case .group raises AttributeError -- confirm that both
    # $$u and $$t are always present in the reply.
    identifier = re.search(REX_U, field_110).group(1)
    future_identifier = re.search(REX_T, field_110).group(1)

    recjson["corporate_note"] = {
        "identifier": identifier,
        "future_identifier": future_identifier}
LE GAC Renaud's avatar
LE GAC Renaud committed
61 62


63 64 65
def add_conference_data(recjson):
    """Add the conference data to the recjson.

    Note:
        Encoding of conference information depends on the store.

    It adds the following field and subfield::

         +---------------+-----------------------------------------------+
         | field         | subfield                                      |
         +---------------+-----------------------------------------------+
         | meeting_name  | closing_date, coference_code, country, date,  |
         |               | location, meeting, opening_date, year         |
         | meeting_note  | recid, url                                    |
         +---------------+-----------------------------------------------+

    For cds.cern.ch, ``meeting_name`` is copied as it is from the conference
    record. For inspirehep.net, it is a list containing one dictionary
    built from the conference record.

    Args:
        recjson (dict):
            record data (MarcJSON). It is modified in place.

    Note:
        * Fields are not added when there is no conference identifier and
          no conference key in the recjson.
        * Fields are not added when the retrieval of the conference record
          raises a CdsException.
        * The method CheckAndFix.is_conference will identify that case.

    """
    # ........................................................................
    #
    # Retrieve conference identifier and the host
    #     - the algorithm depends on the store
    #     - for cds use aleph_linking_page
    #     - for inspire use publication_info.cnum
    #
    conf_id, conf_key, host = None, None, None

    if "aleph_linking_page" in recjson:
        di = recjson["aleph_linking_page"]
        conf_id = di["sysno"]
        conf_key = di.get("up_link")
        host = "cds.cern.ch"

    elif "publication_info" in recjson:
        data = recjson["publication_info"]
        data = data if isinstance(data, list) else [data]

        # the first entry with a cnum defines the conference
        for di in data:
            if "cnum" in di:
                conf_key = di["cnum"]
                host = "inspirehep.net"
                break

    if conf_id is None and conf_key is None:
        return

    # ........................................................................
    #
    # Get conference data
    #     - the identifier is preferred to the key
    #
    if conf_id is not None:
        conf_id = conf_id if isinstance(conf_id, int) else int(conf_id)
        kwargs = {"conf_id": conf_id}

    else:
        kwargs = {"key": conf_key}

    try:
        confjson = get_conference_data(host, **kwargs)

    except CdsException:
        return

    # ........................................................................
    #
    # Add conference data to the recjson (cds.cern.ch)
    #
    if host in CDS:
        # extract the conference url
        #   - information is in confjson[url]
        #   - in most of the case it is a dictionary
        #   - when it is a list take the first entry which is for the
        #     home page while the second one is for the proceeding (cds 2270940)
        #   - in other case the url is not defined (cds 2258914)
        confurl = ""
        if "url" in confjson:
            obj = confjson["url"]
            confurl = obj["url"] if isinstance(obj, dict) else obj[0]["url"]

        recjson["meeting_name"] = confjson["meeting_name"]
        recjson["meeting_note"] = {"recid": confjson["recid"], "url": confurl}

    # ........................................................................
    #
    # Add conference data to the recjson (inspirehep.net)
    #
    elif host in INS:
        # location of the conference: first address with a country
        # NOTE(review): raises IndexError when no address has a country
        address = [el for el in confjson["addresses"] if el.get("country")][0]

        # date of the conference 6-12 Dec 2010
        start, end = confjson["opening_date"], confjson["closing_date"]
        ds = datetime.strptime(start, "%Y-%m-%d")
        de = datetime.strptime(end, "%Y-%m-%d")

        # the compact form is only valid within the same month of the
        # same year
        if (ds.year, ds.month) == (de.year, de.month):
            sdate = f"{ds.day}-{de.day} " + ds.strftime("%b %Y")

        else:
            # the day attribute is used instead of the directive %-d
            # which is not supported by strftime on every platform
            sdate = (f"{ds.day} {ds.strftime('%b')} - "
                     f"{de.day} {de.strftime('%b %Y')}")

        # URL of the conference (take the first value)
        urls = confjson.get("urls")
        if urls is None:
            url = ""
        elif isinstance(urls, list) and len(urls) > 0:
            url = urls[0]["value"]
        else:
            url = "???"

        # add
        recjson["meeting_name"] = [{
            "closing_date": end,
            "coference_code": confjson["cnum"],
            "country": address["country_code"],
            "date": sdate,
            "location": f"{address['cities'][0]}, {address['country']}",
            "meeting": confjson["titles"][0]["title"],
            "opening_date": start,
            "year": confjson["opening_date"][:4]}]

        recjson["meeting_note"] = {
            "recid": confjson["control_number"],
            "url": url}
193 194 195


def build_record(recjson):
    """Transform a JSON object into a record.

    The record type is selected by testing, in that order, whether the
    data describe a conference, an institute or a thesis. Any other data
    are treated as a publication. Conference data and affiliation keys
    are added to the recjson before building a conference or an institute
    record, respectively.

    Args:
        recjson (dict):
            record data in a JSON format.

    Returns:
        Record:
            either RecordConf, RecordInst, RecordPubli or RecordThesis

    """
    if is_conference(recjson):
        add_conference_data(recjson)
        return RecordConf(recjson)

    if is_institute(recjson):
        add_affiliation_keys(recjson)
        return RecordInst(recjson)

    # neither a conference nor an institute: no extra data are required
    record_class = RecordThesis if is_thesis(recjson) else RecordPubli
    return record_class(recjson)
224 225


226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
def build_store(host=None, shelf=None):
    """Return the interface to the publication store.

    Args:
        host (str):
            possible values are ``cds``, ``cds.cern.ch``, ``inspirehep``
            or ``inspirehep.net``

        shelf (str):
            section of the store containing records. It depends on the host.
            Possible values are ``None``, ``literature``, ``conferences``
            and ``institutions``

             +----------------+--------------+-----------------------------+
             | host           | shelf        | base API                    |
             +----------------+--------------+-----------------------------+
             | cds.cern.ch    | any          | https://cds.cern.ch/        |
             +----------------+--------------+-----------------------------+
             | inspirehep.net | None         | https://old.inspirehep.net/ |
             | inspirehep.net | literature   | https://old.inspirehep.net/ |
             | inspirehep.net | conferences  | https://inspirehep.net/     |
             | inspirehep.net | institutions | https://old.inspirehep.net/ |
             +----------------+--------------+-----------------------------+

    Returns:
        InvenioStore

    Raises:
        CdsException:
            the pair host and shelf is not one of the above.

    """
    # (host name, record API, search API) for the requested store
    endpoints = None

    if host in CDS:
        endpoints = ("cds.cern.ch",
                     "https://cds.cern.ch/record",
                     "https://cds.cern.ch/search")

    elif host in INS:
        if shelf in (None, "literature", "institutions"):
            endpoints = ("old.inspirehep.net",
                         "https://old.inspirehep.net/record",
                         "https://old.inspirehep.net/search")

        elif shelf == "conferences":
            endpoints = ("inspirehep.net",
                         "https://inspirehep.net/api/conferences",
                         "https://inspirehep.net/api/conferences/?q=")

    if endpoints is None:
        raise CdsException(MSG_NO_SHELF % (shelf, host))

    name, record_api, search_api = endpoints

    return InvenioStore(
        host=name,
        api_record=record_api,
        api_search=search_api,
        shelf=shelf)


281 282 283 284
def get_conference_data(host, conf_id=None, key=None):
    """Get the conference data identified by its id or key.

    Args:
        host (str):
            possible values are ``cds``, ``cds.cern.ch``, ``inspirehep``
            or ``inspirehep.net``.

        conf_id (int):
            the conference identifier in the store.
            This is the preferred way: it is used when both the
            identifier and the key are given.

        key (str): the conference key in the store.

    Returns:
        dict:
            The conference data (MarcJSON).
            ``None`` when neither ``conf_id`` nor ``key`` is given.

    Raises:
        CdsException:
            - conference record with a wrong identifier
            - conference key badly formed (inspirehep.net)
            - conference not found
            - host not recognised (raised when building the store)

    """
    store = build_store(host, shelf="conferences")

    # ........................................................................
    #
    # search by id in cds.cern.ch
    #
    if conf_id is not None and host in CDS:
        recjson = store.get_record(conf_id)
        if recjson["recid"] != conf_id:
            raise CdsException(MSG_INV_CONF)
        return recjson

    # ........................................................................
    #
    # search by key in cds.cern.ch
    #     - the search returns candidates, keep the first one for which
    #       the conference code is equal to the key
    #
    if key is not None and host in CDS:
        for rec_id in store.get_ids(p=key):
            recjson = store.get_record(rec_id)

            if match_conference_key(recjson, key):
                return recjson

        raise CdsException(MSG_NO_CONF)

    # ........................................................................
    #
    # search by id in inspirehep.net
    #     - the identifier of the reply is compared as a string
    #
    if conf_id is not None and host in INS:
        obj = store.get_record(conf_id)
        if obj["id"] != str(conf_id):
            raise CdsException(MSG_INV_CONF)
        return obj["metadata"]

    # ........................................................................
    #
    # search by key in inspirehep.net
    #
    if key is not None and host in INS:

        # the key is normalised before being checked: / is replaced by -
        key = key.replace("/", "-")
        if not REG_CONF.match(key):
            raise CdsException(MSG_INV_CONF_KEY)

        obj = store.search(f"cnum:{key}")

        # an empty reply raises IndexError which has to be reported
        # as a conference not found
        try:
            recjson = obj[0]["metadata"]

        except (IndexError, KeyError, TypeError) as err:
            raise CdsException(MSG_NO_CONF) from err

        if recjson["cnum"] != key:
            raise CdsException(MSG_NO_CONF)

        return recjson

    # neither an identifier nor a key
    return None
364 365 366 367 368 369 370 371 372 373


def match_conference_key(recjson, conf_key):
    """Return ``True`` when the record corresponds to a conference identified
    by its key.

    The key is compared to the subfield ``coference_code`` of the field
    ``meeting_name``. The latter is either a dictionary or a list of
    dictionaries.

    Args:
        recjson (dict):
            record formatted MarcJSON.

        conf_key (str):
            conference key

    Returns:
        bool:
            ``False`` when the field ``meeting_name`` is missing or when
            none of its entries has a conference code equal to the key.

    """
    if "meeting_name" not in recjson:
        return False

    # a single meeting is a dictionary: iterating over it would scan
    # its keys instead of the meetings
    meetings = recjson["meeting_name"]
    if isinstance(meetings, dict):
        meetings = [meetings]

    # the spelling of the subfield is the one used in the records
    subfield = "coference_code"

    return any(
        subfield in di and di[subfield] == conf_key for di in meetings)