""" store_tools.base

"""
import re

6 7
# arXiv label and base URL of the arXiv PDF files
ARXIV = "arXiv"
ARXIV_PDF = "http://arxiv.org/pdf/"

# aliases of the stores: short name and host name
CDS = ("cds", "cds.cern.ch")
INS = ("inspirehep", "inspirehep.net")

# messages (MSG_NO_SHELF is a %-template expecting two values)
MSG_INV_CONF = "Reject invalid conference information"
MSG_INV_CONF_KEY = "Reject invalid conference key"
MSG_NO_CONF = "Reject no conference information"
MSG_NO_CONF_ID_KEY = "Reject no conference identifier and key"
MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_HOST = "Reject no host information in record"
MSG_NO_PUBLISHER = "Reject invalid publisher"
MSG_NO_SHELF = "No shelf %s for store %s"
MSG_NO_THESIS = "Reject no thesis information"
MSG_WELL_FORMED_COLLABORATION = "Reject collaboration is not well formed"

# %-templates for an OAI identifier and for the URL of a record.
# Both expect two values -- presumably the host and the record id,
# which is what REG_OAI captures; confirm against the callers.
OAI = "oai:%s:%s"
OAI_URL = "http://%s/record/%s"

# digits, a dot, then digits (e.g. "1234.5678")
# raw string: "\d" is an invalid escape sequence in a plain literal
REG_ARXIV_NUMBER = re.compile(r"\d+\.\d+")

# names are encoded as "Family, First" where First can be "First-Second"
# many variants are possible with initials, dots, ...
# group(1) is the family name
# group(2) is the part of the first name before the separator (" ", "-")
# group(3) is the part of the first name after the separator (" ", "-")
REG_AUTHOR = re.compile(r"^([\w\- ]+), (\w+)\.?[\- ]*(\w+)*\.?$", re.UNICODE)

# group(1) is a date written as four digits, two digits, two digits
# separated by "-"
REG_DATE = re.compile(r"(\d{4}-\d{2}-\d{2})")

# conference key such as "C12-03-04" or "C12-03-04.1"
REG_CONF = re.compile(r"^C\d+-\d+-\d+(?:\.\d+)?$")

# group(1) is made of lower case letters and dots, group(2) of digits
REG_OAI = re.compile(r"oai:([a-z\.]+):([\d]+)")

# group(1) is a sequence of four digits
REG_YEAR = re.compile(r"(\d{4})")

THESIS_DIR = "dir."
41

42

43
def is_conference(recjson):
    """True when the record describes a publication related to a conference.

    Note:
        This tool works for JSON records coming from cds.cern.ch,
        old.inspirehep.net as well as inspirehep.net

    Args:
        recjson (dict):
            record associated to a publication or to an institute.

    Return:
        bool:
            ``True`` when the record describes a publication related
            to a conference.

    """
    # proceeding in inspirehep
    # ("or" protects against a schema / document_type set to None)
    if (recjson.get("$schema") or "").endswith("hep.json"):
        for elt in recjson.get("document_type") or []:
            if elt == "conference paper":
                return True

    # proceeding in cds
    # the collection is either a single dictionary or a list of dictionaries
    if "collection" in recjson:
        collections = recjson["collection"]
        if not isinstance(collections, list):
            collections = [collections]

        # entries which are not dictionaries are ignored instead of
        # raising a TypeError
        for di in collections:
            if isinstance(di, dict) and di.get("primary") == "ConferencePaper":
                return True

    # talk in cds
    # look for a conference key or for a subject equal to Talk
    if "aleph_linking_page" in recjson:
        return True

    # like the collection, the subject is handled either as a single
    # dictionary or as a list of dictionaries
    subjects = recjson.get("subject")
    if not isinstance(subjects, list):
        subjects = [subjects]

    return any(
        isinstance(di, dict) and di.get("term") == "Talk" for di in subjects)
89 90


91
def is_institute(recjson):
    """True when the record describes an institute.

    Note:
        The test relies on the ``$schema`` key of the record, which has
        to end with ``institutions.json``. A record without that key
        is never seen as an institute.

    Args:
        recjson (dict):
            record associated to a publication or to an institute.

    Return:
        bool:
            ``True`` when the record describes an institute.

    """
    # "or" protects against a schema explicitly set to None
    schema = recjson.get("$schema") or ""
    return schema.endswith("institutions.json")
108 109


110
def is_thesis(recjson):
    """True when the record describes a thesis.

    Note:
        This tool works for JSON records coming from cds.cern.ch,
        old.inspirehep.net as well as inspirehep.net

    Args:
        recjson (dict):
            record associated to a publication or to an institute.

    Return:
        bool: ``True`` when the record describes a thesis.

    """
    # thesis in inspirehep
    # ("or" protects against a schema / document_type set to None)
    if (recjson.get("$schema") or "").endswith("hep.json"):
        for elt in recjson.get("document_type") or []:
            if elt == "thesis":
                return True

    # thesis in cds
    # the collection is either a single dictionary or a list of dictionaries
    if "collection" in recjson:
        collections = recjson["collection"]
        if not isinstance(collections, list):
            collections = [collections]

        # entries which are not dictionaries are ignored instead of
        # raising a TypeError
        for di in collections:
            if isinstance(di, dict) and di.get("primary") == "THESIS":
                return True

    return False