""" store_tools.base

"""
import re

# label used for arXiv and base URL of its PDF files
ARXIV = "arXiv"
ARXIV_PDF = "http://arxiv.org/pdf/"

# (short name, host name) pairs -- presumably one per store, verify with callers
CDS = ("cds", "cds.cern.ch")
INS = ("inspirehep", "inspirehep.net")

# messages; MSG_NO_SHELF expects two %-format arguments (shelf, store)
MSG_INV_CONF = "Reject invalid conference information"
MSG_INV_CONF_KEY = "Reject invalid conference key"
MSG_NO_CONF = "Reject no conference information"
MSG_NO_CONF_ID_KEY = "Reject no conference identifier and key"
MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_HOST = "Reject no host information in record"
MSG_NO_PUBLISHER = "Reject invalid publisher"
MSG_NO_SHELF = "No shelf %s for store %s"
MSG_NO_THESIS = "Reject no thesis information"
MSG_WELL_FORMED_COLLABORATION = "Reject collaboration is not well formed"

# %-format templates, each expecting two arguments
OAI = "oai:%s:%s"
OAI_URL = "http://%s/record/%s"

# digits, a dot, digits (raw string: avoids the invalid-escape warning)
REG_ARXIV_NUMBER = re.compile(r"\d+\.\d+")

# names are encoded "Family, First" where First can be "First-Second";
# many variants are possible with initials, dots, ...
# group(1) is the family name
# group(2) is the part of the first name before the separator (" ", "-")
# group(3) is the part of the first name after the separator (" ", "-")
REG_AUTHOR = re.compile(r"^([\w\- ]+), (\w+)\.?[\- ]*(\w+)*\.?$", re.UNICODE)

# group(1) is a date written YYYY-MM-DD
REG_DATE = re.compile(r"(\d{4}-\d{2}-\d{2})")
# "C" followed by three dash-separated numbers and an optional ".number" suffix
REG_CONF = re.compile(r"^C\d+-\d+-\d+(?:\.\d+)?$")
# group(1) is the lower-case dotted name, group(2) the numeric identifier
REG_OAI = re.compile(r"oai:([a-z\.]+):([\d]+)")
# group(1) is a run of four digits
REG_YEAR = re.compile(r"(\d{4})")

THESIS_DIR = "dir."


def is_conference(recjson):
    """True when the record describes a publication related to a conference.

    Three checks are applied in turn, the first match wins:

    * an entry of the ``collection`` field has ``primary`` equal
      to ``ConferencePaper``;
    * the record contains the key ``aleph_linking_page`` or its ``subject``
      field has ``term`` equal to ``Talk``;
    * the ``publication_info`` field contains the key ``cnum``.

    Args:
        recjson (dict):
            record associated to a publication or to an institute.

    Return:
        bool:
            ``True`` when the record describes a publication related
            to a conference.

    """
    # ConferencePaper in collection
    # find proceeding in both stores
    if "collection" in recjson:
        data = recjson["collection"]
        # the field is either a single entry or a list of entries
        data = data if isinstance(data, list) else [data]

        value = "ConferencePaper"
        if any("primary" in di and di["primary"] == value for di in data):
            return True

    # try to identify talk in cds
    # look for a conference key or for a subject equal to Talk
    if "aleph_linking_page" in recjson:
        return True

    if ("subject" in recjson
            and "term" in recjson["subject"]
            and recjson["subject"]["term"] == "Talk"):
        return True

    # try to identify talk in inspirehep
    # look for a conference key
    if ("publication_info" in recjson
            and "cnum" in recjson["publication_info"]):
        return True

    return False


def is_institute(recjson):
    """True when the record describes an institute.

    The record is an institute when an entry of its ``collection`` field
    has ``primary`` equal to ``INSTITUTION``.

    Args:
        recjson (dict):
            record associated to a publication or to an institute.

    Return:
        bool:
            ``True`` when the record describes an institute.

    """
    # INSTITUTION in collection
    if "collection" not in recjson:
        return False

    data = recjson["collection"]
    # the field is either a single entry or a list of entries
    data = data if isinstance(data, list) else [data]

    value = "INSTITUTION"
    return any("primary" in di and di["primary"] == value for di in data)


def is_thesis(recjson):
118 119 120
    """True when the record describes a thesis.

    Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
121 122
        recjson (dict):
            record associated to a publication or to and institute.
123 124

    Return:
LE GAC Renaud's avatar
LE GAC Renaud committed
125
        bool: ``True`` when the record describes a thesis.
126 127

    """
128 129 130 131 132 133 134 135 136 137
    # THESIS in collection
    if "collection" in recjson:
        data = recjson["collection"]
        data = (data if isinstance(data, list) else [data])

        value = "THESIS"
        li = [di for di in data if "primary" in di and di["primary"] == value]

        if len(li) > 0:
            return True