base.py 3.35 KB
Newer Older
1 2 3 4 5
""" invenio_tools.base

"""
import re

6 7
# label and base URL used for arXiv preprints
ARXIV = "arXiv"
ARXIV_PDF = "http://arxiv.org/pdf/"

# rejection messages
MSG_NO_CONF = "Reject no conference information"
MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_HOST = "Reject no host information in record"
MSG_NO_PUBLISHER = "Reject invalid publisher"
MSG_NO_THESIS = "Reject no thesis information"
MSG_WELL_FORMED_COLLABORATION = "Reject collaboration is not well formed"

# templates taking two values, in the order (host, record id)
# as captured by REG_OAI
OAI = "oai:%s:%s"
OAI_URL = "http://%s/record/%s"

# digits, a dot, digits (e.g. "1234.5678")
# raw string: "\d" and "\." are not valid escapes in a plain string literal
REG_ARXIV_NUMBER = re.compile(r"\d+\.\d+")

# names are encoded "Family, First" where First can be "First-Second"
# many variants are possible with initials, dots, ...
# group(1) is the family name
# group(2) is the part of the first name before the separator (" ", "-")
# group(3) is the part of the first name after the separator (" ", "-")
REG_AUTHOR = re.compile(r"^([\w\- ]+), (\w+)\.?[\- ]*(\w+)*\.?$", re.UNICODE)

# group(1) is the host (lower case letters and dots)
# group(2) is the numeric record identifier
REG_OAI = re.compile(r"oai:([a-z\.]+):([\d]+)")

# group(1) is a run of four digits
REG_YEAR = re.compile(r"(\d{4})")

THESIS_DIR = "dir."
32

33

34
def is_conference(recjson):
    """True when the record describes a publication related to a conference.

    The record is related to a conference when one of the following holds:

    * a ``collection`` entry has its ``primary`` value equal to
      ``ConferencePaper``;
    * the key ``aleph_linking_page`` is present or a ``subject`` entry has
      its ``term`` equal to ``Talk`` (talk in cds);
    * a ``publication_info`` entry contains the key ``cnum``
      (talk in inspirehep).

    The fields ``collection``, ``subject`` and ``publication_info`` can
    hold either a single dictionary or a list of dictionaries.
    Entries which are not dictionaries are ignored.

    Args:
        recjson (dict):
            record associated to a publication or to an institute.

    Return:
        bool:
            ``True`` when the record describes a publication related
            to a conference.

    """
    def entries(key):
        # normalise a field to the list of its dictionary entries
        if key not in recjson:
            return []

        data = recjson[key]
        data = (data if isinstance(data, list) else [data])
        return [di for di in data if isinstance(di, dict)]

    # ConferencePaper in collection
    # find proceeding in both stores
    if any(di.get("primary") == "ConferencePaper"
           for di in entries("collection")):
        return True

    # try to identify talk in cds
    # look for a conference key or for a subject equal to Talk
    if "aleph_linking_page" in recjson:
        return True

    if any(di.get("term") == u"Talk" for di in entries("subject")):
        return True

    # try to identify talk in inspirehep
    # look for a conference key
    if any("cnum" in di for di in entries("publication_info")):
        return True

    return False
80 81


82
def is_institute(recjson):
    """True when the record describes an institute.

    The record describes an institute when one entry of its ``collection``
    field has the ``primary`` value equal to ``INSTITUTION``.
    The field can hold either a single dictionary or a list of
    dictionaries. Entries which are not dictionaries are ignored.

    Args:
        recjson (dict):
            record associated to a publication or to an institute.

    Return:
        bool:
            ``True`` when the record describes an institute.

    """
    if "collection" not in recjson:
        return False

    # INSTITUTION in collection
    data = recjson["collection"]
    data = (data if isinstance(data, list) else [data])

    return any(
        isinstance(di, dict) and di.get("primary") == "INSTITUTION"
        for di in data)
106 107


108
def is_thesis(recjson):
109 110 111
    """True when the record describes a thesis.

    Args:
112 113
        record (Record): MARC12 record associated to a publication
            or to and institute.
114 115

    Return:
116
        bool: ``True`` when the MARC record describes a thesis.
117 118

    """
119 120 121 122 123 124 125 126 127 128
    # THESIS in collection
    if "collection" in recjson:
        data = recjson["collection"]
        data = (data if isinstance(data, list) else [data])

        value = "THESIS"
        li = [di for di in data if "primary" in di and di["primary"] == value]

        if len(li) > 0:
            return True