Commit 35b24bcb authored by LE GAC Renaud
Browse files

Update invenio factory with the tool to get / add conference_data.

parent 34a60765
...@@ -6,7 +6,10 @@ import re ...@@ -6,7 +6,10 @@ import re
ARXIV = "arXiv" ARXIV = "arXiv"
ARXIV_PDF = "http://arxiv.org/pdf/" ARXIV_PDF = "http://arxiv.org/pdf/"
MSG_INV_CONF = "Reject invalid conference information"
MSG_INV_CONF_KEY = "Reject invalid conference key"
MSG_NO_CONF = "Reject no conference information" MSG_NO_CONF = "Reject no conference information"
MSG_NO_CONF_ID_KEY = "Reject no conference identifier and key"
MSG_NO_COUNTRY = "Reject invalid country" MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_HOST = "Reject no host information in record" MSG_NO_HOST = "Reject no host information in record"
MSG_NO_PUBLISHER = "Reject invalid publisher" MSG_NO_PUBLISHER = "Reject invalid publisher"
...@@ -25,6 +28,7 @@ REG_ARXIV_NUMBER = re.compile("\d+\.\d+") ...@@ -25,6 +28,7 @@ REG_ARXIV_NUMBER = re.compile("\d+\.\d+")
# group(3) is the part of the first name after the separator (" ", "-") # group(3) is the part of the first name after the separator (" ", "-")
REG_AUTHOR = re.compile(r"^([\w\- ]+), (\w+)\.?[\- ]*(\w+)*\.?$", re.UNICODE) REG_AUTHOR = re.compile(r"^([\w\- ]+), (\w+)\.?[\- ]*(\w+)*\.?$", re.UNICODE)
REG_CONF = re.compile("^C\d+-\d+-\d+(\.\d+)?$")
REG_OAI = re.compile(r"oai:([a-z\.]+):([\d]+)") REG_OAI = re.compile(r"oai:([a-z\.]+):([\d]+)")
REG_YEAR = re.compile(r"(\d{4})") REG_YEAR = re.compile(r"(\d{4})")
......
...@@ -2,18 +2,107 @@ ...@@ -2,18 +2,107 @@
""" invenio_tools.factory """ invenio_tools.factory
""" """
from base import is_conference, is_institute, is_thesis from base import (is_conference,
is_institute,
is_thesis,
MSG_INV_CONF,
MSG_INV_CONF_KEY,
MSG_NO_CONF,
MSG_NO_CONF_ID_KEY,
REG_CONF,
REG_OAI)
from exception import CdsException
from inveniostore import InvenioStore
from recordconf import RecordConf from recordconf import RecordConf
from recordinst import RecordInst from recordinst import RecordInst
from recordpubli import RecordPubli from recordpubli import RecordPubli
from recordthesis import RecordThesis from recordthesis import RecordThesis
def add_conference_data(recjson):
    """Add the conference data to the recjson.

    The conference is located by its identifier or, failing that, by its
    key. The conference record is fetched from the store named in the OAI
    field of ``recjson`` and the following field and subfield are set in
    ``recjson`` (in place)::

        +---------------+-----------------------------------------------+
        | field         | subfield                                      |
        +---------------+-----------------------------------------------+
        | meeting_name  | closing_date, coference_code, country, date,  |
        |               | location, opening_date, year                  |
        | meeting       | recid, url                                    |
        +---------------+-----------------------------------------------+

    Note:
        ``coference_code`` is the literal subfield key read from the
        records. It is not a typo to be fixed here.

    Args:
        recjson (dict): record data (MarcJSON). It is modified in place.

    Raise:
        CdsException:
            - no conference identifier and key in the recjson
            - no host information (OAI) in the recjson
            - conference recjson found but with a wrong identifier
            - conference not found

    """
    # MSG_NO_HOST is defined in base but is not part of the module level
    # import list, therefore it is imported locally.
    from base import MSG_NO_HOST

    # ........................................................................
    #
    # Retrieve conference identifier
    #   - the algorithm depend on the store
    #   - for cds use aleph_linking_page
    #   - for inspire use publication_info.cnum
    #
    conf_id, conf_key = None, None

    if u"aleph_linking_page" in recjson:
        di = recjson[u"aleph_linking_page"]
        # a missing subfield must end in the CdsException below,
        # not in a bare KeyError
        conf_id = di.get(u"sysno")
        conf_key = di.get(u"up_link")

    elif u"publication_info" in recjson:
        data = recjson[u"publication_info"]
        # the field is either a single dictionary or a list of them
        data = (data if isinstance(data, list) else [data])

        for di in data:
            if u"cnum" in di:
                conf_key = di[u"cnum"]
                break

    if conf_id is None and conf_key is None:
        raise CdsException(MSG_NO_CONF_ID_KEY)

    # ........................................................................
    #
    # Get conference data
    #

    # extract the host name from the OAI, i.e. oai:<host>:<number>
    oai = None
    if u"oai" in recjson:
        oai = recjson[u"oai"][u"value"]

    elif u"FIXME_OAI" in recjson:
        oai = recjson[u"FIXME_OAI"][u"id"]

    # reject a record without OAI or with a malformed one instead of
    # failing on an unbound variable or on None.group
    match = (REG_OAI.match(oai) if oai else None)
    if match is None:
        raise CdsException(MSG_NO_HOST)

    host = match.group(1)

    # get the data, the identifier is the preferred way
    if conf_id is not None:
        confjson = get_conference_data(host, conf_id=conf_id)
    else:
        confjson = get_conference_data(host, key=conf_key)

    # ........................................................................
    #
    # Add conference data to the recjson
    #
    # NOTE(review): assumes that the url field of the conference record is
    # a single dictionary -- to be confirmed for records with several urls.
    recjson[u"meeting_name"] = confjson[u"meeting_name"]

    recjson[u"meeting"] = {
        u"recid": confjson[u"recid"],
        u"url": confjson[u"url"][u"url"]}
def build_record(recjson):
"""Transform a JSON object into a record """Transform a JSON object into a record
Args: Args:
rec_json (dict): recjson (dict):
record data in a JSON format. record data in a JSON format.
Return Return
...@@ -23,17 +112,113 @@ def build_record(rec_json): ...@@ -23,17 +112,113 @@ def build_record(rec_json):
Raises: Raises:
""" """
if is_conference(rec_json): if is_conference(recjson):
upcast_record = RecordConf(rec_json) add_conference_data(recjson)
# self._add_conference_data(upcast_record) upcast_record = RecordConf(recjson)
elif is_institute(rec_json): elif is_institute(recjson):
upcast_record = RecordInst(rec_json) upcast_record = RecordInst(recjson)
elif is_thesis(rec_json): elif is_thesis(recjson):
upcast_record = RecordThesis(rec_json) upcast_record = RecordThesis(recjson)
else: else:
upcast_record = RecordPubli(rec_json) upcast_record = RecordPubli(recjson)
return upcast_record return upcast_record
def get_conference_data(host, conf_id=None, key=None):
    """Get the conference data identified by its id or key.

    When both ``conf_id`` and ``key`` are given, only ``conf_id`` is used.

    Args:
        host (unicode):
            possible values are ``cds.cern.ch`` or ``inspirehep.net``.

        conf_id (unicode or int):
            the conference identifier in the store.
            This is the preferred way.

        key (unicode): the conference key in the store.
            For ``inspirehep.net`` the character ``/`` is replaced by ``-``
            and the result has to match ``REG_CONF``.

    Returns:
        dict:
            The conference data (MarcJSON).

    Raises:
        CdsException:
            - neither conference identifier nor key
            - conference record with a wrong identifier
            - invalid conference key (inspirehep.net)
            - conference not found, or search by key not supported for host

    """
    cds = InvenioStore(host)

    # ........................................................................
    #
    # search by id
    #
    if conf_id is not None:
        recjson = cds.get_record(conf_id)

        # compare the textual forms: the identifier can be given either as
        # an integer or as a string while recid type depends on the store
        if u"%s" % recjson["recid"] != u"%s" % conf_id:
            raise CdsException(MSG_INV_CONF)

        return recjson

    # ........................................................................
    #
    # search by key: build the query which depends on the store
    #
    if key is None:
        raise CdsException(MSG_NO_CONF_ID_KEY)

    if host == "cds.cern.ch":
        query = dict(p=key)

    elif host == "inspirehep.net":
        key = key.replace("/", "-")
        if not REG_CONF.match(key):
            raise CdsException(MSG_INV_CONF_KEY)

        query = dict(cc="Conferences", p="111__g:%s" % key)

    else:
        # no key search is defined for this store: fail explicitly
        # rather than returning None silently
        raise CdsException(MSG_NO_CONF)

    # the search can return unrelated records,
    # keep the first one with the exact conference key
    for rec_id in cds.get_ids(**query):
        recjson = cds.get_record(rec_id)
        if match_conference_key(recjson, key):
            return recjson

    raise CdsException(MSG_NO_CONF)
def match_conference_key(recjson, conf_key):
    """Return ``True`` when the record corresponds to a conference identified
    by its key.

    The key is compared to the subfield ``coference_code`` (literal key
    found in the record data) of each ``meeting_name`` entry.

    Args:
        recjson (dict):
            record formatted MarcJSON. The field ``meeting_name`` can be
            either a single dictionary or a list of dictionaries.

        conf_key (unicode):
            conference key

    Returns
        bool:
            ``False`` when the field ``meeting_name`` is missing or when
            none of its entries has the requested key.

    """
    if u"meeting_name" not in recjson:
        return False

    meetings = recjson[u"meeting_name"]

    # a non repeated field is a single dictionary: iterating over it would
    # walk through its keys, so wrap it in a list
    if isinstance(meetings, dict):
        meetings = [meetings]

    subfield = u"coference_code"

    return any(
        isinstance(di, dict) and subfield in di and di[subfield] == conf_key
        for di in meetings)
...@@ -196,9 +196,8 @@ class InvenioStore(object): ...@@ -196,9 +196,8 @@ class InvenioStore(object):
rec_id (int): record identifier in the store. rec_id (int): record identifier in the store.
Returns: Returns:
str: the XML string is compliant with dict:
the `MARC <http://www.loc.gov/marc/>`_ standard. the record data (MarcJSON).
Use Marc12.__call__ to decode it.
Raises: Raises:
CdsException: CdsException:
......
...@@ -74,7 +74,7 @@ class RecordPubli(Record): ...@@ -74,7 +74,7 @@ class RecordPubli(Record):
The main ``field`` and ``subfield`` are:: The main ``field`` and ``subfield`` are::
+---------------------------------+----------------------------------+ +---------------------------------+----------------------------------+
| field (cds) | subfield | | field | subfield |
+---------------------------------+----------------------------------+ +---------------------------------+----------------------------------+
| FIXME_OAI (inspire) | id | | FIXME_OAI (inspire) | id |
| abstract | | | abstract | |
......
...@@ -3,17 +3,29 @@ ...@@ -3,17 +3,29 @@
* Test tools to introspect the type of record. * Test tools to introspect the type of record.
* Test tools to upcast the Record from the JSON object. * Test tools to upcast the Record from the JSON object.
* Test tools to get / add conference data.
""" """
from invenio_tools.base import is_conference, is_institute, is_thesis from invenio_tools.base import (is_conference,
from invenio_tools.factory import build_record is_institute,
is_thesis)
from invenio_tools.factory import (add_conference_data,
build_record,
get_conference_data)
from invenio_tools.inveniostore import InvenioStore from invenio_tools.inveniostore import InvenioStore
from invenio_tools.record import Record
from invenio_tools.recordconf import RecordConf from invenio_tools.recordconf import RecordConf
from invenio_tools.recordinst import RecordInst from invenio_tools.recordinst import RecordInst
from invenio_tools.recordpubli import RecordPubli from invenio_tools.recordpubli import RecordPubli
from invenio_tools.recordthesis import RecordThesis from invenio_tools.recordthesis import RecordThesis
# ............................................................................
#
# Section to test introspection and instantiation
#
def test_conference_cds(): def test_conference_cds():
store = InvenioStore("cds.cern.ch") store = InvenioStore("cds.cern.ch")
...@@ -105,3 +117,100 @@ def test_thesis_cds(): ...@@ -105,3 +117,100 @@ def test_thesis_cds():
record = build_record(recjson) record = build_record(recjson)
assert isinstance(record, RecordThesis) assert isinstance(record, RecordThesis)
# ............................................................................
#
# Section to test tool to get and add conference data
#
def test_get_conference_data():
    """Get the conference data by identifier and by key, in both stores.

    The same conference is retrieved from cds.cern.ch and inspirehep.net,
    first by its identifier then by its key, and both answers have to be
    equal. The map of identifiers and keys is the following:

        +------------------+--------------+-----------+
        |                  | cds          | inspire   |
        +------------------+--------------+-----------+
        | proceeding recid | 1411352      | 1089237   |
        | conference recid | 1181092      | 980401    |
        | conference key   | rome20101206 | C10-12-06 |
        +------------------+--------------+-----------+

    """
    # (host, conference recid, conference key)
    cases = (("cds.cern.ch", 1181092, "rome20101206"),
             ("inspirehep.net", 980401, "C10-12-06"))

    for host, recid, key in cases:

        # by id
        by_id = get_conference_data(host, conf_id=recid)
        assert by_id["recid"] == recid
        assert by_id["meeting_name"][0]["coference_code"] == key

        # by key
        by_key = get_conference_data(host, key=key)
        assert by_key["recid"] == recid
        assert by_key["meeting_name"][0]["coference_code"] == key

        # both approaches return the same record
        assert by_id == by_key
def test_add_conference_data():
    """Add the conference data to the same proceeding, in both stores.

    The proceeding is retrieved from cds.cern.ch and inspirehep.net, and
    the fields ``meeting_name`` and ``meeting`` have to be added to it.
    The map of identifiers and keys is the following:

        +------------------+--------------+-----------+
        |                  | cds          | inspire   |
        +------------------+--------------+-----------+
        | proceeding recid | 1411352      | 1089237   |
        | conference recid | 1181092      | 980401    |
        | conference key   | rome20101206 | C10-12-06 |
        +------------------+--------------+-----------+

    """
    conf_url = "http://www.roma1.infn.it/discrete10"

    # (host, proceeding recid, conference key, conference recid)
    cases = (("cds.cern.ch", 1411352, "rome20101206", 1181092),
             ("inspirehep.net", 1089237, "C10-12-06", 980401))

    for host, proceeding_id, conf_key, conf_recid in cases:
        store = InvenioStore(host)
        recjson = store.get_record(proceeding_id)
        add_conference_data(recjson)

        assert "meeting_name" in recjson
        assert "meeting" in recjson

        assert recjson["meeting_name"][0]["coference_code"] == conf_key
        assert recjson["meeting"]["url"] == conf_url
        assert recjson["meeting"]["recid"] == conf_recid
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment