Commit f530752c authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Migrate CheckAndFix: methods required by the Thesis harvester.

parent e8154552
......@@ -10,8 +10,7 @@ from .base import search_synonym, ToolException
from datetime import datetime
from .exception import CheckException
from gluon import current
from invenio_tools import (DECODE_REF,
MSG_NO_CONF,
from invenio_tools import (MSG_NO_CONF,
MSG_NO_THESIS,
OAI_URL,
RecordConf,
......@@ -297,10 +296,7 @@ class CheckAndFix(object):
bool: ``True`` if *one* row is found, ``False`` otherwise.
"""
db = self.db
table = db[tablename]
query = table.synonyms.contains(value)
query = self.db[tablename].synonyms.contains(value)
if db(query).count() == 1:
return True
......@@ -316,6 +312,7 @@ class CheckAndFix(object):
Returns:
unicode:
target at least YYYY-MM
empty when procedure failed
"""
......@@ -323,7 +320,7 @@ class CheckAndFix(object):
if isinstance(record, RecordConf):
opening, closing = self._get_conference_dates(record)
return opening.strftime("%Y-%m-%d")
val = opening.strftime("%Y-%m-%d")
elif isinstance(record, RecordThesis):
val = record.these_defense()
......@@ -335,6 +332,10 @@ class CheckAndFix(object):
if m_arxiv:
val = "20%s-%s" % (m_arxiv.group(1), m_arxiv.group(2))
# last resort: use the creation date of the record
if val == u"" or len(val) < 7:
val = record[u"creation_date"][0:7]
return val
@staticmethod
......@@ -538,17 +539,18 @@ class CheckAndFix(object):
* Replace U. by University
Args:
record (RecordThesis): record describing a thesis.
record (RecordThesis):
record describing a thesis.
"""
# protection
if not isinstance(record, RecordThesis):
return
is_cppm = self._get_reg_institute().find("CPPM") != -1
values = record[u"dissertation_note"][u"university"]
# CPPM: fix the name of Aix-Marseille university
if is_cppm:
# CPPM -- fix the name of Aix-Marseille university
if self._get_reg_institute().find("CPPM") != -1:
year = REG_YEAR.search(record.these_defense()).group(1)
if int(year) < 2012:
......@@ -556,33 +558,14 @@ class CheckAndFix(object):
else:
university = "Aix Marseille Université"
if "502" in record and "b" in record["502"]:
if isinstance(record["502"]["b"], str):
if "Marseille" in record["502"]["b"]:
record["502"]["b"] = university
values = (university if "Marseille" in values else values)
elif isinstance(record["502"]["b"], list):
for i in range(len(record["502"]["b"])):
if "Marseille" in record["502"]["b"][i]:
record["502"]["b"][i] = university
# Other: replace U. by University
# Other -- replace U. by University
else:
university = current.T(UNIVERSITY, lazy=False)
if "502" in record and "b" in record["502"]:
if isinstance(record["502"]["b"], str):
value = record["502"]["b"]
if "U." in value:
value = value.replace('U.', university)
record["502"]["b"] = value
elif isinstance(record["502"]["b"], list):
for i in range(len(record["502"]["b"])):
value = record["502"]["b"][i]
if "U." in value:
value = value.replace('U.', university)
record["502"]["b"][i] = value
university = current.T(UNIVERSITY).decode("utf8")
values.replace('U.', university)
record[u"dissertation_note"][u"university"] = values
def get_my_authors(self, record, sep=", ", sort=False):
"""Get authors of my institutes signing the record.
......@@ -647,10 +630,12 @@ class CheckAndFix(object):
"""Check that the record described a thesis.
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
Raises:
CheckException: when the record does not describe a thesis.
CheckException:
the record does not describe a thesis.
"""
if not isinstance(record, RecordThesis):
......@@ -819,42 +804,6 @@ class CheckAndFix(object):
except ToolException as e:
raise CheckException(*e.args)
@staticmethod
def recover_oai(record, host):
    """Rebuild the OAI identifier of a record when it is missing
    or not well formed.

    Nothing is done when ``record.oai()`` already returns a value
    matching ``REG_OAI``. Otherwise the identifier is built from
    ``OAI_INVENIO``, the *host* and ``record.id()`` and stored in the
    field / subfield associated with the store.

    Args:
        record (RecordPubli):
            record describing a publication.
        host (str):
            possible values are ``cds.cern.ch`` or ``inspirehep.net``.

    Raises:
        ValueError:
            the host is neither ``cds.cern.ch`` nor ``inspirehep.net``.

    """
    # Note:
    #     For the record cds 1951625, possible values are:
    #       - oai:cds.cern.ch:1951625
    #         (if it does not exist in inspirehep)
    #       - oai:cds.cern.ch:1951625, oai:inspirehep.net:1319638
    #         (if it exists in both stores)
    #     In all cases the first OAI corresponds to the record.id()
    #
    current_oai = record.oai()
    if current_oai is not None and REG_OAI.match(current_oai):
        return

    # (field, subfield) in which each store keeps its OAI identifier
    locations = {"cds.cern.ch": ("0248", "a"),
                 "inspirehep.net": ("909CO", "o")}

    location = locations.get(host)
    if location is None:
        raise ValueError(MSG_INVALID_HOST)

    field, subfield = location
    if field not in record:
        record[field] = dict()

    record[field][subfield] = OAI_INVENIO % (host, record.id())
def submitted(self, record):
"""Standardise the submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.
Look for alternative when it is not defined.
......@@ -875,11 +824,14 @@ class CheckAndFix(object):
date = record.submitted()
# recover missing date using conference, preprint, thesis information
if len(date) == 0:
if len(date) < 7:
date = self._recover_submitted(record)
if len(date) == 0:
raise CheckException(MSG_NO_DATE)
elif len(date) < 7:
raise CheckException(MSG_WELL_FORMED_DATE)
# 22 Mar 2011
m = DECODE_DD_MMM_YYYY.match(date)
if m:
......@@ -892,17 +844,12 @@ class CheckAndFix(object):
data = (m.group(3), int(m.group(2)), int(m.group(1)))
date = '%s-%02i-%02i' % data
# 2011
m_year = DECODE_YYYY.match(date)
if m_year:
date = self._recover_submitted(record)
# check the minimum requirement is 2001-05
if not REG_SUBMITTED.match(date):
raise CheckException(MSG_WELL_FORMED_DATE)
if u"prepublication" in record:
record[u"prepublication"][u"date"] = date
else:
record[u"prepublication"] = {u"date": date}
@staticmethod
def temporary_record(record):
"""Some records are marked temporary.
......
# -*- coding: utf-8 -*-
"""test_12_CheckAndFix_thesis
* Test CheckAndFix methods for thesis.
Use thesis records from cds.cern.ch
- is_thesis
- submitted
- format_universities
- format_authors (already tested with article)
- get_my_authors (already tested with article)
"""
import pytest
from harvest_tools.checkandfix import CheckAndFix
from invenio_tools import load_record
@pytest.fixture(scope="module")
def reccds():
    """Load record 1394605 from ``cds.cern.ch``, once per test module."""
    record = load_record("cds.cern.ch", 1394605)
    return record
@pytest.fixture(scope="module")
def svc():
    """Build the ``CheckAndFix`` service, once per test module."""
    service = CheckAndFix()
    return service
def test_is_thesis(svc, reccds):
    """``is_thesis`` returns ``None`` for the record under test."""
    outcome = svc.is_thesis(reccds)
    assert outcome is None
def test_submitted(svc, reccds):
    """``submitted`` fills the missing submission date of the record.

    Before the call the record only carries the defense year and an
    empty submitted date; afterwards the submitted date is ``2011-11``.

    """
    # state of the record before the fix
    defense = reccds.these_defense()
    assert defense == "2011"

    before = reccds.submitted()
    assert before == ""

    # apply the fix and check the recovered date
    svc.submitted(reccds)

    after = reccds.submitted()
    assert after == "2011-11"
def test_format_universities(svc, reccds):
    """``format_universities`` rewrites the university of the
    dissertation note for two records, defended in 2011 and in 2013.

    """
    def university(record):
        # university as stored in the dissertation note of the record
        return record[u"dissertation_note"][u"university"]

    # Khanji in 2011 (Université de la Méditerrannée)
    assert university(reccds) == "Marseille U., Luminy"

    svc.format_universities(reccds)
    assert university(reccds) == \
        u"Université de la Méditerrannée Aix-Marseille II"

    # Chen in 2013 (Aix Marseille Université)
    reccds2 = load_record("cds.cern.ch", 1632177)
    assert university(reccds2) == u"Shandong U. & Marseille, CPPM"

    svc.format_universities(reccds2)
    assert university(reccds2) == u"Aix Marseille Université"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment