Commit f530752c authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Migrate CheckAndFix: methods required by the Thesis harvester.

parent e8154552
...@@ -10,8 +10,7 @@ from .base import search_synonym, ToolException ...@@ -10,8 +10,7 @@ from .base import search_synonym, ToolException
from datetime import datetime from datetime import datetime
from .exception import CheckException from .exception import CheckException
from gluon import current from gluon import current
from invenio_tools import (DECODE_REF, from invenio_tools import (MSG_NO_CONF,
MSG_NO_CONF,
MSG_NO_THESIS, MSG_NO_THESIS,
OAI_URL, OAI_URL,
RecordConf, RecordConf,
...@@ -297,10 +296,7 @@ class CheckAndFix(object): ...@@ -297,10 +296,7 @@ class CheckAndFix(object):
bool: ``True`` if *one* row is found, ``False`` otherwise. bool: ``True`` if *one* row is found, ``False`` otherwise.
""" """
db = self.db query = self.db[tablename].synonyms.contains(value)
table = db[tablename]
query = table.synonyms.contains(value)
if db(query).count() == 1: if db(query).count() == 1:
return True return True
...@@ -316,6 +312,7 @@ class CheckAndFix(object): ...@@ -316,6 +312,7 @@ class CheckAndFix(object):
Returns: Returns:
unicode: unicode:
target at least YYYY-MM
empty when procedure failed empty when procedure failed
""" """
...@@ -323,7 +320,7 @@ class CheckAndFix(object): ...@@ -323,7 +320,7 @@ class CheckAndFix(object):
if isinstance(record, RecordConf): if isinstance(record, RecordConf):
opening, closing = self._get_conference_dates(record) opening, closing = self._get_conference_dates(record)
return opening.strftime("%Y-%m-%d") val = opening.strftime("%Y-%m-%d")
elif isinstance(record, RecordThesis): elif isinstance(record, RecordThesis):
val = record.these_defense() val = record.these_defense()
...@@ -335,6 +332,10 @@ class CheckAndFix(object): ...@@ -335,6 +332,10 @@ class CheckAndFix(object):
if m_arxiv: if m_arxiv:
val = "20%s-%s" % (m_arxiv.group(1), m_arxiv.group(2)) val = "20%s-%s" % (m_arxiv.group(1), m_arxiv.group(2))
# last chance: use the creation date of the record
if val == u"" or len(val) < 7:
val = record[u"creation_date"][0:7]
return val return val
@staticmethod @staticmethod
...@@ -538,17 +539,18 @@ class CheckAndFix(object): ...@@ -538,17 +539,18 @@ class CheckAndFix(object):
* Replace U. by University * Replace U. by University
Args: Args:
record (RecordThesis): record describing a thesis. record (RecordThesis):
record describing a thesis.
""" """
# protection # protection
if not isinstance(record, RecordThesis): if not isinstance(record, RecordThesis):
return return
is_cppm = self._get_reg_institute().find("CPPM") != -1 values = record[u"dissertation_note"][u"university"]
# CPPM: fix the name of Aix-Marseille university # CPPM -- fix the name of Aix-Marseille university
if is_cppm: if self._get_reg_institute().find("CPPM") != -1:
year = REG_YEAR.search(record.these_defense()).group(1) year = REG_YEAR.search(record.these_defense()).group(1)
if int(year) < 2012: if int(year) < 2012:
...@@ -556,33 +558,14 @@ class CheckAndFix(object): ...@@ -556,33 +558,14 @@ class CheckAndFix(object):
else: else:
university = "Aix Marseille Université" university = "Aix Marseille Université"
if "502" in record and "b" in record["502"]: values = (university if "Marseille" in values else values)
if isinstance(record["502"]["b"], str):
if "Marseille" in record["502"]["b"]:
record["502"]["b"] = university
elif isinstance(record["502"]["b"], list):
for i in range(len(record["502"]["b"])):
if "Marseille" in record["502"]["b"][i]:
record["502"]["b"][i] = university
# Other: replace U. by University # Other -- replace U. by University
else: else:
university = current.T(UNIVERSITY, lazy=False) university = current.T(UNIVERSITY).decode("utf8")
values.replace('U.', university)
if "502" in record and "b" in record["502"]:
if isinstance(record["502"]["b"], str): record[u"dissertation_note"][u"university"] = values
value = record["502"]["b"]
if "U." in value:
value = value.replace('U.', university)
record["502"]["b"] = value
elif isinstance(record["502"]["b"], list):
for i in range(len(record["502"]["b"])):
value = record["502"]["b"][i]
if "U." in value:
value = value.replace('U.', university)
record["502"]["b"][i] = value
def get_my_authors(self, record, sep=", ", sort=False): def get_my_authors(self, record, sep=", ", sort=False):
"""Get authors of my institutes signing the record. """Get authors of my institutes signing the record.
...@@ -647,10 +630,12 @@ class CheckAndFix(object): ...@@ -647,10 +630,12 @@ class CheckAndFix(object):
"""Check that the record described a thesis. """Check that the record described a thesis.
Args: Args:
record (RecordPubli): record describing a publication. record (RecordPubli):
record describing a publication.
Raises: Raises:
CheckException: when the record does not describe a thesis. CheckException:
the record does not describe a thesis.
""" """
if not isinstance(record, RecordThesis): if not isinstance(record, RecordThesis):
...@@ -819,42 +804,6 @@ class CheckAndFix(object): ...@@ -819,42 +804,6 @@ class CheckAndFix(object):
except ToolException as e: except ToolException as e:
raise CheckException(*e.args) raise CheckException(*e.args)
@staticmethod
def recover_oai(record, host):
    """Recover the OAI identifier when it is not defined
    or not well formed.

    The record is left untouched when ``record.oai()`` is defined and
    matches ``REG_OAI``. Otherwise the identifier is rebuilt from
    the host and ``record.id()`` and stored in the record.

    Args:
        record (RecordPubli):
            record describing a publication.

        host (str):
            possible values are ``cds.cern.ch`` or ``inspirehep.net``

    Raises:
        ValueError:
            the host is not one of the possible values.

    """
    # Note:
    # For the record cds 1951625, possible values are:
    # oai:cds.cern.ch:1951625 (if it does not exist in inspirehep)
    # oai:cds.cern.ch:1951625, oai:inspirehep.net:1319638 (if it exists
    # in both stores)
    # In all cases the first OAI corresponds to the record.id()
    #
    current_oai = record.oai()
    well_formed = current_oai is not None and REG_OAI.match(current_oai)
    if well_formed:
        return

    # the field / subfield receiving the identifier depends on the host
    if host == "cds.cern.ch":
        location = ("0248", "a")
    elif host == "inspirehep.net":
        location = ("909CO", "o")
    else:
        raise ValueError(MSG_INVALID_HOST)

    field, subfield = location

    # create the field on the fly when the record does not have it
    if field not in record:
        record[field] = dict()

    record[field][subfield] = OAI_INVENIO % (host, record.id())
def submitted(self, record): def submitted(self, record):
"""Standardise the submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``. """Standardise the submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.
Look for alternative when it is not defined. Look for alternative when it is not defined.
...@@ -875,11 +824,14 @@ class CheckAndFix(object): ...@@ -875,11 +824,14 @@ class CheckAndFix(object):
date = record.submitted() date = record.submitted()
# recover missing date using conference, preprint, thesis information # recover missing date using conference, preprint, thesis information
if len(date) == 0: if len(date) < 7:
date = self._recover_submitted(record) date = self._recover_submitted(record)
if len(date) == 0: if len(date) == 0:
raise CheckException(MSG_NO_DATE) raise CheckException(MSG_NO_DATE)
elif len(date) < 7:
raise CheckException(MSG_WELL_FORMED_DATE)
# 22 Mar 2011 # 22 Mar 2011
m = DECODE_DD_MMM_YYYY.match(date) m = DECODE_DD_MMM_YYYY.match(date)
if m: if m:
...@@ -892,16 +844,11 @@ class CheckAndFix(object): ...@@ -892,16 +844,11 @@ class CheckAndFix(object):
data = (m.group(3), int(m.group(2)), int(m.group(1))) data = (m.group(3), int(m.group(2)), int(m.group(1)))
date = '%s-%02i-%02i' % data date = '%s-%02i-%02i' % data
# 2011 if u"prepublication" in record:
m_year = DECODE_YYYY.match(date) record[u"prepublication"][u"date"] = date
if m_year:
date = self._recover_submitted(record)
# check the minimum requirement is 2001-05
if not REG_SUBMITTED.match(date):
raise CheckException(MSG_WELL_FORMED_DATE)
record[u"prepublication"][u"date"] = date else:
record[u"prepublication"] = {u"date": date}
@staticmethod @staticmethod
def temporary_record(record): def temporary_record(record):
......
# -*- coding: utf-8 -*-
"""test_12_CheckAndFix_thesis
* Test the CheckAndFix methods for a thesis.
  Use one thesis record from cds.cern.ch

    - is_thesis
    - submitted
    - format_universities
    - format_authors (already tested with article)
    - get_my_authors (already tested with article)
"""
import pytest
from harvest_tools.checkandfix import CheckAndFix
from invenio_tools import load_record
@pytest.fixture(scope="module")
def reccds():
    """Load the record 1394605 from cds.cern.ch, once for the module."""
    host, rec_id = "cds.cern.ch", 1394605
    return load_record(host, rec_id)
@pytest.fixture(scope="module")
def svc():
    """Build the CheckAndFix instance, once for the module."""
    service = CheckAndFix()
    return service
def test_is_thesis(svc, reccds):
    """``is_thesis`` returns ``None`` for the sample record."""
    outcome = svc.is_thesis(reccds)
    assert outcome is None
def test_submitted(svc, reccds):
    """``submitted`` sets a ``YYYY-MM`` date on a record having none."""
    # initial state: a defence date limited to the year, no submitted date
    defense = reccds.these_defense()
    assert defense == "2011"

    initial = reccds.submitted()
    assert initial == ""

    # the fix is applied in place on the record
    svc.submitted(reccds)

    fixed = reccds.submitted()
    assert fixed == "2011-11"
def test_format_universities(svc, reccds):
    """``format_universities`` rewrites the university of two records.

    The expected name differs between the two records
    (labelled 2011 and 2013 in the comments below).

    """
    def university(rec):
        # shortcut to the university stored in the dissertation note
        return rec[u"dissertation_note"][u"university"]

    # Khanji in 2011 (Université de la Méditerrannée)
    assert university(reccds) == "Marseille U., Luminy"

    svc.format_universities(reccds)
    assert university(reccds) == \
        u"Université de la Méditerrannée Aix-Marseille II"

    # Chen in 2013 (Aix Marseille Université)
    reccds2 = load_record("cds.cern.ch", 1632177)
    assert university(reccds2) == u"Shandong U. & Marseille, CPPM"

    svc.format_universities(reccds2)
    assert university(reccds2) == u"Aix Marseille Université"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment