Commit e8154552 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Migrate CheckAndFix: methods required by the harvester Proceeding.

parent 56512435
...@@ -7,6 +7,7 @@ import re ...@@ -7,6 +7,7 @@ import re
import regex import regex
from .base import search_synonym, ToolException from .base import search_synonym, ToolException
from datetime import datetime
from .exception import CheckException from .exception import CheckException
from gluon import current from gluon import current
from invenio_tools import (DECODE_REF, from invenio_tools import (DECODE_REF,
...@@ -81,6 +82,11 @@ REG_DOI = re.compile(r"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)") ...@@ -81,6 +82,11 @@ REG_DOI = re.compile(r"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)")
REG_SUBMITTED = re.compile(regex.REG_SUBMITTED) REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)
REG_WELL_FORMED_CONF_DATES_1 = re.compile("\d{2} - \d{2} [A-Z][a-z]{2} \d{4}")
REG_WELL_FORMED_CONF_DATES_2 = \
re.compile("\d{2} [A-Z][a-z]{2} - \d{2} [A-Z][a-z]{2} \d{4}")
UNIVERSITY = "University" UNIVERSITY = "University"
...@@ -101,6 +107,74 @@ class CheckAndFix(object): ...@@ -101,6 +107,74 @@ class CheckAndFix(object):
# private cache for my authors list # private cache for my authors list
self._my_authors = {} self._my_authors = {}
@staticmethod
def _get_conference_dates(record):
"""Return the opening and closing dates of a conference.
Args:
record (RecordConf):
record describing a conference proceeding or talk.
Returns:
tuple of datetime.date:
opening and closing dates.
Raise:
ToolException:
no conference date found.
"""
if u"meeting_name" not in record:
raise ToolException(MSG_NO_CONF_DATE)
meeting = record[u"meeting_name"]
meeting = (meeting[0] if isinstance(meeting, list) else meeting)
# CDS has the opening and closing dates encoded as 20141231
if u"opening_date" in meeting and u"closing_date" in meeting:
fmt = "%Y%m%d"
val = meeting[u"opening_date"]
opening = datetime.strptime(val, fmt)
val = meeting[u"closing_date"]
closing = datetime.strptime(val, fmt)
return (opening, closing)
# both CDS and INSPIRE have the dates subfield
val = meeting[u"date"]
# date is encode as 12 - 15 Mar 2014
m = REG_CONF_DATES_1.match(val)
if m:
fmt = "%d-%b-%Y"
val = u"%s-%s-%s" % (m.group(1), m.group(3), m.group(4))
opening = datetime.strptime(val, fmt)
val = u"%s-%s-%s" % (m.group(2), m.group(3), m.group(4))
closing = datetime.strptime(val, fmt)
return (opening, closing)
# dates are encoded 29 Feb - 1 Mar 2014
m = REG_CONF_DATES_2.match(val)
if not m:
raise ToolException(MSG_NO_CONF_DATE)
fmt = "%d-%b-%Y"
val = u"%s-%s-%s" % (m.group(1), m.group(2), m.group(5))
opening = datetime.strptime(val, fmt)
val = u"%s-%s-%s" % (m.group(3), m.group(4), m.group(5))
closing = datetime.strptime(val, fmt)
return (opening, closing)
def _get_reg_institute(self): def _get_reg_institute(self):
"""Get the regular expression defining the affiliation of my institute. """Get the regular expression defining the affiliation of my institute.
...@@ -232,8 +306,7 @@ class CheckAndFix(object): ...@@ -232,8 +306,7 @@ class CheckAndFix(object):
return False return False
@staticmethod def _recover_submitted(self, record):
def _recover_submitted(record):
"""Recover submitted date using conference, preprint or thesis """Recover submitted date using conference, preprint or thesis
information. information.
...@@ -249,25 +322,8 @@ class CheckAndFix(object): ...@@ -249,25 +322,8 @@ class CheckAndFix(object):
val = u"" val = u""
if isinstance(record, RecordConf): if isinstance(record, RecordConf):
# CDS opening date is encoded as 20141231 opening, closing = self._get_conference_dates(record)
if u"opening_date" in record[u"meeting_name"]: return opening.strftime("%Y-%m-%d")
val = record[u"meeting_name"][u"opening_date"]
val = "%s-%s-%s" % (val[0:4], val[4:6], val[6:8])
# CDS / INSPIREHEP date
# date is encoded as 12 - 15 Mar 2014 or 29 Feb - 1 Mar 2014
# decode as DD-MMM-YYYY
elif u"date" in record[u"meeting_name"]:
val = record[u"meeting_name"][u"date"]
m1 = REG_CONF_DATES_1.match(val)
m2 = REG_CONF_DATES_2.match(val)
if m1:
val = u"%s-%s-%s" % (m1.group(1), m1.group(3), m1.group(4))
elif m2:
val = u"%s-%s-%s" % (m1.group(1), m1.group(2), m1.group(5))
elif isinstance(record, RecordThesis): elif isinstance(record, RecordThesis):
val = record.these_defense() val = record.these_defense()
...@@ -316,10 +372,8 @@ class CheckAndFix(object): ...@@ -316,10 +372,8 @@ class CheckAndFix(object):
if not val: if not val:
return return
db = self.db
try: try:
search_synonym(db.collaborations, "collaboration", val) search_synonym(self.db.collaborations, "collaboration", val)
except ToolException as e: except ToolException as e:
raise CheckException(*e.args) raise CheckException(*e.args)
...@@ -329,69 +383,70 @@ class CheckAndFix(object): ...@@ -329,69 +383,70 @@ class CheckAndFix(object):
Have a look to the synonyms when the country does not exist. Have a look to the synonyms when the country does not exist.
Args: Args:
record (RecordConf): record describing a talk or a proceeding. record (RecordConf):
record describing a talk or a proceeding.
Raises: Raises:
CheckException: when the country is not defined CheckException:
nor entered as a synonym. the country is not defined nor entered as a synonym.
""" """
if not isinstance(record, RecordConf): if not isinstance(record, RecordConf):
return return
db = self.db
val = record.conference_country() val = record.conference_country()
try: try:
search_synonym(db.countries, "country", val) search_synonym(self.db.countries, "country", val)
except ToolException as e: except ToolException as e:
raise CheckException(*e.args) raise CheckException(*e.args)
def conference_date(self, record, host): def conference_date(self, record):
"""Check conference date. """Check conference date and format it properly.
Args: Args:
record (RecordConf): record describing a talk or a proceeding. record (RecordConf):
host (str): possible values ares ``cds.cern.ch`` record describing a talk or a proceeding.
or ``inspirehep.net``
Raises: Raises:
CheckException: when dates are not found or not well formed. CheckException:
dates are not found.
""" """
# conference information are available, i.e proceeding # conference information are available, i.e proceeding
if not isinstance(record, RecordConf): if not isinstance(record, RecordConf):
return return
# inspirehep.net val = record.conference_dates()
if host == "inspirehep.net": if len(val) == 0:
value = record.conference_dates()
if len(value) == 0:
raise CheckException(MSG_NO_CONF_DATE) raise CheckException(MSG_NO_CONF_DATE)
# is it well formed
if REG_WELL_FORMED_CONF_DATES_1.match(val):
return return
# cds.cern.ch if REG_WELL_FORMED_CONF_DATES_2.match(val):
if not ("111" in record and "d" in record["111"]): return
raise CheckException(MSG_NO_CONF_DATE)
value = record["111"]["d"]
m = REG_CONF_DATES.match(value)
if not m:
# 12 - 15 Mar 2014 or 29 Feb - 1 Mar 2014
m1 = REG_CONF_DATES_1.match(value)
m2 = REG_CONF_DATES_2.match(value)
if m1:
record["111"]["d"] = "%s-%s %s %s" % m1.groups()
elif m2: # format the date properly
record["111"]["d"] = "%s %s - %s %s %s" % m2.groups() opening, closing = self._get_conference_dates(record)
if opening.month == closing.month:
val = "%02i - %02i %s %i" % (opening.day,
closing.day,
opening.strftime("%b"),
opening.year)
else: else:
raise CheckException(MSG_WELL_FORMED_CONF_DATES) val = "%02i %s - %02i %s %i" % (opening.day,
opening.strftime("%b"),
closing.day,
closing.strftime("%b"),
opening.year)
meeting = record[u"meeting_name"]
meeting = (meeting[0] if isinstance(meeting, list) else meeting)
meeting[u"date"] = val
def is_bad_oai_used(self, record): def is_bad_oai_used(self, record):
"""Bad OAI is when the ``id`` in the OAI field is different from """Bad OAI is when the ``id`` in the OAI field is different from
...@@ -576,10 +631,12 @@ class CheckAndFix(object): ...@@ -576,10 +631,12 @@ class CheckAndFix(object):
"""Check that the record described a conference talk / proceeding. """Check that the record described a conference talk / proceeding.
Args: Args:
record (RecordPubli): record describing a publication. record (RecordPubli):
record describing a publication.
Raises: Raises:
CheckException: when the record is not associated to a conference. CheckException:
the record is not associated to a conference.
""" """
if not isinstance(record, RecordConf): if not isinstance(record, RecordConf):
......
...@@ -6,7 +6,6 @@ ...@@ -6,7 +6,6 @@
- publisher - publisher
- paper_reference - paper_reference
- submitted - submitted
- year
- format_author - format_author
- get_my_authors - get_my_authors
...@@ -16,7 +15,6 @@ ...@@ -16,7 +15,6 @@
""" """
import pytest import pytest
from harvest_tools.checkandfix import CheckAndFix from harvest_tools.checkandfix import CheckAndFix
from harvest_tools.exception import ToolException
from invenio_tools import load_record from invenio_tools import load_record
...@@ -71,23 +69,13 @@ def test_paper_reference(svc, reccds): ...@@ -71,23 +69,13 @@ def test_paper_reference(svc, reccds):
assert reccds.paper_reference() == paper_ref assert reccds.paper_reference() == paper_ref
# Paper is published but there are error in the paper reference
# Correct reference is Eur. Phys. J. C 75 (2015) 158
# But volume is not defined and pagination is wrong (75)
# It is not possible to recover it from the doi data.
record = load_record("cds.cern.ch", 1753190)
with pytest.raises(ToolException):
svc.paper_reference(record)
def test_submitted(svc, reccds, recins): def test_submitted(svc, reccds, recins):
assert reccds.submitted() == "19 Jan 2017"
assert recins.submitted() == "2017-01-19" assert recins.submitted() == "2017-01-19"
assert reccds.submitted() == "19 Jan 2017"
svc.submitted(reccds) svc.submitted(reccds)
reccds.submitted()
assert reccds.submitted() == "2017-01-19" assert reccds.submitted() == "2017-01-19"
# test the case 19 01 2017 # test the case 19 01 2017
......
# -*- coding: utf-8 -*-
"""test_11_CheckAndFix_proceeding
* Test CheckAndFix methods for proceeding.
Use the same proceeding in cds.cern.ch and inspirehep.net
- is_conference
- country
- conference_date
- submitted
- format_editor (already test with article)
- publisher (already test with article)
- paper_reference (already test with article)
- format_authors (already test with article)
- get_my_authors (already test with article)
"""
import pytest
from harvest_tools.checkandfix import CheckAndFix
from invenio_tools import load_record
@pytest.fixture(scope="module")
def reccds():
return load_record("cds.cern.ch", 1411352)
@pytest.fixture(scope="module")
def recins():
return load_record("inspirehep.net", 1089237)
@pytest.fixture(scope="module")
def svc():
return CheckAndFix()
def test_is_conference(svc, reccds, recins):
assert svc.is_conference(reccds) is None
assert svc.is_conference(recins) is None
def test_country(svc, reccds, recins):
assert svc.country(reccds) is None
assert svc.country(recins) is None
def test_conference_date(svc, reccds, recins):
assert reccds.conference_dates() == "6 - 11 Dec 2010"
svc.conference_date(reccds)
assert reccds.conference_dates() == "06 - 11 Dec 2010"
assert recins.conference_dates() == "6-11 Dec 2010"
svc.conference_date(recins)
assert recins.conference_dates() == "06 - 11 Dec 2010"
def test_submitted(svc, reccds, recins):
assert reccds.submitted() == "05 Jan 2012"
svc.submitted(reccds)
assert reccds.submitted() == "2012-01-05"
assert recins.submitted() == "2011"
svc.submitted(recins)
assert recins.submitted() == "2010-12-06"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment