Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit e98db28b authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Migrate check and fix method to RecordCdsConf and RecordHepConf

parent a0dd169b
......@@ -3,10 +3,12 @@
"""
from .automaton import Automaton
from .base import (learn_my_authors,
MSG_CRASH,
MSG_FIX_ORIGIN,
MSG_IN_DB,
MSG_LOAD)
from plugin_dbui import get_id, UNDEF_ID
from store_tools import CheckException
MSG_NO_EDITOR = "Reject article is not published"
MSG_NOT_ARTICLE = "Reject publication is not and article"
......@@ -52,7 +54,7 @@ class Articles(Automaton):
can not be corrected.
"""
self.logger.debug(f"{T4}check record (article)")
self.logger.debug(f"{T4}check and fix record (article)")
if record.subtype() == "article":
self.logs[-1].reject(MSG_NOT_ARTICLE, record)
......@@ -74,10 +76,14 @@ class Articles(Automaton):
record.check_publisher(self.db)
record.check_paper_reference()
except Exception as e:
except CheckException as e:
self.logs[-1].reject(e, record=record)
return False
except Exception as e:
self.logs[-1].reject(MSG_CRASH % e, record=record, translate=False)
return False
return True
def get_record_by_fields(self,
......
......@@ -6,6 +6,8 @@ from .base import MSG_CRASH, MSG_LOAD
from .checkandfix import CheckException
from plugin_dbui import get_id, UNDEF_ID
MSG_NOT_PROCEEDING = "Reject publication is not a proceeding"
T4 = " "*4
......@@ -13,6 +15,7 @@ class Proceedings(Automaton):
"""Automaton for conference proceedings.
"""
def check_record(self, record):
"""Check the content of the proceeding in order to fix non conformities.
......@@ -26,24 +29,29 @@ class Proceedings(Automaton):
corrected.
"""
if not Automaton.check_record(self, record):
return False
self.logger.debug(f"{T4}check nd fix record (proceeding)")
self.logger.debug(f"{T4}check record (proceeding)")
if record.subtype() == "proceeding":
self.logs[-1].reject(MSG_NOT_PROCEEDING, record)
return False
try:
self.check.is_conference(record)
self.check.country(record)
self.check.conference_date(record)
self.check.submitted(record)
self.check.format_editor(record)
self.check.publisher(record)
self.check.paper_reference(record)
self.check.format_authors(record, fmt="F. Last")
self.check.get_my_authors(record, sort=True)
# is with authors form my institute
# standardise name of collaboration
# format authors according to my format
# extract authors form my institute signing the publication
# is submitted date well formed
record.check_and_fix(self.rex_institute,
fmt_author="F. Last",
sep_author=", ",
sort_author=True)
record.check_country()
record.check_conference_date()
record.format_editor()
record.check_publisher(self.db)
record.check_paper_reference()
except CheckException as e:
self.logs[-1].reject(e, record=record)
......
......@@ -15,6 +15,7 @@ INS = ("inspirehep", "inspirehep.net")
MSG_INV_CONF = "Reject invalid conference information"
MSG_INV_CONF_KEY = "Reject invalid conference key"
MSG_NO_CONF = "Reject no conference information"
MSG_NO_CONF_DATE = "Reject no conference date"
MSG_NO_CONF_ID_KEY = "Reject no conference identifier and key"
MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_ENTRY = "Reject %s is not defined"
......@@ -25,6 +26,7 @@ MSG_NO_SHELF = "No shelf %s for store %s"
MSG_NO_THESIS = "Reject no thesis information"
MSG_TOOMANY_SYNONYM = "Reject too many %s synonyms"
MSG_UNKNOWN_COLLABORATION = "Reject collaboration is unknown."
MSG_UNKNOWN_COUNTRY = "Reject country is unknown."
MSG_WELL_FORMED_COLLABORATION = "Reject collaboration is not well formed"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
......
"""confmixin.py
"""
from .base import (MSG_NO_CONF_DATE,
MSG_UNKNOWN_COUNTRY,
REG_DATE,
search_synonym,
T6)
from .exception import CheckException
from datetime import datetime
from plugin_dbui import CLEAN_SPACES
from plugin_dbui import CLEAN_SPACES, UNDEF_ID
class ConfMixin(object):
"""Mixin to handle conference data.
The parent class must have the attribute ``conference``.
It is a dictionary with at least the following keys:
The parent class must have the attribute ``conference``:
* addresses: [{cities: [], country: str, ...}, ...]
* cnum: str
* control_number: int
* closing_date: str
* opening_date: str
* titles: [{value: str}, ...]
* urls: [{value: str}, ...]
+----------------+----------------------------------------+
| key | value |
+----------------+----------------------------------------+
| addresses | [{cities: [], country: str, ...}, ...] |
| cnum | str |
| control_number | int |
| closing_date | str |
| opening_date | str |
| titles | [{title: str}, ...] |
| urls | [{value: str}, ...] |
+----------------+----------------------------------------+
"""
def check_conference_date(self):
    """Verify that the dates of the conference are defined.

    Note:
        Only the presence of the dates is tested here. According to the
        original note, the date is well formed by construction
        (conference_date).

    Raises:
        CheckException:
            the conference dates are not found.

    """
    self.logger.debug(f"{T6}check conference date")

    dates = self.conference_dates()
    if len(dates) > 0:
        return

    raise CheckException(MSG_NO_CONF_DATE)
def check_country(self, db=None):
    """Replace the conference country by the value stored in the database.

    The country of the conference is searched in the table ``countries``
    (country or synonym). When the database value differs from the one
    of the record, the first address of the conference is updated.
    The check is skipped when no database connection is given.

    Args:
        db (pydal.DAL):
            database connection

    Raises:
        CheckException:
            * the country is unknown (neither country nor synonym)
            * more than one synonym found (presumably raised by
              ``search_synonym`` -- to be confirmed).

    """
    logger = self.logger

    if db is None:
        logger.debug(f"{T6}skip check country -- db is None")
        return

    logger.debug(f"{T6}check country")

    current = self.conference_country()

    # an empty country is never searched, it is rejected as unknown
    country_id = UNDEF_ID
    if len(current) > 0:
        country_id = search_synonym(db.countries, "country", current)

    if country_id == UNDEF_ID:
        raise CheckException(MSG_UNKNOWN_COUNTRY)

    # write back the database spelling when the record uses another one
    reference = db.countries[country_id].country
    if reference != current:
        self.conference["addresses"][0]["country"] = reference
def conference_country(self):
"""The country where the conference took place.
......@@ -57,7 +114,10 @@ class ConfMixin(object):
opening = conference.get("opening_date", None)
closing = conference.get("closing_date", None)
if opening is None or closing is None:
if opening is None or REG_DATE.match(opening) is None:
return ""
if closing is None or REG_DATE.match(closing) is None:
return ""
ds = datetime.strptime(opening, "%Y-%m-%d")
......
......@@ -3,9 +3,13 @@
"""
import re
from .base import T4, T6
from .base import (MSG_WELL_FORMED_DATE,
REG_DATE_YYYYMM,
T4,
T6)
from .cdsstore import CdsStore
from .confmixin import ConfMixin
from .exception import CheckException
from .recordcdspubli import RecordCdsPubli
REX_DATE8 = re.compile(r"(\d{4})(\d{2})(\d{2})")
......@@ -17,14 +21,19 @@ class RecordCdsConfPaper(RecordCdsPubli, ConfMixin):
Attributes:
conference (dict or None):
the conference metadata:
* addresses: [{cities: [], country: str, ...}, ...]
* cnum: str
* control_number: int
* closing_date: str
* opening_date: str
* titles: [{value: str}, ...]
* urls: [{value: str}, ...]
* year
+----------------+----------------------------------------+
| key | value |
+----------------+----------------------------------------+
| addresses | [{cities: [], country: str, ...}, ...] |
| cnum | str |
| control_number | int |
| closing_date | str |
| opening_date | str |
| titles | [{title: str}, ...] |
| urls | [{value: str}, ...] |
| year | str |
+----------------+----------------------------------------+
"""
......@@ -61,7 +70,7 @@ class RecordCdsConfPaper(RecordCdsPubli, ConfMixin):
store = CdsStore("cds.cern.ch")
if conf_id is not None:
logger.debug(f"{T6}search by conference by id {conf_id}")
logger.debug(f"{T6}search conference by id {conf_id}")
recjson = store.get_record(conf_id)
if recjson["recid"] != int(conf_id):
......@@ -72,7 +81,7 @@ class RecordCdsConfPaper(RecordCdsPubli, ConfMixin):
return
elif conf_key is not None:
logger.debug(f"{T6}search by conference by key {conf_key}")
logger.debug(f"{T6}search conference by key {conf_key}")
ids = store.get_ids(p=conf_key)
mtch = False
......@@ -132,3 +141,34 @@ class RecordCdsConfPaper(RecordCdsPubli, ConfMixin):
#
# Append conference data
self.conference = dct
def check_submitted_date(self):
    """Check that submitted date is either ``YYYY-MM`` or ``YYYY-MM-DD``.

    When the submitted date is not well formed, it is replaced by the
    opening date of the conference, stored in the ``prepublication``
    entry of the record.

    Raises:
        CheckException:
            the date is not well formed and it can not be recovered
            since the opening date of the conference is not available.

    """
    self.logger.debug(f"{T6}check submitted date")

    xdate = self.submitted()
    if REG_DATE_YYYYMM.match(xdate):
        return

    # recover by using the opening date of the conference
    # the conference attribute is either a dict or None, in the latter
    # case the record has to be rejected and not to crash the check
    conference = self.conference
    val = None
    if conference is not None:
        val = conference.get("opening_date", None)

    if val is None:
        raise CheckException(MSG_WELL_FORMED_DATE)

    if "prepublication" not in self:
        self["prepublication"] = {"date": val}
        return

    # prepublication is either a dictionary or a list of dictionaries
    prepublication = self["prepublication"]
    if isinstance(prepublication, list):
        prepublication[0]["date"] = val
    else:
        prepublication["date"] = val
......@@ -833,7 +833,7 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
Returns:
str:
* "articles", "preprint", "note" or "report"
* "articles", "preprint", "proceeding", "note" or "report"
* empty string when it is not defined
"""
......@@ -843,7 +843,10 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
lst = [dct.get("primary", "").lower() for dct in collection]
# order matter since note can have preprint+note
if "conferencepaper" in lst:
return "proceeding"
# order matter since we have (preprint+note)
for val in ("article", "note", "report", "preprint"):
if val in lst:
return val
......
......@@ -3,8 +3,12 @@
"""
import requests
from .base import T4, T6
from .base import (MSG_WELL_FORMED_DATE,
REG_DATE_YYYYMM,
T4,
T6)
from .confmixin import ConfMixin
from .exception import CheckException
from .recordheppubli import RecordHepPubli
......@@ -21,13 +25,17 @@ class RecordHepConfPaper(RecordHepPubli, ConfMixin):
https://inspire-schemas.readthedocs.io/en/latest/schemas/
Main information are:
* addresses: [{cities: [], country: str, ...}, ...]
* cnum: str
* control_number: int
* closing_date: str
* opening_date: str
* titles: [{value: str}, ...]
* urls: [{value: str}, ...]
+----------------+----------------------------------------+
| key | value |
+----------------+----------------------------------------+
| addresses | [{cities: [], country: str, ...}, ...] |
| cnum | str |
| control_number | int |
| closing_date | str |
| opening_date | str |
| titles | [{title: str}, ...] |
| urls | [{value: str}, ...] |
+----------------+----------------------------------------+
"""
......@@ -72,3 +80,25 @@ class RecordHepConfPaper(RecordHepPubli, ConfMixin):
# append conference data
self.conference = obj.get("metadata", None)
def check_submitted_date(self):
    """Check that submitted date is either ``YYYY-MM`` or ``YYYY-MM-DD``.

    When the submitted date is not well formed, it is replaced by the
    opening date of the conference, stored in the ``preprint_date``
    entry of the record.

    Raises:
        CheckException:
            the date is not well formed and it can not be recovered
            since the opening date of the conference is not available.

    """
    self.logger.debug(f"{T6}check submitted date")

    xdate = self.submitted()
    if REG_DATE_YYYYMM.match(xdate):
        return

    # recover by using the opening date of the conference
    # the conference metadata can be None, in that case the record
    # has to be rejected and not to crash the check
    conference = self.conference
    val = None
    if conference is not None:
        val = conference.get("opening_date", None)

    if val is None:
        raise CheckException(MSG_WELL_FORMED_DATE)

    self["preprint_date"] = val
......@@ -275,7 +275,7 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
"""
def check_submitted_date(self):
"""Check that submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.
"""Check that submitted date is either ``YYYY-MM`` or ``YYYY-MM-DD``.
Raises:
CheckException::
......@@ -530,7 +530,7 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
Returns:
str:
* "articles", "preprint", "note" or "report"
* "articles", "preprint", "proceeding", "note" or "report"
* empty string when it is not defined
"""
......@@ -540,13 +540,14 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
doctype = (doctype[0] if isinstance(doctype, list) else doctype)
if doctype != "article":
return doctype
if doctype == "article":
pubinfo = self.get("publication_info", None)
return ("preprint" if pubinfo is None else "article")
# separate article from preprint
# in the latter case publication_info is missing
pubinfo = self.get("publication_info", None)
return ("preprint" if pubinfo is None else "article")
elif doctype == "conference paper":
return ("proceeding" if self.is_published() else "")
return doctype
def title(self):
"""The title of the publication.
......
......@@ -29,7 +29,7 @@ def record():
return load_record("cds.cern.ch", 2242641)
def test_subtype_ins_13001(record):
def test_subtype_13001(record):
assert record.subtype() == "article"
......@@ -120,91 +120,3 @@ def test_check_paper_reference_13022(record):
# check_paper_reference is a dummy method to preserve interface
assert record.paper_reference() == "Phys. Rev. D 95 2017 052005"
assert record.check_paper_reference() is None
# def test_format_editor_cds_13001(svc, reccds):
#
# # cds
# assert reccds.paper_editor() == "Phys. Rev. D"
# assert reccds.paper_volume() == "95"
#
# svc.format_editor(reccds)
#
# assert reccds.paper_editor() == "Phys. Rev. D"
# assert reccds.paper_volume() == "95"
#
#
# def test_format_editor_ins_13002(svc, recins):
# # inspire
# assert recins.paper_editor() == "Phys. Rev. D"
# assert recins.paper_volume() == "95"
#
# svc.format_editor(recins)
#
# assert recins.paper_editor() == "Phys. Rev. D"
# assert recins.paper_volume() == "95"
#
#
# def test_publisher_cds_13003(svc, reccds):
# assert svc.publisher(reccds) is None
#
#
# def test_paper_reference_cds_13004(svc, reccds):
#
# # check recovery procedure using DOI
# # remove the publisher and volume information
# paper_ref = reccds.paper_reference()
#
# reccds.df_info.loc[0, ["title", "volume"]] = ["", ""]
# svc.paper_reference(reccds)
#
# assert reccds.paper_reference() == paper_ref
#
#
# def test_submitted_cds_13005(svc, reccds):
#
# assert reccds.submitted() == "19 Jan 2017"
# svc.submitted(reccds)
# assert reccds.submitted() == "2017-01-19"
#
# # test the case 19 01 2017
# reccds["prepublication"]["date"] = "19 01 2017"
# svc.submitted(reccds)
# assert reccds.submitted() == "2017-01-19"
#
# # test the case 2017
# reccds["prepublication"]["date"] = "2017"
# svc.submitted(reccds)
# assert reccds.submitted() == "2017-01"
#
#
# def test_format_authors_cds_13007(svc, reccds):
#
# authors = reccds.authors_as_list()
#
# assert len(authors) == reccds["number_of_authors"]
# assert authors[0] == "Aaij, Roel"
# assert authors[1] == "Adeva, Bernardo"
# assert authors[344] == "Koopman, Rose"
# assert authors[-1] == "Zucchelli, Stefano"
#
# svc.format_authors(reccds, fmt="F. Last")
# authors = reccds.authors_as_list()
#
# assert authors[0] == "R. Aaij"
# assert authors[1] == "B. Adeva"
# assert authors[344] == "R. Koopman"
# assert authors[-1] == "S. Zucchelli"
#
#
# def test_get_my_authors_cds_13008(svc, reccds):
#
# svc.format_authors(reccds, fmt="F. Last")
# assert svc.get_my_authors(reccds, sep="|", sort=True) is None
#
# my_authors = reccds.my_authors
#
# assert my_authors == "J. Arnau Romeu|E. Aslanides|J. Cogan|" \
# "K. De Bruyn|R. Le Gac|O. Leroy|" \
# "G. Mancinelli|M. Martin|A. Mordà|" \
# "J. Serrano|A. Tayduganov|A. Tsaregorodtsev"
"""test_15_CheckAndFix_proceeding
* Test CheckAndFix methods for proceeding.
Use the same proceeding in cds.cern.ch and inspirehep.net
- is_conference
- country
- conference_date
- submitted
- format_editor (already tested with article)
- publisher (already tested with article)
- paper_reference (already tested with article)
- format_authors (already tested with article)
- get_my_authors (already tested with article)
"""
import pytest
from harvest_tools.checkandfix import CheckAndFix
from harvest_tools.exception import CheckException
from store_tools import load_record
@pytest.fixture(scope="module")
def reccds():
    """Record 1411352 loaded from cds.cern.ch, shared by the module."""
    record = load_record("cds.cern.ch", 1411352)
    return record
@pytest.fixture(scope="module")
def recins():
    """Record 1089237 loaded from inspirehep.net, shared by the module."""
    record = load_record("inspirehep.net", 1089237, shelf="literature")
    return record
@pytest.fixture(scope="module")
def svc():
    """CheckAndFix service, shared by the module."""
    service = CheckAndFix()
    return service
def test_is_conference_cds_15001(svc, reccds):
    """``is_conference`` returns None for the cds record and raises for a talk."""
    outcome = svc.is_conference(reccds)
    assert outcome is None

    # test exception
    # the publication cds2242595 is a talk without conference data
    talk = load_record("cds.cern.ch", 2242595)
    with pytest.raises(CheckException):
        svc.is_conference(talk)
def test_is_conference_ins_15002(svc, recins):
    """``is_conference`` returns None for the inspirehep record."""
    outcome = svc.is_conference(recins)
    assert outcome is None
def test_country_cds_15003(svc, reccds):
    """``country`` returns None for the cds record."""
    outcome = svc.country(reccds)
    assert outcome is None
def test_country_ins_15004(svc, recins):
    """``country`` returns None for the inspirehep record."""
    outcome = svc.country(recins)
    assert outcome is None
def test_conference_date_cds_15005(svc, reccds):
    """``conference_date`` removes the spaces around the dash of the dates."""
    before = reccds.conference_dates()
    assert before == "6 - 11 Dec 2010"

    svc.conference_date(reccds)

    after = reccds.conference_dates()
    assert after == "6-11 Dec 2010"
def test_conference_date_cds_15006(svc):
reccds = load_record("cds.cern.ch", 2688580)
assert reccds.conference_dates() == "04-06 Sept 2019"
svc.conference_date(reccds)