Commit e98db28b authored by LE GAC Renaud
Browse files

Migrate check and fix method to RecordCdsConf and RecordHepConf

parent a0dd169b
......@@ -3,10 +3,12 @@
"""
from .automaton import Automaton
from .base import (learn_my_authors,
MSG_CRASH,
MSG_FIX_ORIGIN,
MSG_IN_DB,
MSG_LOAD)
from plugin_dbui import get_id, UNDEF_ID
from store_tools import CheckException
MSG_NO_EDITOR = "Reject article is not published"
MSG_NOT_ARTICLE = "Reject publication is not and article"
......@@ -52,7 +54,7 @@ class Articles(Automaton):
can not be corrected.
"""
self.logger.debug(f"{T4}check record (article)")
self.logger.debug(f"{T4}check and fix record (article)")
if record.subtype() == "article":
self.logs[-1].reject(MSG_NOT_ARTICLE, record)
......@@ -74,10 +76,14 @@ class Articles(Automaton):
record.check_publisher(self.db)
record.check_paper_reference()
except Exception as e:
except CheckException as e:
self.logs[-1].reject(e, record=record)
return False
except Exception as e:
self.logs[-1].reject(MSG_CRASH % e, record=record, translate=False)
return False
return True
def get_record_by_fields(self,
......
......@@ -6,6 +6,8 @@ from .base import MSG_CRASH, MSG_LOAD
from .checkandfix import CheckException
from plugin_dbui import get_id, UNDEF_ID
MSG_NOT_PROCEEDING = "Reject publication is not a proceeding"
T4 = " "*4
......@@ -13,6 +15,7 @@ class Proceedings(Automaton):
"""Automaton for conference proceedings.
"""
def check_record(self, record):
"""Check the content of the proceeding in order to fix non conformities.
......@@ -26,24 +29,29 @@ class Proceedings(Automaton):
corrected.
"""
if not Automaton.check_record(self, record):
return False
self.logger.debug(f"{T4}check nd fix record (proceeding)")
self.logger.debug(f"{T4}check record (proceeding)")
if record.subtype() == "proceeding":
self.logs[-1].reject(MSG_NOT_PROCEEDING, record)
return False
try:
self.check.is_conference(record)
self.check.country(record)
self.check.conference_date(record)
self.check.submitted(record)
self.check.format_editor(record)
self.check.publisher(record)
self.check.paper_reference(record)
self.check.format_authors(record, fmt="F. Last")
self.check.get_my_authors(record, sort=True)
# is with authors from my institute
# standardise name of collaboration
# format authors according to my format
# extract authors from my institute signing the publication
# is submitted date well formed
record.check_and_fix(self.rex_institute,
fmt_author="F. Last",
sep_author=", ",
sort_author=True)
record.check_country()
record.check_conference_date()
record.format_editor()
record.check_publisher(self.db)
record.check_paper_reference()
except CheckException as e:
self.logs[-1].reject(e, record=record)
......
......@@ -15,6 +15,7 @@ INS = ("inspirehep", "inspirehep.net")
MSG_INV_CONF = "Reject invalid conference information"
MSG_INV_CONF_KEY = "Reject invalid conference key"
MSG_NO_CONF = "Reject no conference information"
MSG_NO_CONF_DATE = "Reject no conference date"
MSG_NO_CONF_ID_KEY = "Reject no conference identifier and key"
MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_ENTRY = "Reject %s is not defined"
......@@ -25,6 +26,7 @@ MSG_NO_SHELF = "No shelf %s for store %s"
MSG_NO_THESIS = "Reject no thesis information"
MSG_TOOMANY_SYNONYM = "Reject too many %s synonyms"
MSG_UNKNOWN_COLLABORATION = "Reject collaboration is unknown."
MSG_UNKNOWN_COUNTRY = "Reject country is unknown."
MSG_WELL_FORMED_COLLABORATION = "Reject collaboration is not well formed"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
......
"""confmixin.py
"""
from .base import (MSG_NO_CONF_DATE,
MSG_UNKNOWN_COUNTRY,
REG_DATE,
search_synonym,
T6)
from .exception import CheckException
from datetime import datetime
from plugin_dbui import CLEAN_SPACES
from plugin_dbui import CLEAN_SPACES, UNDEF_ID
class ConfMixin(object):
"""Mixin to handle conference data.
The parent class must have the attribute ``conference``.
It is a dictionary with at least the following keys:
The parent class must have the attribute ``conference``:
* addresses: [{cities: [], country: str, ...}, ...]
* cnum: str
* control_number: int
* closing_date: str
* opening_date: str
* titles: [{value: str}, ...]
* urls: [{value: str}, ...]
+----------------+----------------------------------------+
| key | value |
+----------------+----------------------------------------+
| addresses | [{cities: [], country: str, ...}, ...] |
| cnum | str |
| control_number | int |
| closing_date | str |
| opening_date | str |
| titles | [{title: str}, ...] |
| urls | [{value: str}, ...] |
+----------------+----------------------------------------+
"""
def check_conference_date(self):
    """Verify that the dates of the conference are available.

    The dates are read through ``conference_dates()``.

    Note:
        According to the original author, the date is well formed
        by construction (conference_date), so only its presence is tested.

    Raises:
        CheckException:
            ``conference_dates()`` returns an empty value.

    """
    self.logger.debug(f"{T6}check conference date")

    dates = self.conference_dates()

    # a non-empty value means that dates were found
    if len(dates) > 0:
        return

    raise CheckException(MSG_NO_CONF_DATE)
def check_country(self, db=None):
    """Replace the conference country by the value stored in the database.

    The country returned by ``conference_country()`` is looked up in
    ``db.countries`` with ``search_synonym``. When the database value
    differs from the current one, the country of the first conference
    address is overwritten.

    Args:
        db (pydal.DAL):
            database connection. The check is skipped when it is None.

    Raises:
        CheckException:
            * the country is empty
            * the country is unknown (``search_synonym`` returns UNDEF_ID)

    Note:
        The original documentation also mentions the case where more
        than one synonym is found. It is presumably raised by
        ``search_synonym`` -- to be confirmed, not visible here.

    """
    if db is None:
        self.logger.debug(f"{T6}skip check country -- db is None")
        return

    self.logger.debug(f"{T6}check country")

    current = self.conference_country()
    if len(current) == 0:
        raise CheckException(MSG_UNKNOWN_COUNTRY)

    country_id = search_synonym(db.countries, "country", current)
    if country_id == UNDEF_ID:
        raise CheckException(MSG_UNKNOWN_COUNTRY)

    # nothing to do when the record already uses the database value
    reference = db.countries[country_id].country
    if reference == current:
        return

    self.conference["addresses"][0]["country"] = reference
def conference_country(self):
"""The country where the conference took place.
......@@ -57,7 +114,10 @@ class ConfMixin(object):
opening = conference.get("opening_date", None)
closing = conference.get("closing_date", None)
if opening is None or closing is None:
if opening is None or REG_DATE.match(opening) is None:
return ""
if closing is None or REG_DATE.match(closing) is None:
return ""
ds = datetime.strptime(opening, "%Y-%m-%d")
......
......@@ -3,9 +3,13 @@
"""
import re
from .base import T4, T6
from .base import (MSG_WELL_FORMED_DATE,
REG_DATE_YYYYMM,
T4,
T6)
from .cdsstore import CdsStore
from .confmixin import ConfMixin
from .exception import CheckException
from .recordcdspubli import RecordCdsPubli
REX_DATE8 = re.compile(r"(\d{4})(\d{2})(\d{2})")
......@@ -17,14 +21,19 @@ class RecordCdsConfPaper(RecordCdsPubli, ConfMixin):
Attributes:
conference (dict or None):
the conference metadata:
* addresses: [{cities: [], country: str, ...}, ...]
* cnum: str
* control_number: int
* closing_date: str
* opening_date: str
* titles: [{value: str}, ...]
* urls: [{value: str}, ...]
* year
+----------------+----------------------------------------+
| key | value |
+----------------+----------------------------------------+
| addresses | [{cities: [], country: str, ...}, ...] |
| cnum | str |
| control_number | int |
| closing_date | str |
| opening_date | str |
| titles | [{title: str}, ...] |
| urls | [{value: str}, ...] |
| year | str |
+----------------+----------------------------------------+
"""
......@@ -61,7 +70,7 @@ class RecordCdsConfPaper(RecordCdsPubli, ConfMixin):
store = CdsStore("cds.cern.ch")
if conf_id is not None:
logger.debug(f"{T6}search by conference by id {conf_id}")
logger.debug(f"{T6}search conference by id {conf_id}")
recjson = store.get_record(conf_id)
if recjson["recid"] != int(conf_id):
......@@ -72,7 +81,7 @@ class RecordCdsConfPaper(RecordCdsPubli, ConfMixin):
return
elif conf_key is not None:
logger.debug(f"{T6}search by conference by key {conf_key}")
logger.debug(f"{T6}search conference by key {conf_key}")
ids = store.get_ids(p=conf_key)
mtch = False
......@@ -132,3 +141,34 @@ class RecordCdsConfPaper(RecordCdsPubli, ConfMixin):
#
# Append conference data
self.conference = dct
def check_submitted_date(self):
    """Check that submitted date is either ``YYYY-MM`` or ``YYYY-MM-DD``.

    When the value returned by ``submitted()`` does not match
    ``REG_DATE_YYYYMM``, the method tries to recover by using the opening
    date of the conference. That date is written in the ``date`` key of
    the ``prepublication`` entry (first element when it is a list).
    The entry is created when it is missing.

    Raises:
        CheckException:
            the date is not well formed and the opening date of
            the conference is not available.

    """
    self.logger.debug(f"{T6}check submitted date")

    xdate = self.submitted()
    if REG_DATE_YYYYMM.match(xdate):
        return

    # recover by using the opening date of the conference
    # the attribute conference is documented as "dict or None":
    # a missing conference is handled as a missing opening date in order
    # to raise the documented CheckException and not an AttributeError
    conference = self.conference
    val = None if conference is None else conference.get("opening_date", None)

    if val is None:
        raise CheckException(MSG_WELL_FORMED_DATE)

    if "prepublication" not in self:
        self["prepublication"] = {"date": val}
        return

    prepublication = self["prepublication"]
    if isinstance(prepublication, list):
        prepublication[0]["date"] = val
    else:
        prepublication["date"] = val
......@@ -833,7 +833,7 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
Returns:
str:
* "articles", "preprint", "note" or "report"
* "articles", "preprint", "proceeding", "note" or "report"
* empty string when it is not defined
"""
......@@ -843,7 +843,10 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
lst = [dct.get("primary", "").lower() for dct in collection]
# order matter since note can have preprint+note
if "conferencepaper" in lst:
return "proceeding"
# order matter since we have (preprint+note)
for val in ("article", "note", "report", "preprint"):
if val in lst:
return val
......
......@@ -3,8 +3,12 @@
"""
import requests
from .base import T4, T6
from .base import (MSG_WELL_FORMED_DATE,
REG_DATE_YYYYMM,
T4,
T6)
from .confmixin import ConfMixin
from .exception import CheckException
from .recordheppubli import RecordHepPubli
......@@ -21,13 +25,17 @@ class RecordHepConfPaper(RecordHepPubli, ConfMixin):
https://inspire-schemas.readthedocs.io/en/latest/schemas/
Main information are:
* addresses: [{cities: [], country: str, ...}, ...]
* cnum: str
* control_number: int
* closing_date: str
* opening_date: str
* titles: [{value: str}, ...]
* urls: [{value: str}, ...]
+----------------+----------------------------------------+
| key | value |
+----------------+----------------------------------------+
| addresses | [{cities: [], country: str, ...}, ...] |
| cnum | str |
| control_number | int |
| closing_date | str |
| opening_date | str |
| titles | [{title: str}, ...] |
| urls | [{value: str}, ...] |
+----------------+----------------------------------------+
"""
......@@ -72,3 +80,25 @@ class RecordHepConfPaper(RecordHepPubli, ConfMixin):
# append conference data
self.conference = obj.get("metadata", None)
def check_submitted_date(self):
    """Check that submitted date is either ``YYYY-MM`` or ``YYYY-MM-DD``.

    When the value returned by ``submitted()`` does not match
    ``REG_DATE_YYYYMM``, the method tries to recover by copying the
    opening date of the conference into the ``preprint_date`` key.

    Raises:
        CheckException:
            the date is not well formed and the opening date of
            the conference is not available.

    """
    self.logger.debug(f"{T6}check submitted date")

    xdate = self.submitted()
    if REG_DATE_YYYYMM.match(xdate):
        return

    # recover by using the opening date of the conference
    # the conference metadata can be None (missing "metadata" entry):
    # it is handled as a missing opening date in order to raise
    # the documented CheckException and not an AttributeError
    conference = self.conference
    val = None if conference is None else conference.get("opening_date", None)

    if val is None:
        raise CheckException(MSG_WELL_FORMED_DATE)

    self["preprint_date"] = val
......@@ -275,7 +275,7 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
"""
def check_submitted_date(self):
"""Check that submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.
"""Check that submitted date is either ``YYYY-MM`` or ``YYYY-MM-DD``.
Raises:
CheckException::
......@@ -530,7 +530,7 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
Returns:
str:
* "articles", "preprint", "note" or "report"
* "articles", "preprint", "proceeding", "note" or "report"
* empty string when it is not defined
"""
......@@ -540,13 +540,14 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
doctype = (doctype[0] if isinstance(doctype, list) else doctype)
if doctype != "article":
return doctype
if doctype == "article":
pubinfo = self.get("publication_info", None)
return ("preprint" if pubinfo is None else "article")
# separate article from preprint
# in the latter case publication_info is missing
pubinfo = self.get("publication_info", None)
return ("preprint" if pubinfo is None else "article")
elif doctype == "conference paper":
return ("proceeding" if self.is_published() else "")
return doctype
def title(self):
"""The title of the publication.
......
......@@ -29,7 +29,7 @@ def record():
return load_record("cds.cern.ch", 2242641)
def test_subtype_ins_13001(record):
def test_subtype_13001(record):
assert record.subtype() == "article"
......@@ -120,91 +120,3 @@ def test_check_paper_reference_13022(record):
# check_paper_reference is a dummy method to preserve interface
assert record.paper_reference() == "Phys. Rev. D 95 2017 052005"
assert record.check_paper_reference() is None
# def test_format_editor_cds_13001(svc, reccds):
#
# # cds
# assert reccds.paper_editor() == "Phys. Rev. D"
# assert reccds.paper_volume() == "95"
#
# svc.format_editor(reccds)
#
# assert reccds.paper_editor() == "Phys. Rev. D"
# assert reccds.paper_volume() == "95"
#
#
# def test_format_editor_ins_13002(svc, recins):
# # inspire
# assert recins.paper_editor() == "Phys. Rev. D"
# assert recins.paper_volume() == "95"
#
# svc.format_editor(recins)
#
# assert recins.paper_editor() == "Phys. Rev. D"
# assert recins.paper_volume() == "95"
#
#
# def test_publisher_cds_13003(svc, reccds):
# assert svc.publisher(reccds) is None
#
#
# def test_paper_reference_cds_13004(svc, reccds):
#
# # check recovery procedure using DOI
# # remove the publisher and volume information
# paper_ref = reccds.paper_reference()
#
# reccds.df_info.loc[0, ["title", "volume"]] = ["", ""]
# svc.paper_reference(reccds)
#
# assert reccds.paper_reference() == paper_ref
#
#
# def test_submitted_cds_13005(svc, reccds):
#
# assert reccds.submitted() == "19 Jan 2017"
# svc.submitted(reccds)
# assert reccds.submitted() == "2017-01-19"
#
# # test the case 19 01 2017
# reccds["prepublication"]["date"] = "19 01 2017"
# svc.submitted(reccds)
# assert reccds.submitted() == "2017-01-19"
#
# # test the case 2017
# reccds["prepublication"]["date"] = "2017"
# svc.submitted(reccds)
# assert reccds.submitted() == "2017-01"
#
#
# def test_format_authors_cds_13007(svc, reccds):
#
# authors = reccds.authors_as_list()
#
# assert len(authors) == reccds["number_of_authors"]
# assert authors[0] == "Aaij, Roel"
# assert authors[1] == "Adeva, Bernardo"
# assert authors[344] == "Koopman, Rose"
# assert authors[-1] == "Zucchelli, Stefano"
#
# svc.format_authors(reccds, fmt="F. Last")
# authors = reccds.authors_as_list()
#
# assert authors[0] == "R. Aaij"
# assert authors[1] == "B. Adeva"
# assert authors[344] == "R. Koopman"
# assert authors[-1] == "S. Zucchelli"
#
#
# def test_get_my_authors_cds_13008(svc, reccds):
#
# svc.format_authors(reccds, fmt="F. Last")
# assert svc.get_my_authors(reccds, sep="|", sort=True) is None
#
# my_authors = reccds.my_authors
#
# assert my_authors == "J. Arnau Romeu|E. Aslanides|J. Cogan|" \
# "K. De Bruyn|R. Le Gac|O. Leroy|" \
# "G. Mancinelli|M. Martin|A. Mordà|" \
# "J. Serrano|A. Tayduganov|A. Tsaregorodtsev"
"""test_15_CheckAndFix_proceeding
* Test CheckAndFix methods for proceeding.
Use the same proceeding in cds.cern.ch and inspirehep.net
- is_conference
- country
- conference_date
- submitted
- format_editor (already tested with article)
- publisher (already tested with article)
- paper_reference (already tested with article)
- format_authors (already tested with article)
- get_my_authors (already tested with article)
"""
import pytest
from harvest_tools.checkandfix import CheckAndFix
from harvest_tools.exception import CheckException
from store_tools import load_record
@pytest.fixture(scope="module")
def reccds():
    """Load the record 1411352 from cds.cern.ch, once per module."""
    record = load_record("cds.cern.ch", 1411352)
    return record
@pytest.fixture(scope="module")
def recins():
    """Load the record 1089237 from the literature shelf of inspirehep.net,
    once per module.
    """
    record = load_record("inspirehep.net", 1089237, shelf="literature")
    return record
@pytest.fixture(scope="module")
def svc():
    """Build the CheckAndFix instance shared by the tests of the module."""
    service = CheckAndFix()
    return service
def test_is_conference_cds_15001(svc, reccds):
    """is_conference returns None for the cds proceeding and raises
    CheckException for a talk without conference data.
    """
    outcome = svc.is_conference(reccds)
    assert outcome is None

    # test exception
    # the publication cds2242595 is a talk without conference data
    talk = load_record("cds.cern.ch", 2242595)

    with pytest.raises(CheckException):
        svc.is_conference(talk)
def test_is_conference_ins_15002(svc, recins):
    """is_conference returns None for the inspirehep proceeding."""
    outcome = svc.is_conference(recins)
    assert outcome is None
def test_country_cds_15003(svc, reccds):
    """country returns None for the cds proceeding."""
    outcome = svc.country(reccds)
    assert outcome is None
def test_country_ins_15004(svc, recins):
    """country returns None for the inspirehep proceeding."""
    outcome = svc.country(recins)
    assert outcome is None
def test_conference_date_cds_15005(svc, reccds):
    """conference_date changes the dates of the cds proceeding
    from "6 - 11 Dec 2010" to "6-11 Dec 2010".
    """
    before = reccds.conference_dates()
    assert before == "6 - 11 Dec 2010"

    svc.conference_date(reccds)

    after = reccds.conference_dates()
    assert after == "6-11 Dec 2010"
def test_conference_date_cds_15006(svc):
reccds = load_record("cds.cern.ch", 2688580)
assert reccds.conference_dates() == "04-06 Sept 2019"
svc.conference_date(reccds)
assert reccds.conference_dates