Commit c8095367 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Migrate check and fix method to RecordCdsPubli

parent a3173b67
......@@ -20,6 +20,7 @@ MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_ENTRY = "Reject %s is not defined"
MSG_NO_HOST = "Reject no host information in record"
MSG_NO_PUBLISHER = "Reject invalid publisher"
MSG_NO_REF = "Reject incomplete paper reference. Check "
MSG_NO_SHELF = "No shelf %s for store %s"
MSG_NO_THESIS = "Reject no thesis information"
MSG_TOOMANY_SYNONYM = "Reject too many %s synonyms"
......@@ -39,9 +40,10 @@ REG_ARXIV_NUMBER = re.compile("\d+\.\d+")
# group(3) is the part of the first name after the separator (" ", "-")
REG_AUTHOR = re.compile(r"^([\w\- ]+), (\w+)\.?[\- ]*(\w+)*\.?$", re.UNICODE)
REG_CONF = re.compile("^C\d+-\d+-\d+(?:\.\d+)?$")
REG_DATE = re.compile(r"(\d{4}-\d{2}-\d{2})")
REG_DATE_YYYYMM = re.compile(r"(\d{4}-\d{2})")
REG_CONF = re.compile("^C\d+-\d+-\d+(?:\.\d+)?$")
REG_DOI = re.compile(r"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)")
REG_OAI = re.compile(r"oai:([a-z\.]+):([\d]+)")
REG_YEAR = re.compile(r"(\d{4})")
......
......@@ -2,14 +2,50 @@
"""
import logging
import numpy as np
import pprint
from .authorsmixin import AuthorsMixin
from .base import ARXIV, OAI, OAI_URL, REG_OAI
import re
from .authorsmixin import AuthorsMixin, MSG_NO_MY_AUTHOR
from .base import (ARXIV,
MSG_UNKNOWN_COLLABORATION,
MSG_NO_REF,
OAI,
OAI_URL,
search_synonym,
REG_DOI,
REG_OAI,
T6)
from .exception import CheckException
from filters import CLEAN_COLLABORATION
from pandas import concat, DataFrame
from plugin_dbui import CLEAN_SPACES
from .publicationinfomixin import PublicationInfoMixin
from plugin_dbui import CLEAN_SPACES, UNDEF_ID
from .publicationinfomixin import PAPER_REFERENCE_KEYS, PublicationInfoMixin
# Decode an arXiv identifier of the form ``arXiv:YYMM.``
# group(1) is the two-digit year and group(2) the two-digit month
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")
# Decode submitted date: DD MMM YYYY or DD MM YYYY
# In both patterns group(1) is the day, group(2) the month and
# group(3) the four-digit year
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")
# Map a three-letter month abbreviation to its two-digit number.
# English spellings plus the French ones which differ (Fev, Avr, Mai).
MONTHS = {"Jan": "01",
"Feb": "02",
"Fev": "02",
"Mar": "03",
"Apr": "04",
"Avr": "04",
"May": "05",
"Mai": "05",
"Jun": "06",
"Jul": "07",
"Aug": "08",
"Sep": "09",
"Oct": "10",
"Nov": "11",
"Dec": "12"}
# Messages used when the check of the submitted date fails
MSG_NO_DATE = "Reject no submission date"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
def to_str(x):
......@@ -310,6 +346,257 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
# replace
self.df_info = df
def _recover_submitted(self):
    """Recover the submitted date from the preprint number or, as a
    last resort, from the creation date of the record.

    Returns:
        str:
            * ``YYYY-MM`` when the preprint number is an arXiv
              identifier of the form ``arXiv:YYMM.``
            * otherwise the first seven characters of the
              ``creation_date`` field
            * empty string when the procedure failed

    """
    # try by using the preprint information
    report = self.preprint_number()
    if report:
        m_arxiv = DECODE_ARXIV.match(report)
        if m_arxiv:
            return "20%s-%s" % (m_arxiv.group(1), m_arxiv.group(2))

    # last chance: use the creation date of the record.
    # A missing (or None) creation date gives an empty string, as
    # documented, instead of raising KeyError / TypeError.
    creation_date = self.get("creation_date") or ""
    return creation_date[0:7]
def check_collaboration(self, db=None):
    """Replace a collaboration synonym by the value stored in the database.

    The collaboration of the record is searched in the table
    ``collaborations``. When the stored value differs from the one found
    in the record, the field ``corporate_name`` is rewritten with the
    stored value.

    Args:
        db (pydal.DAL):
            database connection. The check is skipped when it is None.

    Raises:
        CheckException:
            * the collaboration is unknown in the database
            * more than one synonym found.

    """
    if db is None:
        self.logger.debug(f"{T6}skip check collaboration -- db is None")
        return

    self.logger.debug(f"{T6}check collaboration")

    name = self.collaboration()
    if len(name) == 0:
        return

    row_id = search_synonym(db.collaborations, "collaboration", name)
    if row_id == UNDEF_ID:
        raise CheckException(MSG_UNKNOWN_COLLABORATION)

    reference = db.collaborations[row_id].collaboration
    if reference != name and "corporate_name" in self:

        if isinstance(self["corporate_name"], dict):
            # one collaboration: update the dictionary in place
            self["corporate_name"]["collaboration"] = reference

        else:
            # several collaborations:
            # replace the list of dictionaries by a single one
            self["corporate_name"] = {"collaboration": reference}
def check_my_affiliation(self, rex_institute=None):
    """Make sure that authors of my institute sign the publication.

    Args:
        rex_institute (str):
            regular expression defining my institute.
            The check is skipped when it is None.

    Raises:
        CheckException:
            ``find_affiliation`` returns nothing for my institute.

    """
    if rex_institute is not None:
        self.logger.debug(f"{T6}check my affiliation")

        found = self.find_affiliation(rex_institute)
        if len(found) == 0:
            raise CheckException(MSG_NO_MY_AUTHOR)

        return

    self.logger.debug(f"{T6}skip check my affiliation -- rex is None")
def check_paper_reference(self):
    """Complete the paper reference (title, volume, pagination) from
    the doi when some of its components are missing.

    Nothing is done when ``is_published()`` is true or when the record
    has no ``doi`` field.

    Raises:
        CheckException:
            * the doi does not have the form xx.yyyy/Publisher.Volume.Page
            * the year is missing, since it cannot be recovered
              from the doi.

    """
    self.logger.debug(f"{T6}check paper reference")

    # paper reference can be incomplete or missing.
    # is the paper published ? In that case the doi is defined
    if self.is_published() or "doi" not in self:
        return

    # what information is missing ?
    #   * df.columns are title, volume, year and pagination
    #   * df can contain one or more rows due to erratum.
    #   * assume that the first row is the oldest one and corresponds to
    #     the first publication
    #   * the row contains empty strings when the record is not published.
    #   * iloc[0] returns a series whose index is the column names
    first_row = self.df_info.iloc[0]
    defined = first_row.replace("", np.nan).dropna().index
    missing = PAPER_REFERENCE_KEYS.difference(defined)

    # try to recover from the doi when it has the form
    # xx.yyyy/Publisher.Volume.Page
    m_doi = REG_DOI.match(self["doi"])
    if m_doi is None:
        raise CheckException(MSG_NO_REF + str(list(missing)))

    # position of the component in the decoded doi
    doi_group = {"volume": 2, "pagination": 3}

    for key in missing:
        if key == "year":
            raise CheckException(MSG_NO_REF + "[year]")

        if key == "title":
            # transform PhysRevD in Phys. Rev. D
            parts = re.split(r"([A-Z][a-z]+)", m_doi.group(1))
            self.df_info.loc[0, "title"] = ". ".join(filter(None, parts))

        elif key in doi_group:
            self.df_info.loc[0, key] = m_doi.group(doi_group[key])
def check_submitted_date(self):
    """Standardise the submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.
    Look for an alternative when it is not defined.

    The dates ``DD MMM YYYY`` (e.g. 22 Mar 2011) and ``DD MM YYYY``
    (e.g. 22 03 2011) are converted to ``YYYY-MM-DD``. The month
    abbreviation is matched regardless of its case.

    Note:
        After this check the field ``prepublication`` contains the
        standardised date. When the field is a list, the date is
        stored in its first entry.

    Raises:
        CheckException:
            * no date can be found
            * the date is not well formed, which includes an unknown
              month abbreviation.

    """
    self.logger.debug(f"{T6}check submitted")

    date = self.submitted()

    # recover missing date using conference, preprint, thesis information
    if len(date) < 7:
        date = self._recover_submitted()

    if len(date) == 0:
        raise CheckException(MSG_NO_DATE)

    if len(date) < 7:
        raise CheckException(MSG_WELL_FORMED_DATE)

    # 22 Mar 2011
    m = DECODE_DD_MMM_YYYY.match(date)
    if m:
        # the pattern accepts any three letters: normalise the case and
        # reject unknown abbreviations explicitly instead of KeyError
        month = MONTHS.get(m.group(2).capitalize())
        if month is None:
            raise CheckException(MSG_WELL_FORMED_DATE)

        date = "%s-%s-%02i" % (m.group(3), month, int(m.group(1)))

    # 22 03 2011
    m = DECODE_DD_MM_YYYY.match(date)
    if m:
        data = (m.group(3), int(m.group(2)), int(m.group(1)))
        date = "%s-%02i-%02i" % data

    # in some case we have to deal with a list (see cds 2234042)
    # in some case it is not defined (e.g. phd thesis)
    if "prepublication" in self:
        prepublication = self["prepublication"]
        if isinstance(prepublication, list):
            prepublication[0]["date"] = date
        else:
            prepublication["date"] = date

    else:
        self["prepublication"] = {"date": date}
def check_and_fix_record(self,
                         db=None,
                         fmt_author=None,
                         rex_institute=None,
                         sep_author=", ",
                         sort_author=False):
    """Run the checks on the record and fix non-conformities.

    The steps are executed in the following order:

        * check the authors
        * check that authors of my institute sign the publication
        * standardise the name of the collaboration
        * format the authors according to my format
        * extract the authors of my institute signing the publication
        * standardise the submitted date

    Args:
        db (pydal.DAL):
            database connection

        fmt_author (str):
            define the format for author names.
            Possible values are ``First, Last``, ``F. Last``, ``Last``,
            ``Last, First`` and ``Last F.``

        rex_institute (str):
            regular expression defining my institute

        sep_author (str):
            string separating author names. The default is the comma.

        sort_author (bool):
            sort authors by family name when true otherwise use the
            order of authors at the creation of the record

    Raises:
        CheckException

    """
    # name of the method and its positional arguments, in running order
    pipeline = (
        ("check_authors", ()),
        ("check_my_affiliation", (rex_institute,)),
        ("check_collaboration", (db,)),
        ("check_format_authors", (fmt_author,)),
        ("extract_my_authors", (rex_institute, sep_author, sort_author)),
        ("check_submitted_date", ()),
    )

    for method_name, args in pipeline:
        getattr(self, method_name)(*args)
def collaboration(self):
"""The collaboration(s) signing the publication.
......@@ -442,6 +729,40 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
recid = self["recid"]
return f"http://cds.cern.ch/record/{recid}"
def report_number(self):
    """The report number(s) associated to the publication.

    Returns:
        str:
            - Numbers are separated by a comma followed by a space
            - Numbers are sorted in alphabetic order.
            - Empty string when not defined.

    """
    # CDS
    # the field is either a dictionary or a list of dictionaries
    if "report_number" in self:
        data = self["report_number"]
        data = data if isinstance(data, list) else [data]
        numbers = [val for di in data for val in di.values()]
        return ", ".join(sorted(numbers))

    # OLD.INSPIRE
    # the arXiv identifiers are not report numbers
    if "primary_report_number" in self:
        data = self["primary_report_number"]
        data = data if isinstance(data, list) else [data]
        numbers = [el for el in data
                   if el is not None and not el.startswith(ARXIV)]
        return ", ".join(sorted(numbers))

    return ""
def secondary_oai(self):
"""The secondary OAI identifier.
......@@ -489,40 +810,6 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
return ""
def report_number(self):
    """The report number(s) associated to the publication.

    Returns:
        str:
            - Numbers are separated by a comma followed by a space
            - Numbers are sorted in alphabetic order.
            - Empty string when not defined.

    """
    # CDS
    # the field is either a dictionary or a list of dictionaries
    if "report_number" in self:
        data = self["report_number"]
        data = data if isinstance(data, list) else [data]
        numbers = [val for di in data for val in di.values()]
        return ", ".join(sorted(numbers))

    # OLD.INSPIRE
    # the arXiv identifiers are not report numbers
    if "primary_report_number" in self:
        data = self["primary_report_number"]
        data = data if isinstance(data, list) else [data]
        numbers = [el for el in data
                   if el is not None and not el.startswith(ARXIV)]
        return ", ".join(sorted(numbers))

    return ""
def submitted(self):
"""The date of submission.
......@@ -537,6 +824,28 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
val = self._get("prepublication", "date")
return (val[0] if isinstance(val, list) else val)
def subtype(self):
    """The subtype of the publication.

    Returns:
        str:
            * "article", "note", "report" or "preprint"
            * empty string when it is not defined

    """
    collection = self.get("collection", None)
    if collection is None:
        return ""

    # accept a single dictionary as well as a list of dictionaries,
    # as done for the other fields of the record (e.g. report_number)
    if not isinstance(collection, list):
        collection = [collection]

    # a missing or None "primary" value counts as an empty string
    primaries = {(dct.get("primary") or "").lower() for dct in collection}

    # order matters since a note can have preprint + note
    for val in ("article", "note", "report", "preprint"):
        if val in primaries:
            return val

    return ""
def title(self):
"""The title of the publication.
......
......@@ -387,6 +387,19 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
lst = [self.primary_oai(), self.secondary_oai()]
return ", ".join(lst).strip(", ")
def oai_url(self):
    """The URL(s) of the Open Archive Initiative identifier(s).

    Returns:
        str:
            * the pattern of the URL is ``http://host/record/id``
            * primary and secondary URLs are separated by a comma.
            * an empty string when it is not defined

    """
    urls = (self.primary_oai_url(), self.secondary_oai_url())
    joined = ", ".join(urls)

    # remove the dangling separator left by an empty URL
    return joined.strip(", ")
def paper_url(self):
"""The URL of the document.
......@@ -458,19 +471,6 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
lst = [elt["value"] for elt in lst]
return ", ".join(lst)
def oai_url(self):
    """The URL(s) of the Open Archive Initiative identifier(s).

    Returns:
        str:
            * the pattern of the URL is ``http://host/record/id``
            * primary and secondary URLs are separated by a comma.
            * an empty string when it is not defined

    """
    urls = (self.primary_oai_url(), self.secondary_oai_url())
    joined = ", ".join(urls)

    # remove the dangling separator left by an empty URL
    return joined.strip(", ")
def secondary_oai(self):
"""The secondary OAI identifier.
......
"""test_13_CheckAndFix_article
* Test CheckAndFix methods for article:
- format_editor
- publisher
- paper_reference
- submitted
- format_author
- get_my_authors
* Same article in cds.cern.ch and inspirehep.net
Phys. Rev. D 95 (2017) 052005
"""
import pytest
from harvest_tools.checkandfix import CheckAndFix
from store_tools import load_record
@pytest.fixture(scope="module")
def reccds():
    """Record 2242641 of cds.cern.ch, loaded once for the module."""
    record = load_record("cds.cern.ch", 2242641)
    return record
@pytest.fixture(scope="module")
def recins():
    """Record 1509922 of inspirehep.net (shelf literature), loaded once
    for the module."""
    record = load_record("inspirehep.net", 1509922, shelf="literature")
    return record
@pytest.fixture(scope="module")
def svc():
    """CheckAndFix service shared by the tests of the module."""
    service = CheckAndFix()
    return service
def test_format_editor_cds_13001(svc, reccds):
    """format_editor keeps the editor and the volume of the cds record."""
    expected = ("Phys. Rev. D", "95")

    # cds
    assert (reccds.paper_editor(), reccds.paper_volume()) == expected

    svc.format_editor(reccds)
    assert (reccds.paper_editor(), reccds.paper_volume()) == expected
def test_format_editor_ins_13002(svc, recins):
    """format_editor keeps the editor and the volume of the inspire record."""
    expected = ("Phys. Rev. D", "95")

    # inspire
    assert (recins.paper_editor(), recins.paper_volume()) == expected

    svc.format_editor(recins)
    assert (recins.paper_editor(), recins.paper_volume()) == expected
def test_publisher_cds_13003(svc, reccds):
    """The publisher check returns None for the cds record."""
    outcome = svc.publisher(reccds)
    assert outcome is None
def test_paper_reference_cds_13004(svc, reccds):
    """The paper reference is rebuilt from the DOI when the title and the
    volume are missing."""
    expected = reccds.paper_reference()

    # check recovery procedure using DOI
    # remove the publisher and volume information
    reccds.df_info.loc[0, ["title", "volume"]] = ["", ""]

    svc.paper_reference(reccds)
    assert reccds.paper_reference() == expected
def test_submitted_cds_13005(svc, reccds):
    """The submitted date of the cds record is standardised for the
    formats DD MMM YYYY, DD MM YYYY and YYYY."""
    assert reccds.submitted() == "19 Jan 2017"
    svc.submitted(reccds)
    assert reccds.submitted() == "2017-01-19"

    # raw date written in the record and value expected after the check
    cases = (
        ("19 01 2017", "2017-01-19"),   # test the case 19 01 2017
        ("2017", "2017-01"),            # test the case 2017
    )

    for raw, expected in cases:
        reccds["prepublication"]["date"] = raw
        svc.submitted(reccds)
        assert reccds.submitted() == expected
def test_submitted_ins_13006(svc, recins):
    """The inspire record already has a standardised submitted date."""
    date = recins.submitted()
    assert date == "2017-01-19"
def test_format_authors_cds_13007(svc, reccds):
    """Author names move from ``Last, First`` to ``F. Last``."""
    # position in the list of authors and expected name
    before = {0: "Aaij, Roel",
              1: "Adeva, Bernardo",
              344: "Koopman, Rose",
              -1: "Zucchelli, Stefano"}

    after = {0: "R. Aaij",
             1: "B. Adeva",
             344: "R. Koopman",
             -1: "S. Zucchelli"}

    authors = reccds.authors_as_list()
    assert len(authors) == reccds["number_of_authors"]
    for idx, name in before.items():
        assert authors[idx] == name

    svc.format_authors(reccds, fmt="F. Last")

    authors = reccds.authors_as_list()
    for idx, name in after.items():
        assert authors[idx] == name
def test_get_my_authors_cds_13008(svc, reccds):
    """The authors of my institute are extracted, sorted and separated
    by the requested separator."""
    expected = ("J. Arnau Romeu|E. Aslanides|J. Cogan|"
                "K. De Bruyn|R. Le Gac|O. Leroy|"
                "G. Mancinelli|M. Martin|A. Mordà|"
                "J. Serrano|A. Tayduganov|A. Tsaregorodtsev")

    svc.format_authors(reccds, fmt="F. Last")

    outcome = svc.get_my_authors(reccds, sep="|", sort=True)
    assert outcome is None

    assert reccds.my_authors == expected
def test_collaboration_ins_13009(svc):
# require the CPPM database (test_limbra)
record = load_record("inspirehep.net", 1826290, shelf="literature")