Commit a3173b67 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Major upgrade to migrate the check and fix methods into RecordHepPubli

parent 6cae4419
......@@ -7,15 +7,13 @@ from .base import (DRY_RUN,
MSG_FIX_ORIGIN,
MSG_IN_DB,
MSG_LOAD,
MSG_NO_ENTRY,
MSG_TOOMANY_SYNONYM,
family_name_fr,
search_synonym)
learn_my_authors,
get_rex_institute)
from .automaton import Automaton
from .articles import Articles
from .checkandfix import CheckAndFix, MONTHS
from .exception import CheckException, ToolException
from .factory import build_harvester_tool, get_harvester_tool
from .msg import Msg
from .msgcollection import MsgCollection
......
......@@ -3,14 +3,13 @@
"""
from .automaton import Automaton
from .base import (learn_my_authors,
MSG_CRASH,
MSG_FIX_ORIGIN,
MSG_IN_DB,
MSG_LOAD)
from .checkandfix import CheckException
from plugin_dbui import get_id, UNDEF_ID
# rejection message logged by check_record when record.is_published() is false
MSG_NO_EDITOR = "Reject article is not published"
# rejection message logged by check_record on the record subtype test
# NOTE(review): "and article" looks like a typo for "an article"; the text is
# emitted at runtime (and may serve as a translation key -- confirm) so it is
# left unchanged here
MSG_NOT_ARTICLE = "Reject publication is not and article"
# NOTE(review): usage is not visible in this excerpt
MSG_TRANSFORM_PREPRINT = "Transform the preprint into an article"
# prefix prepended to debug log messages (see the logger.debug calls)
T4 = " "*4
......@@ -32,6 +31,17 @@ class Articles(Automaton):
def check_record(self, record):
"""Check the content of the article in order to fix non-conformities.
* publication is a published article
* is with authors from my institute
* standardise name of collaboration
* format authors according to my format
* extract authors from my institute signing the publication
* is submitted date well formed
* format editor according to my criteria
* resolve published synonym
* check reference paper
Args:
record (RecordPubli):
the record describing the article.
......@@ -42,32 +52,30 @@ class Articles(Automaton):
can not be corrected.
"""
if not Automaton.check_record(self, record):
return False
self.logger.debug(f"{T4}check record (article)")
try:
if not record.is_published():
self.logs[-1].reject(MSG_NO_EDITOR, record=record)
return False
self.check.format_editor(record)
self.check.publisher(record)
if record.subtype() == "article":
self.logs[-1].reject(MSG_NOT_ARTICLE, record)
return False
self.check.paper_reference(record)
self.check.submitted(record)
try:
self.check.format_authors(record, fmt="F. Last")
self.check.get_my_authors(record, sort=True)
# is with authors form my institute
# standardise name of collaboration
# format authors according to my format
# extract authors form my institute signing the publication
# is submitted date well formed
record.check_and_fix(self.rex_institute,
fmt_author="F. Last",
sep_author=", ",
sort_author=True)
except CheckException as e:
self.logs[-1].reject(e, record=record)
return False
record.format_editor()
record.check_publisher(self.db)
record.check_paper_reference()
except Exception as e:
self.logs[-1].reject(MSG_CRASH % e, record=record, translate=False)
self.logs[-1].reject(e, record=record)
return False
return True
......
......@@ -4,17 +4,16 @@
import logging
import re
from .base import (MSG_FIX_ORIGIN,
MSG_IN_DB,
ToolException)
from .checkandfix import CheckAndFix
from .base import get_rex_institute, MSG_FIX_ORIGIN, MSG_IN_DB
from gluon import current
from gluon.storage import Storage
from .msg import Msg
from .msgcollection import MsgCollection
from plugin_dbui import CALLBACK_ERRORS, get_id
from store_tools import (StoreException,
build_store,
OAI_URL)
from store_tools import (build_store,
OAI_URL,
StoreException,
ToolException)
from store_tools.factory import build_record
MSG_NO_CAT = 'Select a "category" !!!'
......@@ -123,7 +122,6 @@ class Automaton(object):
if not id_category:
raise ToolException(MSG_NO_CAT)
self.check = CheckAndFix()
self.collection_logs = []
self.controller = automaton
self.db = db
......@@ -133,6 +131,7 @@ class Automaton(object):
self.id_project = id_project
self.logs = []
self.logger = logging.getLogger("web2py.app.limbra")
self.rex_intitute = get_rex_institute(db, current.app)
self.store = None
self.year_start = year_start
self.year_end = year_end
......@@ -295,13 +294,7 @@ class Automaton(object):
corrected.
Note:
Some checks depend on the type of publications and have to be
implemented in inherited class.
Note:
The order of the checks matter. It should be OAI,
temporary record, authors, my authors and then a series of checks
specific to the publication type.
To be implemented by inherited classes
Args:
record (Record):
......@@ -313,28 +306,7 @@ class Automaton(object):
corrected.
"""
self.logger.debug(f"{T4}check record (automaton)")
try:
# fix record with a missing OAI
if not self.check.is_oai(record):
oai = OAI % (self.harvester.host, record.id())
record["oai"] = {"value": oai}
if self.check.is_bad_oai_used(record):
self.logs[-1].idle(MSG_IN_DB, record.submitted())
return False
self.check.temporary_record(record)
self.check.authors(record)
self.check.my_affiliation(record, self.id_project, self.id_team)
self.check.collaboration(record)
except Exception as e:
self.logs[-1].reject(e, record=record)
return False
return True
return False
def get_record_by_fields(self, oai_url, year, **kwargs):
"""Get database record matching fields values defined
......
""" harvest_tools.base
"""
from .exception import ToolException
from plugin_dbui import get_id, UNDEF_ID
DRY_RUN = "dry run"
MSG_CRASH = "Crash: %s"
MSG_FIX_ORIGIN = "Fixed the origin field"
MSG_IN_DB = "Already in the database"
MSG_LOAD = "Load in the database"
MSG_NO_ENTRY = "Reject %s is not defined"
MSG_TOOMANY_SYNONYM = "Reject too many %s synonyms"
def family_name_fr(full_name):
......@@ -99,68 +93,48 @@ def learn_my_authors(db,
db.my_authors[row.id] = dict(authors=', '.join(database_authors))
def search_synonym(table, fieldname, value, create=False):
"""Get the database identifier for the record having the database field
or the synonyms field matching the value.
def get_rex_institute(db, app):
"""Get the regular expression defining the affiliation of my institute.
It is obtained by concatenating the affiliation keys.
Affiliation key can contains character like ``(``, ``)`` or ``&``.
They are replaced by ``\(`` *etc*.
Note:
The database table must have a field name *synonyms*.
It contains a list of strings.
Args:
table (gluon.DAL.Table):
database table.
db (pydal.DAL):
database connection
app (gluon.storage.Storage):
namespace defining the application
Returns:
str:
fieldname (str):
field of the database table identified by its name.
"""
# alias
reg_institute = app.reg_institute
value (str):
value to be matched.
# regular expression for the affiliation keys
# protect special character
# add start and end of string for an exact match
if not reg_institute:
create(bool):
create a new entry in the database table when
it is ``True``
lst = []
for row in db(db.affiliation_keys.id > 0).iterselect():
val = row.key_u
Returns:
int:
* the id of the database record.
* UNDEF_ID if value is not defined.
val = (val
.replace("(", "\(")
.replace(")", "\)")
.replace("&", "\&")
.replace("$", "\$")
.replace("+", "\+")
.replace("?", "\?"))
Raises:
ToolException:
* no synonym found and not allow to create a new one.
* more than one synonym is found.
val = r"(^|\|){}($|\|)" .format(val)
"""
if not value:
return UNDEF_ID
db = table._db
kwargs = {}
kwargs[fieldname] = value
id_rec = get_id(table, **kwargs)
if id_rec is not None:
return id_rec
# nothing found, have a look to the synonyms field
query = table.synonyms.contains(value)
setrows = db(query)
# no synonym found, create the entry
ncount = setrows.count()
if ncount == 0:
if create:
return table.insert(**kwargs)
else:
msg = MSG_NO_ENTRY % table._tablename
raise ToolException(msg)
# one synonym found
elif ncount == 1:
return setrows.select(table.id).first().id
# more than one synonyms - don't know what to choose
else:
msg = MSG_TOOMANY_SYNONYM % table._tablename
raise ToolException(msg)
lst.append(val)
reg_institute = r"|".join(lst)
return reg_institute
......@@ -5,11 +5,10 @@ import logging
import numpy as np
import re
from .base import search_synonym, ToolException
from datetime import datetime
from .exception import CheckException
from gluon import current
from store_tools import (MSG_NO_CONF,
from store_tools import (CheckException,
MSG_NO_CONF,
MSG_NO_THESIS,
OAI_URL,
RecordCdsConf,
......@@ -19,7 +18,9 @@ from store_tools import (MSG_NO_CONF,
RecordHepPubli,
RecordHepThesis,
REG_OAI,
REG_YEAR)
REG_YEAR,
search_synonym,
ToolException)
from store_tools.publicationinfomixin import PAPER_REFERENCE_KEYS
......
""" harvest_tools.exception
"""
class CheckException(Exception):
    """Error signalling that a record did not pass a check."""
class ToolException(Exception):
    """Error raised by the harvesting tools, *e.g.* when a required
    selection is missing or a synonym cannot be resolved.

    """
......@@ -3,10 +3,9 @@
"""
import json
from .base import MSG_NO_ENTRY, MSG_TOOMANY_SYNONYM
from gluon import current
from gluon.storage import Storage
from store_tools import OAI_URL
from store_tools import MSG_NO_ENTRY, MSG_TOOMANY_SYNONYM, OAI_URL
MSGS = (MSG_NO_ENTRY, MSG_TOOMANY_SYNONYM)
TABLES = ("collaborations", "countries", "publishers")
......
......@@ -12,18 +12,26 @@ from .base import (ARXIV,
is_thesis,
MSG_NO_CONF,
MSG_NO_COUNTRY,
MSG_NO_ENTRY,
MSG_NO_PUBLISHER,
MSG_NO_THESIS,
MSG_TOOMANY_SYNONYM,
MSG_UNKNOWN_COLLABORATION,
MSG_WELL_FORMED_COLLABORATION,
OAI_URL,
REG_ARXIV_NUMBER,
REG_AUTHOR,
REG_DATE,
REG_DATE_YYYYMM,
REG_OAI,
REG_YEAR,
search_synonym,
THESIS_DIR)
from .exception import (StoreException,
RecordException)
from .exception import (CheckException,
RecordException,
StoreException,
ToolException)
from .factory import build_record, build_store
from .inspirehepstore import InspirehepStore
......
......@@ -3,7 +3,8 @@
"""
import re
from .exception import RecordException
from .base import T6
from .exception import CheckException, RecordException
from numpy import NaN
from pandas import concat
......@@ -14,7 +15,10 @@ AUTHOR_FORMATS = [
"Last, First",
"Last F."]
# raised by check_authors when the first author contains "collaboration"
MSG_FAUTHOR_COLLABORATION = "Reject first author is a Collaboration"
# NOTE(review): usage is not visible in this excerpt
MSG_INVALID_FMT = "Invalid format for author"
# raised by check_authors when is_authors() is false
MSG_NO_AUTHOR = "Reject no author(s)"
# raised by extract_my_authors when no author of my institute is found
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
def to_initial(name):
......@@ -125,6 +129,85 @@ class AuthorsMixin(object):
return li
def check_authors(self):
    """Verify the author block of the record.

    Two conditions are required:

        * the author fields are defined (see ``is_authors``).
        * the first author is not a collaboration, *e.g.*
          ``ATLAS Collaboration``.

    Raises:
        CheckException:
            * when there is no author.
            * when the first author contains the word *collaboration*
              (case insensitive).

    """
    self.logger.debug(f"{T6}check authors")

    # authors have to be defined
    has_authors = self.is_authors()
    if not has_authors:
        raise CheckException(MSG_NO_AUTHOR)

    # the first author has to be a person, not a collaboration
    first = self.first_author().lower()
    if "collaboration" in first:
        raise CheckException(MSG_FAUTHOR_COLLABORATION)
def check_format_authors(self, fmt=None):
    """Rewrite the author names using the requested format.

    Nothing is done, apart from a debug message, when ``fmt`` is ``None``.

    Args:
        fmt (str):
            define the format for author names.
            Possible values are ``First, Last``, ``F. Last``, ``Last``,
            ``Last, First`` and ``Last F.``

    """
    if fmt is not None:
        self.logger.debug(f"{T6}format authors")
        self.reformat_authors(fmt)

    else:
        self.logger.debug(f"{T6}skip format authors -- fmt is None")
def extract_my_authors(self, rex_institute, sep=", ", sort=False):
    """Find the authors of my institute signing the record.

    The result is stored in the attribute ``my_authors`` of the record
    instance. Nothing is returned. The search is skipped when the
    attribute is already set.

    Args:
        rex_institute (str):
            regular expression defining the affiliation of my institute

        sep (str):
            string separating author names. The default is the comma.

        sort (bool):
            sort authors by family name when true otherwise use the
            order of authors at the creation of the record

    Raises:
        CheckException:
            no author of my institute is found.

    """
    self.logger.debug(f"{T6}get my authors")

    # the attribute might have been filled when affiliation is checked,
    # in that case keep it as it is
    if self.my_authors is None:

        # authors of my institute signing the record
        found = self.find_authors_by_affiliation(rex_institute, sep, sort)

        if len(found) == 0:
            raise CheckException(MSG_NO_MY_AUTHOR)

        self.my_authors = found
def find_affiliation(self, pattern):
"""Find affiliation matching the regular expression *pattern*.
......@@ -310,6 +393,9 @@ class AuthorsMixin(object):
bool:
"""
if getattr(self, "df_authors", None) is None:
return False
df = self.df_authors
cols = {"first_name", "full_name", "last_name"}
......
......@@ -3,6 +3,9 @@
"""
import re
from .exception import ToolException
from plugin_dbui import get_id, UNDEF_ID
ARXIV = "arXiv"
ARXIV_PDF = "http://arxiv.org/pdf/"
......@@ -14,11 +17,15 @@ MSG_INV_CONF_KEY = "Reject invalid conference key"
MSG_NO_CONF = "Reject no conference information"
MSG_NO_CONF_ID_KEY = "Reject no conference identifier and key"
MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_ENTRY = "Reject %s is not defined"
MSG_NO_HOST = "Reject no host information in record"
MSG_NO_PUBLISHER = "Reject invalid publisher"
MSG_NO_SHELF = "No shelf %s for store %s"
MSG_NO_THESIS = "Reject no thesis information"
MSG_TOOMANY_SYNONYM = "Reject too many %s synonyms"
MSG_UNKNOWN_COLLABORATION = "Reject collaboration is unknown."
MSG_WELL_FORMED_COLLABORATION = "Reject collaboration is not well formed"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
OAI = "oai:%s:%s"
OAI_URL = "http://%s/record/%s"
......@@ -33,10 +40,13 @@ REG_ARXIV_NUMBER = re.compile("\d+\.\d+")
# author name written as "Last, First": family name, a comma, a first name
# or initial, optionally followed by a second name or initial
REG_AUTHOR = re.compile(r"^([\w\- ]+), (\w+)\.?[\- ]*(\w+)*\.?$", re.UNICODE)

# date written as YYYY-MM-DD
REG_DATE = re.compile(r"(\d{4}-\d{2}-\d{2})")

# date written as YYYY-MM
REG_DATE_YYYYMM = re.compile(r"(\d{4}-\d{2})")

# conference key like C12-03-04 or C12-03-04.1
# raw string is required: "\d" and "\." are invalid escape sequences
# in a plain string literal and trigger a warning at compile time
REG_CONF = re.compile(r"^C\d+-\d+-\d+(?:\.\d+)?$")

# OAI identifier, oai:<host>:<record id>
REG_OAI = re.compile(r"oai:([a-z\.]+):([\d]+)")

# year made of four digits
REG_YEAR = re.compile(r"(\d{4})")
T2, T4, T6 = " "*2, " "*4, " "*6
THESIS_DIR = "dir."
......@@ -140,3 +150,70 @@ def is_thesis(recjson):
return True
return False
def search_synonym(table, fieldname, value, create=False):
    """Get the database identifier for the record having the database field
    or the synonyms field matching the value.

    The lookup is done in two steps: first on the field ``fieldname``,
    then, when nothing is found, on the field *synonyms*.

    Note:
        The database table must have a field name *synonyms*.
        It contains a list of strings.

    Args:
        table (gluon.DAL.Table):
            database table.

        fieldname (str):
            field of the database table identified by its name.

        value (str):
            value to be matched.

        create (bool):
            create a new entry in the database table when
            it is ``True`` and no match is found.

    Returns:
        int:
            * the id of the database record.
            * UNDEF_ID if value is not defined (empty or ``None``).

    Raises:
        ToolException:
            * no synonym found and not allow to create a new one.
            * more than one synonym is found.

    """
    # truthiness test rather than len(): value might be None
    if not value:
        return UNDEF_ID

    kwargs = {fieldname: value}

    # direct match on the field
    id_rec = get_id(table, **kwargs)
    if id_rec is not None:
        return id_rec

    # nothing found, have a look to the synonyms field
    setrows = table._db(table.synonyms.contains(value))
    ncount = setrows.count()

    # no synonym found, create the entry when allowed
    if ncount == 0:
        if create:
            return table.insert(**kwargs)

        raise ToolException(MSG_NO_ENTRY % table._tablename)

    # one synonym found
    if ncount == 1:
        return setrows.select(table.id).first().id

    # more than one synonyms - don't know what to choose
    raise ToolException(MSG_TOOMANY_SYNONYM % table._tablename)