Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit 44907556 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Improve the logic of the automaton and modify the article class accordingly.

parent 8982c977
......@@ -6,7 +6,12 @@ import traceback
from automaton import Automaton
from base import family_name_fr, MSG_CRASH, MSG_FIX_ORIGIN, MSG_IN_DB, MSG_LOAD
from base import (family_name_fr,
format_author_fr,
MSG_CRASH,
MSG_FIX_ORIGIN,
MSG_IN_DB,
MSG_LOAD)
from invenio_tools import CheckException
from plugin_dbui import get_id, UNDEF_ID
......@@ -50,17 +55,15 @@ class Articles(Automaton):
self.logs[-1].reject(MSG_NO_EDITOR, record.year())
return False
self.check.my_authors(record,
reference=self._my_author_list(record),
cmpFct=family_name_fr)
self.check.oai(record)
self.check.paper_reference(record)
self.check.submitted(record)
self.check.year(record)
self.check.paper_reference(record)
self.check.format_editor(record)
self.check.format_authors(record, format_author_fr)
self.check.get_my_authors(record, family_name_fr)
self.check.fix_oai(record)
except CheckException as e:
self.logs[-1].reject(e, record.year())
......@@ -73,15 +76,15 @@ class Articles(Automaton):
return True
def get_by_origin(self,
id_publisher=None,
my_authors=None,
oai_url=None,
pages=None,
publication_url=None,
title=None,
volume=None,
year=None):
def get_record_by_origin(self,
id_publisher=None,
my_authors=None,
oai_url=None,
pages=None,
publication_url=None,
title=None,
volume=None,
year=None):
"""Get an existing record using the origin field.
- Transform a preprint into article.
......@@ -130,16 +133,16 @@ class Articles(Automaton):
return (rec_id, 1)
def get_by_fields(self,
id_publisher=None,
my_authors=None,
oai_url=None,
pages=None,
publication_url=None,
preprint_number=None,
title=None,
volume=None,
year=None):
def get_record_by_fields(self,
id_publisher=None,
my_authors=None,
oai_url=None,
pages=None,
publication_url=None,
preprint_number=None,
title=None,
volume=None,
year=None):
"""Get a record matching the fields: id_projects,
id_publishers, id_teams, pages, volume and year.
......@@ -234,6 +237,7 @@ class Articles(Automaton):
# alias
editor = record.paper_editor()
first_author = record.first_author()
my_authors = record.my_authors
oai_url = record.oai_url()
pages = record.paper_pages()
preprint_number = record.preprint_number()
......@@ -254,26 +258,26 @@ class Articles(Automaton):
# The latter is useful to cover the case where the record
# is entered by hand or by another harvester.
#
rec_id, status = self.get_by_origin(id_publisher=id_publisher,
my_authors=record.my_authors,
oai_url=oai_url,
pages=pages,
publication_url=publication_url,
title=title,
volume=volume,
year=year)
rec_id, status = self.get_record_by_origin(id_publisher=id_publisher,
my_authors=my_authors,
oai_url=oai_url,
pages=pages,
publication_url=publication_url,
title=title,
volume=volume,
year=year)
if rec_id:
return status
rec_id, status = self.get_by_fields(id_publisher=id_publisher,
my_authors=record.my_authors,
oai_url=oai_url,
pages=pages,
publication_url=publication_url,
preprint_number=preprint_number,
title=title,
volume=volume,
year=year)
rec_id, status = self.get_record_by_fields(id_publisher=id_publisher,
my_authors=my_authors,
oai_url=oai_url,
pages=pages,
publication_url=publication_url,
preprint_number=preprint_number,
title=title,
volume=volume,
year=year)
if rec_id:
return status
......@@ -282,7 +286,7 @@ class Articles(Automaton):
if not self.dry_run:
db.publications.insert(authors=record.authors(),
authors_institute=record.my_authors,
authors_institute=my_authors,
first_author=first_author,
id_categories=self.id_category,
id_collaborations=id_collaboration,
......
......@@ -6,14 +6,13 @@ import re
import traceback
from base import format_author_fr, MSG_FIX_ORIGIN, MSG_IN_DB, ToolException
from base import MSG_FIX_ORIGIN, MSG_IN_DB, ToolException
from gluon.storage import Storage
from invenio_tools import (CheckAndFix,
InvenioStore,
Marc12,
OAI_URL,
REG_OAI,
REG_YEAR)
REG_OAI)
from msg import Msg
from msgcollection import MsgCollection
from plugin_dbui import get_create_id, get_id, UNDEF_ID
......@@ -115,10 +114,6 @@ class Automaton(object):
controller=self.controller,
id_categories=self.id_category)
# private cache for my_author rescue list
self.__par = None
self.__reference = None
def _is_record_in_db(self, rec_id, title):
"""Return C{True} if the record is already in the database.
The search is based on the origin field.
......@@ -221,74 +216,7 @@ class Automaton(object):
so='d') # descending order
return dic
def _my_author_list(self, record):
"""Extract the rescue list for my authors in the database.
@type record: L{Record}
@param record:
@rtype: list
@return: empty when not defined
"""
year = record.year()
# try to recover year when not defined
if not year:
# published article, proceeding
if "773" in record and "y" in record["773"]:
year = record["773"]["y"]
# start date of a conference
elif "111" in record and "x" in record["111"]:
year = record["111"]["x"]
# end date of a conference
elif "111" in record and "z" in record["111"]:
year = record["111"]["z"]
# submitted date
elif "269" in record and "c" in record["269"]:
year = record["269"]["c"]
else:
return []
#
# NOTE:
# keep in mind that the CheckAndfix mechanism is not yet run
# therefore year can be a list due to erratum, ...
#
if isinstance(year, list):
year.sort()
year = year[0]
# the value can have several format 1992, 1992-12-31, ....
m = REG_YEAR.search(year)
if m:
year = m.group(1)
else:
return []
# caching
t = (year, self.id_project, self.id_team)
if t == self.__par:
return self.__reference
# extract the list from the database
row = self.db.my_authors(year=year,
id_projects=self.id_project,
id_teams=self.id_team)
if row:
self.__reference = row['authors'].split(', ')
else:
self.__reference = []
return self.__reference
def check_record(self, record):
def check_record(self, record, cmpFct=None):
"""Check the content of the record in order to fix non conformities.
Return False when a non conformities has been found and can not be
corrected.
......@@ -314,7 +242,7 @@ class Automaton(object):
try:
self.check.temporary_record(record)
self.check.authors(record)
self.check.format_authors(record, format_author_fr)
self.check.my_affiliation(record, self.id_project, self.id_team)
self.check.collaboration(record)
except Exception as e:
......@@ -323,7 +251,7 @@ class Automaton(object):
return True
def get_by_fields(self, **kwargs):
def get_record_by_fields(self, **kwargs):
"""Get database record matching fields defined in the keyword arguments.
- Fix the field origin when a match is found.
......
......@@ -100,16 +100,119 @@ def load_record(host, record_id):
class CheckAndFix(object):
"""Tool to check and repair the content of the Marc12 record:
"""A collection of tools to check and repair the content
of the Marc12 record.
- Check the validity of the record.
- Conference information are added for talk and proceeding.
- Fix as far as possible inconsistencies and non-conformity.
"""
def __init__(self):
They can be called separately or in one go.
Most of the methods raise the CheckException when something goes wrong.
self.db = current.globalenv['db']
self.reg_institute = self._get_reg_institute()
# private cache for my_author rescue list
self.__par = None
self.__reference = None
# private cache for my authors list
self.__my_authors = {}
def _get_reg_institute(self):
    """Return the regular expression identifying my institute.

    The expression is read from C{current.app.reg_institute}. When it is
    not defined, the institute record is loaded from inspirehep.net, and
    both the record and its regular expression are stored in
    C{current.app} for a later use.

    @rtype: unicode
    @return: the regular expression defining my institute

    """
    settings = current.app

    # the regular expression is already defined, nothing else to do
    known = settings.reg_institute
    if known:
        return known

    # build it from the institute definition found in inspirehep
    record = load_record("inspirehep.net", settings.inspirehep_institute_id)
    expression = record.rex()

    # keep the record and its expression in current.app for a later use
    settings.institute = record
    settings.reg_institute = expression

    return expression
def _get_author_rescue_list(self, record, id_project, id_team):
"""Get the rescue list for my authors.
@type record: L{Record}
@param record:
@type id_project: int
@param id_project: Identifier of the project in the database
@type id_team: int
@param id_team: Identifier of the team in the database
@rtype: list
@return: empty when not defined
"""
year = record.year()
# try to recover year when not defined
if not year:
# published article, proceeding
if "773" in record and "y" in record["773"]:
year = record["773"]["y"]
# start date of a conference
elif "111" in record and "x" in record["111"]:
year = record["111"]["x"]
# end date of a conference
elif "111" in record and "z" in record["111"]:
year = record["111"]["z"]
# submitted date
elif "269" in record and "c" in record["269"]:
year = record["269"]["c"]
else:
return []
#
# NOTE
# keep in mind that the CheckAndfix mechanism is not yet run
# therefore year can be a list due to erratum, ...
#
if isinstance(year, list):
year.sort()
year = year[0]
# the value can have several format 1992, 1992-12-31, ....
m = REG_YEAR.search(year)
if m:
year = m.group(1)
else:
return []
# caching
t = (year, self.id_project, self.id_team)
if t == self.__par:
return self.__reference
# extract the list from the database
row = self.db.my_authors(year=year,
id_projects=self.id_project,
id_teams=self.id_team)
if row:
self.__reference = row['authors'].split(', ')
else:
self.__reference = []
return self.__reference
"""
def _recover_submitted(self, record):
"""Recover submitted date using conference, preprint or thesis
information.
......@@ -202,7 +305,7 @@ class CheckAndFix(object):
raise CheckException(MSG_NO_REF)
def authors(self, record):
"""Check the consistency between author fields.
"""Check that author fields are defined.
@type record: L{Record}
@param record:
......@@ -266,7 +369,7 @@ class CheckAndFix(object):
return
# check country information (all valid countries have been entered once)
db = current.globalenv['db']
db = self.db
id = get_id(db.countries, country=record.conference_country())
if not id:
raise CheckException(MSG_NO_COUNTRY)
......@@ -292,6 +395,46 @@ class CheckAndFix(object):
else:
raise CheckException(MSG_WELL_FORMED_CONF_DATES)
def fix_oai(self, record):
    """Align the OAI field of the record with the content of the database.

    The id in the OAI field might be different from the record id.
    In INVENIO there is a mechanism to redirect to the correct one.
    The fix depends on the content of the database:

        - nothing is done when the OAI field is empty or does not
          match C{REG_OAI};
        - nothing is done when the OAI id and the record id are equal;
        - nothing is done when the OAI URL built from the OAI field is
          already used as origin of a publication;
        - the OAI field is rewritten with the record id when the OAI
          URL built from the record id is already used as origin;
        - nothing is done when none of the known OAI fields is found
          in the record.

    @type record: L{Record}
    @param record:

    """
    value = record.oai()

    # nothing to fix when the OAI is missing or not well formed
    if not value:
        return

    match = REG_OAI.match(value)
    if match is None:
        return

    host, oai_id = match.group(1), match.group(2)
    myid = record.id()

    if oai_id == myid:
        return

    db = self.db

    # The record OAI is already used in the database. Do nothing
    if get_id(db.publications, origin=OAI_URL % (host, oai_id)):
        return

    # The OAI based on the record id is not used in the database.
    # There is nothing to align with.
    if not get_id(db.publications, origin=OAI_URL % (host, myid)):
        return

    # The OAI based on the record id is already used in the database.
    # Modify the record OAI.
    # The location of the oai information depends on the store.
    # NOTE(review): the former comment gave the locations as
    # cds: (248, a) and inspirehep: (909C0, o), which differ from the
    # keys tested below -- confirm the spelling of the tags.
    for field, subfield in ((u"0248", "a"), (u"909CO", "o")):
        if field in record:
            record[field][subfield] = OAI_INVENIO % (host, myid)
            return
def format_authors(self, record, func):
"""Format the author names using the function func.
......@@ -450,6 +593,40 @@ class CheckAndFix(object):
value = value.replace('U.', university)
record[u'502']['b'][i] = value
def get_my_authors(self, record, cmpFct=None):
    """Get authors of my institutes signing the record.
    The information is appended to the L{Record} via the attribute
    C{my_authors}.

    @type record: L{Record}
    @param record:

    @type cmpFct: reference to a function
    @param cmpFct: Extract the family name from the full name.
    It is used to sort my author list according to the author family name.

    @rtype: unicode
    @return: the list of authors separated by comma

    @raise CheckException: when the list is empty

    """
    rec_id = record.id()
    cache = self.__my_authors

    # might have been computed when affiliation is checked
    if rec_id in cache:
        # sort a copy in order to leave the cached list untouched
        value = u', '.join(sorted(cache[rec_id], key=cmpFct))

    # find authors of my institute signing the record
    else:
        value = record.find_authors_by_affiliation(self.reg_institute,
                                                   cmpFct)

    if not value:
        raise CheckException(MSG_NO_MY_AUTHOR)

    # the attribute is my_authors (plural): it is the one documented above
    record.my_authors = value

    return value
def is_conference(self, record):
"""Check that the record described a conference talk / proceeding.
......@@ -474,92 +651,44 @@ class CheckAndFix(object):
if not isinstance(record, RecordThesis):
raise CheckException(MSG_NO_THESIS)
def my_authors(self, record, reference=[], cmpFct=None):
"""Check that authors of my institutes signed the record.
Fill the meta data record.my_authors.
def my_affiliation(self, record, id_project, id_team):
"""Check that authors of my institute are signatories.
Launch a recovery procedure when affiliations are not defined.
It is based on the author rescue list stored in the database.
@type record: L{Record}
@param record:
@type reference: list
@param reference: list of author names belonging to my institute
@type id_project: int
@param id_project: Identifier of the project in the database
@type cmpFct: reference to a function
@param cmpFct: Extract the family name from the full name.
It is used to sort my author list according to the author family name.
@type id_team: int
@param id_team: Identifier of the team in the database
@raise CheckException:
"""
# alias
app = current.app
reg_institute = app.reg_institute
# regular expression for the institute is not defined
# find it using the institute definition in inspirehep
# store the regular expression in current.app for a later use
if not reg_institute:
institute_id = app.inspirehep_institute_id
institute = load_record("inspirehep.net", institute_id)
reg_institute = institute.rex()
app.institute = institute
app.reg_institute = reg_institute
# find authors of my institute signing the record
s = record.find_authors_by_affiliation(reg_institute, cmpFct)
# nothing found try with the rescue list
if not s and reference:
s1 = set(record.authors_as_list())
s2 = set(reference)
li = list(s1.intersection(s2))
li.sort(key=cmpFct)
value = record.find_affiliation(self.reg_institute)
if value:
return value
s = u', '.join(li)
# affiliation is not defined
# try to recover using the authors rescue list
rescue_list = self._get_author_rescue_list(record, id_project, id_team)
if not rescue_list:
raise CheckException(MSG_NO_MY_AUTHOR)
if s:
record.my_authors = s
return
raise CheckException(MSG_NO_MY_AUTHOR)
def oai(self, record):
"""Check that the OAI field is defined and well formed.
In some tricky case the OAI can evolve with time. Therefore, the record
has several values. In that case, the method selects the one matching the
OAI use in the database.
# compute the intersection between the authors and the rescue list
set_1 = set(record.authors_as_list())
set_2 = set(rescue_list)
@type record: L{Record}
@param record:
li = list(set_1.intersection(set_2))
if not li:
raise CheckException(MSG_NO_MY_AUTHOR)
@raise CheckException:
"""
value = record.oai()
match = REG_OAI.match(value)
myid = record.id()
# The id in the OAI field might be different from the record id.
# In INVENIO there is a mechanism to redirect to the correct one
#
# The fix depend on the content of the database
if match.group(2) != myid:
db = current.globalenv['db']
# The record OAI is already used in the database. Do nothing
oai_url = OAI_URL % (match.group(1), match.group(2))
if get_id(db.publications, origin=oai_url):
return
# The OAI based on the record id is already used in the database.
# Modify the record OAI
oai_url = OAI_URL % (match.group(1), myid)
if get_id(db.publications, origin=oai_url):
record[field][subfield] = OAI_INVENIO % (match.group(1), myid)
# cache the result for a later use
self.__my_authors[record.id()] = li
def paper_reference(self, record):
<