Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit 31067897 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Merge branch '16-reject-on-oai' into 'master'

16 reject on oai

* First step to improve the logic of the harvester
* The automaton `PublicationsTool` processes the requested URL. It starts with a list of record `id`s. At that step, it looks in the database for a publication corresponding to each `id` and moves on to the next `id` when there is a match.
* Processing then continues: the record is requested from the store and decoded. If `record.oai()` is not defined, the record is rejected.
* Several non-conformities were found when testing this new logic. Therefore, several protections and tests have been added.
* Close #16

See merge request !25
parents 8f37ee99 d476ca13
......@@ -11,13 +11,12 @@ from harvest_tools import (format_author_fr,
build_harvester_tool,
PublicationsTool,
ToolException)
from invenio_tools import (CdsException,
CheckAndFix,
from invenio_tools import (CheckAndFix,
CheckException,
Marc12Exception,
InvenioStore,
Marc12,
OAI_URL)
load_record,
OAI_URL,
RecordConf,
RecordThesis)
from plugin_dbui import (get_id,
INLINE_ALERT,
Selector,
......@@ -116,10 +115,7 @@ def edit_insert():
return INLINE_ALERT % (T('Error'), msg)
# record
store = InvenioStore(selector.host)
xml = store.get_record(selector.record_id)
decode = Marc12()
record = decode(xml)[0]
record = load_record(selector.host, selector.record_id)
# form configuration
cfg = to_formPanel(db.publications)
......@@ -189,23 +185,25 @@ def edit_insert():
except CheckException:
pass
values['PublicationsConference_title'] = record.conference_title()
values['PublicationsConference_url'] = record.conference_url()
values['PublicationsConference_dates'] = record.conference_dates()
values['PublicationsConference_town'] = record.conference_town()
if isinstance(record, RecordConf):
values['PublicationsConference_title'] = record.conference_title()
values['PublicationsConference_url'] = record.conference_url()
values['PublicationsConference_dates'] = record.conference_dates()
values['PublicationsConference_town'] = record.conference_town()
recId = get_id(db.countries, country=record.conference_country())
values['PublicationsId_countries'] = \
recId if recId is not None else UNDEF_ID
recId = get_id(db.countries, country=record.conference_country())
values['PublicationsId_countries'] = \
recId if recId is not None else UNDEF_ID
values['PublicationsConference_speaker'] = record.first_author()
values['PublicationsConference_speaker'] = record.first_author()
# thesis
if selector.controller == 'theses':
values['PublicationsUniversities'] = record.these_universities()
values['PublicationsDirectors'] = record.these_directors()
values['PublicationsDefense'] = record.these_defense()
if isinstance(record, RecordThesis):
values['PublicationsUniversities'] = record.these_universities()
values['PublicationsDirectors'] = record.these_directors()
values['PublicationsDefense'] = record.these_defense()
# submitted date and year
try:
......@@ -389,4 +387,4 @@ def run_all():
return dict(collection_logs=collection_logs,
controller='all harvesters',
logs=logs,
selector=selector)
selector=selector)
\ No newline at end of file
......@@ -129,7 +129,7 @@ class Articles(PublicationsTool):
volume=volume,
year=year)
# fix orign field
# fix origin field
if rec_id and not db.publications[rec_id].origin:
if not self.dry_run:
db.publications[rec_id] = dict(origin=oai_url)
......
......@@ -5,6 +5,7 @@
import re
from gluon import current
from invenio_tools import REG_AUTHOR
DRY_RUN = "dry run"
......@@ -49,7 +50,7 @@ def format_author_fr(name):
# Family, First
# To avoid to deal with unicode character
# look for non empty string \S
match = re.match(r'(.+), (\S+)( |\-)*(\S+)*', name)
match = REG_AUTHOR.match(name)
# reformat the name as L. Family
# or keep it as it is
......
......@@ -36,10 +36,6 @@ class Notes(PublicationsTool):
year = record.year()
# check against already published notes
rec_id, status = self.check_by_origin(oai_url=oai_url, year=year)
if rec_id:
return status
rec_id, status = self.check_by_fields(first_author=first_author,
id_categories=self.id_category,
id_projects=self.id_project,
......
......@@ -47,10 +47,6 @@ class Preprints(PublicationsTool):
id_collaboration = self.check_collaboration(record.collaboration())
# check against preprint or article already published
rec_id, status = self.check_by_origin(oai_url=oai_url, year=year)
if rec_id:
return status
rec_id, status = self.check_by_fields(first_author=first_author,
id_projects=self.id_project,
id_teams=self.id_team,
......
......@@ -36,12 +36,6 @@ class Proceedings(PublicationsTool):
if not year:
year = record.year()
# check against already published proceeding using the field origin
rec_id, status = self.check_by_origin(oai_url=oai_url, year=year)
if rec_id:
return status
# alias
authors = record.authors()
editor = record.paper_editor()
......@@ -136,6 +130,7 @@ class Proceedings(PublicationsTool):
cmpFct=family_name_fr)
self.check.oai(record)
self.check.is_conference(record)
self.check.conference(record)
self.check.clean_erratum(record)
......
......@@ -5,17 +5,27 @@
import re
import traceback
from base import format_author_fr, MSG_FIX_ORIGIN, MSG_IN_DB, ToolException
from gluon.storage import Storage
from invenio_tools import CheckAndFix, InvenioStore, Marc12, OAI_URL, REG_YEAR
from invenio_tools import (CheckAndFix,
InvenioStore,
Marc12,
OAI_URL,
REG_OAI,
REG_YEAR)
from msg import Msg
from msgcollection import MsgCollection
from plugin_dbui import get_create_id, get_id, UNDEF_ID
MSG_NO_CAT = 'Select a "category" !!!'
MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM = 'Select a "team" !!!'
MSG_NO_OAI = "Reject no OAI identifier"
MSG_WELL_FORM_OAI = "Reject OAI is not well formed"
class PublicationsTool(object):
"""Base class to search and process publications.
......@@ -109,6 +119,49 @@ class PublicationsTool(object):
self.__par = None
self.__reference = None
def _is_in_db(self, rec_id, title):
    """Tell whether the harvested record is already stored in the database.

    The lookup uses the C{origin} field of the publications table, which
    is compared to the URL built from the harvester host and C{rec_id}.
    A log entry flagged as idle is appended only when the method
    returns C{True}.

    @type rec_id: int
    @param rec_id: record identifier

    @type title: str
    @param title: title of the collection

    @rtype: bool
    @return: C{True} when a publication with the same origin and the
        same category as the harvester is found, C{False} otherwise.

    """
    db = self.db
    harvester = self.harvester

    # look for a publication having the same origin
    origin = OAI_URL % (harvester.host, rec_id)
    table = db.publications
    found_id = get_id(table, origin=origin)
    if found_id is None:
        return False

    publication = table[found_id]

    # a publication stored under another category is not a match:
    # the record has to be kept in order to transform
    # a preprint into an article
    if publication.id_categories != harvester.id_categories:
        return False

    # log the record as already known
    entry = Msg(harvester=harvester,
                collection=title,
                record_id=rec_id,
                title=publication.title)
    self.logs.append(entry)
    entry.idle(MSG_IN_DB, publication.year)

    return True
def _search_parameters(self, collection):
"""Build the keywords to steer the URL search in invenio store.
The main parameter is the collection and the date range defined
......@@ -235,36 +288,6 @@ class PublicationsTool(object):
return self.__reference
def check_by_origin(self, oai_url=None, year=None):
"""Check that a record already exist using the origin field.
- Actions are logged.
@type oai_url: unicode
@param oai_url: typical value is "http://cds.cern.ch/record/123456"
@type year: unicode
@param year:
@note: this method can be customised in inherited class
to perform dedicated action.
@rtype: tuple
@return: the tuple (id, status). The id of the record or None.
The status is equal to one when the existing record was modified
zero otherwise
"""
if self.dbg:
print "check existing record by origin"
rec_id = get_id(self.db.publications, origin=oai_url)
if not rec_id:
return (None, 0)
self.logs[-1].idle(MSG_IN_DB, year)
return (rec_id, 0)
def check_by_fields(self, **kwargs):
"""Check that a record already exist using the fields defined
in the keyword arguments.
......@@ -483,10 +506,13 @@ class PublicationsTool(object):
print "\nprocessing record", rec_id
try:
if self._is_in_db(rec_id, title):
continue
xml = store.get_record(rec_id)
self.decode_xml(xml)
except BaseException as e:
except Exception as e:
print traceback.format_exc()
url = OAI_URL % (host, rec_id)
self.logs.append(Msg(harvester=self.harvester,
......@@ -521,6 +547,16 @@ class PublicationsTool(object):
record_id=record.id(),
title=record.title()))
# reject record with undefined OAI field
oai = record.oai()
if not oai:
self.logs[-1].reject(MSG_NO_OAI, record.year())
# reject record whose OAI is not well formed
match = REG_OAI.match(oai)
if not match:
self.logs[-1].reject(MSG_WELL_FORM_OAI, record.year())
# additional selection stage
# at this step the validity of the record is checked
# and non-conformities are repaired
......
......@@ -55,10 +55,6 @@ class Reports(PublicationsTool):
id_collaboration = self.check_collaboration(record.collaboration())
# check against already published reports
rec_id, status = self.check_by_origin(oai_url=oai_url, year=year)
if rec_id:
return status
rec_id, status = self.check_by_fields(id_categories=self.id_category,
id_projects=self.id_project,
id_teams=self.id_team,
......
......@@ -32,11 +32,6 @@ class Talks(PublicationsTool):
oai_url = record.oai_url()
year = record.year()
# check against already published talk using the origin field
rec_id, status = self.check_by_origin(oai_url=oai_url, year=year)
if rec_id:
return status
# alias for the conference information
conference_dates = record.conference_dates()
conference_title = record.conference_title()
......@@ -103,6 +98,7 @@ class Talks(PublicationsTool):
cmpFct=family_name_fr)
self.check.oai(record)
self.check.is_conference(record)
self.check.conference(record)
self.check.submitted(record)
......
......@@ -46,10 +46,6 @@ class Thesis(PublicationsTool):
year = re.search(r"(\d\d\d\d)", defense_date).group(1)
# check against already published thesis
rec_id, status = self.check_by_origin(oai_url=oai_url, year=year)
if rec_id:
return status
rec_id, status = self.check_by_fields(first_author=first_author,
defense=defense_date,
id_projects=self.id_project,
......@@ -97,6 +93,7 @@ class Thesis(PublicationsTool):
cmpFct=family_name_fr)
self.check.oai(record)
self.check.is_thesis(record)
self.check.submitted(record)
self.check.year(record)
self.check.format_universities(record)
......
......@@ -10,6 +10,7 @@ from base import (ARXIV,
is_thesis,
OAI_URL,
REG_ARXIV_NUMBER,
REG_AUTHOR,
REG_OAI,
REG_YEAR,
THESIS_DIR)
......
......@@ -7,11 +7,26 @@ import re
ARXIV = "arXiv"
ARXIV_PDF = "http://arxiv.org/pdf/"
MSG_NO_CONF = "Reject no conference information"
MSG_NO_THESIS = "Reject no thesis information"
OAI_URL = "http://%s/record/%s"
REG_ARXIV_NUMBER = re.compile("\d+\.\d+")
REG_OAI = re.compile('oai:([a-z\.]+):([\d]+)')
REG_YEAR = re.compile("(\d{4})")
# name are encoded Family, L
# Family, P L
# Family, M -H
# Family Name, J
# Family-Name, J
# Family, F Name
# Family, First
# To avoid to deal with unicode character
# look for non empty string \S
REG_AUTHOR = re.compile(r"(.+), (\S+)( |\-)*(\S+)*")
REG_OAI = re.compile(r"oai:([a-z\.]+):([\d]+)")
REG_YEAR = re.compile(r"(\d{4})")
THESIS_DIR = u"dir."
......
......@@ -5,7 +5,12 @@
import re
import regex
from base import OAI_URL, REG_OAI, REG_YEAR
from base import (MSG_NO_CONF,
MSG_NO_THESIS,
OAI_URL,
REG_AUTHOR,
REG_OAI,
REG_YEAR)
from exception import CheckException
from filters import CLEAN_REVIEW
from gluon import current
......@@ -47,9 +52,9 @@ MONTHS = {u'Jan':'01',
MSG_NO_AUTHOR = "Reject no author(s)"
MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_CONF_DATE = "Reject no conference date"
MSG_NO_DATE = "Reject no submission date"
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
MSG_NO_OAI = "Reject no OAI identifier"
MSG_NO_REF = "Reject incomplete paper reference"
MSG_NO_YEAR = "Reject no publication year"
......@@ -64,7 +69,6 @@ MSG_WELL_FORMED_CONF_DATES = "Reject conference dates is not well formed"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
MSG_WELL_FORMED_EDITOR = "Reject editor is not well formed"
MSG_WELL_FORMED_OAI = "Reject OAI is not well formed"
OAI_INVENIO = "oai:%s:%s"
......@@ -121,12 +125,12 @@ class CheckAndFix(object):
if isinstance(record, RecordConf):
# INSPIREHEP start date encoded as 2014-12-31
if "x" in record["111"]:
val = record["111"]["x"]
if "x" in record[u"111"]:
val = record[u"111"]["x"]
# CDS end date encoded as 20141231
elif "z" in record["111"]:
val = record["111"]["z"]
elif "z" in record[u"111"]:
val = record[u"111"]["z"]
val = "%s-%s-%s" % (val[0:4], val[4:6], val[6:8])
elif isinstance(record, RecordThesis):
......@@ -157,34 +161,34 @@ class CheckAndFix(object):
"""
# standard case
if isinstance(record["773"], dict):
if isinstance(record[u"773"], dict):
if "o" in record["773"]:
if "o" in record[u"773"]:
for reg in DECODE_REF:
m = reg.match(record["773"]["o"])
m = reg.match(record[u"773"]["o"])
if m:
record["773"]["p"] = m.group("p")
record["773"]["v"] = m.group("v")
record["773"]["y"] = m.group("y")
record["773"]["c"] = m.group("c")
record[u"773"]["p"] = m.group("p")
record[u"773"]["v"] = m.group("v")
record[u"773"]["y"] = m.group("y")
record[u"773"]["c"] = m.group("c")
return
raise CheckException(MSG_NO_REF)
# list case -- paper with erratum
elif isinstance(record["773"], list):
elif isinstance(record[u"773"], list):
for i in range(len(record["773"])):
for i in range(len(record[u"773"])):
if "o" in record["773"][i]:
if "o" in record[u"773"][i]:
fixed = False
for reg in DECODE_REF:
m = reg.match(record["773"][i]["o"])
m = reg.match(record[u"773"][i]["o"])
if m:
record["773"][i]["p"] = m.group("p")
record["773"][i]["v"] = m.group("v")
record["773"][i]["y"] = m.group("y")
record["773"][i]["c"] = m.group("c")
record[u"773"][i]["p"] = m.group("p")
record[u"773"][i]["v"] = m.group("v")
record[u"773"][i]["y"] = m.group("y")
record[u"773"][i]["c"] = m.group("c")
fixed = True
break
......@@ -207,50 +211,50 @@ class CheckAndFix(object):
"""
if "100" not in record and "700" not in record:
if u"100" not in record and u"700" not in record:
raise CheckException(MSG_NO_AUTHOR)
if "100" in record and isinstance(record["100"], list):
if u"100" in record and isinstance(record[u"100"], list):
# from time to time first authors is duplicated
li = []
for di in record["100"]:
for di in record[u"100"]:
if di not in li:
li.append(di)
if len(li) == 1:
record["100"] = li[0]
record[u"100"] = li[0]
else:
raise CheckException(MSG_TO_MANY_FAUTHOR)
# alias
authors, first_author = None, None
if "700" in record:
authors = record["700"]
if u"700" in record:
authors = record[u"700"]
if "100" in record:
first_author = record["100"]
if u"100" in record:
first_author = record[u"100"]
# first author not defined
if not first_author and authors:
if isinstance(record["700"], list):
record["100"] = record["700"][0]
if isinstance(record[u"700"], list):
record[u"100"] = record[u"700"][0]
else:
record["100"] = record["700"]
record[u"100"] = record[u"700"]
# first author not in the authors list
elif first_author and authors:
if isinstance(record["700"], list):
if record["100"]["a"] != record["700"][0]["a"]:
record["700"].insert(0, record["100"])
if isinstance(record[u"700"], list):
if record[u"100"]["a"] != record[u"700"][0]["a"]:
record[u"700"].insert(0, record[u"100"])
elif record["700"]["a"] != record["100"]["a"]:
record["700"] = [record["100"], record["700"]]
elif record[u"700"]["a"] != record[u"100"]["a"]:
record[u"700"] = [record[u"100"], record[u"700"]]
# only the first author is defined
elif first_author and not authors:
record["700"] = record["100"]
record[u"700"] = record[u"100"]
def clean_erratum(self, record):
"""Clean record with erratum by removing them.
......@@ -267,10 +271,10 @@ class CheckAndFix(object):
# use the simplest algorithm by selecting the first entry in the list
# fair to assume that the article is published first.
record["773"] = record["773"][0]
record[u"773"] = record[u"773"][0]
# treat year and submitted date
for k in ("260", "269"):
for k in (u"260", u"269"):
if k in record and isinstance(record[k], list):
record[k] = record[k][0]
......@@ -307,6 +311,9 @@ class CheckAndFix(object):
raise CheckException(MSG_NO_COUNTRY)
# check and fix conference date
if not (u"111" in record and "d" in record[u"111"]):
raise CheckException(MSG_NO_CONF_DATE)
value = record[u"111"]["d"]
m = REG_CONF_DATES.match(value)
if not m:
......@@ -334,14 +341,36 @@ class CheckAndFix(object):
@param func: function used to format the author names
"""
for key in ("100", "700"):
for key in (u"100", u"700"):
if key in record:
if isinstance(record[key], list):
for i in range(len(record[key])):
record[key][i]["a"] = func(record[key][i]["a"])
for i in xrange(len(record[key])):
if "a" in record[key][i]:
# PROTECTION
# see RecordPubli.author_as_list
value = record[key][i]["a"]
if isinstance(value, unicode):
record[key][i]["a"] = func(value)
elif isinstance(value, list):
for elt in value:
if REG_AUTHOR.match(elt):
record[key][i]["a"] = func(elt)
else:
record[key]["a"] = func(record[key]["a"])
if "a" in record[key]:
value = record[key]["a"]
# PROTECTION
# see RecordPubli.authors_as_list
if isinstance(value, unicode):