Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit 6876afef authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Merge branch '17-harvester-logic' into 'master'

17 harvester logic

* Major redesign of the harvester logic.
* Rename the class `PublicationsTool` to `Automaton`.
* Refactoring of several methods by moving the code between the classes `Record` and `CheckAndFix`, by renaming methods, *etc*.
* Detect errors when a record is rejected by the database engine.
* Close #17.

See merge request !26
parents 7c9997ac 6346233c
......@@ -6,10 +6,10 @@ import traceback
from gluon import current
from gluon.restricted import RestrictedError
from harvest_tools import (format_author_fr,
from harvest_tools import (build_harvester_tool,
DRY_RUN,
format_author_fr,
family_name_fr,
build_harvester_tool,
PublicationsTool,
ToolException)
from invenio_tools import (CheckAndFix,
CheckException,
......@@ -23,7 +23,7 @@ from plugin_dbui import (get_id,
to_formPanel,
UNDEF_ID)
DRY_RUN = T("dry run")
MODE_DRY_RUN = T(DRY_RUN)
MSG_NO_REG_INSTITUTE = T("Preference REG_INSTITUTE is not defined.")
MSG_NO_HARVESTER = T("No harvesters for your selection !!!")
......@@ -63,7 +63,7 @@ def free_run():
selector.id_categories,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == DRY_RUN),
dry_run=(selector.mode == MODE_DRY_RUN),
debug=False)
if not tool:
return INLINE_ALERT % (T('Error'), T('Select an harvester.'))
......@@ -124,16 +124,6 @@ def edit_insert():
values = {}
check = CheckAndFix()
# NOTE: the publication tool is only require to extract
# the list of my authors
tool = PublicationsTool(db,
selector.id_teams,
selector.id_projects,
selector.controller,
selector.id_categories,
dry_run=True,
debug=False)
# title, preprint, URL, report number
values['PublicationsTitle'] = record.title()
values['PublicationsPreprint'] = record.preprint_number()
......@@ -143,11 +133,9 @@ def edit_insert():
# authors
try:
check.authors(record)
check.my_affiliation(record, selector.id_projects, selector.id_teams)
check.format_authors(record, format_author_fr)
check.my_authors(record,
reference=tool._my_author_list(record),
cmpFct=family_name_fr)
check.get_my_authors(record, cmpFct=family_name_fr)
except CheckException:
pass
......@@ -181,7 +169,9 @@ def edit_insert():
if selector.controller in ('proceedings', 'talks'):
try:
check.conference(record)
check.country(record)
check.conference_date(record)
except CheckException:
pass
......@@ -247,7 +237,7 @@ def insert_marcxml():
selector.id_categories,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == DRY_RUN),
dry_run=(selector.mode == MODE_DRY_RUN),
debug=False)
if not tool:
return INLINE_ALERT % (T('Error'), T('Select an harvester.'))
......@@ -301,7 +291,7 @@ def run():
row.harvesters.id_categories,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == DRY_RUN),
dry_run=(selector.mode == MODE_DRY_RUN),
debug=False)
if not tool:
return INLINE_ALERT % (T('Error'), T('Select an harvester.'))
......@@ -359,7 +349,7 @@ def run_all():
harvester.id_categories,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == DRY_RUN),
dry_run=(selector.mode == MODE_DRY_RUN),
debug=False)
if not tool:
return INLINE_ALERT % (T('Error'), T('Select an harvester.'))
......@@ -387,4 +377,4 @@ def run_all():
return dict(collection_logs=collection_logs,
controller='all harvesters',
logs=logs,
selector=selector)
\ No newline at end of file
selector=selector)
......@@ -8,13 +8,13 @@ import re
from check_tools import check_publication
from gluon.storage import Storage
from harvest_tools import DRY_RUN
from plugin_dbui import (is_foreign_field,
get_foreign_field,
Selector,
from plugin_dbui import (is_foreign_field,
get_foreign_field,
Selector,
to_fields)
DRY_RUN = T(DRY_RUN)
MODE_DRY_RUN = T(DRY_RUN)
INLINE_ALERT = "<script>Ext.Msg.alert('%s', '%s');</script>"
......@@ -24,31 +24,31 @@ MSG_NO_AUTHORS = "<br><br>Removing affiliation failed.<br>"\
def check_validate():
"""Check and validate publication records.
"""
counters = {}
logs = []
id_ok = db(db.status.code=='OK').select().first().id
# get user requirement
selector = Selector(virtdb.check_selector, exclude_fields=('mode'))
# extract the publication satisfying selector criteria
rows = selector.select(db.publications,
rows = selector.select(db.publications,
orderby=(db.projects.project, db.categories.code))
# analyse the publications
for row in rows:
# alias
# alias
project = row.projects.project
# initialize counters
# initialize counters
if project not in counters:
counters[project] = Storage(found=0, ok=0, validated=0)
counters[project] = Storage(found=0, ok=0, validated=0)
counters[project].found += 1
# skip publication already validated
if row.status.code == 'OK':
counters[project].ok += 1
......@@ -61,18 +61,18 @@ def check_validate():
msg.category = row.categories.code
msg.id = row.publications.id
msg.project = project
msg.title = row.publications.title
msg.title = row.publications.title
msg.year = row.publications.year
# check the record
# check the record
msg.txt, msg.ids = check_publication(row)
# update publication status
if not msg.txt:
if not msg.txt:
counters[project].validated += 1
del logs[-1]
if selector.mode != DRY_RUN:
if selector.mode != MODE_DRY_RUN:
row.publications.update_record(id_status=id_ok)
return dict(counters=counters,
......@@ -84,16 +84,16 @@ def check_validate():
def compare_publications():
"""Compare the publication fields for two ids and show only the difference.
The arguments of the URL are id1 and id2.
"""
data, idrow = [], []
if 'id1' not in request.vars or 'id2' not in request.vars:
return INLINE_ALERT % (T('Error'), T('Specify id1 and id2 in the URL'))
row1 = db.publications[request.vars.id1]
row2 = db.publications[request.vars.id2]
# find the fields of row2 which are different from those of row1
# the difference is a set containing (key, value) tuples
s1 = set(row1.items())
......@@ -104,7 +104,7 @@ def compare_publications():
if fieldname in ('delete_record', 'update_record'):
continue
# alias
field = db.publications[fieldname]
value1 = row1[fieldname]
......@@ -113,21 +113,21 @@ def compare_publications():
if fieldname == 'id':
idrow = ['id', value1, value2]
continue
# convert foreign fields
if is_foreign_field(field):
k_tablename, k_fieldname, k_id = get_foreign_field(field)
value1 = db[k_tablename][value1][k_fieldname]
value2 = db[k_tablename][value2][k_fieldname]
data.append([T(field.label), value1, value2])
# add the ids as the first data
# the protection covers the case in which id1=id2
if idrow:
data.insert(0, idrow)
# delegate the rendering to the view
return dict(data=data, title=row1.title)
......@@ -136,10 +136,10 @@ def extract_authors():
"""Extract a list of authors in a string containing
author name and their affiliation. It also extract authors for a
given affiliation.
"""
selector = Selector(virtdb.authors_selector)
# remove stupid character in the authors string
authors = re.sub(r'[\n\t\r\v\f]', '', selector.authors)
......@@ -150,19 +150,19 @@ def extract_authors():
all_authors = []
my_authors = []
rex = re.compile('([^\d]+)[ ,]?([\d,a-z\*]+)[ ,]')
for el in rex.finditer(authors):
author = el.group(1)
# when the matching work, author contains 0 or 1 comma
# use this property to detect when matching failed
if author.count(',') > 1:
case_1 = False
break
author = author.strip().replace(',', '')
all_authors.append(author)
if el.group(2) == selector.affiliation:
my_authors.append(author)
......@@ -171,21 +171,21 @@ def extract_authors():
# if not case_1:
# # remove number for footnote
# authors = re.sub(r'(,\d)', '', authors)
#
#
# # remove space before comma
# authors = re.sub(r'( ,)', ',', authors)
#
#
# # get author and its affiliation
# rex = re.compile(r'([\w \.-]+)[ ]?(a?[a-z],(a?[a-z],)*) ')
# for el in rex.finditer(authors):
# print el.groups()
#
#
# authors = re.sub(r'(a?[a-z],(a?[a-z],)*)', ',', authors)
# authors = re.sub(r'[a-z]\Z', '', authors)
#
#
# all_authors = authors.split(',')
if not case_1:
return MSG_NO_AUTHORS
return dict(all=', '.join(all_authors), my_authors=', '.join(my_authors))
\ No newline at end of file
......@@ -8,34 +8,34 @@ import re
import regex
from gluon import current
from plugin_dbui import (UNDEF,
UNDEF_ID,
get_id,
from plugin_dbui import (UNDEF,
UNDEF_ID,
get_id,
get_where_query)
# syntax for the submission date YYYY-MM or YYYY-MM-DD
REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)
# HTML code like &gt;
# HTML code like &gt;
REG_HTML = re.compile('&[a-z]+;')
def check_publication(row):
"""Check the publication fields.
@type row: gluon.dal.Row
@param row: record defining a publication. Its contains the publications
table as well as its reference tables.
@rtype: tuple
@return:
@return:
- the first element contains the list of message
- the second one contains the list of duplicate ids.
"""
T, li, idset = current.T, [], set()
# status code
if row.status.code == '???':
text = T("The status is ???")
......@@ -45,7 +45,7 @@ def check_publication(row):
if row.categories.code == UNDEF:
text = T("The category is undefined")
li.append(text)
# team
if row.publications.id_teams == UNDEF_ID:
text = T("The team is undefined")
......@@ -55,8 +55,8 @@ def check_publication(row):
if row.publications.id_projects == UNDEF_ID:
text = T("The project is undefined")
li.append(text)
# authors list
# authors list
if 'et al' in row.publications.authors:
text = T("'et al.' in authors")
li.append(text)
......@@ -65,23 +65,23 @@ def check_publication(row):
if row.teams.team in row.publications.authors_institute:
text = T("The institute authors contains the team name?")
li.append(text)
# submitted date
if not row.publications.submitted:
if not row.publications.submitted:
text = T("Submitted date is not defined")
li.append(text)
if row.publications.submitted:
if not REG_SUBMITTED.match(row.publications.submitted):
if row.publications.submitted:
if not REG_SUBMITTED.match(row.publications.submitted):
text = T("Submitted date is not valid")
li.append(text)
# publication URL
if row.publications.publication_url:
if 'pdf' not in row.publications.publication_url:
text = T("Check that the publication URL corresponds to a pdf file.")
li.append(text)
# latex syntax
title = row.publications.title
rules = "√" in title or \
......@@ -90,39 +90,39 @@ def check_publication(row):
("->" in title) or \
("s**(1/2)" in title) or \
REG_HTML.search(title)
if rules:
text = T("Check latex syntax in the title")
li.append(text)
# "Note :" in report number
value = row.publications.report_numbers
rules = "Note :" in value or \
"Note:" in value or \
";" in value
if rules:
text = T('Report numbers contains "Note :" or ";"')
li.append(text)
# duplicate by origin
ids = duplicate_origin(row.publications)
if len(ids):
idset = idset.union(ids)
text = T("Entries with duplicate origin")
li.append(text)
# specific fields for article
if row.categories.usual == 'article':
if row.publications.id_publishers == UNDEF_ID:
text = T("Publishers is not defined")
li.append(text)
if not row.publications.volume:
text = T("Volume number is not defined")
li.append(text)
if not row.publications.pages:
text = T("Pages range is not defined")
li.append(text)
......@@ -165,10 +165,10 @@ def check_publication(row):
idset = idset.union(ids)
text = T("Possible duplicate entries")
li.append(text)
# specific fields for report
if row.categories.usual == 'report':
if not row.publications.report_numbers:
text = T("Report number is missing")
li.append(text)
......@@ -178,25 +178,25 @@ def check_publication(row):
idset = idset.union(ids)
text = T("Possible duplicate entries")
li.append(text)
return (li, list(idset))
def extend_ids(db, query, ids):
"""helper functions
@type db: gluon.dal.DAL
@param db:
@type query: gluon.dal.query
@param query:
@param query:
@type ids: list of string
@param ids: the current list of ids
@note: the current list of publication ids will be extend by those
corresponding to the C{query}. The id are unique in the list.
"""
set = db(query)
if set.count():
......@@ -205,47 +205,48 @@ def extend_ids(db, query, ids):
if id not in ids:
ids.append(id)
def duplicate_article(publication):
"""Look for duplicate article.
The comparison is performed on article published by the given team
using the following criteria:
- title, publishers, volume and pages
- publisher, volume and pages
- publisher and title
@type publication: dict or gluon.storage.Storage
@param publication: contains the publication fields and theirs values
@rtype: list
@return: list of ids corresponding to duplicate entries
"""
ids = []
db = current.globalenv['db']
qcat = (db.categories.code == 'ACL') | (db.categories.code == 'ACLN')
qmain = get_where_query(db.publications)
qmain = ((qmain) & (qcat))
qmain = ((qmain) & (db.publications.id_teams == publication['id_teams']))
qmain = ((qmain) & (db.publications.id_publishers == publication['id_publishers']))
if 'id' in publication and publication['id']:
qmain = ((qmain) & (db.publications.id != publication['id']))
# title, publishers, volume and pages
query = ((qmain) & (db.publications.title == publication['title']))
query = ((query) & (db.publications.volume == publication['volume']))
query = ((query) & (db.publications.pages == publication['pages']))
extend_ids(db, query, ids)
# publisher, volume and pages
# publisher, volume, pages and year
query = ((qmain) & (db.publications.volume == publication['volume']))
query = ((query) & (db.publications.pages == publication['pages']))
query = ((query) & (db.publications.year == publication['year']))
extend_ids(db, query, ids)
# publisher and title
query = ((qmain) & (db.publications.title == publication['title']))
extend_ids(db, query, ids)
......@@ -255,51 +256,51 @@ def duplicate_article(publication):
def duplicate_conference(publication):
"""Look for duplicate talk / proceeding.
The comparison is performed on conference talk/proceeding published
The comparison is performed on conference talk/proceeding published
by the given team using the following criteria:
- title, conference title, conference date and conference town
- title, conference date and conference town
- title, conference title and conference town
@type publication: dict or gluon.storage.Storage
@param publication: contains the publication fields and theirs values
@rtype: list
@return: list of ids corresponding to duplicate entries
"""
ids = []
db = current.globalenv['db']
qcat = (db.categories.code == 'ACTI') | \
(db.categories.code == 'ACTN') | \
(db.categories.code == 'COM')
qmain = get_where_query(db.publications)
qmain = ((qmain) & (qcat))
qmain = ((qmain) & (db.publications.id_teams == publication['id_teams']))
qmain = ((qmain) & (db.publications.title == publication['title']))
if 'id' in publication and publication['id']:
qmain = ((qmain) & (db.publications.id != publication['id']))
# title, conference title, conference date and conference town
query = ((qmain) & (db.publications.conference_title == publication['conference_title']))
query = ((query) & (db.publications.conference_dates == publication['conference_dates']))
query = ((query) & (db.publications.conference_town == publication['conference_town']))
extend_ids(db, query, ids)
extend_ids(db, query, ids)
# title, conference date and conference town
query = ((query) & (db.publications.conference_dates == publication['conference_dates']))
query = ((query) & (db.publications.conference_town == publication['conference_town']))
extend_ids(db, query, ids)
extend_ids(db, query, ids)
# title, conference title and conference town
query = ((qmain) & (db.publications.conference_title == publication['conference_title']))
query = ((query) & (db.publications.conference_town == publication['conference_town']))
extend_ids(db, query, ids)
extend_ids(db, query, ids)
return ids
......@@ -308,10 +309,10 @@ def duplicate_origin(publication):
@type publication: dict or gluon.storage.Storage
@param publication: contains the publication fields and theirs values
@rtype: list
@return: list of ids corresponding to duplicate entries
"""