Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit 1e863a07 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Break harvest_tools/__init__.py into small pieces.

parent 4020bbb7
This diff is collapsed.
# -*- coding: utf-8 -*-
""" harvest_tools.articles
"""
import traceback

from base import family_name_fr, MSG_CRASH, MSG_FIX_ORIGIN, MSG_IN_DB, MSG_LOAD
from base import learn_my_authors
from gluon import current
from invenio_tools import CheckException
from publicationstool import PublicationsTool
from plugin_dbui import get_id, UNDEF_ID
# Message logged by Articles.select_record when the record is rejected
# because it is not published.
MSG_NO_EDITOR = current.T("Reject article is not published", lazy=False)
# Message logged by Articles.check_by_origin and Articles.check_by_fields
# when an existing preprint is turned into an article.
# NOTE(review): lazy=False presumably forces an immediate translation
# instead of a lazy object -- confirm against the web2py T documentation.
MSG_TRANSFORM_PREPRINT = \
current.T("Transform the preprint into an article", lazy=False)
class Articles(PublicationsTool):
    """Publications tool for articles.

    The tool selects the records which are published articles and loads
    them in the table C{publications}. A record already in the database
    is left untouched, except when it is stored as a preprint: it is then
    transformed into an article.

    """
    def __init__(self, *args, **kwargs):
        """Initialise the base tool and cache the preprint category.

        All arguments are forwarded to L{PublicationsTool}.

        """
        PublicationsTool.__init__(self, *args, **kwargs)

        # the preprint categories
        self.id_preprint = get_id(self.db.categories, code="PRE")

    def check_by_origin(self,
                        id_publisher=None,
                        my_authors=None,
                        oai_url=None,
                        pages=None,
                        publication_url=None,
                        title=None,
                        volume=None,
                        year=None):
        """Check that a record already exist using the origin field.

            - Transform a preprint into article.
            - Actions are logged.

        @keyword id_publisher: identifier of the publisher
        @keyword my_authors: authors of my institute
        @keyword oai_url: value looked for in the field C{origin}
        @keyword pages:
        @keyword publication_url:
        @keyword title:
        @keyword volume:
        @keyword year:

        @rtype: tuple
        @return: the tuple (id, status). The id of the record or None.
        The status is equal to one when the existing record was modified
        zero otherwise

        """
        if self.dbg:
            print("check existing article by origin")

        db = self.db
        rec_id = get_id(db.publications, origin=oai_url)
        if not rec_id:
            return (None, 0)

        # not a preprint ?
        if db.publications[rec_id].id_categories != self.id_preprint:
            self.logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # transform a preprint into an article
        # the action is logged even in dry run mode, only the write is skipped
        self.logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)
        if not self.dry_run:
            db.publications[rec_id] = dict(authors_institute=my_authors,
                                           id_categories=self.id_category,
                                           id_publishers=id_publisher,
                                           id_status=UNDEF_ID,
                                           pages=pages,
                                           publication_url=publication_url,
                                           title=title,
                                           volume=volume,
                                           year=year)

        return (rec_id, 1)

    def check_by_fields(self,
                        id_publisher=None,
                        my_authors=None,
                        oai_url=None,
                        pages=None,
                        publication_url=None,
                        preprint_number=None,
                        title=None,
                        volume=None,
                        year=None):
        """Check that a record already exist using the fields: id_projects,
        id_publishers, id_teams, pages, volume and year.

            - Fix the field origin when a match is found.
            - Transform a preprint into article.
            - Actions are logged.

        @keyword id_publisher: identifier of the publisher
        @keyword my_authors: authors of my institute
        @keyword oai_url: value written in the field C{origin} when empty
        @keyword pages:
        @keyword publication_url:
        @keyword preprint_number: used to find the corresponding preprint
        @keyword title:
        @keyword volume:
        @keyword year:

        @rtype: tuple
        @return: the tuple (id, status). The id of the record or None.
        The status is equal to one when the existing record was modified
        zero otherwise

        """
        if self.dbg:
            print("check existing article by fields")

        db = self.db

        # check against published articles
        rec_id = get_id(db.publications,
                        id_projects=self.id_project,
                        id_publishers=id_publisher,
                        id_teams=self.id_team,
                        pages=pages,
                        volume=volume,
                        year=year)

        # fix origin field
        if rec_id and not db.publications[rec_id].origin:
            if not self.dry_run:
                db.publications[rec_id] = dict(origin=oai_url)

            self.logs[-1].modify(MSG_FIX_ORIGIN, year)
            return (rec_id, 1)

        if rec_id:
            self.logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # check against published preprint
        # a preprint can be identified by its category which is PRE (15)
        rec_id = get_id(db.publications,
                        id_categories=self.id_preprint,
                        id_projects=self.id_project,
                        id_teams=self.id_team,
                        preprint=preprint_number)

        if not rec_id:
            return (None, 0)

        # transform an existing preprint into article
        # institute authors can be missing in the preprint
        # change also the status
        self.logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)
        if not self.dry_run:
            db.publications[rec_id] = dict(authors_institute=my_authors,
                                           id_categories=self.id_category,
                                           id_publishers=id_publisher,
                                           id_status=UNDEF_ID,
                                           pages=pages,
                                           publication_url=publication_url,
                                           title=title,
                                           volume=volume,
                                           year=year)

        return (rec_id, 1)

    def load_db(self, record):
        """Load an article in the database.
        The method assume that erratum are removed.

        @type record: L{Record}
        @param record:

        @rtype: int
        @return: one when the record is inserted / updated in the database
        zero otherwise.

        """
        db = self.db

        # alias
        editor = record.paper_editor()
        first_author = record.first_author()
        oai_url = record.oai_url()
        pages = record.paper_pages()
        preprint_number = record.preprint_number()
        publication_url = record.paper_url()
        submitted = record.submitted()[0]
        title = record.title()
        volume = record.paper_volume()
        year = record.paper_year()

        # check the publisher
        id_publisher = self.check_publisher(editor)

        # check the collaboration
        id_collaboration = self.check_collaboration(record.collaboration())

        # check against already published articles or preprint
        # A preprint is transformed into an article.
        #
        # NOTE: The check is performed by origin then by fields.
        # The latter is useful to cover the case where the record
        # is entered by hand or by another harvester.
        #
        rec_id, status = self.check_by_origin(id_publisher=id_publisher,
                                              my_authors=record.my_authors,
                                              oai_url=oai_url,
                                              pages=pages,
                                              publication_url=publication_url,
                                              title=title,
                                              volume=volume,
                                              year=year)
        if rec_id:
            return status

        rec_id, status = self.check_by_fields(id_publisher=id_publisher,
                                              my_authors=record.my_authors,
                                              oai_url=oai_url,
                                              pages=pages,
                                              publication_url=publication_url,
                                              preprint_number=preprint_number,
                                              title=title,
                                              volume=volume,
                                              year=year)
        if rec_id:
            return status

        # eventually insert a new articles in the database
        # try to improve the rescue list for CPPM authors
        # (both operations write in the database, skipped in dry run mode)
        if not self.dry_run:
            db.publications.insert(authors=record.authors(),
                                   authors_institute=record.my_authors,
                                   first_author=first_author,
                                   id_categories=self.id_category,
                                   id_collaborations=id_collaboration,
                                   id_projects=self.id_project,
                                   id_publishers=id_publisher,
                                   id_status=UNDEF_ID,
                                   id_teams=self.id_team,
                                   origin=oai_url,
                                   pages=pages,
                                   preprint=preprint_number,
                                   publication_url=publication_url,
                                   submitted=submitted,
                                   title=title,
                                   volume=volume,
                                   year=year)

            learn_my_authors(db,
                             authors=record.my_authors,
                             id_project=self.id_project,
                             id_team=self.id_team,
                             year=year)

        self.logs[-1].load(MSG_LOAD, year)
        return 1

    def select_record(self, record):
        """C{True} when the C{record} is published.

        The record has first to be selected by L{PublicationsTool}, then
        to be published and to pass the series of checks on its content.
        A rejection is logged with its reason.

        @type record: L{Record}
        @param record:

        @rtype: bool

        """
        if not PublicationsTool.select_record(self, record):
            return False

        if self.dbg:
            print("select article record")

        try:
            self.check.clean_erratum(record)

            if not record.is_published():
                self.logs[-1].reject(MSG_NO_EDITOR, record.year())
                return False

            self.check.my_authors(record,
                                  reference=self._my_author_list(record),
                                  cmpFct=family_name_fr)

            self.check.oai(record)
            self.check.submitted(record)
            self.check.year(record)
            self.check.paper_reference(record)
            self.check.format_editor(record)

        except CheckException as e:
            self.logs[-1].reject(e, record.year())
            return False

        # Any other failure rejects the record and dumps the traceback.
        # Exception is caught rather than BaseException in order to let
        # KeyboardInterrupt and SystemExit stop the harvester.
        except Exception as e:
            self.logs[-1].reject(MSG_CRASH % e, record.year())
            print(traceback.format_exc())
            return False

        return True
# -*- coding: utf-8 -*-
""" harvest_tools.base
"""
import re
from gluon import current
# Template of the rejection message logged when an unexpected exception
# is raised; the placeholder is filled with the exception.
MSG_CRASH = "Crash: %s"
# NOTE(review): unlike the messages below, this one is built without
# lazy=False -- confirm that the difference is intended.
DRY_RUN = current.T("dry run")
# Message logged when the empty origin field of an existing record is filled.
MSG_FIX_ORIGIN = current.T("Fixed the origin field", lazy=False)
# Message logged when the record is already stored and left untouched.
MSG_IN_DB = current.T("Already in the database", lazy=False)
# Message logged when a new record is loaded.
MSG_LOAD = current.T("Load in the database", lazy=False)
def family_name_fr(full_name):
    """Return the family name of a full name encoded as C{J. Doe}.

    The family name is everything located after the first space.
    The full name is returned unchanged when it contains no space.

    @type full_name: unicode
    @rtype: unicode

    """
    given, blank, family = full_name.partition(' ')
    if blank:
        return family

    # no separator: the whole string is the family name
    return given
def fix_amu(record):
    """Normalise the name of the C{Aix Marseille University}.

    Each university of the thesis matching the regular expression
    C{current.app.reg_institute} is replaced by a fixed name, selected
    with the year found in the defense date: one name before 2012,
    another one from 2012 onwards. The list returned by the record is
    modified in place.

    @type record: L{Record}

    @rtype: unicode
    @return: the university names separated by comma.

    """
    names = record.these_universities()

    for pos, name in enumerate(names):
        if not re.search(current.app.reg_institute, name):
            continue

        # the first group of four digits in the defense date is the year
        found = re.search(r"(\d\d\d\d)", record.these_defense())
        if int(found.group(1)) < 2012:
            names[pos] = u"Université de la Méditerrannée Aix-Marseille II"
        else:
            names[pos] = u"Aix Marseille Université"

    return ', '.join(names)
def format_author_fr(name):
    """Format the author name according to French typographic rules,
    I{i.e.} C{J.-P. Doe}.

    The input is expected as C{Family, Given}. The name stays unchanged,
    apart from the final capitalisation, when the formatting failed.
    Hyphenated given names are handled whatever the number of parts
    (C{Doe, Anne-Marie-Louise} gives C{A.-M.-L. Doe}) and empty parts
    left by a dangling hyphen are ignored.

    @type name: unicode
    @param name: C{None} and the empty string are returned as they are.

    @rtype: unicode

    """
    # protection
    if name is None or name == '':
        return name

    # name are encoded Family, L
    #                  Family, P L
    #                  Family, M -H
    #                  Family Name, J
    #                  Family-Name, J
    #                  Family, F Name
    #                  Family, First
    # To avoid to deal with unicode character
    # look for non empty string \S
    match = re.match(r'(.+), (\S+)( |\-)*(\S+)*', name)

    # reformat the name as L. Family
    # or keep it as it is
    if not match:
        result = name

    else:
        family, first, sep, second = match.groups()

        if sep and second:
            # two given names separated by a space or a hyphen
            result = '%s.%s%s. %s' % (first[0], sep[0], second[0], family)

        elif "-" in first:
            # hyphenated given name: one initial per non-empty part
            initials = [el[0] for el in first.split("-") if el]
            if initials:
                result = "%s. %s" % (".-".join(initials), family)
            else:
                result = name

        else:
            result = '%s. %s' % (first[0], family)

    # avoid author name in upper case (R. LE FOO --> R. Le Foo)
    return result.title()
def learn_my_authors(db,
                     authors=None,
                     id_project=None,
                     id_team=None,
                     year=None):
    """Train the rescue list of the authors of my institute,
    stored in the database, using the list C{authors} provided in argument.

    A new entry is created when none exists for the project, the team
    and the year. Otherwise the authors whose family name is not found
    in the stored list are added, and the list is written back sorted
    by family name.

    @note: all keyword arguments have to be defined.

    @type db: gluon.dal.DAL
    @param db:

    @type authors: list
    @param authors: authors names. The code splits the value on C{', '},
    therefore a string of names separated by comma is expected.

    @type id_project: int
    @param id_project: project identifier

    @type id_team: int
    @param id_team: team identifier

    @type year: int
    @param year:

    """
    # get the list of authors store in the database
    row = db.my_authors(id_projects=id_project,
                        id_teams=id_team,
                        year=year)

    # no entry in the database
    # (with the DAL, assigning to the id 0 inserts a new record)
    if not row:
        db.my_authors[0] = dict(authors=authors,
                                id_projects=id_project,
                                id_teams=id_team,
                                year=year)
        return

    database_authors = row.authors.split(', ')

    # compare with the input list
    # and extract authors which are not in the db
    new = set(authors.split(', '))
    ref = set(database_authors)
    diff = new.difference(ref)

    # update the database
    if diff:

        # NOTE1: be careful with the string encoding
        # NOTE2: handle the case J. Foo and J. M. Foo are the same person
        elems = []
        for elem in diff:
            if isinstance(elem, unicode):
                elem = elem.encode('utf8')

            # extract family name, i.e. what follows the last initial.
            # The whole name is used when it contains no initial,
            # otherwise rfind returns -1 and the first character is lost.
            pos = elem.rfind('. ')
            family_name = elem[pos + 2:] if pos != -1 else elem

            if family_name not in row.authors:
                elems.append(elem)

        database_authors.extend(elems)
        database_authors.sort(key=family_name_fr)
        db.my_authors[row.id] = dict(authors=', '.join(database_authors))
class ToolException(Exception):
    """Exception class defined by the module C{harvest_tools.base}."""
# -*- coding: utf-8 -*-
""" harvest_tools.factory
"""
from articles import Articles
from notes import Notes
from preprints import Preprints
from proceedings import Proceedings
from reports import Reports
from talks import Talks
from thesis import Thesis
def build_harvester_tool(db,
                         id_team,
                         id_project,
                         controller,
                         id_category,
                         year_start=None,
                         year_end=None,
                         dry_run=True,
                         debug=False):
    """Factory function building the harvester tool of a controller.

    The class of the tool is obtained from L{get_harvester_tool} and
    instantiated with all the arguments, in the order of the signature.

    @type db: gluon.dal.DAL
    @param db:

    @type id_team: int
    @param id_team: Identifier of the team in the db

    @type id_project: int
    @param id_project: Identifier of the project in the db

    @type controller: unicode
    @param controller: Type of publication (i.e. 'articles', 'proceedings', ...)

    @type id_category: int
    @param id_category: Identifier of the category of publication
    (i.e. ACL, ACTI, ...)

    @type year_start: int
    @keyword year_start: Start year of search (i.e. '2014')

    @type year_end: int
    @keyword year_end: End year of search (i.e. '2015')

    @type dry_run: bool
    @keyword dry_run: True if no record is to be written to the db

    @type debug: bool
    @keyword debug: activate the debug mode

    @return: the harvester tool or C{None} when no tool is associated
    to the controller.

    """
    factory = get_harvester_tool(controller)

    if factory is not None:
        arguments = (db,
                     id_team,
                     id_project,
                     controller,
                     id_category,
                     year_start,
                     year_end,
                     dry_run,
                     debug)
        return factory(*arguments)

    return None
def get_harvester_tool(controller):
    """Get the class of the harvester tool associated to the controller.

    @note: valid names for the controller are:
        - articles
        - notes
        - preprints
        - proceedings
        - reports
        - talks
        - theses

    @type controller: unicode
    @param controller: name of the controller

    @rtype: class reference or None
    @return: None when the controller corresponds to nothing.

    """
    # pairs (controller name, tool class) scanned in order
    registry = (('articles', Articles),
                ('notes', Notes),
                ('preprints', Preprints),
                ('proceedings', Proceedings),
                ('reports', Reports),
                ('talks', Talks),
                ('theses', Thesis))

    for name, tool_class in registry:
        if controller == name:
            return tool_class

    return None
# -*- coding: utf-8 -*-
""" harvest_tools.msg
"""
import json
from gluon.storage import Storage
from invenio_tools import OAI_URL
class Msg(Storage):
"""Message and action taken for a publication.
- The publication is found by an harvester tool, in a store.
- The action refers to the database.
Fours action are defined:
- C{idle}
- C{load}