# -*- coding: utf-8 -*-
""" harvest_tools.articles

"""
import traceback

from automaton import Automaton
from base import (family_name_fr,
                  MSG_CRASH,
                  MSG_FIX_ORIGIN,
                  MSG_IN_DB,
                  MSG_LOAD)
from invenio_tools import CheckException
from plugin_dbui import get_id, UNDEF_ID


MSG_NO_EDITOR = "Reject article is not published"
MSG_TRANSFORM_PREPRINT = "Transform the preprint into an article"


class Articles(Automaton):
    """Automaton for articles.

    Checks harvested article records and loads them in the
    C{publications} table, either by inserting a new row or by
    transforming an existing preprint row into an article.

    """
    def __init__(self, *args, **kwargs):

        Automaton.__init__(self, *args, **kwargs)

        # identifier of the preprint category (code "PRE"), resolved once
        self.id_preprint = get_id(self.db.categories, code="PRE")

    def _preprint_to_article(self,
                             rec_id,
                             id_publisher=None,
                             my_authors=None,
                             pages=None,
                             publication_url=None,
                             title=None,
                             volume=None,
                             year=None):
        """Transform the existing preprint C{rec_id} into an article.

        The action is always logged. The database is modified only
        when C{self.dry_run} is false. The institute authors are written
        again since they can be missing in the preprint and the status
        is reset to C{UNDEF_ID}.

        @param rec_id: the id of the preprint in the publications table.

        @keyword id_publisher: value for the field C{id_publishers}.
        @keyword my_authors: value for the field C{authors_institute}.
        @keyword pages: value for the field C{pages}.
        @keyword publication_url: value for the field C{publication_url}.
        @keyword title: value for the field C{title}.
        @keyword volume: value for the field C{volume}.
        @keyword year: value for the field C{year}, also used in the log.

        @rtype: tuple
        @return: the tuple C{(rec_id, 1)}.

        """
        self.logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)

        if not self.dry_run:
            self.db.publications[rec_id] = dict(
                authors_institute=my_authors,
                id_categories=self.id_category,
                id_publishers=id_publisher,
                id_status=UNDEF_ID,
                pages=pages,
                publication_url=publication_url,
                title=title,
                volume=volume,
                year=year)

        return (rec_id, 1)

    def check_by_origin(self,
                        id_publisher=None,
                        my_authors=None,
                        oai_url=None,
                        pages=None,
                        publication_url=None,
                        title=None,
                        volume=None,
                        year=None):
        """Check that a record already exist using the origin field.

            - Transform a preprint into article.
            - Actions are logged.

        @keyword id_publisher: publisher id written when a preprint
            is transformed.
        @keyword my_authors: institute authors written when a preprint
            is transformed.
        @keyword oai_url: value compared to the field C{origin}.
        @keyword pages: pages written when a preprint is transformed.
        @keyword publication_url: URL written when a preprint
            is transformed.
        @keyword title: title written when a preprint is transformed.
        @keyword volume: volume written when a preprint is transformed.
        @keyword year: year written when a preprint is transformed,
            also used in the log.

        @rtype: tuple
        @return: the tuple (id, status). The id of the record or None.
            The status is equal to one when the existing record was
            modified zero otherwise

        """
        if self.dbg:
            # single-argument form: same output with the Python 2 statement
            print("check existing article by origin")

        db = self.db

        rec_id = get_id(db.publications, origin=oai_url)
        if not rec_id:
            return (None, 0)

        # not a preprint: the record is already in the database, keep it
        if db.publications[rec_id].id_categories != self.id_preprint:
            self.logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # transform a preprint into an article
        return self._preprint_to_article(rec_id,
                                         id_publisher=id_publisher,
                                         my_authors=my_authors,
                                         pages=pages,
                                         publication_url=publication_url,
                                         title=title,
                                         volume=volume,
                                         year=year)

    def check_by_fields(self,
                        id_publisher=None,
                        my_authors=None,
                        oai_url=None,
                        pages=None,
                        publication_url=None,
                        preprint_number=None,
                        title=None,
                        volume=None,
                        year=None):
        """Check that a record already exist using the fields:
        id_projects, id_publishers, id_teams, pages, volume and year.

            - Fix the field origin when a match is found.
            - Transform a preprint into article.
            - Actions are logged.

        @keyword id_publisher: publisher id used in the search and
            written when a preprint is transformed.
        @keyword my_authors: institute authors written when a preprint
            is transformed.
        @keyword oai_url: value written in the field C{origin} when it
            is empty in the matching article.
        @keyword pages: pages used in the search and written when
            a preprint is transformed.
        @keyword publication_url: URL written when a preprint
            is transformed.
        @keyword preprint_number: value compared to the field
            C{preprint} when searching for an existing preprint.
        @keyword title: title written when a preprint is transformed.
        @keyword volume: volume used in the search and written when
            a preprint is transformed.
        @keyword year: year used in the search, written when a preprint
            is transformed and used in the log.

        @rtype: tuple
        @return: the tuple (id, status). The id of the record or None.
            The status is equal to one when the existing record was
            modified zero otherwise

        """
        if self.dbg:
            print("check existing article by fields")

        db = self.db

        # check against published articles
        rec_id = get_id(db.publications,
                        id_projects=self.id_project,
                        id_publishers=id_publisher,
                        id_teams=self.id_team,
                        pages=pages,
                        volume=volume,
                        year=year)

        if rec_id:
            # fix origin field
            if not db.publications[rec_id].origin:
                if not self.dry_run:
                    db.publications[rec_id] = dict(origin=oai_url)

                self.logs[-1].modify(MSG_FIX_ORIGIN, year)
                return (rec_id, 1)

            self.logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # check against published preprint
        # a preprint is identified by its category (code "PRE")
        rec_id = get_id(db.publications,
                        id_categories=self.id_preprint,
                        id_projects=self.id_project,
                        id_teams=self.id_team,
                        preprint=preprint_number)

        if not rec_id:
            return (None, 0)

        # transform an existing preprint into article
        # institute authors can be missing in the preprint
        # change also the status
        return self._preprint_to_article(rec_id,
                                         id_publisher=id_publisher,
                                         my_authors=my_authors,
                                         pages=pages,
                                         publication_url=publication_url,
                                         title=title,
                                         volume=volume,
                                         year=year)

    def check_record(self, record):
        """Check the content of the article in order to fix non
        conformities.

        The checks of the base class are run first. A record which is
        not published is rejected. Rejections are logged.

        @type record: L{Record}
        @param record: the article to be checked.

        @rtype: bool
        @return: C{False} when a non conformity is found and can not be
            corrected.

        """
        if not Automaton.check_record(self, record):
            return False

        if self.dbg:
            print("select article record")

        try:
            self.check.clean_erratum(record)

            if not record.is_published():
                self.logs[-1].reject(MSG_NO_EDITOR, record.year())
                return False

            self.check.my_authors(record,
                                  reference=self._my_author_list(record),
                                  cmpFct=family_name_fr)

            self.check.oai(record)
            self.check.submitted(record)
            self.check.year(record)
            self.check.paper_reference(record)
            self.check.format_editor(record)

        except CheckException as e:
            # known non conformity: reject with the message of the checker
            self.logs[-1].reject(e, record.year())
            return False

        except Exception as e:
            # unexpected failure: reject the record, do not stop the harvest
            self.logs[-1].reject(MSG_CRASH % e, record.year(), translate=False)
            print(traceback.format_exc())
            return False

        return True

    def load_db(self, record):
        """Load an article in the database.
        The method assume that erratum are removed.

        An existing record is searched by origin then by fields.
        A new record is inserted when none is found.

        @type record: L{Record}
        @param record: the article to be loaded.

        @rtype: int
        @return: one when the record is inserted / updated in the
            database zero otherwise.

        """
        db = self.db

        # alias
        editor = record.paper_editor()
        first_author = record.first_author()
        oai_url = record.oai_url()
        pages = record.paper_pages()
        preprint_number = record.preprint_number()
        publication_url = record.paper_url()
        submitted = record.submitted()[0]
        title = record.title()
        volume = record.paper_volume()
        year = record.paper_year()

        # check the publisher
        id_publisher = self.check_publisher(editor)

        # check the collaboration
        id_collaboration = self.check_collaboration(record.collaboration())

        # check against already published articles or preprint
        # A preprint is transformed into an article.
        #
        # NOTE: The check is performed by origin then by fields.
        # The latter is useful to cover the case where the record
        # is entered by hand or by another harvester.
        #
        rec_id, status = self.check_by_origin(id_publisher=id_publisher,
                                              my_authors=record.my_authors,
                                              oai_url=oai_url,
                                              pages=pages,
                                              publication_url=publication_url,
                                              title=title,
                                              volume=volume,
                                              year=year)
        if rec_id:
            return status

        rec_id, status = self.check_by_fields(id_publisher=id_publisher,
                                              my_authors=record.my_authors,
                                              oai_url=oai_url,
                                              pages=pages,
                                              publication_url=publication_url,
                                              preprint_number=preprint_number,
                                              title=title,
                                              volume=volume,
                                              year=year)
        if rec_id:
            return status

        # eventually insert a new articles in the database
        # try to improve the rescue list for CPPM authors
        if not self.dry_run:
            db.publications.insert(authors=record.authors(),
                                   authors_institute=record.my_authors,
                                   first_author=first_author,
                                   id_categories=self.id_category,
                                   id_collaborations=id_collaboration,
                                   id_projects=self.id_project,
                                   id_publishers=id_publisher,
                                   id_status=UNDEF_ID,
                                   id_teams=self.id_team,
                                   origin=oai_url,
                                   pages=pages,
                                   preprint=preprint_number,
                                   publication_url=publication_url,
                                   submitted=submitted,
                                   title=title,
                                   volume=volume,
                                   year=year)

            # learn_my_authors was called without being imported anywhere
            # in this module, raising NameError at this point.
            # NOTE(review): assumes the helper is exported by the base
            # module, next to family_name_fr -- confirm. The import is kept
            # local so that a wrong assumption fails here, as before, and
            # not when the module is loaded.
            from base import learn_my_authors

            learn_my_authors(db,
                             authors=record.my_authors,
                             id_project=self.id_project,
                             id_team=self.id_team,
                             year=year)

        self.logs[-1].load(MSG_LOAD, year)
        return 1