""" harvest_tools.articles """ from .automaton import Automaton from .base import (learn_my_authors, MSG_CRASH, MSG_FIX_ORIGIN, MSG_IN_DB, MSG_LOAD, T4, T6) from plugin_dbui import get_id, UNDEF_ID from store_tools import CheckException MSG_IS_PREPRINT = "Reject publication is a preprint" MSG_NO_EDITOR = "Reject article is not published" MSG_NOT_ARTICLE = "Reject publication is not and article" MSG_TRANSFORM_PREPRINT = "Transform the preprint into an article" class Articles(Automaton): """Automaton for articles. """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # the preprint categories self.id_preprint = get_id(self.db.categories, code="PRE") def check_record(self, record): """Check the content of the article in order to fix non-conformities. * publication is a published article * is with authors form my institute * standardise name of collaboration * format authors according to my format * extract authors form my institute signing the publication * is submitted date well formed * format editor according to my criteria * resolve published synonym * check reference paper Args: record (RecordPubli): the record describing the article. Returns: bool: ``False`` when a non conformity is found and can not be corrected. """ self.logger.debug(f"{T4}check and fix record (article)") stype = record.subtype() if stype != "article": msg = (MSG_IS_PREPRINT if stype == "preprint" else MSG_NOT_ARTICLE) self.logs[-1].reject(msg, record) return False try: # is with authors form my institute # standardise name of collaboration # format authors according to my format # extract authors form my institute signing the publication # is submitted date well formed record.check_and_fix(db=self.db, fmt_author="F. Last", rex_institute=self.rex_institute, sep_author=", ", sort_author=True) record.format_editor() record.check_publisher(self.db) except CheckException as e: self.logs[-1].reject(e, record=record) return False except Exception as e: self.logs[-1].reject(MSG_CRASH % e, record=record, translate=False) return False return True def get_record_by_fields(self, oai_url, year, id_publisher=None, my_authors=None, pages=None, publication_url=None, preprint_number=None, title=None, volume=None): """Get article matching fields values defined in the keyword arguments. Note: This method is required to deal with an article entered by hand and found later by the harvester. Args: oai_url (str): the oai_url, *e.g* ``http://cds.cern.ch/record/123456``. The origin field of the existing database record is update to **oai_url** when a match is found. year (str): the year of the publication. It is used by the search algorithm and by the logger. Keyword Args: id_publisher (int): identifier of the publisher in the database. my_authors (str): authors of my institute separated by a comma. pages (str): the page reference. publication_url (str): the URL of the publications preprint_number (str): the preprint number title (str): the title of the publication. volume (str): the volume reference. Returns: tuple: ``(id, status)`` which contains the ``id`` of the record. It is equal to ``None`` when nothing is found. The ``status`` is equal to one when the existing preprint was modified into article, zero otherwise """ self.logger.debug(f"{T6}check existing article by fields") # alias db = self.db id_project = self.id_project id_team = self.id_team logs = self.logs # check against published articles rec_id = get_id(db.publications, id_projects=id_project, id_publishers=id_publisher, id_teams=id_team, pages=pages, volume=volume, year=year) # fix origin field publication = db.publications[rec_id] if rec_id and not publication.origin: if not self.dry_run: publication = dict(origin=oai_url) logs[-1].modify(MSG_FIX_ORIGIN, year) return (rec_id, 1) if rec_id: logs[-1].idle(MSG_IN_DB, year) return (rec_id, 0) # check against published preprint # a preprint can be identified by its category which is PRE (15) rec_id = get_id(db.publications, id_categories=self.id_preprint, id_projects=id_project, id_teams=id_team, preprint=preprint_number) if not rec_id: return (None, 0) # transform an existing preprint into article # institute authors can be missing in the preprint # change also the status self.logs[-1].modify(MSG_TRANSFORM_PREPRINT, year) if not self.dry_run: db.publications[rec_id] = dict(authors_institute=my_authors, id_categories=self.id_category, id_publishers=id_publisher, id_status=UNDEF_ID, pages=pages, publication_url=publication_url, title=title, volume=volume, year=year) return (rec_id, 1) def get_record_by_origin(self, primary_oai_url, year, id_publisher=None, my_authors=None, oai_url=None, pages=None, publication_url=None, title=None, volume=None): """Get an existing record using the origin field and its value defined in the *primary_oai_url* argument. Note: This method is required to transform a preprint into and article. All the keyword arguments are needed by the transformation. Args: primary_oai_url (str): the *primary* OAI identifier of the record. It is used by the search algorithm. year (str): the year of publication which is used by the logger. Keyword Args: id_publisher (int): identifier of the publisher in the database. my_authors (str): authors of my institute separated by a comma. oai_url (str): the full oai_url(s) of the article. pages (str): the page reference. publication_url (str): the URL of the publications title (str): the title of the publication. volume (str): the volume reference. Returns: tuple: ``(id, status)`` which contains the ``id`` of the record. It is equal to ``None`` when nothing is found. The ``status`` is equal to one when the existing preprint was modified into article, zero otherwise """ self.logger.debug(f"{T6}check existing article by origin") # alias db = self.db logs = self.logs publications = db.publications # search by origin query = db.publications.origin.contains(primary_oai_url) setrows = db(query) if setrows.count() == 0: return (None, 0) # a record is found rec_id = setrows.select(publications.id).first().id publication = publications[rec_id] # not a preprint ? if publication.id_categories != self.id_preprint: logs[-1].idle(MSG_IN_DB, year) return (rec_id, 0) # transform a preprint into an article logs[-1].modify(MSG_TRANSFORM_PREPRINT, year) if not self.dry_run: db.publications[rec_id] = dict(authors_institute=my_authors, id_categories=self.id_category, id_publishers=id_publisher, id_status=UNDEF_ID, oai_url=oai_url, pages=pages, publication_url=publication_url, title=title, volume=volume, year=year) return (rec_id, 1) def insert_record(self, record): """Insert an article in the database. Note: The method assumes that erratum are removed. Args: record (RecordPubli): the record describing the article. Returns: int: one when the record is inserted / updated in the database, zero otherwise. """ db = self.db # alias editor = record.paper_editor() first_author = record.first_author() my_authors = record.my_authors oai_url = record.oai_url() pages = record.paper_pages() preprint_number = record.preprint_number() publication_url = record.paper_url() submitted = record.submitted() title = record.title() volume = record.paper_volume() year = record.paper_year() # get the collaboration / publisher identifiers id_collaboration = \ get_id(db.collaborations, collaboration=record.collaboration()) id_publisher = get_id(db.publishers, abbreviation=editor) # get already published articles or preprint # A preprint is transform into an article. # # NOTE: The check is performed by origin then by fields. # The latter is useful to cover the case where the record # is entered by hand or by another harvester. # fields = dict(id_publisher=id_publisher, my_authors=my_authors, oai_url=oai_url, pages=pages, publication_url=publication_url, title=title, volume=volume) rec_id, status = self.get_record_by_origin(record.primary_oai_url(), year, **fields) if rec_id: return status fields = dict(id_publisher=id_publisher, my_authors=my_authors, pages=pages, publication_url=publication_url, preprint_number=preprint_number, title=title, volume=volume) rec_id, status = self.get_record_by_fields(oai_url, year, **fields) if rec_id: return status # eventually insert a new articles in the database # try to improve the rescue list for CPPM authors ret = 1 if not self.dry_run: fields = dict(authors=record.authors(), authors_institute=my_authors, first_author=first_author, id_categories=self.id_category, id_collaborations=id_collaboration, id_projects=self.id_project, id_publishers=id_publisher, id_status=UNDEF_ID, id_teams=self.id_team, origin=oai_url, pages=pages, preprint=preprint_number, publication_url=publication_url, submitted=submitted, title=title, volume=volume, year=year) ret = self._insert_in_db(log_year=year, **fields) if ret == 1: learn_my_authors(db, authors=record.my_authors, id_project=self.id_project, id_team=self.id_team, year=year) if ret == 1: self.logs[-1].load(MSG_LOAD, year) return 1 return 0