""" store_tools.recordheppubli

"""
import logging
import pprint

from .authorsmixin import AuthorsMixin, MSG_NO_MY_AUTHOR
from .base import (search_synonym,
                   MSG_UNKNOWN_COLLABORATION,
                   MSG_WELL_FORMED_DATE,
                   REG_DATE_YYYYMM,
                   T4,
                   T6)
from .exception import CheckException
from filters import CLEAN_COLLABORATION
from pandas import DataFrame, isna
from .publicationinfomixin import PublicationInfoMixin
from plugin_dbui import UNDEF_ID


def _is_missing(value):
    """Tell whether a cell value carries no information.

    A value is considered missing when it is ``None`` or a pandas / numpy
    missing marker (``NaN``, ``NA``, ``NaT``). The latter appears when a
    DataFrame is built from dictionaries which do not all share the same keys.

    Args:
        value: the content of one cell

    Returns:
        bool:

    """
    if value is None:
        return True

    try:
        return bool(isna(value))

    except (TypeError, ValueError):
        # isna returns an array for list-like values, which has no single
        # truth value; such a value is not a missing marker
        return False


def pages(row):
    """Help function to build the pages argument

    Args:
        row (pandas.Series):

            * artid (str)
            * page_start (int)
            * page_end (int)

    Return
        str:
            * either 23 or 23-45
            * the article identifier when both pages are missing
            * empty string when information is missing

    """
    artid = row.get("artid", None)
    pstart = row.get("page_start", None)
    pend = row.get("page_end", None)

    # a key absent from one row only shows up as NaN, not as None
    no_start = _is_missing(pstart)
    no_end = _is_missing(pend)

    if no_start and no_end:
        return "" if _is_missing(artid) else artid

    if no_end:
        return f"{pstart}"

    if no_start:
        return f"{pend}"

    return f"{pstart}-{pend}"


class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
    """Article, preprint and proceeding from inspirehep.net version 2.

    Schema for publication is documented here:
    https://inspire-schemas.readthedocs.io/en/latest/schemas/

    """

    def __init__(self, recjson):
        """Build the record and pre-process authors and publication info.

        Args:
            recjson (dict): the record as a JSON-like dictionary

        """
        super().__init__(recjson)

        self.logger = logging.getLogger("web2py.app.limbra")
        self._last_fmt_author = "Last, First"

        self._process_authors()
        self._process_publication_info()

        # the authors of my institutes signing the record
        # string containing a list of name separated by a comma
        self.my_authors = None

    def _process_authors(self):
        """Convert authors information into DataFrame:

        Authors and their affiliations are stored in DataFrame with the
        following structure:

        +---------------+--------------------------------+
        | column        |                                |
        +===============+================================+
        | affiliation   | value separated by "|"         |
        +---------------+--------------------------------+
        | first_name    | first name                     |
        +---------------+--------------------------------+
        | fmt_name      | formated name                  |
        +---------------+--------------------------------+
        | full_name     | Last, First                    |
        +---------------+--------------------------------+
        | last_name     | family name                    |
        +---------------+--------------------------------+
        | role          | equal to dir. for phd director |
        +---------------+--------------------------------+

        Note:
            After running this method, the attribute ``df_authors`` is
            defined. It contains one entry with empty strings when the
            field ``authors`` is not defined.

        """
        self.logger.debug(f"{T4}process authors")

        authors = self.get("authors", None)

        if authors is None:
            cols = ["affiliation",
                    "first_name",
                    "fmt_name",
                    "full_name",
                    "last_name",
                    "role"]
            self.df_authors = DataFrame([[""] * len(cols)], columns=cols)
            return

        data = []
        for author in authors:

            affiliations = \
                [elt["value"] for elt in author.get("affiliations", [])]

            role = author.get("inspire_roles", [])

            # split on the first comma; when there is no comma the whole
            # string is the family name and the first name is empty
            full_name = author["full_name"]
            last_name, _, first_name = full_name.partition(",")

            data.append({"affiliation": "|".join(affiliations),
                         "first_name": first_name.strip(),
                         "fmt_name": full_name,
                         "full_name": full_name,
                         "last_name": last_name.strip(),
                         "role": ", ".join(role)})

        df = DataFrame(data)

        # protection against duplicated entries, e.g. twice the first author
        # (the column is absent when the list of authors is empty)
        if "full_name" in df.columns:
            df = df.drop_duplicates("full_name")

        # replace
        self.df_authors = df

    def _process_publication_info(self):
        """Convert publication_info into DataFrame:

        Note:
            * the field is a list when there are erratum
            * in some case the subfield year is a list (cds 1951625)

        publication information are stored in DataFrame with the
        following structure:

        +------------+--------------------------------+
        | column     |                                |
        +============+================================+
        | title      | abbreviation of the publisher  |
        +------------+--------------------------------+
        | volume     | volume                         |
        +------------+--------------------------------+
        | year       | year of publication            |
        +------------+--------------------------------+
        | pagination | page number or ranges          |
        +------------+--------------------------------+

        Note:
            * After running this method, the attribute ``df_info`` is
              defined. It contains one entry with empty strings when the
              ``publication_info`` field does not exist or when none of
              its entries carries a year.
            * In order to deal with erratum, entries are sorted by year
              and volume.

        """
        self.logger.debug(f"{T4}process publication info")

        cols = ["title", "volume", "year", "pagination"]
        data = self.get("publication_info", None)

        # filter data to keep only row with year information
        if data is not None:
            data = [dct for dct in data if "year" in dct]

        if not data:
            self.df_info = DataFrame([[""] * len(cols)], columns=cols)
            return

        # convert data to DataFrame with a well know structure
        df = (DataFrame(data)
              .astype({"year": str})
              .rename(columns={"journal_title": "title",
                               "journal_volume": "volume"},
                      errors="ignore"))

        # construction pagination columns
        df["pagination"] = df.apply(pages, axis="columns")

        # erratum -- sort by year and volume
        columns = df.columns
        if {"year", "volume"}.issubset(columns):
            df = df.sort_values(["year", "volume"])

        elif "year" in columns:
            df = df.sort_values("year")

        # replace
        self.df_info = df

    def _cds_recid(self):
        """Find the identifier of the record in the store *cds.cern.ch*.

        Returns:
            the ``value`` of the first entry of the field
            ``external_system_identifiers`` whose ``schema`` is ``CDS``,
            or ``None`` when the field or such an entry does not exist.

        """
        for elt in self.get("external_system_identifiers", []):
            if elt["schema"] == "CDS":
                return elt["value"]

        return None

    def check_collaboration(self, db=None):
        """Check synonyms for collaboration by using by the proper value.

        The field ``collaborations`` of the record is replaced when the
        value stored in the database differs from the one of the record.

        Args:
            db (pydal.DAL):
                database connection. The check is skipped when it is None.

        Raises:
            CheckException:
                * the collaboration is unknown in the database
                * more than one synonym found.

        """
        if db is None:
            self.logger.debug(f"{T6}skip check collaboration -- db is None")
            return

        self.logger.debug(f"{T6}check collaboration")

        val = self.collaboration()
        if len(val) == 0:
            return

        dbid = search_synonym(db.collaborations, "collaboration", val)
        if dbid == UNDEF_ID:
            raise CheckException(MSG_UNKNOWN_COLLABORATION)

        collaboration = db.collaborations[dbid].collaboration
        if collaboration != val:
            self["collaborations"] = [{"value": collaboration}]

    def check_my_affiliation(self, rex_institute=None):
        """Check that authors of my institute are signatories.

        Args:
            rex_institute (str):
                regular expression defining my institute.
                The check is skipped when it is None.

        Raises:
            CheckException:
                no affiliation matching the regular expression is found

        """
        if rex_institute is None:
            self.logger.debug(f"{T6}skip check my affiliation -- rex is None")
            return

        self.logger.debug(f"{T6}check my affiliation")

        value = self.find_affiliation(rex_institute)
        if len(value) == 0:
            raise CheckException(MSG_NO_MY_AUTHOR)

    def check_paper_reference(self):
        """Check_paper_reference is a dummy method to preserve interface

        """

    def check_submitted_date(self):
        """Check that submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.

        Raises:
            CheckException:
                * the date is not well formed

        """
        self.logger.debug(f"{T6}check submitted date")

        xdate = self.submitted()
        if not REG_DATE_YYYYMM.match(xdate):
            raise CheckException(MSG_WELL_FORMED_DATE)

    def check_and_fix_record(self,
                             db=None,
                             fmt_author=None,
                             rex_institute=None,
                             sep_author=", ",
                             sort_author=False):
        """Check record and fix non-conformities.

        * is with authors
        * is with authors from my institute
        * standardise name of collaboration
        * format authors according to my format
        * extract authors from my institute signing the publication
        * is with a well formed submitted date

        Args:
            db (pydal.DAL):
                database connection

            fmt_author (str):
                define the format for author names.
                Possible values are ``First, Last``, ``F. Last``, ``Last``,
                ``Last, First`` and ``Last F.``

            rex_institute (str):
                regular expression defining my institute

            sep_author (str):
                string separating author names. The default is the comma.

            sort_author (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Raises:
            CheckException

        """
        self.check_authors()
        self.check_my_affiliation(rex_institute)
        self.check_collaboration(db)
        self.check_format_authors(fmt_author)
        self.extract_my_authors(rex_institute, sep_author, sort_author)
        self.check_submitted_date()

    def collaboration(self):
        """The collaboration(s) signing the publication.

        Returns:
            str:
                * collaborations are separated by a comma.
                * The filter CLEAN_COLLABORATION is applied.
                * empty string when not defined

        """
        collaborations = self.get("collaborations", None)
        if collaborations is None:
            return ""

        lst = [elt["value"] for elt in collaborations]
        return CLEAN_COLLABORATION(", ".join(lst))

    def debug(self):
        """Print the record structure on the standard output.

        """
        pprint.pprint(self)

    def host(self):
        """The store housing the record.

        Returns:
            str:

        """
        return "inspirehep.net"

    def id(self):
        """The id of the record in the store.

        Returns:
            int:

        Raises:
            KeyError: the field ``control_number`` is missing

        """
        return self["control_number"]

    def oai(self):
        """The Open Archive Initiative identifier(s).

        Returns:
            str:
                * the pattern of the identifier is ``oai:host:id``
                * primary and secondary OAI identifier are separated
                  by a comma

        """
        lst = [self.primary_oai(), self.secondary_oai()]

        # strip removes the dangling separator when one identifier is empty
        return ", ".join(lst).strip(", ")

    def oai_url(self):
        """The Open Archive Initiative identifier URL(s).

        Returns:
            str:
                * the pattern of the URL is ``http://host/record/id``
                * primary and secondary URLs are separated by a comma.

        """
        lst = [self.primary_oai_url(), self.secondary_oai_url()]

        # strip removes the dangling separator when one URL is empty
        return ", ".join(lst).strip(", ")

    def paper_url(self):
        """The URL of the document.

        Returns:
            str:
                * the string is empty when no URLs are found.
                * first URL is selected when there is more than one

        """
        documents = self.get("documents", None)

        # cover both the missing field and the empty list
        if not documents:
            return ""

        return documents[0]["url"]

    def preprint_number(self):
        """The ArXiv preprint number.

        Returns:
            str:
                * each number is prefixed by ``arXiv:``
                * numbers are separated by a comma.
                * empty string when it is not defined.

        """
        lst = self.get("arxiv_eprints", None)
        if lst is None:
            return ""

        lst = [f"arXiv:{elt['value']}" for elt in lst]
        return ", ".join(lst)

    def primary_oai(self):
        """The primary Open Archive Initiative identifier.
        The primary OAI identifier matches the record identifier.

        Returns:
            str:
                * the pattern is ``oai:host:id``.

        Raises:
            KeyError: the field ``control_number`` is missing

        """
        return f"oai:inspirehep.net:{self['control_number']}"

    def primary_oai_url(self):
        """The Open Archive Initiative URL for the primary OAI.

        Returns:
            str:
                * the pattern is ``http://inspirehep.net/record/id``

        Raises:
            KeyError: the field ``control_number`` is missing

        """
        recid = self["control_number"]
        return f"http://inspirehep.net/record/{recid}"

    def report_number(self):
        """The report number(s) associated to the publication.

        Returns:
            str:
                - Numbers are separated by a comma
                - Numbers are in the order of the field ``report_numbers``
                  (no sorting is applied).
                - Empty string when not defined.

        """
        lst = self.get("report_numbers", None)
        if lst is None:
            return ""

        lst = [elt["value"] for elt in lst]
        return ", ".join(lst)

    def secondary_oai(self):
        """The secondary OAI identifier.
        the secondary OAI identifier corresponds to the record in the
        store, *cds.cern.ch*.

        Returns:
            str:
                * the pattern is ``oai:host:id``.
                * empty string when it is not defined

        """
        recid = self._cds_recid()
        return "" if recid is None else f"oai:cds.cern.ch:{recid}"

    def secondary_oai_url(self):
        """The Open Archive Initiative URL for the secondary OAI.
        the secondary OAI URL corresponds to the record in the
        store, *cds.cern.ch*.

        Returns:
            str:
                * the pattern is ``http://cds.cern.ch/record/id``
                * empty string when it is not defined

        """
        recid = self._cds_recid()
        return "" if recid is None else f"http://cds.cern.ch/record/{recid}"

    def submitted(self):
        """The date of submission.

        Returns:
            str:
                * the content of the field ``preprint_date``
                * format are "YYYY-MM", "YYYY-MM-DD", "DD MMM YYYY", *etc.*
                * Empty string when not defined.

        """
        val = self.get("preprint_date", None)
        return "" if val is None else val

    def subtype(self):
        """The subtype of the publication.

        Returns:
            str:
                * "article", "preprint" or any other value found in the
                  field ``document_type`` (first one when it is a list)
                * empty string when it is not defined

        """
        doctype = self.get("document_type", None)
        if doctype is None:
            return ""

        if isinstance(doctype, list):
            doctype = doctype[0]

        if doctype != "article":
            return doctype

        # separate article from preprint
        # in the latter case publication_info is missing
        pubinfo = self.get("publication_info", None)
        return "preprint" if pubinfo is None else "article"

    def title(self):
        """The title of the publication.

        Returns:
            str:
                * Empty string when not defined.
                * The title is returned as stored in the record.
                * First one is selected when there is more than one

        """
        titles = self.get("titles", None)

        # cover both the missing field and the empty list
        if not titles:
            return ""

        return titles[0]["title"]