""" store_tools.recordheppubli """ import logging from filters import CLEAN_COLLABORATION from pandas import DataFrame from .recordhep import RecordHep from store_tools.pluginauthors import PluginAuthors from store_tools.pluginpublicationinfo import PluginPublicationInfo def pages(row): """Help function to build the pages argument Args: row (pandas.Series): * artid (str) * page_start (int) * page_end (int) Return str: * either 23 or 23-45 * empty string when information is missing """ artid = row.get("artid", None) pstart = row.get("page_start", None) pend = row.get("page_end", None) if pstart is None and pend is None and artid is None: return "" elif pstart is None and pend is None: return artid elif pend is None: return f"{pstart}" return f"{pstart}-{pend}" class RecordHepPubli(RecordHep, PluginAuthors, PluginPublicationInfo): """Article, preprint and proceeding from inspirehep.net version 2. Schema for publication is documented here: https://inspire-schemas.readthedocs.io/en/latest/schemas/ """ def __init__(self, recjson): super().__init__(recjson) self.logger = logging.getLogger("web2py.app.limbra") self._last_fmt_author = "Last, First" self._process_authors() self._process_publication_info() def _process_authors(self): """Convert authors information into DataFrame: Authors and their affiliations are stored in DataFrame with the following structure: +---------------+--------------------------------+ | column | | +===============+================================+ | affiliation | value separated by "|" | +---------------+--------------------------------+ | first_name | first name | +---------------+--------------------------------+ | fmt_name | formated name | +---------------+--------------------------------+ | full_name | Last, First | +---------------+--------------------------------+ | last_name | family name | +---------------+--------------------------------+ | role | equal to dir. for phd director | +---------------+--------------------------------+ Note: After running this method, the attribute ``df_authors`` is defined. It contains one entry with empty strings when the file ``authors`` is not defined. """ self.logger.debug(" process authors") authors = self.get("authors", None) if authors is None: cols = ["affiliation", "first_name", "fmt_name", "full_name", "last_name", "role"] self.df_authors = DataFrame([[""] * len(cols)], columns=cols) return data = [] for author in authors: affiliations = [] if "affiliations" in author: affiliations = [elt["value"] for elt in author["affiliations"]] role = \ (author["inspire_roles"] if "inspire_roles" in author else []) full_name = author["full_name"] idx = full_name.find(",") last_name = full_name[:idx] first_name = full_name[idx + 1:].strip() dct = {"affiliation": "|".join(affiliations), "first_name": first_name.strip(), "fmt_name": full_name, "full_name": full_name, "last_name": last_name.strip(), "role": ", ".join(role)} data.append(dct) df = DataFrame(data) # protection against duplicated entries, e.g. twice the first author if "full_name" in df.columns: df = df.drop_duplicates("full_name") # replace self.df_authors = df def _process_publication_info(self): """Convert publication_info into DataFrame: Note: * the field is a list when there are erratum * in some case the subfield year is a list (cds 1951625) publication information are stored in DataFrame with the following structure: +------------+--------------------------------+ | column | | +============+================================+ | title | abbreviation of the publisher | +------------+--------------------------------+ | volume | volume | +------------+--------------------------------+ | year | year of publication | +------------+--------------------------------+ | pagination | page number or ranges | +------------+--------------------------------+ Note: * After running this method, the attribute ``df_info`` is defined. It contains one entry with empty strings when the ``publication_info`` field does not exist. * In order to deal with erratum entry are sorter by year and volume. """ self.logger.debug(" process publication info") cols = ["title", "volume", "year", "pagination"] data = self.get("publication_info", None) if data is None: self.df_info = DataFrame([[""] * len(cols)], columns=cols) return # filter data to keep only row with year information data = [dct for dct in data if "year" in dct] if len(data) == 0: self.df_info = DataFrame([[""] * len(cols)], columns=cols) return # convert data to DataFrame with a well know structure df = (DataFrame(data) .astype({"year": str}) .rename(columns={"journal_title": "title", "journal_volume": "volume"}, errors="ignore")) # construction pagination columns df["pagination"] = df.apply(pages, axis="columns") # erratum -- sort by year and volume columns = df.columns if set(["year", "volume"]).issubset(columns): df = df.sort_values(["year", "volume"]) elif "year" in columns: df = df.sort_values("year") # replace self.df_info = df def collaboration(self): """The collaboration(s) signing the publication. Returns: str: * collaborations are separated by a comma. * The filter CLEAN_COLLABORATION is applied. * empty string when not defined """ collaborations = self.get("collaborations", None) if collaborations is None: return "" lst = [elt["value"] for elt in collaborations] return CLEAN_COLLABORATION(", ".join(lst)) def paper_url(self): """The URL of the document. Returns: str: * the string is empty when no URLs are found. * first URL is selected when there is more than one """ documents = self.get("documents", None) return ("" if documents is None else documents[0]["url"]) def preprint_number(self): """The ArXiv preprint number. Returns: str: * numbers are separated by a comma. * empty string when it is not defined. """ lst = self.get("arxiv_eprints", None) if lst is None: return "" lst = [f"arXiv:{elt['value']}" for elt in lst] return ", ".join(lst) def report_number(self): """The report number(s) associated to the publication. Returns: str: - Numbers are separated by a comma - Number are sorted in alphabetic order. - Empty string when not defined. """ lst = self.get("report_numbers", None) if lst is None: return "" lst = [elt["value"] for elt in lst] return ", ".join(lst) def submitted(self): """The date of submission. Returns: str: * format are"YYYY-MM", "YYYY-MM-DD", "DD MMM YYYY", *etc.* * Empty string when not defined. """ val = self.get("preprint_date", None) return ("" if val is None else val) def title(self): """The title of the publication. Returns: str: * Empty string when not defined. * The filter CLEAN_SPACES is applied. * First one is selectec when ther is more than one """ titles = self.get("titles", None) return ("" if titles is None else titles[0]["title"])