""" store_tools.recordheppubli

"""
from filters import CLEAN_COLLABORATION
from pandas import DataFrame
from .recordhep import RecordHep
from store_tools.pluginauthors import PluginAuthors
from store_tools.pluginpublicationinfo import PluginPublicationInfo


class RecordHepPubli(RecordHep, PluginAuthors, PluginPublicationInfo):
    """Article, preprint and proceeding from inspirehep.net version 2.

    Schema documentation is defined here:
    https://inspire-schemas.readthedocs.io/en/latest/schemas/

    """

    def __init__(self, recjson):
        """Build the record and pre-process authors and publication data.

        Args:
            recjson: the JSON record, forwarded untouched to the parent
                constructor.

        """
        super().__init__(recjson)

        # format of the author names as stored by _process_authors
        self._last_fmt_author = "Last, First"

        self._process_authors()
        self._process_publication_info()

    def _process_authors(self):
        """Convert authors information into DataFrame:

        Authors and their affiliations are stored in DataFrame with the
        following structure:

        +---------------+--------------------------------+
        | column        |                                |
        +===============+================================+
        | affiliation   | value separated by "|"         |
        +---------------+--------------------------------+
        | first_name    | first name                     |
        +---------------+--------------------------------+
        | fmt_name      | formatted name                 |
        +---------------+--------------------------------+
        | full_name     | Last, First                    |
        +---------------+--------------------------------+
        | last_name     | family name                    |
        +---------------+--------------------------------+

        Note:
            * After running this method, the field ``authors`` is always
              defined. It contains one entry with empty strings when the
              field does not exist.
            * ``fmt_name`` is initialised with the value of ``full_name``.
            * No ``relator_name`` column is created by this method.
            * A name without comma is stored entirely in ``last_name``;
              when there are several commas, everything after the first
              one goes to ``first_name``.

        """
        authors = self.get("authors", None)

        if authors is None:
            cols = ["affiliation",
                    "first_name",
                    "fmt_name",
                    "full_name",
                    "last_name"]
            self["authors"] = DataFrame([[""] * len(cols)], columns=cols)
            return

        data = []
        for author in authors:
            affiliations = [elt["value"]
                            for elt in author.get("affiliations", [])]

            full_name = author["full_name"]

            # partition never raises, whereas unpacking split(",") fails
            # for names with zero or more than one comma
            last_name, _, first_name = full_name.partition(",")

            data.append({"affiliation": "|".join(affiliations),
                         "first_name": first_name.strip(),
                         "fmt_name": full_name,
                         "full_name": full_name,
                         "last_name": last_name.strip()})

        df = DataFrame(data)

        # protection against duplicated entries, e.g. twice the first author
        # (the column is absent when the list of authors is empty)
        if "full_name" in df.columns:
            df = df.drop_duplicates("full_name")

        # replace
        self["authors"] = df

    def _process_publication_info(self):
        """Convert publication_info into DataFrame:

        Note:
            * the field is a list when there are erratum
            * in some case the subfield year is a list (cds 1951625)

        Publication information are stored in the attribute ``df_info``,
        a DataFrame with the following structure:

        +------------+--------------------------------+
        | column     |                                |
        +============+================================+
        | title      | abbreviation of the publisher  |
        +------------+--------------------------------+
        | volume     | volume                         |
        +------------+--------------------------------+
        | year       | year of publication            |
        +------------+--------------------------------+
        | pagination | page number or ranges          |
        +------------+--------------------------------+

        Note:
            * After running this method, the attribute ``df_info`` is
              always defined. It contains one entry with empty strings
              when the field ``publication_info`` does not exist or
              is empty.
            * The subfields ``artid``, ``journal_title`` and
              ``journal_volume`` are renamed ``pagination``, ``title``
              and ``volume`` respectively. Other subfields are kept as is.
            * The column ``year`` is converted to string when it exists.
            * In order to deal with erratum, entries are sorted by year
              and volume.

        """
        data = self.get("publication_info", None)

        # missing or empty field -> one placeholder row
        if not data:
            cols = ["title", "volume", "year", "pagination"]
            self.df_info = DataFrame([[""] * len(cols)], columns=cols)
            return

        df = DataFrame(data).rename(columns={"artid": "pagination",
                                             "journal_title": "title",
                                             "journal_volume": "volume"})

        columns = df.columns

        # astype raises KeyError for a column which does not exist,
        # and year is absent when no entry provides it
        if "year" in columns:
            df = df.astype({"year": str})

        # erratum -- sort by year and volume
        if {"year", "volume"}.issubset(columns):
            df = df.sort_values(["year", "volume"])

        elif "year" in columns:
            df = df.sort_values("year")

        # replace
        self.df_info = df

    def collaboration(self):
        """The collaboration(s) signing the publication.

        Returns:
            str:
                * collaborations are separated by a comma.
                * The suffix " Collaboration" is appended to names which
                  do not already end with "ollaboration".
                * The filter CLEAN_COLLABORATION is applied.
                * empty string when not defined

        """
        collaborations = self.get("collaborations", None)
        if collaborations is None:
            return ""

        lst = []
        for elt in collaborations:
            val = elt["value"]
            # "ollaboration" matches both "Collaboration" and "collaboration"
            if not val.endswith("ollaboration"):
                val = f"{val} Collaboration"
            lst.append(val)

        return CLEAN_COLLABORATION(", ".join(lst))

    def paper_url(self):
        """The URL of the document.

        Returns:
            str:
                * the string is empty when no URLs are found.
                * first URL is selected when there is more than one

        """
        documents = self.get("documents", None)

        # covers the missing field as well as an empty list
        if not documents:
            return ""

        return documents[0]["url"]

    def preprint_number(self):
        """The ArXiv preprint number.

        Returns:
            str:
                * each number is prefixed by "arXiv:".
                * numbers are separated by a comma.
                * empty string when it is not defined.

        """
        lst = self.get("arxiv_eprints", None)
        if lst is None:
            return ""

        return ", ".join(f"arXiv:{elt['value']}" for elt in lst)

    def report_number(self):
        """The report number(s) associated to the publication.

        Returns:
            str:
                - Numbers are separated by a comma
                - Number are sorted in alphabetic order.
                - Empty string when not defined.

        """
        lst = self.get("report_numbers", None)
        if lst is None:
            return ""

        # sorted to honour the ordering promised above
        return ", ".join(sorted(elt["value"] for elt in lst))

    def submitted(self):
        """The date of submission.

        Returns:
            str:
                * the value of the field ``preprint_date``, as is.
                * format are "YYYY-MM", "YYYY-MM-DD", "DD MMM YYYY", *etc.*
                * Empty string when not defined.

        """
        val = self.get("preprint_date", None)
        return "" if val is None else val

    def title(self):
        """The title of the publication.

        Returns:
            str:
                * Empty string when not defined.
                * First one is selected when there is more than one.
                * The title is returned as stored in the record,
                  no filter is applied.

        """
        titles = self.get("titles", None)

        # covers the missing field as well as an empty list
        if not titles:
            return ""

        return titles[0]["title"]