""" store_tools.recordheppubli

"""
import logging

from filters import CLEAN_COLLABORATION
from pandas import DataFrame
from .recordhep import RecordHep
from store_tools.pluginauthors import PluginAuthors
from store_tools.pluginpublicationinfo import PluginPublicationInfo


def _is_missing(value):
    """Tell whether *value* carries no information.

    Args:
        value: any object

    Returns:
        bool:
            ``True`` for ``None`` and for a floating point NaN.

    """
    # NaN is the only float which is different from itself
    return value is None or (isinstance(value, float) and value != value)


def pages(row):
    """Help function to build the pages argument

    The function is designed to be applied on the rows of a DataFrame.
    When the DataFrame is built from dictionaries having different keys,
    pandas fills the holes with NaN. Therefore, NaN is treated as a missing
    value exactly like ``None``.

    Args:
        row (pandas.Series or dict):
            any object with a ``get(key, default)`` method exposing:

            * artid (str)
            * page_start (int)
            * page_end (int)

    Returns:
        str:
            * either 23 or 23-45
            * the article identifier when there is no page information
            * the last page alone when only ``page_end`` is known
            * empty string when information is missing

    """
    artid = row.get("artid", None)
    pstart = row.get("page_start", None)
    pend = row.get("page_end", None)

    no_start = _is_missing(pstart)
    no_end = _is_missing(pend)

    # no page at all: fall back on the article identifier
    if no_start and no_end:
        return "" if _is_missing(artid) else f"{artid}"

    if no_end:
        return f"{pstart}"

    # avoid building strings like "None-45" or "nan-45"
    if no_start:
        return f"{pend}"

    return f"{pstart}-{pend}"


class RecordHepPubli(RecordHep, PluginAuthors, PluginPublicationInfo):
    """Article, preprint and proceeding from inspirehep.net version 2.

    Schema for publication is documented here:
    https://inspire-schemas.readthedocs.io/en/latest/schemas/

    On construction, the ``authors`` and ``publication_info`` fields are
    converted into the DataFrames ``df_authors`` and ``df_info``.

    Args:
        recjson:
            the record, handed over unchanged to ``RecordHep``.

    """

    def __init__(self, recjson):

        super().__init__(recjson)

        self.logger = logging.getLogger("web2py.app.limbra")

        # ``fmt_name`` is filled with ``full_name`` by _process_authors,
        # the latter being documented as "Last, First"
        self._last_fmt_author = "Last, First"
        self._process_authors()
        self._process_publication_info()

    def _process_authors(self):
        """Convert authors information into DataFrame:

        Authors and their affiliations are stored in DataFrame with the
        following structure:

            +---------------+--------------------------------+
            | column        |                                |
            +===============+================================+
            | affiliation   | value separated by "|"         |
            +---------------+--------------------------------+
            | first_name    | first name                     |
            +---------------+--------------------------------+
            | fmt_name      | formatted name                 |
            +---------------+--------------------------------+
            | full_name     | Last, First                    |
            +---------------+--------------------------------+
            | last_name     | family name                    |
            +---------------+--------------------------------+
            | role          | equal to dir. for phd director |
            +---------------+--------------------------------+

        Note:
            * After running this method, the attribute ``df_authors``
              is defined. It contains one entry with empty strings when
              the field ``authors`` is not defined.

            * When ``full_name`` does not contain a comma, the whole string
              is used as ``last_name`` and ``first_name`` is empty.

            * The columns are always defined, even when the field
              ``authors`` is an empty list.

        """
        self.logger.debug("    process authors")

        cols = ["affiliation",
                "first_name",
                "fmt_name",
                "full_name",
                "last_name",
                "role"]

        authors = self.get("authors", None)

        if authors is None:
            self.df_authors = DataFrame([[""] * len(cols)], columns=cols)
            return

        data = []
        for author in authors:

            affiliations = \
                [elt["value"] for elt in author.get("affiliations", [])]

            roles = author.get("inspire_roles", [])

            # split on the first comma. Contrary to str.find, partition
            # does not truncate the name when the comma is missing.
            full_name = author["full_name"]
            last_name, _, first_name = full_name.partition(",")

            data.append({"affiliation": "|".join(affiliations),
                         "first_name": first_name.strip(),
                         "fmt_name": full_name,
                         "full_name": full_name,
                         "last_name": last_name.strip(),
                         "role": ", ".join(roles)})

        # explicit columns keep the structure when the list is empty
        df = DataFrame(data, columns=cols)

        # protection against duplicated entries, e.g. twice the first author
        df = df.drop_duplicates("full_name")

        # replace
        self.df_authors = df

    def _process_publication_info(self):
        """Convert publication_info into DataFrame:

            Note:
                * the field is a list when there are erratum
                * in some case the subfield year is a list (cds 1951625)

        publication information are stored in DataFrame with the
        following structure:

            +------------+--------------------------------+
            | column     |                                |
            +============+================================+
            | title      | abbreviation of the publisher  |
            +------------+--------------------------------+
            | volume     | volume                         |
            +------------+--------------------------------+
            | year       | year of publication            |
            +------------+--------------------------------+
            | pagination | page number or ranges          |
            +------------+--------------------------------+

        Note:
            * After running this method, the attribute ``df_info``
              is defined. It contains one entry with empty strings
              when the ``publication_info`` field does not exist or
              when none of its entries has a ``year``.

            * The columns ``title`` and ``volume`` are obtained by renaming
              ``journal_title`` and ``journal_volume``. They only exist
              when at least one entry contains the corresponding subfield.
              The other subfields are kept as additional columns.

            * In order to deal with erratum, entries are sorted by year
              and volume.

        """
        self.logger.debug("    process publication info")

        cols = ["title",
                "volume",
                "year",
                "pagination"]

        data = self.get("publication_info", None)

        # keep only entries with year information. A missing field and
        # a field without dated entry are handled in the same way.
        data = [dct for dct in (data or []) if "year" in dct]

        if not data:
            self.df_info = DataFrame([[""] * len(cols)], columns=cols)
            return

        # convert data to DataFrame with a well known structure
        df = (DataFrame(data)
              .astype({"year": str})
              .rename(columns={"journal_title": "title",
                               "journal_volume": "volume"},
                      errors="ignore"))

        # construction pagination columns
        df["pagination"] = df.apply(pages, axis="columns")

        # erratum -- sort by year and volume
        # the year column always exists thanks to the above filter
        keys = ["year", "volume"] if "volume" in df.columns else ["year"]
        df = df.sort_values(keys)

        # replace
        self.df_info = df

    def collaboration(self):
        """The collaboration(s) signing the publication.

        Returns:
            str:
                * collaborations are separated by a comma.
                * The filter CLEAN_COLLABORATION is applied.
                * empty string when not defined

        """
        collaborations = self.get("collaborations", None)

        if collaborations is None:
            return ""

        names = ", ".join(elt["value"] for elt in collaborations)
        return CLEAN_COLLABORATION(names)

    def paper_url(self):
        """The URL of the document.

        Returns:
            str:
                * the string is empty when no URLs are found.
                * first URL is selected when there is more than one

        """
        documents = self.get("documents", None)

        # covers a missing field as well as an empty list
        if not documents:
            return ""

        return documents[0].get("url", "")

    def preprint_number(self):
        """The ArXiv preprint number.

        Returns:
            str:
                * each number is prefixed by ``arXiv:``.
                * numbers are separated by a comma.
                * empty string when it is not defined.

        """
        eprints = self.get("arxiv_eprints", None)

        if eprints is None:
            return ""

        return ", ".join(f"arXiv:{elt['value']}" for elt in eprints)

    def report_number(self):
        """The report number(s) associated to the publication.

        Returns:
            str:
                - Numbers are separated by a comma
                - Number are sorted in alphabetic order.
                - Empty string when not defined.

        """
        numbers = self.get("report_numbers", None)

        if numbers is None:
            return ""

        # sort in order to fulfil the documented contract
        return ", ".join(sorted(elt["value"] for elt in numbers))

    def submitted(self):
        """The date of submission.

        Returns:
            str:
                * the content of the field ``preprint_date``. Formats are
                  "YYYY-MM", "YYYY-MM-DD", "DD MMM YYYY", *etc.*
                * Empty string when not defined.

        """
        val = self.get("preprint_date", None)
        return ("" if val is None else val)

    def title(self):
        """The title of the publication.

        Returns:
            str:
                * Empty string when not defined.
                * First one is selected when there is more than one

        Note:
            The title is returned as it is stored in the record.
            No cleaning filter is applied by this method.

        """
        titles = self.get("titles", None)

        # covers a missing field as well as an empty list
        if not titles:
            return ""

        return titles[0]["title"]