recordpubli.py 16.2 KB
Newer Older
1
""" store_tools.recordpubli
2 3

"""
4
from filters import CLEAN_COLLABORATION
5
from pandas import concat, DataFrame
6
from plugin_dbui import CLEAN_SPACES
7
from .record import Record
8 9 10
from store_tools import ARXIV
from store_tools.pluginauthors import PluginAuthors
from store_tools.pluginpublicationinfo import PluginPublicationInfo
11

12

13 14 15 16
def to_str(x):
    """Flatten a list of strings into a single ``|`` separated string.

    Args:
        x: a list of strings, or any other value.

    Returns:
        the elements of the list joined with ``|``. A value which is
        not a list is returned unchanged.

    """
    if not isinstance(x, list):
        return x

    return "|".join(x)


17
class RecordPubli(Record, PluginAuthors, PluginPublicationInfo):
    """Article, preprint, proceeding, report and talk from cds.cern.ch or
    old.inspirehep.net.

    The record is organised in ``field`` and ``subfield``:

        +---------------------------------+----------------------------------+
        | field                           | subfield                         |
        +=================================+==================================+
        | FIXME_OAI (inspire)             | id                               |
        +---------------------------------+----------------------------------+
        | abstract                        |                                  |
        +---------------------------------+----------------------------------+
        | accelerator_experiment          |                                  |
        +---------------------------------+----------------------------------+
        | agency_code (cds)               |                                  |
        +---------------------------------+----------------------------------+
        | authors                         | INSPIRE_number, affiliation,     |
        |                                 | control_number, first_name,      |
        |                                 | full_name, last_name,            |
        |                                 | relator_name (phd director)      |
        +---------------------------------+----------------------------------+
        | base (cds)                      |                                  |
        +---------------------------------+----------------------------------+
        | collection                      |                                  |
        +---------------------------------+----------------------------------+
        | comment                         |                                  |
        +---------------------------------+----------------------------------+
        | copyright_status (cds)          |                                  |
        +---------------------------------+----------------------------------+
        | corporate_name                  | collaboration                    |
        +---------------------------------+----------------------------------+
        | creation_date                   |                                  |
        +---------------------------------+----------------------------------+
        | doi                             |                                  |
        +---------------------------------+----------------------------------+
        | email_message (cds)             |                                  |
        +---------------------------------+----------------------------------+
        | filenames                       |                                  |
        +---------------------------------+----------------------------------+
        | files                           | comment, description, eformat,   |
        |                                 | full_name, full_path, magic,     |
        |                                 | name, path, size, status,        |
        |                                 | subformat, superformat, type,    |
        |                                 | url, version                     |
        +---------------------------------+----------------------------------+
        | filetypes                       |                                  |
        +---------------------------------+----------------------------------+
        | imprint                         |                                  |
        +---------------------------------+----------------------------------+
        | keywords                        |                                  |
        +---------------------------------+----------------------------------+
        | language (cds)                  |                                  |
        +---------------------------------+----------------------------------+
        | license                         |                                  |
        +---------------------------------+----------------------------------+
        | number_of_authors               |                                  |
        +---------------------------------+----------------------------------+
        | number_of_citations             |                                  |
        +---------------------------------+----------------------------------+
        | number_of_comments              |                                  |
        +---------------------------------+----------------------------------+
        | number_of_reviews               |                                  |
        +---------------------------------+----------------------------------+
        | oai (cds)                       | value                            |
        +---------------------------------+----------------------------------+
        | other_report_number (cds)       |                                  |
        +---------------------------------+----------------------------------+
        | persistent_identifiers_keys     |                                  |
        +---------------------------------+----------------------------------+
        | physical_description            |                                  |
        +---------------------------------+----------------------------------+
        | prepublication                  | date, publisher_name, place      |
        +---------------------------------+----------------------------------+
        | primary_report_number           |                                  |
        +---------------------------------+----------------------------------+
        | publication_info                | pagination, title, volume, year  |
        +---------------------------------+----------------------------------+
        | recid                           | none                             |
        +---------------------------------+----------------------------------+
        | reference (inspire)             |                                  |
        +---------------------------------+----------------------------------+
        | report_number (cds)             | internal, report_number          |
        +---------------------------------+----------------------------------+
        | source_of_acquisition (inspire) |                                  |
        +---------------------------------+----------------------------------+
        | status_week (cds)               |                                  |
        +---------------------------------+----------------------------------+
        | subject                         |                                  |
        +---------------------------------+----------------------------------+
        | system_control_number           | institute, value or canceled     |
        +---------------------------------+----------------------------------+
        | thesaurus_terms                 |                                  |
        +---------------------------------+----------------------------------+
        | title                           | title                            |
        +---------------------------------+----------------------------------+
        | title_additional (inspire)      |                                  |
        +---------------------------------+----------------------------------+
        | url (cds)                       | description, url                 |
        +---------------------------------+----------------------------------+
        | version_id                      |                                  |
        +---------------------------------+----------------------------------+

    """
121

122 123
    def __init__(self, *args):
        """Build the record and convert the ``authors`` and
        ``publication_info`` fields into DataFrames.

        Args:
            *args: forwarded unchanged to ``Record.__init__``.

        """
        # defined before the base constructor is run
        # NOTE(review): presumably the format currently applied to the
        # column ``fmt_name`` of the authors -- confirm in PluginAuthors
        self._last_fmt_author = "Last, First"

        Record.__init__(self, *args)
        # replace the raw fields by their DataFrame representation
        self._process_authors()
        self._process_publication_info()
129 130 131 132

    def _process_authors(self):
        """Convert authors information into DataFrame:

133 134 135
        Authors and their affiliations are stored in DataFrame with the
        following structure:

LE GAC Renaud's avatar
LE GAC Renaud committed
136 137
            +---------------+--------------------------------+
            | column        |                                |
LE GAC Renaud's avatar
LE GAC Renaud committed
138
            +===============+================================+
LE GAC Renaud's avatar
LE GAC Renaud committed
139
            | affiliation   | value separated by "|"         |
LE GAC Renaud's avatar
LE GAC Renaud committed
140
            +---------------+--------------------------------+
LE GAC Renaud's avatar
LE GAC Renaud committed
141
            | first_name    | first name                     |
LE GAC Renaud's avatar
LE GAC Renaud committed
142
            +---------------+--------------------------------+
LE GAC Renaud's avatar
LE GAC Renaud committed
143
            | fmt_name      | formated name                  |
LE GAC Renaud's avatar
LE GAC Renaud committed
144
            +---------------+--------------------------------+
LE GAC Renaud's avatar
LE GAC Renaud committed
145
            | full_name     | Last, First                    |
LE GAC Renaud's avatar
LE GAC Renaud committed
146
            +---------------+--------------------------------+
LE GAC Renaud's avatar
LE GAC Renaud committed
147
            | last_name     | family name                    |
LE GAC Renaud's avatar
LE GAC Renaud committed
148
            +---------------+--------------------------------+
LE GAC Renaud's avatar
LE GAC Renaud committed
149 150
            | relator_name  | equal to dir. for phd director |
            +---------------+--------------------------------+
151

LE GAC Renaud's avatar
LE GAC Renaud committed
152 153 154 155
        Note:
            After running this method, the field ``authors`` is always defined.
            It contains one entry with empty strings when the field does not
            exist.
156

LE GAC Renaud's avatar
LE GAC Renaud committed
157
        """
158
        if "authors" not in self:
LE GAC Renaud's avatar
LE GAC Renaud committed
159 160 161 162 163
            cols = ["affiliation",
                    "first_name",
                    "fmt_name",
                    "full_name",
                    "last_name"]
164
            self["authors"] = DataFrame([[""] * len(cols)], columns=cols)
LE GAC Renaud's avatar
LE GAC Renaud committed
165
            return
166

167
        data = self["authors"]
LE GAC Renaud's avatar
LE GAC Renaud committed
168
        data = (data if isinstance(data, list) else [data])
169

LE GAC Renaud's avatar
LE GAC Renaud committed
170
        df = DataFrame(data)
171

LE GAC Renaud's avatar
LE GAC Renaud committed
172 173 174 175 176 177
        # drop useless columns
        refcols = ["affiliation",
                   "first_name",
                   "full_name",
                   "last_name",
                   "relator_name"]
178

LE GAC Renaud's avatar
LE GAC Renaud committed
179 180
        columns = df.columns
        df = df.drop(columns.difference(refcols), axis="columns")
181

LE GAC Renaud's avatar
LE GAC Renaud committed
182 183 184 185
        # protection against duplicated entries, e.g. twice the first author
        if set(["last_name", "first_name"]).issubset(df.columns):
            df = df.drop_duplicates(["last_name", "first_name"])

LE GAC Renaud's avatar
LE GAC Renaud committed
186 187
        # protection -- affiliation not defined
        if "affiliation" not in columns:
188
            dfa = DataFrame([""] * len(df), columns=["affiliation"])
LE GAC Renaud's avatar
LE GAC Renaud committed
189
            df = concat([df, dfa], axis="columns")
190

LE GAC Renaud's avatar
LE GAC Renaud committed
191 192 193 194
        # convert list of affiliation to string separated by |
        df.affiliation = (df.affiliation
                          .fillna("")
                          .apply(lambda x: to_str(x)))
195

LE GAC Renaud's avatar
LE GAC Renaud committed
196 197
        # add the column fmt_name
        df["fmt_name"] = df.full_name
198

LE GAC Renaud's avatar
LE GAC Renaud committed
199
        # replace
200
        self["authors"] = df
201

LE GAC Renaud's avatar
LE GAC Renaud committed
202 203
    def _process_publication_info(self):
        """Convert publication_info into DataFrame:
204

LE GAC Renaud's avatar
LE GAC Renaud committed
205
            Note:
206
                * the field is a list when there are erratum
LE GAC Renaud's avatar
LE GAC Renaud committed
207
                * in some case the subfield year is a list (cds 1951625)
208

LE GAC Renaud's avatar
LE GAC Renaud committed
209 210
        publication information are stored in DataFrame with the
        following structure:
211

LE GAC Renaud's avatar
LE GAC Renaud committed
212 213
            +------------+--------------------------------+
            | column     |                                |
LE GAC Renaud's avatar
LE GAC Renaud committed
214
            +============+================================+
LE GAC Renaud's avatar
LE GAC Renaud committed
215
            | title      | abbreviation of the publisher  |
LE GAC Renaud's avatar
LE GAC Renaud committed
216
            +------------+--------------------------------+
LE GAC Renaud's avatar
LE GAC Renaud committed
217
            | volume     | volume                         |
LE GAC Renaud's avatar
LE GAC Renaud committed
218
            +------------+--------------------------------+
LE GAC Renaud's avatar
LE GAC Renaud committed
219
            | year       | year of publication            |
LE GAC Renaud's avatar
LE GAC Renaud committed
220
            +------------+--------------------------------+
LE GAC Renaud's avatar
LE GAC Renaud committed
221 222
            | pagination | page number or ranges          |
            +------------+--------------------------------+
223

LE GAC Renaud's avatar
LE GAC Renaud committed
224 225 226 227
        Note:
            * After running this method, the field ``publication_info``
              is always defined. It contains one entry with empty strings
              when the field does not exist.
228

LE GAC Renaud's avatar
LE GAC Renaud committed
229 230
            * In order to deal with erratum entry are sorter by year
              and volume.
231

LE GAC Renaud's avatar
LE GAC Renaud committed
232
        """
233
        if "publication_info" not in self:
LE GAC Renaud's avatar
LE GAC Renaud committed
234 235 236 237
            cols = ["title",
                    "volume",
                    "year",
                    "pagination"]
238

239
            self["publication_info"] = \
240
                DataFrame([[""] * len(cols)], columns=cols)
241

LE GAC Renaud's avatar
LE GAC Renaud committed
242
            return
243

244
        data = self["publication_info"]
LE GAC Renaud's avatar
LE GAC Renaud committed
245
        data = (data if isinstance(data, list) else [data])
246

LE GAC Renaud's avatar
LE GAC Renaud committed
247
        df = DataFrame(data)
LE GAC Renaud's avatar
LE GAC Renaud committed
248
        columns = df.columns
249

LE GAC Renaud's avatar
LE GAC Renaud committed
250
        # protection -- list of year, e.g. [2014, 2014] (cds 1951625)
LE GAC Renaud's avatar
LE GAC Renaud committed
251 252 253 254 255
        if "year" in columns:
            df["year"] = \
                df.year.apply(
                    lambda x:
                        (", ".join(set(x)) if isinstance(x, list) else x))
256

LE GAC Renaud's avatar
LE GAC Renaud committed
257
        # erratum -- sort by year and volume
258 259 260 261 262
        if set(["year", "volume"]).issubset(columns):
            df = df.sort_values(["year", "volume"])

        elif "year" in columns:
            df = df.sort_values("year")
263

LE GAC Renaud's avatar
LE GAC Renaud committed
264
        # replace
265
        self["publication_info"] = df
266

267 268 269
    def collaboration(self):
        """Name of the collaboration(s) which sign the publication.

        Returns:
            str:
                * the names found in the subfield ``collaboration`` of
                  the field ``corporate_name``, separated by a comma.
                * the filter CLEAN_COLLABORATION is applied to the
                  resulting string.

        """
        names = self._get("corporate_name", "collaboration", force_list=True)
        joined = ", ".join(names)

        return CLEAN_COLLABORATION(joined)
278 279 280 281

    def paper_url(self):
        """The URL of the preprint.

        The URL is searched in the subfield ``url`` of the field ``files``.
        The selected file is the one whose URL ends with the preprint
        number, stripped of the ARXIV prefix, followed by ``.pdf``.

        Note:
            Many others URL exists mainly those related to open access.

        Returns:
            str:
                the string is empty when the preprint number is not
                defined, or when the number of matching URLs is not
                exactly one.

        """
        # without preprint number the searched suffix would degenerate
        # to "None.pdf" or to ".pdf", the latter matching any PDF file
        number = self.preprint_number()
        if not number:
            return ""

        # scan the list of files
        # work for both stores.
        pdf = f"{number}.pdf".replace(ARXIV + ":", "")

        urls = [el for el in self._get("files", "url", force_list=True)
                if el.endswith(pdf)]

        return (urls[0] if len(urls) == 1 else "")
302 303 304 305

    def preprint_number(self):
        """The ArXiv preprint number.

        The number is searched in the field ``primary_report_number``,
        among the values starting with the ARXIV prefix.

        Returns:
            str:
                empty string when the field does not exist, or when the
                number of values starting with the ARXIV prefix is not
                exactly one.

        """
        # always return a string, as documented, so that the value can be
        # safely interpolated by the caller
        if "primary_report_number" not in self:
            return ""

        data = self["primary_report_number"]
        data = (data if isinstance(data, list) else [data])

        li = [el for el in data if el is not None and el.startswith(ARXIV)]

        return (li[0] if len(li) == 1 else "")
322 323 324 325

    def report_number(self):
        """The report number(s) associated to the publication.

        The numbers are read from the field ``report_number`` when it
        exists, otherwise from the values of the field
        ``primary_report_number`` which do not start with the ARXIV prefix.

        Returns:
            str:
                - Numbers are separated by a comma
                - Numbers are sorted in alphabetic order.
                - Empty string when not defined.

        """
        # CDS
        if "report_number" in self:

            data = self["report_number"]
            data = (data if isinstance(data, list) else [data])

            # collect the values of all the subfields
            # undefined values are skipped, as for the other store,
            # since they can neither be sorted nor joined
            li = [val for di in data for val in di.values()
                  if val is not None]

            return ", ".join(sorted(li))

        # OLD.INSPIRE
        if "primary_report_number" in self:

            data = self["primary_report_number"]
            data = (data if isinstance(data, list) else [data])

            li = [el for el in data
                  if el is not None and not el.startswith(ARXIV)]

            return ", ".join(sorted(li))

        return ""
356 357 358 359

    def submitted(self):
        """The date of submission.

        The date is read from the subfield ``date`` of the field
        ``prepublication``.

        Returns:
            str:
                * formats are "YYYY-MM", "YYYY-MM-DD", "DD MMM YYYY", *etc.*
                * Empty string when not defined.

        """
        dates = self._get("prepublication", "date")

        # in some case there is more than one date (see cds 2234042)
        # select the oldest one which should be the first one
        if isinstance(dates, list):
            return dates[0]

        return dates
370 371 372 373

    def title(self):
        """The title of the publication.

        The title is read from the subfield ``title`` of the field
        ``title``.

        Returns:
            str:
                * Empty string when not defined.
                * The filter CLEAN_SPACES is applied.

        """
        raw = self._get("title", "title")

        return CLEAN_SPACES(raw)