recordpubli.py 24.9 KB
Newer Older
1 2 3 4 5 6
""" invenio_tools.recordpubli

"""
import re


7 8 9 10
from .base import (ARXIV,
                   ARXIV_PDF,
                   REG_ARXIV_NUMBER,
                   REG_YEAR)
11

12
from .exception import RecordException
13
from filters import CLEAN_COLLABORATION
14
from numpy import NaN
15
from pandas import concat, DataFrame
16
from plugin_dbui import as_list, CLEAN_SPACES
17
from .record import Record
18

19 20 21 22 23 24 25 26

# Author-name layouts accepted by RecordPubli.reformat_authors.
# Any other value makes that method raise a RecordException.
AUTHOR_FORMATS = [
    "First, Last",
    "F. Last",
    "Last",
    "Last, First",
    "Last F."]

LE GAC Renaud's avatar
LE GAC Renaud committed
27 28 29
# decode publication reference:
# Phys. Rev. Lett. 113, 032001 (2014)
# Eur. Phys. J. C (2014) 74:2883
30 31 32 33
# Named groups shared by both patterns:
#   p -- journal abbreviation (letters, dots and spaces)
#   v -- volume (digits)
#   c -- page, page range or article number (digits and hyphens)
#   y -- year (digits)
# _ref1 matches "<p> <v>[,] <c> (<y>)" and _ref2 matches "<p> (<y>) <v>:<c>".
_ref1 = r"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
_ref2 = r"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
# compiled patterns, in the order _ref1 then _ref2
DECODE_REF = [re.compile(_ref1), re.compile(_ref2)]

34 35
# message of the RecordException raised by RecordPubli.reformat_authors
# when the requested format is not listed in AUTHOR_FORMATS
MSG_INVALID_FMT = "Invalid format for author"

LE GAC Renaud's avatar
LE GAC Renaud committed
36 37
# the subfields of ``publication_info`` which, taken together, form the
# reference of a published paper
PAPER_REFERENCE_KEYS = {"pagination", "title", "volume", "year"}
38

LE GAC Renaud's avatar
LE GAC Renaud committed
39 40
# Split a first name in three groups in order to extract its initial(s):
#   group 1 -- first part of the name
#   group 2 -- optional hyphen separating the two parts
#   group 3 -- optional second part of the name
# A trailing dot is accepted after each part.
# NOTE: the chained assignment also binds the module-level name ``initial``
# to the same string.
REG_INITIAL = initial = r"^(\w+)\.?(\-)* *(\w+)*\.?$"
41 42


43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
def to_initial(x, y, z):
    """Build the initial(s) of a first name already split in three parts.

    The split is expected to look like::

        Albert                (x="Albert", y="", z="")
        Antonio Augusto       (x="Antonio", y="", z="Augusto")
        Jean-Pierre           (x="Jean", y="-", z="Pierre")

    Args:
        x (str): first part of the first name
        y (str): separator between the two parts (empty or hyphen)
        z (str): second part of the first name

    Returns:
        str:
            ``"A."``, ``"A. A."`` and ``"J.-P."`` for the examples above.

    """
    # slicing (instead of indexing) keeps empty parts harmless
    head = x[0:1] + "."

    # simple first name
    if z == "":
        return head

    tail = z[0:1] + "."

    # two parts separated by a space
    if y == "":
        return head + " " + tail

    # two parts glued by the separator
    return head + y[0:1] + tail

67

68 69 70 71
def to_str(x):
    """Join the elements of a list with ``|``.

    Args:
        x (list or object): the value to be converted

    Returns:
        str or object:
            the joined string when ``x`` is a list, otherwise ``x`` itself.

    """
    if isinstance(x, list):
        return "|".join(x)

    return x


72
class RecordPubli(Record):
LE GAC Renaud's avatar
LE GAC Renaud committed
73 74 75 76
    """The record describes an article, preprint, proceeding, report and talk.
    The main ``field`` and ``subfield`` are::

         +---------------------------------+----------------------------------+
77
         | field                           | subfield                         |
LE GAC Renaud's avatar
LE GAC Renaud committed
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
         +---------------------------------+----------------------------------+
         | FIXME_OAI (inspire)             | id                               |
         | abstract                        |                                  |
         | accelerator_experiment          |                                  |
         | agency_code (cds)               |                                  |
         | authors                         | INSPIRE_number, affiliation,     |
         |                                 | control_number, first_name,      |
         |                                 | full_name, last_name,            |
         |                                 | relator_name (phd director)      |
         | base (cds)                      |                                  |
         | collection                      |                                  |
         | comment                         |                                  |
         | copyright_status (cds)          |                                  |
         | corporate_name                  | collaboration                    |
         | creation_date                   |                                  |
         | doi                             |                                  |
         | email_message (cds)             |                                  |
         | filenames                       |                                  |
         | files                           | comment, description, eformat,   |
         |                                 | full_name, full_path, magic,     |
         |                                 | name, path, size, status,        |
         |                                 | subformat, superformat, type,    |
         |                                 | url, version                     |
         | filetypes                       |                                  |
         | imprint                         |                                  |
         | keywords                        |                                  |
         | language (cds)                  |                                  |
         | license                         |                                  |
         | number_of_authors               |                                  |
         | number_of_citations             |                                  |
         | number_of_comments              |                                  |
         | number_of_reviews               |                                  |
         | oai (cds)                       | value                            |
         | other_report_number (cds)       |                                  |
         | persistent_identifiers_keys     |                                  |
         | physical_description            |                                  |
         | prepublication                  | date, publisher_name, place      |
         | primary_report_number           |                                  |
         | publication_info                | pagination, title, volume, year  |
         | recid                           | none                             |
         | reference (inspire)             |                                  |
         | report_number (cds)             | internal, report_number          |
         | source_of_acquisition (inspire) |                                  |
         | status_week (cds)               |                                  |
         | subject                         |                                  |
         | system_control_number           | institute, value or canceled     |
         | thesaurus_terms                 |                                  |
         | title                           | title                            |
         | title_additional (inspire)      |                                  |
         | url (cds)                       | description, url                 |
         | version_id                      |                                  |
         +---------------------------------+----------------------------------+
130 131

    """
132 133
    def __init__(self, *args):
        """Build the record and convert the ``authors`` and
        ``publication_info`` fields into DataFrame.

        Args:
            *args: forwarded unchanged to ``Record.__init__``

        """
        # format currently applied to the column fmt_name
        # it is used by reformat_authors to skip useless work
        self._last_fmt_author = "Last, First"

        Record.__init__(self, *args)

        # post-processing of the raw fields
        self._process_authors()
        self._process_publication_info()
139 140 141 142

    def _process_authors(self):
        """Convert authors information into DataFrame:

143 144 145
        Authors and their affiliations are stored in DataFrame with the
        following structure:

LE GAC Renaud's avatar
LE GAC Renaud committed
146 147 148 149 150 151 152 153 154 155
            +---------------+--------------------------------+
            | column        |                                |
            +---------------+--------------------------------+
            | affiliation   | value separated by "|"         |
            | first_name    | first name                     |
            | fmt_name      | formated name                  |
            | full_name     | Last, First                    |
            | last_name     | family name                    |
            | relator_name  | equal to dir. for phd director |
            +---------------+--------------------------------+
156

LE GAC Renaud's avatar
LE GAC Renaud committed
157 158 159 160
        Note:
            After running this method, the field ``authors`` is always defined.
            It contains one entry with empty strings when the field does not
            exist.
161

LE GAC Renaud's avatar
LE GAC Renaud committed
162 163 164 165 166 167 168 169 170
        """
        if u"authors" not in self:
            cols = ["affiliation",
                    "first_name",
                    "fmt_name",
                    "full_name",
                    "last_name"]
            self[u"authors"] = DataFrame([[""]*len(cols)], columns=cols)
            return
171

LE GAC Renaud's avatar
LE GAC Renaud committed
172 173
        data = self[u"authors"]
        data = (data if isinstance(data, list) else [data])
174

LE GAC Renaud's avatar
LE GAC Renaud committed
175
        df = DataFrame(data)
176

LE GAC Renaud's avatar
LE GAC Renaud committed
177 178 179 180 181 182
        # drop useless columns
        refcols = ["affiliation",
                   "first_name",
                   "full_name",
                   "last_name",
                   "relator_name"]
183

LE GAC Renaud's avatar
LE GAC Renaud committed
184 185
        columns = df.columns
        df = df.drop(columns.difference(refcols), axis="columns")
186

LE GAC Renaud's avatar
LE GAC Renaud committed
187 188 189 190
        # protection -- affiliation not defined
        if "affiliation" not in columns:
            dfa = DataFrame([""]*len(df), columns=["affiliation"])
            df = concat([df, dfa], axis="columns")
191

LE GAC Renaud's avatar
LE GAC Renaud committed
192 193 194 195
        # convert list of affiliation to string separated by |
        df.affiliation = (df.affiliation
                          .fillna("")
                          .apply(lambda x: to_str(x)))
196

LE GAC Renaud's avatar
LE GAC Renaud committed
197 198
        # add the column fmt_name
        df["fmt_name"] = df.full_name
199

LE GAC Renaud's avatar
LE GAC Renaud committed
200 201
        # replace
        self[u"authors"] = df
202

LE GAC Renaud's avatar
LE GAC Renaud committed
203 204
    def _process_publication_info(self):
        """Convert publication_info into DataFrame:
205

LE GAC Renaud's avatar
LE GAC Renaud committed
206
            Note:
207
                * the field is a list when there are erratum
LE GAC Renaud's avatar
LE GAC Renaud committed
208
                * in some case the subfield year is a list (cds 1951625)
209

LE GAC Renaud's avatar
LE GAC Renaud committed
210 211
        publication information are stored in DataFrame with the
        following structure:
212

LE GAC Renaud's avatar
LE GAC Renaud committed
213 214 215 216 217 218 219 220
            +------------+--------------------------------+
            | column     |                                |
            +------------+--------------------------------+
            | title      | abbreviation of the publisher  |
            | volume     | volume                         |
            | year       | year of publication            |
            | pagination | page number or ranges          |
            +------------+--------------------------------+
221

LE GAC Renaud's avatar
LE GAC Renaud committed
222 223 224 225
        Note:
            * After running this method, the field ``publication_info``
              is always defined. It contains one entry with empty strings
              when the field does not exist.
226

LE GAC Renaud's avatar
LE GAC Renaud committed
227 228
            * In order to deal with erratum entry are sorter by year
              and volume.
229

LE GAC Renaud's avatar
LE GAC Renaud committed
230 231 232 233 234 235
        """
        if u"publication_info" not in self:
            cols = ["title",
                    "volume",
                    "year",
                    "pagination"]
236

LE GAC Renaud's avatar
LE GAC Renaud committed
237 238
            self[u"publication_info"] = \
                DataFrame([[""]*len(cols)], columns=cols)
239

LE GAC Renaud's avatar
LE GAC Renaud committed
240
            return
241

LE GAC Renaud's avatar
LE GAC Renaud committed
242 243
        data = self[u"publication_info"]
        data = (data if isinstance(data, list) else [data])
244

LE GAC Renaud's avatar
LE GAC Renaud committed
245
        df = DataFrame(data)
246

LE GAC Renaud's avatar
LE GAC Renaud committed
247 248 249 250
        # protection -- list of year, e.g. [2014, 2014] (cds 1951625)
        df["year"] = \
            df.year.apply(
                lambda x: (", ".join(set(x)) if isinstance(x, list) else x))
251

LE GAC Renaud's avatar
LE GAC Renaud committed
252
        # erratum -- sort by year and volume
253 254 255 256 257 258
        columns = df.columns
        if set(["year", "volume"]).issubset(columns):
            df = df.sort_values(["year", "volume"])

        elif "year" in columns:
            df = df.sort_values("year")
259

LE GAC Renaud's avatar
LE GAC Renaud committed
260 261
        # replace
        self[u"publication_info"] = df
262

263
    def authors(self, sep=", ", sort=False):
        """The author(s) signing the publication.

        Args:
            sep (str):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            str:
                * Author names are separated by the ``sep`` argument.
                * The string is empty when there are no authors.

        """
        return sep.join(self.authors_as_list(sort=sort))
282

283
    def authors_as_list(self, sort=False):
        """The list of author(s) signing the publication.

        Args:
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            list:
                formatted names. The list is empty when authors are not
                defined.

        """
        df = self[u"authors"]

        if sort:
            series = (df[["last_name", "fmt_name"]]
                      .sort_values(by="last_name")
                      .fmt_name)
        else:
            series = df.fmt_name.sort_index()

        names = series.tolist()

        # a single empty name is the signature of a record without authors
        if names == [""]:
            return []

        return names
313 314 315 316

    def collaboration(self):
        """The collaboration(s) signing the publication.

        Returns:
            str:
                * names of collaboration are separated by a comma.
                * The filter CLEAN_COLLABORATION is applied.

        """
        names = self._get(u"corporate_name",
                          u"collaboration",
                          force_list=True)

        joined = ", ".join(names)
        return CLEAN_COLLABORATION(joined)
325

326
    def find_affiliation(self, pattern):
        """Find affiliation matching the regular expression *pattern*.

        Args:
            pattern (str):
                regular expression defining the affiliation keys.
                It has to be built for an exact match, namely containing
                start and end of string. This is required to separate
                `Ecole Polytechnique` from `Ecole Polytechnique, Lausanne`.

        Returns:
            str:
                - the affiliation or the first one when several are found.
                - empty string when nothing is found.

        """
        df = self[u"authors"]

        # modify the pattern to capture group
        # since extract only returns the text matched by groups
        found = df.affiliation.str.extract("(%s)" % pattern, expand=False)

        # the pattern contains its own groups: one column per group is
        # returned, the first one being the outer group added above
        if isinstance(found, DataFrame):
            found = found.iloc[:, 0]

        found = found.dropna()

        # positional access: the label 0 is removed by dropna when
        # the affiliation of the first author does not match
        return (found.iloc[0] if len(found) > 0 else "")
351

352
    def find_authors(self, pattern, sep=", ", sort=False):
        """Find authors containing the regular expression *pattern*.
        The search is performed on the formatted name.

        Args:
            pattern (str):
                regular expression defining the author name(s).

            sep (unicode):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            str:
                * Author names are separated by ``sep`` argument.
                * The string is empty when nothing is found.

        """
        df = self[u"authors"]

        # rows for which the formatted name matches
        mask = df.fmt_name.str.contains(pattern)

        if sort:
            selected = (df.loc[mask, ["last_name", "fmt_name"]]
                        .sort_values(by="last_name"))
        else:
            selected = df.loc[mask, ["fmt_name"]].sort_index()

        names = selected.fmt_name

        if len(names) == 0:
            return ""

        return sep.join(names)
388

389
    def find_authors_by_affiliation(self, pattern, sep=", ", sort=False):
        """Find authors belonging to a given institute(s) defined by a regular
        expression.

        Args:
            pattern (str):
                regular expression defining the affiliation keys
                for the institute(s).

            sep (unicode):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            str:
                * Author names are separated by the ``sep`` argument.
                * Authors are sorted according to their family name
                  when ``sort`` is true.
                * Empty string when authors are not found.

        """
        df = self[u"authors"]

        # rows for which the affiliation matches
        mask = df.affiliation.str.contains(pattern)

        if sort:
            selected = (df.loc[mask, ["last_name", "fmt_name"]]
                        .sort_values(by="last_name"))
        else:
            selected = df.loc[mask, ["fmt_name"]].sort_index()

        names = selected.fmt_name

        if len(names) > 0:
            return sep.join(names)

        return ""
427 428 429 430

    def first_author(self):
        """The name of the first author.

        Returns:
            str:
                - the formatted name found in the first row of the
                  ``authors`` DataFrame.
                - Empty string when the first author is not defined.

        """
        names = self[u"authors"]["fmt_name"]
        return names.iloc[0]
437 438 439 440

    def first_author_institutes(self):
        """The institute(s) associated to the first author.

        Note:
            Search is performed via the ``affiliation`` column of the
            ``authors`` DataFrame.

        Returns:
            str:
                - names are separated by ``|``.
                - The string is empty when institutes are not defined.

        """
        # local import: isnull is not part of the module imports
        from pandas import isnull

        val = self[u"authors"].affiliation.iloc[0]

        # NaN is never equal to itself, therefore it cannot be
        # detected with the equality operator
        return ("" if isnull(val) else val)
453 454 455 456

    def institutes(self):
        """The list of institutes signing the publication.

        Note:
            Names of institutes are given by the ``affiliation`` column
            of the ``authors`` DataFrame.

        Returns:
            list:
                the list is sorted in alphabetic order and does not
                contain duplicate entries.

        """
        affiliations = self[u"authors"].affiliation

        # expand multi-affiliation (one per column)
        expanded = affiliations.str.split("|", expand=True)

        # stack all columns into a single series
        # ignoring the cells left empty by the expansion
        pieces = [expanded[name].dropna() for name in expanded.columns]
        merged = concat(pieces, ignore_index=True)

        # sort and remove duplicate entries
        ordered = merged.sort_values()
        return ordered.unique().tolist()
479

480
    def is_affiliations(self):
        """``True`` when affiliations are defined for authors.

        Note:
            This is a fast algorithm which only looks at the first row
            of a single-row ``authors`` DataFrame. To check that the
            affiliation is defined for all authors, use the method
            :func:`is_affiliation_for_all`.

        Returns:
            bool:

        """
        df = self[u"authors"]

        # a single row with an empty affiliation means not defined
        undefined = (len(df) == 1 and df.affiliation.iloc[0] == "")

        return not undefined

499
    def is_affiliation_for_all(self):
        """``True`` when affiliations are defined for all authors.

        Returns:
            bool:

        """
        affiliations = self[u"authors"].affiliation

        # authors with an empty or a missing affiliation
        missing = affiliations.isin(["", NaN])

        return not missing.any()
510

511
    def is_authors(self):
        """``True`` when authors are defined.

        Returns:
            bool:
                false when one of the columns ``first_name``, ``full_name``
                or ``last_name`` is missing, or when the DataFrame contains
                a single row with an empty full name.

        """
        df = self[u"authors"]

        # the three name columns are required
        required = {"first_name", "full_name", "last_name"}
        if not required.issubset(df.columns):
            return False

        # a single row with an empty name means not defined
        undefined = (len(df) == 1 and df.full_name.iloc[0] == "")

        return not undefined
528

529
    def is_published(self):
        """``True`` if the record is published and contains a complete set
        of publication information (title, volume, year and pagination).

        Returns:
            bool:
                true when at least one entry of ``publication_info``
                defines the four subfields.

        """
        df = self[u"publication_info"]

        # protection -- a subfield is missing in all entries
        if not PAPER_REFERENCE_KEYS.issubset(df.columns):
            return False

        # entries for which all subfields are defined
        # a missing value (NaN) is handled as an empty string
        complete = None
        for key in PAPER_REFERENCE_KEYS:
            defined = (df[key].fillna("") != "")
            complete = (defined if complete is None else complete & defined)

        return bool(complete.any())
546 547

    def is_with_erratum(self):
        """``True`` when the record contains erratum data.

        Returns:
            bool:
                true when ``publication_info`` has more than one entry.

        """
        entries = self[u"publication_info"]
        return len(entries) > 1
556 557

    def paper_editor(self):
        """The abbreviated version of the review, *e.g* Phys Lett B.

        Returns:
            unicode:
                * the ``title`` of the first publication entry.
                * Empty string when not defined.

        """
        titles = self[u"publication_info"]["title"]
        return titles.iloc[0]
567 568 569 570

    def paper_pages(self):
        """The page number / range when the record is published in a review.

        Returns:
            unicode:
                * the ``pagination`` of the first publication entry.
                * The format is "45-67" or "234".
                * Empty string when not defined.

        """
        pages = self[u"publication_info"]["pagination"]
        return pages.iloc[0]
579 580 581 582

    def paper_reference(self):
        """The full reference for a publication published in a review.

        Returns:
            unicode:
                * The format is "Phys Lett B 456 2010 5-6".
                * The reference is built from the first publication entry.
                * A missing subfield is replaced by an empty string.
                * The string is empty when the publication is not
                  published in a review.

        """
        # local import: isnull is not part of the module imports
        from pandas import isnull

        paper = self[u"publication_info"].iloc[0]

        li = []
        for key in ("title", "volume", "year", "pagination"):
            # protection -- subfield missing in all entries (no column)
            val = paper.get(key, u"")

            # protection -- subfield missing in this entry (NaN)
            li.append(u"" if isnull(val) else val)

        return u" ".join(li).strip()
598 599 600 601

    def paper_url(self):
        """The URL of the preprint.

        Note:
            Many other URLs exist, mainly those related to open access.

        Returns:
            unicode:
                the string is empty when no URLs are found or when the
                selection is ambiguous (more than one candidate).

        """
        # depends on the store
        # start with CDS looking for the field `url`
        if u"url" in self:
            data = self[u"url"]

            li = (data if isinstance(data, list) else [data])

            # protection -- the subfields description and url are optional
            li = [di[u"url"] for di in li
                  if di.get(u"description") == u"Preprint" and u"url" in di]

            if len(li) == 1:
                return li[0]

        # scan the list of files
        # work for both stores.
        pdf = "%s.pdf" % self.preprint_number()

        li = self._get(u"files", u"url", force_list=True)
        li = [el for el in li if el.endswith(pdf)]

        if len(li) == 1:
            return li[0]

        return u""
632 633 634 635

    def paper_volume(self):
        """The volume number when the record is published in a review.

        Returns:
            unicode:
                - the ``volume`` of the first publication entry.
                - Empty string when nothing is found.

        """
        volumes = self[u"publication_info"]["volume"]
        return volumes.iloc[0]
643 644 645 646

    def paper_year(self):
        """The year of the publication.

        Returns:
            unicode:
                - the ``year`` of the first publication entry.
                - Empty string if the year is not defined.

        """
        years = self[u"publication_info"]["year"]
        return years.iloc[0]
654 655 656 657

    def preprint_number(self):
        """The ArXiv preprint number.

        Returns:
            str:
                * the unique entry of ``primary_report_number`` starting
                  with the ArXiv prefix.
                * empty string when it is not defined or when several
                  candidates are found.

        """
        # the contract is to return a string, never None
        if u"primary_report_number" not in self:
            return u""

        data = self[u"primary_report_number"]
        data = (data if isinstance(data, list) else [data])

        li = [el for el in data if el.startswith(ARXIV)]

        return (li[0] if len(li) == 1 else u"")
673

674 675 676 677 678 679 680 681 682 683 684 685
    def reformat_authors(self, fmt="Last, First"):
        """Reformat names of authors.

        The default formatting for cds/invenio record is ``Last, First``.
        The formatted names are stored in the column ``fmt_name`` of the
        ``authors`` DataFrame, which is modified in place.

        Args:
            fmt (str):
                define the new format for author names.
                Possible values are "First, Last", "F. Last", "Last",
                "Last, First" and "Last F."

        Raises:
            RecordException:
                * the argument ``fmt`` is not valid.

        """
        if fmt not in AUTHOR_FORMATS:
            raise RecordException(MSG_INVALID_FMT)

        # nothing to do, the format is already applied
        if fmt == self._last_fmt_author:
            return

        self._last_fmt_author = fmt

        df = self[u"authors"]

        # protection -- record without authors (single empty row)
        # the formatted name has to stay empty, otherwise separators
        # like ", " are reported as an author name
        if len(df) == 1 and df.full_name.iloc[0] == "":
            return

        # ....................................................................
        #
        # Compute initial for the first name
        #
        # The initials are kept in a local series. In that way, no temporary
        # column has to be added to / removed from the stored DataFrame.
        #
        initial = None
        if fmt in ("F. Last", "Last F."):

            dfm = (df.first_name.str.extract(REG_INITIAL, expand=True)
                   .fillna(""))

            initial = dfm.apply(
                lambda x: to_initial(x[0], x[1], x[2]), axis="columns")

        # ....................................................................
        #
        # Format
        #
        if fmt == "Last, First":
            df["fmt_name"] = df.last_name + ", " + df.first_name

        elif fmt == "First, Last":
            df["fmt_name"] = df.first_name + ", " + df.last_name

        elif fmt == "F. Last":
            df["fmt_name"] = initial + " " + df.last_name

        elif fmt == "Last":
            df["fmt_name"] = df.last_name

        elif fmt == "Last F.":
            df["fmt_name"] = df.last_name + " " + initial
737

738 739 740
    def report_number(self):
        """The report number(s) associated to the publication.

        Returns:
            str:
                - Numbers are separated by a comma
                - Numbers are sorted in alphabetic order.
                - Empty string when not defined.

        """
        # CDS
        # all the values of the subfields (internal, report_number)
        if u"report_number" in self:

            data = self[u"report_number"]
            data = (data if isinstance(data, list) else [data])

            # values() is available with python 2 and 3
            li = [val for di in data for val in di.values()]

            return ", ".join(sorted(li))

        # INSPIRE
        # all primary report numbers but the ArXiv one
        if u"primary_report_number" in self:

            data = self[u"primary_report_number"]
            data = (data if isinstance(data, list) else [data])

            li = [el for el in data if not el.startswith(ARXIV)]

            return ", ".join(sorted(li))

        return u""
770 771 772 773

    def submitted(self):
        """The date of submission.

        Returns:
            unicode:
                * the subfield ``date`` of the field ``prepublication``.
                * formats are "YYYY-MM", "YYYY-MM-DD", "DD MMM YYYY", *etc.*
                * Empty string when not defined.

        """
        date = self._get(u"prepublication", u"date")
        return date
781 782 783 784

    def title(self):
        """The title of the publication.

        Returns:
            unicode:
                * Empty string when not defined.
                * The filter CLEAN_SPACES is applied.

        """
        raw = self._get(u"title", u"title")
        return CLEAN_SPACES(raw)