recordpubli.py 20.5 KB
Newer Older
1 2 3 4 5 6 7
# -*- coding: utf-8 -*-
""" invenio_tools.recordpubli

"""
import re


8 9 10 11
from base import (ARXIV,
                  ARXIV_PDF,
                  REG_ARXIV_NUMBER,
                  REG_AUTHOR,
12
                  REG_YEAR)
13
from filters import CLEAN_COLLABORATION
14
from plugin_dbui import as_list, CLEAN_SPACES
15 16
from record import Record

17 18 19 20 21 22 23 24 25 26
# Regular expressions decoding a paper reference written as free text.
# Two layouts are recognised:
#  Phys. Rev. Lett. 113, 032001 (2014)
#  Eur. Phys. J. C (2014) 74:2883
# Named groups follow the 773 sub-field codes: p (review), v (volume),
# c (pages) and y (year).
_ref1 = r"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
_ref2 = r"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
DECODE_REF = list(map(re.compile, (_ref1, _ref2)))

# Sub-field codes which must all be present for a complete paper reference
PAPER_REFERENCE_KEYS = set(("c", "p", "v", "y"))

27 28

class RecordPubli(Record):
    """The MARC record describing a publication.
    Usual publications are article, preprint, proceeding, report and talk.

    The relation between methods and MARC fields is the following::

        +-----------------------+---------+------------+
        |                       |  CDS    | INSPIREHEP |
        +-----------------------+---------+------------+
        | authors               | 700 a   |            |
        | collaboration         | 710 g   |            |
        | first author          | 100 a   |            |
        | institutes            | 700 u   |            |
        | paper editor          | 773 p   |            |
        | paper pages           | 773 c   |            |
        | paper reference       | 773 o   |            |
        | paper URL             | 8564 u  |            |
        | paper volume          | 773 v   |            |
        | paper year            | 773 y   |            |
        | preprint number       | 037 a   |            |
        | report number         | 088 a   | 037 a      |
        | submitted             | 269 c   |            |
        | title                 | 245 a   |            |
        | year                  | 260 c   |            |
        +-----------------------+---------+------------+

    """
    def authors(self, cmpFct=None):
        """Return the author(s) signing the publication as one string.

        Args:
            cmpFct (callable): optional ordering function. It is handed to
                ``list.sort`` as its *key* argument: it receives one author
                name and returns the value used to order it. The names keep
                the order of :func:`authors_as_list` when it is omitted.

        Returns:
            unicode:
                * Author names are separated by a comma.
                * The string is empty when there is no author.

        """
        names = self.authors_as_list()

        if not cmpFct:
            return u', '.join(names)

        names.sort(key=cmpFct)
        return u', '.join(names)

    def authors_as_list(self):
        """The list of author(s) signing the publication.

        Names are read from sub-field ``a`` of the MARC field 700, which can
        be a single dictionary (one author) or a list of dictionaries.
        When the field 700 is not usable, the value of :func:`first_author`
        is used instead. The first author is also inserted at the head of
        the list when it differs from the first name found.

        Returns:
            list: the list is empty when authors are not defined.

        """
        authors = []
        first_author = self.first_author()

        entries = self[u"700"] if u"700" in self else None

        # a single author
        if isinstance(entries, dict):
            if "a" in entries:
                authors.append(entries["a"])

        # a list of authors
        elif isinstance(entries, list):
            for di in entries:
                if "a" not in di:
                    continue

                author = di["a"]

                # PROTECTION
                # in most of the case the author is a string
                # but it can be a list, e.g inspirehep.net/138663:
                # [u'Zuniga, J.', u'(the A.N.T.ARES. Collaboration)']
                if isinstance(author, unicode):
                    authors.append(author)

                elif isinstance(author, list):
                    # keep the first element looking like an author name
                    for elt in author:
                        if REG_AUTHOR.match(elt):
                            authors.append(elt)
                            break

        # the first author is defined, not the other ones
        elif first_author:
            authors.append(first_author)

        # sometime the first author is missing from the field 700
        if first_author and len(authors) > 0 and first_author != authors[0]:
            authors.insert(0, first_author)

        return authors

    def collaboration(self):
        """Return the collaboration(s) signing the publication.

        The names are read from sub-field ``g`` of the MARC field 710.

        Returns:
            unicode:
                * names of collaboration are separated by a comma.
                * The filter CLEAN_COLLABORATION is applied.

        """
        names = self._get(u"710", 'g', force_list=True)
        joined = ', '.join(names)
        return CLEAN_COLLABORATION(joined)

131
    def find_affiliation(self, pattern):
        """Find affiliation matching the regular expression *pattern*.

        The sub-field ``u`` of the MARC fields 100 then 700 is scanned and
        the first affiliation matching *pattern* (``re.search``) is returned.

        Args:
            pattern (unicode): regular expression defining the
                affiliation names.

        Returns:
            unicode:
                - the affiliation
                - empty string when nothing is found.

        """
        reg_affiliation = re.compile(pattern)

        for field in (u"100", u"700"):
            if field not in self:
                continue

            # a field is either one dictionary or a list of dictionaries
            entries = self[field]
            if isinstance(entries, dict):
                entries = [entries]

            for di in entries:
                if "u" not in di:
                    continue

                # one affiliation is stored as a plain string, several as a
                # list. Anything which is not a list is a single value,
                # so that a string is never scanned character by character.
                affiliations = di["u"]
                if not isinstance(affiliations, list):
                    affiliations = [affiliations]

                for affiliation in affiliations:
                    if reg_affiliation.search(affiliation):
                        return affiliation

        return u""

168
    def find_authors(self, pattern):
        """Find authors matching the regular expression *pattern*.

        Each name returned by :func:`authors_as_list` is tested
        with ``re.search``.

        Args:
            pattern (unicode): regular expression defining the author name(s).

        Returns:
            unicode:
                * Author names are separated by a comma.
                * The string is empty when nothing is found.

        """
        regex = re.compile(pattern)
        found = [name for name in self.authors_as_list() if regex.search(name)]
        return u', '.join(found)

189
    def find_authors_by_affiliation(self, pattern, cmpFct=None):
        """Find authors belonging to a given institute(s) defined by a regular
        expression.

        The MARC fields 100 (first author) and 700 (authors) are scanned.
        An author is selected when its affiliation(s), sub-field ``u``,
        match *pattern*. Each author appears once, in the order in which it
        is met in the record.

        Args:
            pattern (unicode): regular expression defining the
                institute name(s)

            cmpFct (callable): optional ordering function. It is handed to
                ``list.sort`` as its *key* argument: it receives one author
                name and returns the value used to order it.

        Returns:
            unicode:
                * Author names are separated by a comma.
                * Author are sorted according to *cmpFct* when it is defined.
                * Empty string when authors are not found.

        """
        # authors not defined
        if not self.is_authors():
            return u""

        regex = re.compile(pattern)
        authors = []
        seen = set()

        # the field 100 covers the case in which the first author
        # is not in the field 700
        for field in (u"100", u"700"):
            entries = self[field] if field in self else []
            if isinstance(entries, dict):
                entries = [entries]

            for di in entries:
                # author without affiliation or without name -- skip it
                if "u" not in di or "a" not in di:
                    continue

                # several affiliations are tested as a single string
                affiliations = di["u"]
                if isinstance(affiliations, list):
                    affiliations = u", ".join(affiliations)

                if not regex.search(affiliations):
                    continue

                # remove duplicate entries but preserve the record order
                name = di["a"]
                if name not in seen:
                    seen.add(name)
                    authors.append(name)

        if cmpFct:
            authors.sort(key=cmpFct)

        return u", ".join(authors)
249 250 251 252

    def first_author(self):
        """The name of the first author.

        The name is read from sub-field ``a`` of the MARC field 100.
        When it is missing, the first entry of the field 700 is used.

        Returns:
            unicode or list:
                - Empty string when the first author is not defined.
                - List of name when there is more than one. Duplicate
                  names are removed and the record order is preserved.

        """
        # standard case
        value = self._get(u"100", "a")

        if value:
            # PROTECTION
            # It happens that the first author is duplicate, remove it
            if isinstance(value, list):
                unique = []
                for name in value:
                    if name not in unique:
                        unique.append(name)

                if len(unique) == 1:
                    return unique[0]
                return unique

            return value

        # sometime it is only defined in the authors list
        if u"700" in self:
            entries = self[u"700"]

            if isinstance(entries, dict):
                if "a" in entries:
                    return entries["a"]

            # an empty list means no author at all
            elif isinstance(entries, list) and entries:
                if "a" in entries[0]:
                    return entries[0]["a"]

        return u""
282 283 284 285

    def first_author_institutes(self):
        """The institute(s) associated to the first author.

        Institutes are read from sub-field ``u`` of the MARC field 100.
        When it is missing, the first entry of the field 700 is used.

        Returns:
            unicode:
                - names are separated by a comma.
                - The string is empty when institutes are not defined.

        """
        # standard case

        # PROTECTION
        # sometime the first author is duplicate -- remove duplicate.
        # An entry can itself be a list when the author has several
        # affiliations (see institutes), therefore entries are flattened.
        # The record order is preserved.
        names = []
        for el in self._get(u"100", "u", force_list=True):
            for name in (el if isinstance(el, list) else [el]):
                if name not in names:
                    names.append(name)

        if names:
            return u", ".join(names)

        # sometime it is only defined in the authors list
        if u"700" in self:
            entries = self[u"700"]
            if isinstance(entries, dict):
                entries = [entries]

            # only the first entry is considered; nothing to do for
            # an empty list
            if isinstance(entries, list) and entries and "u" in entries[0]:
                value = entries[0]["u"]
                if isinstance(value, list):
                    return u", ".join(value)
                return value

        return u""
316 317 318 319

    def institutes(self):
        """The list of institute signing the publication.

        Institutes are collected from sub-field ``u`` of the MARC fields
        100 and 700.

        Returns:
            list: duplicate entries are removed and the list is sorted
            in alphabetic order.

        """
        found = set()

        # each entry can be a string or a list when the author has
        # several affiliations
        for field in (u"100", u"700"):
            for el in self._get(field, "u", force_list=True):
                if isinstance(el, list):
                    found.update(el)
                else:
                    found.add(el)

        return sorted(found)

343
    def is_affiliations(self):
        """``True`` when affiliations are defined for authors.

        Note:
            This is a fast algorithm checking only first and last authors
            of the MARC fields 100 and 700.
            To check that the affiliation is defined for all authors,
            uses the method :func:`is_affiliation_for_all`.

        Returns:
            bool: ``False`` as soon as one of the checked authors has no
            sub-field ``u``. ``True`` otherwise, including the case where
            neither the field 100 nor the field 700 is defined.

        """
        for field in (u"100", u"700"):
            if field not in self:
                continue

            entries = self[field]

            if isinstance(entries, dict):
                if "u" not in entries:
                    return False

            # first and last entries; both are the same element for
            # a list with one author. An empty list has nothing to check.
            elif isinstance(entries, list) and entries:
                for di in (entries[0], entries[-1]):
                    if "u" not in di:
                        return False

        return True

369
    def is_affiliation_for_all(self):
        """``True`` when affiliation are defined for all authors.

        Every entry of the MARC fields 100 and 700 has to be a dictionary
        containing the sub-field ``u``.

        Return:
            bool: ``False`` when neither the field 100 nor the field 700
            is defined, or when one entry has no affiliation.

        """
        if u"700" not in self and u"100" not in self:
            return False

        for field in (u"100", u"700"):
            if field not in self:
                continue

            # a field is either one dictionary or a list of dictionaries
            entries = self[field]
            if isinstance(entries, dict):
                entries = [entries]

            if not all(isinstance(di, dict) and "u" in di for di in entries):
                return False

        return True
395

396
    def is_authors(self):
        """``True`` when authors are defined.

        Returns:
            bool: ``True`` when the record contains the MARC field 100
            or the MARC field 700.

        """
        return u"100" in self or u"700" in self

405
    def is_published(self):
        """``True`` is the record is published.

        The record is published when one entry of the MARC field 773
        either contains all the sub-fields listed in PAPER_REFERENCE_KEYS
        or has a sub-field ``o`` matching one of the DECODE_REF patterns.

        Returns:
            bool:

        """
        if u"773" not in self:
            return False

        # record can contains erratum
        for di in as_list(self[u"773"]):

            # the reference field is complete and contains, at least,
            # the keys "p", "v", "y" and "c"
            if PAPER_REFERENCE_KEYS.issubset(di):
                return True

            # paper reference may be incomplete or even wrong
            # the recovery procedure will use the 773o
            # check that 773o contains the paper reference:
            #    Eur. Phys. J. C (2014) 74:2883
            #    Phys. Rev. Lett. 113, 032001 (2014)
            if "o" in di and any(reg.match(di["o"]) for reg in DECODE_REF):
                return True

        return False

    def is_with_erratum(self):
        """``True`` when the record contains erratum data.

        Returns:
            bool: ``True`` when the MARC field 773 is defined and is a list.

        """
        # record with erratum contains a list of editor
        return u"773" in self and isinstance(self[u"773"], list)

    def paper_editor(self):
        """The abbreviated version of the review, *e.g* Phys Lett B.

        The value is the sub-field ``p`` of the MARC field 773 as delivered
        by ``self._get``.

        Returns:
            unicode or list:
                * A list when there are erratum.
                * Empty string when not defined.

        """
        return self._get(u"773", "p")

    def paper_pages(self):
        """Return the page number / range of the published paper.

        The value is the sub-field ``c`` of the MARC field 773 as delivered
        by ``self._get``.

        Returns:
            unicode or list:
                * The format is "45-67" or "234".
                * A list when there are erratum.
                * Empty string when not defined.

        """
        pages = self._get(u"773", "c")
        return pages

    def paper_reference(self):
        """The full reference for a publication published in a review.

        The reference is built from the sub-fields ``p``, ``v``, ``y`` and
        ``c`` of the MARC field 773, in that order. Missing sub-fields are
        skipped.

        Returns:
            unicode or list:
                * The format is "Phys Lett B 456 2010 5-6".
                * A list, one reference per entry, when the field 773 is
                  a list (record with erratum).
                * The string is empty when the publication is not
                  published in a review.

        """
        if u"773" not in self:
            return u""

        entries = self[u"773"]

        # record with erratum contains a list of dictionaries
        is_erratum = isinstance(entries, list)
        if not is_erratum:
            entries = [entries]

        references = []
        for di in entries:
            parts = [di[k] for k in ("p", "v", "y", "c") if k in di]
            references.append(u' '.join(parts))

        return references if is_erratum else references[0]

    def paper_url(self):
        """The URL of the preprint.

        The entries of the MARC field 8564 are scanned in the record order
        and the first one satisfying one of the following rules is used:

        * sub-field ``u`` is a list: the URL is built from ARXIV_PDF and the
          arXiv number found in the preprint number, when there is one;
        * sub-field ``y`` is equal to "Preprint" (cds.cern.ch);
        * sub-field ``y`` is missing and the URL ends with
          ``<preprint number>.pdf`` (inspirehep.net).

        Note:
            Many others URL exists mainly those related to open access.

        Returns:
            unicode: the string is empty when no URLs are found.

        """
        if u"8564" not in self:
            return u""

        # a single URL is stored as a dictionary, several as a list
        entries = self[u"8564"]
        if isinstance(entries, dict):
            entries = [entries]

        if not isinstance(entries, list):
            return u""

        pdf = "%s.pdf" % self.preprint_number()

        for el in entries:
            # entry without URL -- skip it
            if "u" not in el:
                continue

            url = el["u"]

            # protection see http://cds.cern.ch/record/2014733
            # the URL is a list: rely on the arXiv number only
            if isinstance(url, list):
                m = REG_ARXIV_NUMBER.search(pdf)
                if m:
                    return "%s%s" % (ARXIV_PDF, m.group())
                continue

            # cds.cern.ch
            if "y" in el:
                if el["y"] == u"Preprint":
                    return url

            # inspirehep.net
            elif url.endswith(pdf):
                return url

        # nothing found in the list of URLs
        return u""

    def paper_volume(self):
        """Return the volume number of the published paper.

        The value is the sub-field ``v`` of the MARC field 773 as delivered
        by ``self._get``.

        Returns:
            unicode or list:
                - A list when there are erratum.
                - Empty string when nothing is found.

        """
        volume = self._get(u"773", "v")
        return volume

    def paper_year(self):
        """Return the year in which the paper was published.

        The value is the sub-field ``y`` of the MARC field 773.

        Returns:
            unicode or list:
                - A list when there are erratum.
                - Empty string if the year is not defined.

        """
        years = self._get(u"773", "y")

        # a single value, nothing to clean
        if not isinstance(years, list):
            return years

        # record with erratum, the list is the expected answer
        if isinstance(self["773"], list):
            return years

        # protection
        # in record http://cds.cern.ch:record/1951625 the entry 773y
        # is duplicate but there is no erratum
        unique = list(set(years))
        return unique[0] if len(unique) == 1 else unique

    def preprint_number(self):
        """Return the ArXiv preprint number.

        The sub-field ``a`` of the MARC fields 037 then 088 is scanned and
        the first value containing ARXIV is returned.

        Returns:
            unicode: empty string when it is not defined.

        """
        # for both CDS and INSPIREHEP preprint data are in 037 a
        # for CDS preprint information are also stored in 088 a
        for field in (u"037", u"088"):
            candidates = self._get(field, "a", force_list=True)
            for number in candidates:
                if ARXIV in number:
                    return number

        return u''

    def report_number(self):
        """Return the report number(s) associated to the publication.

        Values starting with ARXIV, and the 037 entries whose sub-field
        ``9`` is equal to ARXIV, are preprint numbers and are ignored.

        Returns:
            unicode:
                - Numbers are separated by a comma
                - Number are sorted in alphabetic order.
                - Empty string when not defined.

        """
        numbers = []

        # cds.cern.ch
        # report number can be in 37a, 88a and 88 9
        # entry can be the preprint number arXiv:xxx
        if self.host().startswith("cds"):
            numbers = [elt
                       for elt in self._get(u"088", "a", force_list=True)
                       if not elt.startswith(ARXIV)]

            # if empty have a look to "088" "9"
            # logic to avoid version number in 88/9
            # 88/a = LHCB-PAPER-2015-016 while 88/9 = LHCB-PAPER-2015-016-003
            if not numbers:
                numbers = [elt
                           for elt in self._get(u"088", "9", force_list=True)
                           if not elt.startswith(ARXIV)]

        # inspirehep.net / cds.cern.ch -- example of MARC structure:
        # 037__ $$aLHCB-PAPER-2014-047
        # 037__ $$aCERN-PH-EP-2014-221
        # 037__ $$9arXiv$$aarXiv:1410.0149$$chep-ex
        if u"037" in self:
            entries = self[u"037"]

            # one entry is a dictionary, several entries are a list
            if isinstance(entries, dict):
                entries = [entries]
            elif not isinstance(entries, list):
                entries = []

            for di in entries:
                if "9" in di and di["9"] == ARXIV:
                    continue

                if "a" in di and not di["a"].startswith(ARXIV):
                    numbers.append(di["a"])

        return ', '.join(sorted(numbers))

    def submitted(self):
        """The date of submission.

        The value is the sub-field ``c`` of the MARC field 269, requested
        from ``self._get`` with ``force_list=True``.

        Returns:
            list:
                * The format of a date is "YYYY-MM" or "YYYY-MM-DD"
                * Several dates when there are erratum.
                * Empty list when not defined.

        """
        return self._get(u"269", "c", force_list=True)
633 634 635 636

    def title(self):
        """The title of the publication.

        The value is the sub-field ``a`` of the MARC field 245.

        Returns:
            unicode or list:
                * A list when there are erratum.
                * Empty string when not defined.
                * The filter CLEAN_SPACES is applied to a string and to
                  each element of a list.

        """
        val = self._get(u"245", "a")

        if isinstance(val, (unicode, str)):
            return CLEAN_SPACES(val)

        # the cleaned titles are returned in a new list,
        # the data of the record are left untouched
        if isinstance(val, list):
            return [CLEAN_SPACES(el) for el in val]

        return val

    def year(self):
        """The year of the publication.

        The value is the sub-field ``c`` of the MARC field 260.
        When several values are found, the smallest one is kept.
        The year is then extracted with REG_YEAR, when the pattern matches.

        Returns:
            unicode: empty string when it is not defined.

        """
        val = self._get(u"260", "c")

        # several values: keep the smallest one without modifying
        # the list delivered by _get
        if isinstance(val, list):
            val = min(val) if val else u""

        # several form are possible 2014, 2014-12 or 2014-12-31
        if val:
            match = REG_YEAR.search(val)
            if match:
                val = match.group(1)

        return val