checkandfix.py 29.2 KB
Newer Older
1
""" harvest_tools.checkandfix
2 3

"""
4
import numpy as np
5 6
import re

7
from .base import search_synonym, ToolException
8
from datetime import datetime
9
from .exception import CheckException
10
from gluon import current
11
from invenio_tools import (MSG_NO_CONF,
12 13 14 15 16 17
                           MSG_NO_THESIS,
                           OAI_URL,
                           RecordConf,
                           RecordThesis,
                           REG_OAI,
                           REG_YEAR)
18 19 20 21

from invenio_tools.recordpubli import PAPER_REFERENCE_KEYS

from itertools import imap
22
from plugin_dbui import CLEAN_SPACES, get_id, UNDEF_ID
23 24


25
# arXiv identifier of the form arXiv:YYMM.: captures YY and MM
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")

# Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")

# Three-letter month abbreviation (English or French) -> two-digit month
MONTHS = {"Jan": "01",
          "Feb": "02",
          "Fev": "02",
          "Mar": "03",
          "Apr": "04",
          "Avr": "04",
          "May": "05",
          "Mai": "05",
          "Jun": "06",
          "Jul": "07",
          "Aug": "08",
          "Sep": "09",
          "Oct": "10",
          "Nov": "11",
          "Dec": "12"}

# Messages attached to the exceptions raised by the checks
MSG_FAUTHOR_COLLABORATION = "Reject first author is a Collaboration"
MSG_NO_AUTHOR = "Reject no author(s)"
MSG_NO_CONF_DATE = "Reject no conference date"
MSG_NO_DATE = "Reject no submission date"
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
MSG_NO_REF = "Reject incomplete paper reference. Check "

MSG_TEMPORARY_RECORD = "Temporary record"
MSG_UNKNOWN_COLLABORATION = "Reject collaboration is unknown."
MSG_UNKNOWN_COUNTRY = "Reject country is unknown."
MSG_UNKNOWN_PUBLISHER = "Reject publisher is unknown."
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"

# Conference dates within one month, e.g. 12 - 15 Mar 2014
REG_CONF_DATES_1 = \
    re.compile(r"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")

# Conference dates spanning two months, e.g. 29 Feb - 1 Mar 2014
REG_CONF_DATES_2 = \
    re.compile(r"(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")

# DOI of the form xx.yyyy/Publisher.Volume.Page
REG_DOI = re.compile(r"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)")

# Canonical form of the conference dates: 12-15 Mar 2014
REG_WELL_FORMED_CONF_DATES_1 = \
    re.compile(r"\d{1,2}-\d{1,2} [A-Z][a-z]{2} \d{4}")

# Canonical form of the conference dates: 29 Feb - 1 Mar 2014
REG_WELL_FORMED_CONF_DATES_2 = \
    re.compile(r"\d{1,2} [A-Z][a-z]{2} - \d{1,2} [A-Z][a-z]{2} \d{4}")

UNIVERSITY = "University"

75 76

class CheckAndFix(object):
77 78 79 80 81
    """A collection of tools to check and repair the content of record.

    Args:
        debug (bool):
            activate the debug mode.
82

83
    """
84
    def __init__(self, debug=False):
        """Set up the database handle, the debug flag and the caches.

        Args:
            debug (bool):
                activate the debug mode.

        """
        self.dbg = debug
        self.db = current.db

        # one-slot private cache for the authors rescue list:
        # the lookup parameters and the list found for them
        self.__par, self.__reference = None, None

        # private cache for my authors list, indexed by the record id
        self._my_authors = {}

        # needs self.db, therefore computed once the handle is in place
        self.reg_institute = self._get_reg_institute()
96

97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
    @staticmethod
    def _get_conference_dates(record):
        """Return the opening and closing dates of a conference.

        Args:
            record (RecordConf):
                record describing a conference proceeding or talk.

        Returns:
            tuple of datetime.date:
                opening and closing dates.

        Raise:
            ToolException:
                no conference date found.

        """
        if u"meeting_name" not in record:
            raise ToolException(MSG_NO_CONF_DATE)

        meeting = record[u"meeting_name"]
        meeting = (meeting[0] if isinstance(meeting, list) else meeting)

        # CDS has the opening and closing dates encoded as 20141231
        if u"opening_date" in meeting and u"closing_date" in meeting:

            fmt = "%Y%m%d"

            val = meeting[u"opening_date"]
            opening = datetime.strptime(val, fmt)

            val = meeting[u"closing_date"]
            closing = datetime.strptime(val, fmt)

            return (opening, closing)

        # both CDS and INSPIRE have the dates subfield
        val = meeting[u"date"]

        # date is encode as 12 - 15 Mar 2014
        m = REG_CONF_DATES_1.match(val)
        if m:

            fmt = "%d-%b-%Y"

            val = u"%s-%s-%s" % (m.group(1), m.group(3), m.group(4))
            opening = datetime.strptime(val, fmt)

            val = u"%s-%s-%s" % (m.group(2), m.group(3), m.group(4))
            closing = datetime.strptime(val, fmt)

            return (opening, closing)

        # dates are encoded 29 Feb - 1 Mar 2014
        m = REG_CONF_DATES_2.match(val)
        if not m:
            raise ToolException(MSG_NO_CONF_DATE)

        fmt = "%d-%b-%Y"

        val = u"%s-%s-%s" % (m.group(1), m.group(2), m.group(5))
        opening = datetime.strptime(val, fmt)

        val = u"%s-%s-%s" % (m.group(3), m.group(4), m.group(5))
        closing = datetime.strptime(val, fmt)

        return (opening, closing)

165
    def _get_reg_institute(self):
        """Get the regular expression defining the affiliation of my institute.

        The value of ``current.app.reg_institute`` is returned when it is
        set. Otherwise the expression is built from the rows of the
        ``affiliation_keys`` table: in each key the characters
        ``( ) & $ + ?`` are protected by a backslash, the key is anchored
        to the start / end of the string or to a ``|`` separator for an
        exact match, and the alternatives are joined with ``|``.

        Returns:
            str:
                the regular expression.

        """
        configured = current.app.reg_institute
        if configured:
            return configured

        # alias
        db = self.db

        patterns = []
        for row in db(db.affiliation_keys.id > 0).iterselect():

            # protect special characters
            key = row.key_u
            for char in "()&$+?":
                key = key.replace(char, "\\" + char)

            # add start and end of string for an exact match
            patterns.append(r"(^|\|){}($|\|)" .format(key))

        return r"|".join(patterns)

    def _get_author_rescue_list(self, record, id_project, id_team):
        """Get the rescue list for my authors.

        The list is read from the ``my_authors`` table for the year of the
        record, the project and the team. The year is the submission year
        or, when it is not defined, the year of the paper reference, of
        the conference opening date or of the conference closing date.

        The last lookup is cached: calling again with the same year,
        project and team returns the stored list without querying the
        database.

        Args:
            record (RecordPubli):
                record describing a publication.

            id_project (int):
                identifier of the project in the database.

            id_team (int):
                identifier of the team in the database.

        Returns:
            list:
                empty when not defined

        """
        year = record.submitted()

        # try to recover year when not defined
        if not year:

            # published article, proceeding
            year = record[u"publication_info"].year.iloc[0]

            # start date of a conference
            if year == "":
                year = record._get(u"meeting_name", u"opening_date")

            # end date of a conference
            if year == u"":
                year = record._get(u"meeting_name", u"closing_date")

            if year == u"":
                return []

        #
        # protection
        # submitted and paper year are protect against erratum, but ...
        # keep the smallest value without reordering the record's own list
        #
        if isinstance(year, list):
            year = min(year)

        # the value can have several format 1992, 1992-12-31, ....
        m = REG_YEAR.search(year)
        if not m:
            return []

        year = m.group(1)

        # caching
        key = (year, id_project, id_team)
        if key == self.__par:
            return self.__reference

        # extract the list from the database
        row = self.db.my_authors(year=year,
                                 id_projects=id_project,
                                 id_teams=id_team)

        if row:
            self.__reference = row['authors'].strip("\n"). split(', ')
        else:
            self.__reference = []

        # remember the parameters so that the next identical request
        # is served from the cache
        self.__par = key

        return self.__reference
275

276 277 278 279
    def _is_synonym(self, tablename, value):
        """Check that the synonym field contains *value*.

        Args:
280 281
            tablename (str): name of the database table
            value (str): value to be searched
282 283 284 285 286

        Returns:
            bool: ``True`` if *one* row is found, ``False`` otherwise.

        """
287
        query = self.db[tablename].synonyms.contains(value)
288 289 290 291 292
        if db(query).count() == 1:
            return True

        return False

293
    def _recover_submitted(self, record):
        """Recover submitted date using conference, preprint or thesis
        information.

        The source depends on the type of record: the opening date for a
        conference, the defence date for a thesis, and the arXiv preprint
        number (``arXiv:YYMM.``) otherwise. The first seven characters of
        the record creation date are used when the result is shorter than
        ``YYYY-MM``.

        Args:
            record (RecordPubli):
                record describing a publication.

        Returns:
            unicode:
                target at least YYYY-MM

        """
        if isinstance(record, RecordConf):
            opening = self._get_conference_dates(record)[0]
            val = opening.strftime("%Y-%m-%d")

        elif isinstance(record, RecordThesis):
            val = record.these_defense()

        else:
            val = u""
            report = record.preprint_number()
            m_arxiv = DECODE_ARXIV.match(report) if report else None
            if m_arxiv:
                val = "20%s-%s" % m_arxiv.groups()

        # last chance: use the creation date of the record
        if len(val) < 7:
            val = record[u"creation_date"][0:7]

        return val

329
    def authors(self, record):
        """Check that:

            * author fields are defined.
            * first author is not like ATLAS Collaboration

        Args:
            record (RecordPubli):
                record describing a publication.

        Raises:
            CheckException:
                when there is no authors or when the name of the first
                author contains the word collaboration (any case).

        """
        if self.dbg:
            print("\t\tCheck authors")

        if not record.is_authors():
            raise CheckException(MSG_NO_AUTHOR)

        first_author = record.first_author().lower()
        if "collaboration" in first_author:
            raise CheckException(MSG_FAUTHOR_COLLABORATION)

353
    def collaboration(self, record):
        """Check the collaboration and replace a synonym by the proper value.

        Nothing is done when the record has no collaboration. When the
        value found in the database differs from the one of the record,
        the ``corporate_name`` field is updated; a list of collaborations
        is replaced by a single dictionary.

        Args:
            record (RecordPubli):
                record describing a publication.

        Raises:
            CheckException:
                the collaboration is unknown (``search_synonym`` returns
                ``UNDEF_ID``), or a ``ToolException`` is raised while
                searching for it.

        """
        if self.dbg:
            print("\t\tCheck collaboration")

        val = record.collaboration()
        if not val:
            return

        try:
            table = self.db.collaborations
            dbid = search_synonym(table, "collaboration", val)

            if dbid == UNDEF_ID:
                raise ToolException(MSG_UNKNOWN_COLLABORATION)

            proper = table[dbid].collaboration
            if proper != val:

                # one collaboration
                if isinstance(record[u"corporate_name"], dict):
                    record[u"corporate_name"][u"collaboration"] = proper

                # several collaboration
                # replace the list of dictionary by a single one
                else:
                    record[u"corporate_name"] = {u"collaboration": proper}

        except ToolException as err:
            raise CheckException(*err.args)

395
    def country(self, record):
        """Check the conference country and replace a synonym by the
        proper value.

        Only applies to ``RecordConf``. When the country found in the
        database differs from the one of the record, it is substituted in
        the ``location`` subfield of every ``meeting_name`` entry having
        one.

        Args:
            record (RecordPubli):
                record describing a publication.

        Raises:
            CheckException:
                the country is unknown (``search_synonym`` returns
                ``UNDEF_ID``), or a ``ToolException`` is raised while
                searching for it.

        """
        if self.dbg:
            print("\t\tCheck country")

        if not isinstance(record, RecordConf):
            return

        val = record.conference_country()

        try:
            table = self.db.countries
            dbid = search_synonym(table, "country", val)

            if dbid == UNDEF_ID:
                raise ToolException(MSG_UNKNOWN_COUNTRY)

            country = table[dbid].country
            if country == val:
                return

            obj = record[u"meeting_name"]

            # one conference
            if isinstance(obj, dict):
                location = obj[u"location"].replace(val, country)
                record[u"meeting_name"][u"location"] = location
                return

            # several conferences
            for di in obj:
                if u"location" in di:
                    di[u"location"] = di[u"location"].replace(val, country)

            record[u"meeting_name"] = obj

        except ToolException as err:
            raise CheckException(*err.args)
442

443
    def conference_date(self, record):
        """Check conference date exists and well formatted.

        Only applies to ``RecordConf``. Dates which are not written as
        ``12-15 Mar 2014`` or ``29 Feb - 1 Mar 2014`` are decoded and the
        ``date`` subfield of the first ``meeting_name`` entry is rewritten
        in one of these two forms.

        Args:
            record (RecordConf):
                record describing a talk or a proceeding.

        Raises:
            CheckException:
                dates are not found.

            ToolException:
                raised by ``_get_conference_dates`` when the dates cannot
                be decoded; it is not converted here.

        """
        if self.dbg:
            print("\t\tCheck conference date")

        # conference information are available, i.e proceeding
        if not isinstance(record, RecordConf):
            return

        val = record.conference_dates()
        if len(val) == 0:
            raise CheckException(MSG_NO_CONF_DATE)

        # is it well formed
        well_formed = (REG_WELL_FORMED_CONF_DATES_1,
                       REG_WELL_FORMED_CONF_DATES_2)

        if any(reg.match(val) for reg in well_formed):
            return

        # format the date properly
        opening, closing = self._get_conference_dates(record)
        month = opening.strftime("%b")

        if opening.month == closing.month:
            val = "%i-%i %s %i" % \
                (opening.day, closing.day, month, opening.year)

        else:
            val = "%i %s - %i %s %i" % \
                (opening.day, month,
                 closing.day, closing.strftime("%b"),
                 opening.year)

        meeting = record[u"meeting_name"]
        if isinstance(meeting, list):
            meeting = meeting[0]

        meeting[u"date"] = val
491

492
    def is_bad_oai_used(self, record):
        """Bad OAI is when the ``id`` in the OAI field is different from
        the ``record id``. This happens when an old record is redirected
        to new one.

        Args:
            record (RecordPubli):
                record describing a publication.

        Returns:
            bool:
                ``True`` when a record is found in the database with
                the bad OAI. ``False`` otherwise, including when the OAI
                is not defined or cannot be decoded with ``REG_OAI``.

        """
        if self.dbg:
            print("\t\tCheck is bad oai used")

        value = record.oai()
        match = REG_OAI.match(value) if value else None

        # an OAI which cannot be decoded cannot point to another record
        if match is None:
            return False

        # the OAI refers to the record itself
        if int(match.group(2)) == record.id():
            return False

        # a record with the bad OAI exists in the database
        bad_oai_url = OAI_URL % (match.group(1), match.group(2))
        if get_id(self.db.publications, origin=bad_oai_url):
            return True

        return False
522

523
    def is_oai(self, record):
524
        """``True`` when the OAI is not defined in the record.
525 526 527 528

        Args:
            record (RecordPubli): record describing a publication.

529 530 531
        Returns:
            bool:
                ``True`` when the OAI is not defined in the record.
532
        """
533 534 535
        if self.dbg:
            print "\t\tCheck is oai"

536
        # field / subfield depends on the store
LE GAC Renaud's avatar
LE GAC Renaud committed
537 538
        test = (u"oai" in record and u"value" in record[u"oai"]) or \
               (u"FIXME_OAI" in record and u"id" in record[u"FIXME_OAI"])
539

540
        return test
541

542
    def format_authors(self, record, fmt="Last, First"):
        """Format the author names.

        The work is delegated to ``record.reformat_authors``.

        Args:
            record (RecordPubli):
                record describing a publication.

            fmt (str):
                define the format for author names.
                Possible values are ``First, Last``, ``F. Last``, ``Last``,
                ``Last, First`` and ``Last F.``

        """
        if self.dbg:
            print("\t\tFormat authors")

        record.reformat_authors(fmt)
559

560
    def format_editor(self, record):
        """Format the editor abbreviation.
        The encoding depends on the store::

            INVENIO:    Phys. Lett. B + volume 673
            INSPIREHEP: Phys.Lett + volume B673

        Standardise the answer as ``Phys. Lett. B``.

        Nothing is done when the record is not published. Only the first
        row of ``publication_info`` is processed.

        Args:
            record (RecordPubli):
                record describing a publication.

        """
        if self.dbg:
            print("\t\tFormat editor")

        if not record.is_published():
            return

        df = record[u"publication_info"].iloc[0]
        editor, volume = df.title, df.volume

        # add space after the dot  Phys.Rev -> Phys. Rev
        editor = re.sub(r'\.([A-Z])', r'. \1', editor)

        # move the volume letter to the editor, B673 -> B + 673,
        # unless the editor already ends with it
        m = re.match(r'([A-Z]+) *(\d+)', volume)
        if m:
            letter, number = m.groups()
            if letter != editor[-1]:
                editor = "%s %s" % (editor, letter)
                volume = number

        # remove stupid mistake
        editor = CLEAN_SPACES(editor)

        # NOTE(review): the assignment targets the row returned by iloc[0];
        # confirm that it is propagated to the publication_info table.
        df[["title", "volume"]] = [editor, volume]
602

603 604 605
    def format_universities(self, record):
        """Format the name of the university for PhD:

            * Fix the name of Aix-Marseille University
            * Replace U. by University

        Only applies to ``RecordThesis``. The result is written in the
        ``university`` subfield of the ``dissertation_note`` field.

        Args:
            record (RecordThesis):
                record describing a thesis.

        """
        if self.dbg:
            print("\t\tFormat university")

        # protection
        if not isinstance(record, RecordThesis):
            return

        values = record[u"dissertation_note"][u"university"]

        # CPPM -- fix the name of Aix-Marseille university
        affiliations = record.first_author_institutes()

        if "CPPM" in affiliations:

            # name of the university depends on the year
            year = re.search(r"(\d{4})", record.these_defense()).group(1)

            # both names are unicode literals: a byte string containing
            # accented characters cannot be combined with the unicode
            # strings used in the replace() calls below
            if int(year) < 2012:
                university = \
                    u"Université de la Méditerrannée Aix-Marseille II"
            else:
                university = u"Aix Marseille Université"

            # single affiliation
            affiliations = affiliations.split("|")
            if len(affiliations) == 1:
                values = university

            # multiple affiliation are separated by "|"
            else:
                li = [el for el in affiliations if "CPPM" in el]
                if len(li) == 1:
                    values = values.replace(li[0], university)

        # Other -- replace U. by University
        university = current.T(UNIVERSITY).decode("utf8")
        values = values.replace('U.', university)

        record[u"dissertation_note"][u"university"] = values
653

654
    def get_my_authors(self, record, sep=", ", sort=False):
        """Get authors of my institutes signing the record.
        The information is append to the Record object via the attribute
        ``my_authors``. Nothing is returned.

        The list cached by ``my_affiliation`` for this record is used when
        it exists; in that case the ``sort`` argument is not used.
        Otherwise authors are searched by affiliation.

        Args:
            record (RecordPubli):
                record describing a publication.

            sep (unicode):
                string separating author names. The default is the comma.

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Raises:
            CheckException:
                the list is empty

        """
        if self.dbg:
            print("\t\tGet my authors")

        rec_id = record.id()

        # might have been computed when affiliation is checked
        try:
            value = sep.join(self._my_authors[rec_id])

        # find authors of my institute signing the record
        except KeyError:
            value = record.find_authors_by_affiliation(self.reg_institute,
                                                       sep,
                                                       sort)

        if len(value) == 0:
            raise CheckException(MSG_NO_MY_AUTHOR)

        record.my_authors = value
698

699
    def is_conference(self, record):
        """Check that the record contains conference data.

        The record has to be a ``RecordConf`` with the ``meeting_name``
        field.

        Args:
            record (RecordPubli):
                record describing a publication.

        Raises:
            CheckException:
                the record is not associated to a conference.

        """
        if self.dbg:
            print("\t\tIs conference")

        if not isinstance(record, RecordConf) \
           or u"meeting_name" not in record:
            raise CheckException(MSG_NO_CONF)

720
    def is_thesis(self, record):
        """Check that the record describes a thesis, i.e. that it is
        a ``RecordThesis``.

        Args:
            record (RecordPubli):
                record describing a publication.

        Raises:
            CheckException:
                the record does not describe a thesis.

        """
        if self.dbg:
            print("\t\tIs thesis")

        if isinstance(record, RecordThesis):
            return

        raise CheckException(MSG_NO_THESIS)

738 739 740 741 742 743 744
    def my_affiliation(
            self,
            record,
            id_project,
            id_team,
            fmt_rescue="F. Last",
            sort=False):
        """Check that authors of my institute are signatories.

        Launch a recovery procedure when affiliations are not defined.
        It is based on the author rescue list stored in the database.
        The authors found in the rescue list are cached, per record id,
        for a later use by ``get_my_authors``. Their order follows the
        ``sort`` argument.

        Args:
            record (RecordPubli):
                record describing a publication.

            id_project (int):
                identifier of the project in the database

            id_team (int):
                identifier of the team in the database

            fmt_rescue (str):
                the format for the authors used in the rescue list

            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            str:
                * the found affiliation
                * an empty string when the rescue list is used.

        Raises:
            CheckException:
                when the rescue list is required but empty
                or because the intersection between the rescue list
                and the author is null.

        """
        if self.dbg:
            print("\t\tCheck my affiliation")

        value = record.find_affiliation(self.reg_institute)
        if len(value) > 0:
            return value

        # affiliation is not defined
        # try to recover using the authors rescue list
        rescue_list = self._get_author_rescue_list(record, id_project, id_team)
        if not rescue_list:
            raise CheckException(MSG_NO_MY_AUTHOR)

        # format the author in the same way as the rescue list
        fmt_ref = record._last_fmt_author
        record.reformat_authors(fmt_rescue)

        if sort:
            authors = (record[u"authors"][["last_name", "fmt_name"]]
                       .sort_values(by="last_name")
                       .fmt_name)

        else:
            authors = (record[u"authors"].fmt_name
                       .sort_index())

        # go back to the origin formatting
        record.reformat_authors(fmt_ref)

        # compute the intersection between the authors and the rescue list
        # walking through the authors keeps the order selected above,
        # which a plain set intersection would lose
        rescue = set(rescue_list)
        my_authors = []
        for name in authors:
            if name in rescue and name not in my_authors:
                my_authors.append(name)

        if len(my_authors) == 0:
            raise CheckException(MSG_NO_MY_AUTHOR)

        # cache the result for a latter use
        self._my_authors[record.id()] = my_authors

        return ""
818

819
    def paper_reference(self, record):
        """Check that editor, page, volume and paper year are defined
        for a published paper. Repair it from doi when possible.

        Nothing is done when ``record.is_published()`` is true or when
        the record has no ``doi``. Otherwise the missing title, volume
        and pagination are extracted from a doi of the form
        ``xx.yyyy/Publisher.Volume.Page``.

        Args:
            record (RecordPubli):
                record describing a publication.

        Raises:
            ToolException:
                the doi does not have the expected form or the paper
                year is missing.

        """
        if self.dbg:
            print("\t\tCheck paper reference")

        if record.is_published():
            return

        # paper reference can be incomplete or missing
        # is the paper published ? In that case the doi is defined
        if u"doi" not in record:
            return

        # what information is missing ?
        # * df.columns are title, volume, year and pagination
        # * df can contains one or more rows due to erratum.
        # * assume that the first row is the oldest one and corresponds to
        #   the first publication
        # * the row contains empty string when the record is not published.
        # * iloc[0] returns a serie where the index are the column's name
        #
        first_row = record[u"publication_info"].iloc[0]
        columns = first_row.replace("", np.nan).dropna().index

        missing = PAPER_REFERENCE_KEYS.difference(columns)

        # try to recover from the doi when it has the form
        # xx.yyyy/Publisher.Volume.Page
        m = REG_DOI.match(record[u"doi"])
        if not m:
            raise ToolException(MSG_NO_REF + str(list(missing)))

        # transform PhysRevD in Phys. Rev. D
        li = re.split(r"([A-Z][a-z]+)", m.group(1))

        recovered = {
            "title": ". ".join([el for el in li if len(el) > 0]),
            "volume": m.group(2),
            "pagination": m.group(3)}

        for subfield in missing:

            # the year cannot be recovered from the doi
            if subfield == "year":
                raise ToolException(MSG_NO_REF + "[year]")

            if subfield in recovered:
                record[u"publication_info"].loc[0, subfield] = \
                    recovered[subfield]
880

881
    def publisher(self, record):
882
        """Check synonyms for publisher by replacing by the abbreviation value.
883 884

        Args:
885 886
            record (RecordPubli):
                record describing a publication.
887 888

        Raises:
LE GAC Renaud's avatar
LE GAC Renaud committed
889 890
            CheckException::