checkandfix.py 28.9 KB
Newer Older
1
# -*- coding: utf-8 -*-
2
""" harvest_tools.checkandfix
3 4

"""
5
import numpy as np
6 7
import re

8
from .base import search_synonym, ToolException
9
from datetime import datetime
10
from .exception import CheckException
11
from gluon import current
12
from invenio_tools import (MSG_NO_CONF,
13 14 15 16 17 18
                           MSG_NO_THESIS,
                           OAI_URL,
                           RecordConf,
                           RecordThesis,
                           REG_OAI,
                           REG_YEAR)
19 20 21 22

from invenio_tools.recordpubli import PAPER_REFERENCE_KEYS

from itertools import imap
23
from plugin_dbui import CLEAN_SPACES, get_id, UNDEF_ID
24 25


26
# Decode the year (YY) and month (MM) of an arXiv identifier: arXiv:YYMM.
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")

# Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")

# Three-letter month abbreviations (English and French spellings)
# mapped to the two-digit month number.
MONTHS = {"Jan": "01",
          "Feb": "02",
          "Fev": "02",
          "Mar": "03",
          "Apr": "04",
          "Avr": "04",
          "May": "05",
          "Mai": "05",
          "Jun": "06",
          "Jul": "07",
          "Aug": "08",
          "Sep": "09",
          "Oct": "10",
          "Nov": "11",
          "Dec": "12"}

# Messages attached to the exceptions raised by the checks
MSG_NO_AUTHOR = "Reject no author(s)"
MSG_NO_CONF_DATE = "Reject no conference date"
MSG_NO_DATE = "Reject no submission date"
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
MSG_NO_OAI = "Reject no OAI identifier"
MSG_NO_REF = "Reject incomplete paper reference. Check "

MSG_TEMPORARY_RECORD = "Temporary record"
MSG_UNKNOWN_COLLABORATION = "Reject collaboration is unknown."
MSG_UNKNOWN_COUNTRY = "Reject country is unknown."
MSG_UNKNOWN_PUBLISHER = "Reject publisher is unknown."
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"

# Conference dates within a single month, e.g. "12 - 15 Mar 2014"
REG_CONF_DATES_1 = re.compile(
    r"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")

# Conference dates spanning two months, e.g. "29 Feb - 1 Mar 2014"
REG_CONF_DATES_2 = re.compile(
    r"(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")

# DOI of the form xx.yyyy/Publisher.Volume.Page
REG_DOI = re.compile(r"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)")

# Canonical form of the conference dates: "12 - 15 Mar 2014"
REG_WELL_FORMED_CONF_DATES_1 = re.compile(
    r"\d{2} - \d{2} [A-Z][a-z]{2} \d{4}")

# Canonical form of the conference dates: "29 Feb - 01 Mar 2014"
REG_WELL_FORMED_CONF_DATES_2 = re.compile(
    r"\d{2} [A-Z][a-z]{2} - \d{2} [A-Z][a-z]{2} \d{4}")

UNIVERSITY = "University"

75 76

class CheckAndFix(object):
77 78 79 80 81
    """A collection of tools to check and repair the content of record.

    Args:
        debug (bool):
            activate the debug mode.
82

83
    """
84
    def __init__(self, debug=False):
        """Bind the database, the debug flag and initialise the caches.

        Args:
            debug (bool):
                activate the debug mode.

        """
        # cache of the authors of my institute, keyed by record id
        self._my_authors = {}

        # cache for the author rescue list:
        # the query parameters and the list obtained for them
        self.__par = None
        self.__reference = None

        self.dbg = debug

        # the database has to be bound before building the regular
        # expression since the latter may query the affiliation keys
        self.db = current.db
        self.reg_institute = self._get_reg_institute()
96

97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
    @staticmethod
    def _get_conference_dates(record):
        """Return the opening and closing dates of a conference.

        Args:
            record (RecordConf):
                record describing a conference proceeding or talk.

        Returns:
            tuple of datetime.date:
                opening and closing dates.

        Raise:
            ToolException:
                no conference date found.

        """
        if u"meeting_name" not in record:
            raise ToolException(MSG_NO_CONF_DATE)

        meeting = record[u"meeting_name"]
        meeting = (meeting[0] if isinstance(meeting, list) else meeting)

        # CDS has the opening and closing dates encoded as 20141231
        if u"opening_date" in meeting and u"closing_date" in meeting:

            fmt = "%Y%m%d"

            val = meeting[u"opening_date"]
            opening = datetime.strptime(val, fmt)

            val = meeting[u"closing_date"]
            closing = datetime.strptime(val, fmt)

            return (opening, closing)

        # both CDS and INSPIRE have the dates subfield
        val = meeting[u"date"]

        # date is encode as 12 - 15 Mar 2014
        m = REG_CONF_DATES_1.match(val)
        if m:

            fmt = "%d-%b-%Y"

            val = u"%s-%s-%s" % (m.group(1), m.group(3), m.group(4))
            opening = datetime.strptime(val, fmt)

            val = u"%s-%s-%s" % (m.group(2), m.group(3), m.group(4))
            closing = datetime.strptime(val, fmt)

            return (opening, closing)

        # dates are encoded 29 Feb - 1 Mar 2014
        m = REG_CONF_DATES_2.match(val)
        if not m:
            raise ToolException(MSG_NO_CONF_DATE)

        fmt = "%d-%b-%Y"

        val = u"%s-%s-%s" % (m.group(1), m.group(2), m.group(5))
        opening = datetime.strptime(val, fmt)

        val = u"%s-%s-%s" % (m.group(3), m.group(4), m.group(5))
        closing = datetime.strptime(val, fmt)

        return (opening, closing)

165
    def _get_reg_institute(self):
        """Get the regular expression defining the affiliation of my institute.

        The value defined by the application (``current.app.reg_institute``)
        is used when it is set. Otherwise, the expression is obtained by
        concatenating the affiliation keys stored in the database.
        The characters ``(``, ``)``, ``&``, ``$``, ``+`` and ``?`` found
        in a key are escaped with a backslash.

        Returns:
            str:

        """
        db = self.db
        pattern = current.app.reg_institute

        if pattern:
            return pattern

        # one alternative per affiliation key:
        # - protect special characters
        # - add start / end of string (or a pipe) for an exact match
        alternatives = []
        for row in db(db.affiliation_keys.id > 0).iterselect():
            key = row.key_u

            for char in "()&$+?":
                key = key.replace(char, "\\" + char)

            alternatives.append(r"(^|\|){}($|\|)" .format(key))

        return r"|".join(alternatives)

    def _get_author_rescue_list(self, record, id_project, id_team):
        """Get the rescue list for my authors.

        The list is read from the ``my_authors`` table for the year of
        the record, the project and the team. The last result is cached
        together with its parameters, so that consecutive records with
        the same year, project and team do not query the database again.

        Args:
            record (RecordPubli):
                record describing a publication.

            id_project (int):
                identifier of the project in the database.

            id_team (int):
                identifier of the team in the database.

        Returns:
            list:
                empty when not defined

        """
        year = record.submitted()

        # try to recover the year when not defined using, in that order,
        # the publication year, the opening and the closing date of
        # the conference
        if not year:
            year = record[u"publication_info"].year.iloc[0]

            if year == "":
                year = record._get(u"meeting_name", u"opening_date")

            if year == u"":
                year = record._get(u"meeting_name", u"closing_date")

            if year == u"":
                return []

        #
        # protection
        # submitted and paper year are protect against erratum, but ...
        # keep the oldest value without modifying the list of the record
        #
        if isinstance(year, list):
            year = sorted(year)[0]

        # the value can have several format 1992, 1992-12-31, ....
        m = REG_YEAR.search(year)
        if not m:
            return []

        year = m.group(1)

        # caching
        key = (year, id_project, id_team)
        if key == self.__par:
            return self.__reference

        # extract the list from the database
        row = self.db.my_authors(year=year,
                                 id_projects=id_project,
                                 id_teams=id_team)

        if row:
            self.__reference = row['authors'].strip("\n").split(', ')
        else:
            self.__reference = []

        # remember the parameters associated to the cached list,
        # otherwise the cache is never hit
        self.__par = key

        return self.__reference
275

276 277 278 279
    def _is_synonym(self, tablename, value):
        """Check that the synonym field contains *value*.

        Args:
280 281
            tablename (str): name of the database table
            value (str): value to be searched
282 283 284 285 286

        Returns:
            bool: ``True`` if *one* row is found, ``False`` otherwise.

        """
287
        query = self.db[tablename].synonyms.contains(value)
288 289 290 291 292
        if db(query).count() == 1:
            return True

        return False

293
    def _recover_submitted(self, record):
        """Recover submitted date using conference, preprint or thesis
        information.

        The opening date is used for a conference, the defence date for
        a thesis and the arXiv preprint number in the other cases.
        The creation date of the record (``YYYY-MM``) is used when the
        value obtained is shorter than seven characters.

        Args:
            record (RecordPubli):
                record describing a publication.

        Returns:
            unicode:
                target at least YYYY-MM

        """
        if isinstance(record, RecordConf):
            start = self._get_conference_dates(record)[0]
            found = start.strftime("%Y-%m-%d")

        elif isinstance(record, RecordThesis):
            found = record.these_defense()

        else:
            found = u""
            report = record.preprint_number()
            m_arxiv = DECODE_ARXIV.match(report) if report else None
            if m_arxiv:
                found = "20%s-%s" % (m_arxiv.group(1), m_arxiv.group(2))

        # last chance: use the creation date of the record.
        # An empty value is shorter than seven characters too.
        if len(found) < 7:
            found = record[u"creation_date"][0:7]

        return found

329
    def authors(self, record):
330
        """Check that author fields are defined.
331

332
        Args:
333 334
            record (RecordPubli):
                record describing a publication.
335

336
        Raises:
337 338
            CheckException:
                when there is no authors.
339 340

        """
341 342
        if self.dbg:
            print "\t\tCheck authors"
343

344
        if not record.is_authors():
345 346 347
            raise CheckException(MSG_NO_AUTHOR)

    def collaboration(self, record):
348
        """Check synonyms for collaboration by using by the proper value.
349

350
        Args:
351 352
            record (RecordPubli):
                record describing a publication.
353

354
        Raises:
355
            CheckException:
356 357 358
                * the collaboration is unknown
                  (neither collaboration nor synonym)
                * more than one synonym found.
359 360

        """
361 362 363
        if self.dbg:
            print "\t\tCheck collaboration"

364
        val = record.collaboration()
365 366 367
        if not val:
            return

368
        try:
369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386
            db = self.db
            dbid = search_synonym(db.collaborations, "collaboration", val)

            if dbid == UNDEF_ID:
                raise ToolException(MSG_UNKNOWN_COLLABORATION)

            collaboration = db.collaborations[dbid].collaboration
            if collaboration != val:

                # one collaboration
                if isinstance(record[u"corporate_name"], dict):
                    record[u"corporate_name"][u"collaboration"] = collaboration

                # several collaboration
                # replace the list of dictionary by a single one
                else:
                    record[u"corporate_name"] = \
                        {u"collaboration": collaboration}
387

388 389 390
        except ToolException as e:
            raise CheckException(*e.args)

391
    def country(self, record):
        """Check the conference country.

        Have a look to the synonyms when the country does not exist and
        replace the synonym by the proper value in the location of
        the meeting. Nothing is done when the record is not a
        conference talk or proceeding.

        Args:
            record (RecordConf):
                record describing a talk or a proceeding.

        Raises:
            CheckException:
                the country is unknown (neither a country nor a synonym),
                or the synonym search failed.

        """
        if self.dbg:
            print("\t\tCheck country")

        if not isinstance(record, RecordConf):
            return

        val = record.conference_country()

        try:
            db = self.db
            dbid = search_synonym(db.countries, "country", val)

            if dbid == UNDEF_ID:
                raise ToolException(MSG_UNKNOWN_COUNTRY)

            country = db.countries[dbid].country

            if country != val:
                obj = record[u"meeting_name"]

                # one meeting or several ones: in both cases only
                # the meetings having a location are modified
                meetings = ([obj] if isinstance(obj, dict) else obj)
                for di in meetings:
                    if u"location" in di:
                        di[u"location"] = \
                            di[u"location"].replace(val, country)

                record[u"meeting_name"] = obj

        except ToolException as e:
            raise CheckException(*e.args)
450

451 452
    def conference_date(self, record):
        """Check the conference dates and rewrite them in the canonical form.

        The canonical forms are ``12 - 15 Mar 2014`` when the conference
        opens and closes in the same month and ``29 Feb - 01 Mar 2014``
        otherwise. Nothing is done when the record is not a conference
        talk or proceeding.

        Args:
            record (RecordConf):
                record describing a talk or a proceeding.

        Raises:
            CheckException:
                dates are not found.

        """
        if self.dbg:
            print("\t\tCheck conference date")

        # conference information are available, i.e proceeding
        if not isinstance(record, RecordConf):
            return

        current_dates = record.conference_dates()
        if len(current_dates) == 0:
            raise CheckException(MSG_NO_CONF_DATE)

        # nothing to do when the dates are well formed
        well_formed = (REG_WELL_FORMED_CONF_DATES_1,
                       REG_WELL_FORMED_CONF_DATES_2)

        for regex in well_formed:
            if regex.match(current_dates):
                return

        # format the dates properly
        start, end = self._get_conference_dates(record)

        if start.month == end.month:
            fields = (start.day, end.day, start.strftime("%b"), start.year)
            text = "%02i - %02i %s %i" % fields

        else:
            fields = (start.day,
                      start.strftime("%b"),
                      end.day,
                      end.strftime("%b"),
                      start.year)
            text = "%02i %s - %02i %s %i" % fields

        # store the result in the first meeting
        meeting = record[u"meeting_name"]
        if isinstance(meeting, list):
            meeting = meeting[0]

        meeting[u"date"] = text
499

500
    def is_bad_oai_used(self, record):
        """Tell whether a publication is registered with a bad OAI.

        Bad OAI is when the ``id`` in the OAI field is different from
        the ``record id``. This happens when an old record is redirected
        to new one.

        Args:
            record (RecordPubli):
                record describing a publication.

        Returns:
            bool:
                ``True`` when a record is found in the database with
                the bad OAI.

        """
        if self.dbg:
            print("\t\tCheck is bad oai used")

        match = REG_OAI.match(record.oai())

        # the OAI is consistent with the record identifier
        if int(match.group(2)) == record.id():
            return False

        # is there a publication in the database with the bad OAI ?
        publications = self.db.publications
        bad_oai_url = OAI_URL % (match.group(1), match.group(2))

        return bool(get_id(publications, origin=bad_oai_url))
530

531
    def is_oai(self, record):
532 533 534 535 536 537 538 539 540
        """Raise exception when the OAI is not defined.

        Args:
            record (RecordPubli): record describing a publication.

        Raise:
            ToolException:
                OAI is not defined
        """
541 542 543
        if self.dbg:
            print "\t\tCheck is oai"

544
        # field / subfield depends on the store
LE GAC Renaud's avatar
LE GAC Renaud committed
545 546
        test = (u"oai" in record and u"value" in record[u"oai"]) or \
               (u"FIXME_OAI" in record and u"id" in record[u"FIXME_OAI"])
547 548 549 550

        if not test:
            raise ToolException(MSG_NO_OAI)

551
    def format_authors(self, record, fmt="Last, First"):
552
        """Format the author names.
553

554
        Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
555 556 557
            record (RecordPubli):
                record describing a publication.

558 559 560 561
            fmt (str):
                define the format for author names.
                Possible values are "First, Last", "F. Last", "Last",
                "Last, First" and "Last F."
562 563

        """
564 565 566
        if self.dbg:
            print "\t\tFormat authors"

567
        record.reformat_authors(fmt)
568

569
    def format_editor(self, record):
        """Format the editor abbreviation.
        The encoding depends on the store::

            INVENIO:    Phys. Lett. B + volume 673
            INSPIREHEP: Phys.Lett + volume B673

        Standardise the answer as ``Phys. Lett. B``.

        Only the first row of the ``publication_info`` table is
        processed. Nothing is done when the record is not published.

        Args:
            record (RecordPubli):
                record describing a publication.

        """
        if self.dbg:
            print("\t\tFormat editor")

        if not record.is_published():
            return

        frame = record[u"publication_info"]
        row = frame.iloc[0]

        editor = row[u"title"]
        volume = row[u"volume"]

        # add space after the dot  Phys.Rev -> Phys. Rev
        editor = re.sub(r'\.([A-Z])', r'. \1', editor)

        # get the volume letter
        # the slice protects against an empty editor
        m = re.match(r'([A-Z]+) *(\d+)', volume)
        if m and m.group(1) != editor[-1:]:
            editor = "%s %s" % (editor, m.group(1))
            volume = m.group(2)

        # remove stupid mistake
        editor = CLEAN_SPACES(editor)

        # write through the table itself: the row extracted above can be
        # a copy, in which case modifying it leaves the record unchanged
        frame.iloc[0, frame.columns.get_loc(u"title")] = editor
        frame.iloc[0, frame.columns.get_loc(u"volume")] = volume
611

612 613 614
    def format_universities(self, record):
        """Format the name of the university for PhD:

            * Fix the name of Aix-Marseille University
            * Replace U. by University

        Nothing is done when the record is not a thesis.

        Args:
            record (RecordThesis):
                record describing a thesis.

        """
        if self.dbg:
            print("\t\tFormat university")

        # protection
        if not isinstance(record, RecordThesis):
            return

        values = record[u"dissertation_note"][u"university"]

        # CPPM -- fix the name of Aix-Marseille university
        # the name depends on the year of the defence
        if self._get_reg_institute().find("CPPM") != -1:

            year = REG_YEAR.search(record.these_defense()).group(1)
            if int(year) < 2012:
                university = "Université de la Méditerrannée Aix-Marseille II"
            else:
                university = "Aix Marseille Université"

            values = (university if "Marseille" in values else values)

        # Other -- replace U. by University
        # str.replace returns a new string which has to be kept
        else:
            university = current.T(UNIVERSITY).decode("utf8")
            values = values.replace('U.', university)

        record[u"dissertation_note"][u"university"] = values
649

650
    def get_my_authors(self, record, sep=", ", sort=False):
651
        """Get authors of my institutes signing the record.
652 653
        The information is append to the Record object via the attribute
        ``my_authors``.
654

655
        Args:
656 657 658 659
            record (RecordPubli):
                record describing a publication.

            sep (unicode):
660
                string separating author names. The default is the comma.
661

662 663 664
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record
665

666
        Returns:
667 668
            unicode:
                the list of authors separated by the ``sep`` argument.
669

670
        Raises:
671
            CheckException:
LE GAC Renaud's avatar
LE GAC Renaud committed
672
                the list is empty
673 674

        """
675 676 677
        if self.dbg:
            print "\t\tGet my authors"

678 679
        # might have been computed when affiliation is checked
        rec_id = record.id()
680 681 682
        if rec_id in self._my_authors:
            li = self._my_authors[rec_id]
            value = sep.join(li)
683 684 685 686

        # find authors of my institute signing the record
        else:
            reg_institute = self.reg_institute
687 688
            value = \
                record.find_authors_by_affiliation(reg_institute, sep, sort)
689

690
        if len(value) == 0:
691 692
            raise CheckException(MSG_NO_MY_AUTHOR)

LE GAC Renaud's avatar
LE GAC Renaud committed
693
        record.my_authors = value
694

695
    def is_conference(self, record):
        """Reject a record which does not contain conference data.

        Args:
            record (RecordPubli):
                record describing a publication.

        Raises:
            CheckException:
                the record is not associated to a conference, i.e. it is
                not a ``RecordConf`` or it has no ``meeting_name`` field.

        """
        if self.dbg:
            print("\t\tIs conference")

        is_conf = isinstance(record, RecordConf)

        if not (is_conf and u"meeting_name" in record):
            raise CheckException(MSG_NO_CONF)

716
    def is_thesis(self, record):
        """Reject a record which is not a thesis.

        Args:
            record (RecordPubli):
                record describing a publication.

        Raises:
            CheckException:
                the record does not describe a thesis.

        """
        if self.dbg:
            print("\t\tIs thesis")

        if isinstance(record, RecordThesis):
            return

        raise CheckException(MSG_NO_THESIS)

734 735 736 737 738 739 740
    def my_affiliation(
            self,
            record,
            id_project,
            id_team,
            fmt_rescue="F. Last",
            sort=False):
741 742 743 744
        """Check that authors of my institute are signatories.

        Launch a recovery procedure when affiliations are not defined.
        It is based on the author rescue list stored in the database.
745

746
        Args:
747 748 749 750 751 752 753 754 755
            record (RecordPubli):
                record describing a publication.

            id_project (int):
                identifier of the project in the database

            id_team (int):
                identifier of the team in the database

756 757
            fmt_rescue (str):
                the format for the authors used in the rescue list
758

759 760 761 762 763 764 765 766
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Return
            str:
                * the found affiliation
                * an empty string when the rescue list is used.
767

768
        Raises:
769 770 771 772
            CheckException:
                when the rescue list is required but empty
                or because the intersection between the rescue list
                and the author is null.
773 774

        """
775 776 777
        if self.dbg:
            print "\t\tCheck my affiliation"

778
        value = record.find_affiliation(self.reg_institute)
779
        if len(value) > 0:
780
            return value
781

782 783 784 785 786
        # affiliation is not defined
        # try to recover using the authors rescue list
        rescue_list = self._get_author_rescue_list(record, id_project, id_team)
        if not rescue_list:
            raise CheckException(MSG_NO_MY_AUTHOR)
787

788
        # format the author in the same way as the rescue list
789 790 791 792
        fmt_ref = record._last_fmt_author
        record.reformat_authors(fmt_rescue)

        if sort:
793
            authors = (record[u"authors"][["last_name", "fmt_name"]]
794 795 796 797
                       .sort_values(by="last_name")
                       .fmt_name)

        else:
798
            authors = (record[u"authors"].fmt_name
799 800 801 802
                       .sort_index())

        # go back to the origin formatting
        record.reformat_authors(fmt_ref)
803

804 805
        # compute the intersection between the authors and the rescue list
        intersection = set(authors) & set(rescue_list)
806

807
        if len(intersection) == 0:
808
            raise CheckException(MSG_NO_MY_AUTHOR)
809

810
        # cache the result for a latter use
811 812
        self._my_authors[record.id()] = list(intersection)

813
        return ""
814

815
    def paper_reference(self, record):
816
        """Check that editor, page, volume and paper year are defined
817
        for a published paper. Repair it from doi when possible.
818

819
        Args:
820 821
            record (RecordPubli):
                record describing a publication.
822

823
        Raises:
824
            CheckException:
LE GAC Renaud's avatar
LE GAC Renaud committed
825
                the paper reference is not well formed.
826 827

        """
828 829 830
        if self.dbg:
            print "\t\tCheck paper reference"

831
        if record.is_published():
832 833
            return

834 835 836 837
        # paper reference can be incomplete or missing
        # is the paper published ? In that case the doi is defined
        if u"doi" not in record:
            return
838

839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875
        # what information is missing ?
        # * df.columns are title, volume, year and pagination
        # * df can contains one or more rows due to erratum.
        # * assume that the first row is the oldest one and corresponds tp
        #   the first publication
        # * the row contains empty string when the record is not published.
        # * iloc[0] returns a serie where the index are the column's name
        #
        columns = (record[u"publication_info"].iloc[0]
                   .replace("", np.nan)
                   .dropna()
                   .index)

        missing = PAPER_REFERENCE_KEYS.difference(columns)

        # try to recover from the doi when it has the form
        # xx.yyyy/Publisher.Volume.Page
        m = REG_DOI.match(record[u"doi"])
        if not m:
            raise ToolException(MSG_NO_REF + str(list(missing)))

        for subfield in missing:
            if subfield == "title":

                # transform PhysRevD in Phys. Rev. D
                li = re.split(r"([A-Z][a-z]+)", m.group(1))
                title = ". ".join([el for el in li if len(el) > 0])
                record[u"publication_info"].loc[0, u"title"] = title

            elif subfield == "volume":
                record[u"publication_info"].loc[0, u"volume"] = m.group(2)

            elif subfield == "pagination":
                record[u"publication_info"].loc[0, u"pagination"] = m.group(3)

            elif subfield == "year":
                raise ToolException(MSG_NO_REF + "[year]")
876

877
    def publisher(self, record):
878
        """Check synonyms for publisher by replacing by the abbreviation value.
879 880

        Args:
881 882
            record (RecordPubli):
                record describing a publication.
883 884

        Raises:
885
            CheckException:
886 887
                * the publisher is unknown (neither abbreviation nor synonym)
                * more than one synonym found.
888 889

        """
890 891 892
        if self.dbg:
            print "\t\tCheck publisher"

893
        val = record.paper_editor()
894
        if len(val) == 0:
895 896
            return

897
        try:
898
            db = self.db
899
            dbid = search_synonym(db.publishers, "abbreviation", val)
900