checkandfix.py 30.1 KB
Newer Older
1
# -*- coding: utf-8 -*-
2
""" harvest_tools.checkandfix
3 4 5 6 7

"""
import re
import regex

8 9
from .base import search_synonym, ToolException
from .exception import CheckException
10
from gluon import current
11 12 13 14 15 16 17 18
from invenio_tools import (DECODE_REF,
                           MSG_NO_CONF,
                           MSG_NO_THESIS,
                           OAI_URL,
                           RecordConf,
                           RecordThesis,
                           REG_OAI,
                           REG_YEAR)
19
from plugin_dbui import CLEAN_SPACES, get_id
20 21


22
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")
23 24 25 26

# Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")
27
DECODE_YYYY = re.compile(r"^(\d{4})$")
28

29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
MONTHS = {"Jan": "01",
          "Feb": "02",
          "Fev": "02",
          "Mar": "03",
          "Apr": "04",
          "Avr": "04",
          "May": "05",
          "Mai": "05",
          "Jun": "06",
          "Jul": "07",
          "Aug": "08",
          "Sep": "09",
          "Oct": "10",
          "Nov": "11",
          "Dec": "12"}
44

45 46
MSG_INVALID_HOST = "Invalid host"

47
MSG_NO_AUTHOR = "Reject no author(s)"
48
MSG_NO_CONF_DATE = "Reject no conference date"
49
MSG_NO_DATE = "Reject no submission date"
50
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
51 52
MSG_NO_REF = "Reject incomplete paper reference"
MSG_NO_YEAR = "Reject no publication year"
53

54
MSG_TEMPORARY_RECORD = "Temporary record"
55

56 57 58
MSG_TO_MANY_DATE = "Reject to many submit date"
MSG_TO_MANY_FAUTHOR = "Reject to many first author"
MSG_TO_MANY_YEAR = "Reject to many year"
59

60 61
MSG_WELL_FORMED_CONF_DATES = "Reject conference dates is not well formed"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
62

63
MSG_WELL_FORMED_EDITOR = "Reject editor is not well formed"
64

65 66
OAI_INVENIO = "oai:%s:%s"

67 68
REG_COLLABORATION = re.compile(regex.REG_COLLABORATION)
# conference dates within a single month, e.g. "12 - 15 Mar 2014"
# (raw string: "\d" is a regular expression class, not a string escape)
REG_CONF_DATES_1 = \
    re.compile(r"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
69 70

# conference dates spanning two months, e.g. "29 Feb - 1 Mar 2014"
# (raw string: "\d" is a regular expression class, not a string escape)
REG_CONF_DATES_2 = re.compile(
    r"(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
72

73 74 75
REG_CONF_DATES = re.compile(regex.REG_CONF_DATES)
REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)

76 77
UNIVERSITY = "University"

78 79

class CheckAndFix(object):
80 81
    """A collection of tools to check and repair the content
    of the Marc12 record.
82

83 84
    """
    def __init__(self):
        """Set up the database handle, the institute pattern and the caches."""
        # cache of the authors of my institute, indexed by the record id
        self._my_authors = {}

        # private cache for the author rescue list:
        # ``__par`` is the key compared with the requested parameters and
        # ``__reference`` the list returned when the key is the same
        self.__par = None
        self.__reference = None

        # the pattern is built from the database content, therefore
        # ``db`` has to be defined before calling the helper method
        self.db = current.db
        self.reg_institute = self._get_reg_institute()
95 96

    def _get_reg_institute(self):
        r"""Get the regular expression defining the affiliation of my institute.

        The pattern stored in ``current.app.reg_institute`` is returned when
        it is set. Otherwise the pattern is built from the table
        ``affiliation_keys``:

        * the characters ``(``, ``)``, ``&``, ``$``, ``+`` and ``?`` found
          in a key are protected by a backslash, *e.g.* ``\(``;
        * each key is framed by ``(^|\|)`` and ``($|\|)`` in order to get
          an exact match;
        * the alternatives are concatenated with ``|``.

        Returns:
            str: the regular expression. It is an empty string when the
                pattern is built from a table without any key.

        """
        reg_institute = current.app.reg_institute
        if reg_institute:
            return reg_institute

        db = self.db

        # characters of the key which have a meaning in a regular expression
        special_characters = "()&$+?"

        alternatives = []
        for row in db(db.affiliation_keys.id > 0).iterselect():
            key = row.key_u

            # protect special characters
            for char in special_characters:
                key = key.replace(char, "\\" + char)

            # add start and end of string for an exact match
            alternatives.append(r"(^|\|){}($|\|)".format(key))

        return r"|".join(alternatives)

    def _get_author_rescue_list(self, record, id_project, id_team):
        """Get the rescue list for my authors.

140 141 142 143
        Args:
            record (RecordPubli): record describing a publication.
            id_project (int): identifier of the project in the database.
            id_team (int): identifier of the team in the database.
144

145 146
        Returns:
            list: empty when not defined
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189

        """
        year = record.year()

        # try to recover year when not defined
        if not year:
            # published article, proceeding
            if "773" in record and "y" in record["773"]:
                year = record["773"]["y"]

            # start date of a conference
            elif "111" in record and "x" in record["111"]:
                year = record["111"]["x"]

            # end date of a conference
            elif "111" in record and "z" in record["111"]:
                year = record["111"]["z"]

            # submitted date
            elif "269" in record and "c" in record["269"]:
                year = record["269"]["c"]

            else:
                return []

        #
        # NOTE
        # keep in mind that the CheckAndfix mechanism is not yet run
        # therefore year can be a list due to erratum, ...
        #
        if isinstance(year, list):
            year.sort()
            year = year[0]

        # the value can have several format 1992, 1992-12-31, ....
        m = REG_YEAR.search(year)
        if m:
            year = m.group(1)

        else:
            return []

        # caching
LE GAC Renaud's avatar
LE GAC Renaud committed
190
        t = (year, id_project, id_team)
191 192 193 194 195
        if t == self.__par:
            return self.__reference

        # extract the list from the database
        row = self.db.my_authors(year=year,
LE GAC Renaud's avatar
LE GAC Renaud committed
196 197
                                 id_projects=id_project,
                                 id_teams=id_team)
198 199 200 201 202 203 204

        if row:
            self.__reference = row['authors'].split(', ')
        else:
            self.__reference = []

        return self.__reference
205

206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
    def _is_synonym(self, tablename, value):
        """Check that the synonym field contains *value*.

        Args:
            tablename (unicode): name of the database table
            value (unicode): value to be searched

        Returns:
            bool: ``True`` if *one* row is found, ``False`` otherwise.

        """
        db = self.db
        table = db[tablename]

        query = table.synonyms.contains(value)
        if db(query).count() == 1:
            return True

        return False

226 227 228 229
    def _recover_submitted(self, record):
        """Recover the submitted date from other fields of the record.

        The source depends on the type of the record:

        * conference: the start date (subfield ``111x``) or, when it is
          missing, the end date (subfield ``111z``) rewritten as
          ``YYYY-MM-DD``;
        * thesis: the value returned by ``record.these_defense()``;
        * other: year and month decoded from the arXiv preprint number.

        Args:
            record (RecordPubli): record describing a publication.

        Returns:
            str: empty when procedure failed

        """
        if isinstance(record, RecordConf):
            conference = record["111"]

            # INSPIREHEP start date encoded as 2014-12-31
            if "x" in conference:
                return conference["x"]

            # CDS end date encoded as 20141231
            if "z" in conference:
                end = conference["z"]
                return "%s-%s-%s" % (end[0:4], end[4:6], end[6:8])

            return ''

        if isinstance(record, RecordThesis):
            return record.these_defense()

        # preprint: arXiv:YYMM.xxxx
        report = record.preprint_number()
        m_arxiv = DECODE_ARXIV.match(report) if report else None
        if not m_arxiv:
            return ''

        return "20%s-%s" % m_arxiv.groups()

    def _repair_paper_reference(self, record):
        """Repair paper reference.
263
        The recovery procedure use the "o" field (invenio)::
264

265 266
            Eur. Phys. J. C (2014) 74:2883
            Phys. Rev. Lett. 113, 032001 (2014)
267 268 269

        in order to extract editor, volume, year and page data.

270 271
        Args:
            record (RecordPubli): record describing a publication.
272

273 274
        Raises:
            CheckException: when the repair failed.
275 276 277

        """
        # standard case
278
        if isinstance(record["773"], dict):
279

280
            if "o" in record["773"]:
281
                for reg in DECODE_REF:
282
                    m = reg.match(record["773"]["o"])
283
                    if m:
284 285 286 287
                        record["773"]["p"] = m.group("p")
                        record["773"]["v"] = m.group("v")
                        record["773"]["y"] = m.group("y")
                        record["773"]["c"] = m.group("c")
288 289 290 291 292
                        return

            raise CheckException(MSG_NO_REF)

        # list case -- paper with erratum
293
        elif isinstance(record["773"], list):
294

295
            for i in range(len(record["773"])):
296

297
                if "o" in record["773"][i]:
298 299
                    fixed = False
                    for reg in DECODE_REF:
300
                        m = reg.match(record["773"][i]["o"])
301
                        if m:
302 303 304 305
                            record["773"][i]["p"] = m.group("p")
                            record["773"][i]["v"] = m.group("v")
                            record["773"][i]["y"] = m.group("y")
                            record["773"][i]["c"] = m.group("c")
306 307 308 309 310 311 312 313 314 315 316 317 318
                            fixed = True
                            break

                    if not fixed:
                        raise CheckException(MSG_NO_REF)
                else:
                    raise CheckException(MSG_NO_REF)

        # case not expected
        else:
            raise CheckException(MSG_NO_REF)

    def authors(self, record):
        """Verify that the record has authors and a single first author.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when ``record.is_authors()`` is false or when
                the field 100 (*first author*) contains more than
                one entry.

        """
        if record.is_authors():
            first_authors = record["100"]
            if len(first_authors) > 1:
                raise CheckException(MSG_TO_MANY_FAUTHOR)
            return

        raise CheckException(MSG_NO_AUTHOR)
335 336 337 338

    def clean_erratum(self, record):
        """Remove the erratum from a record, keeping the first entries.

        Nothing is done when ``record.is_with_erratum()`` is false.

        Note:
            After this check the paper reference (field 773) contains one
            entry. The same is done for the fields 260 and 269 when they
            are lists.

        Args:
            record (RecordPubli): record describing a publication.

        """
        if not record.is_with_erratum():
            return

        # simplest algorithm: keep the first entry of the list since it is
        # fair to assume that the article is published before its erratum
        first_reference = record["773"][0]
        record["773"] = first_reference

        # treat year and submitted date in the same way
        fields = [k for k in ("260", "269")
                  if k in record and isinstance(record[k], list)]

        for field in fields:
            record[field] = record[field][0]

    def collaboration(self, record):
        """Check the collaboration.

        The value is searched in the table ``collaborations`` with
        ``search_synonym``. Nothing is checked when the record has no
        collaboration.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when ``search_synonym`` raises
                a ``ToolException``. The arguments of the latter are
                propagated.

        """
        val = record.collaboration()

        if val:
            table = self.db.collaborations

            # convert ToolException to CheckException
            try:
                search_synonym(table, "collaboration", val)
            except ToolException as e:
                raise CheckException(*e.args)

382
    def country(self, record):
        """Check the country of the conference.

        The value is searched in the table ``countries`` with
        ``search_synonym``. Records which are not a ``RecordConf``
        are ignored.

        Args:
            record (RecordConf): record describing a talk or a proceeding.

        Raises:
            CheckException: when ``search_synonym`` raises
                a ``ToolException``. The arguments of the latter are
                propagated.

        """
        if isinstance(record, RecordConf):
            table = self.db.countries
            val = record.conference_country()

            # convert ToolException to CheckException
            try:
                search_synonym(table, "country", val)
            except ToolException as e:
                raise CheckException(*e.args)
404

405
    def conference_date(self, record, host):
        """Check the dates of the conference.

        For ``inspirehep.net`` the method only verifies that
        ``record.conference_dates()`` is not empty. For the other hosts
        the subfield ``111d`` has to match ``REG_CONF_DATES``. Otherwise
        it is rewritten when it looks like ``12 - 15 Mar 2014`` or
        ``29 Feb - 1 Mar 2014``.

        Args:
            record (RecordConf): record describing a talk or a proceeding.
            host (str): possible values are ``cds.cern.ch``
                or ``inspirehep.net``

        Raises:
            CheckException: when dates are not found or not well formed.

        """
        # conference information are available, i.e proceeding
        if not isinstance(record, RecordConf):
            return

        # inspirehep.net
        if host == "inspirehep.net":
            if len(record.conference_dates()) == 0:
                raise CheckException(MSG_NO_CONF_DATE)
            return

        # cds.cern.ch
        if "111" not in record or "d" not in record["111"]:
            raise CheckException(MSG_NO_CONF_DATE)

        value = record["111"]["d"]
        if REG_CONF_DATES.match(value):
            return

        # 12 - 15 Mar 2014 or 29 Feb - 1 Mar 2014
        repairs = ((REG_CONF_DATES_1, "%s-%s %s %s"),
                   (REG_CONF_DATES_2, "%s %s - %s %s %s"))

        for reg, fmt in repairs:
            m = reg.match(value)
            if m:
                record["111"]["d"] = fmt % m.groups()
                return

        raise CheckException(MSG_WELL_FORMED_CONF_DATES)

451
    def is_bad_oai_used(self, record):
        """Bad OAI is when the ``id`` in the OAI field is different from
        the ``record id``. This happens when an old record is redirected
        to new one.

        Args:
            record (RecordPubli): record describing a publication.

        Returns:
            bool: ``True`` when a record is found in the database with
                the bad OAI. ``False`` otherwise, including the case
                where the OAI is missing or cannot be decoded
                by ``REG_OAI``.

        """
        value = record.oai()

        # no OAI or an OAI which cannot be decoded:
        # there is no identifier to compare with the record id
        match = REG_OAI.match(value) if value else None
        if match is None:
            return False

        if match.group(2) == record.id():
            return False

        # a record with the bad OAI exists in the database
        bad_oai_url = OAI_URL % (match.group(1), match.group(2))
        return bool(get_id(self.db.publications, origin=bad_oai_url))
476

477 478
    def format_authors(self, record, fmt="Last, First"):
        """Reformat the author names of the record.

        The work is delegated to ``record.reformat_authors``.

        Args:
            record (RecordPubli): record describing a publication.
            fmt (str):
                define the format for author names.
                Possible values are "First, Last", "F. Last", "Last",
                "Last, First" and "Last F."

        """
        reformat = record.reformat_authors
        reformat(fmt)
489 490

    def format_editor(self, record):
        """Format the editor abbreviation. The encoding
        depends on the store::

            INVENIO:    Phys. Lett. B + volume 673
            INSPIREHEP: Phys.Lett + volume B673

        Standardise the answer as ``Phys. Lett. B``:

        * a space is added after a dot followed by a capital letter;
        * the capital letter(s) starting the volume are moved at the end
          of the editor, unless they are equal to the last character of
          the editor. In the latter case the volume is left unchanged.

        The subfields ``p`` and ``v`` of the field 773 are updated.
        Nothing is done when the record is not published.

        Note:
            It is recommended to call this method when erratum are removed.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the editor is not well formed, namely
                when the number of editors and volumes are different for
                a publication with erratum.

        """
        if not record.is_published():
            return

        def normalise(editor, volume):
            """Return the standardised (editor, volume) pair."""
            # add space after the dot  Phys.Rev -> Phys. Rev
            editor = re.sub(r'\.([A-Z])', r'. \1', editor)

            # get the volume letter
            m = re.match(r'([A-Z]+) *(\d+)', volume)
            if m and m.group(1) != editor[-1]:
                editor = "%s %s" % (editor, m.group(1))
                volume = m.group(2)

            # remove stupid mistake
            return CLEAN_SPACES(editor), volume

        references = record["773"]

        # standard case
        if isinstance(references, dict):
            if "p" in references and "v" in references:
                editor, volume = normalise(references["p"], references["v"])
                references["p"] = editor
                references["v"] = volume

        # list case -- publication with erratum
        elif isinstance(references, list):

            editors = record._get("773", 'p', force_list=True)
            volumes = record._get("773", 'v', force_list=True)

            if len(editors) != len(volumes):
                raise CheckException(MSG_WELL_FORMED_EDITOR)

            # each entry starts from its own volume: the value is neither
            # undefined nor inherited from the previous entry when the
            # volume has no letter to be moved
            for i, (editor, volume) in enumerate(zip(editors, volumes)):
                editor, volume = normalise(editor, volume)
                references[i]["p"] = editor
                references[i]["v"] = volume
557

558 559 560
    def format_universities(self, record):
        """Format the name of the university for PhD (subfield ``502b``):

            * when the institute pattern contains ``CPPM``, a name
              containing ``Marseille`` is replaced by the name of the
              Aix-Marseille university valid at the year of the defence;
            * otherwise ``U.`` is replaced by the translation
              of ``University``.

        The subfield can be a string or a list of strings. Records which
        are not a ``RecordThesis`` are ignored.

        Args:
            record (RecordThesis): record describing a thesis.

        """
        # protection
        if not isinstance(record, RecordThesis):
            return

        # CPPM: fix the name of Aix-Marseille university
        if "CPPM" in self._get_reg_institute():
            year = REG_YEAR.search(record.these_defense()).group(1)
            if int(year) < 2012:
                university = "Université de la Méditerrannée Aix-Marseille II"
            else:
                university = "Aix Marseille Université"

            marker = "Marseille"

            def rename(name):
                return university

        # Other: replace U. by University
        else:
            university = current.T(UNIVERSITY)
            marker = "U."

            def rename(name):
                return name.replace('U.', university)

        if not ("502" in record and "b" in record["502"]):
            return

        names = record["502"]["b"]

        if isinstance(names, str):
            if marker in names:
                record["502"]["b"] = rename(names)

        elif isinstance(names, list):
            for i, name in enumerate(names):
                if marker in name:
                    names[i] = rename(name)
610

611
    def get_my_authors(self, record, sep=", ", sort=False):
612
        """Get authors of my institutes signing the record.
613 614
        The information is append to the Record object via the attribute
        ``my_authors``.
615

616 617
        Args:
            record (RecordPubli): record describing a publication.
618 619 620 621 622
            sep (unicode):
                string separating author names. The default is the comma.
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record
623

624
        Returns:
625
            unicode: the list of authors separated by the ``sep`` argument.
626

627 628
        Raises:
            CheckException: when the list is empty
629 630 631 632

        """
        # might have been computed when affiliation is checked
        rec_id = record.id()
633 634 635
        if rec_id in self._my_authors:
            li = self._my_authors[rec_id]
            value = sep.join(li)
636 637 638 639

        # find authors of my institute signing the record
        else:
            reg_institute = self.reg_institute
640 641
            value = \
                record.find_authors_by_affiliation(reg_institute, sep, sort)
642

643
        if len(value) == 0:
644 645
            raise CheckException(MSG_NO_MY_AUTHOR)

LE GAC Renaud's avatar
LE GAC Renaud committed
646
        record.my_authors = value
647

648 649 650
    def is_conference(self, record):
        """Verify that the record describes a conference talk / proceeding.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the record is not a ``RecordConf``.

        """
        if isinstance(record, RecordConf):
            return

        raise CheckException(MSG_NO_CONF)

    def is_thesis(self, record):
        """Verify that the record describes a thesis.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the record is not a ``RecordThesis``.

        """
        if isinstance(record, RecordThesis):
            return

        raise CheckException(MSG_NO_THESIS)

674 675 676 677 678 679 680
    def my_affiliation(
            self,
            record,
            id_project,
            id_team,
            fmt_rescue="F. Last",
            sort=False):
681 682 683 684
        """Check that authors of my institute are signatories.

        Launch a recovery procedure when affiliations are not defined.
        It is based on the author rescue list stored in the database.
685

686 687 688 689
        Args:
            record (RecordPubli): record describing a publication.
            id_project (int): identifier of the project in the database
            id_team (int): identifier of the team in the database
690 691 692 693 694 695 696 697 698 699
            fmt_rescue (str):
                the format for the authors used in the rescue list
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Return
            str:
                * the found affiliation
                * an empty string when the rescue list is used.
700

701
        Raises:
702 703 704 705
            CheckException:
                when the rescue list is required but empty
                or because the intersection between the rescue list
                and the author is null.
706 707

        """
708
        value = record.find_affiliation(self.reg_institute)
709
        if len(value) > 0:
710
            return value
711

712 713 714 715 716
        # affiliation is not defined
        # try to recover using the authors rescue list
        rescue_list = self._get_author_rescue_list(record, id_project, id_team)
        if not rescue_list:
            raise CheckException(MSG_NO_MY_AUTHOR)
717

718
        # format the author in the same way as the rescue list
719 720 721 722
        fmt_ref = record._last_fmt_author
        record.reformat_authors(fmt_rescue)

        if sort:
723
            authors = (record["700"][["last_name", "fmt_name"]]
724 725 726 727
                       .sort_values(by="last_name")
                       .fmt_name)

        else:
728
            authors = (record["700"].fmt_name
729 730 731 732
                       .sort_index())

        # go back to the origin formatting
        record.reformat_authors(fmt_ref)
733

734 735
        # compute the intersection between the authors and the rescue list
        intersection = set(authors) & set(rescue_list)
736

737
        if len(intersection) == 0:
738
            raise CheckException(MSG_NO_MY_AUTHOR)
739

740
        # cache the result for a latter use
741 742
        self._my_authors[record.id()] = list(intersection)

743
        return ""
744 745 746 747 748

    def paper_reference(self, record):
        """Check that editor, page, volume and paper year are defined
        for a published paper. Repair it as far as possible.

749 750 751
        Note:
            It is recommended to call this method when the
            erratum are removed.
752

753 754
        Args:
            record (RecordPubli): record describing a publication.
755

756 757
        Raises:
            CheckException: when the paper reference is not well formed.
758 759 760 761 762 763

        """
        if not record.is_published():
            return

        # list of reference (paper with erratum)
764 765
        refs = record["773"]
        if not isinstance(record["773"], list):
766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793
            refs = [refs]

        # INSPIREHEP
        # fix the following case:
        #    {'p': u'Phys.Lett.B'},
        #    {'y': u'2013', 'p': u'Phys.Lett.B', 'c': u'6-12', 'v': u'B720'}

        if len(refs) == 2 and record.host().startswith('inspirehep'):
            nkeys = [len(di) for di in refs]
            if nkeys.count(1):
                index = nkeys.index(1)
                if 'p' in refs[index]:
                    refs.pop(index)

        # do we have all fields defining a reference
        # editor, volume, year and page
        for ref in refs:

            # proceeding can only contains "w" and "y" keys
            if "w" in ref:
                continue

            # check full reference
            for k in ("p", "v", "y", "c"):
                if k not in ref:
                    self._repair_paper_reference(record)
                    return

794 795 796 797 798 799 800 801
    def publisher(self, record):
        """Check publisher.
        Have a look to the synonyms when the publisher does not exist.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
802
            CheckException: when the publisher is not defined
803
                nor entered as a synonym.
804 805 806 807 808 809 810 811

        """
        db = self.db
        val = record.paper_editor()
        if not val:
            return

        # erratum -- check the first entry
812
        if isinstance(val, list):
813 814
            val = val[0]

815 816 817 818 819
        # convert ToolException to CheckExcpetion
        try:
            search_synonym(db.publishers, "abbreviation", val)
        except ToolException as e:
            raise CheckException(*e.args)
820

821 822 823 824
    def recover_oai(self, record, host):
        """Rebuild the OAI identifier when it is missing or not well formed.

        The identifier is built from the host and the record id. It is
        stored in the subfield ``0248a`` for ``cds.cern.ch`` and in the
        subfield ``909COo`` for ``inspirehep.net``. Nothing is done when
        the OAI of the record matches ``REG_OAI``.

        Args:
            record (RecordPubli): record describing a publication.
            host (unicode): possible values are ``cds.cern.ch``
                or ``inspirehep.net``

        Raises:
            ValueError: when the host is not one of the possible values.

        """
        # Note:
        # For the record cds 1951625, possible values are:
        # oai:cds.cern.ch:1951625 (if it does not exist in inspirehep)
        # oai:cds.cern.ch:1951625, oai:inspirehep.net:1319638 (if it exist
        # in both store)
        # In all the case the first OAI corresponds to the record.id()
        #
        oai = record.oai()
        if oai is not None and REG_OAI.match(oai):
            return

        # location of the OAI for each store
        locations = {"cds.cern.ch": ("0248", "a"),
                     "inspirehep.net": ("909CO", "o")}

        location = locations.get(host)
        if location is None:
            raise ValueError(MSG_INVALID_HOST)

        field, subfield = location
        if field not in record:
            record[field] = dict()

        record[field][subfield] = OAI_INVENIO % (host, record.id())

856
    def submitted(self, record):
        """Standardise the submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.
        Look for alternative when it is not defined.

        The dates written as ``22 Mar 2011`` or ``22 03 2011`` are
        converted. The month abbreviation is decoded whatever its case is.
        A date limited to the year is replaced by the one recovered from
        the conference, preprint or thesis information.

        Note:
            After this check the year submitted contains one entry.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the date is not found, when it is not
                well formed (unknown month abbreviation included) or
                when more than one date are found.

        """
        dates = record.submitted()

        # recover missing date using conference, preprint, thesis information
        if not dates:
            val = self._recover_submitted(record)
            if not val:
                raise CheckException(MSG_NO_DATE)
            dates.append(val)

        # check that date are well formed
        for i, date in enumerate(dates):

            # 22 Mar 2011
            m = DECODE_DD_MMM_YYYY.match(date)
            if m:
                # the pattern accepts any three letters: normalise the case
                # and reject unknown abbreviation instead of a KeyError
                month = MONTHS.get(m.group(2).capitalize())
                if month is None:
                    raise CheckException(MSG_WELL_FORMED_DATE)

                data = (m.group(3), month, int(m.group(1)))
                dates[i] = '%s-%s-%02i' % data
                continue

            # 22 03 2011
            m = DECODE_DD_MM_YYYY.match(date)
            if m:
                data = (m.group(3), int(m.group(2)), int(m.group(1)))
                dates[i] = '%s-%02i-%02i' % data
                continue

            # 2011
            if DECODE_YYYY.match(date):
                date = dates[i] = self._recover_submitted(record)

            # check the minimum requirement is 2001-05
            if not REG_SUBMITTED.match(date):
                raise CheckException(MSG_WELL_FORMED_DATE)

        # protection against list of identical date
        if len(dates) > 1:
            dates = list(set(dates))

        # Only one date
        if len(dates) != 1:
            raise CheckException(MSG_TO_MANY_DATE)

        if "269" not in record or isinstance(record["269"], list):
            record["269"] = dict()

        record["269"]["c"] = dates[0]
918 919 920 921

    def temporary_record(self, record):
        """Reject the records marked as temporary.

        A record is temporary when its subfield ``500a`` is equal
        to ``*Temporary record*``.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the record is marked temporary

        """
        if "500" not in record:
            return

        # found on INSPIREHEP (see record 1317573)
        note = record["500"]
        if "a" in note and note["a"] == "*Temporary record*":
            raise CheckException(MSG_TEMPORARY_RECORD)

    def year(self, record):
935
        """Standardise the the year as ``YYYY`` and look for alternative when
936 937
        it is not defined.

938 939
        Note:
            The method assumes that the erratum are removed.
940

941 942 943 944
        Note:
            It is recommended to call this method after
            the ``submitted`` one, since the submitted field is used in
            the recovery procedure.
945

946