checkandfix.py 30.4 KB
Newer Older
1
# -*- coding: utf-8 -*-
2
""" harvest_tools.checkandfix
3 4 5 6 7

"""
import re
import regex

8 9
from .base import search_synonym, ToolException
from .exception import CheckException
10
from gluon import current
11 12 13 14 15 16 17 18
from invenio_tools import (DECODE_REF,
                           MSG_NO_CONF,
                           MSG_NO_THESIS,
                           OAI_URL,
                           RecordConf,
                           RecordThesis,
                           REG_OAI,
                           REG_YEAR)
19
from plugin_dbui import CLEAN_SPACES, get_id
20 21


22
# arXiv preprint number "arXiv:YYMM.": group 1 is the two-digit year and
# group 2 the two-digit month
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")

# Decode submitted date: DD MMM YYYY or DD MM YYYY
# groups are, in order: day, month (three letters or digits) and year
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")

# a string made of exactly four digits
DECODE_YYYY = re.compile(r"^(\d{4})$")
28

29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
# Three-letter month abbreviation -> two-digit month number.
# Each tuple groups the spellings accepted for one month,
# starting with January.
MONTHS = {
    name: "%02i" % number
    for number, names in enumerate((("Jan",),
                                    ("Feb", "Fev"),
                                    ("Mar",),
                                    ("Apr", "Avr"),
                                    ("May", "Mai"),
                                    ("Jun",),
                                    ("Jul",),
                                    ("Aug",),
                                    ("Sep",),
                                    ("Oct",),
                                    ("Nov",),
                                    ("Dec",)), 1)
    for name in names
}
44

45 46
# Diagnostic messages. Several of them are used below as the text of
# the exception raised when a check fails.
MSG_INVALID_HOST = "Invalid host"

# a piece of information is missing
MSG_NO_AUTHOR = "Reject no author(s)"
MSG_NO_CONF_DATE = "Reject no conference date"
MSG_NO_DATE = "Reject no submission date"
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
MSG_NO_REF = "Reject incomplete paper reference"
MSG_NO_YEAR = "Reject no publication year"

MSG_TEMPORARY_RECORD = "Temporary record"

# a piece of information is found more than once
MSG_TO_MANY_DATE = "Reject to many submit date"
MSG_TO_MANY_FAUTHOR = "Reject to many first author"
MSG_TO_MANY_YEAR = "Reject to many year"

# a piece of information does not have the expected format
MSG_WELL_FORMED_CONF_DATES = "Reject conference dates is not well formed"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"

MSG_WELL_FORMED_EDITOR = "Reject editor is not well formed"
64

65 66
# template of an OAI identifier, filled with (host, record id)
OAI_INVENIO = "oai:%s:%s"

67 68
# the pattern text is defined in the ``regex`` module imported above
REG_COLLABORATION = re.compile(regex.REG_COLLABORATION)
# Conference dates within a single month, e.g. "12 - 15 Mar 2014".
# Groups: first day, last day, month abbreviation, year.
# Raw strings are required: "\d" is not a valid escape sequence
# in an ordinary string literal.
REG_CONF_DATES_1 = \
    re.compile(r"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")

# Conference dates spanning two months, e.g. "29 Feb - 1 Mar 2014".
# Groups: first day, first month, last day, last month, year.
REG_CONF_DATES_2 = \
    re.compile(r"(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
72

73 74 75
# the pattern texts are defined in the ``regex`` module imported above
REG_CONF_DATES = re.compile(regex.REG_CONF_DATES)
REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)

76 77
# word passed to the translator (``current.T``) and substituted for "U."
# in the name of a university
UNIVERSITY = "University"

78 79

class CheckAndFix(object):
80 81
    """A collection of tools to check and repair the content
    of the Marc12 record.
82

83 84
    """
    def __init__(self):
        # Parameters and content of the last author rescue list
        # retrieved from the database.
        self.__par = None
        self.__reference = None

        # Authors of my institute found via the rescue list,
        # indexed by record identifier.
        self._my_authors = {}

        # Database of the running application.
        self.db = current.db

        # Regular expression used to find the affiliations of my institute
        # (requires ``self.db``).
        self.reg_institute = self._get_reg_institute()
95 96

    def _get_reg_institute(self):
        r"""Get the regular expression defining the affiliation of my institute.

        The expression defined by the application
        (``current.app.reg_institute``) is returned when it is set.
        Otherwise it is obtained by concatenating the affiliation keys
        stored in the database. Affiliation key can contain characters
        like ``(``, ``)``, ``&``, ``$``, ``+`` or ``?``.
        They are replaced by ``\(`` *etc*.

        Returns:
            str:
                the regular expression. It is an empty string when the
                application does not define it and the database
                contains no affiliation key.

        """
        reg_institute = current.app.reg_institute
        if reg_institute:
            return reg_institute

        # alias
        db = self.db

        # regular expression for the affiliation keys
        lst = []
        for row in db(db.affiliation_keys.id > 0).iterselect():

            # protect special character
            # (raw strings: "\(" is not a valid escape in a plain literal)
            val = re.sub(r"([()&$+?])", r"\\\1", row.key_u)

            # add start and end of string, or the separator "|",
            # for an exact match
            lst.append(r"(^|\|){}($|\|)".format(val))

        return r"|".join(lst)

    def _get_author_rescue_list(self, record, id_project, id_team):
        """Get the rescue list for my authors.

        The list is read from the ``my_authors`` table of the database
        for the year of the record, the project and the team. The last
        list retrieved is kept in a private cache and reused as long as
        the year, project and team do not change.

        Args:
            record (RecordPubli):
                record describing a publication.

            id_project (int):
                identifier of the project in the database.

            id_team (int):
                identifier of the team in the database.

        Returns:
            list:
                the author names of the rescue list.
                It is empty when the year cannot be determined
                or when no list is defined.

        """
        year = record.submitted()

        # try to recover year when not defined
        if not year:
            # published article, proceeding
            year = record[u"publication_info"].year.iloc[0]

            # start date of a conference
            if year == "":
                year = record._get(u"meeting_name", u"opening_date")

            # end date of a conference
            if year == u"":
                year = record._get(u"meeting_name", u"closing_date")

            if year == u"":
                return []

        #
        # protection
        # submitted and paper year are protect against erratum, but ...
        # keep the earliest value without modifying the list owned
        # by the record
        #
        if isinstance(year, list):
            year = sorted(year)[0]

        # the value can have several format 1992, 1992-12-31, ....
        m = REG_YEAR.search(year)
        if not m:
            return []

        year = m.group(1)

        # caching
        t = (year, id_project, id_team)
        if t == self.__par:
            return self.__reference

        # extract the list from the database
        row = self.db.my_authors(year=year,
                                 id_projects=id_project,
                                 id_teams=id_team)

        if row:
            self.__reference = row['authors'].strip("\n").split(', ')
        else:
            self.__reference = []

        # remember the parameters of the cached list, otherwise the
        # comparison above never succeeds
        self.__par = t

        return self.__reference
206

207 208 209 210
    def _is_synonym(self, tablename, value):
        """Check that the synonym field contains *value*.

        Args:
211 212
            tablename (str): name of the database table
            value (str): value to be searched
213 214 215 216 217 218 219 220 221 222 223 224 225 226

        Returns:
            bool: ``True`` if *one* row is found, ``False`` otherwise.

        """
        db = self.db
        table = db[tablename]

        query = table.synonyms.contains(value)
        if db(query).count() == 1:
            return True

        return False

227 228 229 230
    def _recover_submitted(self, record):
        """Find a replacement for a missing submitted date.

        The source depends on the kind of record:

            * conference: the ``x`` subfield of the field ``111``
              or, failing that, its ``z`` subfield
            * thesis: the defence date
            * other: the year and month encoded in the arXiv
              preprint number

        Args:
            record (RecordPubli): record describing a publication.

        Returns:
            str: empty when procedure failed

        """
        if isinstance(record, RecordConf):
            meeting = record["111"]

            # INSPIREHEP start date encoded as 2014-12-31
            if "x" in meeting:
                return meeting["x"]

            # CDS end date encoded as 20141231
            if "z" in meeting:
                compact = meeting["z"]
                return "%s-%s-%s" % (compact[0:4], compact[4:6], compact[6:8])

            return ''

        if isinstance(record, RecordThesis):
            return record.these_defense()

        # preprint: arXiv:YYMM.NNNN gives the year and the month
        report = record.preprint_number()
        m_arxiv = DECODE_ARXIV.match(report) if report else None
        if m_arxiv:
            return "20%s-%s" % (m_arxiv.group(1), m_arxiv.group(2))

        return ''

    def _repair_paper_reference(self, record):
        """Rebuild the paper reference from its free-text form.

        The recovery procedure use the "o" field (invenio)::

            Eur. Phys. J. C (2014) 74:2883
            Phys. Rev. Lett. 113, 032001 (2014)

        in order to extract editor, volume, year and page data.
        The field ``773`` is either a dictionary or, for a paper with
        erratum, a list of dictionaries. Every entry must be repaired.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the repair failed, namely when an
                entry has no "o" field, when its content is not
                decoded or when the field ``773`` is neither
                a dictionary nor a list.

        """
        refs = record["773"]

        # standard case
        if isinstance(refs, dict):
            refs = [refs]

        # case not expected (the list case is a paper with erratum)
        elif not isinstance(refs, list):
            raise CheckException(MSG_NO_REF)

        for ref in refs:
            if "o" not in ref:
                raise CheckException(MSG_NO_REF)

            # the first expression matching the text wins
            for reg in DECODE_REF:
                m = reg.match(ref["o"])
                if m:
                    for key in ("p", "v", "y", "c"):
                        ref[key] = m.group(key)
                    break

            # none of the expressions decode the text
            else:
                raise CheckException(MSG_NO_REF)

319 320
    @staticmethod
    def authors(record):
321
        """Check that author fields are defined.
322

323
        Args:
324 325
            record (RecordPubli):
                record describing a publication.
326

327
        Raises:
328 329
            CheckException:
                when there is no authors.
330 331 332

        """

333
        if not record.is_authors():
334 335 336 337 338
            raise CheckException(MSG_NO_AUTHOR)

    def clean_erratum(self, record):
        """Clean record with erratum by removing them.

339 340 341
        Note:
            After this check the editor, volume, page and
            paper year field contains one entry.
342

343 344
        Args:
            record (RecordPubli): record describing a publication.
345 346 347 348 349 350 351

        """
        if not record.is_with_erratum():
            return

        # use the simplest algorithm by selecting the first entry in the list
        # fare to assume that the article is published first.
352
        record["773"] = record["773"][0]
353 354

        # treat year and submitted date
355
        for k in ("260", "269"):
356 357 358 359
            if k in record and isinstance(record[k], list):
                record[k] = record[k][0]

    def collaboration(self, record):
360 361
        """Check the collaboration.
        Have a look to the synonyms when the collaboration is not well formed.
362

363
        Args:
364 365
            record (RecordPubli):
                record describing a publication.
366

367
        Raises:
368 369
            CheckException:
                when the collaboration value is defined
370
                nor entered as a synonym.
371 372 373

        """
        val = record.collaboration()
374 375 376
        if not val:
            return

377
        db = self.db
378 379 380

        try:
            search_synonym(db.collaborations, "collaboration", val)
381

382 383 384
        except ToolException as e:
            raise CheckException(*e.args)

385
    def country(self, record):
        """Check conference country.
        Have a look to the synonyms when the country does not exist.
        Records which are not conferences are ignored.

        Args:
            record (RecordConf): record describing a talk or a proceeding.

        Raises:
            CheckException: when the country is not defined
                nor entered as a synonym.

        """
        if isinstance(record, RecordConf):
            countries = self.db.countries
            name = record.conference_country()

            # convert ToolException to CheckException
            try:
                search_synonym(countries, "country", name)

            except ToolException as e:
                raise CheckException(*e.args)
407

408
    def conference_date(self, record, host):
        """Check conference date.

        For ``cds.cern.ch`` the dates, found in the subfield ``d`` of
        the field ``111``, are rewritten as ``12-15 Mar 2014``
        or ``29 Feb - 1 Mar 2014`` when they are decoded but do not
        have the standard form.

        Args:
            record (RecordConf): record describing a talk or a proceeding.
            host (str): possible values are ``cds.cern.ch``
                or ``inspirehep.net``

        Raises:
            CheckException: when dates are not found or not well formed.

        """
        # conference information are available, i.e proceeding
        if not isinstance(record, RecordConf):
            return

        # inspirehep.net: check only that dates are defined
        if host == "inspirehep.net":
            if len(record.conference_dates()) == 0:
                raise CheckException(MSG_NO_CONF_DATE)

            return

        # cds.cern.ch
        if "111" not in record or "d" not in record["111"]:
            raise CheckException(MSG_NO_CONF_DATE)

        dates = record["111"]["d"]

        # dates are already well formed
        if REG_CONF_DATES.match(dates):
            return

        # 12 - 15 Mar 2014 or 29 Feb - 1 Mar 2014
        layouts = ((REG_CONF_DATES_1, "%s-%s %s %s"),
                   (REG_CONF_DATES_2, "%s %s - %s %s %s"))

        for reg, template in layouts:
            m = reg.match(dates)
            if m:
                record["111"]["d"] = template % m.groups()
                return

        raise CheckException(MSG_WELL_FORMED_CONF_DATES)

454
    def is_bad_oai_used(self, record):
        """Bad OAI is when the ``id`` in the OAI field is different from
        the ``record id``. This happens when an old record is redirected
        to new one.

        Args:
            record (RecordPubli):
                record describing a publication.

        Returns:
            bool:
                ``True`` when a record is found in the database with
                the bad OAI. ``False`` otherwise, in particular when
                the OAI is not defined or not well formed since there
                is nothing to compare with.

        """
        value = record.oai()

        # the OAI can be missing or malformed (see recover_oai)
        match = REG_OAI.match(value) if value else None
        if match is None:
            return False

        # the OAI is the one of the record
        if match.group(2) == record.id():
            return False

        # a record with the bad OAI exists in the database
        bad_oai_url = OAI_URL % (match.group(1), match.group(2))
        if get_id(self.db.publications, origin=bad_oai_url):
            return True

        return False
481

482 483
    def format_authors(self, record, fmt="Last, First"):
        """Format the author names.
484

485 486
        Args:
            record (RecordPubli): record describing a publication.
487 488 489 490
            fmt (str):
                define the format for author names.
                Possible values are "First, Last", "F. Last", "Last",
                "Last, First" and "Last F."
491 492

        """
493
        record.reformat_authors(fmt)
494 495

    def format_editor(self, record):
        """Format the editor abbreviation. The encoding
        depends on the store::

            INVENIO:    Phys. Lett. B + volume 673
            INSPIREHEP: Phys.Lett + volume B673

        A space is added after each dot followed by a capital letter,
        and the letter(s) leading the volume are moved to the end of
        the editor, unless the editor already ends with that letter.

        Note:
            It is recommended to call this method when erratum are removed.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the editor is not well formed, namely
                when the number of editors and volumes differ for
                a publication with erratum.

        """
        def standardise(editor, volume):
            """Return the standardised (editor, volume) pair."""
            # add space after the dot  Phys.Rev -> Phys. Rev
            editor = re.sub(r'\.([A-Z])', r'. \1', editor)

            # get the volume letter
            # (the slice protects against an empty editor)
            m = re.match(r'([A-Z]+) *(\d+)', volume)
            if m and m.group(1) != editor[-1:]:
                editor = "%s %s" % (editor, m.group(1))
                volume = m.group(2)

            # remove stupid mistake
            return CLEAN_SPACES(editor), volume

        if not record.is_published():
            return

        refs = record["773"]

        # standard case
        if isinstance(refs, dict):
            if "p" in refs and "v" in refs:
                refs["p"], refs["v"] = standardise(refs["p"], refs["v"])

        # list case -- publication with erratum
        elif isinstance(refs, list):

            editors = record._get("773", 'p', force_list=True)
            volumes = record._get("773", 'v', force_list=True)

            if len(editors) != len(volumes):
                raise CheckException(MSG_WELL_FORMED_EDITOR)

            # each entry starts from its own volume: it is kept as it is
            # when it carries no letter
            for i, (editor, volume) in enumerate(zip(editors, volumes)):
                editor, volume = standardise(editor, volume)

                refs[i]["p"] = editor
                refs[i]["v"] = volume
562

563 564 565
    def format_universities(self, record):
        """Format the name of the university for PhD:

            * Fix the name of Aix-Marseille University
            * Replace U. by University

        The first rule is applied when the regular expression of my
        institute contains ``CPPM``, the second one otherwise.
        The names are found in the subfield ``b`` of the field ``502``
        which is either a string or a list of strings.
        The record is left unchanged when it has no such subfield or,
        for the first rule, when the defence date contains no year.

        Args:
            record (RecordThesis): record describing a thesis.

        """
        # protection
        if not isinstance(record, RecordThesis):
            return

        if not ("502" in record and "b" in record["502"]):
            return

        # CPPM: fix the name of Aix-Marseille university
        # use the expression computed once at construction, as the
        # other methods do, rather than rebuilding it for each record
        if "CPPM" in self.reg_institute:

            # the name depends on the year of the defence
            m = REG_YEAR.search(record.these_defense() or "")
            if m is None:
                return

            # NOTE(review): the spelling of the first name is kept as
            # found in the original code -- confirm before changing it
            if int(m.group(1)) < 2012:
                university = "Université de la Méditerrannée Aix-Marseille II"
            else:
                university = "Aix Marseille Université"

            def convert(value):
                return university if "Marseille" in value else value

        # Other: replace U. by University
        else:
            university = current.T(UNIVERSITY, lazy=False)

            def convert(value):
                return value.replace('U.', university)

        names = record["502"]["b"]

        if isinstance(names, str):
            record["502"]["b"] = convert(names)

        elif isinstance(names, list):
            for i, name in enumerate(names):
                names[i] = convert(name)
615

616
    def get_my_authors(self, record, sep=", ", sort=False):
617
        """Get authors of my institutes signing the record.
618 619
        The information is append to the Record object via the attribute
        ``my_authors``.
620

621 622
        Args:
            record (RecordPubli): record describing a publication.
623
            sep (str):
624 625 626 627
                string separating author names. The default is the comma.
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record
628

629
        Returns:
630
        strcode: the list of authors separated by the ``sep`` argument.
631

632 633
        Raises:
            CheckException: when the list is empty
634 635 636 637

        """
        # might have been computed when affiliation is checked
        rec_id = record.id()
638 639 640
        if rec_id in self._my_authors:
            li = self._my_authors[rec_id]
            value = sep.join(li)
641 642 643 644

        # find authors of my institute signing the record
        else:
            reg_institute = self.reg_institute
645 646
            value = \
                record.find_authors_by_affiliation(reg_institute, sep, sort)
647

648
        if len(value) == 0:
649 650
            raise CheckException(MSG_NO_MY_AUTHOR)

LE GAC Renaud's avatar
LE GAC Renaud committed
651
        record.my_authors = value
652

653 654 655
    def is_conference(self, record):
        """Verify that the record is a conference talk / proceeding,
        namely an instance of ``RecordConf``.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the record is not associated to a conference.

        """
        if isinstance(record, RecordConf):
            return

        raise CheckException(MSG_NO_CONF)

    def is_thesis(self, record):
        """Verify that the record is a thesis,
        namely an instance of ``RecordThesis``.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the record does not describe a thesis.

        """
        if isinstance(record, RecordThesis):
            return

        raise CheckException(MSG_NO_THESIS)

679 680 681 682 683 684 685
    def my_affiliation(
            self,
            record,
            id_project,
            id_team,
            fmt_rescue="F. Last",
            sort=False):
686 687 688 689
        """Check that authors of my institute are signatories.

        Launch a recovery procedure when affiliations are not defined.
        It is based on the author rescue list stored in the database.
690

691
        Args:
692 693 694 695 696 697 698 699 700
            record (RecordPubli):
                record describing a publication.

            id_project (int):
                identifier of the project in the database

            id_team (int):
                identifier of the team in the database

701 702
            fmt_rescue (str):
                the format for the authors used in the rescue list
703

704 705 706 707 708 709 710 711
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Return
            str:
                * the found affiliation
                * an empty string when the rescue list is used.
712

713
        Raises:
714 715 716 717
            CheckException:
                when the rescue list is required but empty
                or because the intersection between the rescue list
                and the author is null.
718 719

        """
720
        value = record.find_affiliation(self.reg_institute)
721
        if len(value) > 0:
722
            return value
723

724 725 726 727 728
        # affiliation is not defined
        # try to recover using the authors rescue list
        rescue_list = self._get_author_rescue_list(record, id_project, id_team)
        if not rescue_list:
            raise CheckException(MSG_NO_MY_AUTHOR)
729

730
        # format the author in the same way as the rescue list
731 732 733 734
        fmt_ref = record._last_fmt_author
        record.reformat_authors(fmt_rescue)

        if sort:
735
            authors = (record[u"authors"][["last_name", "fmt_name"]]
736 737 738 739
                       .sort_values(by="last_name")
                       .fmt_name)

        else:
740
            authors = (record[u"authors"].fmt_name
741 742 743 744
                       .sort_index())

        # go back to the origin formatting
        record.reformat_authors(fmt_ref)
745

746 747
        # compute the intersection between the authors and the rescue list
        intersection = set(authors) & set(rescue_list)
748

749
        if len(intersection) == 0:
750
            raise CheckException(MSG_NO_MY_AUTHOR)
751

752
        # cache the result for a latter use
753 754
        self._my_authors[record.id()] = list(intersection)

755
        return ""
756 757 758 759 760

    def paper_reference(self, record):
        """Check that editor, page, volume and paper year are defined
        for a published paper. Repair it as far as possible.

761 762 763
        Note:
            It is recommended to call this method when the
            erratum are removed.
764

765 766
        Args:
            record (RecordPubli): record describing a publication.
767

768 769
        Raises:
            CheckException: when the paper reference is not well formed.
770 771 772 773 774 775

        """
        if not record.is_published():
            return

        # list of reference (paper with erratum)
776 777
        refs = record["773"]
        if not isinstance(record["773"], list):
778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805
            refs = [refs]

        # INSPIREHEP
        # fix the following case:
        #    {'p': u'Phys.Lett.B'},
        #    {'y': u'2013', 'p': u'Phys.Lett.B', 'c': u'6-12', 'v': u'B720'}

        if len(refs) == 2 and record.host().startswith('inspirehep'):
            nkeys = [len(di) for di in refs]
            if nkeys.count(1):
                index = nkeys.index(1)
                if 'p' in refs[index]:
                    refs.pop(index)

        # do we have all fields defining a reference
        # editor, volume, year and page
        for ref in refs:

            # proceeding can only contains "w" and "y" keys
            if "w" in ref:
                continue

            # check full reference
            for k in ("p", "v", "y", "c"):
                if k not in ref:
                    self._repair_paper_reference(record)
                    return

806 807 808 809 810 811 812 813
    def publisher(self, record):
        """Check publisher.
        Have a look to the synonyms when the publisher does not exist.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
814
            CheckException: when the publisher is not defined
815
                nor entered as a synonym.
816 817 818 819 820 821 822 823

        """
        db = self.db
        val = record.paper_editor()
        if not val:
            return

        # erratum -- check the first entry
824
        if isinstance(val, list):
825 826
            val = val[0]

827 828 829 830 831
        # convert ToolException to CheckExcpetion
        try:
            search_synonym(db.publishers, "abbreviation", val)
        except ToolException as e:
            raise CheckException(*e.args)
832

833 834 835 836
    def recover_oai(self, record, host):
        """Rebuild the OAI identifier when it is not defined
        or not well formed.

        The identifier is built from the host and the record id.
        It is stored in the field ``0248``, subfield ``a``
        for ``cds.cern.ch`` and in the field ``909CO``, subfield ``o``
        for ``inspirehep.net``.

        Args:
            record (RecordPubli): record describing a publication.
            host (str): possible values are ``cds.cern.ch``
                or ``inspirehep.net``

        Raises:
            ValueError: when the OAI has to be rebuilt and the host
                is not one of the possible values.

        """
        # Note:
        # For the record cds 1951625, possible values are:
        # oai:cds.cern.ch:1951625 (if it does not exist in inspirehep)
        # oai:cds.cern.ch:1951625, oai:inspirehep.net:1319638 (if it exist
        # in both store)
        # In all the case the first OAI corresponds to the record.id()
        #
        oai = record.oai()
        if oai is not None and REG_OAI.match(oai):
            return

        # where the OAI is stored, for each store
        locations = (("cds.cern.ch", ("0248", "a")),
                     ("inspirehep.net", ("909CO", "o")))

        for known_host, location in locations:
            if host == known_host:
                field, subfield = location
                break

        else:
            raise ValueError(MSG_INVALID_HOST)

        if field not in record:
            record[field] = dict()

        record[field][subfield] = OAI_INVENIO % (host, record.id())

868
    def submitted(self, record):
869
        """Standardise the submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.
870 871
        Look for alternative when it is not defined.

872 873
        Note:
            After this check the year submitted contains one entry.
874

875 876
        Args:
            record (RecordPubli): record describing a publication.
877

878 879 880
        Raises:
            CheckException: when the date is not well formed or when more
                than one date are found.
881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897

        """
        dates = record.submitted()

        # recover missing date using conference, preprint, thesis information
        if not dates:
            val = self._recover_submitted(record)
            if not val:
                raise CheckException(MSG_NO_DATE)
            dates.append(val)

        # check that date are well formed
        for i in range(len(dates)):

            # 22 Mar 2011
            m = DECODE_DD_MMM_YYYY.match(dates[i])
            if m:
898 899
                data = (m.group(3), MONTHS[m.group(2)], int(m.group(1)))
                dates[i] = '%s-%s-%02i' % data
900 901 902 903 904
                continue

            # 22 03 2011
            m = DECODE_DD_MM_YYYY.match(dates[i])
            if m:
LE GAC Renaud's avatar
LE GAC Renaud committed
905
                data = (m.group(3), int(m.group(2)), int(m.group(1)))
906
                dates[i] = '%s-%02i-%02i' % data
907