checkandfix.py 25.7 KB
Newer Older
1
# -*- coding: utf-8 -*-
2
""" harvest_tools.checkandfix
3 4

"""
5
import numpy as np
6 7 8
import re
import regex

9 10
from .base import search_synonym, ToolException
from .exception import CheckException
11
from gluon import current
12 13 14 15 16 17 18 19
from invenio_tools import (DECODE_REF,
                           MSG_NO_CONF,
                           MSG_NO_THESIS,
                           OAI_URL,
                           RecordConf,
                           RecordThesis,
                           REG_OAI,
                           REG_YEAR)
20 21 22 23

from invenio_tools.recordpubli import PAPER_REFERENCE_KEYS

from itertools import imap
24
from plugin_dbui import CLEAN_SPACES, get_id
25 26


27
# arXiv preprint number, e.g. "arXiv:1405.1234": the two groups are the
# two digits of the year and the two digits of the month.
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")

# Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")

# Date reduced to a bare four-digit year
DECODE_YYYY = re.compile(r"^(\d{4})$")
33

34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
# Three-letter month abbreviation -> two-digit month number.
# Several abbreviations map to the same number (Feb / Fev, Apr / Avr,
# May / Mai).
MONTHS = {
    "Jan": "01",
    "Feb": "02",
    "Fev": "02",
    "Mar": "03",
    "Apr": "04",
    "Avr": "04",
    "May": "05",
    "Mai": "05",
    "Jun": "06",
    "Jul": "07",
    "Aug": "08",
    "Sep": "09",
    "Oct": "10",
    "Nov": "11",
    "Dec": "12",
}
49

50 51
# Messages carried by the exceptions raised in this module
MSG_INVALID_HOST = "Invalid host"

# missing information
MSG_NO_AUTHOR = "Reject no author(s)"
MSG_NO_CONF_DATE = "Reject no conference date"
MSG_NO_DATE = "Reject no submission date"
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
MSG_NO_REF = "Reject incomplete paper reference. Check "
MSG_NO_YEAR = "Reject no publication year"

MSG_TEMPORARY_RECORD = "Temporary record"

# duplicated information
MSG_TO_MANY_DATE = "Reject to many submit date"
MSG_TO_MANY_FAUTHOR = "Reject to many first author"
MSG_TO_MANY_YEAR = "Reject to many year"

# information which is not well formed
MSG_WELL_FORMED_CONF_DATES = "Reject conference dates is not well formed"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
MSG_WELL_FORMED_EDITOR = "Reject editor is not well formed"

# Template of an OAI identifier: filled with (host, record id)
OAI_INVENIO = "oai:%s:%s"

72 73
REG_COLLABORATION = re.compile(regex.REG_COLLABORATION)

# Conference dates within a single month, e.g. "12 - 15 Mar 2014".
# Groups: first day, last day, month abbreviation, year.
# (raw string: "\d" is not a valid escape sequence in a plain literal)
REG_CONF_DATES_1 = re.compile(r"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")

# Conference dates spanning two months, e.g. "29 Feb - 1 Mar 2014".
# Groups: first day, first month, last day, last month, year.
REG_CONF_DATES_2 = re.compile(
    r"(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")

REG_CONF_DATES = re.compile(regex.REG_CONF_DATES)

# DOI of the form xx.yyyy/Publisher.Volume.Page
REG_DOI = re.compile(r"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)")

REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)

# Word replacing the abbreviation "U." in the name of a university
# (passed through the translator before use)
UNIVERSITY = "University"

86 87

class CheckAndFix(object):
88 89
    """A collection of tools to check and repair the content
    of the Marc12 record.
90

91 92
    """
    def __init__(self):
        """Bind the database, build the regular expression for the
        affiliation of my institute and initialise the private caches.

        """
        # private cache for the my_author rescue list:
        # the parameters of the last request and the list found for them
        self.__par = None
        self.__reference = None

        # private cache for my authors list, the key is the record id
        self._my_authors = {}

        # the database has to be bound before building the regular
        # expression since the latter reads the affiliation keys
        self.db = current.db
        self.reg_institute = self._get_reg_institute()
103 104

    def _get_reg_institute(self):
        r"""Get the regular expression defining the affiliation of my institute.

        The expression ``current.app.reg_institute`` is returned when it
        is set. Otherwise, the expression is built from the table
        ``affiliation_keys``:

            * the characters ``(``, ``)``, ``&``, ``$``, ``+`` and ``?``
              found in a key are protected by a backslash;
            * each key is enclosed between *start of string or |* and
              *end of string or |* for an exact match;
            * the alternatives are concatenated with ``|``.

        The expression built in that way is returned but it is not stored
        in ``current.app``.

        Returns:
            str:
                the regular expression.

        """
        # alias
        db = self.db

        reg_institute = current.app.reg_institute
        if reg_institute:
            return reg_institute

        alternatives = []
        for row in db(db.affiliation_keys.id > 0).iterselect():
            key = row.key_u

            # protect special characters
            for char in "()&$+?":
                key = key.replace(char, "\\" + char)

            # add start and end of string for an exact match
            alternatives.append(r"(^|\|){}($|\|)".format(key))

        return r"|".join(alternatives)

    def _get_author_rescue_list(self, record, id_project, id_team):
        """Get the rescue list for my authors.

        The list is read in the table ``my_authors`` for the year of the
        publication, the project and the team. The year is the submitted
        one. When it is not defined, it is recovered from the publication
        year, the opening date or the closing date of the conference.

        The last answer is cached and reused when the same year, project
        and team are requested again.

        Args:
            record (RecordPubli):
                record describing a publication.

            id_project (int):
                identifier of the project in the database.

            id_team (int):
                identifier of the team in the database.

        Returns:
            list:
                empty when not defined

        """
        year = record.submitted()

        # try to recover year when not defined
        if not year:
            # published article, proceeding
            if record[u"publication_info"].year.iloc[0] != "":
                year = record[u"publication_info"].year.iloc[0]

            # start date of a conference
            elif record._get(u"meeting_name", u"opening_date") != u"":
                year = record._get(u"meeting_name", u"opening_date")

            # end date of a conference
            elif record._get(u"meeting_name", u"closing_date") != u"":
                year = record._get(u"meeting_name", u"closing_date")

            else:
                return []

        # protection
        # submitted and paper year are protect against erratum, but ...
        # keep the smallest value without modifying the list of the record
        if isinstance(year, list):
            year = sorted(year)[0]

        # the value can have several format 1992, 1992-12-31, ....
        m = REG_YEAR.search(year)
        if not m:
            return []

        year = m.group(1)

        # caching
        key = (year, id_project, id_team)
        if key == self.__par:
            return self.__reference

        # extract the list from the database
        row = self.db.my_authors(year=year,
                                 id_projects=id_project,
                                 id_teams=id_team)

        if row:
            reference = row['authors'].strip("\n").split(', ')
        else:
            reference = []

        # remember the request together with its answer,
        # otherwise the cache is never used
        self.__par = key
        self.__reference = reference

        return reference
214

215 216 217 218
    def _is_synonym(self, tablename, value):
        """Check that the synonym field contains *value*.

        Args:
219 220
            tablename (str): name of the database table
            value (str): value to be searched
221 222 223 224 225 226 227 228 229 230 231 232 233 234

        Returns:
            bool: ``True`` if *one* row is found, ``False`` otherwise.

        """
        db = self.db
        table = db[tablename]

        query = table.synonyms.contains(value)
        if db(query).count() == 1:
            return True

        return False

235 236
    @staticmethod
    def _recover_submitted(record):
        """Recover submitted date using conference, preprint or thesis
        information.

            * conference: the opening date, otherwise the first day of
              the conference dates;
            * thesis: the defense date;
            * other: the year and month encoded in the arXiv preprint
              number.

        Args:
            record (RecordPubli):
                record describing a publication.

        Returns:
            unicode:
                empty when procedure failed

        """
        val = u""

        if isinstance(record, RecordConf):
            meeting = record[u"meeting_name"]

            # CDS opening date is encoded as 20141231
            if u"opening_date" in meeting:
                val = meeting[u"opening_date"]
                val = "%s-%s-%s" % (val[0:4], val[4:6], val[6:8])

            # CDS / INSPIREHEP date
            # date is encoded as 12 - 15 Mar 2014 or 29 Feb - 1 Mar 2014
            # decode as DD-MMM-YYYY
            elif u"date" in meeting:
                val = meeting[u"date"]

                m1 = REG_CONF_DATES_1.match(val)
                m2 = REG_CONF_DATES_2.match(val)

                if m1:
                    val = u"%s-%s-%s" % (m1.group(1), m1.group(3), m1.group(4))

                # dates spanning two months: the groups have to be read
                # from m2 since m1 is None in this branch
                elif m2:
                    val = u"%s-%s-%s" % (m2.group(1), m2.group(2), m2.group(5))

        elif isinstance(record, RecordThesis):
            val = record.these_defense()

        else:
            report = record.preprint_number()
            if report:
                m_arxiv = DECODE_ARXIV.match(report)
                if m_arxiv:
                    val = "20%s-%s" % (m_arxiv.group(1), m_arxiv.group(2))

        return val

284 285
    @staticmethod
    def authors(record):
286
        """Check that author fields are defined.
287

288
        Args:
289 290
            record (RecordPubli):
                record describing a publication.
291

292
        Raises:
293 294
            CheckException:
                when there is no authors.
295 296 297

        """

298
        if not record.is_authors():
299 300 301
            raise CheckException(MSG_NO_AUTHOR)

    def collaboration(self, record):
302 303
        """Check the collaboration.
        Have a look to the synonyms when the collaboration is not well formed.
304

305
        Args:
306 307
            record (RecordPubli):
                record describing a publication.
308

309
        Raises:
310 311
            CheckException:
                when the collaboration value is defined
312
                nor entered as a synonym.
313 314 315

        """
        val = record.collaboration()
316 317 318
        if not val:
            return

319
        db = self.db
320 321 322

        try:
            search_synonym(db.collaborations, "collaboration", val)
323

324 325 326
        except ToolException as e:
            raise CheckException(*e.args)

327
    def country(self, record):
        """Check the country of the conference.

        The check is only applied to conference records. The country is
        searched in the table ``countries``, synonyms included, by
        ``search_synonym``.

        Args:
            record (RecordConf): record describing a talk or a proceeding.

        Raises:
            CheckException: when the search fails. The arguments of the
                underlying ``ToolException`` are propagated.

        """
        if isinstance(record, RecordConf):
            db = self.db
            country = record.conference_country()

            # convert ToolException to CheckException
            try:
                search_synonym(db.countries, "country", country)

            except ToolException as err:
                raise CheckException(*err.args)
349

350
    def conference_date(self, record, host):
        """Check the dates of the conference.

        The check is only applied to conference records:

            * ``inspirehep.net``: the dates have to be defined;
            * other host (``cds.cern.ch``): the subfield ``d`` of the
              field ``111`` has to be defined. When it does not match
              ``REG_CONF_DATES``, it is rewritten as ``12-15 Mar 2014``
              or ``29 Feb - 1 Mar 2014``.

        Args:
            record (RecordConf): record describing a talk or a proceeding.
            host (str): possible values ares ``cds.cern.ch``
                or ``inspirehep.net``

        Raises:
            CheckException: when dates are not found or not well formed.

        """
        # conference information are available, i.e proceeding
        if not isinstance(record, RecordConf):
            return

        # inspirehep.net
        if host == "inspirehep.net":
            if len(record.conference_dates()) == 0:
                raise CheckException(MSG_NO_CONF_DATE)

            return

        # cds.cern.ch
        has_dates = "111" in record and "d" in record["111"]
        if not has_dates:
            raise CheckException(MSG_NO_CONF_DATE)

        value = record["111"]["d"]
        if REG_CONF_DATES.match(value):
            return

        # 12 - 15 Mar 2014 or 29 Feb - 1 Mar 2014
        rewriters = ((REG_CONF_DATES_1, "%s-%s %s %s"),
                     (REG_CONF_DATES_2, "%s %s - %s %s %s"))

        for reg, fmt in rewriters:
            found = reg.match(value)
            if found:
                record["111"]["d"] = fmt % found.groups()
                return

        raise CheckException(MSG_WELL_FORMED_CONF_DATES)

396
    def is_bad_oai_used(self, record):
        """Bad OAI is when the ``id`` in the OAI field is different from
        the ``record id``. This happens when an old record is redirected
        to new one.

        Args:
            record (RecordPubli):
                record describing a publication.

        Returns:
            bool:
                ``True`` when a record is found in the database with
                the bad OAI. ``False`` otherwise, including the case
                where the OAI is not defined or not well formed.

        """
        value = record.oai()

        # the OAI can be missing or not well formed (see recover_oai).
        # In that case there is no bad OAI to look for.
        match = REG_OAI.match(value) if value else None
        if match is None:
            return False

        if int(match.group(2)) == record.id():
            return False

        # a record with the bad OAI exists in the database
        bad_oai_url = OAI_URL % (match.group(1), match.group(2))
        if get_id(self.db.publications, origin=bad_oai_url):
            return True

        return False
423

424 425
    @staticmethod
    def format_authors(record, fmt="Last, First"):
426
        """Format the author names.
427

428 429
        Args:
            record (RecordPubli): record describing a publication.
430 431 432 433
            fmt (str):
                define the format for author names.
                Possible values are "First, Last", "F. Last", "Last",
                "Last, First" and "Last F."
434 435

        """
436
        record.reformat_authors(fmt)
437

438 439 440 441
    @staticmethod
    def format_editor(record):
        """Format the editor abbreviation.
        The encoding depends on the store::

            INVENIO:    Phys. Lett. B + volume 673
            INSPIREHEP: Phys.Lett + volume B673

        Standardise the answer as ``Phys. Lett. B``.
        Nothing is done when the record is not published.

        Args:
            record (RecordPubli):
                record describing a publication.

        """
        if not record.is_published():
            return

        # first row of the publication information
        row = record[u"publication_info"].iloc[0]
        volume = row.volume

        # add space after the dot  Phys.Rev -> Phys. Rev
        editor = re.sub(r'\.([A-Z])', r'. \1', row.title)

        # move the volume letter to the editor, unless the editor
        # already ends with it
        found = re.match(r'([A-Z]+) *(\d+)', volume)
        if found:
            letter = found.group(1)
            if letter != editor[-1]:
                editor = "%s %s" % (editor, letter)
                volume = found.group(2)

        # remove stupid mistake
        editor = CLEAN_SPACES(editor)

        # NOTE(review): the values are assigned to the row returned by
        # iloc[0]. Confirm that they reach the DataFrame of the record.
        row[["title", "volume"]] = [editor, volume]
478

479 480 481
    def format_universities(self, record):
        """Format the name of the university for PhD:

            * Fix the name of Aix-Marseille University when the regular
              expression of my institute contains ``CPPM``. The name
              depends on the year of the defense (before 2012 or not).
            * Otherwise, replace U. by the translation of University

        The subfield ``b`` of the field ``502`` is modified. It can be
        a string or a list of strings.

        Args:
            record (RecordThesis): record describing a thesis.

        """
        # protection
        if not isinstance(record, RecordThesis):
            return

        # CPPM: fix the name of Aix-Marseille university
        if self._get_reg_institute().find("CPPM") != -1:

            year = REG_YEAR.search(record.these_defense()).group(1)
            if int(year) < 2012:
                university = "Université de la Méditerrannée Aix-Marseille II"
            else:
                university = "Aix Marseille Université"

            keyword = "Marseille"

            def fix(value):
                return university

        # Other: replace U. by University
        else:
            university = current.T(UNIVERSITY, lazy=False)
            keyword = "U."

            def fix(value):
                return value.replace('U.', university)

        if not ("502" in record and "b" in record["502"]):
            return

        # a single university
        if isinstance(record["502"]["b"], str):
            name = record["502"]["b"]
            if keyword in name:
                record["502"]["b"] = fix(name)

        # several universities: elements are replaced in place
        elif isinstance(record["502"]["b"], list):
            for i, name in enumerate(record["502"]["b"]):
                if keyword in name:
                    record["502"]["b"][i] = fix(name)
531

532
    def get_my_authors(self, record, sep=", ", sort=False):
533
        """Get authors of my institutes signing the record.
534 535
        The information is append to the Record object via the attribute
        ``my_authors``.
536

537
        Args:
538 539 540 541
            record (RecordPubli):
                record describing a publication.

            sep (unicode):
542
                string separating author names. The default is the comma.
543

544 545 546
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record
547

548
        Returns:
549 550
            unicode:
                the list of authors separated by the ``sep`` argument.
551

552
        Raises:
553 554
            CheckException:
                when the list is empty
555 556 557 558

        """
        # might have been computed when affiliation is checked
        rec_id = record.id()
559 560 561
        if rec_id in self._my_authors:
            li = self._my_authors[rec_id]
            value = sep.join(li)
562 563 564 565

        # find authors of my institute signing the record
        else:
            reg_institute = self.reg_institute
566 567
            value = \
                record.find_authors_by_affiliation(reg_institute, sep, sort)
568

569
        if len(value) == 0:
570 571
            raise CheckException(MSG_NO_MY_AUTHOR)

LE GAC Renaud's avatar
LE GAC Renaud committed
572
        record.my_authors = value
573

574 575
    @staticmethod
    def is_conference(record):
        """Verify that the record is a conference talk / proceeding,
        namely an instance of ``RecordConf``.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the record is not associated to a conference.

        """
        if isinstance(record, RecordConf):
            return

        raise CheckException(MSG_NO_CONF)

588 589
    @staticmethod
    def is_thesis(record):
        """Verify that the record is a thesis, namely an instance
        of ``RecordThesis``.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: when the record does not describe a thesis.

        """
        if isinstance(record, RecordThesis):
            return

        raise CheckException(MSG_NO_THESIS)

602 603 604 605 606 607 608
    def my_affiliation(
            self,
            record,
            id_project,
            id_team,
            fmt_rescue="F. Last",
            sort=False):
609 610 611 612
        """Check that authors of my institute are signatories.

        Launch a recovery procedure when affiliations are not defined.
        It is based on the author rescue list stored in the database.
613

614
        Args:
615 616 617 618 619 620 621 622 623
            record (RecordPubli):
                record describing a publication.

            id_project (int):
                identifier of the project in the database

            id_team (int):
                identifier of the team in the database

624 625
            fmt_rescue (str):
                the format for the authors used in the rescue list
626

627 628 629 630 631 632 633 634
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Return
            str:
                * the found affiliation
                * an empty string when the rescue list is used.
635

636
        Raises:
637 638 639 640
            CheckException:
                when the rescue list is required but empty
                or because the intersection between the rescue list
                and the author is null.
641 642

        """
643
        value = record.find_affiliation(self.reg_institute)
644
        if len(value) > 0:
645
            return value
646

647 648 649 650 651
        # affiliation is not defined
        # try to recover using the authors rescue list
        rescue_list = self._get_author_rescue_list(record, id_project, id_team)
        if not rescue_list:
            raise CheckException(MSG_NO_MY_AUTHOR)
652

653
        # format the author in the same way as the rescue list
654 655 656 657
        fmt_ref = record._last_fmt_author
        record.reformat_authors(fmt_rescue)

        if sort:
658
            authors = (record[u"authors"][["last_name", "fmt_name"]]
659 660 661 662
                       .sort_values(by="last_name")
                       .fmt_name)

        else:
663
            authors = (record[u"authors"].fmt_name
664 665 666 667
                       .sort_index())

        # go back to the origin formatting
        record.reformat_authors(fmt_ref)
668

669 670
        # compute the intersection between the authors and the rescue list
        intersection = set(authors) & set(rescue_list)
671

672
        if len(intersection) == 0:
673
            raise CheckException(MSG_NO_MY_AUTHOR)
674

675
        # cache the result for a latter use
676 677
        self._my_authors[record.id()] = list(intersection)

678
        return ""
679

680 681
    @staticmethod
    def paper_reference(record):
682
        """Check that editor, page, volume and paper year are defined
683
        for a published paper. Repair it from doi when possible.
684

685
        Args:
686 687
            record (RecordPubli):
                record describing a publication.
688

689
        Raises:
690 691
            CheckException:
                when the paper reference is not well formed.
692 693

        """
694
        if record.is_published():
695 696
            return

697 698 699 700
        # paper reference can be incomplete or missing
        # is the paper published ? In that case the doi is defined
        if u"doi" not in record:
            return
701

702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738
        # what information is missing ?
        # * df.columns are title, volume, year and pagination
        # * df can contains one or more rows due to erratum.
        # * assume that the first row is the oldest one and corresponds tp
        #   the first publication
        # * the row contains empty string when the record is not published.
        # * iloc[0] returns a serie where the index are the column's name
        #
        columns = (record[u"publication_info"].iloc[0]
                   .replace("", np.nan)
                   .dropna()
                   .index)

        missing = PAPER_REFERENCE_KEYS.difference(columns)

        # try to recover from the doi when it has the form
        # xx.yyyy/Publisher.Volume.Page
        m = REG_DOI.match(record[u"doi"])
        if not m:
            raise ToolException(MSG_NO_REF + str(list(missing)))

        for subfield in missing:
            if subfield == "title":

                # transform PhysRevD in Phys. Rev. D
                li = re.split(r"([A-Z][a-z]+)", m.group(1))
                title = ". ".join([el for el in li if len(el) > 0])
                record[u"publication_info"].loc[0, u"title"] = title

            elif subfield == "volume":
                record[u"publication_info"].loc[0, u"volume"] = m.group(2)

            elif subfield == "pagination":
                record[u"publication_info"].loc[0, u"pagination"] = m.group(3)

            elif subfield == "year":
                raise ToolException(MSG_NO_REF + "[year]")
739

740 741 742 743 744
    def publisher(self, record):
        """Check publisher.
        Have a look to the synonyms when the publisher does not exist.

        Args:
745 746
            record (RecordPubli):
                record describing a publication.
747 748

        Raises:
749 750
            CheckException:
                when the publisher is not defined nor entered as a synonym.
751 752 753

        """
        val = record.paper_editor()
754
        if len(val) == 0:
755 756
            return

757 758
        # convert ToolException to CheckExcpetion
        try:
759
            db = self.db
760
            search_synonym(db.publishers, "abbreviation", val)
761

762 763
        except ToolException as e:
            raise CheckException(*e.args)
764

765 766
    @staticmethod
    def recover_oai(record, host):
        """Rebuild the OAI identifier when it is not defined
        or not well formed.

        The identifier is built from the host and the record id. It is
        stored in the field which depends on the host.

        Args:
            record (RecordPubli): record describing a publication.
            host (str): possible values ares ``cds.cern.ch``
                or ``inspirehep.net``

        Raises:
            ValueError: when the host is not one of the possible values.

        """
        # Note:
        # For the record cds 1951625, possible values are:
        # oai:cds.cern.ch:1951625 (if it does not exist in inspirehep)
        # oai:cds.cern.ch:1951625, oai:inspirehep.net:1319638 (if it exist
        # in both store)
        # In all the case the first OAI corresponds to the record.id()
        #
        oai = record.oai()
        if oai is not None and REG_OAI.match(oai):
            return

        # field and subfield containing the OAI for each host
        locations = {"cds.cern.ch": ("0248", "a"),
                     "inspirehep.net": ("909CO", "o")}

        if host not in locations:
            raise ValueError(MSG_INVALID_HOST)

        field, subfield = locations[host]

        if field not in record:
            record[field] = dict()

        record[field][subfield] = OAI_INVENIO % (host, record.id())

801
    def submitted(self, record):
        """Standardise the submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.
        Look for alternative when it is not defined.

        The decoded formats are ``22 Mar 2011`` and ``22 03 2011``.
        The abbreviation of the month is not case sensitive.
        When the date is reduced to the year, it is replaced by the one
        recovered from the conference, preprint or thesis information.

        Note:
            After this check the year submitted contains one entry.

        Args:
            record (RecordPubli):
                record describing a publication.

        Raises:
            CheckException:
                when the date is not found or not well formed,
                unknown month abbreviation included.

        """
        date = record.submitted()

        # recover missing date using conference, preprint, thesis information
        if len(date) == 0:
            date = self._recover_submitted(record)
            if len(date) == 0:
                raise CheckException(MSG_NO_DATE)

        # 22 Mar 2011
        m = DECODE_DD_MMM_YYYY.match(date)
        if m:
            # the regular expression accepts any three letters:
            # normalise the case and reject unknown abbreviation
            # instead of failing with a KeyError
            month = MONTHS.get(m.group(2).capitalize())
            if month is None:
                raise CheckException(MSG_WELL_FORMED_DATE)

            data = (m.group(3), month, int(m.group(1)))
            date = '%s-%s-%02i' % data

        # 22 03 2011
        m = DECODE_DD_MM_YYYY.match(date)
        if m:
            data = (m.group(3), int(m.group(2)), int(m.group(1)))
            date = '%s-%02i-%02i' % data

        # 2011
        m_year = DECODE_YYYY.match(date)
        if m_year:
            date = self._recover_submitted(record)

        # check the minimum requirement is 2001-05
        if not REG_SUBMITTED.match(date):
            raise CheckException(MSG_WELL_FORMED_DATE)

        record[u"prepublication"][u"date"] = date
848

849 850
    @staticmethod
    def temporary_record(record):
851 852
        """Some records are marked temporary.

853 854
        Args:
            record (RecordPubli): record describing a publication.
855

856 857
        Raises:
            CheckException: when the record is marked temporary
858 859

        """
860 861 862 863 864 865 866 867 868
        # INSPIREHEP
        # Can be find by using the XML syntax:
        #    http://inspirehep.net/search?500__a="*Temporary record*"
        #
        # or the corresponding JSON field:
        #    http://inspirehep.net/comment="*Temporary record*"
        #
        if u"comment" in record:
            if record[u"comment"] == u"*Temporary record*":
869
                raise CheckException(MSG_TEMPORARY_RECORD)