checkandfix.py 27.3 KB
Newer Older
1
# -*- coding: utf-8 -*-
2
""" harvest_tools.checkandfix
3 4

"""
5
import numpy as np
6 7 8
import re
import regex

9
from .base import search_synonym, ToolException
10
from datetime import datetime
11
from .exception import CheckException
12
from gluon import current
13 14 15 16 17 18 19 20
from invenio_tools import (DECODE_REF,
                           MSG_NO_CONF,
                           MSG_NO_THESIS,
                           OAI_URL,
                           RecordConf,
                           RecordThesis,
                           REG_OAI,
                           REG_YEAR)
21 22 23 24

from invenio_tools.recordpubli import PAPER_REFERENCE_KEYS

from itertools import imap
25
from plugin_dbui import CLEAN_SPACES, get_id
26 27


28
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")
29 30 31 32

# Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")
33
DECODE_YYYY = re.compile(r"^(\d{4})$")
34

35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
# Three-letter month abbreviations mapped onto the two-digit month number.
# Alternate (French) spellings are listed next to the English one.
MONTHS = {
    "Jan": "01",
    "Feb": "02", "Fev": "02",
    "Mar": "03",
    "Apr": "04", "Avr": "04",
    "May": "05", "Mai": "05",
    "Jun": "06",
    "Jul": "07",
    "Aug": "08",
    "Sep": "09",
    "Oct": "10",
    "Nov": "11",
    "Dec": "12",
}
50

51 52
# Diagnostic messages.
MSG_INVALID_HOST = "Invalid host"

# a mandatory piece of information is missing
MSG_NO_AUTHOR = "Reject no author(s)"
MSG_NO_CONF_DATE = "Reject no conference date"
MSG_NO_DATE = "Reject no submission date"
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
MSG_NO_REF = "Reject incomplete paper reference. Check "
MSG_NO_YEAR = "Reject no publication year"

MSG_TEMPORARY_RECORD = "Temporary record"

# a piece of information is found more than once
MSG_TO_MANY_DATE = "Reject to many submit date"
MSG_TO_MANY_FAUTHOR = "Reject to many first author"
MSG_TO_MANY_YEAR = "Reject to many year"

# a piece of information is not well formed
MSG_WELL_FORMED_CONF_DATES = "Reject conference dates is not well formed"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
MSG_WELL_FORMED_EDITOR = "Reject editor is not well formed"
70

71 72
OAI_INVENIO = "oai:%s:%s"

73 74
REG_COLLABORATION = re.compile(regex.REG_COLLABORATION)
# Conference dates within a single month, e.g. "12 - 15 Mar 2014".
# Groups: opening day, closing day, month, year.
# Raw strings are used so that "\d" is not read as a string escape.
REG_CONF_DATES_1 = re.compile(r"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")

# Conference dates over two months, e.g. "29 Feb - 1 Mar 2014".
# Groups: opening day, opening month, closing day, closing month, year.
REG_CONF_DATES_2 = \
    re.compile(r"(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
78

79
REG_CONF_DATES = re.compile(regex.REG_CONF_DATES)
80 81 82

REG_DOI = re.compile(r"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)")

83 84
REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)

85 86 87 88 89
# Well formed conference dates: "DD - DD Mmm YYYY" and
# "DD Mmm - DD Mmm YYYY". Raw strings keep "\d" out of string escaping.
REG_WELL_FORMED_CONF_DATES_1 = re.compile(r"\d{2} - \d{2} [A-Z][a-z]{2} \d{4}")

REG_WELL_FORMED_CONF_DATES_2 = \
    re.compile(r"\d{2} [A-Z][a-z]{2} - \d{2} [A-Z][a-z]{2} \d{4}")

90 91
UNIVERSITY = "University"

92 93

class CheckAndFix(object):
94 95
    """A collection of tools to check and repair the content
    of the Marc12 record.
96

97 98
    """
    def __init__(self):
        """Bind the database and initialise the internal caches."""
        # authors of my institute found with the rescue list,
        # the key is the record identifier
        self._my_authors = dict()

        # last rescue list read from the database and the
        # (year, id_project, id_team) tuple it corresponds to
        self.__reference = None
        self.__par = None

        # the database has to be bound first since it is read
        # when building the regular expression
        self.db = current.db
        self.reg_institute = self._get_reg_institute()
109

110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
    @staticmethod
    def _get_conference_dates(record):
        """Return the opening and closing dates of a conference.

        Args:
            record (RecordConf):
                record describing a conference proceeding or talk.

        Returns:
            tuple of datetime.date:
                opening and closing dates.

        Raise:
            ToolException:
                no conference date found.

        """
        if u"meeting_name" not in record:
            raise ToolException(MSG_NO_CONF_DATE)

        meeting = record[u"meeting_name"]
        meeting = (meeting[0] if isinstance(meeting, list) else meeting)

        # CDS has the opening and closing dates encoded as 20141231
        if u"opening_date" in meeting and u"closing_date" in meeting:

            fmt = "%Y%m%d"

            val = meeting[u"opening_date"]
            opening = datetime.strptime(val, fmt)

            val = meeting[u"closing_date"]
            closing = datetime.strptime(val, fmt)

            return (opening, closing)

        # both CDS and INSPIRE have the dates subfield
        val = meeting[u"date"]

        # date is encode as 12 - 15 Mar 2014
        m = REG_CONF_DATES_1.match(val)
        if m:

            fmt = "%d-%b-%Y"

            val = u"%s-%s-%s" % (m.group(1), m.group(3), m.group(4))
            opening = datetime.strptime(val, fmt)

            val = u"%s-%s-%s" % (m.group(2), m.group(3), m.group(4))
            closing = datetime.strptime(val, fmt)

            return (opening, closing)

        # dates are encoded 29 Feb - 1 Mar 2014
        m = REG_CONF_DATES_2.match(val)
        if not m:
            raise ToolException(MSG_NO_CONF_DATE)

        fmt = "%d-%b-%Y"

        val = u"%s-%s-%s" % (m.group(1), m.group(2), m.group(5))
        opening = datetime.strptime(val, fmt)

        val = u"%s-%s-%s" % (m.group(3), m.group(4), m.group(5))
        closing = datetime.strptime(val, fmt)

        return (opening, closing)

178
    def _get_reg_institute(self):
        """Get the regular expression defining the affiliation of my institute.

        The expression set at the application level
        (``current.app.reg_institute``) is returned when it is defined.
        Otherwise, it is obtained by concatenating the affiliation keys
        stored in the database.

        Affiliation key can contain the characters ``(``, ``)``, ``&``,
        ``$``, ``+`` or ``?``. They are protected by a backslash. Each key
        is surrounded by the start / end of string or by a ``|`` in order
        to get an exact match.

        Returns:
            str:
                the regular expression. It is empty when the application
                does not define it and no affiliation key is found.

        """
        reg_institute = current.app.reg_institute
        if reg_institute:
            return reg_institute

        # alias
        db = self.db

        lst = []
        for row in db(db.affiliation_keys.id > 0).iterselect():
            val = row.key_u

            # protect special characters
            for char in "()&$+?":
                val = val.replace(char, "\\" + char)

            # add start and end of string for an exact match
            lst.append(r"(^|\|){}($|\|)".format(val))

        return r"|".join(lst)

    def _get_author_rescue_list(self, record, id_project, id_team):
        """Get the rescue list for my authors.

        The list is selected in the database using the year, the project
        and the team. The year is the one of the submitted date or, when
        the latter is not defined, the publication year or the opening /
        closing date of the conference.

        The last list read from the database is cached and reused as long
        as the year, the project and the team do not change.

        Args:
            record (RecordPubli):
                record describing a publication.

            id_project (int):
                identifier of the project in the database.

            id_team (int):
                identifier of the team in the database.

        Returns:
            list:
                empty when not defined

        """
        year = record.submitted()

        # try to recover year when not defined
        if not year:
            # published article, proceeding
            if record[u"publication_info"].year.iloc[0] != "":
                year = record[u"publication_info"].year.iloc[0]

            # start date of a conference
            elif record._get(u"meeting_name", u"opening_date") != u"":
                year = record._get(u"meeting_name", u"opening_date")

            # end date of a conference
            elif record._get(u"meeting_name", u"closing_date") != u"":
                year = record._get(u"meeting_name", u"closing_date")

            else:
                return []

        #
        # protection
        # submitted and paper year are protect against erratum, but ...
        # keep the smallest value without sorting in place the list
        # which might belong to the record
        #
        if isinstance(year, list):
            year = (min(year) if year else u"")

        # the value can have several format 1992, 1992-12-31, ....
        m = REG_YEAR.search(year)
        if not m:
            return []

        year = m.group(1)

        # caching
        t = (year, id_project, id_team)
        if t == self.__par:
            return self.__reference

        # extract the list from the database
        row = self.db.my_authors(year=year,
                                 id_projects=id_project,
                                 id_teams=id_team)

        if row:
            self.__reference = row['authors'].strip("\n").split(', ')
        else:
            self.__reference = []

        # remember the parameters of the cached list, otherwise
        # the cache is never hit
        self.__par = t

        return self.__reference
288

289 290 291 292
    def _is_synonym(self, tablename, value):
        """Check that the synonym field contains *value*.

        Args:
293 294
            tablename (str): name of the database table
            value (str): value to be searched
295 296 297 298 299 300 301 302 303 304 305 306 307 308

        Returns:
            bool: ``True`` if *one* row is found, ``False`` otherwise.

        """
        db = self.db
        table = db[tablename]

        query = table.synonyms.contains(value)
        if db(query).count() == 1:
            return True

        return False

309
    def _recover_submitted(self, record):
        """Recover the submitted date from other information of the record.

        The date is by order of priority:

            * the opening date of the conference for a talk or a proceeding;
            * the defence date for a thesis;
            * the year and month encoded in the arXiv preprint number.

        Args:
            record (RecordPubli):
                record describing a publication.

        Returns:
            unicode:
                empty when no preprint number can be decoded.

        Raises:
            ToolException:
                the conference dates are not found for a talk
                or a proceeding.

        """
        # conference: opening date
        if isinstance(record, RecordConf):
            dates = self._get_conference_dates(record)
            return dates[0].strftime("%Y-%m-%d")

        # thesis: defence date
        if isinstance(record, RecordThesis):
            return record.these_defense()

        # preprint: arXiv:YYMM.
        report = record.preprint_number()
        if not report:
            return u""

        m_arxiv = DECODE_ARXIV.match(report)
        if not m_arxiv:
            return u""

        return "20%s-%s" % m_arxiv.groups()

340 341
    @staticmethod
    def authors(record):
342
        """Check that author fields are defined.
343

344
        Args:
345 346
            record (RecordPubli):
                record describing a publication.
347

348
        Raises:
349 350
            CheckException:
                when there is no authors.
351 352 353

        """

354
        if not record.is_authors():
355 356 357
            raise CheckException(MSG_NO_AUTHOR)

    def collaboration(self, record):
358 359
        """Check the collaboration.
        Have a look to the synonyms when the collaboration is not well formed.
360

361
        Args:
362 363
            record (RecordPubli):
                record describing a publication.
364

365
        Raises:
366 367
            CheckException:
                when the collaboration value is defined
368
                nor entered as a synonym.
369 370 371

        """
        val = record.collaboration()
372 373 374
        if not val:
            return

375
        try:
376
            search_synonym(self.db.collaborations, "collaboration", val)
377

378 379 380
        except ToolException as e:
            raise CheckException(*e.args)

381
    def country(self, record):
        """Check the country of the conference against the database.

        Records which are not a ``RecordConf`` are ignored. Otherwise
        the country is searched in the ``countries`` table via
        ``search_synonym``.

        Args:
            record (RecordConf):
                record describing a talk or a proceeding.

        Raises:
            CheckException:
                when the search of the country fails. It carries
                the arguments of the underlying ``ToolException``.

        """
        if isinstance(record, RecordConf):
            name = record.conference_country()

            try:
                search_synonym(self.db.countries, "country", name)

            # convert ToolException to CheckException
            except ToolException as err:
                raise CheckException(*err.args)
404

405 406
    def conference_date(self, record):
        """Check the conference dates and rewrite them in a standard form.

        Records which are not a ``RecordConf`` are ignored. Dates already
        encoded as ``DD - DD Mmm YYYY`` or ``DD Mmm - DD Mmm YYYY`` are
        left untouched. Otherwise the subfield ``date`` of the
        ``meeting_name`` field is replaced by one of these two forms.

        Args:
            record (RecordConf):
                record describing a talk or a proceeding.

        Raises:
            CheckException:
                dates are not found.

            ToolException:
                dates can not be decoded.

        """
        # conference information are only available for talk / proceeding
        if not isinstance(record, RecordConf):
            return

        dates = record.conference_dates()
        if len(dates) == 0:
            raise CheckException(MSG_NO_CONF_DATE)

        # nothing to do when the dates are well formed
        for reg in (REG_WELL_FORMED_CONF_DATES_1,
                    REG_WELL_FORMED_CONF_DATES_2):
            if reg.match(dates):
                return

        # format the dates properly
        opening, closing = self._get_conference_dates(record)
        month = opening.strftime("%b")

        if opening.month == closing.month:
            dates = "%02i - %02i %s %i" % (opening.day,
                                           closing.day,
                                           month,
                                           opening.year)
        else:
            dates = "%02i %s - %02i %s %i" % (opening.day,
                                              month,
                                              closing.day,
                                              closing.strftime("%b"),
                                              opening.year)

        meeting = record[u"meeting_name"]
        if isinstance(meeting, list):
            meeting = meeting[0]

        meeting[u"date"] = dates
450

451
    def is_bad_oai_used(self, record):
        """Bad OAI is when the ``id`` in the OAI field is different from
        the ``record id``. This happens when an old record is redirected
        to new one.

        Args:
            record (RecordPubli):
                record describing a publication.

        Returns:
            bool:
                ``True`` when a record is found in the database with
                the bad OAI. ``False`` otherwise, including when the OAI
                is not defined or not well formed.

        """
        value = record.oai()

        # the OAI can be missing or not well formed, in that case
        # there is nothing to compare with the record id
        match = (REG_OAI.match(value) if value else None)
        if match is None:
            return False

        if int(match.group(2)) != record.id():

            # a record with the bad OAI exists in the database
            bad_oai_url = OAI_URL % (match.group(1), match.group(2))
            if get_id(self.db.publications, origin=bad_oai_url):
                return True

        return False
478

479 480
    @staticmethod
    def format_authors(record, fmt="Last, First"):
481
        """Format the author names.
482

483 484
        Args:
            record (RecordPubli): record describing a publication.
485 486 487 488
            fmt (str):
                define the format for author names.
                Possible values are "First, Last", "F. Last", "Last",
                "Last, First" and "Last F."
489 490

        """
491
        record.reformat_authors(fmt)
492

493 494 495 496
    @staticmethod
    def format_editor(record):
        """Format the editor abbreviation.
        The encoding depends on the store::
497

498 499
            INVENIO:    Phys. Lett. B + volume 673
            INSPIREHEP: Phys.Lett + volume B673
500

501
        Standardise the answer as ``Phys. Lett. B``.
502

503
        Args:
504 505
            record (RecordPubli):
                record describing a publication.
506

507
        Raises:
508 509
            CheckException:
                when the editor is not well formed.
510 511 512 513 514

        """
        if not record.is_published():
            return

515
        df = record[u"publication_info"].iloc[0]
516

517 518
        editor = df.title
        volume = df.volume
519

520 521
        # add space after the dot  Phys.Rev -> Phys. Rev
        editor = re.sub(r'\.([A-Z])', r'. \1', editor)
522

523 524 525 526 527
        # get the volume letter
        m = re.match(r'([A-Z]+) *(\d+)', volume)
        if m and m.group(1) != editor[-1]:
            editor = "%s %s" % (editor, m.group(1))
            volume = m.group(2)
528

529 530
        # remove stupid mistake
        editor = CLEAN_SPACES(editor)
531

532
        df[["title", "volume"]] = [editor, volume]
533

534 535 536
    def format_universities(self, record):
        """Format the name of the university for PhD:

            * Fix the name of Aix-Marseille University when ``CPPM``
              appears in the regular expression of my institute
            * Otherwise replace U. by the translation of University

        The name is read from the subfield ``b`` of the field ``502``
        which can be either a string or a list of strings.

        Args:
            record (RecordThesis): record describing a thesis.

        """
        # protection
        if not isinstance(record, RecordThesis):
            return

        def update(convert):
            # apply convert to each university name. The name is
            # replaced only when convert returns something else than None.
            if not ("502" in record and "b" in record["502"]):
                return

            if isinstance(record["502"]["b"], str):
                name = convert(record["502"]["b"])
                if name is not None:
                    record["502"]["b"] = name

            elif isinstance(record["502"]["b"], list):
                for i, el in enumerate(record["502"]["b"]):
                    name = convert(el)
                    if name is not None:
                        record["502"]["b"][i] = name

        # CPPM: fix the name of Aix-Marseille university
        if "CPPM" in self._get_reg_institute():

            # the name depends on the year of the defence
            year = REG_YEAR.search(record.these_defense()).group(1)
            if int(year) < 2012:
                university = "Université de la Méditerrannée Aix-Marseille II"
            else:
                university = "Aix Marseille Université"

            update(lambda el: university if "Marseille" in el else None)

        # Other: replace U. by University
        else:
            university = current.T(UNIVERSITY, lazy=False)

            update(lambda el:
                   el.replace('U.', university) if "U." in el else None)
586

587
    def get_my_authors(self, record, sep=", ", sort=False):
588
        """Get authors of my institutes signing the record.
589 590
        The information is append to the Record object via the attribute
        ``my_authors``.
591

592
        Args:
593 594 595 596
            record (RecordPubli):
                record describing a publication.

            sep (unicode):
597
                string separating author names. The default is the comma.
598

599 600 601
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record
602

603
        Returns:
604 605
            unicode:
                the list of authors separated by the ``sep`` argument.
606

607
        Raises:
608 609
            CheckException:
                when the list is empty
610 611 612 613

        """
        # might have been computed when affiliation is checked
        rec_id = record.id()
614 615 616
        if rec_id in self._my_authors:
            li = self._my_authors[rec_id]
            value = sep.join(li)
617 618 619 620

        # find authors of my institute signing the record
        else:
            reg_institute = self.reg_institute
621 622
            value = \
                record.find_authors_by_affiliation(reg_institute, sep, sort)
623

624
        if len(value) == 0:
625 626
            raise CheckException(MSG_NO_MY_AUTHOR)

LE GAC Renaud's avatar
LE GAC Renaud committed
627
        record.my_authors = value
628

629 630
    @staticmethod
    def is_conference(record):
        """Verify that the record describes a conference talk / proceeding.

        Args:
            record (RecordPubli):
                record describing a publication.

        Raises:
            CheckException:
                the record is not an instance of ``RecordConf``.

        """
        if isinstance(record, RecordConf):
            return

        raise CheckException(MSG_NO_CONF)

645 646
    @staticmethod
    def is_thesis(record):
        """Verify that the record describes a thesis.

        Args:
            record (RecordPubli): record describing a publication.

        Raises:
            CheckException: the record is not an instance of
            ``RecordThesis``.

        """
        if isinstance(record, RecordThesis):
            return

        raise CheckException(MSG_NO_THESIS)

659 660 661 662 663 664 665
    def my_affiliation(
            self,
            record,
            id_project,
            id_team,
            fmt_rescue="F. Last",
            sort=False):
666 667 668 669
        """Check that authors of my institute are signatories.

        Launch a recovery procedure when affiliations are not defined.
        It is based on the author rescue list stored in the database.
670

671
        Args:
672 673 674 675 676 677 678 679 680
            record (RecordPubli):
                record describing a publication.

            id_project (int):
                identifier of the project in the database

            id_team (int):
                identifier of the team in the database

681 682
            fmt_rescue (str):
                the format for the authors used in the rescue list
683

684 685 686 687 688 689 690 691
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Return
            str:
                * the found affiliation
                * an empty string when the rescue list is used.
692

693
        Raises:
694 695 696 697
            CheckException:
                when the rescue list is required but empty
                or because the intersection between the rescue list
                and the author is null.
698 699

        """
700
        value = record.find_affiliation(self.reg_institute)
701
        if len(value) > 0:
702
            return value
703

704 705 706 707 708
        # affiliation is not defined
        # try to recover using the authors rescue list
        rescue_list = self._get_author_rescue_list(record, id_project, id_team)
        if not rescue_list:
            raise CheckException(MSG_NO_MY_AUTHOR)
709

710
        # format the author in the same way as the rescue list
711 712 713 714
        fmt_ref = record._last_fmt_author
        record.reformat_authors(fmt_rescue)

        if sort:
715
            authors = (record[u"authors"][["last_name", "fmt_name"]]
716 717 718 719
                       .sort_values(by="last_name")
                       .fmt_name)

        else:
720
            authors = (record[u"authors"].fmt_name
721 722 723 724
                       .sort_index())

        # go back to the origin formatting
        record.reformat_authors(fmt_ref)
725

726 727
        # compute the intersection between the authors and the rescue list
        intersection = set(authors) & set(rescue_list)
728

729
        if len(intersection) == 0:
730
            raise CheckException(MSG_NO_MY_AUTHOR)
731

732
        # cache the result for a latter use
733 734
        self._my_authors[record.id()] = list(intersection)

735
        return ""
736

737 738
    @staticmethod
    def paper_reference(record):
739
        """Check that editor, page, volume and paper year are defined
740
        for a published paper. Repair it from doi when possible.
741

742
        Args:
743 744
            record (RecordPubli):
                record describing a publication.
745

746
        Raises:
747 748
            CheckException:
                when the paper reference is not well formed.
749 750

        """
751
        if record.is_published():
752 753
            return

754 755 756 757
        # paper reference can be incomplete or missing
        # is the paper published ? In that case the doi is defined
        if u"doi" not in record:
            return
758

759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795
        # what information is missing ?
        # * df.columns are title, volume, year and pagination
        # * df can contains one or more rows due to erratum.
        # * assume that the first row is the oldest one and corresponds tp
        #   the first publication
        # * the row contains empty string when the record is not published.
        # * iloc[0] returns a serie where the index are the column's name
        #
        columns = (record[u"publication_info"].iloc[0]
                   .replace("", np.nan)
                   .dropna()
                   .index)

        missing = PAPER_REFERENCE_KEYS.difference(columns)

        # try to recover from the doi when it has the form
        # xx.yyyy/Publisher.Volume.Page
        m = REG_DOI.match(record[u"doi"])
        if not m:
            raise ToolException(MSG_NO_REF + str(list(missing)))

        for subfield in missing:
            if subfield == "title":

                # transform PhysRevD in Phys. Rev. D
                li = re.split(r"([A-Z][a-z]+)", m.group(1))
                title = ". ".join([el for el in li if len(el) > 0])
                record[u"publication_info"].loc[0, u"title"] = title

            elif subfield == "volume":
                record[u"publication_info"].loc[0, u"volume"] = m.group(2)

            elif subfield == "pagination":
                record[u"publication_info"].loc[0, u"pagination"] = m.group(3)

            elif subfield == "year":
                raise ToolException(MSG_NO_REF + "[year]")
796

797 798 799 800 801
    def publisher(self, record):
        """Check publisher.
        Have a look to the synonyms when the publisher does not exist.

        Args:
802 803
            record (RecordPubli):
                record describing a publication.
804 805

        Raises:
806 807
            CheckException:
                when the publisher is not defined nor entered as a synonym.
808 809 810

        """
        val = record.paper_editor()
811
        if len(val) == 0:
812 813
            return

814 815
        # convert ToolException to CheckExcpetion
        try:
816
            db = self.db
817
            search_synonym(db.publishers, "abbreviation", val)
818

819 820
        except ToolException as e:
            raise CheckException(*e.args)
821

822 823
    @staticmethod
    def recover_oai(record, host):
        """Rebuild the OAI identifier when it is missing or not well formed.

        The identifier ``oai:<host>:<record id>`` is written in the
        field / subfield which depends on the store.

        Args:
            record (RecordPubli): record describing a publication.
            host (str): possible values are ``cds.cern.ch``
                or ``inspirehep.net``

        Raises:
            ValueError: the host is not one of the possible values.

        """
        # Note:
        # For the record cds 1951625, possible values are:
        # oai:cds.cern.ch:1951625 (if it does not exist in inspirehep)
        # oai:cds.cern.ch:1951625, oai:inspirehep.net:1319638 (if it exist
        # in both store)
        # In all the case the first OAI corresponds to the record.id()
        #
        oai = record.oai()
        if oai is not None and REG_OAI.match(oai):
            return

        # field and subfield hosting the OAI for each store
        locations = {"cds.cern.ch": ("0248", "a"),
                     "inspirehep.net": ("909CO", "o")}

        if host not in locations:
            raise ValueError(MSG_INVALID_HOST)

        field, subfield = locations[host]

        if field not in record:
            record[field] = dict()

        record[field][subfield] = OAI_INVENIO % (host, record.id())

858
    def submitted(self, record):
859
        """Standardise the submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.
860 861
        Look for alternative when it is not defined.

862 863
        Note:
            After this check the year submitted contains one entry.
864

865
        Args:
866 867
            record (RecordPubli):
                record describing a publication.
868

869
        Raises:
870 871
            CheckException:
                when the date is not well formed or when more
872
                than one date are found.
873 874

        """
875
        date = record.submitted()
876 877

        # recover missing date using conference, preprint, thesis information
878 879 880
        if len(date) == 0:
            date = self._recover_submitted(record)
            if len(date) == 0:
881 882
                raise CheckException(MSG_NO_DATE)

883 884 885 886 887
        # 22 Mar 2011
        m = DECODE_DD_MMM_YYYY.match(date)
        if m:
            data = (m.group(3), MONTHS[m.group(2)], int(m.group(1)))
            date = '%s-%s-%02i' % data
888

889 890 891 892 893
        # 22 03 2011
        m = DECODE_DD_MM_YYYY.match(date)
        if m:
            data = (m.group(3), int(m.group(2)), int(m.group(1)))
            date = '%s-%02i-%02i' % data
894