recordpubli.py 25.4 KB
Newer Older
1 2 3 4 5 6
""" invenio_tools.recordpubli

"""
import re


7 8 9 10
from .base import (ARXIV,
                   ARXIV_PDF,
                   REG_ARXIV_NUMBER,
                   REG_YEAR)
11

12
from .exception import RecordException
13
from filters import CLEAN_COLLABORATION
14
from numpy import NaN
15
from pandas import concat, DataFrame
16
from plugin_dbui import as_list, CLEAN_SPACES
17
from .record import Record
18

19 20 21 22 23 24 25 26

# Author name formats accepted by RecordPubli.reformat_authors
AUTHOR_FORMATS = ["First, Last", "F. Last", "Last", "Last, First", "Last F."]

27 28 29 30 31 32 33
# Regular expressions decoding a publication reference into the named
# groups p (review), v (volume), c (pages) and y (year). One pattern per
# supported layout:
#  Phys. Rev. Lett. 113, 032001 (2014)
#  Eur. Phys. J. C (2014) 74:2883
_ref1 = r"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
_ref2 = r"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
DECODE_REF = list(map(re.compile, (_ref1, _ref2)))

34 35
# Message of the RecordException raised by RecordPubli.reformat_authors
# when the requested format is not listed in AUTHOR_FORMATS
MSG_INVALID_FMT = "Invalid format for author"

36 37 38
# Subfield codes of the MARC field 773 which, all together, make a
# complete paper reference: pages (c), review (p), volume (v) and year (y)
PAPER_REFERENCE_KEYS = {"c", "p", "v", "y"}

39 40
# Pattern splitting a first name into three groups: the first part, an
# optional hyphen separator and an optional second part.
# The number of first names is limited to two (others are ignored).
# NOTE(review): the extra module-level alias ``initial`` looks unused in
# this file -- confirm before removing it.
REG_INITIAL = initial = r"^(\w+)\.?(\-)* *(\w+)*\.?"
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66


def to_initial(x, y, z):
    """Build the initial(s) of a first name already split in three parts.

    Examples of the expected split and of the result::

        Albert            (x="Albert", y="", z="")          "A."
        Antonio Augusto   (x="Antonio", y="", z="Augusto")  "A. A."
        Jean-Pierre       (x="Jean", y="-", z="Pierre")     "J.-P."

    Args:
        x (str): first part of the first name
        y (str): separator found between the two parts (may be empty)
        z (str): second part of the first name (may be empty)

    Returns:
        str: the initial(s), each one followed by a dot.

    """
    head = x[0:1]

    # a single first name
    if z == "":
        return "%s." % head

    tail = z[0:1]

    # two first names without separator are joined by a space
    if y == "":
        return "%s. %s." % (head, tail)

    # keep the first character of the separator, e.g. "J.-P."
    return "%s.%s%s." % (head, y[0:1], tail)

67

68 69 70 71
def to_str(x):
    """Convert a list of strings into a single string separated by ``|``.

    Args:
        x: a list of strings or any other value.

    Returns:
        The elements joined by ``|`` when ``x`` is a list,
        otherwise ``x`` itself, unchanged.

    """
    if isinstance(x, list):
        return "|".join(x)

    return x


72
class RecordPubli(Record):
73 74
    """The MARC record describing a publication.
    Usual publications are article, preprint, proceeding, report and talk.
75 76
    The relation between methods and MARC fields are the following::

77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
        +-----------------------+---------+----------+
        |                       |  CDS    | INSPIREP |
        +-----------------------+---------+----------+
        | authors               | 700 a   |          |
        | collaboration         | 710 g   |          |
        | first author          | 100 a   |          |
        | institutes            | 700 u   |          |
        | paper editor          | 773 p   |          |
        | paper pages           | 773 c   |          |
        | paper reference       | 773 o   |          |
        | paper URL             | 8564 u  |          |
        | paper volume          | 773 v   |          |
        | paper year            | 773 y   |          |
        | preprint number       | 037 a   |          |
        | report number         | 088 a   | 037a     |
        | submitted             | 269 c   |          |
        | title                 | 245 a   |          |
        | year                  | 260 c   |          |
        +-----------------------+---------+----------+
96 97

    """
98 99
    def __init__(self, *args):
        """Build the record and convert its author fields.

        Args:
            *args: forwarded unchanged to ``Record.__init__``.

        """
        # format currently applied to the author names;
        # it is updated by reformat_authors
        self._last_fmt_author = "Last, First"

        Record.__init__(self, *args)

        # replace the content of the fields 100 and 700 by DataFrames
        self._process_authors()

    def _process_authors(self):
        """Convert authors information into DataFrame:

            * Keep the subfield "a", "u" and "e" (phd thesis)
            * Convert list of affiliation in string separated by "|"

        Authors and their affiliations are defined in the fields 100 and 700.
        The method deals with cases where:

            * the first author is defined in 100 but it is not in 700
            * first author is not defined in 100 but in 700
            * thesis in which 700 contains names of director
            * a field has no usable author name (no "a" subfield or no
              name of the form "Last, First"); it is then treated as
              not defined

        Authors and their affiliations are stored in DataFrame with the
        following structure:

            +------------+---------------------------+
            | column     |                           |
            +------------+---------------------------+
            | a          | author name (Last, First) |
            | u          | affiliation(s)            |
            | first_name | first name                |
            | last_name  | family name               |
            | fmt_name   | formated name             |
            +------------+---------------------------+

        At the end, the entries "100" and "700" of the record are both
        replaced by DataFrames. When no author is found, they contain a
        single row of empty strings.

        """
        columns4names = ["last_name", "first_name"]

        # ....................................................................
        #
        # Instantiate DataFrame for field 100 and 700
        #
        frames = {"100": None, "700": None}
        for key in ("100", "700"):

            if key not in self:
                continue

            data = self[key]
            data = (data if isinstance(data, list) else [data])

            df = DataFrame(data)
            columns = df.columns

            # protection -- the field does not contain any author name
            if "a" not in columns:
                continue

            # keep columns:
            #    - "a": author name
            #    - "e": phd director (equal to "dir.")
            #    - "u": affiliation(s)
            df = df.drop(columns.difference(["a", "e", "u"]), axis="columns")

            # add columns first_name, last_name and fmt_name

            # protection -- split create 1, 2 and more than 2 columns
            # former append when the author name is 'ATLAS collaboration'
            df1 = df.a.str.split(",", expand=True)
            if df1.shape[1] < 2:
                continue

            df[columns4names] = df1[[0, 1]]
            df["fmt_name"] = df.a

            df.first_name = df.first_name.str.strip()
            df.last_name = df.last_name.str.strip()

            # protection -- affiliation not defined
            if "u" not in columns:
                df["u"] = ""

            # protection -- missing affiliation
            df.u = df.u.fillna("")

            # convert list of affiliation to string separated by |
            df.u = df.u.apply(to_str)

            frames[key] = df

        # alias
        d100, d700 = frames["100"], frames["700"]

        # ....................................................................
        #
        # Protection -- more than one first author
        #
        #     treat the case with duplicate author name
        #     by building the affiliation string
        #
        if d100 is not None and len(d100) > 1:

            # group on a scalar key so that the group name is the author
            # name itself, not a one-element tuple
            grouped = d100.groupby("a", sort=False)

            if len(grouped) == 1:
                name, group = next(iter(grouped))

                affiliations = \
                    [el for el in group.u if el not in ("", NaN, None)]

                # first and last names are taken from the columns already
                # computed: splitting the name again would fail when it
                # contains more than one comma
                merged = {"a": [name],
                          "first_name": [group.first_name.iloc[0]],
                          "fmt_name": [name],
                          "last_name": [group.last_name.iloc[0]],
                          "u": ["|".join(affiliations)]}

                d100 = DataFrame(merged)

        # NOTE
        # The case with more than one first author is rare
        # It will be detect by the CheckAndFix procedure when it is
        # not fixed by the above protection

        # ....................................................................
        #
        # the author are spread over the 100 and 700 field.
        # deal with cases where the first author is defined in 100
        # but not in 700, first author is defined in 100 and in 700
        # or no author in 100
        #
        if d100 is not None and d700 is not None:
            # the first author is missing in 700: put it in front
            if len(d100) == 1 and d100.a.iloc[0] != d700.a.iloc[0]:
                d700 = concat([d100, d700], ignore_index=True)

        elif d100 is None and d700 is not None:
            d100 = DataFrame(d700.iloc[0]).transpose()

        elif d700 is None and d100 is not None:
            # both fields share the same DataFrame
            d700 = d100

        else:
            d100 = d700 = DataFrame({
                "a": [""],
                "first_name": [""],
                "fmt_name": [""],
                "last_name": [""],
                "u": [""]})

        # ....................................................................
        #
        # Update
        #
        self["100"] = d100
        self["700"] = d700
248

249
    def authors(self, sep=", ", sort=False):
        """The author(s) signing the publication, as a single string.

        Args:
            sep (unicode):
                string separating author names. The default is the comma.
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            unicode:
                * Author names, as returned by ``authors_as_list``,
                  separated by the ``sep`` argument.
                * The string is empty when there is no authors.

        """
        names = self.authors_as_list(sort=sort)
        joined = sep.join(names)

        return joined
267

268
    def authors_as_list(self, sort=False):
        """The list of author(s) signing the publication.

        Names are the formatted names (column ``fmt_name``) of the
        DataFrame stored in the field 700.

        Args:
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            list: the list is empty when authors are not defined.

        """
        df = self["700"]

        if sort:
            names = df.sort_values(by="last_name").fmt_name
        else:
            names = df.fmt_name.sort_index()

        li = names.tolist()

        # a single empty name means that authors are not defined
        if li == [""]:
            return []

        return li
295 296 297 298

    def collaboration(self):
        """The collaboration(s) signing the publication.

        The names are read in the subfield "g" of the field 710.

        Returns:
            unicode:
                * names of collaboration are separated by a comma.
                * The filter CLEAN_COLLABORATION is applied.

        """
        names = self._get("710", "g", force_list=True)
        joined = ", ".join(names)

        return CLEAN_COLLABORATION(joined)
307

308
    def find_affiliation(self, pattern):
        """Find affiliation matching the regular expression *pattern*.

        The search is performed on the column ``u`` of the field 700.

        Args:
            pattern (unicode):
                regular expression defining the affiliation keys.
                It has to be built for an exact match, namely containing
                start and end of string. This is required to separate
                `Ecole Polytechnique` from `Ecole Polytechnique, Lausanne`.

        Returns:
            unicode:
                - the affiliation or the first one when several are found.
                - empty string when nothing is found.

        """
        affiliations = self["700"].u
        found = affiliations[affiliations.str.contains(pattern)].unique()

        if len(found) > 0:
            return found[0]

        return ""
329

330
    def find_authors(self, pattern, sep=", ", sort=False):
        """Find authors containing the regular expression *pattern*.
        The search is performed on the formatted name.

        Args:
            pattern (unicode):
                regular expression defining the author name(s).
            sep (unicode):
                string separating author names. The default is the comma.
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            unicode:
                * Author names are separated by ``sep`` argument.
                * The string is empty when nothing is found.

        """
        df = self["700"]
        matched = df[df.fmt_name.str.contains(pattern)]

        if sort:
            names = matched.sort_values(by="last_name").fmt_name
        else:
            names = matched.fmt_name.sort_index()

        if len(names) == 0:
            return ""

        return sep.join(names)
364

365
    def find_authors_by_affiliation(self, pattern, sep=", ", sort=False):
        """Find authors belonging to a given institute(s) defined by a regular
        expression.

        The search is performed on the column ``u`` of the field 700.

        Args:
            pattern (unicode):
                regular expression defining the affiliation keys
                for the institute(s).
            sep (unicode):
                string separating author names. The default is the comma.
            sort (bool):
                sort authors by family name when true otherwise use the
                order of authors at the creation of the record

        Returns:
            unicode:
                * Author names are separated by the ``sep`` argument.
                * Authors are sorted according to their family name
                  only when ``sort`` is true.
                * Empty string when authors are not found.

        """
        df = self["700"]
        matched = df[df.u.str.contains(pattern)]

        if sort:
            names = matched.sort_values(by="last_name").fmt_name
        else:
            names = matched.fmt_name.sort_index()

        if len(names) > 0:
            return sep.join(names)

        return ""
400 401 402 403

    def first_author(self):
        """The name of the first author.

        It is the formatted name found in the first row of the field 700.

        Returns:
            unicode:
                - Empty string when the first author is not defined.

        """
        names = self["700"].fmt_name
        return names.iloc[0]
410 411 412 413

    def first_author_institutes(self):
        """The institute(s) associated to the first author.

        Note:
            Search is performed via the affiliation defined by the "u" key
            of the author field.

        Returns:
            unicode:
                - names are separated by ``|``.
                - The string is empty when institutes are not defined.

        """
        val = self["700"].u.iloc[0]

        # NaN is the only value which differs from itself.
        # The comparison ``val == NaN`` is always false and cannot be used.
        if val is None or val != val:
            return ""

        return val
426 427 428 429

    def institutes(self):
        """The list of institute signing the publication.

        Note:
            Name of institute are given by the affiliation defined by
            the "u" key of the author field.

        Returns:
            list: the list is sorted in alphabetic order, without duplicate.

        """
        # expand multi-affiliation (one per column)
        expanded = self["700"].u.str.split("|", expand=True)

        # merge all columns into a single one,
        # ignoring the cells left empty by the expansion
        columns = [expanded[name].dropna() for name in expanded.columns]
        merged = concat(columns, ignore_index=True)

        # sort and remove duplicate entries
        return merged.sort_values().unique().tolist()
449

450
    def is_affiliations(self):
        """``True`` when affiliations are defined for authors.

        Note:
            This is a fast algorithm checking that the ``u`` field exists.
            To check that the affiliation is defined for all authors,
            uses the method :func:`is_affiliation_for_all`.

        Returns:
            bool: false when the column ``u`` is missing or when the
            field 700 contains a single row with an empty affiliation.

        """
        df = self["700"]

        if "u" not in df.columns:
            return False

        # a single row with an empty affiliation means "not defined"
        undefined = len(df) == 1 and df.u.iloc[0] == ""

        return not undefined

473
    def is_affiliation_for_all(self):
        """``True`` when affiliation are defined for all authors.

        An affiliation is not defined when it is an empty string or NaN.

        Return:
            bool:

        """
        affiliations = self["700"].u
        missing = affiliations.isin(["", NaN])

        return not missing.any()
484

485
    def is_authors(self):
        """``True`` when authors are defined.

        Returns:
            bool: false when the column ``a`` is missing or when the
            field 700 contains a single row with an empty author name.

        """
        df = self["700"]

        if "a" not in df.columns:
            return False

        # a single row with an empty name means "not defined"
        undefined = len(df) == 1 and df.a.iloc[0] == ""

        return not undefined
501

502
    def is_published(self):
        """``True`` if the record is published.

        The record is published when one entry of the field 773 contains
        either the complete set of keys "p", "v", "y" and "c" or
        a subfield "o" matching one of the DECODE_REF patterns.

        Returns:
            bool:

        """
        if "773" not in self:
            return False

        # record can contains erratum
        for di in as_list(self["773"]):

            # the reference field is complete and contains, at least,
            # the keys "p", "v", "y" and "c"
            if PAPER_REFERENCE_KEYS.issubset(set(di.keys())):
                return True

            # paper reference may be incomplete or even wrong
            # the recovery procedure will use the 773o
            # check that 773o contains the paper reference:
            #    Eur. Phys. J. C (2014) 74:2883
            #    Phys. Rev. Lett. 113, 032001 (2014)
            if "o" not in di:
                continue

            value = di["o"]
            if any(reg.match(value) for reg in DECODE_REF):
                return True

        return False

    def is_with_erratum(self):
        """``True`` when the record contains erratum data.

        Returns:
            bool

        """
        if "773" not in self:
            return False

        # record with erratum contains a list of editor
        return isinstance(self["773"], list)
542 543

    def paper_editor(self):
        """The abbreviated version of the review, *e.g* Phys Lett B.

        The value is read in the subfield "p" of the field 773.

        Returns:
            unicode or list:
                * A list when there are erratum.
                * Empty string when not defined.

        """
        editor = self._get("773", "p")
        return editor
553 554 555 556

    def paper_pages(self):
        """The page number / range when the record is published in a review.

        The value is read in the subfield "c" of the field 773.

        Returns:
            unicode or list:
                * The format is "45-67" or "234".
                * A list when there are erratum.
                * Empty string when not defined.

        """
        pages = self._get("773", "c")
        return pages
565 566 567 568

    def paper_reference(self):
        """The full reference for a publication published in a review.

        The reference is built from the subfields "p", "v", "y" and "c"
        of the field 773, in that order, skipping the missing ones.

        Returns:
            unicode or list:
                * The format is "Phys Lett B 456 2010 5-6".
                * The string is empty when the publication is not
                  published in a review.

        """
        if "773" not in self:
            return ""

        field = self["773"]
        parts = [field[k] for k in ("p", "v", "y", "c") if k in field]

        return " ".join(parts)
585 586 587 588

    def paper_url(self):
        """The URL of the preprint.

        The URL is searched in the entries of the field 8564, but only
        when that field is a list.

        Note:
            Many others URL exists mainly those related to open access.

        Returns:
            unicode: the string is empty when no URLs are found.

        """
        if "8564" not in self or not isinstance(self["8564"], list):
            return ""

        # NOTE(review): when there is no preprint number, pdf is ".pdf"
        # and the inspirehep branch accepts any URL ending with ".pdf"
        # -- confirm that this is intended.
        pdf = "%s.pdf" % self.preprint_number()

        for el in self["8564"]:

            # protection -- entry without URL
            if "u" not in el:
                continue

            url = el["u"]

            # protection see http://cds.cern.ch/record/2014733
            # the entry contains a list of URLs: rebuild the arXiv one
            # from the preprint number and never return the list itself
            if isinstance(url, list):
                m = REG_ARXIV_NUMBER.search(pdf)
                if m:
                    return "%s%s" % (ARXIV_PDF, m.group())
                continue

            # cds.cern.ch
            if "y" in el:
                if el["y"] == "Preprint":
                    return url

            # inspirehep.net
            elif url.endswith(pdf):
                return url

        # nothing found in the list
        return ""
618 619 620 621

    def paper_volume(self):
        """The volume number when the record is published in a review.

        The value is read in the subfield "v" of the field 773.

        Returns:
            unicode or list:
                - A list when there are erratum.
                - Empty string when nothing is found.

        """
        volume = self._get("773", "v")
        return volume
629 630 631 632

    def paper_year(self):
        """The year of the publication.

        The value is read in the subfield "y" of the field 773.

        Returns:
            unicode or list:
                - A list when there are erratum.
                - Empty string if the year is not defined.

        """
        rep = self._get("773", "y")

        # protection
        # in record http://cds.cern.ch:record/1951625 the entry 773y
        # is duplicate but there is no erratum
        duplicated = isinstance(rep, list) \
            and not isinstance(self["773"], list)

        if duplicated:
            years = list(set(rep))
            rep = (years[0] if len(years) == 1 else years)

        return rep

    def preprint_number(self):
        """The ArXiv preprint number.

        It is the first value containing ARXIV found in the subfield "a"
        of the field 037 and then of the field 088.

        Returns:
            unicode: empty string when it is not defined.

        """
        # for both CDS and INSPIREHEP preprint data in 037 a
        # for CDS preprint information are also stored in 088 a
        # the generator is lazy: 088 is read only when 037 gives nothing
        candidates = (val
                      for key in ("037", "088")
                      for val in self._get(key, "a", force_list=True)
                      if ARXIV in val)

        return next(candidates, "")
665

666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689
    def reformat_authors(self, fmt="Last, First"):
        """Reformat names of authors.

        The default formatting for cds/invenio record is ``Last, First``.
        The column ``fmt_name`` of the DataFrames stored in the fields
        100 and 700 is updated in place. Nothing is done when ``fmt`` is
        the format already applied.

        Args:
            fmt (str):
                define the new format for author names.
                Possible values are "First, Last", "F. Last", "Last",
                "Last, First" and "Last F."

        Raises:
            RecordException: if fmt is not valid.

        """
        if fmt not in AUTHOR_FORMATS:
            raise RecordException(MSG_INVALID_FMT)

        if fmt == self._last_fmt_author:
            return

        self._last_fmt_author = fmt

        # alias
        d100, d700 = self["100"], self["700"]

        # the two fields may share the same DataFrame,
        # in that case it has to be processed only once
        frames = ([d100] if d100 is d700 else [d100, d700])

        with_initial = fmt in ("F. Last", "Last F.")

        for df in frames:

            # ................................................................
            #
            # Compute initial for the first name
            #
            if with_initial:
                dfm = (df.first_name.str.extract(REG_INITIAL, expand=True)
                       .fillna(""))

                df["initial"] = dfm.apply(
                    lambda x: to_initial(x[0], x[1], x[2]), axis="columns")

            # ................................................................
            #
            # Format
            #
            if fmt == "Last, First":
                df["fmt_name"] = df.a

            elif fmt == "First, Last":
                # same separator for the fields 100 and 700
                df["fmt_name"] = df.first_name + ", " + df.last_name

            elif fmt == "F. Last":
                df["fmt_name"] = df.initial + " " + df.last_name

            elif fmt == "Last":
                df["fmt_name"] = df.last_name

            elif fmt == "Last F.":
                df["fmt_name"] = df.last_name + " " + df.initial

            # ................................................................
            #
            # Clean initial column
            # in place, in order to modify the DataFrame stored in the record
            #
            if with_initial:
                df.drop("initial", axis="columns", inplace=True)

736 737 738
    def report_number(self):
        """The report number(s) associated to the publication.

        Values starting with ARXIV, namely the preprint number,
        are ignored.

        Returns:
            unicode:
                - Numbers are separated by a comma
                - Number are sorted in alphabetic order.
                - Empty string when not defined.

        """
        numbers = []

        # cds.cern.ch
        # report number can be in 37a, 88a and 88 9
        # entry can be the preprint number arXiv:xxx
        if self.host().startswith("cds"):
            numbers = [elt
                       for elt in self._get("088", "a", force_list=True)
                       if not elt.startswith(ARXIV)]

            # if empty have a look to "088" "9"
            # logic to avoid version number in 88/9
            # 88/a = LHCB-PAPER-2015-016 while 88/9 = LHCB-PAPER-2015-016-003
            if not numbers:
                numbers = [elt
                           for elt in self._get("088", "9", force_list=True)
                           if not elt.startswith(ARXIV)]

        # inspirehep.net / cds.cern.ch -- example of MARC structure:
        # 037__ $$aLHCB-PAPER-2014-047
        # 037__ $$aCERN-PH-EP-2014-221
        # 037__ $$9arXiv$$aarXiv:1410.0149$$chep-ex
        if "037" in self:
            field = self["037"]

            # the field is either a single entry or a list of entries
            if isinstance(field, dict):
                entries = [field]
            elif isinstance(field, list):
                entries = field
            else:
                entries = []

            for di in entries:
                # entry describing the arXiv preprint
                if "9" in di and di["9"] == ARXIV:
                    continue

                if "a" in di and not di["a"].startswith(ARXIV):
                    numbers.append(di["a"])

        numbers.sort()
        return ", ".join(numbers)
788 789 790 791

    def submitted(self):
        """The date of submission.

        The value is read in the subfield "c" of the field 269,
        asking ``_get`` for a list (``force_list=True``).

        Returns:
            list:
                * The format of a date is "YYYY-MM" or "YYYY-MM-DD"
                * Empty list when not defined.

        """
        dates = self._get("269", "c", force_list=True)
        return dates
800 801 802 803

    def title(self):
        """The title of the publication.

        The value is read in the subfield "a" of the field 245.

        Returns:
            unicode or list:
                * A list when there are erratum.
                * Empty string when not defined.
                * The filter CLEAN_SPACES is applied.

        """
        val = self._get("245", "a")

        # several titles: clean each of them and return the cleaned list
        # (a new list is built, the one given by _get is left untouched)
        if isinstance(val, list):
            return [CLEAN_SPACES(el) for el in val]

        if isinstance(val, (unicode, str)):
            return CLEAN_SPACES(val)

        return val

    def year(self):
        """The year of the publication.

        The value is read in the subfield "c" of the field 260.
        When several values are found, they are sorted and the first one
        is used.

        Returns:
            unicode:
                * The part of the date captured by REG_YEAR, or the raw
                  value when the pattern does not match.
                * Empty string when it is not defined.

        """
        val = self._get("260", "c")

        if isinstance(val, list):
            if not val:
                val = ""
            else:
                val.sort()
                val = val[0]

        if not val:
            return val

        # several form are possible 2014, 2014-12 or 2014-12-31
        match = REG_YEAR.search(val)

        return (match.group(1) if match else val)