articles.py 12 KB
Newer Older
1 2 3 4 5 6 7
# -*- coding: utf-8 -*-
""" harvest_tools.articles

"""
import traceback


8
from automaton import Automaton
9 10
from base import (family_name_fr,
                  format_author_fr,
11
                  learn_my_authors,
12 13 14 15
                  MSG_CRASH,
                  MSG_FIX_ORIGIN,
                  MSG_IN_DB,
                  MSG_LOAD)
16 17 18 19
from invenio_tools import CheckException
from plugin_dbui import get_id, UNDEF_ID


20 21
# log message used to reject a record which is not published
MSG_NO_EDITOR = "Reject article is not published"
# log message used when an existing preprint is turned into an article
MSG_TRANSFORM_PREPRINT = "Transform the preprint into an article"
22 23


24
class Articles(Automaton):
    """Automaton for articles.

    It checks the harvested records, finds the matching publication
    already in the database (article or preprint) and eventually
    inserts a new article.

    """
    def __init__(self, *args, **kwargs):

        Automaton.__init__(self, *args, **kwargs)

        # identifier of the preprint category (code "PRE")
        self.id_preprint = get_id(self.db.categories, code="PRE")

    def _preprint_to_article(self,
                             rec_id,
                             id_publisher=None,
                             my_authors=None,
                             pages=None,
                             publication_url=None,
                             title=None,
                             volume=None,
                             year=None):
        """Log and apply the transformation of a preprint into an article.

        The modification is always logged but the database is only
        updated when the automaton is not in dry run mode.

        Args:
            rec_id (int): identifier of the preprint in the database.

        Keyword Args:
            id_publisher (int): identifier of the publisher in the database.
            my_authors (unicode): authors of my institute separated by a comma.
            pages (unicode): the page reference.
            publication_url (unicode): the URL of the publications
            title (unicode): the title of the publication.
            volume (unicode): the volume reference.
            year (unicode): the year of publication.

        """
        self.logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)

        if self.dry_run:
            return

        # institute authors can be missing in the preprint
        # the status is also reset
        self.db.publications[rec_id] = dict(authors_institute=my_authors,
                                            id_categories=self.id_category,
                                            id_publishers=id_publisher,
                                            id_status=UNDEF_ID,
                                            pages=pages,
                                            publication_url=publication_url,
                                            title=title,
                                            volume=volume,
                                            year=year)

    def check_record(self, record):
        """Check the content of the article in order to fix non-conformities.

        The generic checks of the base class are run first, then those
        specific to articles. Any failure is reported in the current log.

        Args:
            record (RecordPubli): the record describing the article.

        Returns:
            bool: ``False`` when a non conformity is found and
                can not be corrected.

        """
        if not Automaton.check_record(self, record):
            return False

        if self.dbg:
            print("check article record")

        try:
            self.check.clean_erratum(record)

            # an article has to be published
            if not record.is_published():
                self.logs[-1].reject(MSG_NO_EDITOR, record=record)
                return False

            self.check.format_editor(record)
            self.check.publisher(record)

            self.check.paper_reference(record)
            self.check.submitted(record)
            self.check.year(record)

            self.check.format_authors(record, format_author_fr)
            self.check.get_my_authors(record, family_name_fr)

        except CheckException as e:
            self.logs[-1].reject(e, record=record)
            return False

        except Exception as e:
            # unexpected failure: reject the record, keep the traceback
            # on the standard output and continue with the next record
            self.logs[-1].reject(MSG_CRASH % e, record=record, translate=False)
            print(traceback.format_exc())
            return False

        return True

    def get_record_by_origin(self,
                             id_publisher=None,
                             my_authors=None,
                             oai_url=None,
                             pages=None,
                             publication_url=None,
                             title=None,
                             volume=None,
                             year=None):
        """Get an existing record using the origin field and its value
        defined in the ``oai_url`` keyword argument.
        The other arguments are used to transform the corresponding preprint
        into an article.

        Keyword Args:
            oai_url (unicode): the OAI identifier of the article.
            id_publisher (int): identifier of the publisher in the database.
            my_authors (unicode): authors of my institute separated by a comma.
            pages (unicode):  the page reference.
            publication_url (unicode): the URL of the publications
            title (unicode): the title of the publication.
            volume (unicode): the volume reference.
            year (unicode): the year of publication.

        Returns:
            tuple: ``(id, status)`` which contains the ``id`` of the record.
            It is equal to ``None`` when nothing is found.
            The ``status`` is equal to one when the existing preprint was
            modified into article, zero otherwise

        """
        if self.dbg:
            print("check existing article by origin")

        db = self.db

        rec_id = get_id(db.publications, origin=oai_url)
        if not rec_id:
            return (None, 0)

        # not a preprint ?
        if db.publications[rec_id].id_categories != self.id_preprint:
            self.logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # transform a preprint into an article
        self._preprint_to_article(rec_id,
                                  id_publisher=id_publisher,
                                  my_authors=my_authors,
                                  pages=pages,
                                  publication_url=publication_url,
                                  title=title,
                                  volume=volume,
                                  year=year)

        return (rec_id, 1)

    def get_record_by_fields(self,
                             oai_url,
                             year,
                             id_publisher=None,
                             my_authors=None,
                             pages=None,
                             publication_url=None,
                             preprint_number=None,
                             title=None,
                             volume=None):
        """Get article matching fields values defined
        in the keyword arguments.

        Note:
            This method is required to deal with an article entered by hand
            and found later by the harvester.

        Args:
            oai_url (unicode): the oai_url, *e.g*
                ``http://cds.cern.ch/record/123456``. The origin field
                of the existing database record is updated to **oai_url**
                when a match is found and its origin is not defined.

            year (unicode): the year of the publication. It is used
                by the search algorithm and by the logger.

        Keyword Args:
            id_publisher (int): identifier of the publisher in the database.
            my_authors (unicode): authors of my institute separated by a comma.
            pages (unicode):  the page reference.
            publication_url (unicode): the URL of the publications
            preprint_number (unicode): the preprint number
            title (unicode): the title of the publication.
            volume (unicode): the volume reference.

        Returns:
            tuple: ``(id, status)`` which contains the ``id`` of the record.
            It is equal to ``None`` when nothing is found.
            The ``status`` is equal to one when the origin of an existing
            article was fixed or when an existing preprint was modified
            into article, zero otherwise

        """
        if self.dbg:
            print("get existing article by fields")

        # alias
        db = self.db
        id_project = self.id_project
        id_team = self.id_team
        logs = self.logs

        # check against published articles
        rec_id = get_id(db.publications,
                        id_projects=id_project,
                        id_publishers=id_publisher,
                        id_teams=id_team,
                        pages=pages,
                        volume=volume,
                        year=year)

        if rec_id:
            # fix origin field
            # the update goes through the table, assigning the dictionary
            # to a local variable would leave the database untouched
            if not db.publications[rec_id].origin:
                if not self.dry_run:
                    db.publications[rec_id] = dict(origin=oai_url)

                logs[-1].modify(MSG_FIX_ORIGIN, year)
                return (rec_id, 1)

            logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # check against published preprint
        # a preprint is identified by its category (code "PRE")
        rec_id = get_id(db.publications,
                        id_categories=self.id_preprint,
                        id_projects=id_project,
                        id_teams=id_team,
                        preprint=preprint_number)

        if not rec_id:
            return (None, 0)

        # transform an existing preprint into article
        self._preprint_to_article(rec_id,
                                  id_publisher=id_publisher,
                                  my_authors=my_authors,
                                  pages=pages,
                                  publication_url=publication_url,
                                  title=title,
                                  volume=volume,
                                  year=year)

        return (rec_id, 1)

    def insert_record(self, record):
        """Insert an article in the database.

        Note:
            The method assumes that erratum are removed.

        Args:
            record (RecordPubli): the record describing the article.

        Returns:
            int: one when the record is inserted / updated in the database,
                zero otherwise.

        """
        db = self.db

        # alias
        editor = record.paper_editor()
        first_author = record.first_author()
        my_authors = record.my_authors
        oai_url = record.oai_url()
        pages = record.paper_pages()
        preprint_number = record.preprint_number()
        publication_url = record.paper_url()
        submitted = record.submitted()[0]
        title = record.title()
        volume = record.paper_volume()
        year = record.paper_year()

        # get the collaboration / publisher identifiers
        id_collaboration = self.search_collaboration(record.collaboration())
        id_publisher = self.search_publisher(editor)

        # get already published articles or preprint
        # A preprint is transformed into an article.
        #
        # NOTE: The check is performed by origin then by fields.
        # The latter is useful to cover the case where the record
        # is entered by hand or by another harvester.
        #
        fields = dict(id_publisher=id_publisher,
                      my_authors=my_authors,
                      oai_url=oai_url,
                      pages=pages,
                      publication_url=publication_url,
                      title=title,
                      volume=volume,
                      year=year)

        rec_id, status = self.get_record_by_origin(**fields)
        if rec_id:
            return status

        fields = dict(id_publisher=id_publisher,
                      my_authors=my_authors,
                      pages=pages,
                      publication_url=publication_url,
                      preprint_number=preprint_number,
                      title=title,
                      volume=volume)

        rec_id, status = self.get_record_by_fields(oai_url, year, **fields)
        if rec_id:
            return status

        # eventually insert a new article in the database
        # try to improve the rescue list for the authors of my institute
        #
        # NOTE: in dry run mode nothing is written but the record
        # is reported as loaded
        ret = 1
        if not self.dry_run:

            fields = dict(authors=record.authors(),
                          authors_institute=my_authors,
                          first_author=first_author,
                          id_categories=self.id_category,
                          id_collaborations=id_collaboration,
                          id_projects=self.id_project,
                          id_publishers=id_publisher,
                          id_status=UNDEF_ID,
                          id_teams=self.id_team,
                          origin=oai_url,
                          pages=pages,
                          preprint=preprint_number,
                          publication_url=publication_url,
                          submitted=submitted,
                          title=title,
                          volume=volume,
                          year=year)

            ret = self._insert_in_db(log_year=year, **fields)
            if ret == 1:
                learn_my_authors(db,
                                 authors=my_authors,
                                 id_project=self.id_project,
                                 id_team=self.id_team,
                                 year=year)

        if ret == 1:
            self.logs[-1].load(MSG_LOAD, year)
            return 1

        return 0