articles.py 12.7 KB
Newer Older
1 2 3 4 5 6 7
# -*- coding: utf-8 -*-
""" harvest_tools.articles

"""
import traceback


8
from automaton import Automaton
9
from base import (learn_my_authors,
10 11 12 13
                  MSG_CRASH,
                  MSG_FIX_ORIGIN,
                  MSG_IN_DB,
                  MSG_LOAD)
14
from checkandfix import CheckException
15 16 17
from plugin_dbui import get_id, UNDEF_ID


18 19
MSG_NO_EDITOR = "Reject article is not published"
MSG_TRANSFORM_PREPRINT = "Transform the preprint into an article"
20 21


22
class Articles(Automaton):
23
    """Automaton for articles.
24 25 26 27

    """
    def __init__(self, *args, **kwargs):
        """Initialise the automaton and cache the preprint category id.

        All positional and keyword arguments are forwarded unchanged to
        ``Automaton.__init__``.

        """
        Automaton.__init__(self, *args, **kwargs)

        # identifier of the category whose code is "PRE" (the preprints)
        categories = self.db.categories
        self.id_preprint = get_id(categories, code="PRE")

33
    def check_record(self, record):
        """Validate an article record and repair its non-conformities.

        The generic checks of the base class run first, then the
        article-specific ones. A record which is not published is rejected.

        Args:
            record (RecordPubli): the MARC12 record describing the article.

        Returns:
            bool: ``True`` when every check passed. ``False`` when the
                base class check fails, when the article is not published
                or when one of the checks raises an exception (the
                rejection is then written in the current log entry).

        """
        base_ok = Automaton.check_record(self, record)
        if not base_ok:
            return False

        if self.dbg:
            print("check article record")

        try:
            self.check.clean_erratum(record)

            # an article must have been published
            if not record.is_published():
                self.logs[-1].reject(MSG_NO_EDITOR, record=record)
                return False

            # publication reference
            self.check.format_editor(record)
            self.check.publisher(record)
            self.check.paper_reference(record)

            # dates
            self.check.submitted(record)
            self.check.year(record)

            # authors
            self.check.format_authors(record, fmt="F. Last")
            self.check.get_my_authors(record, sort=True)

        except CheckException as err:
            # a known non-conformity which can not be corrected
            self.logs[-1].reject(err, record=record)
            return False

        except Exception as err:
            # unexpected failure: log it and dump the traceback
            self.logs[-1].reject(MSG_CRASH % err,
                                 record=record,
                                 translate=False)
            print(traceback.format_exc())
            return False

        else:
            return True

78
    def get_record_by_fields(self,
                             oai_url,
                             year,
                             id_publisher=None,
                             my_authors=None,
                             pages=None,
                             publication_url=None,
                             preprint_number=None,
                             title=None,
                             volume=None):
        """Get the article matching the field values defined
        in the keyword arguments.

        Note:
            This method is required to deal with an article entered by hand
            and found later by the harvester.

        The search is done in two steps:

        1. among the publications of the project / team with the same
           publisher, pages, volume and year. When the match has an empty
           origin field, it is set to **oai_url**.
        2. among the preprints of the project / team with the same preprint
           number. A matching preprint is transformed into an article.

        Args:
            oai_url (unicode): the oai_url, *e.g*
                ``http://cds.cern.ch/record/123456``. The origin field
                of the existing database record is updated to **oai_url**
                when a match is found and its origin is empty.

            year (unicode): the year of the publication. It is used
                by the search algorithm and by the logger.

        Keyword Args:
            id_publisher (int): identifier of the publisher in the database.
            my_authors (unicode): authors of my institute separated by a comma.
            pages (unicode):  the page reference.
            publication_url (unicode): the URL of the publications
            preprint_number (unicode): the preprint number
            title (unicode): the title of the publication.
            volume (unicode): the volume reference.

        Returns:
            tuple: ``(id, status)`` which contains the ``id`` of the record.
            It is equal to ``None`` when nothing is found.
            The ``status`` is equal to one when the existing record was
            modified (origin field fixed or preprint transformed into
            an article), zero otherwise. In dry run mode the status is
            still one although nothing is written in the database.

        """
        if self.dbg:
            print("get existing article by fields")

        # alias
        db = self.db
        id_project = self.id_project
        id_team = self.id_team
        logs = self.logs

        # check against published articles
        rec_id = get_id(db.publications,
                        id_projects=id_project,
                        id_publishers=id_publisher,
                        id_teams=id_team,
                        pages=pages,
                        volume=volume,
                        year=year)

        if rec_id:
            # load the row only when there is a match
            publication = db.publications[rec_id]

            # fix origin field
            if not publication.origin:
                if not self.dry_run:
                    # write through the table: rebinding the local name
                    # would leave the database untouched
                    db.publications[rec_id] = dict(origin=oai_url)

                logs[-1].modify(MSG_FIX_ORIGIN, year)
                return (rec_id, 1)

            logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # check against published preprint
        # a preprint is identified by its category, the one with code PRE
        rec_id = get_id(db.publications,
                        id_categories=self.id_preprint,
                        id_projects=id_project,
                        id_teams=id_team,
                        preprint=preprint_number)

        if not rec_id:
            return (None, 0)

        # transform an existing preprint into article
        # institute authors can be missing in the preprint
        # change also the status
        logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)

        if not self.dry_run:
            db.publications[rec_id] = dict(authors_institute=my_authors,
                                           id_categories=self.id_category,
                                           id_publishers=id_publisher,
                                           id_status=UNDEF_ID,
                                           pages=pages,
                                           publication_url=publication_url,
                                           title=title,
                                           volume=volume,
                                           year=year)

        return (rec_id, 1)

180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
    def get_record_by_origin(self,
                             primary_oai_url,
                             year,
                             id_publisher=None,
                             my_authors=None,
                             oai_url=None,
                             pages=None,
                             publication_url=None,
                             title=None,
                             volume=None):
        """Find an existing record whose origin field contains
        *primary_oai_url*.

        Note:
            This method is required to transform a preprint into an article.
            All the keyword arguments are needed by the transformation.

        Args:
            primary_oai_url (unicode): the *primary* OAI identifier of the
                record. It is used by the search algorithm.

            year (unicode): the year of publication which is used
                by the logger and written in the transformed record.

        Keyword Args:
            id_publisher (int): identifier of the publisher in the database.
            my_authors (unicode): authors of my institute separated by a comma.
            oai_url (unicode): the full oai_url(s) of the article.
            pages (unicode):  the page reference.
            publication_url (unicode): the URL of the publications
            title (unicode): the title of the publication.
            volume (unicode): the volume reference.

        Returns:
            tuple: ``(id, status)`` which contains the ``id`` of the record.
            It is equal to ``None`` when nothing is found.
            The ``status`` is equal to one when the existing preprint was
            modified into article, zero otherwise

        """
        if self.dbg:
            print("check existing article by origin")

        db = self.db
        table = db.publications

        # search by origin
        matches = db(table.origin.contains(primary_oai_url))
        if matches.count() == 0:
            return (None, 0)

        # at least one record is found, keep the first one
        rec_id = matches.select(table.id).first().id
        found = table[rec_id]

        # already something else than a preprint: nothing to do
        if found.id_categories != self.id_preprint:
            self.logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # transform the preprint into an article
        self.logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)
        if self.dry_run:
            return (rec_id, 1)

        # NOTE(review): the value is stored under ``oai_url`` here while
        # insert_record stores it under ``origin`` -- confirm against
        # the publications table definition.
        article_values = dict(authors_institute=my_authors,
                              id_categories=self.id_category,
                              id_publishers=id_publisher,
                              id_status=UNDEF_ID,
                              oai_url=oai_url,
                              pages=pages,
                              publication_url=publication_url,
                              title=title,
                              volume=volume,
                              year=year)
        table[rec_id] = article_values

        return (rec_id, 1)

259 260
    def insert_record(self, record):
        """Insert an article in the database.

        An existing record is searched first by origin, then by fields.
        A matching preprint is transformed into an article by these
        searches, in which case nothing is inserted.

        Note:
            The method assumes that erratum are removed.

        Args:
            record (RecordPubli): the MARC12 record describing the article.

        Returns:
            int: one when the record is inserted / updated in the database,
                zero otherwise.

        """
        db = self.db

        # values extracted from the record
        editor = record.paper_editor()
        first_author = record.first_author()
        my_authors = record.my_authors
        oai_url = record.oai_url()
        pages = record.paper_pages()
        preprint_number = record.preprint_number()
        publication_url = record.paper_url()
        submitted = record.submitted()[0]
        title = record.title()
        volume = record.paper_volume()
        year = record.paper_year()

        # get the collaboration / publisher identifiers
        id_collaboration = self.search_collaboration(record.collaboration())
        id_publisher = self.search_publisher(editor)

        # get already published articles or preprint
        # A preprint is transformed into an article.
        #
        # NOTE: The check is performed by origin then by fields.
        # The latter is useful to cover the case where the record
        # is entered by hand or by another harvester.
        #
        shared = dict(id_publisher=id_publisher,
                      my_authors=my_authors,
                      pages=pages,
                      publication_url=publication_url,
                      title=title,
                      volume=volume)

        rec_id, status = self.get_record_by_origin(record.primary_oai_url(),
                                                   year,
                                                   oai_url=oai_url,
                                                   **shared)
        if rec_id:
            return status

        rec_id, status = self.get_record_by_fields(oai_url,
                                                   year,
                                                   preprint_number=preprint_number,
                                                   **shared)
        if rec_id:
            return status

        # finally insert a new article in the database
        # try to improve the rescue list for CPPM authors
        if self.dry_run:
            # nothing is written but the record counts as loaded
            inserted = 1

        else:
            values = dict(authors=record.authors(),
                          authors_institute=my_authors,
                          first_author=first_author,
                          id_categories=self.id_category,
                          id_collaborations=id_collaboration,
                          id_projects=self.id_project,
                          id_publishers=id_publisher,
                          id_status=UNDEF_ID,
                          id_teams=self.id_team,
                          origin=oai_url,
                          pages=pages,
                          preprint=preprint_number,
                          publication_url=publication_url,
                          submitted=submitted,
                          title=title,
                          volume=volume,
                          year=year)

            inserted = self._insert_in_db(log_year=year, **values)
            if inserted == 1:
                learn_my_authors(db,
                                 authors=record.my_authors,
                                 id_project=self.id_project,
                                 id_team=self.id_team,
                                 year=year)

        if inserted != 1:
            return 0

        self.logs[-1].load(MSG_LOAD, year)
        return 1