""" harvest_tools.articles

"""
import traceback


from .automaton import Automaton
from .base import (learn_my_authors,
                   MSG_CRASH,
                   MSG_FIX_ORIGIN,
                   MSG_IN_DB,
                   MSG_LOAD)
from .checkandfix import CheckException
from plugin_dbui import get_id, UNDEF_ID


# log message used by Articles.check_record when the record is rejected
# because it is not published
MSG_NO_EDITOR = "Reject article is not published"
# log message used when an existing preprint is transformed into an article
MSG_TRANSFORM_PREPRINT = "Transform the preprint into an article"


class Articles(Automaton):
    """Automaton for articles.

    A record is rejected when it is not published. A published article is
    searched in the database first by its origin and then by its fields.
    An existing preprint is transformed into an article, otherwise a new
    record is inserted.

    Note:
        The attributes ``db``, ``dbg``, ``dry_run``, ``logs``, ``check``,
        ``id_category``, ``id_project`` and ``id_team`` are used by the
        methods but are not defined in this class. They are presumably
        provided by ``Automaton``.

    """
    def __init__(self, *args, **kwargs):
        """Initialise the base automaton and cache the identifier of
        the preprint category.

        Args:
            *args: positional arguments forwarded to ``Automaton``.
            **kwargs: keyword arguments forwarded to ``Automaton``.

        """
        Automaton.__init__(self, *args, **kwargs)

        # the preprint categories
        self.id_preprint = get_id(self.db.categories, code="PRE")

    def check_record(self, record):
        """Check the content of the article in order to fix non-conformities.

        Args:
            record (RecordPubli):
                the record describing the article.

        Returns:
            bool:
                ``False`` when a non conformity is found and
                can not be corrected.

        """
        if not Automaton.check_record(self, record):
            return False

        if self.dbg:
            print("check article record")

        try:

            # an article has to be published
            if not record.is_published():
                self.logs[-1].reject(MSG_NO_EDITOR, record=record)
                return False

            self.check.format_editor(record)
            self.check.publisher(record)

            self.check.paper_reference(record)
            self.check.submitted(record)

            self.check.format_authors(record, fmt="F. Last")
            self.check.get_my_authors(record, sort=True)

        except CheckException as e:
            # a check failed and the problem could not be fixed
            self.logs[-1].reject(e, record=record)
            return False

        except Exception as e:
            # unexpected failure: reject the record and keep harvesting,
            # the traceback is printed to help the diagnostic
            self.logs[-1].reject(MSG_CRASH % e, record=record, translate=False)
            print(traceback.format_exc())
            return False

        return True

    def get_record_by_fields(self,
                             oai_url,
                             year,
                             id_publisher=None,
                             my_authors=None,
                             pages=None,
                             publication_url=None,
                             preprint_number=None,
                             title=None,
                             volume=None):
        """Get article matching fields values defined
        in the keyword arguments.

        Note:
            This method is required to deal with an article entered by hand and
            found later by the harvester.

        Args:
            oai_url (unicode):
                the oai_url, *e.g* ``http://cds.cern.ch/record/123456``.
                The origin field of the existing database record is updated
                to **oai_url** when a match is found and its origin is empty.

            year (unicode):
                the year of the publication. It is used
                by the search algorithm and by the logger.

        Keyword Args:
            id_publisher (int):
                identifier of the publisher in the database.

            my_authors (unicode):
                authors of my institute separated by a comma.

            pages (unicode):
                the page reference.

            publication_url (unicode):
                the URL of the publications

            preprint_number (unicode):
                the preprint number

            title (unicode):
                the title of the publication.

            volume (unicode):
                the volume reference.

        Returns:
            tuple:
                ``(id, status)`` which contains the ``id`` of the record.
                It is equal to ``None`` when nothing is found.
                The ``status`` is equal to one when the existing record was
                modified (origin field fixed or preprint transformed into
                article), zero otherwise

        """
        if self.dbg:
            print("get existing article by fields")

        # alias
        db = self.db
        id_project = self.id_project
        id_team = self.id_team
        logs = self.logs

        # check against published articles
        rec_id = get_id(db.publications,
                        id_projects=id_project,
                        id_publishers=id_publisher,
                        id_teams=id_team,
                        pages=pages,
                        volume=volume,
                        year=year)

        if rec_id:
            publication = db.publications[rec_id]

            # fix origin field
            # the update has to go through the table, rebinding the local
            # name would leave the database untouched
            if not publication.origin:
                if not self.dry_run:
                    db.publications[rec_id] = dict(origin=oai_url)

                logs[-1].modify(MSG_FIX_ORIGIN, year)
                return (rec_id, 1)

            logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # check against published preprint
        # a preprint is identified by its category which has the code PRE
        rec_id = get_id(db.publications,
                        id_categories=self.id_preprint,
                        id_projects=id_project,
                        id_teams=id_team,
                        preprint=preprint_number)

        if not rec_id:
            return (None, 0)

        # transform an existing preprint into article
        # institute authors can be missing in the preprint
        # change also the status
        logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)

        if not self.dry_run:
            db.publications[rec_id] = dict(authors_institute=my_authors,
                                           id_categories=self.id_category,
                                           id_publishers=id_publisher,
                                           id_status=UNDEF_ID,
                                           pages=pages,
                                           publication_url=publication_url,
                                           title=title,
                                           volume=volume,
                                           year=year)

        return (rec_id, 1)

    def get_record_by_origin(self,
                             primary_oai_url,
                             year,
                             id_publisher=None,
                             my_authors=None,
                             oai_url=None,
                             pages=None,
                             publication_url=None,
                             title=None,
                             volume=None):
        """Get an existing record using the origin field and its value
        defined in the *primary_oai_url* argument.

        Note:
            This method is required to transform a preprint into an article.
            All the keyword arguments are needed by the transformation.

        Args:
            primary_oai_url (unicode):
                the *primary* OAI identifier of the
                record. It is used by the search algorithm.

            year (unicode):
                the year of publication which is used
                by the logger.

        Keyword Args:
            id_publisher (int):
                identifier of the publisher in the database.

            my_authors (unicode):
                authors of my institute separated by a comma.

            oai_url (unicode):
                the full oai_url(s) of the article.

            pages (unicode):
                the page reference.

            publication_url (unicode):
                the URL of the publications

            title (unicode):
                the title of the publication.

            volume (unicode):
                the volume reference.

        Returns:
            tuple:
                ``(id, status)`` which contains the ``id`` of the record.
                It is equal to ``None`` when nothing is found.
                The ``status`` is equal to one when the existing preprint was
                modified into article, zero otherwise

        """
        if self.dbg:
            print("check existing article by origin")

        # alias
        db = self.db
        logs = self.logs
        publications = db.publications

        # search by origin
        query = db.publications.origin.contains(primary_oai_url)
        setrows = db(query)
        if setrows.count() == 0:
            return (None, 0)

        # a record is found, the first match is used
        rec_id = setrows.select(publications.id).first().id
        publication = publications[rec_id]

        # not a preprint ?
        if publication.id_categories != self.id_preprint:
            logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # transform a preprint into an article
        logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)
        if not self.dry_run:
            # NOTE(review): the value is written with the key ``oai_url``
            # whereas insert_record stores it in the ``origin`` field.
            # Confirm that the publications table has an ``oai_url`` field.
            db.publications[rec_id] = dict(authors_institute=my_authors,
                                           id_categories=self.id_category,
                                           id_publishers=id_publisher,
                                           id_status=UNDEF_ID,
                                           oai_url=oai_url,
                                           pages=pages,
                                           publication_url=publication_url,
                                           title=title,
                                           volume=volume,
                                           year=year)

        return (rec_id, 1)

    def insert_record(self, record):
        """Insert an article in the database.

        Note:
            The method assumes that erratum are removed.

        Args:
            record (RecordPubli):
                the record describing the article.

        Returns:
            int:
                one when the record is inserted / updated in the database,
                zero otherwise.

        """
        db = self.db

        # alias
        editor = record.paper_editor()
        first_author = record.first_author()
        my_authors = record.my_authors
        oai_url = record.oai_url()
        pages = record.paper_pages()
        preprint_number = record.preprint_number()
        publication_url = record.paper_url()
        submitted = record.submitted()[0]
        title = record.title()
        volume = record.paper_volume()
        year = record.paper_year()

        # get the collaboration / publisher identifiers
        id_collaboration = \
            get_id(db.collaborations, collaboration=record.collaboration())

        id_publisher = get_id(db.publishers, abbreviation=editor)

        # get already published articles or preprint
        # A preprint is transformed into an article.
        #
        # NOTE: The check is performed by origin then by fields.
        # The latter is useful to cover the case where the record
        # is entered by hand or by another harvester.
        #
        fields = dict(id_publisher=id_publisher,
                      my_authors=my_authors,
                      oai_url=oai_url,
                      pages=pages,
                      publication_url=publication_url,
                      title=title,
                      volume=volume)

        rec_id, status = self.get_record_by_origin(record.primary_oai_url(),
                                                   year,
                                                   **fields)
        if rec_id:
            return status

        fields = dict(id_publisher=id_publisher,
                      my_authors=my_authors,
                      pages=pages,
                      publication_url=publication_url,
                      preprint_number=preprint_number,
                      title=title,
                      volume=volume)

        rec_id, status = self.get_record_by_fields(oai_url, year, **fields)
        if rec_id:
            return status

        # eventually insert a new articles in the database
        # try to improve the rescue list for CPPM authors
        #
        # in dry run mode nothing is written but the load is still logged
        ret = 1
        if not self.dry_run:

            fields = dict(authors=record.authors(),
                          authors_institute=my_authors,
                          first_author=first_author,
                          id_categories=self.id_category,
                          id_collaborations=id_collaboration,
                          id_projects=self.id_project,
                          id_publishers=id_publisher,
                          id_status=UNDEF_ID,
                          id_teams=self.id_team,
                          origin=oai_url,
                          pages=pages,
                          preprint=preprint_number,
                          publication_url=publication_url,
                          submitted=submitted,
                          title=title,
                          volume=volume,
                          year=year)

            ret = self._insert_in_db(log_year=year, **fields)
            if ret == 1:
                learn_my_authors(db,
                                 authors=record.my_authors,
                                 id_project=self.id_project,
                                 id_team=self.id_team,
                                 year=year)

        if ret == 1:
            self.logs[-1].load(MSG_LOAD, year)
            return 1

        return 0