""" harvest_tools.articles

"""
from .automaton import Automaton
from .base import (learn_my_authors,
6
                   MSG_CRASH,
7 8
                   MSG_FIX_ORIGIN,
                   MSG_IN_DB,
9 10 11
                   MSG_LOAD,
                   T4,
                   T6)
12
from plugin_dbui import get_id, UNDEF_ID
13
from store_tools import CheckException
14

15
# Log messages used when an article record is rejected or transformed.
# NOTE(review): "is not and article" looks like a typo for "is not an
# article". The text is left untouched because it is emitted at runtime and
# may also serve as a translation key -- confirm before changing it.
MSG_IS_PREPRINT = "Reject publication is a preprint"
MSG_NO_EDITOR = "Reject article is not published"
MSG_NOT_ARTICLE = "Reject publication is not and article"
MSG_TRANSFORM_PREPRINT = "Transform the preprint into an article"
19

20

21
class Articles(Automaton):
    """Automaton for articles.

    A harvested record is validated by :meth:`check_record` and stored by
    :meth:`insert_record`. Before inserting, the database is searched for
    an existing entry, first by origin and then by fields; a matching
    preprint is transformed into an article instead of being duplicated.

    """

    def __init__(self, *args, **kwargs):
        """Initialise the base automaton and cache the preprint category.

        All the arguments are forwarded unchanged to the base class.

        """
        super().__init__(*args, **kwargs)

        # identifier of the preprint category (code "PRE")
        self.id_preprint = get_id(self.db.categories, code="PRE")

    def check_record(self, record):
        """Check the content of the article in order to fix non-conformities.

            * publication is a published article
            * is with authors from my institute
            * standardise name of collaboration
            * format authors according to my format
            * extract authors from my institute signing the publication
            * is submitted date well formed

            * format editor according to my criteria
            * resolve published synonym
            * check reference paper

        Args:
            record (RecordPubli):
                the record describing the article.

        Returns:
            bool:
                ``False`` when a non conformity is found and
                can not be corrected. The reason is reported in the
                current log entry.

        """
        self.logger.debug(f"{T4}check and fix record (article)")

        # anything which is not an article is rejected, with a dedicated
        # message for preprints
        stype = record.subtype()
        if stype != "article":
            msg = (MSG_IS_PREPRINT if stype == "preprint" else MSG_NOT_ARTICLE)
            self.logs[-1].reject(msg, record)
            return False

        try:
            # is with authors from my institute
            # standardise name of collaboration
            # format authors according to my format
            # extract authors from my institute signing the publication
            # is submitted date well formed
            record.check_and_fix(db=self.db,
                                 fmt_author="F. Last",
                                 rex_institute=self.rex_institute,
                                 sep_author=", ",
                                 sort_author=True)

            record.format_editor()
            record.check_publisher(self.db)

        except CheckException as e:
            # known non-conformity which can not be fixed
            self.logs[-1].reject(e, record=record)
            return False

        except Exception as e:
            # unexpected failure: reject the record instead of aborting
            # the whole harvest
            self.logs[-1].reject(MSG_CRASH % e, record=record, translate=False)
            return False

        return True

    def get_record_by_fields(self,
                             oai_url,
                             year,
                             id_publisher=None,
                             my_authors=None,
                             pages=None,
                             publication_url=None,
                             preprint_number=None,
                             title=None,
                             volume=None):
        """Get article matching fields values defined
        in the keyword arguments.

        Note:
            This method is required to deal with an article entered by hand and
            found later by the harvester.

            Nothing is written in the database when ``self.dry_run`` is set,
            but the log entry and the returned value are the same.

        Args:
            oai_url (str):
                the oai_url, *e.g* ``http://cds.cern.ch/record/123456``.
                The origin field of the existing database record is updated
                to **oai_url** when a match is found and its origin is empty.

            year (str):
                the year of the publication. It is used
                by the search algorithm and by the logger.

        Keyword Args:
            id_publisher (int):
                identifier of the publisher in the database.

            my_authors (str):
                authors of my institute separated by a comma.

            pages (str):
                the page reference.

            publication_url (str):
                the URL of the publications

            preprint_number (str):
                the preprint number

            title (str):
                the title of the publication.

            volume (str):
                the volume reference.

        Returns:
            tuple:
                ``(id, status)`` which contains the ``id`` of the record.
                It is equal to ``None`` when nothing is found.
                The ``status`` is equal to one when the existing record was
                modified (origin fixed or preprint transformed into article),
                zero otherwise.

        """
        self.logger.debug(f"{T6}check existing article by fields")

        # alias
        db = self.db
        id_project = self.id_project
        id_team = self.id_team
        logs = self.logs

        # check against published articles
        rec_id = get_id(db.publications,
                        id_projects=id_project,
                        id_publishers=id_publisher,
                        id_teams=id_team,
                        pages=pages,
                        volume=volume,
                        year=year)

        if rec_id:
            publication = db.publications[rec_id]

            # fix origin field
            # the update has to go through the table: assigning to the
            # local name would leave the database untouched
            if not publication.origin:
                if not self.dry_run:
                    db.publications[rec_id] = dict(origin=oai_url)

                logs[-1].modify(MSG_FIX_ORIGIN, year)
                return (rec_id, 1)

            logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # check against published preprint
        # a preprint is identified by its category (code "PRE")
        rec_id = get_id(db.publications,
                        id_categories=self.id_preprint,
                        id_projects=id_project,
                        id_teams=id_team,
                        preprint=preprint_number)

        if not rec_id:
            return (None, 0)

        # transform an existing preprint into article
        # institute authors can be missing in the preprint
        # change also the status
        logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)

        if not self.dry_run:
            db.publications[rec_id] = dict(authors_institute=my_authors,
                                           id_categories=self.id_category,
                                           id_publishers=id_publisher,
                                           id_status=UNDEF_ID,
                                           pages=pages,
                                           publication_url=publication_url,
                                           title=title,
                                           volume=volume,
                                           year=year)

        return (rec_id, 1)

    def get_record_by_origin(self,
                             primary_oai_url,
                             year,
                             id_publisher=None,
                             my_authors=None,
                             oai_url=None,
                             pages=None,
                             publication_url=None,
                             title=None,
                             volume=None):
        """Get an existing record using the origin field and its value
        defined in the *primary_oai_url* argument.

        Note:
            This method is required to transform a preprint into an article.
            All the keyword arguments are needed by the transformation.

            Nothing is written in the database when ``self.dry_run`` is set,
            but the log entry and the returned value are the same.

        Args:
            primary_oai_url (str):
                the *primary* OAI identifier of the
                record. It is used by the search algorithm.

            year (str):
                the year of publication which is used
                by the logger.

        Keyword Args:
            id_publisher (int):
                identifier of the publisher in the database.

            my_authors (str):
                authors of my institute separated by a comma.

            oai_url (str):
                the full oai_url(s) of the article.

            pages (str):
                the page reference.

            publication_url (str):
                the URL of the publications

            title (str):
                the title of the publication.

            volume (str):
                the volume reference.

        Returns:
            tuple:
                ``(id, status)`` which contains the ``id`` of the record.
                It is equal to ``None`` when nothing is found.
                The ``status`` is equal to one when the existing preprint was
                modified into article, zero otherwise

        """
        self.logger.debug(f"{T6}check existing article by origin")

        # alias
        db = self.db
        logs = self.logs
        publications = db.publications

        # search by origin
        query = publications.origin.contains(primary_oai_url)
        setrows = db(query)
        if setrows.count() == 0:
            return (None, 0)

        # a record is found, the first match is used
        rec_id = setrows.select(publications.id).first().id
        publication = publications[rec_id]

        # not a preprint ?
        if publication.id_categories != self.id_preprint:
            logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # transform a preprint into an article
        logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)
        if not self.dry_run:
            # NOTE(review): the key ``oai_url`` is used here whereas the
            # search above and insert_record use the ``origin`` field.
            # Confirm that the publications table has an ``oai_url`` field,
            # otherwise this value is probably meant for ``origin``.
            publications[rec_id] = dict(authors_institute=my_authors,
                                        id_categories=self.id_category,
                                        id_publishers=id_publisher,
                                        id_status=UNDEF_ID,
                                        oai_url=oai_url,
                                        pages=pages,
                                        publication_url=publication_url,
                                        title=title,
                                        volume=volume,
                                        year=year)

        return (rec_id, 1)

    def insert_record(self, record):
        """Insert an article in the database.

        Note:
            The method assumes that erratum are removed.

            An existing record is searched by origin then by fields.
            When one is found nothing is inserted and the status of the
            search is returned.

        Args:
            record (RecordPubli):
                the record describing the article.

        Returns:
            int:
                one when the record is inserted / updated in the database,
                zero otherwise. In dry run mode, one is also returned for a
                record which would have been inserted.

        """
        db = self.db

        # alias
        editor = record.paper_editor()
        first_author = record.first_author()
        my_authors = record.my_authors
        oai_url = record.oai_url()
        pages = record.paper_pages()
        preprint_number = record.preprint_number()
        publication_url = record.paper_url()
        submitted = record.submitted()
        title = record.title()
        volume = record.paper_volume()
        year = record.paper_year()

        # get the collaboration / publisher identifiers
        id_collaboration = \
            get_id(db.collaborations, collaboration=record.collaboration())

        id_publisher = get_id(db.publishers, abbreviation=editor)

        # get already published articles or preprint
        # A preprint is transformed into an article.
        #
        # NOTE: The check is performed by origin then by fields.
        # The latter is useful to cover the case where the record
        # is entered by hand or by another harvester.
        #
        # keyword arguments shared by both searches
        common = dict(id_publisher=id_publisher,
                      my_authors=my_authors,
                      pages=pages,
                      publication_url=publication_url,
                      title=title,
                      volume=volume)

        rec_id, status = self.get_record_by_origin(record.primary_oai_url(),
                                                   year,
                                                   oai_url=oai_url,
                                                   **common)
        if rec_id:
            return status

        rec_id, status = self.get_record_by_fields(
            oai_url,
            year,
            preprint_number=preprint_number,
            **common)
        if rec_id:
            return status

        # eventually insert a new articles in the database
        # try to improve the rescue list for CPPM authors
        # in dry run mode the insertion is skipped but reported as done
        ret = 1
        if not self.dry_run:

            fields = dict(authors=record.authors(),
                          authors_institute=my_authors,
                          first_author=first_author,
                          id_categories=self.id_category,
                          id_collaborations=id_collaboration,
                          id_projects=self.id_project,
                          id_publishers=id_publisher,
                          id_status=UNDEF_ID,
                          id_teams=self.id_team,
                          origin=oai_url,
                          pages=pages,
                          preprint=preprint_number,
                          publication_url=publication_url,
                          submitted=submitted,
                          title=title,
                          volume=volume,
                          year=year)

            ret = self._insert_in_db(log_year=year, **fields)
            if ret == 1:
                learn_my_authors(db,
                                 authors=my_authors,
                                 id_project=self.id_project,
                                 id_team=self.id_team,
                                 year=year)

        if ret == 1:
            self.logs[-1].load(MSG_LOAD, year)
            return 1

        return 0