"""harvest_tools.articles

Automaton used by the harvester to check published articles and to
insert them in the publications table of the database.

"""
from .automaton import Automaton
from .base import (learn_my_authors,
6
                   MSG_CRASH,
7 8
                   MSG_FIX_ORIGIN,
                   MSG_IN_DB,
9 10 11
                   MSG_LOAD,
                   T4,
                   T6)
12
from plugin_dbui import get_id, UNDEF_ID
13
from store_tools import CheckException
14

15
# Messages handed to the harvester log by the article automaton.
MSG_NO_EDITOR = "Reject article is not published"

# NOTE(review): "and article" looks like a typo for "an article". The text
# is left untouched because it may be used as a translation key -- confirm
# before changing it.
MSG_NOT_ARTICLE = "Reject publication is not and article"

# Logged when an existing preprint record is turned into an article.
MSG_TRANSFORM_PREPRINT = "Transform the preprint into an article"
18

19

20
class Articles(Automaton):
21
    """Automaton for articles.
22 23

    """
24

25 26
    def __init__(self, *args, **kwargs):
        """Initialise the automaton and cache the preprint category id.

        All positional and keyword arguments are forwarded unchanged to
        the constructor of the base class.

        """
        super().__init__(*args, **kwargs)

        # identifier of the category having the code "PRE" (preprint)
        categories = self.db.categories
        self.id_preprint = get_id(categories, code="PRE")

32
    def check_record(self, record):
        """Verify an article record and repair its non-conformities.

        The verification covers the following points:

            * the publication is a published article
            * it is signed by authors from my institute
            * the name of the collaboration is standardised
            * the authors are formatted according to my format
            * the authors from my institute signing the publication
              are extracted
            * the submitted date is well formed

            * the editor is formatted according to my criteria
            * published synonyms are resolved
            * the paper reference is checked

        Any failure is reported through the last entry of ``self.logs``.

        Args:
            record (RecordPubli):
                the record describing the article.

        Returns:
            bool:
                ``False`` when a non-conformity is found and
                can not be corrected, ``True`` otherwise.

        """
        self.logger.debug(f"{T4}check and fix record (article)")

        # only publications with the subtype "article" are handled here
        if record.subtype() != "article":
            self.logs[-1].reject(MSG_NOT_ARTICLE, record)
            return False

        try:
            # authors, collaboration and submitted date
            # (institute membership, name standardisation, author format,
            # extraction of my authors, date format)
            options = {"db": self.db,
                       "fmt_author": "F. Last",
                       "rex_institute": self.rex_institute,
                       "sep_author": ", ",
                       "sort_author": True}
            record.check_and_fix(**options)

            # editor and publisher
            record.format_editor()
            record.check_publisher(self.db)

        except CheckException as error:
            # a known non-conformity which can not be fixed
            self.logs[-1].reject(error, record=record)
            return False

        except Exception as error:
            # anything unexpected is logged as a crash, and the record
            # is rejected instead of stopping the harvester
            self.logs[-1].reject(MSG_CRASH % error,
                                 record=record,
                                 translate=False)
            return False

        else:
            return True

87
    def get_record_by_fields(self,
                             oai_url,
                             year,
                             id_publisher=None,
                             my_authors=None,
                             pages=None,
                             publication_url=None,
                             preprint_number=None,
                             title=None,
                             volume=None):
        """Get article matching fields values defined
        in the keyword arguments.

        The search is performed in two steps:

            1. among the publications of the current project and team
               having the same publisher, pages, volume and year.
               When the match has an empty origin field, it is set to
               **oai_url**.
            2. among the preprints of the current project and team having
               the same preprint number. A matching preprint is
               transformed into an article.

        Note:
            This method is required to deal with an article entered by hand and
            found later by the harvester.

            The database is never modified when ``self.dry_run`` is true,
            but the log entries and the returned status are the same.

        Args:
            oai_url (str):
                the oai_url, *e.g* ``http://cds.cern.ch/record/123456``.
                The origin field of the existing database record is updated
                to **oai_url** when a match is found and its origin is empty.

            year (str):
                the year of the publication. It is used
                by the search algorithm and by the logger.

        Keyword Args:
            id_publisher (int):
                identifier of the publisher in the database.

            my_authors (str):
                authors of my institute separated by a comma.

            pages (str):
                the page reference.

            publication_url (str):
                the URL of the publications

            preprint_number (str):
                the preprint number

            title (str):
                the title of the publication.

            volume (str):
                the volume reference.

        Returns:
            tuple:
                ``(id, status)`` which contains the ``id`` of the record.
                It is equal to ``None`` when nothing is found.
                The ``status`` is equal to one when the origin of an
                existing article was fixed or when an existing preprint
                was modified into article, zero otherwise

        """
        self.logger.debug(f"{T6}check existing article by fields")

        # alias
        db = self.db
        id_project = self.id_project
        id_team = self.id_team
        logs = self.logs

        # check against published articles
        rec_id = get_id(db.publications,
                        id_projects=id_project,
                        id_publishers=id_publisher,
                        id_teams=id_team,
                        pages=pages,
                        volume=volume,
                        year=year)

        if rec_id:
            # the row is only fetched once a match is known to exist
            publication = db.publications[rec_id]

            # fix origin field
            if not publication.origin:
                if not self.dry_run:
                    # write through the table: assigning the dictionary to
                    # a local name would leave the database unchanged
                    db.publications[rec_id] = dict(origin=oai_url)

                logs[-1].modify(MSG_FIX_ORIGIN, year)
                return (rec_id, 1)

            logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # check against published preprint
        # a preprint is identified by its category (code PRE)
        rec_id = get_id(db.publications,
                        id_categories=self.id_preprint,
                        id_projects=id_project,
                        id_teams=id_team,
                        preprint=preprint_number)

        if not rec_id:
            return (None, 0)

        # transform an existing preprint into article
        # institute authors can be missing in the preprint
        # change also the status
        logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)

        if not self.dry_run:
            db.publications[rec_id] = dict(authors_institute=my_authors,
                                           id_categories=self.id_category,
                                           id_publishers=id_publisher,
                                           id_status=UNDEF_ID,
                                           pages=pages,
                                           publication_url=publication_url,
                                           title=title,
                                           volume=volume,
                                           year=year)

        return (rec_id, 1)

203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
    def get_record_by_origin(self,
                             primary_oai_url,
                             year,
                             id_publisher=None,
                             my_authors=None,
                             oai_url=None,
                             pages=None,
                             publication_url=None,
                             title=None,
                             volume=None):
        """Look for an existing record whose origin field contains
        the *primary_oai_url*.

        When the record found is a preprint, it is transformed into
        an article using the values of the keyword arguments.

        Note:
            This method is required to transform a preprint into an article.
            All the keyword arguments are needed by the transformation.

            The database is not modified when ``self.dry_run`` is true.

        Args:
            primary_oai_url (str):
                the *primary* OAI identifier of the
                record. It is used by the search algorithm.

            year (str):
                the year of publication. It is used by the logger and
                written in the record when a preprint is transformed.

        Keyword Args:
            id_publisher (int):
                identifier of the publisher in the database.

            my_authors (str):
                authors of my institute separated by a comma.

            oai_url (str):
                the full oai_url(s) of the article.

            pages (str):
                the page reference.

            publication_url (str):
                the URL of the publications

            title (str):
                the title of the publication.

            volume (str):
                the volume reference.

        Returns:
            tuple:
                ``(id, status)`` which contains the ``id`` of the record.
                It is equal to ``None`` when nothing is found.
                The ``status`` is equal to one when the existing preprint was
                modified into article, zero otherwise

        """
        self.logger.debug(f"{T6}check existing article by origin")

        db = self.db
        table = db.publications

        # records whose origin contains the primary OAI url
        matches = db(table.origin.contains(primary_oai_url))
        if matches.count() == 0:
            return (None, 0)

        # keep the first matching record
        rec_id = matches.select(table.id).first().id
        publication = table[rec_id]

        # anything but a preprint is already in the database as it should
        if publication.id_categories != self.id_preprint:
            self.logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # the preprint becomes an article
        self.logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)
        if self.dry_run:
            return (rec_id, 1)

        # NOTE(review): the key "oai_url" differs from the "origin" field
        # used everywhere else in this file for the publications table --
        # confirm that the table really has an "oai_url" field.
        table[rec_id] = {"authors_institute": my_authors,
                         "id_categories": self.id_category,
                         "id_publishers": id_publisher,
                         "id_status": UNDEF_ID,
                         "oai_url": oai_url,
                         "pages": pages,
                         "publication_url": publication_url,
                         "title": title,
                         "volume": volume,
                         "year": year}

        return (rec_id, 1)

297 298
    def insert_record(self, record):
        """Insert an article in the database.

        An existing record is searched first by origin, then by fields.
        The search by fields covers the case where the record was entered
        by hand or by another harvester. An existing preprint is
        transformed into an article by these searches. A new record is
        inserted only when both searches find nothing.

        Note:
            The method assumes that erratum are removed.

            When ``self.dry_run`` is true nothing is inserted, but the
            load is logged and one is returned.

        Args:
            record (RecordPubli):
                the record describing the article.

        Returns:
            int:
                one when the record is inserted / updated in the database,
                zero otherwise.

        """
        db = self.db

        # values extracted from the record
        editor = record.paper_editor()
        first_author = record.first_author()
        my_authors = record.my_authors
        oai_url = record.oai_url()
        pages = record.paper_pages()
        preprint_number = record.preprint_number()
        publication_url = record.paper_url()
        submitted = record.submitted()
        title = record.title()
        volume = record.paper_volume()
        year = record.paper_year()

        # identifiers of the collaboration and of the publisher
        collaboration = record.collaboration()
        id_collaboration = get_id(db.collaborations,
                                  collaboration=collaboration)
        id_publisher = get_id(db.publishers, abbreviation=editor)

        # keyword arguments shared by the two searches
        shared = {"id_publisher": id_publisher,
                  "my_authors": my_authors,
                  "pages": pages,
                  "publication_url": publication_url,
                  "title": title,
                  "volume": volume}

        # 1. search by origin
        found_id, status = self.get_record_by_origin(record.primary_oai_url(),
                                                     year,
                                                     oai_url=oai_url,
                                                     **shared)
        if found_id:
            return status

        # 2. search by fields
        found_id, status = \
            self.get_record_by_fields(oai_url,
                                      year,
                                      preprint_number=preprint_number,
                                      **shared)
        if found_id:
            return status

        # 3. nothing found, a new article has to be inserted.
        # In dry run mode the load is only logged.
        if self.dry_run:
            self.logs[-1].load(MSG_LOAD, year)
            return 1

        values = {"authors": record.authors(),
                  "authors_institute": my_authors,
                  "first_author": first_author,
                  "id_categories": self.id_category,
                  "id_collaborations": id_collaboration,
                  "id_projects": self.id_project,
                  "id_publishers": id_publisher,
                  "id_status": UNDEF_ID,
                  "id_teams": self.id_team,
                  "origin": oai_url,
                  "pages": pages,
                  "preprint": preprint_number,
                  "publication_url": publication_url,
                  "submitted": submitted,
                  "title": title,
                  "volume": volume,
                  "year": year}

        inserted = self._insert_in_db(log_year=year, **values)
        if inserted != 1:
            return 0

        # try to improve the rescue list of the authors of my institute
        learn_my_authors(db,
                         authors=record.my_authors,
                         id_project=self.id_project,
                         id_team=self.id_team,
                         year=year)

        self.logs[-1].load(MSG_LOAD, year)
        return 1