automaton.py 22.5 KB
Newer Older
1
""" harvest_tools.automaton
2 3

"""
4
import logging
5 6 7
import re
import traceback

8

9 10 11 12
from .base import (MSG_FIX_ORIGIN,
                   MSG_IN_DB,
                   ToolException)
from .checkandfix import CheckAndFix
13
from gluon.storage import Storage
LE GAC Renaud's avatar
LE GAC Renaud committed
14 15
from invenio_tools import (CdsException,
                           InvenioStore,
16
                           OAI_URL)
LE GAC Renaud's avatar
LE GAC Renaud committed
17
from invenio_tools.factory import build_record
18 19
from .msg import Msg
from .msgcollection import MsgCollection
20
from plugin_dbui import CALLBACK_ERRORS, get_id
21

22

23 24 25
MSG_NO_CAT = 'Select a "category" !!!'
MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM = 'Select a "team" !!!'
26

LE GAC Renaud's avatar
LE GAC Renaud committed
27
MSG_INSERT_FAIL = "Fail to insert the new record in the database."
28

29
OAI = "oai:%s:%i"
30

31 32 33 34
# Extract the collection introduced by the "cc" keyword of an inspirehep
# query. Required for collections such as "Hal Hidden".
# Group 1 captures the collection name (letters and spaces only); the name
# has to be followed by "and" or by the end of the string.
# NOTE(review): the character class also matches the letters of "and", so
# the greedy group runs past an "and" that is followed only by letters and
# spaces (e.g. "cc A and tc p" captures " A and tc p") -- confirm that the
# "cc" clause is always the last letter-only clause of the query.
REG_COLLECTION = re.compile(r"cc([A-Za-z ]+)(and|$)")

35 36
T2 = " "*2
T4 = " "*4
37
T6 = " "*6
38

39

40
class Automaton(object):
41
    """Base class to search and process publications:
42

43
        * Decode the selector defining user criteria.
LE GAC Renaud's avatar
LE GAC Renaud committed
44
        * Search in the store publications matching user criteria.
LE GAC Renaud's avatar
LE GAC Renaud committed
45
        * Instantiate the record and check it.
46
        * Insert new records in the database.
47

48 49
    Note:
        The parameters of the search are defined by the current ``request``.
50

51 52 53
    The logic implemented in the ``Automaton`` class is the following:

        #. Ask to the store, all the `record_id` satisfying the user request.
LE GAC Renaud's avatar
LE GAC Renaud committed
54 55
        #. Reject any `record_id` contained in the *origin* field of a
           database entry.
LE GAC Renaud's avatar
LE GAC Renaud committed
56
        #. Request to the store, the JSON description of the publications
LE GAC Renaud's avatar
LE GAC Renaud committed
57 58 59 60
           and decode them.
        #. Reject the record for which the *secondary_oai_url* is contained in
           the *origin* field of a database entry. Update the *origin* field
           of the database record.
61
        #. Check that the *oai* of the publication is defined and well formed.
LE GAC Renaud's avatar
LE GAC Renaud committed
62 63
           Recover it, if it is not the case. At this stage the OAI is always
           defined.
64 65
        #. Reject temporary publications.
        #. Check that *authors* are defined.
66
           Reject the publication if it is not the case.
67
        #. Check that *my institute* is in the list of the institutes
68 69 70 71 72 73
           signing the publication. Reject the publication if it is
           not the case. When the affiliation are not defined,
           try to recover this case, by finding the author of my institute
           signing the publication. This recovery procedure uses
           the *author rescue list*. Reject the record when the recovery
           procedure failed.
74
        #. Check that the *collaboration*, if defined, is well formed.
75
           Reject the publication if it is not the case
76 77 78 79 80
        #. Several checks are applied depending on the publication type.
        #. At the end of this process, the publisher, the authors are
           formatted and the list of signatories of my institute extracted.

    Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
81 82 83 84 85 86 87 88 89
        db (gluon.DAL):
            the database connection.

        id_team (int):
            the identifier of the team in the database.

        id_project (int):
            the identifier of the project in the database.

90
        automaton (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
            the name of the automaton which will be used to process the data.
            Possible values are: ``articles``, ``notes``, ``preprints``,
            ``proceedings``, ``reports``, ``talks`` and ``theses``.

        id_category (int):
            the identifier of the category of publication

        year_start (int):
            starting year for the scan

        year_end (int):
            ending year of the scan

        dry_run (bool):
            new records are not inserted in the database when ``True``.

107
    Raises:
LE GAC Renaud's avatar
LE GAC Renaud committed
108 109
        ToolException:
            * team or project or the publication category not defined
110

111 112 113 114 115
    """
    def __init__(self,
                 db,
                 id_team,
                 id_project,
                 automaton,
                 id_category,
                 year_start=None,
                 year_end=None,
                 dry_run=True):
        """Store the user criteria and prepare the logging containers.

        Args:
            db (gluon.DAL):
                the database connection.

            id_team (int):
                the identifier of the team in the database.

            id_project (int):
                the identifier of the project in the database.

            automaton (str):
                the name of the automaton used to process the data.
                It is kept as the ``controller`` attribute.

            id_category (int):
                the identifier of the category of publication.

            year_start (int):
                starting year for the scan.

            year_end (int):
                ending year of the scan.

            dry_run (bool):
                new records are not inserted in the database when ``True``.

        Raises:
            ToolException:
                * the team, the project or the category is not defined.
                  They are checked in that order.

        """
        # the three identifiers are mandatory, the first missing one wins
        mandatory = ((id_team, MSG_NO_TEAM),
                     (id_project, MSG_NO_PROJECT),
                     (id_category, MSG_NO_CAT))

        for identifier, message in mandatory:
            if not identifier:
                raise ToolException(message)

        self.check = CheckAndFix()

        # user criteria
        self.db = db
        self.controller = automaton
        self.id_team = id_team
        self.id_project = id_project
        self.id_category = id_category
        self.year_start = year_start
        self.year_end = year_end
        self.dry_run = dry_run

        # processing state, the store is instantiated by process_url
        self.store = None
        self.logs = []
        self.collection_logs = []
        self.logger = logging.getLogger("web2py.app.limbra")

        # description of the harvester attached to each log entry
        self.harvester = Storage(id_teams=id_team,
                                 id_projects=id_project,
                                 controller=automaton,
                                 id_categories=id_category)

        # identifiers of the categories preprint and article,
        # required by the method _is_record_in_db
        self._id_preprint = get_id(db.categories, code="PRE")
        self._id_article = get_id(db.categories, code="ACL")

157 158 159
    def _insert_in_db(self, log_year="", **fields):
        """Insert the record in the database, handling database exception.

160
        Args:
161
            log_year (str): year of the record for the log
162

163
        Keyword Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
164 165
            **fields:
                keyword arguments defining the record values to be
166
                inserted in the database.
167

168
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
169 170
            int:
                one when the record is inserted / updated in the database,
171
                zero otherwise.
172 173 174 175 176

        """
        db = self.db

        try:
LE GAC Renaud's avatar
LE GAC Renaud committed
177
            rec_id = db.publications.insert(**fields)
LE GAC Renaud's avatar
LE GAC Renaud committed
178 179
            if rec_id:
                return 1
180

LE GAC Renaud's avatar
LE GAC Renaud committed
181
            # operation can be reject by callback table._before_insert
LE GAC Renaud's avatar
LE GAC Renaud committed
182
            else:
LE GAC Renaud's avatar
LE GAC Renaud committed
183
                msg = MSG_INSERT_FAIL
LE GAC Renaud's avatar
LE GAC Renaud committed
184 185
                if CALLBACK_ERRORS in db.publications:
                    msg = db.publications._callback_errors
186

LE GAC Renaud's avatar
LE GAC Renaud committed
187 188 189
                # reduce the error message
                if isinstance(msg, list):
                    msg = "%s %s" % (msg[0], msg[-1])
190

LE GAC Renaud's avatar
LE GAC Renaud committed
191 192
                self.logs[-1].reject(msg, log_year)
                return 0
193

LE GAC Renaud's avatar
LE GAC Renaud committed
194 195
        # operation can be rejected by the database
        except Exception as dbe:
196
            self.logs[-1].reject(str(dbe), log_year)
LE GAC Renaud's avatar
LE GAC Renaud committed
197
            return 0
198

LE GAC Renaud's avatar
LE GAC Renaud committed
199 200 201 202 203 204 205
    def _is_record_in_db(self,
                         collection_title,
                         host=None,
                         rec_id=None,
                         oai_url=None):
        """Return the database identifier when the publication is registered.
        The search is based on the ``origin`` field and on the primary OAI.
206

207 208
        Note:
            A new log entry is created when a record is found.
209

210
        Args:
211
            title (str): the title of the publication.
212 213

        Keyword Args:
214
            host (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
215
                the store. possible values are ``cds.cern.ch`` or
216 217
                ``inspirehep.net``. To be used with *rec_id*.

LE GAC Renaud's avatar
LE GAC Renaud committed
218 219 220
            rec_id (int):
                the record identifier in the store

221
            oai_url (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
222 223
                the URL of the record in the store.
                Either use *host* and *rec_id* or *oai_url*
224

225
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
226 227
            int:
                the id of the record in the database when a record is found,
228
                0 otherwise.
229

230
        Raises:
LE GAC Renaud's avatar
LE GAC Renaud committed
231 232
            ValueError:
                * keyword arguments are not defined properly.
233

234 235
        """
        db = self.db
236
        harvester = self.harvester
237

238 239 240 241 242 243 244 245
        # build the OAI URL
        if host is not None and rec_id is not None and oai_url is None:
            url = OAI_URL % (host, rec_id)
        elif host is None and rec_id is None and oai_url is not None:
            url = oai_url
        else:
            raise ValueError

LE GAC Renaud's avatar
LE GAC Renaud committed
246
        # protection empty URL
247 248 249
        if len(url) == 0:
            return 0

250 251 252
        # check the OAI
        query = db.publications.origin.contains(url)
        setrows = db(query)
253

254
        if setrows.count() == 0:
255
            return 0
256

257
        # one record found
258 259
        columns = [db.publications.id,
                   db.publications.id_categories,
260 261 262
                   db.publications.title,
                   db.publications.year]
        publication = setrows.select(*columns).first()
263

264 265
        # Note:
        # The category for the publication and the harvester have to be equal.
266 267 268 269 270 271 272
        # However, keep the record if it is a preprint when the harvester
        # looks for articles. This is required to transform a preprint
        # into article
        #
        # Category can disagree when the publication is an article and
        # the harvester look for preprint. In that case, keep the article
        #
273
        if publication.id_categories != harvester.id_categories:
274 275 276 277 278 279 280

            is_preprint_to_article = \
                publication.id_categories == self._id_preprint \
                and harvester.id_categories == self._id_article

            if is_preprint_to_article:
                return 0
281 282

        # log
283
        self.logs.append(Msg(harvester=harvester,
LE GAC Renaud's avatar
LE GAC Renaud committed
284
                             collection=collection_title,
285 286 287 288 289
                             record_id=rec_id,
                             title=publication.title))

        self.logs[-1].idle(MSG_IN_DB, publication.year)

290 291 292
        logger = self.logger
        logger.debug("")
        logger.debug(f"{T2}record {rec_id} in db with id {publication.id}")
LE GAC Renaud's avatar
LE GAC Renaud committed
293

294
        return publication.id
295

296 297 298 299 300
    def _search_parameters(self, collection):
        """Build the keywords to steer the URL search in invenio store.
        The main parameter is the collection and the date range defined
        in the selector.

301
        Args:
302
            collection (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
303 304
                string defining the collection in the store.
                The syntax depends on the invenio store:
305 306 307

                    * ``"find cn d0 and tc p and not tc c"``
                    * ``"LHCb Papers"``.
308

309
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
310 311 312
            dict:
                the key are a sub-set of those defined in
                :meth:`invenio_tools.InvenioStore.get_ids`.
313 314

        """
LE GAC Renaud's avatar
LE GAC Renaud committed
315 316
        year_start = self.year_start
        year_end = self.year_end
317 318

        # INSPIREHEP store
LE GAC Renaud's avatar
LE GAC Renaud committed
319
        if collection.startswith("find"):
320 321 322

            query = collection

LE GAC Renaud's avatar
LE GAC Renaud committed
323 324
            if year_start and not year_end:
                query += " and date %s" % year_start
325

LE GAC Renaud's avatar
LE GAC Renaud committed
326 327
            elif not year_start and year_end:
                query += " and date %s" % year_end
328

LE GAC Renaud's avatar
LE GAC Renaud committed
329
            elif year_start and year_end:
330
                query += " and date > %s and date < %s " \
LE GAC Renaud's avatar
LE GAC Renaud committed
331
                         % (year_start - 1, year_end + 1)
332 333 334

            dic = dict(p=query,  # query à la spires
                       rg=1000,  # maximum number of records returned
LE GAC Renaud's avatar
LE GAC Renaud committed
335 336
                       sf="year",  # sort by date
                       so="d")  # descending order
337

338 339 340 341 342 343 344 345 346
            # handle the cc keyword (true inspirehep collection)
            match = REG_COLLECTION.search(query)
            if match:
                dic["cc"] = match.group(1).strip()
                dic["p"] = REG_COLLECTION.sub("", query).strip()
                dic["p"] = dic["p"].replace("  ", " ")
                if dic["p"] == "find":
                    del dic["p"]

347 348 349
        # CERN INVENIO store
        else:

LE GAC Renaud's avatar
LE GAC Renaud committed
350 351
            if year_start and not year_end:
                rex = year_start
352

LE GAC Renaud's avatar
LE GAC Renaud committed
353 354
            elif not year_start and year_end:
                rex = year_end
355

LE GAC Renaud's avatar
LE GAC Renaud committed
356
            elif year_start and year_end:
357
                li = [str(el) for el in range(year_start, year_end + 1)]
LE GAC Renaud's avatar
LE GAC Renaud committed
358
                rex = "|".join(li)
359 360

            dic = dict(cc=collection,  # collection
LE GAC Renaud's avatar
LE GAC Renaud committed
361 362
                       f1="year",  # search on year
                       m1="r",  # use regular expression
363
                       p1=rex,  # regular expression defining year
LE GAC Renaud's avatar
LE GAC Renaud committed
364 365
                       sf="year",  # sort by date
                       so="d")  # descending order
366 367
        return dic

LE GAC Renaud's avatar
LE GAC Renaud committed
368
    def check_record(self, record):
        """Run the generic checks on the record, repairing it when possible.

        Note:
            Some checks depend on the type of publications and have to be
            implemented in inherited class.

        Note:
            The order of the checks matter. It should be OAI,
            temporary record, authors, my authors and then a series of checks
            specific to the publication type.

        Args:
            record (Record):
                JSON record describing the publication.

        Returns:
            bool:
                ``False`` when the OAI of the record is flagged as a bad
                one already used (the last log entry is set idle) or when
                a check raises an exception (the last log entry is
                rejected). ``True`` otherwise.

        """
        self.logger.debug(f"{T4}check record (automaton)")

        try:
            checker = self.check

            # a record without OAI gets one built from the host and its id
            if not checker.is_oai(record):
                record["oai"] = {"value": OAI % (self.harvester.host,
                                                 record.id())}

            if checker.is_bad_oai_used(record):
                self.logs[-1].idle(MSG_IN_DB, record.submitted())
                return False

            checker.temporary_record(record)
            checker.authors(record)
            checker.my_affiliation(record, self.id_project, self.id_team)
            checker.collaboration(record)

        # any exception raised above rejects the record
        except Exception as e:
            self.logs[-1].reject(e, record=record)
            return False

        return True

415
    def get_record_by_fields(self, oai_url, year, **kwargs):
        """Get database record matching fields values defined
        in the keyword arguments.

        Note:
            This method is required to deal with publication entered by hand
            and found later by an harvester.

        Args:
            oai_url (str):
                the oai_url, *e.g.* ``http://cds.cern.ch/record/123456``.
                The origin field of the existing database record is updated
                to **oai_url** when a match is found, unless the automaton
                runs in dry run mode.

            year (int):
                the year of the publication. It is used
                by the search algorithm and by the logger.

        Keyword Args:
            kwargs (str):
                 a series of key, value pair where the key is the name of a
                 publications database field.

        Returns:
            tuple:
                ``(id, status)`` which contains the ``id`` of the record.
                The ``id`` is equal to ``None`` when there is no matching.
                The ``status`` is equal to one when the origin of the
                existing record had to be fixed, zero otherwise.

        """
        self.logger.debug(f"{T6}get existing record by fields")

        # alias
        db = self.db
        logs = self.logs

        # add the publication year to search criteria
        if year:
            kwargs["year"] = year

        # look for an existing record
        rec_id = get_id(db.publications, **kwargs)
        if not rec_id:
            return (None, 0)

        publication = db.publications[rec_id]

        # the origin is already the expected one, nothing to fix
        if publication.origin and publication.origin == oai_url:
            logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # fix origin field in the database.
        # Rebinding the local name would leave the stored record unchanged,
        # the row has to be written back explicitly.
        if not self.dry_run:
            publication.update_record(origin=oai_url)

        logs[-1].modify(MSG_FIX_ORIGIN, year)
        return (rec_id, 1)

474 475
    def insert_record(self, record):
        """Insert the record in the database.
476

477 478 479
        Note:
            This method depend on the type of publications.
            It has to be implemented for each inherited class.
480

481
        Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
482 483
            record (Record):
                record describing the publication.
484

485
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
486 487
            int:
                one when the record is inserted / updated in the database,
488
                zero otherwise.
489 490 491 492

        """
        return 0

LE GAC Renaud's avatar
LE GAC Renaud committed
493
    def process_collection(self, collection):
        """Harvest one collection of the invenio store: search the record
        identifiers, skip those already registered in the database and
        process the remaining ones.

        Args:
            collection (str):
                name of the collection to be interrogated.

        Note:
            * A failing search is reported in the collection log and ends
              the processing of the collection.
            * Have a look to the attributes ``collection_logs`` and ``logs``
              in order to understand what happen.

        """
        logger = self.logger
        logger.debug(f"process collection {collection}")

        host = self.harvester.host
        project = self.db.projects[self.id_project].project
        store = self.store

        # open the log of the collection,
        # which is identified as "project / controller / collection"
        ctitle = "%s / %s / %s" % (project, self.controller, collection)
        collection_log = MsgCollection(title=ctitle)
        self.collection_logs.append(collection_log)

        # search parameters for the collection including user criteria
        kwargs = self._search_parameters(collection)

        # identifiers of the records matching the search criteria
        try:
            rec_ids = store.get_ids(**kwargs)

        except CdsException as error:
            logger.debug(f"exit process_collection: {error}")
            collection_log.url = store.last_search_url()
            collection_log.error = error
            return

        # log the search URL and the number of records found
        found = len(rec_ids)
        collection_log.url = store.last_search_url()
        collection_log.found = found

        if found == 0:
            logger.debug(f"no records found in {collection}")
            return

        logger.debug(f"{found} records found in {collection}")

        # drop the identifiers already registered in the database,
        # a log entry is created for each of them by _is_record_in_db
        is_in_db = self._is_record_in_db
        pending = []
        for rec_id in rec_ids:
            if is_in_db(ctitle, host, rec_id) == 0:
                pending.append(rec_id)

        # process the remaining identifiers
        process = self.process_recid
        for rec_id in pending:
            process(rec_id)
LE GAC Renaud's avatar
LE GAC Renaud committed
552

553 554
    def process_recjson(self, recjson):
        """Process the publication provided as a JSON record:

            * instantiate the record with ``build_record``
            * open a log entry for the record
            * check the record
            * insert the record in the database via ``insert_record``

        Args:
            recjson (dict):
                record provided by the store.

        """
        logger = self.logger
        logger.debug(f"{T4}process record {recjson['recid']} (process_recjson)")

        collection_title = self.collection_logs[-1].title
        harvester = self.harvester
        logs = self.logs

        # instantiate the record
        record = build_record(recjson)

        logger.debug(f"{T4}{record.title()[:72]}")

        # start the log for the record
        entry = Msg(harvester=harvester,
                    collection=collection_title,
                    record_id=record.id(),
                    title=record.title())
        logs.append(entry)

        # stop here when the record is not well formed and can not be repaired
        if not self.check_record(record):
            logger.debug(f"{T4}{logs[-1].txt}")
            return

        if self.dry_run:
            txt = "(dry run)"
        else:
            txt = ""
        logger.debug(f"{T4}insert record in the database {txt}")

        # insert the record in the database
        self.insert_record(record)

        # summarise the outcome of the last log entry
        if logger.getEffectiveLevel() == logging.DEBUG:
            last = logs[-1]
            action = last.action
            if isinstance(action, str):
                action = action.upper()
            logger.debug(f"{T4}log: {action} {last.txt}")
LE GAC Renaud's avatar
LE GAC Renaud committed
600

601 602 603 604
    def process_recid(self, rec_id):
        """Process the publication identified by its record identifier:

            * get the publication data from the store using its identifier
            * hand the JSON record over to ``process_recjson``

        Note:
            * Any exception raised on the way is caught: a log entry
              titled with the URL of the record is added and rejected.
            * Have a look to the attribute ``collection_logs`` and ``logs`` in
              order to understand what happen.

        Args:
            rec_id (int):
                identifier of the publication in the store.

        """
        logger = self.logger
        logger.debug("")
        logger.debug(f"{T2}get record {rec_id} (process_recid)")

        collection_logs = self.collection_logs
        harvester = self.harvester
        logs = self.logs

        try:
            self.process_recjson(self.store.get_record(rec_id))

        except Exception as e:
            logger.debug(f"{T2}{str(e)}")

            # the record is identified in the log by its URL
            entry = Msg(harvester=harvester,
                        collection=collection_logs[-1].title,
                        record_id=rec_id,
                        title=OAI_URL % (harvester.host, rec_id))
            logs.append(entry)
            entry.reject(e)

LE GAC Renaud's avatar
LE GAC Renaud committed
643 644 645 646
    def process_url(self, host, collections):
        """Harvest a list of collections from an invenio store and
        insert corresponding records in the database.

        Note:
            * The host and the collections are added to the ``harvester``
              attribute and a new store is instantiated for the host.
            * Have a look to the attributes ``collection_logs`` and ``logs``
              in order to understand what happen.

        Args:
            host (str):
                host name to query for publications, either
                ``cds.cern.ch`` or ``inspirehep.net``.

            collections (str):
                list of collection to be interrogated.
                Collections are separated by a comma.

        """
        logger = self.logger
        logger.debug("")
        logger.debug(f"process URL search -- {host} -- {collections}")

        # extend harvester for logs
        harvester = self.harvester
        harvester.host = host
        harvester.collections = collections

        # instantiate the store
        self.store = InvenioStore(host)

        # one collection per comma, spaces around the comma are dropped
        for collection in re.split(" *, *", collections):
            self.process_collection(collection)
677 678 679 680

    def report(self):
        """Build the processing report.

681 682
        Returns:
            dict:
LE GAC Renaud's avatar
LE GAC Renaud committed
683
                * ``collection_logs`` list of :class:`MsgCollection`
684
                * ``controller`` str
LE GAC Renaud's avatar
LE GAC Renaud committed
685
                * ``logs`` list of :class:`Msg`
LE GAC Renaud's avatar
LE GAC Renaud committed
686
                * ``selector`` :class:`plugin_dbui.Selector`
687 688 689 690 691 692

        """

        return dict(collection_logs=self.collection_logs,
                    controller=self.controller,
                    logs=self.logs)