automaton.py 22.6 KB
Newer Older
1
""" harvest_tools.automaton
2 3

"""
4
import logging
5 6 7
import re
import traceback

8

9 10 11 12
from .base import (MSG_FIX_ORIGIN,
                   MSG_IN_DB,
                   ToolException)
from .checkandfix import CheckAndFix
13
from gluon.storage import Storage
LE GAC Renaud's avatar
LE GAC Renaud committed
14 15
from invenio_tools import (CdsException,
                           InvenioStore,
16
                           OAI_URL)
LE GAC Renaud's avatar
LE GAC Renaud committed
17
from invenio_tools.factory import build_record
18 19
from .msg import Msg
from .msgcollection import MsgCollection
20
from plugin_dbui import CALLBACK_ERRORS, get_id
21

22

23 24 25
# messages of the ToolException raised when a mandatory element
# of the selection (category, project or team) is missing
MSG_NO_CAT = 'Select a "category" !!!'
MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM = 'Select a "team" !!!'

# message logged when the insertion of a new record is refused
MSG_INSERT_FAIL = "Fail to insert the new record in the database."

# template of an OAI identifier, filled with the host
# and the record identifier in the store
OAI = "oai:%s:%i"

# search collection when using inspirehep
# require for "Hal Hidden"
REG_COLLECTION = re.compile(r"cc([A-Za-z ]+)(and|$)")

# indentation prefixes (2, 4 and 6 spaces) used to align debug messages
T2, T4, T6 = (" " * width for width in (2, 4, 6))
38

39

40
class Automaton(object):
41
    """Base class to search and process publications:
42

43
        * Decode the selector defining user criteria.
LE GAC Renaud's avatar
LE GAC Renaud committed
44
        * Search in the store publications matching user criteria.
LE GAC Renaud's avatar
LE GAC Renaud committed
45
        * Instantiate the record and check it.
46
        * Insert new records in the database.
47

48 49
    Note:
        The parameters of the search are defined by the current ``request``.
50

51 52 53
    The logic implemented in the ``Automaton`` class is the following:

        #. Ask to the store, all the `record_id` satisfying the user request.
LE GAC Renaud's avatar
LE GAC Renaud committed
54 55
        #. Reject any `record_id` contained in the *origin* field of a
           database entry.
LE GAC Renaud's avatar
LE GAC Renaud committed
56
        #. Request to the store, the JSON description of the publications
LE GAC Renaud's avatar
LE GAC Renaud committed
57 58 59 60
           and decode them.
        #. Reject the record for which the *secondary_oai_url* is contained in
           the *origin* field of a database entry. Update the *origin* field
           of the database record.
61
        #. Check that the *oai* of the publication is defined and well formed.
LE GAC Renaud's avatar
LE GAC Renaud committed
62 63
           Recover it, if it is not the case. At this stage the OAI is always
           defined.
64 65
        #. Reject temporary publications.
        #. Check that *authors* are defined.
66
           Reject the publication if it is not the case.
67
        #. Check that *my institute* is in the list of the institutes
68 69 70 71 72 73
           signing the publication. Reject the publication if it is
           not the case. When the affiliation are not defined,
           try to recover this case, by finding the author of my institute
           signing the publication. This recovery procedure uses
           the *author rescue list*. Reject the record when the recovery
           procedure failed.
74
        #. Check that the *collaboration*, if defined, is well formed.
75
           Reject the publication if it is not the case
76 77 78 79 80
        #. Several checks are applied depending on the publication type.
        #. At the end of this process, the publisher, the authors are
           formatted and the list of signatories of my institute extracted.

    Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
81 82 83 84 85 86 87 88 89
        db (gluon.DAL):
            the database connection.

        id_team (int):
            the identifier of the team in the database.

        id_project (int):
            the identifier of the project in the database.

90
        automaton (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
            the name of the automaton which will be used to process the data.
            Possible values are: ``articles``, ``notes``, ``preprints``,
            ``proceedings``, ``reports``, ``talks`` and ``theses``.

        id_category (int):
            the identifier of the category of publication

        year_start (int):
            starting year for the scan

        year_end (int):
            ending year of the scan

        dry_run (bool):
            new records are not inserted in the database when ``True``.

107
    Raises:
LE GAC Renaud's avatar
LE GAC Renaud committed
108 109
        ToolException:
            * team or project or the publication category not defined
110

111 112 113 114 115
    """
    def __init__(self,
                 db,
                 id_team,
                 id_project,
                 automaton,
                 id_category,
                 year_start=None,
                 year_end=None,
                 dry_run=True):
        """Validate the selection and initialise the state of the automaton.

        Args:
            db (gluon.DAL): the database connection.
            id_team (int): identifier of the team in the database.
            id_project (int): identifier of the project in the database.
            automaton (str): name of the automaton, kept as ``controller``.
            id_category (int): identifier of the publication category.
            year_start (int): first year of the scan.
            year_end (int): last year of the scan.
            dry_run (bool): flag kept as ``self.dry_run``.

        Raises:
            ToolException:
                when the team, the project or the category is not defined.

        """
        # the team, the project and the category are mandatory;
        # they are verified in that order
        mandatory = ((id_team, MSG_NO_TEAM),
                     (id_project, MSG_NO_PROJECT),
                     (id_category, MSG_NO_CAT))

        for value, message in mandatory:
            if not value:
                raise ToolException(message)

        # user selection
        self.db = db
        self.controller = automaton
        self.dry_run = dry_run
        self.id_team = id_team
        self.id_project = id_project
        self.id_category = id_category
        self.year_start = year_start
        self.year_end = year_end

        # tools; the store is instantiated later by process_url
        self.check = CheckAndFix()
        self.logger = logging.getLogger("web2py.app.limbra")
        self.store = None

        # logs filled while collections and records are processed
        self.collection_logs = []
        self.logs = []

        # description of the harvester attached to each log entry
        self.harvester = Storage(id_teams=id_team,
                                 id_projects=id_project,
                                 controller=automaton,
                                 id_categories=id_category)

        # identifiers of the categories preprint and article,
        # used by the method _is_record_in_db
        categories = db.categories
        self._id_preprint = get_id(categories, code="PRE")
        self._id_article = get_id(categories, code="ACL")

157 158 159
    def _insert_in_db(self, log_year="", **fields):
        """Insert the record in the database, handling database exception.

160
        Args:
161
            log_year (str): year of the record for the log
162

163
        Keyword Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
164 165
            **fields:
                keyword arguments defining the record values to be
166
                inserted in the database.
167

168
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
169 170
            int:
                one when the record is inserted / updated in the database,
171
                zero otherwise.
172 173 174 175 176

        """
        db = self.db

        try:
LE GAC Renaud's avatar
LE GAC Renaud committed
177
            rec_id = db.publications.insert(**fields)
LE GAC Renaud's avatar
LE GAC Renaud committed
178 179
            if rec_id:
                return 1
180

LE GAC Renaud's avatar
LE GAC Renaud committed
181
            # operation can be reject by callback table._before_insert
LE GAC Renaud's avatar
LE GAC Renaud committed
182
            else:
LE GAC Renaud's avatar
LE GAC Renaud committed
183
                msg = MSG_INSERT_FAIL
LE GAC Renaud's avatar
LE GAC Renaud committed
184 185
                if CALLBACK_ERRORS in db.publications:
                    msg = db.publications._callback_errors
186

LE GAC Renaud's avatar
LE GAC Renaud committed
187 188 189
                # reduce the error message
                if isinstance(msg, list):
                    msg = "%s %s" % (msg[0], msg[-1])
190

LE GAC Renaud's avatar
LE GAC Renaud committed
191 192
                self.logs[-1].reject(msg, log_year)
                return 0
193

LE GAC Renaud's avatar
LE GAC Renaud committed
194 195
        # operation can be rejected by the database
        except Exception as dbe:
196
            self.logs[-1].reject(str(dbe), log_year)
LE GAC Renaud's avatar
LE GAC Renaud committed
197
            return 0
198

LE GAC Renaud's avatar
LE GAC Renaud committed
199 200 201 202 203 204 205
    def _is_record_in_db(self,
                         collection_title,
                         host=None,
                         rec_id=None,
                         oai_url=None):
        """Tell whether the publication is already registered in the database.

        A publication is considered as registered when its OAI URL is
        contained in the ``origin`` field of a database entry.

        Note:
            A new log entry is created when a record is found.

        Args:
            collection_title (str):
                title of the collection, copied in the log entry.

        Keyword Args:
            host (str):
                the store. To be used with *rec_id*.

            rec_id (int):
                the record identifier in the store

            oai_url (str):
                the URL of the record in the store.
                Either use *host* and *rec_id* or *oai_url*

        Returns:
            int:
                the id of the record in the database when a record is found,
                0 otherwise. Zero is also returned when the database entry
                is a preprint while the harvester looks for articles.

        Raises:
            ValueError:
                * keyword arguments are not defined properly.

        """
        db = self.db
        harvester = self.harvester

        # the URL is either built from (host, rec_id) or given as it is
        by_identifier = host is not None and rec_id is not None

        if by_identifier and oai_url is None:
            url = OAI_URL % (host, rec_id)

        elif host is None and rec_id is None and oai_url is not None:
            url = oai_url

        else:
            raise ValueError

        # protection empty URL
        if len(url) == 0:
            return 0

        # entries having the URL in their origin field
        table = db.publications
        matches = db(table.origin.contains(url))

        if matches.count() == 0:
            return 0

        # keep the first matching entry
        publication = matches.select(table.id,
                                     table.id_categories,
                                     table.title,
                                     table.year).first()

        # Note:
        # The category of the publication and of the harvester should agree.
        # A preprint found while the harvester looks for articles is
        # reported as not registered, which is required to transform
        # a preprint into an article.
        #
        # In the other cases of disagreement, e.g. an article found while
        # the harvester looks for preprints, the entry is kept.
        #
        found = publication.id_categories
        wanted = harvester.id_categories

        if found != wanted \
           and found == self._id_preprint \
           and wanted == self._id_article:
            return 0

        # log
        entry = Msg(harvester=harvester,
                    collection=collection_title,
                    record_id=rec_id,
                    title=publication.title)

        self.logs.append(entry)
        entry.idle(MSG_IN_DB, publication.year)

        logger = self.logger
        logger.debug("")
        logger.debug(f"{T2}record {rec_id} in db with id {publication.id}")

        return publication.id
295

296 297 298 299 300
    def _search_parameters(self, collection):
        """Build the keywords to steer the URL search in invenio store.
        The main parameter is the collection and the date range defined
        in the selector.

301
        Args:
302
            collection (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
303 304
                string defining the collection in the store.
                The syntax depends on the invenio store:
305 306 307

                    * ``"find cn d0 and tc p and not tc c"``
                    * ``"LHCb Papers"``.
308

309
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
310 311 312
            dict:
                the key are a sub-set of those defined in
                :meth:`invenio_tools.InvenioStore.get_ids`.
313 314

        """
LE GAC Renaud's avatar
LE GAC Renaud committed
315 316
        year_start = self.year_start
        year_end = self.year_end
317 318

        # INSPIREHEP store
LE GAC Renaud's avatar
LE GAC Renaud committed
319
        if collection.startswith("find"):
320 321 322

            query = collection

LE GAC Renaud's avatar
LE GAC Renaud committed
323
            if year_start and not year_end:
324
                query += f" and date {year_start}"
325

LE GAC Renaud's avatar
LE GAC Renaud committed
326
            elif not year_start and year_end:
327
                query += f" and date {year_end}"
328

LE GAC Renaud's avatar
LE GAC Renaud committed
329
            elif year_start and year_end:
330 331 332
                tpl = (f"date {el}" for el in range(year_start, year_end + 1))
                sdates = " or ".join(tpl)
                query += f" and ({sdates})"
333 334 335

            dic = dict(p=query,  # query à la spires
                       rg=1000,  # maximum number of records returned
LE GAC Renaud's avatar
LE GAC Renaud committed
336 337
                       sf="year",  # sort by date
                       so="d")  # descending order
338

339 340 341 342 343 344 345 346 347
            # handle the cc keyword (true inspirehep collection)
            match = REG_COLLECTION.search(query)
            if match:
                dic["cc"] = match.group(1).strip()
                dic["p"] = REG_COLLECTION.sub("", query).strip()
                dic["p"] = dic["p"].replace("  ", " ")
                if dic["p"] == "find":
                    del dic["p"]

348 349 350
        # CERN INVENIO store
        else:

LE GAC Renaud's avatar
LE GAC Renaud committed
351 352
            if year_start and not year_end:
                rex = year_start
353

LE GAC Renaud's avatar
LE GAC Renaud committed
354 355
            elif not year_start and year_end:
                rex = year_end
356

LE GAC Renaud's avatar
LE GAC Renaud committed
357
            elif year_start and year_end:
358 359
                tpl = (str(el) for el in range(year_start, year_end + 1))
                rex = "|".join(tpl)
360 361

            dic = dict(cc=collection,  # collection
LE GAC Renaud's avatar
LE GAC Renaud committed
362 363
                       f1="year",  # search on year
                       m1="r",  # use regular expression
364
                       p1=rex,  # regular expression defining year
LE GAC Renaud's avatar
LE GAC Renaud committed
365 366
                       sf="year",  # sort by date
                       so="d")  # descending order
367 368
        return dic

LE GAC Renaud's avatar
LE GAC Renaud committed
369
    def check_record(self, record):
        """Verify the record and repair its non-conformities when possible.

        Note:
            Some checks depend on the type of publications and have to be
            implemented in inherited class.

        Note:
            The order of the checks matter. It should be OAI,
            temporary record, authors, my authors and then a series of checks
            specific to the publication type.

        Args:
            record (Record):
                JSON record describing the publication.

        Returns:
            bool:
                ``False`` when the OAI is flagged as badly used or when
                one of the checks raises an exception, ``True`` otherwise.
                The reason is reported in the last entry of ``self.logs``.

        """
        self.logger.debug(f"{T4}check record (automaton)")

        try:
            checker = self.check

            # fix record with a missing OAI
            if not checker.is_oai(record):
                record["oai"] = {
                    "value": OAI % (self.harvester.host, record.id())}

            if checker.is_bad_oai_used(record):
                self.logs[-1].idle(MSG_IN_DB, record.submitted())
                return False

            checker.temporary_record(record)
            checker.authors(record)
            checker.my_affiliation(record, self.id_project, self.id_team)
            checker.collaboration(record)

        # any failure of a check rejects the record
        except Exception as error:
            self.logs[-1].reject(error, record=record)
            return False

        else:
            return True

416
    def get_record_by_fields(self, oai_url, year, **kwargs):
        """Get database record matching fields values defined
        in the keyword arguments.

        Note:
            This method is required to deal with publication entered by hand
            and found later by an harvester.

        Args:
            oai_url (str):
                the oai_url, *e.g.* ``http://cds.cern.ch/record/123456``.
                The origin field of the existing database record is updated
                to **oai_url** when a match is found, unless ``dry_run``
                is set.

            year (int):
                the year of the publication. It is used
                by the search algorithm and by the logger.

        Keyword Args:
            kwargs (str):
                 a series of key, value pair where the key is the name of a
                 publications database field.

        Returns:
            tuple:
                ``(id, status)`` which contains the ``id`` of the record.
                The ``id`` is equal to ``None`` when there is no matching.
                The ``status`` is equal to one when the origin of the
                existing record has to be modified, zero otherwise.

        """
        self.logger.debug(f"{T6}get existing record by fields")

        # alias
        db = self.db
        logs = self.logs

        # add the publication year to search criteria
        if year:
            kwargs["year"] = year

        # look for an existing record
        rec_id = get_id(db.publications, **kwargs)
        if not rec_id:
            return (None, 0)

        publication = db.publications[rec_id]

        # nothing to do when the origin already points to the store
        if publication.origin and publication.origin == oai_url:
            logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # fix origin field in the database.
        # Rebinding the local name to a dictionary, as done previously,
        # left the database entry untouched.
        if not self.dry_run:
            publication.update_record(origin=oai_url)

        logs[-1].modify(MSG_FIX_ORIGIN, year)
        return (rec_id, 1)

475 476
    def insert_record(self, record):
        """Insert the record in the database.
477

478 479 480
        Note:
            This method depend on the type of publications.
            It has to be implemented for each inherited class.
481

482
        Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
483 484
            record (Record):
                record describing the publication.
485

486
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
487 488
            int:
                one when the record is inserted / updated in the database,
489
                zero otherwise.
490 491 492 493

        """
        return 0

LE GAC Renaud's avatar
LE GAC Renaud committed
494
    def process_collection(self, collection):
        """Harvest one collection of the invenio store.

        The identifiers matching the search criteria are requested from
        the store, those already registered in the database are removed
        and the remaining ones are handed to :meth:`process_recid`.

        Args:
            collection (str):
                name of the collection to be interrogated.

        Note:
            * A ``CdsException`` raised by the search is stored in the
              collection log and ends the processing of the collection.
            * Have a look to the attributes ``collection_logs`` and ``logs``
              in order to understand what happen.

        """
        logger = self.logger
        logger.debug(f"process collection {collection}")

        # alias
        host = self.harvester.host
        project = self.db.projects[self.id_project].project
        store = self.store

        # log collection information
        # A collection is identified as "Project / Controller / collection"
        ctitle = "%s / %s / %s" % (project, self.controller, collection)
        summary = MsgCollection(title=ctitle)
        self.collection_logs.append(summary)

        # get search parameters for the collection including user criteria
        criteria = self._search_parameters(collection)

        # get the list of record identifier matching the search criteria
        try:
            rec_ids = store.get_ids(**criteria)

        except CdsException as error:
            logger.debug(f"exit process_collection: {error}")
            summary.url = store.last_search_url()
            summary.error = error
            return

        # log the number of record found for the collection
        summary.url = store.last_search_url()
        summary.found = nfound = len(rec_ids)

        if nfound == 0:
            logger.debug(f"no records found in {collection}")
            return

        logger.debug(f"{nfound} records found in {collection}")

        # first pass: remove from the list the identifiers already
        # registered in the database, which are logged by _is_record_in_db
        pending = []
        for rec_id in rec_ids:
            if self._is_record_in_db(ctitle, host, rec_id) == 0:
                pending.append(rec_id)

        # second pass: process the remaining identifiers
        for rec_id in pending:
            self.process_recid(rec_id)
LE GAC Renaud's avatar
LE GAC Renaud committed
553

554 555
    def process_recjson(self, recjson):
        """Process the publication provided as a JSON record:

            * instantiate the record with ``build_record``
            * open a log entry for the record
            * check the record
            * insert new record in the database

        Args:
            recjson (dict):
                record provided by the store.

        """
        logger = self.logger
        logger.debug(f"{T4}process record {recjson['recid']} (process_recjson)")

        logs = self.logs

        # instantiate the record
        record = build_record(recjson)
        logger.debug(f"{T4}{record.title()[:72]}")

        # start the log for the record
        entry = Msg(harvester=self.harvester,
                    collection=self.collection_logs[-1].title,
                    record_id=record.id(),
                    title=record.title())
        logs.append(entry)

        # check that the record is well formed
        # repair non-conformity as far as possible
        if not self.check_record(record):
            logger.debug(f"{T4}{logs[-1].txt}")
            return

        if self.dry_run:
            txt = "(dry run)"
        else:
            txt = ""

        logger.debug(f"{T4}insert record in the database {txt}")

        # insert the record in the database
        self.insert_record(record)

        # summary of the action, only built at the DEBUG level
        if logger.getEffectiveLevel() != logging.DEBUG:
            return

        log = logs[-1]
        action = log.action
        if isinstance(action, str):
            action = action.upper()

        logger.debug(f"{T4}log: {action} {log.txt}")
LE GAC Renaud's avatar
LE GAC Renaud committed
601

602 603 604 605
    def process_recid(self, rec_id):
        """Process the publication identified by its record identifier:

            * get the publication data from the store using its identifier
            * hand the JSON record to :meth:`process_recjson`

        Note:
            * Any exception raised on the way is caught and reported
              as a rejected entry in ``logs``.
            * Have a look to the attribute ``collection_logs`` and ``logs`` in
              order to understand what happen.

        Args:
            rec_id (int):
                identifier of the publication in the store.

        """
        logger = self.logger
        logger.debug("")
        logger.debug(f"{T2}get record {rec_id} (process_recid)")

        summaries = self.collection_logs
        harvester = self.harvester
        entries = self.logs

        try:
            self.process_recjson(self.store.get_record(rec_id))

        except Exception as error:
            logger.debug(f"{T2}{str(error)}")

            # the URL of the record stands for the title in the log
            entry = Msg(harvester=harvester,
                        collection=summaries[-1].title,
                        record_id=rec_id,
                        title=OAI_URL % (harvester.host, rec_id))

            entries.append(entry)
            entry.reject(error)

LE GAC Renaud's avatar
LE GAC Renaud committed
644 645 646 647
    def process_url(self, host, collections):
        """Harvest a list of collections from the invenio store ``host``.

        The host and the collections are added to ``self.harvester``,
        the store is instantiated and each collection is handed to
        :meth:`process_collection`.

        Note:
            * Have a look to the attributes ``collection_logs`` and ``logs``
              in order to understand what happen.

        Args:
            host (str):
                host name to query for publications, either
                ``cds.cern.ch`` or ``inspirehep.net``.

            collections (str):
                list of collection to be interrogated.
                Collections are separated by a comma.

        """
        logger = self.logger
        logger.debug("")
        logger.debug(f"process URL search -- {host} -- {collections}")

        # extend harvester for logs
        harvester = self.harvester
        harvester.host = host
        harvester.collections = collections

        # instantiate the store
        self.store = InvenioStore(host)

        # split on the commas, dropping the spaces around them
        for name in re.split(" *, *", collections):
            self.process_collection(name)
678 679 680 681

    def report(self):
        """Build the processing report.

682 683
        Returns:
            dict:
LE GAC Renaud's avatar
LE GAC Renaud committed
684
                * ``collection_logs`` list of :class:`MsgCollection`
685
                * ``controller`` str
LE GAC Renaud's avatar
LE GAC Renaud committed
686
                * ``logs`` list of :class:`Msg`
LE GAC Renaud's avatar
LE GAC Renaud committed
687
                * ``selector`` :class:`plugin_dbui.Selector`
688 689 690 691 692 693

        """

        return dict(collection_logs=self.collection_logs,
                    controller=self.controller,
                    logs=self.logs)