automaton.py 22.6 KB
Newer Older
1
""" harvest_tools.automaton
2 3

"""
4
import logging
5 6 7
import re
import traceback

8

9 10 11 12
from .base import (MSG_FIX_ORIGIN,
                   MSG_IN_DB,
                   ToolException)
from .checkandfix import CheckAndFix
13
from gluon.storage import Storage
LE GAC Renaud's avatar
LE GAC Renaud committed
14 15
from invenio_tools import (CdsException,
                           InvenioStore,
16
                           OAI_URL)
LE GAC Renaud's avatar
LE GAC Renaud committed
17
from invenio_tools.factory import build_record
18 19
from .msg import Msg
from .msgcollection import MsgCollection
20
from plugin_dbui import CALLBACK_ERRORS, get_id
21

22

23 24 25
# messages raised when a mandatory selection is missing
MSG_NO_CAT = 'Select a "category" !!!'
MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM = 'Select a "team" !!!'

# message logged when the database refuses a new record
MSG_INSERT_FAIL = "Fail to insert the new record in the database."

# template of an OAI identifier: (host, record identifier)
OAI = "oai:%s:%i"

# search collection when using inspirehep
# required for "Hal Hidden"
# NOTE(review): the character class is greedy and "cc" is not anchored on
# a word boundary -- confirm this matches the intended query grammar.
REG_COLLECTION = re.compile(r"cc([A-Za-z ]+)(and|$)")

# indentation prefixes used in debug messages
T2, T4, T6 = (" " * width for width in (2, 4, 6))
38

39

40
class Automaton(object):
41
    """Base class to search and process publications:
42

43
        * Decode the selector defining user criteria.
LE GAC Renaud's avatar
LE GAC Renaud committed
44
        * Search in the store publications matching user criteria.
LE GAC Renaud's avatar
LE GAC Renaud committed
45
        * Instantiate the record and check it.
46
        * Insert new records in the database.
47

48 49
    Note:
        The parameters of the search are defined by the current ``request``.
50

51 52 53
    The logic implemented in the ``Automaton`` class is the following:

        #. Ask the store for all the `record_id` satisfying the user request.
LE GAC Renaud's avatar
LE GAC Renaud committed
54 55
        #. Reject any `record_id` contained in the *origin* field of a
           database entry.
LE GAC Renaud's avatar
LE GAC Renaud committed
56
        #. Request to the store, the JSON description of the publications
LE GAC Renaud's avatar
LE GAC Renaud committed
57 58 59 60
           and decode them.
        #. Reject the record for which the *secondary_oai_url* is contained in
           the *origin* field of a database entry. Update the *origin* field
           of the database record.
61
        #. Check that the *oai* of the publication is defined and well formed.
LE GAC Renaud's avatar
LE GAC Renaud committed
62 63
           Recover it, if it is not the case. At this stage the OAI is always
           defined.
64 65
        #. Reject temporary publications.
        #. Check that *authors* are defined.
66
           Reject the publication if it is not the case.
67
        #. Check that *my institute* is in the list of the institutes
68 69 70 71 72 73
           signing the publication. Reject the publication if it is
           not the case. When the affiliation are not defined,
           try to recover this case, by finding the author of my institute
           signing the publication. This recovery procedure uses
           the *author rescue list*. Reject the record when the recovery
           procedure failed.
74
        #. Check that the *collaboration*, if defined, is well formed.
75
           Reject the publication if it is not the case
76 77 78 79 80
        #. Several checks are applied depending on the publication type.
        #. At the end of this process, the publisher, the authors are
           formatted and the list of signatories of my institute extracted.

    Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
81 82 83 84 85 86 87 88 89
        db (gluon.DAL):
            the database connection.

        id_team (int):
            the identifier of the team in the database.

        id_project (int):
            the identifier of the project in the database.

90
        automaton (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
            the name of the automaton which will be used to process the data.
            Possible values are: ``articles``, ``notes``, ``preprints``,
            ``proceedings``, ``reports``, ``talks`` and ``theses``.

        id_category (int):
            the identifier of the category of publication

        year_start (int):
            starting year for the scan

        year_end (int):
            ending year of the scan

        dry_run (bool):
            new records are not inserted in the database when ``True``.

        debug (bool):
            activate the verbose mode when ``True``.
109 110

    Raises:
LE GAC Renaud's avatar
LE GAC Renaud committed
111 112
        ToolException:
            * team or project or the publication category not defined
113

114 115 116 117 118
    """
    def __init__(self,
                 db,
                 id_team,
                 id_project,
                 automaton,
                 id_category,
                 year_start=None,
                 year_end=None,
                 dry_run=True,
                 debug=False):
        """Check the mandatory identifiers and initialise the harvester state.

        Args:
            db (gluon.DAL):
                the database connection.

            id_team (int):
                identifier of the team in the database.

            id_project (int):
                identifier of the project in the database.

            automaton (str):
                name of the automaton, stored as ``controller``.

            id_category (int):
                identifier of the category of publication.

            year_start (int):
                first year of the scan.

            year_end (int):
                last year of the scan.

            dry_run (bool):
                stored as ``self.dry_run``.

            debug (bool):
                forwarded to :class:`CheckAndFix` and stored as ``self.dbg``.

        Raises:
            ToolException:
                when the team, the project or the category is not defined.
                They are tested in that order.

        """
        # mandatory identifiers, each paired with its error message
        mandatory = ((id_team, MSG_NO_TEAM),
                     (id_project, MSG_NO_PROJECT),
                     (id_category, MSG_NO_CAT))

        for value, message in mandatory:
            if not value:
                raise ToolException(message)

        # user criteria
        self.db = db
        self.id_team = id_team
        self.id_project = id_project
        self.id_category = id_category
        self.controller = automaton
        self.year_start = year_start
        self.year_end = year_end
        self.dry_run = dry_run
        self.dbg = debug

        # working objects
        self.check = CheckAndFix(debug)
        self.collection_logs = []
        self.logs = []
        self.logger = logging.getLogger("web2py.app.limbra")
        self.store = None

        # harvester description attached to every log entry
        self.harvester = Storage(id_teams=id_team,
                                 id_projects=id_project,
                                 controller=automaton,
                                 id_categories=id_category)

        # identifiers of the categories preprint and article,
        # used by the method _is_record_in_db
        self._id_preprint = get_id(db.categories, code="PRE")
        self._id_article = get_id(db.categories, code="ACL")

162 163 164
    def _insert_in_db(self, log_year="", **fields):
        """Insert the record in the database, handling database exception.

165
        Args:
166
            log_year (str): year of the record for the log
167

168
        Keyword Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
169 170
            **fields:
                keyword arguments defining the record values to be
171
                inserted in the database.
172

173
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
174 175
            int:
                one when the record is inserted / updated in the database,
176
                zero otherwise.
177 178 179 180 181

        """
        db = self.db

        try:
LE GAC Renaud's avatar
LE GAC Renaud committed
182
            rec_id = db.publications.insert(**fields)
LE GAC Renaud's avatar
LE GAC Renaud committed
183 184
            if rec_id:
                return 1
185

LE GAC Renaud's avatar
LE GAC Renaud committed
186
            # operation can be reject by callback table._before_insert
LE GAC Renaud's avatar
LE GAC Renaud committed
187
            else:
LE GAC Renaud's avatar
LE GAC Renaud committed
188
                msg = MSG_INSERT_FAIL
LE GAC Renaud's avatar
LE GAC Renaud committed
189 190
                if CALLBACK_ERRORS in db.publications:
                    msg = db.publications._callback_errors
191

LE GAC Renaud's avatar
LE GAC Renaud committed
192 193 194
                # reduce the error message
                if isinstance(msg, list):
                    msg = "%s %s" % (msg[0], msg[-1])
195

LE GAC Renaud's avatar
LE GAC Renaud committed
196 197
                self.logs[-1].reject(msg, log_year)
                return 0
198

LE GAC Renaud's avatar
LE GAC Renaud committed
199 200
        # operation can be rejected by the database
        except Exception as dbe:
201
            self.logs[-1].reject(str(dbe), log_year)
LE GAC Renaud's avatar
LE GAC Renaud committed
202
            return 0
203

LE GAC Renaud's avatar
LE GAC Renaud committed
204 205 206 207 208 209 210
    def _is_record_in_db(self,
                         collection_title,
                         host=None,
                         rec_id=None,
                         oai_url=None):
        """Return the database identifier when the publication is registered.
        The search is based on the ``origin`` field and on the primary OAI.
211

212 213
        Note:
            A new log entry is created when a record is found.
214

215
        Args:
216
            title (str): the title of the publication.
217 218

        Keyword Args:
219
            host (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
220
                the store. possible values are ``cds.cern.ch`` or
221 222
                ``inspirehep.net``. To be used with *rec_id*.

LE GAC Renaud's avatar
LE GAC Renaud committed
223 224 225
            rec_id (int):
                the record identifier in the store

226
            oai_url (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
227 228
                the URL of the record in the store.
                Either use *host* and *rec_id* or *oai_url*
229

230
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
231 232
            int:
                the id of the record in the database when a record is found,
233
                0 otherwise.
234

235
        Raises:
LE GAC Renaud's avatar
LE GAC Renaud committed
236 237
            ValueError:
                * keyword arguments are not defined properly.
238

239 240
        """
        db = self.db
241
        harvester = self.harvester
242

243 244 245 246 247 248 249 250
        # build the OAI URL
        if host is not None and rec_id is not None and oai_url is None:
            url = OAI_URL % (host, rec_id)
        elif host is None and rec_id is None and oai_url is not None:
            url = oai_url
        else:
            raise ValueError

LE GAC Renaud's avatar
LE GAC Renaud committed
251
        # protection empty URL
252 253 254
        if len(url) == 0:
            return 0

255 256 257
        # check the OAI
        query = db.publications.origin.contains(url)
        setrows = db(query)
258

259
        if setrows.count() == 0:
260
            return 0
261

262
        # one record found
263 264
        columns = [db.publications.id,
                   db.publications.id_categories,
265 266 267
                   db.publications.title,
                   db.publications.year]
        publication = setrows.select(*columns).first()
268

269 270
        # Note:
        # The category for the publication and the harvester have to be equal.
271 272 273 274 275 276 277
        # However, keep the record if it is a preprint when the harvester
        # looks for articles. This is required to transform a preprint
        # into article
        #
        # Category can disagree when the publication is an article and
        # the harvester look for preprint. In that case, keep the article
        #
278
        if publication.id_categories != harvester.id_categories:
279 280 281 282 283 284 285

            is_preprint_to_article = \
                publication.id_categories == self._id_preprint \
                and harvester.id_categories == self._id_article

            if is_preprint_to_article:
                return 0
286 287

        # log
288
        self.logs.append(Msg(harvester=harvester,
LE GAC Renaud's avatar
LE GAC Renaud committed
289
                             collection=collection_title,
290 291 292 293 294
                             record_id=rec_id,
                             title=publication.title))

        self.logs[-1].idle(MSG_IN_DB, publication.year)

295 296 297
        logger = self.logger
        logger.debug("")
        logger.debug(f"{T2}record {rec_id} in db with id {publication.id}")
LE GAC Renaud's avatar
LE GAC Renaud committed
298

299
        return publication.id
300

301 302 303 304 305
    def _search_parameters(self, collection):
        """Build the keywords to steer the URL search in invenio store.
        The main parameter is the collection and the date range defined
        in the selector.

306
        Args:
307
            collection (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
308 309
                string defining the collection in the store.
                The syntax depends on the invenio store:
310 311 312

                    * ``"find cn d0 and tc p and not tc c"``
                    * ``"LHCb Papers"``.
313

314
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
315 316 317
            dict:
                the key are a sub-set of those defined in
                :meth:`invenio_tools.InvenioStore.get_ids`.
318 319

        """
LE GAC Renaud's avatar
LE GAC Renaud committed
320 321
        year_start = self.year_start
        year_end = self.year_end
322 323

        # INSPIREHEP store
LE GAC Renaud's avatar
LE GAC Renaud committed
324
        if collection.startswith("find"):
325 326 327

            query = collection

LE GAC Renaud's avatar
LE GAC Renaud committed
328 329
            if year_start and not year_end:
                query += " and date %s" % year_start
330

LE GAC Renaud's avatar
LE GAC Renaud committed
331 332
            elif not year_start and year_end:
                query += " and date %s" % year_end
333

LE GAC Renaud's avatar
LE GAC Renaud committed
334
            elif year_start and year_end:
335
                query += " and date > %s and date < %s " \
LE GAC Renaud's avatar
LE GAC Renaud committed
336
                         % (year_start - 1, year_end + 1)
337 338 339

            dic = dict(p=query,  # query à la spires
                       rg=1000,  # maximum number of records returned
LE GAC Renaud's avatar
LE GAC Renaud committed
340 341
                       sf="year",  # sort by date
                       so="d")  # descending order
342

343 344 345 346 347 348 349 350 351
            # handle the cc keyword (true inspirehep collection)
            match = REG_COLLECTION.search(query)
            if match:
                dic["cc"] = match.group(1).strip()
                dic["p"] = REG_COLLECTION.sub("", query).strip()
                dic["p"] = dic["p"].replace("  ", " ")
                if dic["p"] == "find":
                    del dic["p"]

352 353 354
        # CERN INVENIO store
        else:

LE GAC Renaud's avatar
LE GAC Renaud committed
355 356
            if year_start and not year_end:
                rex = year_start
357

LE GAC Renaud's avatar
LE GAC Renaud committed
358 359
            elif not year_start and year_end:
                rex = year_end
360

LE GAC Renaud's avatar
LE GAC Renaud committed
361
            elif year_start and year_end:
362
                li = [str(el) for el in range(year_start, year_end + 1)]
LE GAC Renaud's avatar
LE GAC Renaud committed
363
                rex = "|".join(li)
364 365

            dic = dict(cc=collection,  # collection
LE GAC Renaud's avatar
LE GAC Renaud committed
366 367
                       f1="year",  # search on year
                       m1="r",  # use regular expression
368
                       p1=rex,  # regular expression defining year
LE GAC Renaud's avatar
LE GAC Renaud committed
369 370
                       sf="year",  # sort by date
                       so="d")  # descending order
371 372
        return dic

LE GAC Renaud's avatar
LE GAC Renaud committed
373
    def check_record(self, record):
        """Run the generic checks on the record, fixing what can be fixed.

        Note:
            Checks depending on the type of publication have to be
            implemented in inherited classes.

        Note:
            The order of the checks matters: OAI, temporary record,
            authors, my affiliation and collaboration.

        Args:
            record (Record):
                JSON record describing the publication.

        Returns:
            bool:
                ``False`` when the OAI is flagged by ``is_bad_oai_used``
                or when one of the checks raises an exception. The reason
                is reported on the last entry of ``self.logs``.
                ``True`` otherwise.

        """
        self.logger.debug(f"{T4}check record (automaton)")

        try:
            checker = self.check

            # a missing OAI is rebuilt from the host and the record id
            if not checker.is_oai(record):
                record["oai"] = \
                    {"value": OAI % (self.harvester.host, record.id())}

            if checker.is_bad_oai_used(record):
                self.logs[-1].idle(MSG_IN_DB, record.submitted())
                return False

            checker.temporary_record(record)
            checker.authors(record)
            checker.my_affiliation(record, self.id_project, self.id_team)
            checker.collaboration(record)

        except Exception as error:
            # any failing check rejects the record
            self.logs[-1].reject(error, record=record)
            return False

        return True

420
    def get_record_by_fields(self, oai_url, year, **kwargs):
        """Get the database record matching the field values defined
        in the keyword arguments.

        Note:
            This method is required to deal with publications entered by
            hand and found later by a harvester.

        Args:
            oai_url (str):
                the oai_url, *e.g.* ``http://cds.cern.ch/record/123456``.
                The origin field of the existing database record is set to
                **oai_url** when a match is found and the origin differs,
                unless the automaton runs in dry-run mode.

            year (int):
                the year of the publication. It is added to the search
                criteria when defined, and used by the log.

        Keyword Args:
            kwargs (str):
                 a series of key, value pairs where the key is the name of
                 a publications database field.

        Returns:
            tuple:
                ``(id, status)`` which contains the ``id`` of the record.
                The ``id`` is equal to ``None`` when there is no match.
                The ``status`` is equal to one when the origin of the
                existing record had to be fixed (also reported in dry-run
                mode), zero otherwise.

        """
        self.logger.debug(f"{T6}get existing record by fields")

        # alias
        db = self.db
        logs = self.logs

        # add the publication year to the search criteria
        if year:
            kwargs["year"] = year

        # look for an existing record
        rec_id = get_id(db.publications, **kwargs)
        if not rec_id:
            return (None, 0)

        publication = db.publications[rec_id]

        # nothing to do when the origin is already the expected one
        if publication.origin and publication.origin == oai_url:
            logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # fix the origin field in the database; rebinding the local
        # name to a dict would leave the stored record untouched
        if not self.dry_run:
            publication.update_record(origin=oai_url)

        logs[-1].modify(MSG_FIX_ORIGIN, year)
        return (rec_id, 1)

479 480
    def insert_record(self, record):
        """Insert the record in the database.
481

482 483 484
        Note:
            This method depend on the type of publications.
            It has to be implemented for each inherited class.
485

486
        Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
487 488
            record (Record):
                record describing the publication.
489

490
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
491 492
            int:
                one when the record is inserted / updated in the database,
493
                zero otherwise.
494 495 496 497

        """
        return 0

LE GAC Renaud's avatar
LE GAC Renaud committed
498
    def process_collection(self, collection):
        """Retrieve JSON objects from the invenio store for the given
        collection. Corresponding records are inserted in the database.

        Args:
            collection (str):
                name of the collection to be interrogated.

        Note:
            * A ``CdsException`` raised while searching the store is
              recorded in the collection log and ends the processing
              of the collection.
            * Have a look at the attributes ``collection_logs`` and
              ``logs`` in order to understand what happened.

        """
        logger = self.logger
        logger.debug(f"process collection {collection}")

        # alias
        controller = self.controller
        host = self.harvester.host
        project = self.db.projects[self.id_project].project
        store = self.store

        # log collection information
        # A collection is identified as "Project / Controller / collection"
        ctitle = "%s / %s / %s" % (project, controller, collection)
        collection_log = MsgCollection(title=ctitle)
        self.collection_logs.append(collection_log)

        # get search parameters for the collection including user criteria
        kwargs = self._search_parameters(collection)

        # get the list of record identifiers matching the search criteria
        try:
            rec_ids = store.get_ids(**kwargs)

        except CdsException as error:
            collection_log.url = store.last_search_url()
            collection_log.error = error
            return

        # log the number of records found for the collection
        collection_log.url = store.last_search_url()
        collection_log.found = len(rec_ids)

        if not rec_ids:
            logger.debug(f"no records found in {collection}")
            return

        logger.debug(f"{len(rec_ids)} records found in {collection}")

        # remove from the list the identifiers already registered in the
        # database; _is_record_in_db logs each of them. The filtering is
        # completed before any record is processed.
        is_in_db = self._is_record_in_db
        new_ids = [el for el in rec_ids if is_in_db(ctitle, host, el) == 0]

        # process the remaining identifiers
        for rec_id in new_ids:
            self.process_recid(rec_id)

557 558
    def process_recjson(self, recjson):
        """Process a publication provided as a JSON record.

        The steps are:

            * instantiate the record with ``build_record``
            * open a log entry for the record
            * check the record, stopping here when the check fails
            * hand the record over to :meth:`insert_record`

        Args:
            recjson (dict):
                record provided by the store.

        """
        logger = self.logger
        logger.debug(f"{T4}process record {recjson['recid']} (process_recjson)")

        logs = self.logs

        # instantiate the record
        record = build_record(recjson)
        logger.debug(f"{T4}{record.title()[:72]}")

        # start the log for the record
        entry = Msg(harvester=self.harvester,
                    collection=self.collection_logs[-1].title,
                    record_id=record.id(),
                    title=record.title())
        logs.append(entry)

        # check that the record is well formed
        # non-conformities are repaired as far as possible
        if not self.check_record(record):
            logger.debug(f"{T4}{logs[-1].txt}")
            return

        if self.dry_run:
            txt = "(dry run)"
        else:
            txt = ""
        logger.debug(f"{T4}insert record in the database {txt}")

        # insert the record in the database
        self.insert_record(record)

        # summary of the action, only built in debug mode
        if logger.getEffectiveLevel() != logging.DEBUG:
            return

        log = logs[-1]
        action = log.action
        if isinstance(action, str):
            action = action.upper()
        logger.debug(f"{T4}log: {action} {log.txt}")
LE GAC Renaud's avatar
LE GAC Renaud committed
604

605 606 607 608
    def process_recid(self, rec_id):
        """Process the publication identified by its record identifier.

        The publication data are fetched from the store and handed over
        to :meth:`process_recjson`.

        Note:
            * Any exception raised on the way is caught: a log entry
              titled with the OAI URL of the record is appended to
              ``logs`` and marked as rejected.
            * Have a look at the attributes ``collection_logs`` and
              ``logs`` in order to understand what happened.

        Args:
            rec_id (int):
                identifier of the publication in the store.

        """
        logger = self.logger
        logger.debug("")
        logger.debug(f"{T2}get record {rec_id} (process_recid)")

        logs = self.logs
        harvester = self.harvester
        collection_logs = self.collection_logs

        try:
            self.process_recjson(self.store.get_record(rec_id))

        except Exception as e:
            logger.debug(f"{T2}{str(e)}")

            # the record title is unknown here, use its URL instead
            url = OAI_URL % (harvester.host, rec_id)
            failure = Msg(harvester=harvester,
                          collection=collection_logs[-1].title,
                          record_id=rec_id,
                          title=url)
            logs.append(failure)
            failure.reject(e)

LE GAC Renaud's avatar
LE GAC Renaud committed
647 648 649 650
    def process_url(self, host, collections):
        """Retrieve JSON objects from the invenio store and
        insert corresponding records in the database.

        Note:
            * Have a look at the attributes ``collection_logs`` and ``logs``
              in order to understand what happened.
            * The harvester description is extended with ``host`` and
              ``collections``, and ``self.store`` is re-instantiated
              for the host.

        Args:
            host (str):
                host name to query for publications, either
                ``cds.cern.ch`` or ``inspirehep.net``.

            collections (str):
                list of collections to be interrogated.
                Collections are separated by a comma. Surrounding
                white spaces are removed and empty names are skipped.

        """
        self.logger.debug("")
        self.logger.debug(f"process URL search -- {host} -- {collections}")

        # extend harvester for logs
        self.harvester.host = host
        self.harvester.collections = collections

        # instantiate the store
        self.store = InvenioStore(host)

        # list of collections
        # empty names (e.g. from a trailing comma) are dropped, since an
        # empty collection name would be searched as cc=""
        names = (el.strip() for el in collections.split(","))

        # process
        for name in names:
            if name:
                self.process_collection(name)
681 682 683 684

    def report(self):
        """Build the processing report.

685 686
        Returns:
            dict:
LE GAC Renaud's avatar
LE GAC Renaud committed
687
                * ``collection_logs`` list of :class:`MsgCollection`
688
                * ``controller`` str
LE GAC Renaud's avatar
LE GAC Renaud committed
689
                * ``logs`` list of :class:`Msg`
LE GAC Renaud's avatar
LE GAC Renaud committed
690
                * ``selector`` :class:`plugin_dbui.Selector`
691 692 693 694 695 696

        """

        return dict(collection_logs=self.collection_logs,
                    controller=self.controller,
                    logs=self.logs)