""" harvest_tools.automaton

"""
import logging
5 6 7
import re
import traceback

8

9 10 11 12
from .base import (MSG_FIX_ORIGIN,
                   MSG_IN_DB,
                   ToolException)
from .checkandfix import CheckAndFix
13
from gluon.storage import Storage
LE GAC Renaud's avatar
LE GAC Renaud committed
14 15
from invenio_tools import (CdsException,
                           InvenioStore,
16
                           OAI_URL)
LE GAC Renaud's avatar
LE GAC Renaud committed
17
from invenio_tools.factory import build_record
18 19
from .msg import Msg
from .msgcollection import MsgCollection
20
from plugin_dbui import CALLBACK_ERRORS, get_id
21

22

23 24 25
# messages raised when a mandatory selection is missing
MSG_NO_CAT = 'Select a "category" !!!'
MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM = 'Select a "team" !!!'

# message logged when the insert returns no record identifier
MSG_INSERT_FAIL = "Fail to insert the new record in the database."

# template of the OAI, filled in with the host and the record identifier
OAI = "oai:%s:%i"

# extract the collection (cc keyword) from an inspirehep query
# required for "Hal Hidden"
REG_COLLECTION = re.compile(r"cc([A-Za-z ]+)(and|$)")

# indentation prefixes used in the debug messages
T2 = 2 * " "
T4 = 4 * " "


class Automaton(object):
    """Base class to search and process publications:

        * Decode the selector defining user criteria.
        * Search in the store publications matching user criteria.
        * Instantiate the record and check it.
        * Insert new records in the database.

    Note:
        The parameters of the search are defined by the current ``request``.

    The logic implemented in the ``Automaton`` class is the following:

        #. Ask the store for all the `record_id` satisfying the user request.
        #. Reject `record_id` contained in the *origin* field of a
           database entry.
        #. Request from the store the JSON description of the publications
           and decode them.
        #. Reject the record for which the *secondary_oai_url* is contained in
           the *origin* field of a database entry. Update the *origin* field
           of the database record.
        #. Check that the *oai* of the publication is defined and well formed.
           Recover it, if it is not the case. At this stage the OAI is always
           defined.
        #. Reject temporary publications.
        #. Check that *authors* are defined.
           Reject the publication if it is not the case.
        #. Check that *my institute* is in the list of the institutes
           signing the publication. Reject the publication if it is
           not the case. When the affiliations are not defined,
           try to recover this case, by finding the author of my institute
           signing the publication. This recovery procedure uses
           the *author rescue list*. Reject the record when the recovery
           procedure fails.
        #. Check that the *collaboration*, if defined, is well formed.
           Reject the publication if it is not the case.
        #. Several checks are applied depending on the publication type.
        #. At the end of this process, the publisher and the authors are
           formatted and the list of signatories of my institute extracted.

    Args:
        db (gluon.DAL):
            the database connection.

        id_team (int):
            the identifier of the team in the database.

        id_project (int):
            the identifier of the project in the database.

        automaton (str):
            the name of the automaton which will be used to process the data.
            Possible values are: ``articles``, ``notes``, ``preprints``,
            ``proceedings``, ``reports``, ``talks`` and ``theses``.

        id_category (int):
            the identifier of the category of publication

        year_start (int):
            starting year for the scan

        year_end (int):
            ending year of the scan

        dry_run (bool):
            new records are not inserted in the database when ``True``.

        debug (bool):
            activate the verbose mode when ``True``.

    Raises:
        ToolException:
            * team or project or the publication category not defined

    """
    def __init__(self,
                 db,
                 id_team,
                 id_project,
                 automaton,
                 id_category,
                 year_start=None,
                 year_end=None,
                 dry_run=True,
                 debug=False):
        """Validate the mandatory identifiers and set up the harvester state.

        Raises:
            ToolException:
                when the team, the project or the category is not defined.

        """
        # the team, the project and the category are mandatory;
        # they are checked in that order
        mandatory = ((id_team, MSG_NO_TEAM),
                     (id_project, MSG_NO_PROJECT),
                     (id_category, MSG_NO_CAT))

        for value, message in mandatory:
            if not value:
                raise ToolException(message)

        self.check = CheckAndFix(debug)

        # user criteria
        self.db = db
        self.id_team = id_team
        self.id_project = id_project
        self.id_category = id_category
        self.controller = automaton
        self.year_start = year_start
        self.year_end = year_end
        self.dry_run = dry_run
        self.dbg = debug

        # processing state: the store is instantiated later by process_url
        self.store = None
        self.logs = []
        self.collection_logs = []

        self.logger = logging.getLogger("web2py.app.limbra")

        # description of the harvester attached to each log entry
        self.harvester = Storage(id_teams=id_team,
                                 id_projects=id_project,
                                 controller=automaton,
                                 id_categories=id_category)

        # identifiers of the categories preprint and article,
        # used by the method _is_record_in_db
        self._id_preprint = get_id(db.categories, code="PRE")
        self._id_article = get_id(db.categories, code="ACL")

    def _insert_in_db(self, log_year="", **fields):
        """Insert the record in the database, handling database exception.

164
        Args:
165
            log_year (str): year of the record for the log
166

167
        Keyword Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
168 169
            **fields:
                keyword arguments defining the record values to be
170
                inserted in the database.
171

172
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
173 174
            int:
                one when the record is inserted / updated in the database,
175
                zero otherwise.
176 177 178 179 180

        """
        db = self.db

        try:
LE GAC Renaud's avatar
LE GAC Renaud committed
181
            rec_id = db.publications.insert(**fields)
LE GAC Renaud's avatar
LE GAC Renaud committed
182 183
            if rec_id:
                return 1
184

LE GAC Renaud's avatar
LE GAC Renaud committed
185
            # operation can be reject by callback table._before_insert
LE GAC Renaud's avatar
LE GAC Renaud committed
186
            else:
LE GAC Renaud's avatar
LE GAC Renaud committed
187
                msg = MSG_INSERT_FAIL
LE GAC Renaud's avatar
LE GAC Renaud committed
188 189
                if CALLBACK_ERRORS in db.publications:
                    msg = db.publications._callback_errors
190

LE GAC Renaud's avatar
LE GAC Renaud committed
191 192 193
                # reduce the error message
                if isinstance(msg, list):
                    msg = "%s %s" % (msg[0], msg[-1])
194

LE GAC Renaud's avatar
LE GAC Renaud committed
195 196
                self.logs[-1].reject(msg, log_year)
                return 0
197

LE GAC Renaud's avatar
LE GAC Renaud committed
198 199
        # operation can be rejected by the database
        except Exception as dbe:
200
            self.logs[-1].reject(str(dbe), log_year)
LE GAC Renaud's avatar
LE GAC Renaud committed
201
            return 0
202

LE GAC Renaud's avatar
LE GAC Renaud committed
203 204 205 206 207 208 209
    def _is_record_in_db(self,
                         collection_title,
                         host=None,
                         rec_id=None,
                         oai_url=None):
        """Return the database identifier when the publication is registered.
        The search is based on the ``origin`` field and on the primary OAI.
210

211 212
        Note:
            A new log entry is created when a record is found.
213

214
        Args:
215
            title (str): the title of the publication.
216 217

        Keyword Args:
218
            host (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
219
                the store. possible values are ``cds.cern.ch`` or
220 221
                ``inspirehep.net``. To be used with *rec_id*.

LE GAC Renaud's avatar
LE GAC Renaud committed
222 223 224
            rec_id (int):
                the record identifier in the store

225
            oai_url (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
226 227
                the URL of the record in the store.
                Either use *host* and *rec_id* or *oai_url*
228

229
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
230 231
            int:
                the id of the record in the database when a record is found,
232
                0 otherwise.
233

234
        Raises:
LE GAC Renaud's avatar
LE GAC Renaud committed
235 236
            ValueError:
                * keyword arguments are not defined properly.
237

238 239
        """
        db = self.db
240
        harvester = self.harvester
241

242 243 244 245 246 247 248 249
        # build the OAI URL
        if host is not None and rec_id is not None and oai_url is None:
            url = OAI_URL % (host, rec_id)
        elif host is None and rec_id is None and oai_url is not None:
            url = oai_url
        else:
            raise ValueError

LE GAC Renaud's avatar
LE GAC Renaud committed
250
        # protection empty URL
251 252 253
        if len(url) == 0:
            return 0

254 255 256
        # check the OAI
        query = db.publications.origin.contains(url)
        setrows = db(query)
257

258
        if setrows.count() == 0:
259
            return 0
260

261
        # one record found
262 263
        columns = [db.publications.id,
                   db.publications.id_categories,
264 265 266
                   db.publications.title,
                   db.publications.year]
        publication = setrows.select(*columns).first()
267

268 269
        # Note:
        # The category for the publication and the harvester have to be equal.
270 271 272 273 274 275 276
        # However, keep the record if it is a preprint when the harvester
        # looks for articles. This is required to transform a preprint
        # into article
        #
        # Category can disagree when the publication is an article and
        # the harvester look for preprint. In that case, keep the article
        #
277
        if publication.id_categories != harvester.id_categories:
278 279 280 281 282 283 284

            is_preprint_to_article = \
                publication.id_categories == self._id_preprint \
                and harvester.id_categories == self._id_article

            if is_preprint_to_article:
                return 0
285 286

        # log
287
        self.logs.append(Msg(harvester=harvester,
LE GAC Renaud's avatar
LE GAC Renaud committed
288
                             collection=collection_title,
289 290 291 292 293
                             record_id=rec_id,
                             title=publication.title))

        self.logs[-1].idle(MSG_IN_DB, publication.year)

294 295 296
        logger = self.logger
        logger.debug("")
        logger.debug(f"{T2}record {rec_id} in db with id {publication.id}")
LE GAC Renaud's avatar
LE GAC Renaud committed
297

298
        return publication.id
299

300 301 302 303 304
    def _search_parameters(self, collection):
        """Build the keywords to steer the URL search in invenio store.
        The main parameter is the collection and the date range defined
        in the selector.

305
        Args:
306
            collection (str):
LE GAC Renaud's avatar
LE GAC Renaud committed
307 308
                string defining the collection in the store.
                The syntax depends on the invenio store:
309 310 311

                    * ``"find cn d0 and tc p and not tc c"``
                    * ``"LHCb Papers"``.
312

313
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
314 315 316
            dict:
                the key are a sub-set of those defined in
                :meth:`invenio_tools.InvenioStore.get_ids`.
317 318

        """
LE GAC Renaud's avatar
LE GAC Renaud committed
319 320
        year_start = self.year_start
        year_end = self.year_end
321 322

        # INSPIREHEP store
LE GAC Renaud's avatar
LE GAC Renaud committed
323
        if collection.startswith("find"):
324 325 326

            query = collection

LE GAC Renaud's avatar
LE GAC Renaud committed
327 328
            if year_start and not year_end:
                query += " and date %s" % year_start
329

LE GAC Renaud's avatar
LE GAC Renaud committed
330 331
            elif not year_start and year_end:
                query += " and date %s" % year_end
332

LE GAC Renaud's avatar
LE GAC Renaud committed
333
            elif year_start and year_end:
334
                query += " and date > %s and date < %s " \
LE GAC Renaud's avatar
LE GAC Renaud committed
335
                         % (year_start - 1, year_end + 1)
336 337 338

            dic = dict(p=query,  # query à la spires
                       rg=1000,  # maximum number of records returned
LE GAC Renaud's avatar
LE GAC Renaud committed
339 340
                       sf="year",  # sort by date
                       so="d")  # descending order
341

342 343 344 345 346 347 348 349 350
            # handle the cc keyword (true inspirehep collection)
            match = REG_COLLECTION.search(query)
            if match:
                dic["cc"] = match.group(1).strip()
                dic["p"] = REG_COLLECTION.sub("", query).strip()
                dic["p"] = dic["p"].replace("  ", " ")
                if dic["p"] == "find":
                    del dic["p"]

351 352 353
        # CERN INVENIO store
        else:

LE GAC Renaud's avatar
LE GAC Renaud committed
354 355
            if year_start and not year_end:
                rex = year_start
356

LE GAC Renaud's avatar
LE GAC Renaud committed
357 358
            elif not year_start and year_end:
                rex = year_end
359

LE GAC Renaud's avatar
LE GAC Renaud committed
360
            elif year_start and year_end:
361
                li = [str(el) for el in range(year_start, year_end + 1)]
LE GAC Renaud's avatar
LE GAC Renaud committed
362
                rex = "|".join(li)
363 364

            dic = dict(cc=collection,  # collection
LE GAC Renaud's avatar
LE GAC Renaud committed
365 366
                       f1="year",  # search on year
                       m1="r",  # use regular expression
367
                       p1=rex,  # regular expression defining year
LE GAC Renaud's avatar
LE GAC Renaud committed
368 369
                       sf="year",  # sort by date
                       so="d")  # descending order
370 371
        return dic

LE GAC Renaud's avatar
LE GAC Renaud committed
372
    def check_record(self, record):
        """Verify the record and repair its non-conformities when possible.

        Note:
            Some checks depend on the type of publications and have to be
            implemented in inherited class.

        Note:
            The order of the checks matter. It should be OAI,
            temporary record, authors, my authors and then a series of checks
            specific to the publication type.

        Args:
            record (Record):
                JSON record describing the publication.

        Returns:
            bool:
                ``False`` when a non-conformity is found and can not be
                corrected. The reason is recorded in the current log entry.

        """
        self.logger.debug(f"{T4}check record (automaton)")

        try:
            checker = self.check

            # a missing OAI is rebuilt from the host and the record id
            if not checker.is_oai(record):
                value = OAI % (self.harvester.host, record.id())
                record["oai"] = {"value": value}

            # flagged in the log as already in the database
            if checker.is_bad_oai_used(record):
                self.logs[-1].idle(MSG_IN_DB, record.submitted())
                return False

            # any failing check raises and rejects the record
            checker.temporary_record(record)
            checker.authors(record)
            checker.my_affiliation(record, self.id_project, self.id_team)
            checker.collaboration(record)

        except Exception as e:
            self.logs[-1].reject(e, record=record)
            return False

        else:
            return True

    def get_record_by_fields(self, oai_url, year, **kwargs):
        """Get database record matching fields values defined
        in the keyword arguments.

        Note:
            This method is required to deal with publication entered by hand
            and found later by an harvester.

        Args:
            oai_url (str):
                the oai_url, *e.g.* ``http://cds.cern.ch/record/123456``.
                The origin field of the existing database record is updated
                to **oai_url** when a match is found, except in dry run mode.

            year (int):
                the year of the publication. It is used
                by the search algorithm and by the logger.

        Keyword Args:
            kwargs (str):
                 a series of key, value pair where the key is the name of a
                 publications database field.

        Returns:
            tuple:
                ``(id, status)`` which contains the ``id`` of the record.
                The ``id`` is equal to ``None`` when there is no matching.
                The ``status`` is equal to one when the existing record was
                modified zero otherwise.

        """
        self.logger.debug("get existing record by fields...")

        # alias
        db = self.db
        logs = self.logs

        # add the publication year to search criteria
        if year:
            kwargs["year"] = year

        # look for an existing record
        rec_id = get_id(db.publications, **kwargs)
        if not rec_id:
            return (None, 0)

        publication = db.publications[rec_id]

        # the origin is already the expected one
        if publication.origin and publication.origin == oai_url:
            logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        # fix origin field: the change is written to the database,
        # unless running in dry run mode where it is only logged
        if not self.dry_run:
            publication.update_record(origin=oai_url)

        logs[-1].modify(MSG_FIX_ORIGIN, year)
        return (rec_id, 1)

    def insert_record(self, record):
        """Insert the record in the database.
480

481 482 483
        Note:
            This method depend on the type of publications.
            It has to be implemented for each inherited class.
484

485
        Args:
LE GAC Renaud's avatar
LE GAC Renaud committed
486 487
            record (Record):
                record describing the publication.
488

489
        Returns:
LE GAC Renaud's avatar
LE GAC Renaud committed
490 491
            int:
                one when the record is inserted / updated in the database,
492
                zero otherwise.
493 494 495 496

        """
        return 0

LE GAC Renaud's avatar
LE GAC Renaud committed
497
    def process_collection(self, collection):
        """Retrieve JSON objects from the invenio store and for the given
        collection. Corresponding records are inserted in the database.

        Args:
            collection (str):
                name of the collection to be interrogated.

        Note:
            * A search failing with ``CdsException`` is recorded in
              ``collection_logs`` and ends the processing of the collection.
            * Have a look to the attributes ``collection_logs`` and ``logs``
              in order to understand what happen.

        """
        logger = self.logger
        logger.debug(f"process collection {collection}")

        # alias
        collection_logs = self.collection_logs
        controller = self.controller
        host = self.harvester.host
        project = self.db.projects[self.id_project].project
        store = self.store

        # log collection information
        # A collection is identified as "Project Controller collection"
        ctitle = "%s / %s / %s" % (project, controller, collection)
        collection_logs.append(MsgCollection(title=ctitle))

        # get search parameters for the collection including user criteria
        kwargs = self._search_parameters(collection)

        # get the list of record identifier matching the search criteria
        try:
            rec_ids = store.get_ids(**kwargs)

        except CdsException as error:
            collection_logs[-1].url = store.last_search_url()
            collection_logs[-1].error = error
            return

        # log the number of record found for the collection
        collection_logs[-1].url = store.last_search_url()
        collection_logs[-1].found = len(rec_ids)

        if not rec_ids:
            logger.debug(f"no records found in {collection}")
            return

        logger.debug(f"{len(rec_ids)} records found in {collection}")

        # remove from the list identifiers already registered in the database
        # and log them
        func = self._is_record_in_db
        rec_ids = [el for el in rec_ids if func(ctitle, host, el) == 0]

        # process the remaining identifiers
        for rec_id in rec_ids:
            self.process_recid(rec_id)

    def process_recjson(self, recjson):
        """Process the publication provided as a JSON record:

            * instantiate the record (RecordPubli, RecordConf, RecordThesis)
            * check the record
            * insert new record in the database

        Args:
            recjson (dict):
                record provided by the store.

        """
        logger = self.logger
        logger.debug(f"{T4}process record {recjson['recid']} (process_recjson)")

        collection_logs = self.collection_logs
        harvester = self.harvester
        logs = self.logs

        # instantiate the record
        record = build_record(recjson)

        logger.debug(f"{T4}{record.title()[:72]}")

        # start the log for the record
        logs.append(Msg(harvester=harvester,
                        collection=collection_logs[-1].title,
                        record_id=record.id(),
                        title=record.title()))

        # check that the record is well formed
        # repair non-conformity as far as possible
        if not self.check_record(record):
            logger.debug(f"{T4}rejected {logs[-1].txt}")
            return

        txt = "(dry run)" if self.dry_run else ""
        logger.debug(f"{T4}insert record in the database {txt}")

        # insert the record in the database
        self.insert_record(record)

        # summarise the outcome; isEnabledFor also covers effective levels
        # lower than DEBUG, for which debug messages are emitted too
        if logger.isEnabledFor(logging.DEBUG):
            log = logs[-1]
            action = log.action
            if isinstance(action, str):
                action = action.upper()
            logger.debug(f"{T4}log: {action} {log.txt}")

    def process_recid(self, rec_id):
        """Fetch a publication from the store and process it:

            * get the publication data from the store using its identifier
            * hand the JSON record over to :meth:`process_recjson`

        Note:
            * Any exception is trapped and recorded as a rejection
              in ``logs``.
            * Have a look to the attribute ``collection_logs`` and ``logs`` in
              order to understand what happen.

        Args:
            rec_id (int):
                identifier of the publication in the store.

        """
        logger = self.logger
        logger.debug("")
        logger.debug(f"{T2}get record {rec_id} (process_recid)")

        try:
            self.process_recjson(self.store.get_record(rec_id))

        except Exception as e:
            logger.debug(f"{T2}{e!s}")

            # the failure gets its own log entry, titled with the record URL
            harvester = self.harvester
            entry = Msg(harvester=harvester,
                        collection=self.collection_logs[-1].title,
                        record_id=rec_id,
                        title=OAI_URL % (harvester.host, rec_id))
            self.logs.append(entry)
            entry.reject(e)

    def process_url(self, host, collections):
        """Retrieve JSON objects from the invenio store and
        insert corresponding records in the database.

        Note:
            * Have a look to the attributes ``collection_logs`` and ``logs``
              in order to understand what happen.

        Args:
            host (str):
                host name to query for publications, either
                ``cds.cern.ch`` or ``inspirehep.net``.

            collections (str):
                list of collection to be interrogated.
                Collections are separated by a comma. White spaces around
                each collection are ignored.

        """
        self.logger.debug("")
        self.logger.debug(f"process URL search -- {host} -- {collections}")

        # extend harvester for logs
        self.harvester.host = host
        self.harvester.collections = collections

        # instantiate the store
        self.store = InvenioStore(host)

        # process each collection; names are stripped so that a leading
        # white space does not hide the "find" prefix of inspirehep queries
        for collection in collections.split(","):
            self.process_collection(collection.strip())

    def report(self):
        """Build the processing report.

684 685
        Returns:
            dict:
LE GAC Renaud's avatar
LE GAC Renaud committed
686
                * ``collection_logs`` list of :class:`MsgCollection`
687
                * ``controller`` str
LE GAC Renaud's avatar
LE GAC Renaud committed
688
                * ``logs`` list of :class:`Msg`
LE GAC Renaud's avatar
LE GAC Renaud committed
689
                * ``selector`` :class:`plugin_dbui.Selector`
690 691 692 693 694 695

        """

        return dict(collection_logs=self.collection_logs,
                    controller=self.controller,
                    logs=self.logs)