harvest_tools.py 74.8 KB
Newer Older
1
# -*- coding: utf-8 -*-
2 3
"""A collection of tools to search for publications in an invenio store
and to push them into the database.
4
        
5
@author: R. Le Gac
6 7

"""
8
import difflib
9
import datetime
10
import json
11
import re
12
import traceback
13

14
from gluon import current
15
from gluon.storage import Storage
16
from invenio_tools import (OAI_URL,
17
                           CheckAndFix,
18
                           CheckException,
19 20
                           InvenioStore,
                           Marc12)
21 22 23 24
from plugin_dbui import (UNDEF_ID, 
                         UNKNOWN, 
                         get_create_id, 
                         get_id)
25

LE GAC Renaud's avatar
LE GAC Renaud committed
26 27
# Label of the selector mode in which the database is left untouched:
# the tools only write when selector.mode differs from this value.
# NOTE(review): unlike the MSG_* constants this one is translated lazily
# (no lazy=False) -- confirm that the comparison with selector.mode works.
DRY_RUN = current.T("dry run")

# Messages explaining the action taken for a record.
# They are translated once, when the module is imported (lazy=False),
# and are passed to the Msg logs or to ToolException.
MSG_CRASH = "Crash: %s"
MSG_DELETE_TALK = current.T("Delete the associated talk", lazy=False)
MSG_FIX_ORIGIN = current.T("Fixed the origin field", lazy=False)
MSG_IN_DB = current.T("Already in the database", lazy=False)
MSG_LOAD = current.T("Load in the database", lazy=False)
MSG_MATCH = current.T("Reject the talk match a proceeding", lazy=False)
MSG_NO_CAT = current.T('Select a "category" !!!', lazy=False)
MSG_NO_CONF = current.T("Reject no conference information", lazy=False)
MSG_NO_EDITOR = current.T("Reject article is not published", lazy=False)
MSG_NO_HARVESTER = current.T('Harvester parameters not defined in the database.', lazy=False)
MSG_NO_PROJECT = current.T('Select a "project" !!!', lazy=False)
MSG_NO_TEAM = current.T('Select a "team" !!!', lazy=False)
MSG_NO_THESIS = current.T("Reject not a thesis record", lazy=False)
MSG_PREPRINT_IS_PAPER =  current.T("Reject preprint is a published paper", lazy=False)
MSG_PREPRINT_IS_CONFERENCE = current.T("Reject preprint is a conference", lazy=False)
MSG_PREPRINT_IS_THESIS = current.T("Reject preprint is a thesis", lazy=False)
MSG_PREPRINT_NO_NUMBER = current.T("Reject no preprint number", lazy=False)
MSG_REPORT_NO_NUMBER = current.T("Reject no report number", lazy=False)
MSG_TRANSFORM_PREPRINT = current.T("Transform the preprint into an article", lazy=False)
MSG_TRANSFORM_TALK = current.T("Transform the talk into a proceeding", lazy=False)
49
                
50
# Regular expression capturing the first group of four consecutive digits.
# It is used to extract the year from values like 1992 or 1992-12-31.
# A raw string is required: "\d" is an invalid escape in a plain string.
REG_YEAR = re.compile(r"(\d{4})")
51

52
class ToolException(Exception):
    """Exception raised by the harvester tools, I{e.g.} when the project,
    the team or the category is not selected.

    """
53 54


55
def family_name_fr(x):
    """Return the family name of a full name encoded as C{J. Doe}.

    Everything located after the first space is returned.
    The input is returned unchanged when it contains no space.

    @type x: unicode
    @param x: the full name, I{e.g.} C{J. Doe}

    @rtype: unicode

    """
    _, separator, family_name = x.partition(' ')
    if not separator:
        return x

    return family_name
64 65


66
def fix_amu(record):
    """Fix the name of the C{Aix Marseille University}.

    Each university of the thesis matching the regular expression
    C{current.app.reg_institute} is replaced by the name which was
    in use the year of the defense.

    @type record: L{Record}
    @param record: the thesis record

    @rtype: unicode
    @return: the university names separated by comma.

    """
    universities = record.these_universities()

    for index, university in enumerate(universities):
        if not re.search(current.app.reg_institute, university):
            continue

        # NOTE: crash with AttributeError when the defense date
        # does not contain a group of four digits
        match = re.search(r"(\d\d\d\d)", record.these_defense())
        year = int(match.group(1))

        if year < 2012:
            universities[index] = u"Université de la Méditerrannée Aix-Marseille II"
        else:
            universities[index] = u"Aix Marseille Université"

    return ', '.join(universities)
86 87


88
def format_author_fr(name):
    """Format the author name according to French typographic rules,
    I{i.e.} C{J.-P. Doe}.
    The name stays unchanged when the formatting failed,
    except for the capitalisation which is always applied.

    @type name: unicode
    @param name: the author name encoded as C{Family, First}

    @rtype: unicode
    @return: the formatted name. The value C{None} and the empty string
    are returned as they are.

    """
    # protection
    if name is None or name == '':
        return name

    # name are encoded Family, L
    #                  Family, P L
    #                  Family, M -H
    #                  Family Name, J
    #                  Family-Name, J
    #                  Family, F Name
    #                  Family, First
    # To avoid to deal with unicode character
    # look for non empty string \S
    #
    # NOTE: raw string, since \S and \- are not valid string escapes
    m = re.match(r'(.+), (\S+)( |\-)*(\S+)*', name)

    # reformat the name as L. Family
    # or keep it as it is
    if not m:
        formatted = name

    elif m.group(3) and m.group(4):
        # two first names: keep both initials and their separator
        t = (m.group(2)[0], m.group(3)[0], m.group(4)[0], m.group(1))
        formatted = '%s.%s%s. %s' % t

    else:
        formatted = '%s. %s' % (m.group(2)[0], m.group(1))

    # avoid author name in upper case (R. LE FOO --> R. Le Foo)
    return formatted.title()


131
def get_harvester_tool(controller):
    """Get the harvester tool associated to the controller
    or C{None} if the controller is not known.

    @note: valid names for the controller are:
        - articles
        - notes
        - preprints
        - proceedings
        - reports
        - talks
        - theses

    @type controller: unicode
    @param controller: name of the controller

    @rtype: class reference or None
    @return: None when the controller corresponds to nothing.

    """
    # NOTE: the class is only looked up for the matching controller
    if controller == 'articles':
        return Articles

    if controller == 'notes':
        return Notes

    if controller == 'preprints':
        return Preprints

    if controller == 'proceedings':
        return Proceedings

    if controller == 'reports':
        return Reports

    if controller == 'talks':
        return Talks

    if controller == 'theses':
        return Thesis

    return None


LE GAC Renaud's avatar
LE GAC Renaud committed
178 179 180 181 182 183
def learn_my_authors(db, authors=None,
                         id_project=None,
                         id_team=None,
                         year=None):
    """Train the rescue list of the authors of my institute,
    stored in the database, using the list C{authors} provided in argument.

    The entry of the table C{my_authors} is selected by the project,
    the team and the year. It is created when it does not exist,
    otherwise the authors which are not yet known are added to it.

    @note: all keyword arguments have to be defined.

    @type db: gluon.dal.DAL
    @param db: the database connection

    @type authors: unicode
    @param authors: authors names separated by C{", "}

    @type id_project: int
    @param id_project: project identifier

    @type id_team: int
    @param id_team: team identifier

    @type year: int
    @param year: year of the rescue list

    """
    # get the list of authors store in the database
    row = db.my_authors(id_projects=id_project,
                        id_teams=id_team,
                        year=year)

    # no entry in the database
    if not row:
        db.my_authors[0] = dict(authors=authors,
                                id_projects=id_project,
                                id_teams=id_team,
                                year=year)
        return

    database_authors = row.authors.split(', ')

    # compare with the input list
    # and extract authors which are not in the db
    diff = set(authors.split(', ')).difference(database_authors)
    if not diff:
        return

    # update the database
    # NOTE1: be careful with the string encoding
    # NOTE2: handle the case J. Foo and J. M. Foo are the same person
    li = []
    for el in diff:
        if isinstance(el, unicode):
            el = el.encode('utf8')

        # extract family name: it is applied to all names whatever
        # their encoding, the whole name is used when there is no initial
        pos = el.rfind('. ')
        fn = el if pos == -1 else el[pos+2:]

        if fn not in row.authors:
            li.append(el)

    database_authors.extend(li)
    database_authors.sort(key=family_name_fr)
    db.my_authors[row.id] = dict(authors=', '.join(database_authors))
241 242


243
class Msg(Storage):
    """Message and action taken for a publication.
        - The publication is found by an harvester tool, in a store.
        - The action refers to the database.

    Four actions are defined:
        - C{idle}
        - C{load}
        - C{modify}
        - C{reject}

    The class contains the attributes:

        - C{action}: action taken
        - C{collection}: the harvester collection
        - C{harvester}: the harvester encoded as a JSON string
        - C{record_id}: the store identifier of the record
        - C{title}: title of the publication
        - C{txt}: text of the message
        - C{url}: url of the record
        - C{year}: year of the publication

    """
    def __init__(self,
                 collection=None,
                 harvester=None,
                 record_id=None,
                 title=None):
        """
        @type collection: str
        @param collection: the collection containing the record

        @type harvester: gluon.dal.Row
        @param harvester: the current harvester used to retrieve the record.
        It is serialised via its method C{as_dict} and its attribute
        C{host} is used to build the url of the record.

        @type record_id: int
        @param record_id: the store identifier of the record

        @type title: str
        @param title: the title associated to the record

        """
        self.action = None
        self.collection = collection
        self.harvester = json.dumps(harvester.as_dict())
        self.record_id = record_id
        self.title = title
        self.txt = None
        self.url = OAI_URL % (harvester.host, record_id)
        self.year = None

    def idle(self, txt, year=None):
        """Flag the action as C{idle} and store the message C{txt}.

        @type txt: unicode
        @param txt: message

        @type year: unicode
        @param year: year of the publication

        """
        self.action = 'idle'
        self._set(txt, year)

    def load(self, txt, year=None):
        """Flag the action as C{load} and store the message C{txt}.

        @type txt: unicode
        @param txt: message

        @type year: unicode
        @param year: year of the publication

        """
        self.action = 'load'
        self._set(txt, year)

    def modify(self, txt, year=None):
        """Flag the action as C{modify} and store the message C{txt}.

        @type txt: unicode
        @param txt: message

        @type year: unicode
        @param year: year of the publication

        """
        self.action = 'modify'
        self._set(txt, year)

    def reject(self, txt, year=None):
        """Flag the action as C{reject} and store the message C{txt}.

        @type txt: unicode
        @param txt: message

        @type year: unicode
        @param year: year of the publication

        """
        self.action = 'reject'
        self._set(txt, year)

    def _set(self, txt, year):
        """Store the message and the year of the publication.

        @type txt: unicode, str or any object
        @param txt: message. A unicode string is encoded in utf-8,
        an object which is not a string is converted with C{str}.

        @type year: unicode or list
        @param year: year of the publication. The elements of a list are
        joined with C{", "}. The attribute C{year} is left untouched
        when the value is empty.

        """
        if isinstance(txt, unicode):
            message = txt.encode("utf-8")

        elif isinstance(txt, str):
            message = txt

        else:
            message = str(txt)

        self.txt = message

        if not year:
            return

        self.year = ', '.join(year) if isinstance(year, list) else year
366 367


368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392
class MsgCollection(Storage):
    """Message for a collection. The class contains four public attributes:
        - C{error}: error when scanning the collection
        - C{found}: number of publication found in the harvester repository
        - C{title}: title of the collection
        - C{url}: URL used to scan the harvester repository,
          returning a list of ids.

    """
    def __init__(self, error="", found=0, title="", url=""):
        """
        @type error: str or Exception
        @param error: error when scanning the collection

        @type found: int
        @param found: number of publications found

        @type title: str
        @param title: title of the collection

        @type url: str
        @param url: URL used to scan the harvester repository

        """
        Storage.__init__(self)

        self.error = error
        self.found = found
        self.title = title
        self.url = url

    def url_hb(self):
        """
        @rtype: str
        @return: the search URL in which the output format C{of=id}
        is replaced by C{of=hb}, in order to return the list of records
        in a readable format.

        """
        readable_url = self.url.replace("of=id", "of=hb")
        return readable_url
    
         
393
class PublicationsTool(object):
394 395 396 397 398
    """Base class to search and process publications.
        - Decode the parameter of a selector defining user criteria.
        - Search for publications in the store, according to 
          user criteria or process an XML string.
        - Load records in the database.
399
    
400 401 402 403 404
    The parameters of the search as well as the parameters of the harvester
    are defined by the current request.
    
    """
    
405
    def __init__(self, db, selector, debug=False):
        """Initialise the logs, the helpers and the cache of the tool.

        @type db: gluon.dal.DAL
        @param db: the database connection

        @type selector: plugin_dbui.Selector
        @param selector: the selector defining the parameters to search
        and to process the publications.

        @type debug: bool
        @param debug: activate the debug mode

        """
        # arguments
        self.db = db
        self.dbg = debug
        self.selector = selector

        # the harvester parameters are resolved when the tool is called
        self.harvester = None

        # logs, one entry per collection and one entry per record
        self.collection_logs = []
        self.logs = []

        # helpers to validate a record and to decode the MARC XML
        self.check = CheckAndFix()
        self.marc12 = Marc12()

        # private cache for my_author rescue list
        self.__par = None
        self.__reference = None

434

435
    def _search_parameters(self, collection):
436 437 438
        """Build the keywords to steer the URL search in invenio store. 
        The main parameter is the collection and the date range defined 
        in the selector.
439
        
440 441 442 443 444 445 446
        @type collection: unicode
        @param collection: statement defining the collection in the
        store, I{i.e.} C{"find cn d0 and tc p and not tc c"} or 
        C{"LHCb Papers"}. The syntax depends on the invenio store.
        
        @rtype: dict
        @return: the key are a sub-set of those defined in 
447
        L{invenio_tools.InvenioStore.get_ids}.
448 449
        
        """
450 451
        selector = self.selector
        
452 453 454 455 456
        # INSPIREHEP store
        if collection.startswith('find'):
            
            query = collection
            
457 458 459 460 461 462 463 464
            if selector.year_start and not selector.year_end:
                 query += " and date %s" % self.selector.year_start

            elif not selector.year_start and selector.year_end:
                query += " and date %s" % selector.year_end
            
            elif selector.year_start and selector.year_end:
                query += " and date > %s and date < %s " \
465
                         % (selector.year_start-1, selector.year_end+1)
466 467
            
            di = dict(p=query,              # query à la spires
468
                      rg=1000,              # maximum number of records returned
469 470 471 472 473 474
                      sf='year',            # sort by date
                      so='d')               # descending order

        # CERN INVENIO store
        else:

475 476 477 478
            if selector.year_start and not selector.year_end:
                rex = selector.year_start
                
            elif not selector.year_start and selector.year_end:
479
                rex = self.y2
480 481
                
            elif selector.year_start and selector.year_end:
482
                li = []
483
                for year in range(selector.year_start, selector.year_end+1):
484 485 486 487 488 489 490 491 492 493 494 495
                    li.append(str(year))
                rex = '|'.join(li)

            di = dict(cc=collection,         # collection
                      f1='year',             # search on year
                      m1='r',                # use regular expression
                      p1=rex,                # regular expression defining year
                      sf='year',             # sort by date
                      so='d')                # descending order               
        return di


496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551
    def _my_author_list(self, record):
        """Extract the rescue list for my authors in the database.

        The list is selected by the year of the publication as well as
        the project and the team defined in the selector.
        The last list read is cached to avoid querying the database
        when consecutive records share the same parameters.

        @type record: L{Record}
        @param record: the record providing the year

        @rtype: list
        @return: empty when not defined

        """
        year = record.year()

        # try to recover year when not defined, looking in turn at:
        #   - published article, proceeding (773 y)
        #   - start date of a conference (111 x)
        #   - end date of a conference (111 z)
        #   - submitted date (269 c)
        if not year:
            candidates = (("773", "y"), ("111", "x"), ("111", "z"), ("269", "c"))
            for field, subfield in candidates:
                if field in record and subfield in record[field]:
                    year = record[field][subfield]
                    break
            else:
                return []

        #
        # NOTE:
        # keep in mind that the CheckAndfix mechanism is not yet run
        # therefore year can be a list due to erratum, ...
        # The smallest value is used. The list belongs to the record
        # and is not sorted in place to leave the record untouched.
        #
        if isinstance(year, list):
            if not year:
                return []
            year = min(year)

        # the value can have several format 1992, 1992-12-31, ....
        m = REG_YEAR.search(year)
        if not m:
            return []

        year = m.group(1)

        # caching
        t = (year, self.selector.id_projects, self.selector.id_teams)
        if t == self.__par:
            return self.__reference

        # extract the list from the database
        row = self.db.my_authors(year=year,
                                 id_projects=self.selector.id_projects,
                                 id_teams=self.selector.id_teams)

        if row:
            self.__reference = row['authors'].split(', ')
        else:
            self.__reference = []

        # remember the parameters of the list stored in the cache
        self.__par = t

        return self.__reference

    
    def check_by_origin(self, oai_url=None, year=None):
        """Check that a record already exists using the origin field.

            - Actions are logged.

        @type oai_url: unicode
        @param oai_url: typical value is "http://cds.cern.ch/record/123456"

        @type year: unicode
        @param year: year of the publication, used for the log

        @note: this method can be customised in inherited class
        to perform dedicated action.

        @rtype: tuple
        @return: the tuple (id, status). The id of the record or None.
        The status is equal to one when the existing record was modified
        zero otherwise. It is always zero in this implementation.

        """
        if self.dbg:
            print("check existing record by origin")

        rec_id = get_id(self.db.publications, origin=oai_url)
        if rec_id:
            self.logs[-1].idle(MSG_IN_DB, year)
            return (rec_id, 0)

        return (None, 0)
    

    def check_by_fields(self, **kwargs):
        """Check that a record already exists using the fields defined
        in the keyword arguments.

            - Fix the field origin when a match is found.
            - Actions are logged.

        @keyword oai_url: typical value is "http://cds.cern.ch/record/123456".
        It is mandatory and it is not used to search the record.
        @keyword year: year of the publication. It is mandatory when
        a record is found since it is used for the log.

        @note: this method can be customised in inherited class
        to perform dedicated action.

        @rtype: tuple
        @return: the tuple (id, status). The id of the record or None.
        The status is equal to one when the existing record was modified
        zero otherwise

        """
        if self.dbg:
            print("check existing record by fields")

        db = self.db

        # origin can't be used for the search
        oai_url = kwargs.pop("oai_url")

        # look for an existing record
        rec_id = get_id(db.publications, **kwargs)
        if not rec_id:
            return (None, 0)

        # the origin field is already the right one
        if db.publications[rec_id].origin \
        and db.publications[rec_id].origin == oai_url:
            self.logs[-1].idle(MSG_IN_DB, kwargs["year"])
            return (rec_id, 0)

        # fix origin field, except in dry run mode where it is only logged
        if self.selector.mode != DRY_RUN:
            db.publications[rec_id] = dict(origin=oai_url)

        self.logs[-1].modify(MSG_FIX_ORIGIN, kwargs["year"])
        return (rec_id, 1)

    
    def check_collaboration(self, value):
        """Check that the collaboration exists in the database,
        create it if not.

        @type value: str or None
        @param value: the name of the collaboration.

        @rtype: int
        @return: the id of the collaboration, UNDEF_ID when not defined

        """
        if value:
            return get_create_id(self.db.collaborations, collaboration=value)

        return UNDEF_ID

        
    def check_publisher(self, value):
        """Check that the publisher exists in the database,
        create it if not.

        @type value: str or None
        @param value: the abbreviation of the publisher name.

        @rtype: int
        @return: the id of the publisher, UNDEF_ID when not defined

        """
        if value:
            return get_create_id(self.db.publishers, abbreviation=value)

        return UNDEF_ID


    def select_record(self, record):
        """C{True} when the C{record} is selected.
        This method checks the temporary status, the authors and
        the collaboration of the record and formats the author field.
        The record is rejected, and the reason logged, when one
        of the checks raises an exception.

        @note: The checks depend on the type of publications and have to be
        implemented in inherited class.

        @type record: L{Record}
        @param record: the record to be checked

        @rtype: bool

        """
        if self.dbg:
            print("select record and check / format authors")

        check = self.check

        try:
            check.temporary_record(record)
            check.authors(record)
            check.format_authors(record, format_author_fr)
            check.collaboration(record)

        except BaseException as e:
            self.logs[-1].reject(e, record.year())
            return False

        else:
            return True
700 701


702
    def load_db(self, record):
        """Load the record in the database.

        @note: This method depends on the type of publications.
        It has to be implemented for each inherited class.
        The base implementation does nothing and returns zero.

        @type record: L{Record}
        @param record: the record to be loaded

        @rtype: int
        @return: one when the record is inserted / updated in the database
        zero otherwise.

        """
        return 0

718
        
719
    def process_url(self):
        """Retrieve the MARC XML string and launch its decoding.

        The collections of the harvester are scanned one after the other.
        An error raised when searching a collection is stored in the log
        of the collection while an error raised when processing a record
        is stored in the log of the record. In both cases, the scan
        continues.

        @raise Exception: depending on what happen, can be StoreException,
        Marc12ZException, ...

        """
        if self.dbg:
            print("process URL search")

        harvester = self.harvester
        store = InvenioStore(harvester.host)

        # list of collections, separated by comma
        collections = re.split(' *, *', harvester.collections)

        # alias
        controller = harvester.controller
        project = self.db.projects[harvester.id_projects].project

        # extract the list of publications from the store for each collection
        # the search is perform on a range of creation date
        # if not defined all element are return
        #
        # The method use here minimise the memory usage
        # on the server as well as on the client side

        for collection in collections:

            # log collection information
            # A collection is identified as "Project Controller collection"
            title = "%s / %s / %s" % (project, controller, collection)
            collection_log = MsgCollection(title=title)
            self.collection_logs.append(collection_log)

            # search record in the harvester repository
            kwargs = self._search_parameters(collection)

            try:
                ids = store.get_ids(**kwargs)

            except Exception as error:
                collection_log.url = store.last_search_url()
                collection_log.error = error
                continue

            collection_log.url = store.last_search_url()
            collection_log.found = len(ids)

            if not ids:
                continue

            if self.dbg:
                print('%i records found in %s' % (len(ids), collection))

            for record_id in ids:

                if self.dbg:
                    print("\nprocessing record %s" % (record_id,))

                try:
                    self.process_xml(store.get_record(record_id))

                except BaseException as e:
                    # the title is not known, use the url of the record
                    url = OAI_URL % (harvester.host, record_id)
                    msg = Msg(harvester=harvester,
                              collection=title,
                              record_id=record_id,
                              title=url)
                    self.logs.append(msg)
                    msg.reject(e)
789
                
790
        
791
    def process_xml(self, xml):
        """Decode the MARC XML string and load records in the database.

        A log is started for each record. The record is then checked
        by L{select_record} and, when it is selected, loaded by L{load_db}.

        @type xml: unicode
        @param xml: MARC XML string

        """
        if self.dbg:
            print("process xml record")

        # NOTE: BaseException and inherited class
        # are catched by the previous stage
        li = self.marc12(xml)

        # process individual record
        for record in li:

            if self.dbg:
                print("record decoded")

            # start the log for the record
            self.logs.append(Msg(harvester=self.harvester,
                                 collection=self.collection_logs[-1].title,
                                 record_id=record.id(),
                                 title=record.title()))

            # additional selection stage
            # at this step the validity of the record is checked
            # and non-conformities are repaired
            if not self.select_record(record):
                continue

            if self.dbg:
                print("start loading in the database")

            # load record in the database
            # the returned status is not used at this stage
            self.load_db(record)

            if self.dbg:
                # the action is None when load_db did not log anything
                log = self.logs[-1]
                print("%s %s" % ((log.action or "").upper(), log.txt))
831

832

833
    def report(self):
834 835 836 837
        """Build the processing report. 
        
        @rtype: dict
        @return:
838
            - C{collection_logs} (list) one L{MsgCollection}) for each collection
839
            - C{controller} (str)
840 841
            - C{logs} (list) one L{Msg} for each publication
            - C{selector} (Selector)
842 843 844

        """
        
845
        return dict(collection_logs=self.collection_logs,
846
                    controller=self.harvester.controller,
847
                    logs=self.logs,
848
                    selector=self.selector)    
849 850


851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908
    def __call__(self):
        """Search publication in the invenio store according to criteria
        and load them in the database.

        Two modes are handled:
            - the selector contains an XML string which is processed directly,
            - otherwise the store is scanned using the harvester parameters,
              read in the database when they are not yet defined.

        @raise Exception: the type of exception depends on what happen:
            - L{ToolException} when project, team or category identifier
              are not defined, or when the harvester is not found.
            - C{StoreException} when somethings goes wrong interrogating the store.
            - C{Marc12Exception} when somethings goes wrong decoding the XML
              string return by the store
            - C{CheckException} if the L{Record} is not valid
            - C{Exception} if the python code crash

        """
        selector = self.selector

        if self.dbg:
            print("start processing %s" % (self.__class__.__name__,))
            print("decode request")

        # protection team, project and/or category have to be defined
        # the category is only required when processing an XML string
        if not selector.id_projects:
            raise ToolException(MSG_NO_PROJECT)

        if not selector.id_teams:
            raise ToolException(MSG_NO_TEAM)

        if selector.xml and not selector.id_categories:
            raise ToolException(MSG_NO_CAT)

        if self.dbg:
            print("get harvest parameters")

        if selector.xml:
            # process an XML request
            # the harvester parameters are those of the selector
            self.harvester = Storage(controller=selector.controller,
                                     id_categories=selector.id_categories,
                                     id_projects=selector.id_projects,
                                     id_teams=selector.id_teams)

            self.collection_logs.append(MsgCollection(found=1))
            self.process_xml(selector.xml)

        else:
            # retrieve the harvester parameter in the database
            # if not yet defined (free run)
            if not self.harvester:
                row = selector.select(self.db.harvesters).first()
                if not row:
                    raise ToolException(MSG_NO_HARVESTER)

                self.harvester = row.harvesters

            # retrieve records in the store and load them in the database
            self.process_url()


class Articles(PublicationsTool):
    """Publications tool for articles.

    """
    def __init__(self, *args, **kwargs):
        """Initialise the tool and cache the preprint category identifier.

        All positional and keyword arguments are forwarded unchanged to
        C{PublicationsTool.__init__}.

        """
        PublicationsTool.__init__(self, *args, **kwargs)

        # identifier of the category with the code "PRE"; it is compared
        # with the category of stored records to recognise preprints
        categories = self.db.categories
        self.id_preprint = get_id(categories, code="PRE")


    def check_by_origin(self, id_publisher=None,
                              my_authors=None,
                              oai_url=None,
                              pages=None,
                              publication_url=None,
                              title=None,
                              volume=None,
                              year=None):
        """Look for a publication already stored with C{oai_url} as origin.

            - A stored preprint is transformed into an article.
            - Actions are logged in the last entry of C{self.logs}.

        @keyword id_publisher: written in the C{id_publishers} field when
        a preprint is transformed.
        @keyword my_authors: written in the C{authors_institute} field when
        a preprint is transformed.
        @keyword oai_url: value searched in the C{origin} field of the
        publications table.
        @keyword pages: written in the C{pages} field when a preprint
        is transformed.
        @keyword publication_url: written in the C{publication_url} field
        when a preprint is transformed.
        @keyword title: written in the C{title} field when a preprint
        is transformed.
        @keyword volume: written in the C{volume} field when a preprint
        is transformed.
        @keyword year: written in the C{year} field when a preprint is
        transformed. It is also passed to the log entry.

        @rtype: tuple
        @return: the tuple (id, status). The id of the record or None.
        The status is equal to one when the existing record is a preprint
        transformed into an article (also reported in dry run mode,
        although nothing is written), zero otherwise.

        """
        if self.dbg:
            print("check existing article by origin")

        publications = self.db.publications

        # no record with that origin: nothing to report
        id_rec = get_id(publications, origin=oai_url)
        if not id_rec:
            return (None, 0)

        is_preprint = publications[id_rec].id_categories == self.id_preprint
        log = self.logs[-1]

        # the stored record is not a preprint: keep it as it is
        if not is_preprint:
            log.idle(MSG_IN_DB, year)
            return (id_rec, 0)

        # the stored record is a preprint: transform it into an article
        log.modify(MSG_TRANSFORM_PREPRINT, year)

        # the database is left untouched in dry run mode
        if self.selector.mode != DRY_RUN:
            article = dict(authors_institute=my_authors,
                           id_categories=self.harvester.id_categories,
                           id_publishers=id_publisher,
                           id_status=UNDEF_ID,
                           pages=pages,
                           publication_url=publication_url,
                           title=title,
                           volume=volume,
                           year=year)
            publications[id_rec] = article

        return (id_rec, 1)


    def check_by_fields(self, id_publisher=None,
979
                              my_authors=None,
LE GAC Renaud's avatar
LE GAC Renaud committed
980
                              oai_url=None,
981
                              pages=None,
982
                              publication_url=None,
983 984 985 986
                              preprint_number=None,
                              title=None,
                              volume=None,
                              year=None):
987
        """Check that a record already exist using the fields: id_projects,
988 989 990 991 992
        id_publishers, id_teams, pages, volume and year.
        
            - Fix the field origin when a match is found.
            - Transform a preprint into article.
            - Actions are logged.
993 994

        @keyword id_publisher:
LE GAC Renaud's avatar
LE GAC Renaud committed
995
        @keyword oai_url:
996 997
        @keyword pages:
        @keyword publication_url:
998
        @keyword preprint_number:
999 1000 1001
        @keyword title:
        @keyword volume:
        @keyword year:
1002 1003 1004 1005 1006 1007 1008
        
        @rtype: tuple
        @return: the tuple (id, status). The id of the record or None. 
        The status is equal to one when the existing record was modified
        zero otherwise
        
        """
1009
        if self.dbg:
1010
            print "check existing article by fields"
1011

1012 1013 1014 1015 1016 1017 1018 1019 1020