""" Harvest Controllers

"""
import difflib
5
import harvest
6
import traceback
7

8

9
def articles():
    """Harvest the articles published by a team during a year range.

    Scan the cds/invenio stores to find articles published during a year
    range and for a given team. Insert them in the database if they don't
    exist. An existing preprint of the same team with the same preprint
    number is transformed into an article instead of being duplicated.

    The scanning is steered using URL arguments.
    Arguments are defined and sent via the harvester selector.

    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    Returns:
        dict: the data for the view harvest/layout.html with the keys
        controller, collections, y1, y2, nfound (number of records found)
        and ninsert (number of records inserted or transformed).
        When the search fails, an HTML string containing the formatted
        traceback is returned instead.

    """
    try:
        paper_filter = harvest.filter_published_paper_with_cppm_authors
        results = harvest.search(db, request, filter=paper_filter)

    except BaseException:
        # any failure of the search is shown to the user as a traceback
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # alias which are the same for all records
    id_category = results.id_category
    id_team = results.id_team

    # a preprint can be identified by its category which is PRE (15)
    id_category_preprint = 15

    # process each record
    n = 0
    for record in results.records:

        # alias
        first_author = record.first_author()
        pages = record.paper_pages()
        preprint = record.preprint_number()
        title = record.title()
        volume = record.paper_volume()
        year = record.year()

        # check the publisher and the collaboration
        id_publisher = harvest.get_create_id(
            db, 'publishers', abbreviation=record.paper_editor())

        id_collaboration = harvest.get_create_id(
            db, 'collaborations', collaboration=record.collaboration())

        # check against already published articles
        id_publication = harvest.get_id(db, 'publications',
                                        id_publishers=id_publisher,
                                        id_teams=id_team,
                                        pages=pages,
                                        volume=volume,
                                        year=year)
        if id_publication:
            continue

        # check against already published preprint
        id_publication = harvest.get_id(db, 'publications',
                                        id_categories=id_category_preprint,
                                        id_teams=id_team,
                                        preprint=preprint)

        # transform an existing preprint into article
        if id_publication:
            n += 1
            db.publications[id_publication] = dict(
                id_categories=id_category,
                id_publishers=id_publisher,
                pages=pages,
                publication_url=record.paper_url(),
                title=title,
                volume=volume,
                year=year)
            continue

        # eventually insert a new article in the database
        n += 1
        db.publications.insert(authors=record.authors(),
                               authors_cppm=harvest.cppm_authors(record),
                               first_author=first_author,
                               id_categories=id_category,
                               id_collaborations=id_collaboration,
                               id_publishers=id_publisher,
                               id_status=1,
                               id_teams=id_team,
                               pages=pages,
                               preprint=preprint,
                               publication_url=record.paper_url(),
                               submitted=record.submitted(),
                               title=title,
                               volume=volume,
                               year=year)

    # use a common view to display the results of the search
    response.view = 'harvest/layout.html'

    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)
tux091's avatar
tux091 committed
109 110


legac's avatar
legac committed
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
def notes():
    """Harvest the notes published by a team during a year range.

    Scan the cds/invenio stores to find notes published during a year
    range and for a given team. Insert them in the database if they don't
    exist.

    The scanning is steered using URL arguments.
    Arguments are defined and sent via the harvester selector.

    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    Returns:
        dict: the data for the view harvest/layout.html with the keys
        controller, collections, y1, y2, nfound (number of records found)
        and ninsert (number of records inserted).
        When the search fails, an HTML string containing the formatted
        traceback is returned instead.

    """
    try:
        results = harvest.search(db,
                                 request,
                                 filter=harvest.filter_cppm_authors)

    except BaseException:
        # any failure of the search is shown to the user as a traceback
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # alias which are the same for all records
    id_category = results.id_category
    id_team = results.id_team

    # process each record
    n = 0
    for record in results.records:

        # alias
        first_author = record.first_author()
        title = record.title()
        year = record.year()

        # check against already published notes
        # NOTE(review): the first author was formerly passed as a bare
        # positional argument; it is now a keyword criterion like the
        # others -- confirm against the signature of harvest.get_id
        id_publication = harvest.get_id(db, 'publications',
                                        first_author=first_author,
                                        id_categories=id_category,
                                        id_teams=id_team,
                                        title=title,
                                        year=year)
        if id_publication:
            continue

        # eventually insert a new note
        n += 1
        db.publications.insert(authors=record.authors(),
                               authors_cppm=harvest.cppm_authors(record),
                               first_author=first_author,
                               id_categories=id_category,
                               id_status=1,
                               id_teams=id_team,
                               publication_url=record.paper_url(),
                               report_numbers=record.report_number(),
                               title=title,
                               year=year)

    # use a common view to display the results of the search
    response.view = 'harvest/layout.html'

    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)
    
    
178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
def preprints():
    """Harvest the preprints of a team which are not yet published.

    Scan the cds/invenio stores to find preprints not yet published.
    The scan is performed during a year range for a given team.
    New records are inserted in the database if they don't exist.
    Preprints without first author are skipped.

    The scanning is steered using URL arguments.
    Arguments are set and sent via the harvester selector.

    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    Returns:
        dict: the data for the view harvest/layout.html with the keys
        controller, collections, y1, y2, nfound (number of records found)
        and ninsert (number of records inserted).
        When the search fails, an HTML string containing the formatted
        traceback is returned instead.

    """
    try:
        results = harvest.search(db,
                                 request,
                                 filter=harvest.filter_preprint)

    except BaseException:
        # any failure of the search is shown to the user as a traceback
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # alias which are the same for all records
    id_category = results.id_category
    id_team = results.id_team

    # process each record
    n = 0
    for record in results.records:

        # alias
        first_author = record.first_author()
        preprint = record.preprint_number()
        title = record.title()
        submitted = record.submitted()

        # check the collaboration
        id_collaboration = harvest.get_create_id(
            db, 'collaborations', collaboration=record.collaboration())

        # Protection to only keep preprints with authors
        if not first_author:
            # a single parenthesised argument prints the same text with
            # the print statement and with the print function
            print('No authors for preprint: %s [%s]' % (title, preprint))
            continue

        # check against preprint or article already published
        id_publication = harvest.get_id(db, 'publications',
                                        first_author=first_author,
                                        id_teams=id_team,
                                        preprint=preprint,
                                        submitted=submitted,
                                        title=title)
        if id_publication:
            continue

        # eventually insert a new preprint
        n += 1
        db.publications.insert(authors=record.authors(),
                               authors_cppm=harvest.cppm_authors(record),
                               first_author=first_author,
                               id_categories=id_category,
                               id_collaborations=id_collaboration,
                               id_status=1,
                               id_teams=id_team,
                               preprint=preprint,
                               publication_url=record.paper_url(),
                               submitted=submitted,
                               title=title,
                               year=record.year())

    # use a common view to display the results of the search
    response.view = 'harvest/layout.html'

    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)


258
def proceedings():
    """Harvest the conference proceedings of a team during a year range.

    Scan the cds/invenio stores to find conference proceedings published
    during a year range for a given team. Insert them in the database
    if they don't exist. A talk already stored for the same conference,
    first author and team, with a similar title, is updated into a
    proceeding instead of being duplicated.

    The scanning is steered using URL arguments.
    Arguments are defined and sent via the harvester selector.

    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    Returns:
        dict: the data for the view harvest/layout.html with the keys
        controller, collections, y1, y2, nfound (number of records found)
        and ninsert (number of records inserted; updated talks are not
        counted).
        When the search fails, an HTML string containing the formatted
        traceback is returned instead.

    """
    # get the proceedings from the invenio store signed by cppm authors
    try:
        results = harvest.search(db,
                                 request,
                                 filter=harvest.filter_cppm_authors)

    except BaseException:
        # any failure of the search is shown to the user as a traceback
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # alias to the cds services
    cds, marc12 = results.cds, results.marc12

    # alias which are the same for all records
    id_category = results.id_category
    id_team = results.id_team

    # process each record
    n = 0
    for proceeding in results.records:

        # get the conference information
        key = proceeding.conference_key()
        xml = cds.search(c='Conferences', p=key, f='', of='xm')
        conferences = marc12.process(xml)

        # protection against missing conference information
        if not conferences:
            continue

        # alias
        authors = proceeding.authors()
        conference = conferences[0]
        conference_dates = conference.conference_dates()
        conference_title = conference.conference_title()
        country = conference.conference_country()
        first_author = proceeding.first_author()
        preprint = proceeding.preprint_number()
        report_number = proceeding.report_number()
        title = proceeding.title()
        url = proceeding.paper_url()
        year = proceeding.year()

        # check conference country
        id_countries = harvest.get_create_id(db, 'countries', country=country)

        # check against an already published proceeding
        id_publication = harvest.get_id(db, 'publications',
                                        authors=authors,
                                        conference_dates=conference_dates,
                                        conference_title=conference_title,
                                        first_author=first_author,
                                        id_categories=id_category,
                                        id_teams=id_team,
                                        publication_url=url,
                                        title=title,
                                        year=year)
        if id_publication:
            continue

        # update an already published talk
        # A talk defines authors, conference parameters, first author, title,
        # a category and a year. Later on this talk might be transformed
        # into a proceeding. It updates the authors, the category,
        # the preprint, the publication URL, the title, the report numbers
        # and the year.
        # Often, the title of the talk and the title of the proceeding
        # are similar but not equal. It is why the algorithm is rather complex
        query = ((db.publications.conference_dates == conference_dates) &
                 (db.publications.conference_title == conference_title) &
                 (db.publications.first_author == first_author) &
                 (db.publications.id_teams == id_team))

        match = False
        for row in db(query).select(db.publications.id, db.publications.title):
            s = difflib.SequenceMatcher(None, title, row.title)

            # titles are considered as the same above the ratio threshold
            if s.ratio() > results.ratio:
                db.publications[row.id] = dict(authors=authors,
                                               id_categories=id_category,
                                               preprint=preprint,
                                               publication_url=url,
                                               title=title,
                                               report_numbers=report_number,
                                               year=year)
                match = True
                break

        if match:
            continue

        # eventually insert a new proceeding
        n += 1
        db.publications.insert(authors=authors,
                               authors_cppm=harvest.cppm_authors(proceeding),
                               conference_dates=conference_dates,
                               conference_speaker=first_author,
                               conference_title=conference_title,
                               conference_town=conference.conference_town(),
                               conference_url=conference.conference_url(),
                               first_author=first_author,
                               id_categories=id_category,
                               id_countries=id_countries,
                               id_status=1,
                               id_teams=id_team,
                               preprint=preprint,
                               publication_url=url,
                               report_numbers=report_number,
                               title=title,
                               year=year)

    # use a common view to display the results of the search
    response.view = 'harvest/layout.html'

    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)
389 390


Renaud Le Gac's avatar
Renaud Le Gac committed
391
def reports():
    """Harvest the reports published by a team during a year range.

    Scan the cds/invenio stores to find reports published during a year
    range and for a given team. Insert them in the database if they don't
    exist. Reports without first author are kept, their author fields
    being set to '???'.

    The scanning is steered using URL arguments.
    Arguments are defined and sent via the harvester selector.

    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    Returns:
        dict: the data for the view harvest/layout.html with the keys
        controller, collections, y1, y2, nfound (number of records found)
        and ninsert (number of records inserted).
        When the search fails, an HTML string containing the formatted
        traceback is returned instead.

    """
    try:
        results = harvest.search(db, request)

    except BaseException:
        # any failure of the search is shown to the user as a traceback
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # alias which are the same for all records
    id_category = results.id_category
    id_team = results.id_team

    # process each record
    n = 0
    for record in results.records:

        # alias
        first_author = record.first_author()
        title = record.title()
        year = record.year()

        # protection -- authors not defined
        if not first_author:
            authors = '???'
            cppm_authors = '???'
            first_author = '???'
        else:
            authors = record.authors()
            cppm_authors = harvest.cppm_authors(record)

        # check against already published reports
        id_publication = harvest.get_id(db, 'publications',
                                        id_categories=id_category,
                                        id_teams=id_team,
                                        title=title,
                                        year=year)
        if id_publication:
            continue

        # eventually insert a new report
        n += 1
        db.publications.insert(authors=authors,
                               authors_cppm=cppm_authors,
                               first_author=first_author,
                               id_categories=id_category,
                               id_status=1,
                               id_teams=id_team,
                               publication_url=record.paper_url(),
                               report_numbers=record.report_number(),
                               title=title,
                               year=year)

    # use a common view to display the results of the search
    response.view = 'harvest/layout.html'

    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)
Renaud Le Gac's avatar
Renaud Le Gac committed
463 464
    
    
465
def talks():
    """Harvest the conference talks of a team during a year range.

    Scan the cds/invenio stores to find conference talks for a given
    year range and team. Insert them in the database if they don't exist.
    Talks for which no conference information is found are skipped.

    The scanning is steered using URL arguments.
    Arguments are defined and sent via the harvester selector.

    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    Returns:
        dict: the data for the view harvest/layout.html with the keys
        controller, collections, y1, y2, nfound (number of records found)
        and ninsert (number of records inserted).
        When the search fails, an HTML string containing the formatted
        traceback is returned instead.

    """
    try:
        results = harvest.search(db,
                                 request,
                                 filter=harvest.filter_cppm_authors)

    except BaseException:
        # any failure of the search is shown to the user as a traceback
        msg = '<br><br><hr/>'
        msg += CODE(traceback.format_exc()).xml()
        msg += '<hr/>'
        return msg

    # alias to the cds services
    cds, marc12 = results.cds, results.marc12

    # alias which are the same for all records
    id_category = results.id_category
    id_team = results.id_team

    # process talks
    n = 0
    for talk in results.records:

        # get the conference information
        key = talk.conference_key()
        xml = cds.search(c='Conferences', p=key, f='', of='xm')
        conferences = marc12.process(xml)

        # protection against missing conference
        if not conferences:
            # a single parenthesised argument prints the same text with
            # the print statement and with the print function
            print('no information for conference key %s' % (key,))
            continue

        # define alias
        conference = conferences[0]
        conference_dates = conference.conference_dates()
        conference_title = conference.conference_title()
        country = conference.conference_country()
        first_author = talk.first_author()
        title = talk.title()
        year = talk.year()

        # check country
        id_countries = harvest.get_create_id(db, 'countries', country=country)

        # check against already published talks
        # A publication of the same team, with the same conference dates,
        # conference title and first author, and with a similar title, is
        # considered as the same talk. Titles are compared with a similarity
        # ratio rather than for equality, since the stored title is often
        # similar but not equal to the title of the talk.
        query = ((db.publications.conference_dates == conference_dates) &
                 (db.publications.conference_title == conference_title) &
                 (db.publications.first_author == first_author) &
                 (db.publications.id_teams == id_team))

        # stop at the first stored title close enough to the talk title
        match = any(
            difflib.SequenceMatcher(None, title, row.title).ratio()
            > results.ratio
            for row in db(query).select(db.publications.title))

        if match:
            continue

        # eventually insert a new talk
        n += 1
        db.publications.insert(authors=talk.authors(),
                               authors_cppm=first_author,
                               conference_dates=conference_dates,
                               conference_speaker=first_author,
                               conference_title=conference_title,
                               conference_town=conference.conference_town(),
                               conference_url=conference.conference_url(),
                               first_author=first_author,
                               id_categories=id_category,
                               id_countries=id_countries,
                               id_status=1,
                               id_teams=id_team,
                               title=title,
                               year=year)

    # use a common view to display the results of the search
    response.view = 'harvest/layout.html'

    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)
567 568 569 570 571 572 573 574 575 576 577 578 579 580 581


def theses():
    """Placeholder for the controller harvesting theses.

    The controller is meant to scan the cds/invenio stores to find theses
    during a year range for a given team, and to insert the new records
    in the database if they don't exist.

    The scanning is steered using URL arguments.
    Arguments are set and sent via the harvester selector.

    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    Returns:
        str: a fixed message since the scan is not yet implemented.

    """
    message = 'not yet implemented'
    return message