""" Harvest Controllers

"""
import difflib
import harvest


def articles():
    """Harvest the articles published during a year range for a given team.

    The records are obtained from ``harvest.search`` using the filter
    ``harvest.filter_published_paper_with_cppm_authors``. The search is
    steered by the URL arguments carried by ``request``, which are set and
    sent via the harvester selector. Parameters of the scanning associated
    to this controller are also defined in the database table harvesters.

    Each record falls in one of three cases:

    * it matches an existing article: nothing is done;
    * it matches an existing preprint: the preprint is turned into an
      article;
    * otherwise a new publication is inserted.

    Return the string ``'Error: ...'`` when the search fails, otherwise a
    dictionary rendered by the view ``harvest/layout.html``. Its key
    ``ninsert`` counts the new publications as well as the preprints
    transformed into articles.

    """
    try:
        results = harvest.search(
            db,
            request,
            filter=harvest.filter_published_paper_with_cppm_authors)

    except BaseException as e:
        return 'Error: %s' % e

    ninsert = 0
    for record in results.records:

        # shortcuts
        first_author = record.first_author()
        id_category = results.id_category
        id_team = results.id_team
        title = record.title()
        year = record.year()

        # resolve the publisher and the collaboration
        # (the helper name suggests they are created when missing)
        id_publisher = harvest.get_create_id(
            db, 'publishers', abbreviation=record.paper_editor())

        id_collaboration = harvest.get_create_id(
            db, 'collaborations', collaboration=record.collaboration())

        # nothing to do when the article is already registered
        if harvest.get_id(db,
                          'publications',
                          first_author=first_author,
                          id_publishers=id_publisher,
                          id_teams=id_team,
                          title=title,
                          year=year):
            continue

        # look for a preprint already registered
        # a preprint can be identified by its category which is undefined
        id_preprint = harvest.get_id(db,
                                     'publications',
                                     first_author=first_author,
                                     id_categories=1,
                                     id_teams=id_team,
                                     submitted=record.submitted(),
                                     title=title)

        # both branches below count as an insertion
        ninsert += 1

        if id_preprint:
            # transform the existing preprint into an article
            db.publications[id_preprint] = {
                'id_categories': id_category,
                'id_publishers': id_publisher,
                'pages': record.paper_pages(),
                'publication_url': record.paper_url(),
                'volume': record.paper_volume(),
                'year': year}

        else:
            # eventually register a brand new article
            fields = {
                'authors': record.authors(),
                'authors_cppm': harvest.cppm_authors(record),
                'first_author': first_author,
                'id_categories': id_category,
                'id_collaborations': id_collaboration,
                'id_publishers': id_publisher,
                'id_status': 1,
                'id_teams': id_team,
                'pages': record.paper_pages(),
                'preprint': record.report_number(),
                'publication_url': record.paper_url(),
                'submitted': record.submitted(),
                'title': title,
                'volume': record.paper_volume(),
                'year': year}

            db.publications.insert(**fields)

    # all harvest controllers share the same view
    response.view = 'harvest/layout.html'

    return {'controller': request.function,
            'collections': results.collections,
            'y1': results.y1,
            'y2': results.y2,
            'nfound': len(results.records),
            'ninsert': ninsert}
tux091's avatar
tux091 committed
103 104


105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
def preprints():
    """Scan the cds/invenio stores to find preprints not yet published.

    The scan is performed during a year range for a given team.
    New records are inserted in the database if they don't exist.

    The records are obtained from ``harvest.search`` using the filter
    ``harvest.filter_preprint``. The scanning is steered using URL
    arguments, which are set and sent via the harvester selector.

    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    Return the string ``'Error: ...'`` when the search fails, otherwise a
    dictionary rendered by the view ``harvest/layout.html``. Its key
    ``ninsert`` is the number of preprints inserted in the database.

    """
    try:
        results = harvest.search(db,
                                 request,
                                 filter=harvest.filter_preprint)

    except BaseException as e:
        return 'Error: %s' % e

    # process each record
    n = 0
    for record in results.records:

        # alias
        first_author = record.first_author()
        id_category = results.id_category
        id_team = results.id_team
        preprint = record.report_number()
        title = record.title()
        submitted = record.submitted()

        # The year is required by the insert below.
        # NOTE(review): obtained with record.year() as in the other
        # harvest controllers of this file -- confirm this is the
        # expected value for a preprint.
        year = record.year()

        # check the collaboration
        val = record.collaboration()
        id_collaboration = harvest.get_create_id(db,
                                                 'collaborations',
                                                 collaboration=val)

        # Protection to only keep preprints with authors
        if not first_author:
            # single parenthesised argument: same output with the print
            # statement and with the print function
            print('No authors for preprint: %s [%s]' % (title, preprint))
            continue

        # check against preprint or article already published
        id_publication = harvest.get_id(db,
                                        'publications',
                                        first_author=first_author,
                                        id_teams=id_team,
                                        preprint=preprint,
                                        submitted=submitted,
                                        title=title)
        if id_publication:
            continue

        # eventually insert a new preprint
        n += 1
        db.publications.insert(authors=record.authors(),
                               authors_cppm=harvest.cppm_authors(record),
                               first_author=first_author,
                               id_categories=id_category,
                               id_collaborations=id_collaboration,
                               id_status=1,
                               id_teams=id_team,
                               preprint=preprint,
                               publication_url=record.paper_url(),
                               submitted=submitted,
                               title=title,
                               year=year)

    # use a common view to display the results of the search
    response.view = 'harvest/layout.html'

    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)


182
def proceedings():
183 184
    """Scan the cds/invenio stores to find conference proceedings published 
    during a year range for a given team. Insert them in the database 
185
    if they don't exist.
186
    
187 188
    The scanning is steered using URL arguments.
    Arguments are defined and send via the harvester selector. 
189

190 191
    Parameters of the scanning associated to this controller are also 
    defined in the database table harvesters
192
    
tux091's avatar
tux091 committed
193
    """
194
    # get the proceedings from the invenio store signed by cppm authors
tux091's avatar
tux091 committed
195
    try:
196 197 198
        results = harvest.search(db, 
                                 request, 
                                 filter=harvest.filter_cppm_authors)
199 200
        
    except BaseException, e:
201
        return 'Error: %s' % e
202 203 204

    # alias to the cds services
    cds, marc12 = results.cds, results.marc12
tux091's avatar
tux091 committed
205
    
206
    # process each record
207
    n = 0
208
    for proceeding in results.records:
209
        
tux091's avatar
tux091 committed
210
        # get the conference information
211 212
        key = proceeding.conference_key()
        xml = cds.search(c='Conferences', p=key, f='', of='xm')
213 214 215 216
        conferences = marc12.process(xml)
        
        # protection against missing conference information
        if not conferences:
217 218
            print 'no information for conference key', key
            continue
219

220
        # alias
tux091's avatar
tux091 committed
221
        authors = proceeding.authors()
222 223 224 225 226 227
        id_category = results.id_category
        id_team = results.id_team
        conference = conferences[0]
        conference_dates = conference.conference_dates()
        conference_title = conference.conference_title()
        country = conference.conference_country()
tux091's avatar
tux091 committed
228
        first_author = proceeding.first_author()
legac's avatar
legac committed
229
        report_number = proceeding.report_number()
230 231
        title = proceeding.title()
        url= proceeding.paper_url()
tux091's avatar
tux091 committed
232 233
        year = proceeding.year()

234 235
        # check conference country
        id_countries = harvest.get_create_id(db, 'countries', country=country)
236
        
237
        # check against an already published proceeding
238 239 240 241
        id = harvest.get_id(db, 'publications', authors=authors,
                                                conference_dates=conference_dates,
                                                conference_title=conference_title,
                                                first_author=first_author,
242 243
                                                id_categories=id_category,
                                                id_teams=id_team,
244
                                                publication_url=url,
245
                                                title=title,
246
                                                year=year)
tux091's avatar
tux091 committed
247

248 249
        if id: continue

250
        # update an already published talk
Renaud Le Gac's avatar
Renaud Le Gac committed
251
        # A talk defines authors, conference parameters, first author, title,   
252
        # a category and a year. Latter on this talk might be transform
Renaud Le Gac's avatar
Renaud Le Gac committed
253 254
        # into a proceeding. it update the authors, the category,
        # the publication URL and the year.
255 256 257 258 259 260
        # Often, the title of the talk and the title of the proceeding
        # are similar but not equal. It is why the algorithm is rather complex
        match = False
        
        query = (db.publications.conference_dates==conference_dates)&\
                (db.publications.conference_title==conference_title)&\
261 262
                (db.publications.first_author==first_author)&\
                (db.publications.id_teams==id_team)
263 264 265

        for row in db(query).select(db.publications.id, db.publications.title):
                s = difflib.SequenceMatcher(None, title, row.title)
266
                if s.ratio() > results.ratio:
267 268
                    id = row.id
                    db.publications[id] = dict(authors=authors,
269
                                               id_categories=id_category,
270 271
                                               publication_url=url,
                                               title=title,
legac's avatar
legac committed
272
                                               report_numbers=report_number,
273 274 275 276 277 278 279
                                               year=year)

                    match = True
                    break

        if match: continue
        
280
        # eventually insert a new proceeding
281
        n += 1
282 283
        db.publications.insert(authors=authors,
                               authors_cppm=harvest.cppm_authors(proceeding),
284
                               conference_dates=conference_dates,
285
                               conference_speaker=first_author,
286 287 288
                               conference_title=conference_title,
                               conference_town=conference.conference_town(),
                               conference_url=conference.conference_url(),
289 290
                               first_author=first_author,
                               id_categories=id_category,
291
                               id_countries=id_countries,
292 293 294
                               id_status=1,
                               id_teams=id_team,
                               publication_url=url,
legac's avatar
legac committed
295
                               report_numbers=report_number,
296 297
                               title=title,
                               year=year)
298

299
    # use a common view to display the results of the search    
300 301
    response.view = 'harvest/layout.html'

302 303 304 305 306 307
    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)
308 309


Renaud Le Gac's avatar
Renaud Le Gac committed
310
def reports():
    """Harvest the reports published during a year range for a given team.

    The records are obtained from ``harvest.search`` using the filter
    ``harvest.filter_cppm_authors``. The search is steered by the URL
    arguments carried by ``request``, which are set and sent via the
    harvester selector. Parameters of the scanning associated to this
    controller are also defined in the database table harvesters.

    A record is inserted in the table publications unless a publication
    with the same first author, category, team, title and year exists.

    Return the string ``'Error: ...'`` when the search fails, otherwise a
    dictionary rendered by the view ``harvest/layout.html``. Its key
    ``ninsert`` is the number of reports inserted in the database.

    """
    try:
        results = harvest.search(db,
                                 request,
                                 filter=harvest.filter_cppm_authors)

    except BaseException as e:
        return 'Error: %s' % e

    ninsert = 0
    for record in results.records:

        # criteria identifying a report in the database
        criteria = {'first_author': record.first_author(),
                    'id_categories': results.id_category,
                    'id_teams': results.id_team,
                    'title': record.title(),
                    'year': record.year()}

        # nothing to do when the report is already registered
        if harvest.get_id(db, 'publications', **criteria):
            continue

        # eventually insert a new report
        ninsert += 1
        db.publications.insert(authors=record.authors(),
                               authors_cppm=harvest.cppm_authors(record),
                               first_author=criteria['first_author'],
                               id_categories=criteria['id_categories'],
                               id_status=1,
                               id_teams=criteria['id_teams'],
                               publication_url=record.paper_url(),
                               report_numbers=record.report_number(),
                               title=criteria['title'],
                               year=criteria['year'])

    # all harvest controllers share the same view
    response.view = 'harvest/layout.html'

    return {'controller': request.function,
            'collections': results.collections,
            'y1': results.y1,
            'y2': results.y2,
            'nfound': len(results.records),
            'ninsert': ninsert}
Renaud Le Gac's avatar
Renaud Le Gac committed
372 373
    
    
374
def talks():
375 376
    """Scan the cds/invenio stores to find conference talks for a given 
    year range and team. Insert them in the database if they don't exist.
377
    
378 379
    The scanning is steered using URL arguments.
    Arguments are defined and send via the harvester selector. 
380

381 382
    Parameters of the scanning associated to this controller are also 
    defined in the database table harvesters
383 384 385
    
    """
    try:
386 387 388
        results = harvest.search(db, 
                                 request, 
                                 filter=harvest.filter_cppm_authors)
389 390
        
    except BaseException, e:
391
        return 'Error: %s' % e
392 393 394 395

    # alias to the cds services
    cds, marc12 = results.cds, results.marc12

Renaud Le Gac's avatar
Renaud Le Gac committed
396
    # process talks
397
    n = 0
398 399 400 401 402
    for talk in results.records:

        # get the conference information
        key = talk.conference_key()
        xml = cds.search(c='Conferences', p=key, f='', of='xm')
403 404 405 406 407 408
        conferences = marc12.process(xml)

        # protection against missing conference
        if not conferences:
            print 'no information for conference key', key
            continue
409

410
        # define alias
411 412 413 414 415 416
        conference = conferences[0]
        conference_dates = conference.conference_dates()
        conference_title = conference.conference_title()
        country = conference.conference_country() 
        id_category = results.id_category
        id_team = results.id_team
417
        first_author = talk.first_author()
418
        title = talk.title()
419 420
        year = talk.year()

421 422 423 424
        # check country
        id_countries = harvest.get_create_id(db, 'countries', country=country)

        # check against already published talks
Renaud Le Gac's avatar
Renaud Le Gac committed
425
        # A talk defines title, first author, conference parameter 
426
        # a category and a year. Latter on this talk might be transform
Renaud Le Gac's avatar
Renaud Le Gac committed
427 428
        # into a proceeding. it update the authors, the category,
        # the publication URL and the year.
429 430 431 432 433 434 435
        # Often, the title of the talk and the title of the proceeding
        # are similar but not equal. It is why the algorithm is rather complex.
        
        match = False
        
        query = (db.publications.conference_dates==conference_dates)&\
                (db.publications.conference_title==conference_title)&\
436 437
                (db.publications.first_author==first_author)&\
                (db.publications.id_teams==id_team)
438 439 440

        for row in db(query).select(db.publications.title):
                s = difflib.SequenceMatcher(None, title, row.title)
441
                if s.ratio() > results.ratio:
442 443 444 445
                    match = True
                    break

        if match: continue
446

447
        # eventually insert a new talk
448
        n += 1
449 450 451
        db.publications.insert(authors=talk.authors(),
                               authors_cppm=first_author,
                               conference_dates=conference_dates,
452
                               conference_speaker=first_author,
453 454 455 456
                               conference_title=conference_title,
                               conference_town=conference.conference_town(),
                               conference_url=conference.conference_url(),
                               first_author=first_author,
457 458 459 460
                               id_categories=id_category,
                               id_countries=id_countries,
                               id_status=1,
                               id_teams=id_team,
461 462
                               title=title,
                               year=year)
463
    
464
    # use a common view to display the results of the search    
465
    response.view = 'harvest/layout.html'
466
        
467 468 469 470 471 472
    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)
473 474 475 476 477 478 479 480 481 482 483 484 485 486 487


def theses():
    """Placeholder for the controller harvesting theses.

    It is meant to scan the cds/invenio stores to find theses during a
    year range for a given team, and to insert the new records in the
    database, with a scan steered by URL arguments set and sent via the
    harvester selector.

    Nothing is scanned for the time being: the controller only returns
    a message telling that it is not implemented.

    """
    message = 'not yet implemented'
    return message