harvest.py 13.1 KB
Newer Older
1 2 3
""" Harvest Controllers

"""
4
import difflib
5 6
import harvest

7

8
def articles():
    """Harvest published articles and insert the unknown ones.

    Scan the cds/invenio stores, through ``harvest.search``, to find the
    articles published during a year range and for a given team. Insert
    them in the ``publications`` table if they don't exist.

    The scanning is steered using URL arguments (carried by ``request``).
    Arguments are defined and sent via the harvester selector.
    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    An article is treated as already known when ``harvest.get_id`` finds a
    publication with the same publisher, first author, title and year.

    Returns:
        A dict rendered by the view ``harvest/layout.html`` with the keys
        ``controller``, ``collections``, ``y1``, ``y2``, ``nfound`` (number
        of records returned by the search) and ``ninsert`` (number of rows
        inserted by this call); or the string ``'Error: <message>'`` when
        the search raises.

    """
    try:
        results = harvest.search(
            db,
            request,
            filter=harvest.filter_published_paper_with_cppm_authors)

    # NOTE(review): kept as broad as the original handler because the
    # exception hierarchy of the harvest module is not visible from here;
    # narrowing to Exception could let its errors escape -- confirm.
    except BaseException as e:
        return 'Error: %s' % e

    # process each record
    n = 0
    for record in results.records:

        # publisher id (get_create_id presumably adds the row when it is
        # missing -- confirm in the harvest module)
        id_publisher = harvest.get_create_id(db,
                                             'publishers',
                                             abbreviation=record.paper_editor())

        # skip the article when it is already in the database
        title = record.title()
        first_author = record.first_author()
        year = record.year()

        id_publication = harvest.get_id(db,
                                        'publications',
                                        id_publishers=id_publisher,
                                        first_author=first_author,
                                        title=title,
                                        year=year)
        if id_publication:
            continue

        # collaboration id, resolved only for the articles to be inserted
        id_collaboration = harvest.get_create_id(
            db,
            'collaborations',
            collaboration=record.collaboration())

        # insert the new publication
        n += 1
        db.publications.insert(title=title,
                               first_author=first_author,
                               authors=record.authors(),
                               id_collaborations=id_collaboration,
                               id_publishers=id_publisher,
                               year=year,
                               volume=record.paper_volume(),
                               pages=record.paper_pages(),
                               submitted=record.submitted(),
                               preprint=record.rapport_number(),
                               publication_url=record.paper_url(),
                               authors_cppm=harvest.cppm_authors(record),
                               id_teams=results.id_team,
                               id_categories=results.id_category,
                               id_status=1)

    response.view = 'harvest/layout.html'

    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)
tux091's avatar
tux091 committed
78 79


80
def proceedings():
    """Harvest conference proceedings and insert or update them.

    Scan the cds/invenio stores, through ``harvest.search``, to find the
    conference proceedings published during a year range for a given team.
    Insert them in the ``publications`` table if they don't exist.

    The scanning is steered using URL arguments (carried by ``request``).
    Arguments are defined and sent via the harvester selector.
    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    For each proceeding returned by the search:

    1. The conference record is fetched from the ``Conferences``
       collection using the conference key. When nothing is found a
       message is printed and the proceeding is skipped.
    2. The proceeding is skipped when ``harvest.get_id`` finds a
       publication with the same authors, conference dates and title,
       first author, category, title, URL and year.
    3. Otherwise, when a publication exists for the same conference and
       first author with a title similar enough (``difflib`` ratio above
       ``results.ratio``), that row -- a talk recorded earlier -- is
       updated with the authors, category, URL, title and year of the
       proceeding.
    4. Otherwise a new publication is inserted.

    Returns:
        A dict rendered by the view ``harvest/layout.html`` with the keys
        ``controller``, ``collections``, ``y1``, ``y2``, ``nfound`` (number
        of records returned by the search) and ``ninsert`` (number of rows
        inserted; updated talks are not counted); or the string
        ``'Error: <message>'`` when the search raises.

    """
    # get the proceedings from the invenio store
    # signed by cppm authors
    try:
        results = harvest.search(db,
                                 request,
                                 filter=harvest.filter_cppm_authors)

    # NOTE(review): kept as broad as the original handler because the
    # exception hierarchy of the harvest module is not visible from here;
    # narrowing to Exception could let its errors escape -- confirm.
    except BaseException as e:
        return 'Error: %s' % e

    # alias to the cds services
    cds, marc12 = results.cds, results.marc12

    # process each record
    n = 0
    for proceeding in results.records:

        # get the conference information
        key = proceeding.conference_key()
        xml = cds.search(c='Conferences', p=key, f='', of='xm')
        records = marc12.process(xml)
        if not records:
            # single-argument call form: same output under the Python 2
            # print statement and the Python 3 print function
            print('no information for conference key %s' % (key,))
            continue
        conference = records[0]

        # define alias
        authors = proceeding.authors()
        first_author = proceeding.first_author()
        title = proceeding.title()
        url = proceeding.paper_url()
        year = proceeding.year()

        conference_dates = conference.conference_dates()
        conference_title = conference.conference_title()

        # skip the proceeding when it is already in the database
        id_proceeding = harvest.get_id(db,
                                       'publications',
                                       authors=authors,
                                       conference_dates=conference_dates,
                                       conference_title=conference_title,
                                       first_author=first_author,
                                       id_categories=results.id_category,
                                       title=title,
                                       publication_url=url,
                                       year=year)
        if id_proceeding:
            continue

        # update a talk
        # A talk defines authors, conference parameters, first author,
        # title, a category and a year. Later on this talk might be
        # transformed into a proceeding: the authors, the category, the
        # publication URL, the title and the year are then updated.
        # Often, the title of the talk and the title of the proceeding
        # are similar but not equal, hence the fuzzy comparison below.
        query = ((db.publications.conference_dates == conference_dates) &
                 (db.publications.conference_title == conference_title) &
                 (db.publications.first_author == first_author))

        # first candidate whose title is similar enough, if any
        id_talk = None
        for row in db(query).select(db.publications.id,
                                    db.publications.title):
            matcher = difflib.SequenceMatcher(None, title, row.title)
            if matcher.ratio() > results.ratio:
                id_talk = row.id
                break

        if id_talk is not None:
            db.publications[id_talk] = dict(authors=authors,
                                            id_categories=results.id_category,
                                            publication_url=url,
                                            title=title,
                                            year=year)
            continue

        # insert a new proceeding
        id_countries = harvest.get_create_id(
            db,
            'countries',
            country=conference.conference_country())

        n += 1
        db.publications.insert(title=title,
                               first_author=first_author,
                               authors=authors,
                               year=year,
                               preprint=proceeding.rapport_number(),
                               publication_url=url,
                               conference_dates=conference_dates,
                               conference_title=conference_title,
                               conference_town=conference.conference_town(),
                               conference_url=conference.conference_url(),
                               conference_speaker=first_author,
                               id_countries=id_countries,
                               authors_cppm=harvest.cppm_authors(proceeding),
                               id_teams=results.id_team,
                               id_categories=results.id_category,
                               id_status=1)

    response.view = 'harvest/layout.html'

    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)
199 200


Renaud Le Gac's avatar
Renaud Le Gac committed
201
def reports():
    """Harvest reports and insert the unknown ones.

    Scan the cds/invenio stores, through ``harvest.search``, to find the
    reports published during a year range and for a given team. Insert
    them in the ``publications`` table if they don't exist.

    The scanning is steered using URL arguments (carried by ``request``).
    Arguments are defined and sent via the harvester selector.
    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    A report is treated as already known when ``harvest.get_id`` finds a
    publication with the same category, first author, title and year.

    Returns:
        A dict rendered by the view ``harvest/layout.html`` with the keys
        ``controller``, ``collections``, ``y1``, ``y2``, ``nfound`` (number
        of records returned by the search) and ``ninsert`` (number of rows
        inserted by this call); or the string ``'Error: <message>'`` when
        the search raises.

    """
    try:
        results = harvest.search(db,
                                 request,
                                 filter=harvest.filter_cppm_authors)

    # NOTE(review): kept as broad as the original handler because the
    # exception hierarchy of the harvest module is not visible from here;
    # narrowing to Exception could let its errors escape -- confirm.
    except BaseException as e:
        return 'Error: %s' % e

    # process each record
    n = 0
    for record in results.records:

        # skip the report when it is already in the database
        title = record.title()
        first_author = record.first_author()
        year = record.year()

        id_report = harvest.get_id(db,
                                   'publications',
                                   id_categories=results.id_category,
                                   first_author=first_author,
                                   title=title,
                                   year=year)
        if id_report:
            continue

        # insert a new report
        n += 1
        db.publications.insert(authors=record.authors(),
                               authors_cppm=harvest.cppm_authors(record),
                               id_categories=results.id_category,
                               id_status=1,
                               id_teams=results.id_team,
                               first_author=first_author,
                               publication_url=record.paper_url(),
                               report_numbers=record.rapport_number(),
                               title=title,
                               year=year)

    response.view = 'harvest/layout.html'

    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)
Renaud Le Gac's avatar
Renaud Le Gac committed
258 259
    
    
260
def talks():
    """Harvest conference talks and insert the unknown ones.

    Scan the cds/invenio stores, through ``harvest.search``, to find the
    conference talks for a given year range and team. Insert them in the
    ``publications`` table if they don't exist.

    The scanning is steered using URL arguments (carried by ``request``).
    Arguments are defined and sent via the harvester selector.
    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    For each talk returned by the search:

    1. The conference record is fetched from the ``Conferences``
       collection using the conference key. When nothing is found a
       message is printed and the talk is skipped.
    2. The talk is skipped when a publication exists for the same
       conference dates, conference title and first author with a title
       similar enough (``difflib`` ratio above ``results.ratio``).
    3. Otherwise a new publication is inserted.

    Returns:
        A dict rendered by the view ``harvest/layout.html`` with the keys
        ``controller``, ``collections``, ``y1``, ``y2``, ``nfound`` (number
        of records returned by the search) and ``ninsert`` (number of rows
        inserted by this call); or the string ``'Error: <message>'`` when
        the search raises.

    """
    try:
        results = harvest.search(db,
                                 request,
                                 filter=harvest.filter_cppm_authors)

    # NOTE(review): kept as broad as the original handler because the
    # exception hierarchy of the harvest module is not visible from here;
    # narrowing to Exception could let its errors escape -- confirm.
    except BaseException as e:
        return 'Error: %s' % e

    # alias to the cds services
    cds, marc12 = results.cds, results.marc12

    # process talks
    n = 0
    for talk in results.records:

        # get the conference information
        key = talk.conference_key()
        xml = cds.search(c='Conferences', p=key, f='', of='xm')
        records = marc12.process(xml)
        if not records:
            # Without this guard an unknown conference key raised an
            # IndexError and aborted the whole scan. The talk is skipped
            # instead, as the proceedings controller does.
            print('no information for conference key %s' % (key,))
            continue
        conference = records[0]

        # define alias
        title = talk.title()
        first_author = talk.first_author()
        year = talk.year()

        conference_dates = conference.conference_dates()
        conference_title = conference.conference_title()

        # check the talk
        # A talk defines a title, a first author, conference parameters,
        # a category and a year. Later on this talk might be transformed
        # into a proceeding, which updates the authors, the category, the
        # publication URL and the year.
        # Often, the title of the talk and the title of the proceeding
        # are similar but not equal, hence the fuzzy comparison below.
        query = ((db.publications.conference_dates == conference_dates) &
                 (db.publications.conference_title == conference_title) &
                 (db.publications.first_author == first_author))

        already_known = False
        for row in db(query).select(db.publications.title):
            matcher = difflib.SequenceMatcher(None, title, row.title)
            if matcher.ratio() > results.ratio:
                already_known = True
                break

        if already_known:
            continue

        # insert a new talk
        id_countries = harvest.get_create_id(
            db,
            'countries',
            country=conference.conference_country())

        n += 1
        db.publications.insert(authors=talk.authors(),
                               authors_cppm=first_author,
                               id_categories=results.id_category,
                               id_countries=id_countries,
                               id_status=1,
                               id_teams=results.id_team,
                               conference_dates=conference_dates,
                               conference_title=conference_title,
                               conference_town=conference.conference_town(),
                               conference_speaker=first_author,
                               conference_url=conference.conference_url(),
                               first_author=first_author,
                               title=title,
                               year=year)

    response.view = 'harvest/layout.html'

    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)
350