# harvest.py
""" Harvest Controllers

"""
import difflib
import harvest
import tools


# Similarity threshold for titles: two titles are treated as designating the
# same talk/proceeding when difflib.SequenceMatcher(...).ratio() is strictly
# greater than this value.
STRING_MATCH = 0.6


def index():
    """Default action of the harvest controller.

    It performs no work and returns ``None``.

    """
    return


def articles():
    """Scan the cds/invenio store to find articles published during a given
    year and for a given team. Insert them in the database if they don't exist.

    The scanning can be steered using URL keyword arguments to select
    the team and the year. The syntax of a well formed URL is:
    localhost:8000/track_publications/harvest/articles?year=2011&team=LHCb
    It selects publications published in 2011 by the team LHCb.

    By default the year is the current one and the team LHCb.

    Returns a string: either ``'Error: ...'`` when the search raised, or
    the number of records returned by the search (new or already known).

    """
    # NOTE(review): ``db`` and ``request`` are not defined in this file;
    # presumably they are injected by the web framework -- confirm.
    try:
        results = tools.search(db,
                               request,
                               filter=harvest.filter_published_paper)

    # NOTE(review): BaseException also traps SystemExit/KeyboardInterrupt.
    # Kept as in the original; confirm whether Exception would suffice.
    except BaseException as e:
        return 'Error: %s' % e

    # process each record
    for record in results.records:

        # check the publisher
        abbreviation = record.paper_editor()
        id_publisher = tools.get_create_id(db,
                                           'publishers',
                                           abbreviation=abbreviation)

        # check the publication
        title = record.title()
        first_author = record.first_author()
        year = record.year()

        id_publication = tools.get_id(db,
                                      'publications',
                                      id_publishers=id_publisher,
                                      first_author=first_author,
                                      title=title,
                                      year=year)

        # already in the database, nothing to do
        if id_publication:
            continue

        # insert a new publication
        # check the collaboration
        collaboration = record.collaboration()
        id_collaboration = tools.get_create_id(db,
                                               'collaborations',
                                               collaboration=collaboration)

        # insert
        db.publications.insert(title=title,
                               first_author=first_author,
                               authors=record.authors(),
                               id_collaborations=id_collaboration,
                               id_publishers=id_publisher,
                               year=year,
                               volume=record.paper_volume(),
                               pages=record.paper_pages(),
                               preprint=record.rapport_number(),
                               publication_url=record.paper_url(),
                               authors_cppm=tools.cppm_authors(record),
                               id_teams=results.id_team,
                               id_categories=results.id_category,
                               id_status=1)

    return "%i articles found" % len(results.records)


def proceedings():
    """Scan the cds/invenio store to find conference proceedings published
    during a given year for a given team. Insert them in the database
    if they don't exist.

    The scanning can be steered using URL keyword arguments to select
    the team and the year.

    The syntax of a well formed URL is:
    localhost:8000/track_publications/harvest/proceedings?year=2011&team=LHCb
    It selects conference proceedings published in 2011 by a member of the
    team LHCb.

    By default the year is the current one and the team LHCb.

    For each record, one of three things happens:

    * an identical publication already exists: nothing is done;
    * a publication with the same conference dates, conference title and
      first author has a sufficiently similar title (see ``STRING_MATCH``):
      that row is updated in place;
    * otherwise a new publication is inserted.

    Returns a string: either ``'Error: ...'`` when the search raised, or
    the number of records returned by the search.

    """
    # get the proceedings from the invenio store
    # signed by cppm authors
    # NOTE(review): ``db`` and ``request`` are not defined in this file;
    # presumably they are injected by the web framework -- confirm.
    try:
        results = tools.search(db,
                               request,
                               filter=tools.filter_cppm_authors)

    # NOTE(review): BaseException also traps SystemExit/KeyboardInterrupt.
    # Kept as in the original; confirm whether Exception would suffice.
    except BaseException as e:
        return 'Error: %s' % e

    # alias to the cds services
    cds, marc12 = results.cds, results.marc12

    # process each record
    for proceeding in results.records:

        # get the conference information
        # NOTE(review): the [0] raises IndexError if the processed result
        # is empty -- confirm a conference record is always found.
        key = proceeding.conference_key()
        xml = cds.search(c='Conferences', p=key, f='', of='xm')
        conference = marc12.process(xml)[0]

        # define alias
        authors = proceeding.authors()
        first_author = proceeding.first_author()
        title = proceeding.title()
        url = proceeding.paper_url()
        year = proceeding.year()

        conference_dates = conference.conference_dates()
        conference_title = conference.conference_title()

        # check the proceeding
        id_publication = tools.get_id(db,
                                      'publications',
                                      authors=authors,
                                      conference_dates=conference_dates,
                                      conference_title=conference_title,
                                      first_author=first_author,
                                      id_categories=results.id_category,
                                      title=title,
                                      publication_url=url,
                                      year=year)

        # already in the database, nothing to do
        if id_publication:
            continue

        # update a talk
        # A talk defines authors, conference parameters, first author, title,
        # a category and a year. Later on this talk might be transformed
        # into a proceeding. That updates the authors, the category,
        # the publication url, the title and the year.
        # Often, the title of the talk and the title of the proceeding
        # are similar but not equal. This is why the algorithm is rather
        # complex: candidates are selected on conference and first author,
        # then the titles are compared approximately.
        updated = False

        query = (db.publications.conference_dates == conference_dates) & \
                (db.publications.conference_title == conference_title) & \
                (db.publications.first_author == first_author)

        for row in db(query).select(db.publications.id, db.publications.title):
            matcher = difflib.SequenceMatcher(None, title, row.title)
            if matcher.ratio() > STRING_MATCH:
                # only the first sufficiently similar row is updated
                db.publications[row.id] = dict(authors=authors,
                                               id_categories=results.id_category,
                                               publication_url=url,
                                               title=title,
                                               year=year)
                updated = True
                break

        if updated:
            continue

        # insert a new proceeding
        id_countries = tools.get_create_id(db,
                                           'countries',
                                           country=conference.conference_country())

        db.publications.insert(title=title,
                               first_author=first_author,
                               authors=authors,
                               year=year,
                               preprint=proceeding.rapport_number(),
                               publication_url=url,
                               conference_dates=conference_dates,
                               conference_title=conference_title,
                               conference_town=conference.conference_town(),
                               conference_url=conference.conference_url(),
                               conference_speaker=first_author,
                               id_countries=id_countries,
                               authors_cppm=tools.cppm_authors(proceeding),
                               id_teams=results.id_team,
                               id_categories=results.id_category,
                               id_status=1)

    return "%i conference proceedings found" % len(results.records)


def talks():
    """Scan the cds/invenio store to find conference talks for a given
    year and team. Insert them in the database if they don't exist.

    The scanning can be steered using URL keyword arguments to select
    the team and the year.

    The syntax of a well formed URL is:
    localhost:8000/track_publications/harvest/talks?year=2011&team=LHCb
    It selects conference talks given in 2011 by a member of the team LHCb.

    By default the year is the current one and the team LHCb.

    A talk is considered already known when a publication with the same
    conference dates, conference title and first author has a sufficiently
    similar title (see ``STRING_MATCH``).

    Returns a string: either ``'Error: ...'`` when the search raised, or
    the number of records returned by the search.

    """
    # NOTE(review): ``db`` and ``request`` are not defined in this file;
    # presumably they are injected by the web framework -- confirm.
    try:
        results = tools.search(db,
                               request,
                               filter=tools.filter_cppm_authors)

    # NOTE(review): BaseException also traps SystemExit/KeyboardInterrupt.
    # Kept as in the original; confirm whether Exception would suffice.
    except BaseException as e:
        return 'Error: %s' % e

    # alias to the cds services
    cds, marc12 = results.cds, results.marc12

    # process talks
    for talk in results.records:

        # get the conference information
        # NOTE(review): the [0] raises IndexError if the processed result
        # is empty -- confirm a conference record is always found.
        key = talk.conference_key()
        xml = cds.search(c='Conferences', p=key, f='', of='xm')
        conference = marc12.process(xml)[0]

        # define alias
        title = talk.title()
        first_author = talk.first_author()
        year = talk.year()

        conference_dates = conference.conference_dates()
        conference_title = conference.conference_title()

        # check the talk
        # A talk defines title, first author, conference parameters,
        # a category and a year. Later on this talk might be transformed
        # into a proceeding. That updates the authors, the category,
        # the publication url and the year.
        # Often, the title of the talk and the title of the proceeding
        # are similar but not equal. This is why the titles are compared
        # approximately rather than for equality.
        known = False

        query = (db.publications.conference_dates == conference_dates) & \
                (db.publications.conference_title == conference_title) & \
                (db.publications.first_author == first_author)

        for row in db(query).select(db.publications.title):
            matcher = difflib.SequenceMatcher(None, title, row.title)
            if matcher.ratio() > STRING_MATCH:
                known = True
                break

        if known:
            continue

        # insert a new talk
        id_countries = tools.get_create_id(db,
                                           'countries',
                                           country=conference.conference_country())

        # NOTE(review): authors_cppm is filled with the first author here,
        # whereas articles() and proceedings() use tools.cppm_authors(...)
        # -- confirm this difference is intended.
        db.publications.insert(authors=talk.authors(),
                               authors_cppm=first_author,
                               id_categories=results.id_category,
                               id_countries=id_countries,
                               id_status=1,
                               id_teams=results.id_team,
                               conference_dates=conference_dates,
                               conference_title=conference_title,
                               conference_town=conference.conference_town(),
                               conference_speaker=first_author,
                               conference_url=conference.conference_url(),
                               first_author=first_author,
                               title=title,
                               year=year)

    return "%i conference talks found" % len(results.records)