""" Harvest Controllers

Controllers scanning the cds/invenio store for articles and conference
proceedings and inserting the missing ones in the database.

"""
import datetime
5
import harvest
6
import tools
7 8

def index():
    """Placeholder controller: perform no action and return None.

    """
    return None


14 15 16
def articles():
    """Scan the cds/invenio store to find articles published during a
    given year. Insert them in the database if they don't exist.

    The scanning can be steered using URL keyword arguments to select
    the team and the year.

    The syntax of a well-formed URL is:
    localhost:8000/track_publications/harvest/articles?year=2011&team=LHCb

    It selects publications published in 2011 belonging to
    the collection 'LHCb Papers'.

    By default the year is the current one and the collection 'LHCb Papers'.

    Return a string reporting the number of records found in the
    collection (found, not necessarily inserted).

    """
    # setup category
    category = 'ACL'
    id_category = tools.get_create_id(db, 'categories', code=category)

    # setup team
    team = 'LHCb'
    if 'team' in request.vars:
        team = request.vars['team']
    id_team = tools.get_create_id(db, 'teams', team=team)

    # setup year (the request value, when given, overrides the current year)
    year = datetime.datetime.now().strftime('%Y')
    if 'year' in request.vars:
        year = request.vars['year']

    # setup the collection
    collection = '%s Papers' % team

    # get the records from the collection
    records = tools.get_records(collection,
                                year,
                                filter=harvest.filter_published_paper)

    # process each record
    for record in records:

        # check the publisher
        val = record.paper_editor()
        id_publisher = tools.get_create_id(db, 'publishers', abbreviation=val)

        # check the publication
        # NOTE: the year stored with the publication is the one carried by
        # the record itself, kept apart from the year used for the search.
        title = record.title()
        first_author = record.first_author()
        record_year = record.year()

        id_publication = tools.get_id(db, 'publications',
                                      id_publishers=id_publisher,
                                      first_author=first_author,
                                      title=title,
                                      year=record_year)

        # the publication is already in the database, nothing to insert
        if id_publication:
            continue

        # check the collaboration
        val = record.collaboration()
        id_collaboration = tools.get_create_id(db, 'collaborations',
                                               collaboration=val)

        # insert a new publication
        db.publications.insert(title=title,
                               first_author=first_author,
                               authors=record.authors(),
                               id_collaborations=id_collaboration,
                               id_publishers=id_publisher,
                               year=record_year,
                               volume=record.paper_volume(),
                               pages=record.paper_pages(),
                               preprint=record.rapport_number(),
                               publication_url=record.paper_url(),
                               authors_cppm=tools.cppm_authors(record),
                               id_teams=id_team,
                               id_categories=id_category,
                               id_status=1)

    return "%i articles found" % len(records)
tux091's avatar
tux091 committed
94 95


96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
def conferences():
    """Scan the cds/invenio store to find conference proceedings published
    during a given year. Insert them in the database if they don't exist.

    The scanning can be steered using URL keyword arguments to select
    the team and the year.

    The syntax of a well-formed URL is:
    localhost:8000/track_publications/harvest/conferences?year=2011&team=LHCb

    It selects conference proceedings published in 2011 belonging to
    the collection 'LHCb Conference Proceedings'.

    By default the year is the current one and the
    collection 'LHCb Conference Proceedings'.

    Return a string reporting the number of proceedings found in the
    collection (found, not necessarily inserted).

    """
    # setup category
    # NOTE(review): articles() uses tools.get_create_id for its category
    # while this controller only looks the identifier up -- confirm that
    # the 'ACTI' category is guaranteed to exist.
    category = 'ACTI'
    id_category = tools.get_id(db, 'categories', code=category)

    # setup team
    team = 'LHCb'
    if 'team' in request.vars:
        team = request.vars['team']
    id_team = tools.get_create_id(db, 'teams', team=team)

    # setup year (the request value, when given, overrides the current year)
    year = datetime.datetime.now().strftime('%Y')
    if 'year' in request.vars:
        year = request.vars['year']

    # setup the collection
    collection = '%s Conference Proceedings' % team

    # get the records from the collection signed by CPPM authors
    cds = harvest.CdsSvc()
    marc12 = harvest.Marc12Svc()

    xml = cds.search_year(collection, year, rg=500)
    records = marc12.process(xml, filter=tools.filter_cppm)

    # process each record
    for proceeding in records:

        # get the conference information
        # NOTE(review): the [0] indexing here and in the fallback below
        # presumably fails when no record is returned -- confirm that the
        # store always answers with at least one record.
        xml = cds.get_record(proceeding.conference_id())
        conference = marc12.process(xml)[0]

        # NOTE: it happens that the conference identifier is not
        # related to a conference. This happens when the proceeding
        # is published a year later. In that case the solution is
        # to search in the conference collection using the "conference key"
        if not conference.conference_title():
            key = proceeding.conference_key()
            xml = cds.search(c='Conferences', p=key, f='', of='xm')
            conference = marc12.process(xml)[0]

        # check the proceeding
        # NOTE: the year stored with the proceeding is the one carried by
        # the record itself, kept apart from the year used for the search.
        authors = proceeding.authors()
        title = proceeding.title()
        first_author = proceeding.first_author()
        proceeding_year = proceeding.year()

        conference_dates = conference.conference_dates()
        conference_title = conference.conference_title()

        id_publication = tools.get_id(db, 'publications',
                                      authors=authors,
                                      conference_dates=conference_dates,
                                      conference_title=conference_title,
                                      first_author=first_author,
                                      title=title,
                                      year=proceeding_year)

        # the proceeding is already in the database, nothing to insert
        if id_publication:
            continue

        # check country
        id_countries = tools.get_create_id(db,
                                           'countries',
                                           country=conference.conference_country())

        # insert a new proceeding
        db.publications.insert(title=title,
                               first_author=first_author,
                               authors=authors,
                               year=proceeding_year,
                               preprint=proceeding.rapport_number(),
                               publication_url=proceeding.paper_url(),
                               conference_dates=conference_dates,
                               conference_title=conference_title,
                               conference_town=conference.conference_town(),
                               conference_url=conference.conference_url(),
                               conference_speaker=first_author,
                               id_countries=id_countries,
                               authors_cppm=tools.cppm_authors(proceeding),
                               id_teams=id_team,
                               id_categories=id_category,
                               id_status=1)

    return "%i conference proceedings found" % len(records)