Commit fc7a7aa9 authored by tux091's avatar tux091
Browse files

Redesign the tools harvest tools.

parent 643c8f02
......@@ -12,46 +12,33 @@ def index():
def articles():
"""Scan the cds/invenio store to find articles published during a given year.
Insert them in the database if they don't exist.
"""Scan the cds/invenio store to find articles published during a given year
and for a given team. Insert them in the database if they don't exist.
The scanning can be steered using URL keyword arguments to select
the team and the year.
The syntax of a well form URL is:
the team and the year. The syntax of a well-formed URL is:
localhost:8000/track_publications/harvest/articles?year=2011&team=LHCb
It selects publication publised in 2011 belonging to
the collection 'LHCb Papers'.
It selects publications published in 2011 by the team LHCb.
By default the year is the current one and the collection 'LHCb Papers'.
By default the year is the current one and the team LHCb.
An exception is raised when the team is not registered in the database.
"""
# setup category
category = 'ACL'
id_category = tools.get_create_id(db, 'categories', code=category)
# setup team
team = 'LHCb'
if 'team' in request.vars:
team = request.vars['team']
id_team = tools.get_create_id(db, 'teams', team=team)
# setup year
year = datetime.datetime.now().strftime('%Y')
if 'year' in request.vars:
year = request.vars['year']
# setup the collection
collection = '%s Papers' % team
# get the records from the collection
records = tools.get_records(collection,
year,
filter=harvest.filter_published_paper)
# get the articles from the invenio store
try:
search = tools.get_records(db,
request,
filter=harvest.filter_published_paper)
except tools.ToolException, e:
return 'Error: %s' % e
# process each record
for record in records:
for record in search.records:
# check the publisher
val = record.paper_editor()
......@@ -80,17 +67,17 @@ def articles():
authors=record.authors(),
id_collaborations=id_collaboration,
id_publishers=id_publisher,
year=year,
year=search.year,
volume=record.paper_volume(),
pages=record.paper_pages(),
preprint=record.rapport_number(),
publication_url=record.paper_url(),
authors_cppm=tools.cppm_authors(record),
id_teams=id_team,
id_teams=search.id_team,
id_categories=id_category,
id_status=1)
return "%i articles found" % len(records)
return "%i articles found" % len(search.records)
def conferences():
......@@ -102,7 +89,6 @@ def conferences():
The syntax of a well form URL is:
localhost:8000/track_publications/harvest/conferences?year=2011&team=LHCb
It selects conference proceedings published in 2011 belonging to
the collection 'LHCb Conference Proceedings'.
......@@ -114,29 +100,19 @@ def conferences():
category = 'ACTI'
id_category = tools.get_id(db, 'categories', code=category)
# setup team
team = 'LHCb'
if 'team' in request.vars:
team = request.vars['team']
id_team = tools.get_create_id(db, 'teams', team=team)
# get the proceedings from the invenio store
try:
search = tools.get_records(db,
request,
filter=tools.filter_cppm)
except tools.ToolException, e:
return 'Error: %s' % e
# setup year
year = datetime.datetime.now().strftime('%Y')
if 'year' in request.vars:
year = request.vars['year']
# setup the collection
collection = '%s Conference Proceedings' % team
cds = search.cds
marc12 = search.marc12
# get the records from the collection signed by CPPM authors
cds = harvest.CdsSvc()
marc12 = harvest.Marc12Svc()
xml = cds.search_year(collection, year, rg=500)
records = marc12.process(xml, filter=tools.filter_cppm)
# process each record
for proceeding in records:
for proceeding in search.records:
# get the conference information
xml = cds.get_record(proceeding.conference_id())
......@@ -189,7 +165,7 @@ def conferences():
conference_speaker=first_author,
id_countries=id_countries,
authors_cppm=tools.cppm_authors(proceeding),
id_teams=id_team,
id_teams=search.id_team,
id_categories=id_category,
id_status=1)
......
......@@ -7,8 +7,11 @@
"""
import datetime
import harvest
from gluon.storage import Storage
# list of CPPM authors -- to be extract from somewhere else
CPPM_AUTHORS = [u'C. Adrover',
u'E. Aslanides',
......@@ -31,6 +34,17 @@ CPPM_AUTHORS = [u'C. Adrover',
u'M. Sapunov',
u'A. Tsaregorodtsev']
# Converters: map the controller name (the last element of the URL path,
# i.e. request.function) to the template of the invenio collection name.
# The template is filled with the team name, e.g. 'LHCb Papers'.
# The keys have to match the controller names exactly.
CTRL_TO_COLLECTION = {'articles': '%s Papers',
                      'conferences': '%s Conference Proceedings',
                      'talks': '%s Talks'}

# Default team for the harvest search
TEAM = 'LHCb'
class ToolException(Exception):
    """Error raised by the harvest tools.

    It derives from Exception, not BaseException, so that generic
    ``except Exception`` handlers also see it. BaseException is reserved
    for system-exiting exceptions such as KeyboardInterrupt or SystemExit.

    """
def cppm_authors(record):
"""Helper function to find the CPPM authors signing the record.
......@@ -44,6 +58,50 @@ def cppm_authors(record):
return rep
def decode_url(db, request):
    """Decode the URL to find the parameters of the harvest search.

    The syntax of a well-formed URL is:

        localhost:8000/track_publications/harvest/xxx?year=2011&team=LHCb

    where xxx is the controller name (request.function).

    The parameters of the harvest search are the collection and the year.
    The collection is derived from the team argument and the controller
    name, using the CTRL_TO_COLLECTION converter.

    By default the year is the current one and the team is TEAM.

    Raise a ToolException when the team is not registered in the database
    or when no collection is associated to the controller.

    Return a Storage with the following keys:
    collection, id_team, team, and year.

    """
    # setup the team -- it is never created here, it has to exist already
    team = request.vars['team'] if 'team' in request.vars else TEAM

    id_team = get_id(db, 'teams', team=team)
    if not id_team:
        raise ToolException('Team not defined in the database.')

    # setup the collection
    # An unknown controller is reported as a ToolException, the error
    # handled by the callers, rather than as a bare KeyError.
    ctrl = request.function
    if ctrl not in CTRL_TO_COLLECTION:
        msg = 'No collection defined for the controller "%s".' % ctrl
        raise ToolException(msg)
    collection = CTRL_TO_COLLECTION[ctrl] % team

    # setup the year
    if 'year' in request.vars:
        year = request.vars['year']
    else:
        year = datetime.datetime.now().strftime('%Y')

    return Storage(collection=collection,
                   id_team=id_team,
                   team=team,
                   year=year)
def filter_cppm(record):
"""Filter selecting record signed by CPPM authors
......@@ -59,7 +117,7 @@ def fix_cppm_authors(record, reference):
Scan the list of authors against a reference list.
Return a string where author names are separated by comma.
Empty string when there is no authors
Return an empty string when there is no authors
"""
def family_name(x):
......@@ -114,27 +172,37 @@ def get_create_id(db, table, **kwargs):
return id
def get_records(db, request, filter=None, func=None, max_records=200):
    """Helper function to run the harvest search for a given collection
    and for a given year.

    The collection and the year are obtained by decoding the URL of the
    request, see decode_url for details. A ToolException raised by
    decode_url is propagated to the caller.

    The keyword arguments filter and func are references to functions
    which are passed to harvest.Marc12Svc.process.
    NOTE(review): filter is expected to skip the records which don't
    satisfy the user criteria and func to be applied to the surviving
    harvest.Record -- to be confirmed against harvest.Marc12Svc.

    The keyword argument max_records is passed as rg to the search.

    Return a Storage with the following keys:
    cds, collection, id_team, marc12, team, records and year.

    """
    # start the CDS service and the marc12 converter
    cds = harvest.CdsSvc()
    marc12 = harvest.Marc12Svc()

    # decode the url to get the harvest parameters
    par = decode_url(db, request)

    # extract the list of publications from invenio
    xml = cds.search_year(par.collection, par.year, rg=max_records)
    records = marc12.process(xml, filter=filter, func=func)

    return Storage(cds=cds,
                   collection=par.collection,
                   id_team=par.id_team,
                   marc12=marc12,
                   team=par.team,
                   records=records,
                   year=par.year)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment