Commit 00ac2184 authored by legac's avatar legac
Browse files

Add the preprint in harvester controller.

Review all harvesters and polish publication grid display.
parent 781a3068
......@@ -17,9 +17,8 @@ def articles():
"""
try:
results = harvest.search(db,
request,
filter=harvest.filter_published_paper_with_cppm_authors)
filter = harvest.filter_published_paper_with_cppm_authors
results = harvest.search(db, request, filter=filter)
except BaseException, e:
return 'Error: %s' % e
......@@ -28,45 +27,71 @@ def articles():
n = 0
for record in results.records:
# check the publisher
val = record.paper_editor()
id_publisher = harvest.get_create_id(db, 'publishers', abbreviation=val)
# check the publication
title = record.title()
# alias
first_author = record.first_author()
id_category = results.id_category
id_team = results.id_team
title = record.title()
year = record.year()
id = harvest.get_id(db, 'publications', id_publishers=id_publisher,
first_author=first_author,
# check the publisher and the collaboration
val = record.paper_editor()
id_publisher = harvest.get_create_id(db,
'publishers',
abbreviation=val)
val = record.collaboration()
id_collaboration = harvest.get_create_id(db,
'collaborations',
collaboration=val)
# check against already published articles
id = harvest.get_id(db, 'publications', first_author=first_author,
id_publishers=id_publisher,
id_teams=id_team,
title=title,
year=year)
if id: continue
# insert a new publications
# check the collaboration
val = record.collaboration()
id_collaboration = harvest.get_create_id(db, 'collaborations', collaboration=val)
# check against already published preprint
# a preprint can be identified by its category which is undefined
id = harvest.get_id(db, 'publications', first_author=first_author,
id_categories=1,
id_teams=id_team,
submitted=record.submitted(),
title=title)
# transform an existing preprint into article
if id:
n += 1
db.publications[id] = dict(id_categories=id_category,
id_publishers=id_publisher,
pages=record.paper_pages(),
publication_url=record.paper_url(),
volume=record.paper_volume(),
year=year)
continue
# insert
# eventually insert a new article in the database
n += 1
db.publications.insert(title=title,
db.publications.insert(authors=record.authors(),
authors_cppm=harvest.cppm_authors(record),
first_author=first_author,
authors=record.authors(),
id_categories=id_category,
id_collaborations=id_collaboration,
id_publishers=id_publisher,
year=year,
volume=record.paper_volume(),
id_status=1,
id_teams=id_team,
pages=record.paper_pages(),
submitted=record.submitted(),
preprint=record.rapport_number(),
publication_url=record.paper_url(),
authors_cppm=harvest.cppm_authors(record),
id_teams=results.id_team,
id_categories=results.id_category,
id_status=1)
submitted=record.submitted(),
title=title,
volume=record.paper_volume(),
year=year)
# use a common view to display the results of the search
response.view = 'harvest/layout.html'
return dict(controller=request.function,
......@@ -77,6 +102,83 @@ def articles():
ninsert=n)
def preprints():
    """Scan the cds/invenio stores to find preprints not yet published.

    The scan is performed during a year range for a given team.
    New records are inserted in the database if they don't exist.

    The scanning is steered using URL arguments.
    Arguments are set and sent via the harvester selector.
    Parameters of the scanning associated to this controller are also
    defined in the database table harvesters.

    Returns:
        The string ``'Error: ...'`` when ``harvest.search`` raises.
        Otherwise a dictionary rendered by the common view
        ``harvest/layout.html`` with the keys ``controller``,
        ``collections``, ``y1``, ``y2``, ``nfound`` (number of records
        returned by the search) and ``ninsert`` (number of rows inserted
        in ``db.publications``).

    """
    try:
        results = harvest.search(db,
                                 request,
                                 filter=harvest.filter_preprint)

    # same broad catch as the sibling harvesters: any failure of the
    # search is reported as a plain string to the user
    except BaseException as e:
        return 'Error: %s' % e

    # alias, identical for all the records
    id_category = results.id_category
    id_team = results.id_team

    # process each record
    n = 0
    for record in results.records:

        # alias
        first_author = record.first_author()
        preprint = record.rapport_number()
        submitted = record.submitted()
        title = record.title()

        # protection to only keep preprints with authors
        if not first_author:
            print('No authors for preprint: %s [%s]' % (title, preprint))
            continue

        # check against preprint or article already published
        id_publication = harvest.get_id(db,
                                        'publications',
                                        first_author=first_author,
                                        id_teams=id_team,
                                        preprint=preprint,
                                        submitted=submitted,
                                        title=title)
        if id_publication:
            continue

        # check the collaboration
        # resolved only for records which are really inserted, so that
        # skipped records do not trigger harvest.get_create_id
        # NOTE(review): presumably get_create_id adds a row when the
        # collaboration is unknown -- confirm in the harvest module
        id_collaboration = harvest.get_create_id(
            db,
            'collaborations',
            collaboration=record.collaboration())

        # eventually insert a new preprint
        # the year is taken from the record, as in the other harvesters;
        # the name ``year`` was never bound in this function
        n += 1
        db.publications.insert(authors=record.authors(),
                               authors_cppm=harvest.cppm_authors(record),
                               first_author=first_author,
                               id_categories=id_category,
                               id_collaborations=id_collaboration,
                               id_status=1,
                               id_teams=id_team,
                               preprint=preprint,
                               publication_url=record.paper_url(),
                               submitted=submitted,
                               title=title,
                               year=record.year())

    # use a common view to display the results of the search
    response.view = 'harvest/layout.html'

    return dict(controller=request.function,
                collections=results.collections,
                y1=results.y1,
                y2=results.y2,
                nfound=len(results.records),
                ninsert=n)
def proceedings():
"""Scan the cds/invenio stores to find conference proceedings published
during a year range for a given team. Insert them in the database
......@@ -89,8 +191,7 @@ def proceedings():
defined in the database table harvesters
"""
# get the proceedings from the invenio store
# signed by cppm authors
# get the proceedings from the invenio store signed by cppm authors
try:
results = harvest.search(db,
request,
......@@ -109,35 +210,43 @@ def proceedings():
# get the conference information
key = proceeding.conference_key()
xml = cds.search(c='Conferences', p=key, f='', of='xm')
records = marc12.process(xml)
if not records:
conferences = marc12.process(xml)
# protection against missing conference information
if not conferences:
print 'no information for conference key', key
continue
conference = records[0]
# define alias
# alias
authors = proceeding.authors()
id_category = results.id_category
id_team = results.id_team
conference = conferences[0]
conference_dates = conference.conference_dates()
conference_title = conference.conference_title()
country = conference.conference_country()
first_author = proceeding.first_author()
title = proceeding.title()
url= proceeding.paper_url()
year = proceeding.year()
conference_dates = conference.conference_dates()
conference_title = conference.conference_title()
# check conference country
id_countries = harvest.get_create_id(db, 'countries', country=country)
# check the proceeding
# check against an already published proceeding
id = harvest.get_id(db, 'publications', authors=authors,
conference_dates=conference_dates,
conference_title=conference_title,
first_author=first_author,
id_categories=results.id_category,
title=title,
id_categories=id_category,
id_teams=id_team,
publication_url=url,
title=title,
year=year)
if id: continue
# update a talk
# update an already published talk
# A talk defines authors, conference parameters, first author, title,
# a category and a year. Later on this talk might be transformed
# into a proceeding. It updates the authors, the category,
......@@ -148,14 +257,15 @@ def proceedings():
query = (db.publications.conference_dates==conference_dates)&\
(db.publications.conference_title==conference_title)&\
(db.publications.first_author==first_author)
(db.publications.first_author==first_author)&\
(db.publications.id_teams==id_team)
for row in db(query).select(db.publications.id, db.publications.title):
s = difflib.SequenceMatcher(None, title, row.title)
if s.ratio() > results.ratio:
id = row.id
db.publications[id] = dict(authors=authors,
id_categories=results.id_category,
id_categories=id_category,
publication_url=url,
title=title,
year=year)
......@@ -165,29 +275,26 @@ def proceedings():
if match: continue
# insert a new proceeding
id_countries = harvest.get_create_id(db,
'countries',
country=conference.conference_country())
# eventually insert a new proceeding
n += 1
db.publications.insert(title=title,
first_author=first_author,
authors=authors,
year=year,
preprint=proceeding.rapport_number(),
publication_url=url,
db.publications.insert(authors=authors,
authors_cppm=harvest.cppm_authors(proceeding),
conference_dates=conference_dates,
conference_speaker=first_author,
conference_title=conference_title,
conference_town=conference.conference_town(),
conference_url=conference.conference_url(),
conference_speaker=first_author,
first_author=first_author,
id_categories=id_category,
id_countries=id_countries,
authors_cppm=harvest.cppm_authors(proceeding),
id_teams=results.id_team,
id_categories=results.id_category,
id_status=1)
id_status=1,
id_teams=id_team,
preprint=proceeding.rapport_number(),
publication_url=url,
title=title,
year=year)
# use a common view to display the results of the search
response.view = 'harvest/layout.html'
return dict(controller=request.function,
......@@ -222,31 +329,36 @@ def reports():
n = 0
for record in results.records:
# check the report
title = record.title()
# alias
first_author = record.first_author()
id_category = results.id_category
id_team = results.id_team
title = record.title()
year = record.year()
id = harvest.get_id(db, 'publications', id_categories=results.id_category,
first_author=first_author,
# check against already published reports
id = harvest.get_id(db, 'publications', first_author=first_author,
id_categories=id_category,
id_teams=id_team,
title=title,
year=year)
if id: continue
# insert a new report
# eventually insert a new report
n += 1
db.publications.insert(authors=record.authors(),
authors_cppm=harvest.cppm_authors(record),
id_categories=results.id_category,
id_status=1,
id_teams=results.id_team,
first_author=first_author,
id_categories=id_category,
id_status=1,
id_teams=id_team,
publication_url=record.paper_url(),
report_numbers=record.rapport_number(),
title=title,
year=year)
# use a common view to display the results of the search
response.view = 'harvest/layout.html'
return dict(controller=request.function,
......@@ -286,17 +398,28 @@ def talks():
# get the conference information
key = talk.conference_key()
xml = cds.search(c='Conferences', p=key, f='', of='xm')
conference = marc12.process(xml)[0]
conferences = marc12.process(xml)
# protection against missing conference
if not conferences:
print 'no information for conference key', key
continue
# define alias
title = talk.title()
conference = conferences[0]
conference_dates = conference.conference_dates()
conference_title = conference.conference_title()
country = conference.conference_country()
id_category = results.id_category
id_team = results.id_team
first_author = talk.first_author()
title = talk.title()
year = talk.year()
conference_dates = conference.conference_dates()
conference_title = conference.conference_title()
# check the talk
# check country
id_countries = harvest.get_create_id(db, 'countries', country=country)
# check against already published talks
# A talk defines a title, a first author, conference parameters,
# a category and a year. Later on this talk might be transformed
# into a proceeding. It updates the authors, the category,
......@@ -308,7 +431,8 @@ def talks():
query = (db.publications.conference_dates==conference_dates)&\
(db.publications.conference_title==conference_title)&\
(db.publications.first_author==first_author)
(db.publications.first_author==first_author)&\
(db.publications.id_teams==id_team)
for row in db(query).select(db.publications.title):
s = difflib.SequenceMatcher(None, title, row.title)
......@@ -318,27 +442,24 @@ def talks():
if match: continue
# insert a new talk
id_countries = harvest.get_create_id(db,
'countries',
country=conference.conference_country())
# eventually insert a new talk
n += 1
db.publications.insert(authors=talk.authors(),
authors_cppm=first_author,
id_categories=results.id_category,
id_countries=id_countries,
id_status=1,
id_teams=results.id_team,
conference_dates=conference_dates,
conference_speaker=first_author,
conference_title=conference_title,
conference_town=conference.conference_town(),
conference_speaker=first_author,
conference_url=conference.conference_url(),
first_author=first_author,
id_categories=id_category,
id_countries=id_countries,
id_status=1,
id_teams=id_team,
title=title,
year=year)
# use a common view to display the results of the search
response.view = 'harvest/layout.html'
return dict(controller=request.function,
......@@ -347,4 +468,18 @@ def talks():
y2=results.y2,
nfound=len(results.records),
ninsert=n)
\ No newline at end of file
def theses():
    """Placeholder for the harvester of theses.

    The intent is to scan the cds/invenio stores to find theses
    during a year range for a given team, and to insert the new
    records in the database when they don't exist.

    The scanning is meant to be steered using URL arguments, set and
    sent via the harvester selector. Parameters of the scanning
    associated to this controller are also defined in the database
    table harvesters.

    For the time being no scan is performed: the controller only
    returns a fixed message.

    """
    message = 'not yet implemented'
    return message
\ No newline at end of file
......@@ -11,7 +11,9 @@
dummy = DAL(None)
#
# harvester selector
#
dummy.define_table('harvester_selector',
Field('year_start', 'integer', default=year),
Field('year_end', 'integer'),
......@@ -22,25 +24,27 @@ dummy.harvester_selector.id_teams.requires = IS_IN_DB(db,
'teams.id',
'teams.team')
dummy.harvester_selector.store.requires = IS_IN_SET(['articles',
dummy.harvester_selector.store.requires = IS_IN_SET(['articles',
'preprints',
'proceedings',
'reports',
'talks'])
'talks',
'theses'])
#
# list selector
#
dummy.define_table('list_selector',
Field('year_start', 'integer', default=year),
Field('year_end', 'integer'),
Field('id_teams', db.teams, default=2, label='Team'),
Field('author', 'string'),
Field('list', 'string', default=T('basic')),
Field('list', 'string', default='basic'),
Field('format', 'string', default='HTML'))
dummy.list_selector.id_teams.requires = IS_IN_DB(db,
'teams.id',
'teams.team')
dummy.list_selector.list.requires = IS_IN_SET([T('basic')])
dummy.list_selector.list.requires = IS_IN_SET(['basic'])
dummy.list_selector.format.requires = IS_IN_SET(['HTML', 'latex', 'odt'])
......@@ -85,12 +85,18 @@ if db(db.teams.id).count() == 1:
# Harvester parameters
if db(db.harvesters.id).count() == 0:
db.harvesters.insert(id=2,
db.harvesters.insert(id=1,
id_teams=2,
controller='articles',
collections='LHCb Papers, LHCb Detector Performance Papers',
id_categories=2)
db.harvesters.insert(id=2,
id_teams=2,
controller='preprints',
collections='LHCb Papers, LHCb Detector Performance Papers',
id_categories=1)
db.harvesters.insert(id=3,
id_teams=2,
controller='proceedings',
......@@ -101,7 +107,7 @@ if db(db.harvesters.id).count() == 0:
db.harvesters.insert(id=4,
id_teams=2,
controller='reports',
collections='LHCb Conference Contributions, LHCb Public Notes',
collections='LHCb Public Notes, LHCb Reports',
id_categories=14)
db.harvesters.insert(id=5,
......@@ -110,3 +116,9 @@ if db(db.harvesters.id).count() == 0:
collections='LHCb Talks',
id_categories=9,
ratio=0.9)
db.harvesters.insert(id=6,
id_teams=2,
controller='theses',
collections='LHCb Theses',
id_categories=1)
......@@ -145,6 +145,7 @@ tpl = ['<b>{PublicationsTitle}</b><br>',
'<tpl if="!PublicationsConference_title">{PublicationsFirst_author} et al</tpl>'
'<tpl if="PublicationsConference_title">{PublicationsFirst_author}</tpl>'
'<tpl if="PublicationsId_collaborations &gt; 1">, {CollaborationsCollaboration}</tpl>',
'<tpl if="PublicationsId_categories==1">, {PublicationsPreprint}</tpl>',
'<tpl if="PublicationsId_publishers &gt; 1">, {PublishersAbbreviation}',
'<tpl if="PublicationsVolume"> {PublicationsVolume}</tpl>',
'<tpl if="PublicationsYear"> ({PublicationsYear}) </tpl>',
......
......@@ -78,15 +78,25 @@ def filter_cppm_authors(record):
return False
def filter_preprint(record):
    """Filter selecting a preprint not yet published.

    The record is kept when it has a preprint (report) number and
    a date of submission, and when filter_published_paper rejects it.

    The result is meant to be used as a truth value: when the preprint
    number or the submission date is missing, that falsy value is
    returned as it is; otherwise a boolean is returned.

    """
    # no preprint number: hand back the falsy value itself
    number = record.rapport_number()
    if not number:
        return number

    # no date of submission: same treatment
    date = record.submitted()
    if not date:
        return date

    # a preprint must not be an already published paper
    return not filter_published_paper(record)
def filter_published_paper(record):
    """Filter selecting a published paper.

    The paper should have a publisher and a volume number.

    Args:
        record: object providing the methods ``paper_editor()`` and
            ``paper_volume()``.

    Returns:
        bool: True when both the publisher and the volume are set
        (truthy), False otherwise.

    """
    # a single return with an explicit boolean: callers get True / False
    # and never the raw publisher or volume value
    return bool(record.paper_editor() and record.paper_volume())
def filter_published_paper_with_cppm_authors(record):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment