Commit 6cff9aaa authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Merge branch '8-10-multi-store' into 'master'

8 10 search publications in several stores

* The origin field is now a string holding a comma-separated list of oai_url values, one per store (see the sketch just after this header block).
* New methods on the class `Record`: `primary_oai`, `primary_oai_url`, `secondary_oai`, `secondary_oai_url`.
* Changed the signature of the methods `Automaton._is_record_in_db`, `Automaton.get_record_by_fields` and `Articles.get_record_by_origin`.
* Modified the harvester logic to search for articles in several stores.
* Close #8, #10

See merge request !31
parents fa26d23a 061b00d6
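The new `Record` helpers themselves are not part of the hunks shown below; as a rough illustration of their intended semantics (assumed, not verbatim), with `origin` stored as a comma-separated string:

    # Illustrative sketch only; the real methods live in invenio_tools/record.py.
    # Assumption: origin is "primary_url" or "primary_url,secondary_url".
    def split_origin(origin):
        """Return (primary_oai_url, secondary_oai_url) from an origin string."""
        urls = [url.strip() for url in origin.split(',') if url.strip()]
        primary = urls[0] if urls else None
        secondary = urls[1] if len(urls) > 1 else None
        return (primary, secondary)

    # split_origin("http://cds.cern.ch/record/123,https://inspirehep.net/record/456")
    # -> ("http://cds.cern.ch/record/123", "https://inspirehep.net/record/456")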
......@@ -156,11 +156,20 @@ def edit_insert():
values['PublicationsId_collaborations'] = int(recId)
# teams, project, categories, origin
# teams, project, categories
values['PublicationsId_categories'] = int(selector.id_categories)
values['PublicationsId_projects'] = int(selector.id_projects)
values['PublicationsId_teams'] = int(selector.id_teams)
values['PublicationsOrigin'] = OAI_URL % (selector.host, selector.record_id)
# origin
# Note:
# - the origin is always defined
# - when the record carries no oai_url, recover it with a
#   trivial algorithm from the host and the record id
oai_url = record.oai_url()
if not oai_url:
oai_url = OAI_URL % (selector.host, selector.record_id)
values['PublicationsOrigin'] = oai_url
# publishers
if selector.controller in ('articles', 'proceedings'):
......@@ -299,24 +308,31 @@ def run():
selector = Selector(virtdb.harvester_selector,
exclude_fields=('mode', 'year_start', 'year_end'))
# Get the host and collections
row = selector.select(db.harvesters).first()
if not row:
# Get hosts and collections
rows = selector.select(db.harvesters)
if not rows:
raise ToolException(MSG_NO_HARVESTER)
tool = build_harvester_tool(db,
selector.id_teams,
selector.id_projects,
selector.controller,
row.harvesters.id_categories,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == MODE_DRY_RUN),
debug=False)
if not tool:
return INLINE_ALERT % (T('Error'), T('Select an harvester.'))
collection_logs = []
logs = []
for row in rows:
tool = build_harvester_tool(db,
selector.id_teams,
selector.id_projects,
selector.controller,
row.harvesters.id_categories,
year_start=selector.year_start,
year_end=selector.year_end,
dry_run=(selector.mode == MODE_DRY_RUN),
debug=False)
if not tool:
return INLINE_ALERT % (T('Error'), T('Select an harvester.'))
tool.process_url(row.harvesters.host, row.harvesters.collections)
tool.process_url(row.harvesters.host, row.harvesters.collections)
collection_logs.extend(tool.collection_logs)
logs.extend(tool.logs)
except ToolException as e:
return T(str(e))
......@@ -327,10 +343,12 @@ def run():
msg += '<hr/>'
return msg
# delegate rendering to the report view
response.view = 'harvest/layout.%s' % request.extension
report = tool.report()
report['selector'] = selector
return report
return dict(collection_logs=collection_logs,
controller=selector.controller,
logs=logs,
selector=selector)
def run_all():
......
......@@ -14,6 +14,10 @@ invenio_tools.record.Record
~Record.id
~Record.oai
~Record.oai_url
~Record.primary_oai
~Record.primary_oai_url
~Record.secondary_oai
~Record.secondary_oai_url
~Record.sysno
......
invenio_tools.record.Record.primary_oai
=======================================
.. currentmodule:: invenio_tools.record
.. automethod:: Record.primary_oai
\ No newline at end of file
invenio_tools.record.Record.primary_oai_url
===========================================
.. currentmodule:: invenio_tools.record
.. automethod:: Record.primary_oai_url
\ No newline at end of file
invenio_tools.record.Record.secondary_oai
=========================================
.. currentmodule:: invenio_tools.record
.. automethod:: Record.secondary_oai
\ No newline at end of file
invenio_tools.record.Record.secondary_oai_url
=============================================
.. currentmodule:: invenio_tools.record
.. automethod:: Record.secondary_oai_url
\ No newline at end of file
......@@ -19,7 +19,8 @@ from gluon.tools import PluginManager
from regex import (REG_COLLABORATION,
REG_CONF_DATES,
REG_DEFENSE,
REG_SUBMITTED)
REG_SUBMITTED,
REG_VALID_ORIGIN)
#-------------------------------------------------------------------------------
#
......
......@@ -37,7 +37,7 @@ T("Preprint identifier separated by comma: arXiv:0906.1516")
tp_report_numbers = \
T("Report identifier separated by comma: LHCb-PROC-2008-04")
tp_submitted = \
T("Date of submission to a publisher: 2011-12-13 or 2011-12")
......@@ -116,7 +116,7 @@ db.publications.id_publishers.requires = IS_IN_DB(db, 'publishers.abbreviation')
db.publications.conference_dates.requires = IS_EMPTY_OR(IS_MATCH(REG_CONF_DATES))
db.publications.conference_url.requires = IS_EMPTY_OR(IS_URL())
db.publications.origin.requires = IS_EMPTY_OR(IS_URL())
db.publications.origin.requires = IS_EMPTY_OR(IS_MATCH(REG_VALID_ORIGIN))
db.publications.publication_url.requires = IS_EMPTY_OR(IS_URL())
# rule: 2012-12 or 2012-12-31
......
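REG_VALID_ORIGIN itself is defined in the regex module and is not shown in this diff; a pattern of the right shape, assuming the origin field holds one or more comma-separated URLs, might look like:

    # Hypothetical sketch; the actual pattern lives in the regex module.
    REG_VALID_ORIGIN = r'^https?://[^\s,]+(?:,\s*https?://[^\s,]+)*$'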
......@@ -11,6 +11,7 @@ fieldsModifier = dbui.FieldsModifier('publications')
fieldsModifier.configure_field('conference_start', format='Y-m-d')
fieldsModifier.configure_field('id_authors_roles', hidden=True)
fieldsModifier.configure_field('origin', xtype='textarea')
fieldsModifier.configure_field('pages', emptyText=T('pages'))
fieldsModifier.configure_field('publication_date', format='Y-m-d')
fieldsModifier.configure_field('volume', emptyText=T('volume'))
......
......@@ -130,7 +130,8 @@ def INHIBIT_HARVESTER_ON_CATEGORY(harvester):
"""Inhibit the insert of similar harvesters.
For a project, one automaton can only process publications
of the same code, e.g ACL or ACLN but not both
of the same code, e.g. ACL or ACLN but not both; however, automatons
can scan different stores.
Args:
harvester (dict): harvester fields passed to insert.
......@@ -144,7 +145,6 @@ def INHIBIT_HARVESTER_ON_CATEGORY(harvester):
# a new harvester
id_harvester = get_id(db.harvesters,
controller=harvester['controller'],
host=harvester['host'],
id_projects=harvester['id_projects'],
id_teams=harvester['id_teams'])
......
......@@ -222,7 +222,7 @@ def duplicate_article(publication):
db = current.globalenv['db']
categories = db.categories
publications = db.pulications
publications = db.publications
qcat = (categories.code == 'ACL') | (categories.code == 'ACLN')
qpub = publications.id_publishers == publication['id_publishers']
......
......@@ -77,29 +77,40 @@ class Articles(Automaton):
return True
def get_record_by_origin(self,
def get_record_by_fields(self,
oai_url,
year,
id_publisher=None,
my_authors=None,
oai_url=None,
pages=None,
publication_url=None,
preprint_number=None,
title=None,
volume=None,
year=None):
"""Get an existing record using the origin field and its value
defined in the ``oai_url`` keyword argument.
The other arguments are used to transform the corresponding preprint
into an article.
volume=None):
"""Get article matching fields values defined
in the keyword arguments.
Note:
This method is required to deal with an article entered by hand
and found later by the harvester.
Args:
oai_url (unicode): the OAI identifier of the article.
oai_url (unicode): the oai_url, *e.g.*
``http://cds.cern.ch/record/123456``. The origin field
of the existing database record is updated to **oai_url**
when a match is found.
year (unicode): the year of the publication. It is used
by the search algorithm and by the logger.
Keyword Args:
id_publisher (int): identifier of the publisher in the database.
my_authors (unicode): authors of my institute separated by a comma.
pages (unicode): the page reference.
publication_url (unicode): the URL of the publication.
preprint_number (unicode): the preprint number.
title (unicode): the title of the publication.
volume (unicode): the volume reference.
year (unicode): the year of publication.
Returns:
tuple: ``(id, status)`` which contains the ``id`` of the record.
......@@ -109,21 +120,52 @@ class Articles(Automaton):
"""
if self.dbg:
print "check existing article by origin"
print "get existing article by fields"
# alias
db = self.db
id_project = self.id_project
id_team = self.id_team
logs = self.logs
rec_id = get_id(db.publications, origin=oai_url)
if not rec_id:
return (None, 0)
# check against published articles
rec_id = get_id(db.publications,
id_projects=id_project,
id_publishers=id_publisher,
id_teams=id_team,
pages=pages,
volume=volume,
year=year)
# not a preprint ?
if db.publications[rec_id].id_categories != self.id_preprint:
self.logs[-1].idle(MSG_IN_DB, year)
# fix origin field
publication = db.publications[rec_id]
if rec_id and not publication.origin:
if not self.dry_run:
db.publications[rec_id] = dict(origin=oai_url)
logs[-1].modify(MSG_FIX_ORIGIN, year)
return (rec_id, 1)
if rec_id:
logs[-1].idle(MSG_IN_DB, year)
return (rec_id, 0)
# transform a preprint into an article
# check against published preprint
# a preprint can be identified by its category which is PRE (15)
rec_id = get_id(db.publications,
id_categories=self.id_preprint,
id_projects=id_project,
id_teams=id_team,
preprint=preprint_number)
if not rec_id:
return (None, 0)
# transform an existing preprint into an article
# institute authors can be missing in the preprint
# change also the status
self.logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)
if not self.dry_run:
db.publications[rec_id] = dict(authors_institute=my_authors,
id_categories=self.id_category,
......@@ -137,34 +179,38 @@ class Articles(Automaton):
return (rec_id, 1)
def get_record_by_fields(self,
def get_record_by_origin(self,
primary_oai_url,
year,
id_publisher=None,
my_authors=None,
oai_url=None,
pages=None,
publication_url=None,
preprint_number=None,
title=None,
volume=None,
year=None):
"""Get the record matching the input fields
Note:
Fix the field origin when a match is found.
volume=None):
"""Get an existing record using the origin field and its value
defined in the *primary_oai_url* argument.
Note:
Transform a preprint into article.
This method is required to transform a preprint into an article.
All the keyword arguments are needed by the transformation.
Args:
primary_oai_url (unicode): the *primary* OAI identifier of the
record. It is used by the search algorithm.
year (unicode): the year of publication which is used
by the logger.
Keyword Args:
id_publisher (int): identifier of the publisher in the database.
my_authors (unicode): authors of my institute separated by a comma.
oai_url (unicode): the URL defining the OAI.
oai_url (unicode): the full oai_url(s) of the article.
pages (unicode): the page reference.
publication_url (unicode): the URL of the publication.
preprint_number (unicode): the preprint number.
title (unicode): the title of the publication.
volume (unicode): the volume reference.
year (unicode): the year of publication.
Returns:
tuple: ``(id, status)`` which contains the ``id`` of the record.
......@@ -174,52 +220,36 @@ class Articles(Automaton):
"""
if self.dbg:
print "get existing article by fields"
print "check existing article by origin"
# alias
db = self.db
logs = self.logs
publications = db.publications
# check against published articles
rec_id = get_id(db.publications,
id_projects=self.id_project,
id_publishers=id_publisher,
id_teams=self.id_team,
pages=pages,
volume=volume,
year=year)
# fix origin field
if rec_id and not db.publications[rec_id].origin:
if not self.dry_run:
db.publications[rec_id] = dict(origin=oai_url)
# search by origin
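# contains() is needed because origin may hold several comma-separated oai_url values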
query = db.publications.origin.contains(primary_oai_url)
setrows = db(query)
if setrows.count() == 0:
return (None, 0)
self.logs[-1].modify(MSG_FIX_ORIGIN, year)
return (rec_id, 1)
# a record is found
rec_id = setrows.select(publications.id).first().id
publication = publications[rec_id]
if rec_id:
self.logs[-1].idle(MSG_IN_DB, year)
# not a preprint ?
if publication.id_categories != self.id_preprint:
logs[-1].idle(MSG_IN_DB, year)
return (rec_id, 0)
# check against published preprint
# a preprint can be identified by its category which is PRE (15)
rec_id = get_id(db.publications,
id_categories=self.id_preprint,
id_projects=self.id_project,
id_teams=self.id_team,
preprint=preprint_number)
if not rec_id:
return (None, 0)
# transform an existing preprint into article
# institute authors can be missing in the preprint
# change also the status
self.logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)
# transform a preprint into an article
logs[-1].modify(MSG_TRANSFORM_PREPRINT, year)
if not self.dry_run:
db.publications[rec_id] = dict(authors_institute=my_authors,
id_categories=self.id_category,
id_publishers=id_publisher,
id_status=UNDEF_ID,
oai_url=oai_url,
pages=pages,
publication_url=publication_url,
title=title,
......@@ -275,24 +305,23 @@ class Articles(Automaton):
pages=pages,
publication_url=publication_url,
title=title,
volume=volume,
year=year)
volume=volume)
rec_id, status = self.get_record_by_origin(**fields)
rec_id, status = self.get_record_by_origin(record.primary_oai_url(),
year,
**fields)
if rec_id:
return status
fields = dict(id_publisher=id_publisher,
my_authors=my_authors,
oai_url=oai_url,
pages=pages,
publication_url=publication_url,
preprint_number=preprint_number,
title=title,
volume=volume,
year=year)
volume=volume)
rec_id, status = self.get_record_by_fields(**fields)
rec_id, status = self.get_record_by_fields(oai_url, year, **fields)
if rec_id:
return status
......
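In short, `check_record` now performs a two-step lookup; a condensed sketch of the flow shown piecewise in the hunk above (not the verbatim method body):

    # 1. match on the origin field via the primary OAI URL
    rec_id, status = self.get_record_by_origin(record.primary_oai_url(),
                                               year,
                                               **fields)
    if rec_id:
        return status
    # 2. fall back to a field-based match (publisher, pages, volume, year),
    #    which also fixes the origin field on success
    rec_id, status = self.get_record_by_fields(oai_url, year, **fields)
    if rec_id:
        return status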
......@@ -33,8 +33,7 @@ class Automaton(object):
"""Base class to search and process publications:
* Decode the selector defining user criteria.
* Search for publications in the store, according to
user criteria
* Search the store for publications matching user criteria.
* Decode the XML string returned by the store.
* Insert new records in the database.
......@@ -44,15 +43,16 @@ class Automaton(object):
The logic implemented in the ``Automaton`` class is the following:
#. Ask the store for all the `record_id` satisfying the user request.
#. Reject `record_id` matching the `origin` field of database entry.
#. Request to the store, the XML description of the publication
and decode it.
#. Reject `record_id` contained in the *origin* field of a
database entry.
#. Request from the store the XML description of the publications
and decode them.
#. Reject the record for which the *secondary_oai_url* is contained in
the *origin* field of a database entry. Update the *origin* field
of the database record (a sketch of this step follows the list).
#. Check that the *oai* of the publication is defined and well formed.
Recover it, if it is not the case. From time to time, the `id`
encoded in the `oai` field is different from the `record_id`.
This happens when an old record is redirected to new one
for obscure reasons. The record is ignore if a database entry
is found with the bad OAI.
Recover it, if it is not the case. At this stage the OAI is always
defined.
#. Reject temporary publications.
#. Check that *authors* are defined.
Reject the publication if it is not the case.
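The multi-store rejection step above is new in this merge; a minimal sketch of the idea, with assumed names (the real signature changes appear in the hunks below):

    def known_via_secondary_oai(db, record):
        """Sketch (assumed helper, not in the diff): True when the record's
        secondary OAI URL already appears in the origin field of an entry."""
        secondary_url = record.secondary_oai_url()
        if secondary_url is None:
            return False
        query = db.publications.origin.contains(secondary_url)
        return db(query).count() > 0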
......@@ -172,48 +172,74 @@ class Automaton(object):
self.logs[-1].reject(dbe.message, log_year)
return 0
def _is_record_in_db(self, rec_id, title):
def _is_record_in_db(self, title, host=None, rec_id=None, oai_url=None):
"""Return ``True`` when the record is already in the database.
The search is based on the origin field.
The search is based on the origin field and on the primary OAI.
Note:
A new log entry is created when a record is found.
Args:
rec_id (int): record identifier
title (unicode): title of the collection
title (unicode): the title of the publication.
Keyword Args:
host (unicode): the store. Possible values are ``cds.cern.ch`` or
``inspirehep.net``. To be used with *rec_id*.
rec_id (int): the record identifier in the store.
oai_url (unicode): the URL of the record in the store.
Note:
Either use *host* and *rec_id*, or *oai_url*.
Returns:
bool: ``True`` when a record if found, ``False`` otherwise.
int: the id of the record in the database when a record is found,
0 otherwise.
Raises:
ValueError: when keyword arguments are not defined properly.
"""
db = self.db
harvester = self.harvester
# check
url = OAI_URL % (harvester.host, rec_id)
db_id = get_id(db.publications, origin=url)
# build the OAI URL
if host is not None and rec_id is not None and oai_url is None:
url = OAI_URL % (host, rec_id)
elif host is None and rec_id is None and oai_url is not None:
url = oai_url
else:
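# exactly one of the pair (host, rec_id) or oai_url alone must be supplied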
raise ValueError
if db_id is None:
return False
# check the OAI
query = db.publications.origin.contains(url)
setrows = db(query)
publication = db.publications[db_id]
if setrows.count() == 0:
return 0
# same category for the publication and the harvester
# keep the record if it is not the case
# this is required to transform a preprint into article
# one record found
columns = [db.publications.id,
db.publications.id_categories,
db.publications.title,
db.publications.year]
publication = setrows.select(*columns).first()
# Note:
# The categories of the publication and the harvester have to be equal.
# However, keep the record if it is not the case.
# This is required to transform a preprint into an article.
if publication.id_categories != harvester.id_categories:
return False
return 0
# log
self.logs.append(Msg(harvester=self.harvester,
self.logs.append(Msg(harvester=harvester,
collection=title,
record_id=rec_id,
title=publication.title))
self.logs[-1].idle(MSG_IN_DB, publication.year)
return True
return publication.id
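The two supported call styles, for illustration (`automaton` and `title` are assumed bindings, not names from the diff):

    # either style works; supplying both at once, or neither, raises ValueError
    db_id = automaton._is_record_in_db(title, host='cds.cern.ch', rec_id=123456)
    db_id = automaton._is_record_in_db(title,
                                       oai_url='http://cds.cern.ch/record/123456')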
def _search_parameters(self, collection):
"""Build the keywords to steer the URL search in invenio store.
......@@ -319,19 +345,26 @@ class Automaton(object):
return True
def get_record_by_fields(self, **kwargs):
def get_record_by_fields(self, oai_url, year, **kwargs):
"""Get database record matching fields values defined
in the keyword arguments.
Note:
This method is required to deal with a publication entered by hand
and found later by a harvester.
Args:
oai_url (unicode): *e.g* ``"http://cds.cern.ch/record/123456"``
year (int): the year of the publication.
oai_url (unicode): the oai_url, *e.g.*
``http://cds.cern.ch/record/123456``. The origin field
of the existing database record is updated to **oai_url**
when a match is found.
Note:
Fix the field origin when a match is found.
year (int): the year of the publication. It is used
by the search algorithm and by the logger.