Commit 1885637c authored by LE GAC Renaud
Browse files

Move the function search_synonym to harvest_tools.base and test it.

parent 9f00fca0
harvest_tools.base.search_synonym
=================================
.. currentmodule:: harvest_tools.base
.. autofunction:: search_synonym
\ No newline at end of file
......@@ -48,7 +48,7 @@ Helper functions
~base.family_name_fr
~base.format_author_fr
~base.learn_my_authors
~automaton.search_synonym
~base.search_synonym
Logger
^^^^^^
......
......@@ -6,9 +6,10 @@ and to push them in the database.
from base import (DRY_RUN,
family_name_fr,
format_author_fr,
search_synonym,
ToolException)
from automaton import Automaton, search_synonym
from automaton import Automaton
from articles import Articles
from factory import build_harvester_tool, get_harvester_tool
from msg import Msg
......
......@@ -6,7 +6,10 @@ import re
import traceback
from base import MSG_FIX_ORIGIN, MSG_IN_DB, ToolException
from base import (MSG_FIX_ORIGIN,
MSG_IN_DB,
search_synonym,
ToolException)
from gluon.storage import Storage
from invenio_tools import (CheckAndFix,
InvenioStore,
......@@ -14,74 +17,18 @@ from invenio_tools import (CheckAndFix,
OAI_URL)
from msg import Msg
from msgcollection import MsgCollection
from plugin_dbui import CALLBACK_ERRORS, get_id, UNDEF_ID
from plugin_dbui import CALLBACK_ERRORS, get_id
# Messages requesting the selection of a category, a project or a team.
MSG_NO_CAT = 'Select a "category" !!!'
MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM = 'Select a "team" !!!'
# Rejection / failure messages.
# In MSG_TOOMANY_SYNONYM, %s is replaced by the name of the database table
# (see search_synonym).
MSG_TOOMANY_SYNONYM = "Reject too many %s synonyms."
MSG_NSERT_FAIL = "Fail to insert the new record in the database."
MSG_NO_OAI = "Reject no OAI identifier"
MSG_WELL_FORM_OAI = "Reject OAI is not well formed"
def search_synonym(table, fieldname, value, create=False):
    """Return the identifier of the record matching *value*.

    The value is first compared to the database field *fieldname*.
    When that fails, it is searched in the *synonyms* field.

    Note:
        The database table must have a field name *synonyms*.
        It is a string containing values separated by a comma.

    Args:
        table (gluon.DAL.Table): database table.
        fieldname (unicode): name of the database field compared
            to the value.
        value (unicode): value to be matched.
        create (bool): insert a new record in the database table when
            it is ``True`` and no synonym matches.

    Returns:
        int:
            * the id of the database record.
            * UNDEF_ID if value is not defined.

    Raises:
        ToolException: when more than one synonym is found, and also
            when no synonym is found while *create* is ``False``.

    """
    if not value:
        return UNDEF_ID

    dal = table._db

    # direct match on the requested field
    criteria = {fieldname: value}
    rec_id = get_id(table, **criteria)
    if rec_id is not None:
        return rec_id

    # nothing found, fall back on the synonyms field
    matches = dal(table.synonyms.contains(value))
    nfound = matches.count()

    # exactly one synonym
    if nfound == 1:
        return matches.select(table.id).first().id

    # no synonym at all and the caller asks for a new entry
    if nfound == 0 and create:
        return table.insert(**criteria)

    # every other case is reported with the same message
    raise ToolException(MSG_TOOMANY_SYNONYM % table._tablename)
class Automaton(object):
"""Base class to search and process publications:
......@@ -649,7 +596,8 @@ class Automaton(object):
* UNDEF_ID if value is not defined.
Raises:
ToolException: when more than one synonym is found.
ToolException: when more than one synonym is found or when
the country is not defined.
"""
return search_synonym(self.db.countries, "country", value)
......@@ -666,7 +614,8 @@ class Automaton(object):
* UNDEF_ID if value is not defined.
Raises:
ToolException: when more than one synonym is found.
ToolException: when more than one synonym is found or when
the publisher is not defined.
"""
return search_synonym(self.db.publishers, "abbreviation", value)
......@@ -3,6 +3,8 @@
"""
from invenio_tools import REG_AUTHOR
from plugin_dbui import get_id, UNDEF_ID
DRY_RUN = "dry run"
......@@ -10,6 +12,8 @@ MSG_CRASH = "Crash: %s"
MSG_FIX_ORIGIN = "Fixed the origin field"
MSG_IN_DB = "Already in the database"
MSG_LOAD = "Load in the database"
# In the two messages below, %s is replaced by the name of the database
# table (see search_synonym).
MSG_NO_ENTRY = "Reject %s is not defined."
MSG_TOOMANY_SYNONYM = "Reject too many %s synonyms."
def family_name_fr(full_name):
......@@ -146,5 +150,64 @@ def learn_my_authors(db,
db.my_authors[row.id] = dict(authors=', '.join(database_authors))
def search_synonym(table, fieldname, value, create=False):
    """Return the identifier of the record matching *value*.

    The value is first compared to the database field *fieldname*.
    When that fails, it is searched in the *synonyms* field.

    Note:
        The database table must have a field name *synonyms*.
        It is a string containing values separated by a comma.

    Args:
        table (gluon.DAL.Table): database table.
        fieldname (unicode): name of the database field compared
            to the value.
        value (unicode): value to be matched.
        create (bool): insert a new record in the database table when
            it is ``True`` and no synonym matches.

    Returns:
        int:
            * the id of the database record.
            * UNDEF_ID if value is not defined.

    Raises:
        ToolException:
            * when more than one synonym is found.
            * when no synonym is found and *create* is ``False``.

    """
    if not value:
        return UNDEF_ID

    dal = table._db

    # direct match on the requested field
    criteria = {fieldname: value}
    rec_id = get_id(table, **criteria)
    if rec_id is not None:
        return rec_id

    # nothing found, fall back on the synonyms field
    matches = dal(table.synonyms.contains(value))
    nfound = matches.count()

    # exactly one synonym
    if nfound == 1:
        return matches.select(table.id).first().id

    # no synonym at all and the caller asks for a new entry
    if nfound == 0 and create:
        return table.insert(**criteria)

    # failure: either nothing is defined or the choice is ambiguous
    template = MSG_NO_ENTRY if nfound == 0 else MSG_TOOMANY_SYNONYM
    raise ToolException(template % table._tablename)
class ToolException(Exception):
    """Error raised by the harvester helper functions,
    for instance by search_synonym.

    """
# -*- coding: utf-8 -*-
"""TALK
http://cds.cern.ch/record/1559714
Rare Decays of Heavy Mesons
26th International Symposium on Lepton Photon Interactions
at High Energies, San Francisco, CA, USA, 24 - 29 Jun 2013
No corrections are applied to the record.
Allows testing the brute-force decoding, including its mistakes.
Note:
* Country is not well defined (USA)
"""
import pytest
from invenio_tools import CheckAndFix, CheckException, load_record
def test_country_exception():
    """Check the country of the CDS record 1559714 with ``CheckAndFix``."""
    talk = load_record('cds.cern.ch', 1559714)
    checker = CheckAndFix()

    # no exception is expected since the value is defined in the synonyms
    outcome = checker.country(talk)
    assert outcome is None
......@@ -2,10 +2,32 @@
"""test basic harvester functions
"""
from harvest_tools import format_author_fr
import pytest
from gluon import current
from harvest_tools import format_author_fr, search_synonym, ToolException
from invenio_tools import load_record
def test_format_author():
    """Check ``format_author_fr`` on simple, compound and hyphenated names."""
    expected = (("Aaij, Roel", "R. Aaij"),
                ("Le Gac, Renaud", "R. Le Gac"),
                ("Bettler, Marc-Olivier", "M.-O. Bettler"))

    for full_name, formatted in expected:
        assert format_author_fr(full_name) == formatted
def test_search_synonym():
    """Exercise ``search_synonym`` on the collaborations and countries tables."""
    db = current.globalenv['db']

    # collaboration "ANTARES, TANAMI": should not be defined as a synonym,
    # therefore the search has to fail
    rec = load_record("inspirehep.net", 1342250)
    with pytest.raises(ToolException):
        search_synonym(db.collaborations,
                       "collaboration",
                       rec.collaboration())

    # collaboration "ANTARES": defined as a synonym in the database
    rec = load_record("inspirehep.net", 718872)
    collaboration_id = search_synonym(db.collaborations,
                                      "collaboration",
                                      rec.collaboration())
    assert collaboration_id == 2

    # country "USA": defined as a synonym
    rec = load_record('cds.cern.ch', 1559714)
    found = search_synonym(db.countries,
                           "country",
                           rec.conference_country())
    assert found == 311
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment