Commit 945dcead authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Modify the logic to deal with synonyms.

parent 28b2ec83
...@@ -318,8 +318,10 @@ class Articles(Automaton): ...@@ -318,8 +318,10 @@ class Articles(Automaton):
year = record.paper_year() year = record.paper_year()
# get the collaboration / publisher identifiers # get the collaboration / publisher identifiers
id_collaboration = self.search_collaboration(record.collaboration()) id_collaboration = \
id_publisher = self.search_publisher(editor) get_id(db.collaborations, collaboration=record.collaboration())
id_publisher = get_id(db.publishers, abbreviation=editor)
# get already published articles or preprint # get already published articles or preprint
# A preprint is transformed into an article. # A preprint is transformed into an article.
......
...@@ -672,64 +672,3 @@ class Automaton(object): ...@@ -672,64 +672,3 @@ class Automaton(object):
return dict(collection_logs=self.collection_logs, return dict(collection_logs=self.collection_logs,
controller=self.controller, controller=self.controller,
logs=self.logs) logs=self.logs)
def search_collaboration(self, value):
"""Get the database collaboration identifier using synonyms.
Args:
value (unicode):
the name of the collaboration.
Returns:
int:
* the id of the collaboration record.
* UNDEF_ID if value is not defined.
Raises:
ToolException:
when more than one synonym is found or when the
collaboration is not defined.
"""
return search_synonym(self.db.collaborations, "collaboration", value)
def search_country(self, value):
"""Get the database country identifier using synonyms.
Args:
value (unicode):
the name of the country.
Returns:
int:
* the id of the country record.
* UNDEF_ID if value is not defined.
Raises:
ToolException:
when more than one synonym is found or when
the country is not defined.
"""
return search_synonym(self.db.countries, "country", value)
def search_publisher(self, value):
"""Get the database publisher identifier using synonyms.
Args:
value (unicode):
the abbreviation of the publisher.
Returns:
int:
* the id of the publisher record.
* UNDEF_ID if value is not defined.
Raises:
ToolException:
when more than one synonym is found or when
the publisher is not defined.
"""
return search_synonym(self.db.publishers, "abbreviation", value)
...@@ -127,7 +127,8 @@ def search_synonym(table, fieldname, value, create=False): ...@@ -127,7 +127,8 @@ def search_synonym(table, fieldname, value, create=False):
Raises: Raises:
ToolException: ToolException:
more than one synonym is found. * no synonym found and not allowed to create a new one.
* more than one synonym is found.
""" """
if not value: if not value:
......
...@@ -20,7 +20,7 @@ from invenio_tools import (MSG_NO_CONF, ...@@ -20,7 +20,7 @@ from invenio_tools import (MSG_NO_CONF,
from invenio_tools.recordpubli import PAPER_REFERENCE_KEYS from invenio_tools.recordpubli import PAPER_REFERENCE_KEYS
from itertools import imap from itertools import imap
from plugin_dbui import CLEAN_SPACES, get_id from plugin_dbui import CLEAN_SPACES, get_id, UNDEF_ID
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.") DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")
...@@ -53,6 +53,9 @@ MSG_NO_OAI = "Reject no OAI identifier" ...@@ -53,6 +53,9 @@ MSG_NO_OAI = "Reject no OAI identifier"
MSG_NO_REF = "Reject incomplete paper reference. Check " MSG_NO_REF = "Reject incomplete paper reference. Check "
MSG_TEMPORARY_RECORD = "Temporary record" MSG_TEMPORARY_RECORD = "Temporary record"
MSG_UNKNOWN_COLLABORATION = "Reject collaboration is unknown."
MSG_UNKNOWN_COUNTRY = "Reject country is unknown."
MSG_UNKNOWN_PUBLISHER = "Reject publisher is unknown."
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed" MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
REG_CONF_DATES_1 = re.compile("(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})") REG_CONF_DATES_1 = re.compile("(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
...@@ -342,8 +345,7 @@ class CheckAndFix(object): ...@@ -342,8 +345,7 @@ class CheckAndFix(object):
raise CheckException(MSG_NO_AUTHOR) raise CheckException(MSG_NO_AUTHOR)
def collaboration(self, record): def collaboration(self, record):
"""Check the collaboration. """Check synonyms for collaboration by using by the proper value.
Have a look to the synonyms when the collaboration is not well formed.
Args: Args:
record (RecordPubli): record (RecordPubli):
...@@ -351,8 +353,9 @@ class CheckAndFix(object): ...@@ -351,8 +353,9 @@ class CheckAndFix(object):
Raises: Raises:
CheckException: CheckException:
when the collaboration value is defined * the collaboration is unknown
nor entered as a synonym. (neither collaboration nor synonym)
* more than one synonym found.
""" """
if self.dbg: if self.dbg:
...@@ -363,12 +366,41 @@ class CheckAndFix(object): ...@@ -363,12 +366,41 @@ class CheckAndFix(object):
return return
try: try:
search_synonym(self.db.collaborations, "collaboration", val) db = self.db
dbid = search_synonym(db.collaborations, "collaboration", val)
if dbid == UNDEF_ID:
raise ToolException(MSG_UNKNOWN_COLLABORATION)
collaboration = db.collaborations[dbid].collaboration
if collaboration != val:
# one collaboration
if isinstance(record[u"corporate_name"], dict):
record[u"corporate_name"][u"collaboration"] = collaboration
# several collaboration
# replace the list of dictionary by a single one
else:
record[u"corporate_name"] = \
{u"collaboration": collaboration}
except ToolException as e: except ToolException as e:
raise CheckException(*e.args) raise CheckException(*e.args)
def country(self, record): def country(self, record):
"""Check synonyms for conference country by using by the proper value.
Args:
record (RecordPubli):
record describing a publication.
Raises:
CheckException:
* the country is unknown (neither country nor synonym)
* more than one synonym found.
"""
"""Check conference country. """Check conference country.
Have a look to the synonyms when the country does not exist. Have a look to the synonyms when the country does not exist.
...@@ -390,7 +422,28 @@ class CheckAndFix(object): ...@@ -390,7 +422,28 @@ class CheckAndFix(object):
val = record.conference_country() val = record.conference_country()
try: try:
search_synonym(self.db.countries, "country", val) db = self.db
dbid = search_synonym(db.countries, "country", val)
if dbid == UNDEF_ID:
raise ToolException(MSG_UNKNOWN_COUNTRY)
country = db.countries[dbid].country
if country != val:
obj = record[u"meeting_name"]
if isinstance(obj, dict):
location = obj[u"location"].replace(val, country)
record[u"meeting_name"][u"location"] = location
else:
for di in obj:
if u"location" in di:
di[u"location"] = \
di[u"location"].replace(val, country)
record[u"meeting_name"] = obj
except ToolException as e: except ToolException as e:
raise CheckException(*e.args) raise CheckException(*e.args)
...@@ -822,8 +875,7 @@ class CheckAndFix(object): ...@@ -822,8 +875,7 @@ class CheckAndFix(object):
raise ToolException(MSG_NO_REF + "[year]") raise ToolException(MSG_NO_REF + "[year]")
def publisher(self, record): def publisher(self, record):
"""Check publisher. """Check synonyms for publisher by replacing by the abbreviation value.
Have a look to the synonyms when the publisher does not exist.
Args: Args:
record (RecordPubli): record (RecordPubli):
...@@ -831,7 +883,8 @@ class CheckAndFix(object): ...@@ -831,7 +883,8 @@ class CheckAndFix(object):
Raises: Raises:
CheckException: CheckException:
the publisher is not defined nor entered as a synonym. * the publisher is unknown (neither abbreviation nor synonym)
* more than one synonym found.
""" """
if self.dbg: if self.dbg:
...@@ -841,11 +894,18 @@ class CheckAndFix(object): ...@@ -841,11 +894,18 @@ class CheckAndFix(object):
if len(val) == 0: if len(val) == 0:
return return
# convert ToolException to CheckException
try: try:
db = self.db db = self.db
search_synonym(db.publishers, "abbreviation", val) dbid = search_synonym(db.publishers, "abbreviation", val)
if dbid == UNDEF_ID:
raise ToolException(MSG_UNKNOWN_PUBLISHER)
abbreviation = db.publishers[dbid].abbreviation
if abbreviation != val:
record[u"publication_info"].loc[0, "title"] = abbreviation
# convert ToolException to CheckException
except ToolException as e: except ToolException as e:
raise CheckException(*e.args) raise CheckException(*e.args)
......
...@@ -8,7 +8,7 @@ from .automaton import Automaton ...@@ -8,7 +8,7 @@ from .automaton import Automaton
from .base import MSG_CRASH, MSG_LOAD from .base import MSG_CRASH, MSG_LOAD
from .checkandfix import CheckException from .checkandfix import CheckException
from invenio_tools import RecordConf, RecordThesis from invenio_tools import RecordConf, RecordThesis
from plugin_dbui import UNDEF_ID from plugin_dbui import get_id, UNDEF_ID
MSG_PREPRINT_IS_PAPER = "Reject preprint is a published paper" MSG_PREPRINT_IS_PAPER = "Reject preprint is a published paper"
...@@ -86,6 +86,8 @@ class Preprints(Automaton): ...@@ -86,6 +86,8 @@ class Preprints(Automaton):
zero otherwise. zero otherwise.
""" """
db = self.db
# alias # alias
first_author = record.first_author() first_author = record.first_author()
oai_url = record.oai_url() oai_url = record.oai_url()
...@@ -95,7 +97,8 @@ class Preprints(Automaton): ...@@ -95,7 +97,8 @@ class Preprints(Automaton):
year = submitted[0:4] year = submitted[0:4]
# get the collaboration identifier # get the collaboration identifier
id_collaboration = self.search_collaboration(record.collaboration()) id_collaboration = \
get_id(db.collaborations, collaboration=record.collaboration())
# get existing preprint or article # get existing preprint or article
fields = dict(first_author=first_author, fields = dict(first_author=first_author,
......
...@@ -7,7 +7,7 @@ import traceback ...@@ -7,7 +7,7 @@ import traceback
from .automaton import Automaton from .automaton import Automaton
from .base import MSG_CRASH, MSG_LOAD from .base import MSG_CRASH, MSG_LOAD
from .checkandfix import CheckException from .checkandfix import CheckException
from plugin_dbui import UNDEF_ID from plugin_dbui import get_id, UNDEF_ID
class Proceedings(Automaton): class Proceedings(Automaton):
...@@ -71,6 +71,8 @@ class Proceedings(Automaton): ...@@ -71,6 +71,8 @@ class Proceedings(Automaton):
zero otherwise. zero otherwise.
""" """
db = self.db
# alias # alias
oai_url = record.oai_url() oai_url = record.oai_url()
year = record.paper_year() year = record.paper_year()
...@@ -94,11 +96,13 @@ class Proceedings(Automaton): ...@@ -94,11 +96,13 @@ class Proceedings(Automaton):
conference_dates = record.conference_dates() conference_dates = record.conference_dates()
conference_title = record.conference_title() conference_title = record.conference_title()
first_author = record.first_author() first_author = record.first_author()
id_country = self.search_country(record.conference_country()) id_country = get_id(db.countries, country=record.conference_country())
# get the collaboration/publisher identifiers # get the collaboration/publisher identifiers
id_collaboration = self.search_collaboration(record.collaboration()) id_collaboration = \
id_publisher = self.search_publisher(editor) get_id(db.collaborations, collaboration=record.collaboration())
id_publisher = get_id(db.publishers, abbreviation=editor)
# get an already published proceeding # get an already published proceeding
fields = dict(authors=authors, fields = dict(authors=authors,
......
...@@ -90,7 +90,8 @@ class Reports(Automaton): ...@@ -90,7 +90,8 @@ class Reports(Automaton):
id_status = get_id(db.status, code=UNKNOWN) id_status = get_id(db.status, code=UNKNOWN)
# get the collaboration identifier # get the collaboration identifier
id_collaboration = self.search_collaboration(record.collaboration()) id_collaboration = \
get_id(db.collaborations, collaboration=record.collaboration())
# get an already published reports # get an already published reports
fields = dict(id_categories=self.id_category, fields = dict(id_categories=self.id_category,
......
...@@ -4,10 +4,17 @@ ...@@ -4,10 +4,17 @@
import traceback import traceback
<<<<<<< HEAD
from .automaton import Automaton from .automaton import Automaton
from .base import MSG_CRASH, MSG_LOAD from .base import MSG_CRASH, MSG_LOAD
from .checkandfix import CheckException from .checkandfix import CheckException
from plugin_dbui import UNDEF_ID from plugin_dbui import UNDEF_ID
=======
from automaton import Automaton
from base import MSG_CRASH, MSG_LOAD
from checkandfix import CheckException
from plugin_dbui import get_id, UNDEF_ID
>>>>>>> Modify the logic to deal with synonyms.
class Talks(Automaton): class Talks(Automaton):
...@@ -67,6 +74,8 @@ class Talks(Automaton): ...@@ -67,6 +74,8 @@ class Talks(Automaton):
zero otherwise. zero otherwise.
""" """
db = self.db
# alias # alias
oai_url = record.oai_url() oai_url = record.oai_url()
year = record.conference_year() year = record.conference_year()
...@@ -75,12 +84,13 @@ class Talks(Automaton): ...@@ -75,12 +84,13 @@ class Talks(Automaton):
conference_dates = record.conference_dates() conference_dates = record.conference_dates()
conference_title = record.conference_title() conference_title = record.conference_title()
first_author = record.first_author() first_author = record.first_author()
id_country = self.search_country(record.conference_country()) id_country = get_id(db.countries, country=record.conference_country())
submitted = record.submitted() submitted = record.submitted()
title = record.title() title = record.title()
# get the collaboration identifier # get the collaboration identifier
id_collaboration = self.search_collaboration(record.collaboration()) id_collaboration = \
get_id(db.collaborations, collaboration=record.collaboration())
# get an already published talk # get an already published talk
fields = dict(conference_title=conference_title, fields = dict(conference_title=conference_title,
......
...@@ -28,6 +28,9 @@ from harvest_tools.checkandfix import ( ...@@ -28,6 +28,9 @@ from harvest_tools.checkandfix import (
MSG_NO_OAI, MSG_NO_OAI,
MSG_NO_REF, MSG_NO_REF,
MSG_TEMPORARY_RECORD, MSG_TEMPORARY_RECORD,
MSG_UNKNOWN_COLLABORATION,
MSG_UNKNOWN_COUNTRY,
MSG_UNKNOWN_PUBLISHER,
MSG_WELL_FORMED_DATE) MSG_WELL_FORMED_DATE)
from harvest_tools.preprints import ( from harvest_tools.preprints import (
...@@ -82,6 +85,9 @@ def messages(): ...@@ -82,6 +85,9 @@ def messages():
T(MSG_PREPRINT_NO_NUMBER), T(MSG_PREPRINT_NO_NUMBER),
T(MSG_REPORT_NO_NUMBER), T(MSG_REPORT_NO_NUMBER),
T(MSG_TEMPORARY_RECORD), T(MSG_TEMPORARY_RECORD),
T(MSG_UNKNOWN_COLLABORATION),
T(MSG_UNKNOWN_COUNTRY),
T(MSG_UNKNOWN_PUBLISHER),
T(MSG_WELL_FORMED_COLLABORATION), T(MSG_WELL_FORMED_COLLABORATION),
T(MSG_WELL_FORMED_DATE)} T(MSG_WELL_FORMED_DATE)}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment