Commit f5403c9a authored by LE GAC Renaud
Browse files

Clean module harvester_tools.

parent 6ae3ac92
...@@ -25,7 +25,6 @@ MSG_NO_PROJECT = 'Select a "project" !!!' ...@@ -25,7 +25,6 @@ MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM = 'Select a "team" !!!' MSG_NO_TEAM = 'Select a "team" !!!'
MSG_INSERT_FAIL = "Fail to insert the new record in the database." MSG_INSERT_FAIL = "Fail to insert the new record in the database."
MSG_WELL_FORM_OAI = "Reject OAI is not well formed"
# search collection when using inspirehep # search collection when using inspirehep
# require for "Hal Hidden" # require for "Hal Hidden"
......
...@@ -19,11 +19,12 @@ def family_name_fr(full_name): ...@@ -19,11 +19,12 @@ def family_name_fr(full_name):
"""Extract the family name when the full name is encoded as ``J. Doe``. """Extract the family name when the full name is encoded as ``J. Doe``.
Args: Args:
full_name (str): author name encoded according to French full_name (unicode):
typographic rules. author name encoded according to French typographic rules.
Returns: Returns:
str: family name unicode:
family name
""" """
return full_name[full_name.find(' ') + 1:] return full_name[full_name.find(' ') + 1:]
...@@ -43,11 +44,20 @@ def learn_my_authors(db, ...@@ -43,11 +44,20 @@ def learn_my_authors(db,
all keyword arguments have to be defined. all keyword arguments have to be defined.
Args: Args:
db (gluon.dal.DAL): database connection. db (gluon.dal.DAL):
authors (list): authors names database connection.
id_project (int): the identifier of the project in the database.
id_team (int): the identifier of the team in the database. authors (list):
year (int): the year authors names
id_project (int):
the identifier of the project in the database.
id_team (int):
the identifier of the team in the database.
year (int):
the year
""" """
# get the list of authors stored in the database # get the list of authors stored in the database
...@@ -97,11 +107,17 @@ def search_synonym(table, fieldname, value, create=False): ...@@ -97,11 +107,17 @@ def search_synonym(table, fieldname, value, create=False):
The database table must have a field name *synonyms*. The database table must have a field name *synonyms*.
It contains a list of strings. It contains a list of strings.
Args: Args:
table (gluon.DAL.Table): database table. table (gluon.DAL.Table):
fieldname (str): field of the database table database table.
identified by its name.
value (str): value to be matched. fieldname (unicode):
create(bool): create a new entry in the database table when field of the database table identified by its name.
value (unicode):
value to be matched.
create(bool):
create a new entry in the database table when
it is ``True`` it is ``True``
Returns: Returns:
...@@ -110,7 +126,8 @@ def search_synonym(table, fieldname, value, create=False): ...@@ -110,7 +126,8 @@ def search_synonym(table, fieldname, value, create=False):
* UNDEF_ID if value is not defined. * UNDEF_ID if value is not defined.
Raises: Raises:
ToolException: when more than one synonym is found. ToolException:
more than one synonym is found.
""" """
if not value: if not value:
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
""" """
import numpy as np import numpy as np
import re import re
import regex
from .base import search_synonym, ToolException from .base import search_synonym, ToolException
from datetime import datetime from datetime import datetime
...@@ -29,7 +28,6 @@ DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.") ...@@ -29,7 +28,6 @@ DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")
# Decode submitted date: DD MMM YYYY or DD MM YYYY # Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})") DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})") DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")
DECODE_YYYY = re.compile(r"^(\d{4})$")
MONTHS = {"Jan": "01", MONTHS = {"Jan": "01",
"Feb": "02", "Feb": "02",
...@@ -47,41 +45,23 @@ MONTHS = {"Jan": "01", ...@@ -47,41 +45,23 @@ MONTHS = {"Jan": "01",
"Nov": "11", "Nov": "11",
"Dec": "12"} "Dec": "12"}
MSG_INVALID_HOST = "Invalid host"
MSG_NO_AUTHOR = "Reject no author(s)" MSG_NO_AUTHOR = "Reject no author(s)"
MSG_NO_CONF_DATE = "Reject no conference date" MSG_NO_CONF_DATE = "Reject no conference date"
MSG_NO_DATE = "Reject no submission date" MSG_NO_DATE = "Reject no submission date"
MSG_NO_MY_AUTHOR = "Reject no authors of my institute" MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
MSG_NO_OAI = "Reject no OAI identifier" MSG_NO_OAI = "Reject no OAI identifier"
MSG_NO_REF = "Reject incomplete paper reference. Check " MSG_NO_REF = "Reject incomplete paper reference. Check "
MSG_NO_YEAR = "Reject no publication year"
MSG_TEMPORARY_RECORD = "Temporary record" MSG_TEMPORARY_RECORD = "Temporary record"
MSG_TO_MANY_DATE = "Reject to many submit date"
MSG_TO_MANY_FAUTHOR = "Reject to many first author"
MSG_TO_MANY_YEAR = "Reject to many year"
MSG_WELL_FORMED_CONF_DATES = "Reject conference dates is not well formed"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed" MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
MSG_WELL_FORMED_EDITOR = "Reject editor is not well formed"
OAI_INVENIO = "oai:%s:%s"
REG_COLLABORATION = re.compile(regex.REG_COLLABORATION)
REG_CONF_DATES_1 = re.compile("(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})") REG_CONF_DATES_1 = re.compile("(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES_2 = \ REG_CONF_DATES_2 = \
re.compile("(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})") re.compile("(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES = re.compile(regex.REG_CONF_DATES)
REG_DOI = re.compile(r"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)") REG_DOI = re.compile(r"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)")
REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)
REG_WELL_FORMED_CONF_DATES_1 = re.compile("\d{2} - \d{2} [A-Z][a-z]{2} \d{4}") REG_WELL_FORMED_CONF_DATES_1 = re.compile("\d{2} - \d{2} [A-Z][a-z]{2} \d{4}")
REG_WELL_FORMED_CONF_DATES_2 = \ REG_WELL_FORMED_CONF_DATES_2 = \
...@@ -324,7 +304,7 @@ class CheckAndFix(object): ...@@ -324,7 +304,7 @@ class CheckAndFix(object):
val = u"" val = u""
if isinstance(record, RecordConf): if isinstance(record, RecordConf):
opening, closing = self._get_conference_dates(record) opening = self._get_conference_dates(record)[0]
val = opening.strftime("%Y-%m-%d") val = opening.strftime("%Y-%m-%d")
elif isinstance(record, RecordThesis): elif isinstance(record, RecordThesis):
...@@ -509,8 +489,8 @@ class CheckAndFix(object): ...@@ -509,8 +489,8 @@ class CheckAndFix(object):
print "\t\tCheck is oai" print "\t\tCheck is oai"
# field / subfield depends on the store # field / subfield depends on the store
test = (u"oai" in record and u"value" in record[u"oai"]) or \ test = (u"oai" in record and u"value" in record[u"oai"]) or \
(u"FIXME_OAI" in record and u"id" in record[u"FIXME_OAI"]) (u"FIXME_OAI" in record and u"id" in record[u"FIXME_OAI"])
if not test: if not test:
raise ToolException(MSG_NO_OAI) raise ToolException(MSG_NO_OAI)
...@@ -519,7 +499,9 @@ class CheckAndFix(object): ...@@ -519,7 +499,9 @@ class CheckAndFix(object):
"""Format the author names. """Format the author names.
Args: Args:
record (RecordPubli): record describing a publication. record (RecordPubli):
record describing a publication.
fmt (str): fmt (str):
define the format for author names. define the format for author names.
Possible values are "First, Last", "F. Last", "Last", Possible values are "First, Last", "F. Last", "Last",
...@@ -634,7 +616,7 @@ class CheckAndFix(object): ...@@ -634,7 +616,7 @@ class CheckAndFix(object):
Raises: Raises:
CheckException: CheckException:
when the list is empty the list is empty
""" """
if self.dbg: if self.dbg:
...@@ -787,7 +769,7 @@ class CheckAndFix(object): ...@@ -787,7 +769,7 @@ class CheckAndFix(object):
Raises: Raises:
CheckException: CheckException:
when the paper reference is not well formed. the paper reference is not well formed.
""" """
if self.dbg: if self.dbg:
...@@ -849,7 +831,7 @@ class CheckAndFix(object): ...@@ -849,7 +831,7 @@ class CheckAndFix(object):
Raises: Raises:
CheckException: CheckException:
when the publisher is not defined nor entered as a synonym. the publisher is not defined nor entered as a synonym.
""" """
if self.dbg: if self.dbg:
...@@ -880,8 +862,8 @@ class CheckAndFix(object): ...@@ -880,8 +862,8 @@ class CheckAndFix(object):
Raises: Raises:
CheckException: CheckException:
when the date is not well formed or when more * the date is not well formed
than one date are found. * more than one date are found.
""" """
if self.dbg: if self.dbg:
...@@ -927,10 +909,12 @@ class CheckAndFix(object): ...@@ -927,10 +909,12 @@ class CheckAndFix(object):
"""Some records are marked temporary. """Some records are marked temporary.
Args: Args:
record (RecordPubli): record describing a publication. record (RecordPubli):
record describing a publication.
Raises: Raises:
CheckException: when the record is marked temporary CheckException:
the record is marked temporary
""" """
if self.dbg: if self.dbg:
......
...@@ -23,24 +23,39 @@ def build_harvester_tool(db, ...@@ -23,24 +23,39 @@ def build_harvester_tool(db,
Harvest tool factory function. Harvest tool factory function.
Args: Args:
db (gluon.dal.DAL): database connection. db (gluon.dal.DAL):
id_team (int): the identifier of the team in the database. database connection.
id_project (int): the identifier of the project in the database.
automaton (str): the name of the automaton which id_team (int):
will be used to process the data. Possible values are: the identifier of the team in the database.
``articles``, ``notes``, ``preprints``, ``proceedings``,
``reports``, ``talks`` and ``theses``. id_project (int):
id_category (int): the identifier of the publication category, the identifier of the project in the database.
*e.g.* ACL, ACTI, ...
year_start (int): starting year for the scan. automaton (unicode):
year_end (int): ending year for the scan. the name of the automaton which will be used to process the data.
dry_run (bool): new records are not inserted in the database Possible values are ``articles``, ``notes``, ``preprints``,
when ``True``. ``proceedings``, ``reports``, ``talks`` and ``theses``.
debug (bool): activate the verbose mode.
id_category (int):
the identifier of the publication category, *e.g.* ACL, ACTI, ...
year_start (int):
starting year for the scan.
year_end (int):
ending year for the scan.
dry_run (bool):
new records are not inserted in the database when ``True``.
debug (bool):
activate the verbose mode.
Returns: Returns:
Automaton: returns the appropriate harvester automaton or Automaton:
``None`` if no factory exist for the specified automaton. * the appropriate harvester automaton.
* ``None`` if no factory exist for the specified automaton.
""" """
tool_class = get_harvester_tool(automaton) tool_class = get_harvester_tool(automaton)
...@@ -75,11 +90,13 @@ def get_harvester_tool(automaton): ...@@ -75,11 +90,13 @@ def get_harvester_tool(automaton):
* theses * theses
Args: Args:
automaton (str): name of the automaton automaton (unicode):
name of the automaton
Returns: Returns:
Automaton: class reference or ``None``. The latter happens Automaton:
when the automaton corresponds to nothing. * class reference
* ``None`` when the automaton corresponds to nothing.
""" """
if automaton == "articles": if automaton == "articles":
......
...@@ -26,12 +26,17 @@ class Msg(Storage): ...@@ -26,12 +26,17 @@ class Msg(Storage):
* *reject* the record is rejected. * *reject* the record is rejected.
Args: Args:
collection (str): the harvester collection used to collection (unicode):
search the record. the harvester collection used to search the record.
harvester (gluon.dal.Row): the database harvester used to scan the
store. harvester (gluon.dal.Row):
record_id (int): the record identifier in the store. the database harvester used to scan the store.
title (str): the title of the publication.
record_id (int):
the record identifier in the store.
title (unicode):
the title of the publication.
""" """
def __init__(self, def __init__(self,
...@@ -59,10 +64,14 @@ class Msg(Storage): ...@@ -59,10 +64,14 @@ class Msg(Storage):
"""Set the action as *idle* and the explanation as ``txt``. """Set the action as *idle* and the explanation as ``txt``.
Args: Args:
txt (str): message associated to the action. txt (unicode):
year (str): year of the publication message associated to the action.
translate (bool): translate the message according to the
current language. year (unicode):
year of the publication
translate (bool):
translate the message according to the current language.
""" """
self.action = "idle" self.action = "idle"
...@@ -72,10 +81,14 @@ class Msg(Storage): ...@@ -72,10 +81,14 @@ class Msg(Storage):
"""Set the action as *load* and the explanation as ``txt``. """Set the action as *load* and the explanation as ``txt``.
Args: Args:
txt (str): message associated to the action. txt (unicode):
year (str): year of the publication message associated to the action.
translate (bool): translate the message according to the
current language. year (unicode):
year of the publication
translate (bool):
translate the message according to the current language.
""" """
self.action = "load" self.action = "load"
...@@ -85,10 +98,14 @@ class Msg(Storage): ...@@ -85,10 +98,14 @@ class Msg(Storage):
"""Set the action as *modify* and the explanation as ``txt``. """Set the action as *modify* and the explanation as ``txt``.
Args: Args:
txt (str): message associated to the action. txt (unicode):
year (str): year of the publication message associated to the action.
translate (bool): translate the message according to the
current language. year (unicode):
year of the publication
translate (bool):
translate the message according to the current language.
""" """
self.action = "modify" self.action = "modify"
...@@ -98,11 +115,14 @@ class Msg(Storage): ...@@ -98,11 +115,14 @@ class Msg(Storage):
"""Set the action as *reject* and the explanation as ``txt``. """Set the action as *reject* and the explanation as ``txt``.
Args: Args:
txt (str): message associated to the action. txt (unicode):
message associated to the action.
year (str): year of the publication year (unicode):
year of the publication
record (RecordPubli): the record on which the action is applied. record (RecordPubli):
the record on which the action is applied.
It is used to determine the synonym value when the It is used to determine the synonym value when the
*collaboration*, *country* or *publisher* data is not *collaboration*, *country* or *publisher* data is not
understood. understood.
...@@ -111,8 +131,8 @@ class Msg(Storage): ...@@ -111,8 +131,8 @@ class Msg(Storage):
The *year* argument is not needed when The *year* argument is not needed when
the *record* is specified. the *record* is specified.
translate (bool): translate the message according to the translate (bool):
current language. translate the message according to the current language.
""" """
self.action = "reject" self.action = "reject"
......
...@@ -8,10 +8,17 @@ class MsgCollection(Storage): ...@@ -8,10 +8,17 @@ class MsgCollection(Storage):
"""Messages for a collection. """Messages for a collection.
Args: Args:
error (str): error when scanning the collection. error (unicode):
found (int): number of publication found in the harvester repository. error when scanning the collection.
url (str): URL used to scan the harvester repository.
title (str): title of the collection. found (int):
number of publication found in the harvester repository.
url (unicode):
URL used to scan the harvester repository.
title (unicode):
title of the collection.
""" """
def __init__(self, error="", found=0, title="", url=""): def __init__(self, error="", found=0, title="", url=""):
...@@ -24,8 +31,9 @@ class MsgCollection(Storage): ...@@ -24,8 +31,9 @@ class MsgCollection(Storage):
def url_hb(self): def url_hb(self):
""" """
Returns: Returns:
str: an URL configures to return a list of record str:
in readable format. an URL configures to return a list of record
in readable format.
""" """
return self.url.replace("of=id", "of=hb") return self.url.replace("of=id", "of=hb")
...@@ -11,9 +11,6 @@ from .checkandfix import CheckException ...@@ -11,9 +11,6 @@ from .checkandfix import CheckException
from plugin_dbui import get_id, UNDEF_ID from plugin_dbui import get_id, UNDEF_ID
MSG_NO_THESIS = "Reject not a thesis record"
class Thesis(Automaton): class Thesis(Automaton):
"""Automaton for thesis. """Automaton for thesis.
......
...@@ -11,9 +11,7 @@ from harvest_tools.articles import ( ...@@ -11,9 +11,7 @@ from harvest_tools.articles import (
MSG_NO_EDITOR, MSG_NO_EDITOR,
MSG_TRANSFORM_PREPRINT) MSG_TRANSFORM_PREPRINT)
from harvest_tools.automaton import ( from harvest_tools.automaton import MSG_INSERT_FAIL
MSG_INSERT_FAIL,
MSG_WELL_FORM_OAI)
from harvest_tools.base import ( from harvest_tools.base import (
MSG_FIX_ORIGIN, MSG_FIX_ORIGIN,
...@@ -29,14 +27,8 @@ from harvest_tools.checkandfix import ( ...@@ -29,14 +27,8 @@ from harvest_tools.checkandfix import (
MSG_NO_MY_AUTHOR, MSG_NO_MY_AUTHOR,
MSG_NO_OAI, MSG_NO_OAI,
MSG_NO_REF, MSG_NO_REF,
MSG_NO_YEAR,
MSG_TEMPORARY_RECORD, MSG_TEMPORARY_RECORD,
MSG_TO_MANY_DATE, MSG_WELL_FORMED_DATE)
MSG_TO_MANY_FAUTHOR,
MSG_TO_MANY_YEAR,
MSG_WELL_FORMED_CONF_DATES,
MSG_WELL_FORMED_DATE,
MSG_WELL_FORMED_EDITOR)
from harvest_tools.preprints import ( from harvest_tools.preprints import (
MSG_PREPRINT_IS_PAPER, MSG_PREPRINT_IS_PAPER,
...@@ -45,7 +37,6 @@ from harvest_tools.preprints import ( ...@@ -45,7 +37,6 @@ from harvest_tools.preprints import (
MSG_PREPRINT_NO_NUMBER) MSG_PREPRINT_NO_NUMBER)
from harvest_tools.reports import MSG_REPORT_NO_NUMBER from harvest_tools.reports import MSG_REPORT_NO_NUMBER
from harvest_tools.thesis import MSG_NO_THESIS
from invenio_tools.base import ( from invenio_tools.base import (
MSG_INV_CONF, MSG_INV_CONF,
...@@ -85,21 +76,13 @@ def messages(): ...@@ -85,21 +76,13 @@ def messages():
T(MSG_NO_OAI), T(MSG_NO_OAI),
T(MSG_NO_PUBLISHER), T(MSG_NO_PUBLISHER),
T(MSG_NO_REF), T(MSG_NO_REF),