Commit f5403c9a authored by LE GAC Renaud
Browse files

Clean module harvester_tools.

parent 6ae3ac92
......@@ -25,7 +25,6 @@ MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM = 'Select a "team" !!!'
MSG_INSERT_FAIL = "Fail to insert the new record in the database."
MSG_WELL_FORM_OAI = "Reject OAI is not well formed"
# search collection when using inspirehep
# require for "Hal Hidden"
......
......@@ -19,11 +19,12 @@ def family_name_fr(full_name):
"""Extract the family name when the full name is encoded as ``J. Doe``.
Args:
full_name (str): author name encoded according to French
typographic rules.
full_name (unicode):
author name encoded according to French typographic rules.
Returns:
str: family name
unicode:
family name
"""
return full_name[full_name.find(' ') + 1:]
......@@ -43,11 +44,20 @@ def learn_my_authors(db,
all keyword arguments have to be defined.
Args:
db (gluon.dal.DAL): database connection.
authors (list): authors names
id_project (int): the identifier of the project in the database.
id_team (int): the identifier of the team in the database.
year (int): the year
db (gluon.dal.DAL):
database connection.
authors (list):
authors names
id_project (int):
the identifier of the project in the database.
id_team (int):
the identifier of the team in the database.
year (int):
the year
"""
# get the list of authors stored in the database
......@@ -97,11 +107,17 @@ def search_synonym(table, fieldname, value, create=False):
The database table must have a field name *synonyms*.
It contains a list of strings.
Args:
table (gluon.DAL.Table): database table.
fieldname (str): field of the database table
identified by its name.
value (str): value to be matched.
create(bool): create a new entry in the database table when
table (gluon.DAL.Table):
database table.
fieldname (unicode):
field of the database table identified by its name.
value (unicode):
value to be matched.
create(bool):
create a new entry in the database table when
it is ``True``
Returns:
......@@ -110,7 +126,8 @@ def search_synonym(table, fieldname, value, create=False):
* UNDEF_ID if value is not defined.
Raises:
ToolException: when more than one synonym is found.
ToolException:
more than one synonym is found.
"""
if not value:
......
......@@ -4,7 +4,6 @@
"""
import numpy as np
import re
import regex
from .base import search_synonym, ToolException
from datetime import datetime
......@@ -29,7 +28,6 @@ DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")
# Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")
DECODE_YYYY = re.compile(r"^(\d{4})$")
MONTHS = {"Jan": "01",
"Feb": "02",
......@@ -47,41 +45,23 @@ MONTHS = {"Jan": "01",
"Nov": "11",
"Dec": "12"}
MSG_INVALID_HOST = "Invalid host"
MSG_NO_AUTHOR = "Reject no author(s)"
MSG_NO_CONF_DATE = "Reject no conference date"
MSG_NO_DATE = "Reject no submission date"
MSG_NO_MY_AUTHOR = "Reject no authors of my institute"
MSG_NO_OAI = "Reject no OAI identifier"
MSG_NO_REF = "Reject incomplete paper reference. Check "
MSG_NO_YEAR = "Reject no publication year"
MSG_TEMPORARY_RECORD = "Temporary record"
MSG_TO_MANY_DATE = "Reject to many submit date"
MSG_TO_MANY_FAUTHOR = "Reject to many first author"
MSG_TO_MANY_YEAR = "Reject to many year"
MSG_WELL_FORMED_CONF_DATES = "Reject conference dates is not well formed"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
MSG_WELL_FORMED_EDITOR = "Reject editor is not well formed"
OAI_INVENIO = "oai:%s:%s"
REG_COLLABORATION = re.compile(regex.REG_COLLABORATION)
REG_CONF_DATES_1 = re.compile("(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES_2 = \
re.compile("(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})")
REG_CONF_DATES = re.compile(regex.REG_CONF_DATES)
REG_DOI = re.compile(r"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)")
REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)
REG_WELL_FORMED_CONF_DATES_1 = re.compile("\d{2} - \d{2} [A-Z][a-z]{2} \d{4}")
REG_WELL_FORMED_CONF_DATES_2 = \
......@@ -324,7 +304,7 @@ class CheckAndFix(object):
val = u""
if isinstance(record, RecordConf):
opening, closing = self._get_conference_dates(record)
opening = self._get_conference_dates(record)[0]
val = opening.strftime("%Y-%m-%d")
elif isinstance(record, RecordThesis):
......@@ -519,7 +499,9 @@ class CheckAndFix(object):
"""Format the author names.
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
fmt (str):
define the format for author names.
Possible values are "First, Last", "F. Last", "Last",
......@@ -634,7 +616,7 @@ class CheckAndFix(object):
Raises:
CheckException:
when the list is empty
the list is empty
"""
if self.dbg:
......@@ -787,7 +769,7 @@ class CheckAndFix(object):
Raises:
CheckException:
when the paper reference is not well formed.
the paper reference is not well formed.
"""
if self.dbg:
......@@ -849,7 +831,7 @@ class CheckAndFix(object):
Raises:
CheckException:
when the publisher is not defined nor entered as a synonym.
the publisher is not defined nor entered as a synonym.
"""
if self.dbg:
......@@ -880,8 +862,8 @@ class CheckAndFix(object):
Raises:
CheckException:
when the date is not well formed or when more
than one date are found.
* the date is not well formed
* more than one date are found.
"""
if self.dbg:
......@@ -927,10 +909,12 @@ class CheckAndFix(object):
"""Some records are marked temporary.
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
Raises:
CheckException: when the record is marked temporary
CheckException:
the record is marked temporary
"""
if self.dbg:
......
......@@ -23,24 +23,39 @@ def build_harvester_tool(db,
Harvest tool factory function.
Args:
db (gluon.dal.DAL): database connection.
id_team (int): the identifier of the team in the database.
id_project (int): the identifier of the project in the database.
automaton (str): the name of the automaton which
will be used to process the data. Possible values are:
``articles``, ``notes``, ``preprints``, ``proceedings``,
``reports``, ``talks`` and ``theses``.
id_category (int): the identifier of the publication category,
*e.g.* ACL, ACTI, ...
year_start (int): starting year for the scan.
year_end (int): ending year for the scan.
dry_run (bool): new records are not inserted in the database
when ``True``.
debug (bool): activate the verbose mode.
db (gluon.dal.DAL):
database connection.
id_team (int):
the identifier of the team in the database.
id_project (int):
the identifier of the project in the database.
automaton (unicode):
the name of the automaton which will be used to process the data.
Possible values are ``articles``, ``notes``, ``preprints``,
``proceedings``, ``reports``, ``talks`` and ``theses``.
id_category (int):
the identifier of the publication category, *e.g.* ACL, ACTI, ...
year_start (int):
starting year for the scan.
year_end (int):
ending year for the scan.
dry_run (bool):
new records are not inserted in the database when ``True``.
debug (bool):
activate the verbose mode.
Returns:
Automaton: returns the appropriate harvester automaton or
``None`` if no factory exist for the specified automaton.
Automaton:
* the appropriate harvester automaton.
* ``None`` if no factory exist for the specified automaton.
"""
tool_class = get_harvester_tool(automaton)
......@@ -75,11 +90,13 @@ def get_harvester_tool(automaton):
* theses
Args:
automaton (str): name of the automaton
automaton (unicode):
name of the automaton
Returns:
Automaton: class reference or ``None``. The latter happens
when the automaton corresponds to nothing.
Automaton:
* class reference
* ``None`` when the automaton corresponds to nothing.
"""
if automaton == "articles":
......
......@@ -26,12 +26,17 @@ class Msg(Storage):
* *reject* the record is rejected.
Args:
collection (str): the harvester collection used to
search the record.
harvester (gluon.dal.Row): the database harvester used to scan the
store.
record_id (int): the record identifier in the store.
title (str): the title of the publication.
collection (unicode):
the harvester collection used to search the record.
harvester (gluon.dal.Row):
the database harvester used to scan the store.
record_id (int):
the record identifier in the store.
title (unicode):
the title of the publication.
"""
def __init__(self,
......@@ -59,10 +64,14 @@ class Msg(Storage):
"""Set the action as *idle* and the explanation as ``txt``.
Args:
txt (str): message associated to the action.
year (str): year of the publication
translate (bool): translate the message according to the
current language.
txt (unicode):
message associated to the action.
year (unicode):
year of the publication
translate (bool):
translate the message according to the current language.
"""
self.action = "idle"
......@@ -72,10 +81,14 @@ class Msg(Storage):
"""Set the action as *load* and the explanation as ``txt``.
Args:
txt (str): message associated to the action.
year (str): year of the publication
translate (bool): translate the message according to the
current language.
txt (unicode):
message associated to the action.
year (unicode):
year of the publication
translate (bool):
translate the message according to the current language.
"""
self.action = "load"
......@@ -85,10 +98,14 @@ class Msg(Storage):
"""Set the action as *modify* and the explanation as ``txt``.
Args:
txt (str): message associated to the action.
year (str): year of the publication
translate (bool): translate the message according to the
current language.
txt (unicode):
message associated to the action.
year (unicode):
year of the publication
translate (bool):
translate the message according to the current language.
"""
self.action = "modify"
......@@ -98,11 +115,14 @@ class Msg(Storage):
"""Set the action as *reject* and the explanation as ``txt``.
Args:
txt (str): message associated to the action.
txt (unicode):
message associated to the action.
year (str): year of the publication
year (unicode):
year of the publication
record (RecordPubli): the record on which the action is applied.
record (RecordPubli):
the record on which the action is applied.
It is used to determine the synonym value when the
*collaboration*, *country* or *publisher* data is not
understood.
......@@ -111,8 +131,8 @@ class Msg(Storage):
The *year* argument is not needed when
the *record* is specified.
translate (bool): translate the message according to the
current language.
translate (bool):
translate the message according to the current language.
"""
self.action = "reject"
......
......@@ -8,10 +8,17 @@ class MsgCollection(Storage):
"""Messages for a collection.
Args:
error (str): error when scanning the collection.
found (int): number of publication found in the harvester repository.
url (str): URL used to scan the harvester repository.
title (str): title of the collection.
error (unicode):
error when scanning the collection.
found (int):
number of publication found in the harvester repository.
url (unicode):
URL used to scan the harvester repository.
title (unicode):
title of the collection.
"""
def __init__(self, error="", found=0, title="", url=""):
......@@ -24,8 +31,9 @@ class MsgCollection(Storage):
def url_hb(self):
"""
Returns:
str: an URL configures to return a list of record
in readable format.
str:
a URL configured to return a list of records
in readable format.
"""
return self.url.replace("of=id", "of=hb")
......@@ -11,9 +11,6 @@ from .checkandfix import CheckException
from plugin_dbui import get_id, UNDEF_ID
MSG_NO_THESIS = "Reject not a thesis record"
class Thesis(Automaton):
"""Automaton for thesis.
......
......@@ -11,9 +11,7 @@ from harvest_tools.articles import (
MSG_NO_EDITOR,
MSG_TRANSFORM_PREPRINT)
from harvest_tools.automaton import (
MSG_INSERT_FAIL,
MSG_WELL_FORM_OAI)
from harvest_tools.automaton import MSG_INSERT_FAIL
from harvest_tools.base import (
MSG_FIX_ORIGIN,
......@@ -29,14 +27,8 @@ from harvest_tools.checkandfix import (
MSG_NO_MY_AUTHOR,
MSG_NO_OAI,
MSG_NO_REF,
MSG_NO_YEAR,
MSG_TEMPORARY_RECORD,
MSG_TO_MANY_DATE,
MSG_TO_MANY_FAUTHOR,
MSG_TO_MANY_YEAR,
MSG_WELL_FORMED_CONF_DATES,
MSG_WELL_FORMED_DATE,
MSG_WELL_FORMED_EDITOR)
MSG_WELL_FORMED_DATE)
from harvest_tools.preprints import (
MSG_PREPRINT_IS_PAPER,
......@@ -45,7 +37,6 @@ from harvest_tools.preprints import (
MSG_PREPRINT_NO_NUMBER)
from harvest_tools.reports import MSG_REPORT_NO_NUMBER
from harvest_tools.thesis import MSG_NO_THESIS
from invenio_tools.base import (
MSG_INV_CONF,
......@@ -85,21 +76,13 @@ def messages():
T(MSG_NO_OAI),
T(MSG_NO_PUBLISHER),
T(MSG_NO_REF),
T(MSG_NO_THESIS),
T(MSG_NO_YEAR),
T(MSG_PREPRINT_IS_PAPER),
T(MSG_PREPRINT_IS_CONFERENCE),
T(MSG_PREPRINT_IS_THESIS),
T(MSG_PREPRINT_NO_NUMBER),
T(MSG_REPORT_NO_NUMBER),
T(MSG_TEMPORARY_RECORD),
T(MSG_TO_MANY_DATE),
T(MSG_TO_MANY_FAUTHOR),
T(MSG_TO_MANY_YEAR),
T(MSG_WELL_FORMED_COLLABORATION),
T(MSG_WELL_FORMED_CONF_DATES),
T(MSG_WELL_FORMED_DATE),
T(MSG_WELL_FORMED_EDITOR),
T(MSG_WELL_FORM_OAI)}
T(MSG_WELL_FORMED_DATE)}
return set_msgs
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment