Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
f5403c9a
Commit
f5403c9a
authored
Jun 30, 2017
by
LE GAC Renaud
Browse files
Clean module harvester_tools.
parent
6ae3ac92
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
142 additions
and
117 deletions
+142
-117
modules/harvest_tools/automaton.py
modules/harvest_tools/automaton.py
+0
-1
modules/harvest_tools/base.py
modules/harvest_tools/base.py
+31
-14
modules/harvest_tools/checkandfix.py
modules/harvest_tools/checkandfix.py
+15
-31
modules/harvest_tools/factory.py
modules/harvest_tools/factory.py
+36
-19
modules/harvest_tools/msg.py
modules/harvest_tools/msg.py
+43
-23
modules/harvest_tools/msgcollection.py
modules/harvest_tools/msgcollection.py
+14
-6
modules/harvest_tools/thesis.py
modules/harvest_tools/thesis.py
+0
-3
modules/test_tools.py
modules/test_tools.py
+3
-20
No files found.
modules/harvest_tools/automaton.py
View file @
f5403c9a
...
...
@@ -25,7 +25,6 @@ MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM
=
'Select a "team" !!!'
MSG_INSERT_FAIL
=
"Fail to insert the new record in the database."
MSG_WELL_FORM_OAI
=
"Reject OAI is not well formed"
# search collection when using inspirehep
# required for "Hal Hidden"
...
...
modules/harvest_tools/base.py
View file @
f5403c9a
...
...
@@ -19,11 +19,12 @@ def family_name_fr(full_name):
"""Extract the family name when the full name is encoded as ``J. Doe``.
Args:
full_name (
str): author name encoded according to French
typographic rules.
full_name (
unicode):
author name encoded according to French
typographic rules.
Returns:
str: family name
unicode:
family name
"""
return
full_name
[
full_name
.
find
(
' '
)
+
1
:]
...
...
@@ -43,11 +44,20 @@ def learn_my_authors(db,
all keyword arguments have to be defined.
Args:
db (gluon.dal.DAL): database connection.
authors (list): authors names
id_project (int): the identifier of the project in the database.
id_team (int): the identifier of the team in the database.
year (int): the year
db (gluon.dal.DAL):
database connection.
authors (list):
authors names
id_project (int):
the identifier of the project in the database.
id_team (int):
the identifier of the team in the database.
year (int):
the year
"""
# get the list of authors stored in the database
...
...
@@ -97,11 +107,17 @@ def search_synonym(table, fieldname, value, create=False):
The database table must have a field name *synonyms*.
It contains a list of strings.
Args:
table (gluon.DAL.Table): database table.
fieldname (str): field of the database table
identified by its name.
value (str): value to be matched.
create(bool): create a new entry in the database table when
table (gluon.DAL.Table):
database table.
fieldname (unicode):
field of the database table identified by its name.
value (unicode):
value to be matched.
create(bool):
create a new entry in the database table when
it is ``True``
Returns:
...
...
@@ -110,7 +126,8 @@ def search_synonym(table, fieldname, value, create=False):
* UNDEF_ID if value is not defined.
Raises:
ToolException: when more than one synonym is found.
ToolException:
more than one synonym is found.
"""
if
not
value
:
...
...
modules/harvest_tools/checkandfix.py
View file @
f5403c9a
...
...
@@ -4,7 +4,6 @@
"""
import
numpy
as
np
import
re
import
regex
from
.base
import
search_synonym
,
ToolException
from
datetime
import
datetime
...
...
@@ -29,7 +28,6 @@ DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")
# Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_DD_MMM_YYYY
=
re
.
compile
(
r
"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})"
)
DECODE_DD_MM_YYYY
=
re
.
compile
(
r
"(\d{1,2}) +(\d{1,2}) +(\d{4})"
)
DECODE_YYYY
=
re
.
compile
(
r
"^(\d{4})$"
)
MONTHS
=
{
"Jan"
:
"01"
,
"Feb"
:
"02"
,
...
...
@@ -47,41 +45,23 @@ MONTHS = {"Jan": "01",
"Nov"
:
"11"
,
"Dec"
:
"12"
}
MSG_INVALID_HOST
=
"Invalid host"
MSG_NO_AUTHOR
=
"Reject no author(s)"
MSG_NO_CONF_DATE
=
"Reject no conference date"
MSG_NO_DATE
=
"Reject no submission date"
MSG_NO_MY_AUTHOR
=
"Reject no authors of my institute"
MSG_NO_OAI
=
"Reject no OAI identifier"
MSG_NO_REF
=
"Reject incomplete paper reference. Check "
MSG_NO_YEAR
=
"Reject no publication year"
MSG_TEMPORARY_RECORD
=
"Temporary record"
MSG_TO_MANY_DATE
=
"Reject to many submit date"
MSG_TO_MANY_FAUTHOR
=
"Reject to many first author"
MSG_TO_MANY_YEAR
=
"Reject to many year"
MSG_WELL_FORMED_CONF_DATES
=
"Reject conference dates is not well formed"
MSG_WELL_FORMED_DATE
=
"Reject submission date is not well formed"
MSG_WELL_FORMED_EDITOR
=
"Reject editor is not well formed"
OAI_INVENIO
=
"oai:%s:%s"
REG_COLLABORATION
=
re
.
compile
(
regex
.
REG_COLLABORATION
)
REG_CONF_DATES_1
=
re
.
compile
(
"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})"
)
REG_CONF_DATES_2
=
\
re
.
compile
(
"(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})"
)
REG_CONF_DATES
=
re
.
compile
(
regex
.
REG_CONF_DATES
)
REG_DOI
=
re
.
compile
(
r
"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)"
)
REG_SUBMITTED
=
re
.
compile
(
regex
.
REG_SUBMITTED
)
REG_WELL_FORMED_CONF_DATES_1
=
re
.
compile
(
"\d{2} - \d{2} [A-Z][a-z]{2} \d{4}"
)
REG_WELL_FORMED_CONF_DATES_2
=
\
...
...
@@ -324,7 +304,7 @@ class CheckAndFix(object):
val
=
u
""
if
isinstance
(
record
,
RecordConf
):
opening
,
closing
=
self
.
_get_conference_dates
(
record
)
opening
=
self
.
_get_conference_dates
(
record
)
[
0
]
val
=
opening
.
strftime
(
"%Y-%m-%d"
)
elif
isinstance
(
record
,
RecordThesis
):
...
...
@@ -519,7 +499,9 @@ class CheckAndFix(object):
"""Format the author names.
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
fmt (str):
define the format for author names.
Possible values are "First, Last", "F. Last", "Last",
...
...
@@ -634,7 +616,7 @@ class CheckAndFix(object):
Raises:
CheckException:
when
the list is empty
the list is empty
"""
if
self
.
dbg
:
...
...
@@ -787,7 +769,7 @@ class CheckAndFix(object):
Raises:
CheckException:
when
the paper reference is not well formed.
the paper reference is not well formed.
"""
if
self
.
dbg
:
...
...
@@ -849,7 +831,7 @@ class CheckAndFix(object):
Raises:
CheckException:
when
the publisher is not defined nor entered as a synonym.
the publisher is not defined nor entered as a synonym.
"""
if
self
.
dbg
:
...
...
@@ -880,8 +862,8 @@ class CheckAndFix(object):
Raises:
CheckException:
when
the date is not well formed
or when more
than one date are found.
*
the date is not well formed
* more
than one date are found.
"""
if
self
.
dbg
:
...
...
@@ -927,10 +909,12 @@ class CheckAndFix(object):
"""Some records are marked temporary.
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
Raises:
CheckException: when the record is marked temporary
CheckException:
the record is marked temporary
"""
if
self
.
dbg
:
...
...
modules/harvest_tools/factory.py
View file @
f5403c9a
...
...
@@ -23,24 +23,39 @@ def build_harvester_tool(db,
Harvest tool factory function.
Args:
db (gluon.dal.DAL): database connection.
id_team (int): the identifier of the team in the database.
id_project (int): the identifier of the project in the database.
automaton (str): the name of the automaton which
will be used to process the data. Possible values are:
``articles``, ``notes``, ``preprints``, ``proceedings``,
``reports``, ``talks`` and ``theses``.
id_category (int): the identifier of the publication category,
*e.g.* ACL, ACTI, ...
year_start (int): starting year for the scan.
year_end (int): ending year for the scan.
dry_run (bool): new records are not inserted in the database
when ``True``.
debug (bool): activate the verbose mode.
db (gluon.dal.DAL):
database connection.
id_team (int):
the identifier of the team in the database.
id_project (int):
the identifier of the project in the database.
automaton (unicode):
the name of the automaton which will be used to process the data.
Possible values are ``articles``, ``notes``, ``preprints``,
``proceedings``, ``reports``, ``talks`` and ``theses``.
id_category (int):
the identifier of the publication category, *e.g.* ACL, ACTI, ...
year_start (int):
starting year for the scan.
year_end (int):
ending year for the scan.
dry_run (bool):
new records are not inserted in the database when ``True``.
debug (bool):
activate the verbose mode.
Returns:
Automaton: returns the appropriate harvester automaton or
``None`` if no factory exist for the specified automaton.
Automaton:
* the appropriate harvester automaton.
* ``None`` if no factory exists for the specified automaton.
"""
tool_class
=
get_harvester_tool
(
automaton
)
...
...
@@ -75,11 +90,13 @@ def get_harvester_tool(automaton):
* theses
Args:
automaton (str): name of the automaton
automaton (unicode):
name of the automaton
Returns:
Automaton: class reference or ``None``. The latter happens
when the automaton corresponds to nothing.
Automaton:
* class reference
* ``None`` when the automaton corresponds to nothing.
"""
if
automaton
==
"articles"
:
...
...
modules/harvest_tools/msg.py
View file @
f5403c9a
...
...
@@ -26,12 +26,17 @@ class Msg(Storage):
* *reject* the record is rejected.
Args:
collection (str): the harvester collection used to
search the record.
harvester (gluon.dal.Row): the database harvester used to scan the
store.
record_id (int): the record identifier in the store.
title (str): the title of the publication.
collection (unicode):
the harvester collection used to search the record.
harvester (gluon.dal.Row):
the database harvester used to scan the store.
record_id (int):
the record identifier in the store.
title (unicode):
the title of the publication.
"""
def
__init__
(
self
,
...
...
@@ -59,10 +64,14 @@ class Msg(Storage):
"""Set the action as *idle* and the explanation as ``txt``.
Args:
txt (str): message associated to the action.
year (str): year of the publication
translate (bool): translate the message according to the
current language.
txt (unicode):
message associated to the action.
year (unicode):
year of the publication
translate (bool):
translate the message according to the current language.
"""
self
.
action
=
"idle"
...
...
@@ -72,10 +81,14 @@ class Msg(Storage):
"""Set the action as *load* and the explanation as ``txt``.
Args:
txt (str): message associated to the action.
year (str): year of the publication
translate (bool): translate the message according to the
current language.
txt (unicode):
message associated to the action.
year (unicode):
year of the publication
translate (bool):
translate the message according to the current language.
"""
self
.
action
=
"load"
...
...
@@ -85,10 +98,14 @@ class Msg(Storage):
"""Set the action as *modify* and the explanation as ``txt``.
Args:
txt (str): message associated to the action.
year (str): year of the publication
translate (bool): translate the message according to the
current language.
txt (unicode):
message associated to the action.
year (unicode):
year of the publication
translate (bool):
translate the message according to the current language.
"""
self
.
action
=
"modify"
...
...
@@ -98,11 +115,14 @@ class Msg(Storage):
"""Set the action as *reject* and the explanation as ``txt``.
Args:
txt (str): message associated to the action.
txt (unicode):
message associated to the action.
year (str): year of the publication
year (unicode):
year of the publication
record (RecordPubli): the record on which the action is applied.
record (RecordPubli):
the record on which the action is applied.
It is used to determine the synonym value when the
*collaboration*, *country* or *publisher* data is not
understood.
...
...
@@ -111,8 +131,8 @@ class Msg(Storage):
The *year* argument is not needed when
the *record* is specified.
translate (bool):
translate the message according to the
current language.
translate (bool):
translate the message according to the
current language.
"""
self
.
action
=
"reject"
...
...
modules/harvest_tools/msgcollection.py
View file @
f5403c9a
...
...
@@ -8,10 +8,17 @@ class MsgCollection(Storage):
"""Messages for a collection.
Args:
error (str): error when scanning the collection.
found (int): number of publication found in the harvester repository.
url (str): URL used to scan the harvester repository.
title (str): title of the collection.
error (unicode):
error when scanning the collection.
found (int):
number of publications found in the harvester repository.
url (unicode):
URL used to scan the harvester repository.
title (unicode):
title of the collection.
"""
def
__init__
(
self
,
error
=
""
,
found
=
0
,
title
=
""
,
url
=
""
):
...
...
@@ -24,8 +31,9 @@ class MsgCollection(Storage):
def
url_hb
(
self
):
"""
Returns:
str: an URL configures to return a list of record
in readable format.
str:
a URL configured to return a list of records
in readable format.
"""
return
self
.
url
.
replace
(
"of=id"
,
"of=hb"
)
modules/harvest_tools/thesis.py
View file @
f5403c9a
...
...
@@ -11,9 +11,6 @@ from .checkandfix import CheckException
from
plugin_dbui
import
get_id
,
UNDEF_ID
MSG_NO_THESIS
=
"Reject not a thesis record"
class
Thesis
(
Automaton
):
"""Automaton for thesis.
...
...
modules/test_tools.py
View file @
f5403c9a
...
...
@@ -11,9 +11,7 @@ from harvest_tools.articles import (
MSG_NO_EDITOR
,
MSG_TRANSFORM_PREPRINT
)
from
harvest_tools.automaton
import
(
MSG_INSERT_FAIL
,
MSG_WELL_FORM_OAI
)
from
harvest_tools.automaton
import
MSG_INSERT_FAIL
from
harvest_tools.base
import
(
MSG_FIX_ORIGIN
,
...
...
@@ -29,14 +27,8 @@ from harvest_tools.checkandfix import (
MSG_NO_MY_AUTHOR
,
MSG_NO_OAI
,
MSG_NO_REF
,
MSG_NO_YEAR
,
MSG_TEMPORARY_RECORD
,
MSG_TO_MANY_DATE
,
MSG_TO_MANY_FAUTHOR
,
MSG_TO_MANY_YEAR
,
MSG_WELL_FORMED_CONF_DATES
,
MSG_WELL_FORMED_DATE
,
MSG_WELL_FORMED_EDITOR
)
MSG_WELL_FORMED_DATE
)
from
harvest_tools.preprints
import
(
MSG_PREPRINT_IS_PAPER
,
...
...
@@ -45,7 +37,6 @@ from harvest_tools.preprints import (
MSG_PREPRINT_NO_NUMBER
)
from
harvest_tools.reports
import
MSG_REPORT_NO_NUMBER
from
harvest_tools.thesis
import
MSG_NO_THESIS
from
invenio_tools.base
import
(
MSG_INV_CONF
,
...
...
@@ -85,21 +76,13 @@ def messages():
T
(
MSG_NO_OAI
),
T
(
MSG_NO_PUBLISHER
),
T
(
MSG_NO_REF
),
T
(
MSG_NO_THESIS
),
T
(
MSG_NO_YEAR
),
T
(
MSG_PREPRINT_IS_PAPER
),
T
(
MSG_PREPRINT_IS_CONFERENCE
),
T
(
MSG_PREPRINT_IS_THESIS
),
T
(
MSG_PREPRINT_NO_NUMBER
),
T
(
MSG_REPORT_NO_NUMBER
),
T
(
MSG_TEMPORARY_RECORD
),
T
(
MSG_TO_MANY_DATE
),
T
(
MSG_TO_MANY_FAUTHOR
),
T
(
MSG_TO_MANY_YEAR
),
T
(
MSG_WELL_FORMED_COLLABORATION
),
T
(
MSG_WELL_FORMED_CONF_DATES
),
T
(
MSG_WELL_FORMED_DATE
),
T
(
MSG_WELL_FORMED_EDITOR
),
T
(
MSG_WELL_FORM_OAI
)}
T
(
MSG_WELL_FORMED_DATE
)}
return
set_msgs
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment