Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
517343f6
Commit
517343f6
authored
Sep 13, 2014
by
LE GAC Renaud
Browse files
Revisit the naming of the classes in invenio and harvester tools.
Polish documentation.
parent
8280655d
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
127 additions
and
118 deletions
+127
-118
controllers/harvest.py
controllers/harvest.py
+4
-4
modules/harvest_tools.py
modules/harvest_tools.py
+78
-77
modules/invenio_tools.py
modules/invenio_tools.py
+41
-35
modules/list_postprocessing.py
modules/list_postprocessing.py
+1
-1
static/CHANGELOG
static/CHANGELOG
+3
-1
No files found.
controllers/harvest.py
View file @
517343f6
...
...
@@ -43,7 +43,7 @@ def free_run():
for
el
in
fields
:
tool
.
harvester
[
el
]
=
selector
[
el
]
tool
.
process
()
tool
()
except
ToolException
,
e
:
return
T
(
str
(
e
))
...
...
@@ -70,7 +70,7 @@ def insert_marcxml():
return
INLINE_ALERT
%
(
T
(
'Error'
),
T
(
'Select a controller.'
))
tool
=
tool_class
(
db
,
selector
,
debug
=
False
)
tool
.
process
()
tool
()
except
ToolException
,
e
:
return
T
(
str
(
e
))
...
...
@@ -107,7 +107,7 @@ def run():
return
INLINE_ALERT
%
(
T
(
'Error'
),
T
(
'Select an harvester.'
))
tool
=
tool_class
(
db
,
selector
,
debug
=
False
)
tool
.
process
()
tool
()
except
ToolException
,
e
:
return
T
(
str
(
e
))
...
...
@@ -157,7 +157,7 @@ def run_all():
tool_class
=
get_harvester_tool
(
selector
.
controller
)
tool
=
tool_class
(
db
,
selector
,
debug
=
False
)
tool
.
process
()
tool
()
collection_logs
.
extend
(
tool
.
collection_logs
)
logs
.
extend
(
tool
.
logs
)
...
...
modules/harvest_tools.py
View file @
517343f6
...
...
@@ -15,9 +15,9 @@ import re
from
gluon
import
current
from
gluon.storage
import
Storage
from
invenio_tools
import
(
OAI_URL
,
C
dsSvc
,
CheckAndFixSvc
,
Marc12
Svc
)
C
heckAndFix
,
InvenioStore
,
Marc12
)
from
plugin_dbui
import
(
UNDEF_ID
,
UNKNOWN
,
get_create_id
,
...
...
@@ -31,7 +31,6 @@ MSG_FIX_ORIGIN = current.T("Fixed the origin field", lazy=False)
MSG_IN_DB
=
current
.
T
(
"Already in the database"
,
lazy
=
False
)
MSG_LOAD
=
current
.
T
(
"Load in the database"
,
lazy
=
False
)
MSG_MATCH
=
current
.
T
(
"Reject the talk match a proceeding"
,
lazy
=
False
)
MSG_NO_AUTHOR
=
current
.
T
(
"Reject no authors"
,
lazy
=
False
)
MSG_NO_CAT
=
current
.
T
(
'Select a "category" !!!'
,
lazy
=
False
)
MSG_NO_CPPM_AUTHOR
=
current
.
T
(
"Reject no CPPM authors"
,
lazy
=
False
)
MSG_NO_EDITOR
=
current
.
T
(
"Reject article is not published"
,
lazy
=
False
)
...
...
@@ -442,8 +441,8 @@ class PublicationsTool(object):
self
.
harvester
=
None
self
.
logs
=
[]
self
.
check
_me
=
CheckAndFix
Svc
()
self
.
marc12
=
Marc12
Svc
()
self
.
check
=
CheckAndFix
()
self
.
marc12
=
Marc12
()
self
.
selector
=
selector
...
...
@@ -460,7 +459,7 @@ class PublicationsTool(object):
@rtype: dict
@return: the keys are a subset of those defined in
L{invenio_tools.
CdsSvc
.get_ids}.
L{invenio_tools.
InvenioStore
.get_ids}.
"""
selector
=
self
.
selector
...
...
@@ -659,67 +658,17 @@ class PublicationsTool(object):
return
0
def
process
(
self
):
"""Launch the search in the invenio store according to search
criteria and load publications in the database.
@raise ToolException: when projet, team or category identifier
are not defined.
"""
selector
=
self
.
selector
if
self
.
dbg
:
print
"start processing"
,
self
.
__class__
.
__name__
print
"decode request"
# protection team, project and/or category have to be defined
if
not
selector
.
id_projects
:
raise
ToolException
(
MSG_NO_PROJECT
)
if
not
selector
.
id_teams
:
raise
ToolException
(
MSG_NO_TEAM
)
if
selector
.
xml
and
not
selector
.
id_categories
:
raise
ToolException
(
MSG_NO_CAT
)
if
self
.
dbg
:
print
"get harvest parameters"
# process an XML request
if
selector
.
xml
:
self
.
harvester
=
Storage
(
controller
=
selector
.
controller
,
id_categories
=
selector
.
id_categories
,
id_projects
=
selector
.
id_projects
,
id_teams
=
selector
.
id_teams
)
self
.
collection_logs
.
append
(
MsgCollection
(
found
=
1
))
self
.
process_xml
(
selector
.
xml
)
return
# retrieve the harvester parameter in the database
# if not yet defined (free run)
if
not
self
.
harvester
:
row
=
selector
.
select
(
self
.
db
.
harvesters
).
first
()
if
not
row
:
raise
ToolException
(
MSG_NO_HARVESTER
)
self
.
harvester
=
row
.
harvesters
# retrieve records in the store and load them in the database
self
.
process_url
()
def
process_url
(
self
):
"""Retrieve the MARC XML records from the store
and load them in the database.
"""Retrieve the MARC XML string and launch its decoding.
@raise Exception: depending on what happens, can be StoreException,
Marc12Exception, ...
"""
if
self
.
dbg
:
print
"process URL search"
cds
=
CdsSvc
(
host
=
self
.
harvester
.
host
)
store
=
InvenioStore
(
self
.
harvester
.
host
)
# list of collections
collections
=
self
.
harvester
.
collections
...
...
@@ -747,14 +696,14 @@ class PublicationsTool(object):
kwargs
=
self
.
_search_parameters
(
collection
)
try
:
ids
=
cds
.
get_ids
(
**
kwargs
)
ids
=
store
.
get_ids
(
**
kwargs
)
except
Exception
as
error
:
self
.
collection_logs
[
-
1
].
url
=
cds
.
last_search_url
()
self
.
collection_logs
[
-
1
].
url
=
store
.
last_search_url
()
self
.
collection_logs
[
-
1
].
error
=
error
continue
self
.
collection_logs
[
-
1
].
url
=
cds
.
last_search_url
()
self
.
collection_logs
[
-
1
].
url
=
store
.
last_search_url
()
self
.
collection_logs
[
-
1
].
found
=
len
(
ids
)
if
not
ids
:
...
...
@@ -769,7 +718,7 @@ class PublicationsTool(object):
print
"
\n
processing record"
,
id
try
:
xml
=
cds
.
get_record
(
id
)
xml
=
store
.
get_record
(
id
)
self
.
process_xml
(
xml
)
except
Exception
as
error
:
...
...
@@ -779,7 +728,6 @@ class PublicationsTool(object):
self
.
logs
[
-
1
].
reject
(
error
)
return
def
process_xml
(
self
,
xml
):
"""Decode the MARC XML string and load records in the database.
...
...
@@ -791,7 +739,7 @@ class PublicationsTool(object):
if
self
.
dbg
:
print
"process xml record"
li
=
self
.
marc12
.
process
(
xml
)
li
=
self
.
marc12
(
xml
)
for
record
in
li
:
...
...
@@ -803,7 +751,7 @@ class PublicationsTool(object):
self
.
logs
[
-
1
].
title
=
record
.
title
()
self
.
logs
[
-
1
].
collection
=
self
.
collection_logs
[
-
1
].
title
self
.
check
_me
(
record
,
format_author_fr
)
self
.
check
(
record
,
format_author_fr
)
if
record
.
is_valid
:
self
.
logs
[
-
1
].
year
=
record
.
year
()
...
...
@@ -845,6 +793,64 @@ class PublicationsTool(object):
selector
=
self
.
selector
)
def
__call__
(
self
):
"""Search publication in the invenio store according to criteria
and load them in the database.
@raise Exception: the type of exception depends on what happens:
- L{ToolException} when project, team or category identifiers
are not defined.
- C{StoreException} when something goes wrong querying the store.
- C{Marc12Exception} when something goes wrong decoding the XML
string returned by the store
- C{CheckException} if the L{Record} is not valid
- C{Exception} if the python code crashes
"""
selector
=
self
.
selector
if
self
.
dbg
:
print
"start processing"
,
self
.
__class__
.
__name__
print
"decode request"
# protection team, project and/or category have to be defined
if
not
selector
.
id_projects
:
raise
ToolException
(
MSG_NO_PROJECT
)
if
not
selector
.
id_teams
:
raise
ToolException
(
MSG_NO_TEAM
)
if
selector
.
xml
and
not
selector
.
id_categories
:
raise
ToolException
(
MSG_NO_CAT
)
if
self
.
dbg
:
print
"get harvest parameters"
# process an XML request
if
selector
.
xml
:
self
.
harvester
=
Storage
(
controller
=
selector
.
controller
,
id_categories
=
selector
.
id_categories
,
id_projects
=
selector
.
id_projects
,
id_teams
=
selector
.
id_teams
)
self
.
collection_logs
.
append
(
MsgCollection
(
found
=
1
))
self
.
process_xml
(
selector
.
xml
)
return
# retrieve the harvester parameter in the database
# if not yet defined (free run)
if
not
self
.
harvester
:
row
=
selector
.
select
(
self
.
db
.
harvesters
).
first
()
if
not
row
:
raise
ToolException
(
MSG_NO_HARVESTER
)
self
.
harvester
=
row
.
harvesters
# retrieve records in the store and load them in the database
self
.
process_url
()
class
Articles
(
PublicationsTool
):
"""Publications tool for articles.
...
...
@@ -1213,11 +1219,6 @@ class Preprints(PublicationsTool):
# check the collaboration
id_collaboration
=
self
.
check_collaboration
(
record
.
collaboration
())
# Protection to only keep preprints with authors
if
not
first_author
:
self
.
logs
[
-
1
].
reject
(
MSG_NO_AUTHOR
)
return
0
# check against preprint or article already published
id
,
status
=
self
.
check_by_origin
(
oai_url
=
oai_url
)
if
id
:
...
...
@@ -1271,11 +1272,11 @@ class Preprints(PublicationsTool):
self
.
logs
[
-
1
].
reject
(
MSG_PREPRINT_IS_PAPER
)
return
False
if
record
.
is_conference_
record
():
if
record
.
is_conference_
data
():
self
.
logs
[
-
1
].
reject
(
MSG_PREPRINT_IS_CONFERENCE
)
return
False
if
record
.
is_thesis
_record
():
if
record
.
is_thesis
():
self
.
logs
[
-
1
].
reject
(
MSG_PREPRINT_IS_THESIS
)
return
False
...
...
@@ -1971,7 +1972,7 @@ class Thesis(PublicationsTool):
@param record:
"""
if
not
record
.
is_thesis
_record
():
if
not
record
.
is_thesis
():
self
.
logs
[
-
1
].
reject
(
MSG_NO_THESIS
)
return
False
...
...
modules/invenio_tools.py
View file @
517343f6
...
...
@@ -32,8 +32,8 @@ CDS_SEARCH_KEYS = ('req', 'cc', 'c', 'ec', 'p', 'f', 'rg', 'sf', 'so', 'sp',
'verbose'
,
'ap'
,
'ln'
,
'ec'
)
# Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_
SUBMITTED_MMM
=
re
.
compile
(
r
"(\d{1,2}) +([A-Za-z]{3}) +(\d{4})"
)
DECODE_
SUBMITTED_MM
=
re
.
compile
(
r
"(\d{1,2}) +(\d{1,2}) +(\d{4})"
)
DECODE_
DD_MMM_YYYY
=
re
.
compile
(
r
"(\d{1,2}) +([A-Za-z]{3}) +(\d{4})"
)
DECODE_
DD_MM_YYYY
=
re
.
compile
(
r
"(\d{1,2}) +(\d{1,2}) +(\d{4})"
)
# Decode publication reference:
# Phys. Rev. Lett. 113, 032001 (2014)
...
...
@@ -121,12 +121,13 @@ def get_origin_data(xml):
return
(
m
.
group
(
1
),
m
.
group
(
2
))
class
CheckAndFixSvc
(
object
):
"""Service to check and fix the Marc12 record:
- Check if the record is valid and mark invalid one.
- Detailed information on conference are added.
- Fix as far as possible inconsistencies.
class
CheckAndFix
(
object
):
"""Tool to check and fix the content of the Marc12
record:
- Check the validity of the record.
- Conference information is added for talks and proceedings.
- Fix as far as possible inconsistencies and stupid mistakes.
"""
def
_check_authors
(
self
,
record
):
...
...
@@ -217,6 +218,10 @@ class CheckAndFixSvc(object):
@raise CheckException:
"""
# no submitted date for thesis only a defence date
if
record
.
is_thesis
():
return
if
"269"
not
in
record
:
raise
CheckException
(
MSG_NO_DATE
)
...
...
@@ -233,12 +238,12 @@ class CheckAndFixSvc(object):
date
=
dates
[
i
][
"c"
]
m
=
DECODE_
SUBMITTED_MMM
.
match
(
date
)
m
=
DECODE_
DD_MMM_YYYY
.
match
(
date
)
if
m
:
dates
[
i
][
"c"
]
=
'%s-%s-%02i'
%
(
m
.
group
(
3
),
MONTHS
[
m
.
group
(
2
)],
int
(
m
.
group
(
1
)))
continue
m
=
DECODE_
SUBMITTED_MM
.
match
(
date
)
m
=
DECODE_
DD_MM_YYYY
.
match
(
date
)
if
m
:
dates
[
i
][
"c"
]
=
'%s-%02i-%02i'
%
(
m
.
group
(
3
),
int
(
m
.
group
(
2
)),
int
(
m
.
group
(
1
)))
continue
...
...
@@ -416,7 +421,7 @@ class CheckAndFixSvc(object):
else
:
raise
CheckException
(
MSG_NO_REF
)
def
_format_authors
(
self
,
record
,
func
):
"""Format the author names using the function func.
...
...
@@ -500,7 +505,7 @@ class CheckAndFixSvc(object):
@param host:
@type id: unicode
@para
p
m id:
@param id:
@type key: unicode
@param key:
...
...
@@ -511,13 +516,13 @@ class CheckAndFixSvc(object):
@raise CheckException:
"""
cds
=
CdsSvc
(
host
)
marc12
=
Marc12
Svc
()
cds
=
InvenioStore
(
host
)
marc12
=
Marc12
()
# search the conference by id the preferred method
if
id
:
xml
=
cds
.
get_record
(
id
)
for
conference
in
marc12
.
process
(
xml
):
for
conference
in
marc12
(
xml
):
if
conference
.
id
()
==
id
:
return
conference
...
...
@@ -536,7 +541,7 @@ class CheckAndFixSvc(object):
for
id
in
ids
:
xml
=
cds
.
get_record
(
id
)
for
conference
in
marc12
.
process
(
xml
):
for
conference
in
marc12
(
xml
):
if
conference
.
conference_key
()
==
key
:
return
conference
...
...
@@ -546,10 +551,10 @@ class CheckAndFixSvc(object):
def
_oldest_year
(
self
,
li
):
"""Helper function.
@type li: list or str
ing
@type li: list or str
@param li: list of years
@rtype: str
ing
@rtype: str
@return: the oldest year or empty string when not defined
"""
...
...
@@ -602,14 +607,17 @@ class CheckAndFixSvc(object):
record
.
is_valid
=
False
record
.
msg
=
"Crash %s"
%
e
print
traceback
.
format_exc
()
print
traceback
.
format_stack
()
class
CdsSvc
(
object
):
"""
Service to interrogate
U{invenio<http://invenio-software.org/>} store.
class
InvenioStore
(
object
):
"""
Class to dialogue with
U{invenio<http://invenio-software.org/>} store.
In the dialogue between CdsSvc and the invenio store, the request is provided by
an URL while the response is an XML string
compliant with the U{MARC<http://www.loc.gov/marc/>} standard.
In the dialogue:
- the request is an URL
- the response is an XML string which is compliant with the
U{MARC<http://www.loc.gov/marc/>} standard.
"""
...
...
@@ -784,7 +792,7 @@ class CdsSvc(object):
@rtype: unicode
@return: the XML string is compliant with
the U{MARC<http://www.loc.gov/marc/>} standard.
Use L{Marc12
Svc.process
} to decode it.
Use L{Marc12
.__call__
} to decode it.
@raise CdsException: when the server returns an HTTP error.
...
...
@@ -1074,11 +1082,11 @@ class CdsSvc(object):
so
=
so
)
class
Marc12
Svc
(
object
):
"""
Service to decode record
string encoded with the
class
Marc12
(
object
):
"""
Decode the XML
string encoded with the
U{MARC<http://www.loc.gov/marc>} format.
The main method L{
process
} analyses the XML string
The main method L{
__call__
} analyses the XML string
which has the following structure::
<?xml version="1.0" encoding="UTF-8"?>
...
...
@@ -1256,10 +1264,8 @@ class Marc12Svc(object):
return
True
def
process
(
self
,
xml
,
filter
=
None
,
func
=
None
):
"""Transform the I{<record>} nodes of the XML string
into a list of L{Record}.
def
__call__
(
self
,
xml
,
filter
=
None
,
func
=
None
):
"""Transform the XML string into a list of L{Record}.
@type xml: unicode
@param xml: the XML string has the following structure::
...
...
@@ -1747,8 +1753,8 @@ class Record(dict):
return
True
def
is_thesis
_record
(
self
):
"""C{True} when the record
is
a thesis.
def
is_thesis
(
self
):
"""C{True} when the record
corresponds to
a thesis.
@rtype: bool
@return:
...
...
@@ -2182,8 +2188,8 @@ def print_talk(record):
if
__name__
==
"__main__"
:
csv
=
CdsSvc
()
msv
=
Marc12
Svc
()
csv
=
InvenioStore
()
msv
=
Marc12
()
# papers
xml
=
csv
.
search_year
(
'LHCb Papers'
,
'2010'
,
rg
=
100
)
...
...
modules/list_postprocessing.py
View file @
517343f6
...
...
@@ -110,7 +110,7 @@ def highlight_cppm_speaker(value, template, record):
def
remove_undef
(
value
,
template
,
record
):
"""Remove the
L{plugin_dbui.
UNDEF} string.
"""Remove the
C{
UNDEF} string.
@type value: unicode
@param value: the current string representing the record
...
...
static/CHANGELOG
View file @
517343f6
...
...
@@ -2,9 +2,11 @@
HEAD
- Modify the logic of the harvester by introducing the class CheckAndFixSvc.
- Consolidate harvesters software.
- Modify the logic of the harvester by introducing the class CheckAndFix.
Validation and correction of each record are performed in only one place.
Should improve code stability and maintenance.
- Review class naming of the invenio and harvester tools modules.
0.8.7.2 (Sep 2014)
- Migrate to plugin_dbui 0.6.1.7.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment