Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
739e4103
Commit
739e4103
authored
Oct 19, 2015
by
LE GAC Renaud
Browse files
Move the class CheckAndFix from invenio_tools to harvest_tools.
parent
4bd73009
Changes
28
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
176 additions
and
160 deletions
+176
-160
controllers/harvest.py
controllers/harvest.py
+3
-3
modules/harvest_tools/__init__.py
modules/harvest_tools/__init__.py
+3
-2
modules/harvest_tools/articles.py
modules/harvest_tools/articles.py
+1
-1
modules/harvest_tools/automaton.py
modules/harvest_tools/automaton.py
+2
-2
modules/harvest_tools/base.py
modules/harvest_tools/base.py
+1
-4
modules/harvest_tools/checkandfix.py
modules/harvest_tools/checkandfix.py
+13
-33
modules/harvest_tools/exception.py
modules/harvest_tools/exception.py
+13
-0
modules/harvest_tools/notes.py
modules/harvest_tools/notes.py
+1
-1
modules/harvest_tools/preprints.py
modules/harvest_tools/preprints.py
+2
-1
modules/harvest_tools/proceedings.py
modules/harvest_tools/proceedings.py
+1
-1
modules/harvest_tools/reports.py
modules/harvest_tools/reports.py
+1
-1
modules/harvest_tools/talks.py
modules/harvest_tools/talks.py
+1
-1
modules/harvest_tools/thesis.py
modules/harvest_tools/thesis.py
+2
-1
modules/invenio_tools/__init__.py
modules/invenio_tools/__init__.py
+22
-2
modules/invenio_tools/exception.py
modules/invenio_tools/exception.py
+0
-4
scripts/fix-conference-url.py
scripts/fix-conference-url.py
+29
-26
scripts/fix-page-volume.py
scripts/fix-page-volume.py
+42
-39
scripts/fix-publications-url.py
scripts/fix-publications-url.py
+35
-34
tests/harvester/CheckAndFix/test_acl_cds1951625_fix.py
tests/harvester/CheckAndFix/test_acl_cds1951625_fix.py
+2
-2
tests/harvester/CheckAndFix/test_acti_cds1411352_fix.py
tests/harvester/CheckAndFix/test_acti_cds1411352_fix.py
+2
-2
No files found.
controllers/harvest.py
View file @
739e4103
...
...
@@ -7,14 +7,14 @@ import traceback
from
gluon
import
current
from
gluon.restricted
import
RestrictedError
from
harvest_tools
import
(
build_harvester_tool
,
CheckAndFix
,
CheckException
,
DRY_RUN
,
format_author_fr
,
family_name_fr
,
search_synonym
,
ToolException
)
from
invenio_tools
import
(
CheckAndFix
,
CheckException
,
load_record
,
from
invenio_tools
import
(
load_record
,
OAI_URL
,
RecordConf
,
RecordThesis
)
...
...
modules/harvest_tools/__init__.py
View file @
739e4103
...
...
@@ -6,11 +6,12 @@ and to push them in the database.
from
base
import
(
DRY_RUN
,
family_name_fr
,
format_author_fr
,
search_synonym
,
ToolException
)
search_synonym
)
from
automaton
import
Automaton
from
articles
import
Articles
from
checkandfix
import
CheckAndFix
from
exception
import
CheckException
,
ToolException
from
factory
import
build_harvester_tool
,
get_harvester_tool
from
msg
import
Msg
from
msgcollection
import
MsgCollection
...
...
modules/harvest_tools/articles.py
View file @
739e4103
...
...
@@ -13,7 +13,7 @@ from base import (family_name_fr,
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
MSG_LOAD
)
from
invenio_tools
import
CheckException
from
checkandfix
import
CheckException
from
plugin_dbui
import
get_id
,
UNDEF_ID
...
...
modules/harvest_tools/automaton.py
View file @
739e4103
...
...
@@ -10,9 +10,9 @@ from base import (MSG_FIX_ORIGIN,
MSG_IN_DB
,
search_synonym
,
ToolException
)
from
checkandfix
import
CheckAndFix
from
gluon.storage
import
Storage
from
invenio_tools
import
(
CheckAndFix
,
InvenioStore
,
from
invenio_tools
import
(
InvenioStore
,
Marc12
,
OAI_URL
)
from
msg
import
Msg
...
...
modules/harvest_tools/base.py
View file @
739e4103
...
...
@@ -2,6 +2,7 @@
""" harvest_tools.base
"""
from
exception
import
ToolException
from
invenio_tools
import
REG_AUTHOR
from
plugin_dbui
import
get_id
,
UNDEF_ID
...
...
@@ -207,7 +208,3 @@ def search_synonym(table, fieldname, value, create=False):
else
:
msg
=
MSG_TOOMANY_SYNONYM
%
table
.
_tablename
raise
ToolException
(
msg
)
class
ToolException
(
Exception
):
pass
modules/
invenio
_tools/checkandfix.py
→
modules/
harvest
_tools/checkandfix.py
View file @
739e4103
# -*- coding: utf-8 -*-
"""
invenio
_tools.checkandfix
"""
harvest
_tools.checkandfix
"""
import
re
import
regex
from
base
import
(
MSG_NO_CONF
,
MSG_NO_COUNTRY
,
MSG_NO_PUBLISHER
,
MSG_WELL_FORMED_COLLABORATION
,
MSG_NO_THESIS
,
OAI_URL
,
REG_AUTHOR
,
REG_OAI
,
REG_YEAR
)
from
exception
import
CheckException
from
invenio_tools
import
(
load_record
,
MSG_NO_COUNTRY
,
MSG_NO_CONF
,
MSG_NO_PUBLISHER
,
MSG_WELL_FORMED_COLLABORATION
,
MSG_NO_THESIS
,
OAI_URL
,
RecordConf
,
RecordThesis
,
REG_AUTHOR
,
REG_OAI
,
REG_YEAR
)
from
filters
import
CLEAN_REVIEW
from
gluon
import
current
from
inveniostore
import
InvenioStore
from
marc12
import
Marc12
from
plugin_dbui
import
get_id
from
recordconf
import
RecordConf
from
recordthesis
import
RecordThesis
DECODE_ARXIV
=
re
.
compile
(
r
"arXiv:(\d{2})(\d{2})\."
)
...
...
@@ -88,25 +87,6 @@ REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)
UNIVERSITY
=
"University"
def
load_record
(
host
,
record_id
):
"""Helper function to load a single record from an invenio store.
Args:
host (unicode): host of the store.
Possible values are ``cds.cern.ch`` or ``inspirehep.net``.
record_id (integer): the record identifier in the store
Returns:
Record: the decoded record.
"""
store
=
InvenioStore
(
host
)
xml
=
store
.
get_record
(
record_id
)
svc
=
Marc12
()
return
svc
(
xml
)[
0
]
class
CheckAndFix
(
object
):
"""A collection of tools to check and repair the content
of the Marc12 record.
...
...
modules/harvest_tools/exception.py
0 → 100644
View file @
739e4103
# -*- coding: utf-8 -*-
""" harvest_tools.exception
"""
from
invenio_tools
import
ExceptionUTF8
class
CheckException
(
ExceptionUTF8
):
pass
class
ToolException
(
ExceptionUTF8
):
pass
modules/harvest_tools/notes.py
View file @
739e4103
...
...
@@ -7,7 +7,7 @@ import traceback
from
automaton
import
Automaton
from
base
import
family_name_fr
,
format_author_fr
,
MSG_CRASH
,
MSG_LOAD
from
invenio_tools
import
CheckException
from
checkandfix
import
CheckException
from
plugin_dbui
import
UNDEF_ID
...
...
modules/harvest_tools/preprints.py
View file @
739e4103
...
...
@@ -7,7 +7,8 @@ import traceback
from
automaton
import
Automaton
from
base
import
family_name_fr
,
format_author_fr
,
MSG_CRASH
,
MSG_LOAD
from
invenio_tools
import
CheckException
,
RecordConf
,
RecordThesis
from
checkandfix
import
CheckException
from
invenio_tools
import
RecordConf
,
RecordThesis
from
plugin_dbui
import
UNDEF_ID
...
...
modules/harvest_tools/proceedings.py
View file @
739e4103
...
...
@@ -7,7 +7,7 @@ import traceback
from
automaton
import
Automaton
from
base
import
family_name_fr
,
format_author_fr
,
MSG_CRASH
,
MSG_LOAD
from
invenio_tools
import
CheckException
from
checkandfix
import
CheckException
from
plugin_dbui
import
UNDEF_ID
...
...
modules/harvest_tools/reports.py
View file @
739e4103
...
...
@@ -7,7 +7,7 @@ import traceback
from
automaton
import
Automaton
from
base
import
family_name_fr
,
format_author_fr
,
MSG_CRASH
,
MSG_LOAD
from
invenio_tools
import
CheckException
from
checkandfix
import
CheckException
from
plugin_dbui
import
get_id
,
UNDEF_ID
,
UNKNOWN
...
...
modules/harvest_tools/talks.py
View file @
739e4103
...
...
@@ -7,7 +7,7 @@ import traceback
from
automaton
import
Automaton
from
base
import
family_name_fr
,
format_author_fr
,
MSG_CRASH
,
MSG_LOAD
from
invenio_tools
import
CheckException
from
checkandfix
import
CheckException
from
plugin_dbui
import
UNDEF_ID
...
...
modules/harvest_tools/thesis.py
View file @
739e4103
...
...
@@ -8,7 +8,8 @@ import traceback
from
automaton
import
Automaton
from
base
import
family_name_fr
,
format_author_fr
,
MSG_CRASH
,
MSG_LOAD
from
invenio_tools
import
CheckException
,
RecordThesis
from
checkandfix
import
CheckException
from
invenio_tools
import
RecordThesis
from
plugin_dbui
import
get_id
,
UNDEF_ID
...
...
modules/invenio_tools/__init__.py
View file @
739e4103
...
...
@@ -8,8 +8,10 @@ from base import (ARXIV,
is_conference
,
is_institute
,
is_thesis
,
MSG_NO_CONF
,
MSG_NO_COUNTRY
,
MSG_NO_PUBLISHER
,
MSG_NO_THESIS
,
MSG_WELL_FORMED_COLLABORATION
,
OAI_URL
,
REG_ARXIV_NUMBER
,
...
...
@@ -19,12 +21,11 @@ from base import (ARXIV,
THESIS_DIR
)
from
exception
import
(
CdsException
,
Check
Exception
,
Exception
UTF8
,
Marc12Exception
,
RecordException
,
XmlException
)
from
checkandfix
import
CheckAndFix
,
load_record
from
inveniostore
import
InvenioStore
from
iterrecord
import
IterRecord
from
marc12
import
Marc12
...
...
@@ -33,3 +34,22 @@ from recordconf import RecordConf
from
recordinst
import
RecordInst
from
recordpubli
import
RecordPubli
from
recordthesis
import
RecordThesis
def
load_record
(
host
,
record_id
):
"""Helper function to load a single record from an invenio store.
Args:
host (unicode): host of the store.
Possible values are ``cds.cern.ch`` or ``inspirehep.net``.
record_id (integer): the record identifier in the store
Returns:
Record: the decoded record.
"""
store
=
InvenioStore
(
host
)
xml
=
store
.
get_record
(
record_id
)
svc
=
Marc12
()
return
svc
(
xml
)[
0
]
modules/invenio_tools/exception.py
View file @
739e4103
...
...
@@ -20,10 +20,6 @@ class CdsException(ExceptionUTF8):
pass
class
CheckException
(
ExceptionUTF8
):
pass
class
Marc12Exception
(
ExceptionUTF8
):
pass
...
...
scripts/fix-conference-url.py
View file @
739e4103
# -*- coding: utf-8 -*-
""" NAME
fix-conference-url
SYNOPSIS
fix the publications field conference_url
DESCRIPTION
Check the field conference_url in the invenio store and update it.
From time to time, it has been forgotten.
OPTIONS
-h, --help
Display the help and exit.
EXAMPLE
> cd ...track_publications/scripts
> ./track_publications fix-conference-url
AUTHOR
R. Le Gac -- Dec 2014
"""
"""
if
__name__
==
"__main__"
:
import
re
import
sys
from
argparse
import
ArgumentParser
,
FileType
from
invenio_tools
import
CheckAndFix
,
CheckException
,
InvenioStore
,
Marc12
from
harvest_tools
import
CheckAndFix
,
CheckException
from
invenio_tools
import
InvenioStore
,
Marc12
REG_ORIGIN
=
re
.
compile
(
"http://([a-z\.]+)/record/(\d+)"
)
# command line options
parser
=
ArgumentParser
()
args
=
parser
.
parse_args
()
# unlock the publications update when the status is OK
db
.
publications
.
_before_update
.
remove
(
INHIBIT_PUBLICATION_UPDATE_ON_OK
)
# service
check
=
CheckAndFix
()
decode
=
Marc12
()
# scan the publications table
query
=
db
.
publications
.
origin
.
len
()
>
0
query
&=
db
.
publications
.
conference_url
.
len
()
==
0
query
&=
(
db
.
publications
.
id_categories
==
7
)
|
(
db
.
publications
.
id_categories
==
9
)
query
&=
(
db
.
publications
.
id_categories
==
7
)
|
(
db
.
publications
.
id_categories
==
9
)
for
row
in
db
(
query
).
select
():
m
=
REG_ORIGIN
.
match
(
row
.
origin
)
if
not
m
:
continue
host
,
store_id
=
m
.
groups
()
# retrieve the full record from the store
store
=
InvenioStore
(
host
)
xml
=
store
.
get_record
(
store_id
)
record
=
decode
(
xml
)[
0
]
try
:
check
.
conference
(
record
)
except
CheckException
,
e
:
pass
val
=
record
.
conference_url
()
if
val
:
print
" - %s, conference url: %s"
%
(
row
.
id
,
val
)
db
(
db
.
publications
.
id
==
row
.
id
).
update
(
conference_url
=
val
)
db
(
db
.
publications
.
id
==
row
.
id
).
update
(
conference_url
=
val
)
db
.
commit
()
# close
sys
.
exit
(
0
)
scripts/fix-page-volume.py
View file @
739e4103
# -*- coding: utf-8 -*-
""" NAME
fix-page-volume
SYNOPSIS
fix the publications page and volume.
DESCRIPTION
In September 2014, the pages and volume information have been
exchange when decoding the marc12 data (commit 8280655).
This script fix this bug.
OPTIONS
-h, --help
Display the help and exit.
EXAMPLE
> cd ...track_publications/scripts
> ./track_publications fix-page-volume
AUTHOR
R. Le Gac -- Dec 2014
"""
"""
if
__name__
==
"__main__"
:
import
re
import
sys
from
argparse
import
ArgumentParser
,
FileType
from
invenio_tools
import
CheckAndFix
,
InvenioStore
,
Marc12
from
harvest_tools
import
CheckAndFix
from
invenio_tools
import
InvenioStore
,
Marc12
REG_ORIGIN
=
re
.
compile
(
"http://([a-z\.]+)/record/(\d+)"
)
# command line options
parser
=
ArgumentParser
()
args
=
parser
.
parse_args
()
# unlock the publications update when the status is OK
db
.
publications
.
_before_update
.
remove
(
INHIBIT_PUBLICATION_UPDATE_ON_OK
)
# service
decode
=
Marc12
()
check
=
CheckAndFix
()
# the ACL
# the ACL
query
=
db
.
publications
.
id_categories
==
2
# scan the publications table
# scan the publications table
for
row
in
db
(
query
).
select
():
if
not
isinstance
(
row
.
origin
,
(
str
,
unicode
)):
print
" - Invalid origin"
,
row
.
origin
continue
m
=
REG_ORIGIN
.
match
(
row
.
origin
)
if
not
m
:
continue
host
,
store_id
=
m
.
groups
()
# retrieve the full record from the store
store
=
InvenioStore
(
host
)
xml
=
store
.
get_record
(
store_id
)
record
=
decode
(
xml
)[
0
]
# record might be deleted and replace by a new one
if
"980"
in
record
and
"c"
in
record
[
"980"
]
and
record
[
"980"
][
"c"
]
==
"DELETED"
:
if
"970"
in
record
and
"d"
in
record
[
"970"
]:
...
...
@@ -79,30 +82,30 @@ if __name__ == "__main__":
check
.
clean_erratum
(
record
)
check
.
paper_reference
(
record
)
check
.
format_editor
(
record
)
pages
=
record
.
paper_pages
()
volume
=
record
.
paper_volume
()
if
row
.
pages
==
pages
and
row
.
volume
==
volume
:
continue
t1
=
(
db
.
publishers
[
row
.
id_publishers
].
abbreviation
,
row
.
volume
,
row
.
year
,
t1
=
(
db
.
publishers
[
row
.
id_publishers
].
abbreviation
,
row
.
volume
,
row
.
year
,
row
.
pages
)
t2
=
(
record
.
paper_editor
(),
record
.
paper_volume
(),
record
.
paper_year
(),
t2
=
(
record
.
paper_editor
(),
record
.
paper_volume
(),
record
.
paper_year
(),
record
.
paper_pages
())
status
=
db
.
status
[
row
.
id_status
].
code
print
" - %s %s (%s) %s"
%
t1
,
"--> %s %s (%s) %s"
%
t2
,
status
,
row
.
origin
rep
=
raw_input
(
" - Fix it [y/N]:"
)
if
rep
.
lower
()
==
"y"
:
db
(
db
.
publications
.
id
==
row
.
id
).
update
(
pages
=
pages
,
volume
=
volume
)
db
(
db
.
publications
.
id
==
row
.
id
).
update
(
pages
=
pages
,
volume
=
volume
)
db
.
commit
()
# close
sys
.
exit
(
0
)
\ No newline at end of file
scripts/fix-publications-url.py
View file @
739e4103
# -*- coding: utf-8 -*-
""" NAME
fix-publication-url
SYNOPSIS
fix the publications field publication_url
DESCRIPTION
The field publication_url is the URL of the pdf file.
This definition has been re-enforce in track_publications 0.8.8.
The script check this field and try to fix it.
OPTIONS
-h, --help
Display the help and exit.
EXAMPLE
> cd ...track_publications/scripts
> ./track_publications fix-paper-url
AUTHOR
R. Le Gac -- Dec 2014
"""
def
get_record
(
host
,
record_id
):
"""Retrieve the record, record_id, from the store.
"""
decode
=
Marc12
()
store
=
InvenioStore
(
host
)
try
:
xml
=
store
.
get_record
(
store_id
)
record
=
decode
(
xml
)[
0
]
except
Marc12Exception
:
record
=
None
return
record
def
update
(
row
,
val
):
"""update publication_url field for the record row.
"""
"""
print
u
" - %s, %s → '%s'"
%
(
row
.
id
,
row
.
publication_url
,
val
)
db
(
db
.
publications
.
id
==
row
.
id
).
update
(
publication_url
=
val
)
if
__name__
==
"__main__"
:
import
os
import
re
import
sys
from
argparse
import
ArgumentParser
,
FileType
from
invenio_tools
import
(
CheckAndFix
,
CheckException
,
InvenioStore
,
from
harvest_tools
import
CheckAndFix
,
CheckException
from
invenio_tools
import
(
InvenioStore
,
Marc12
,
Marc12Exception
)
REG_ARXIV
=
re
.
compile
(
"http://[a-z\.]*arxiv.org/abs/(?:arXiv:)?\d+\.\d+"
)
REG_INDICO
=
re
.
compile
(
"https?://indico"
)
REG_IOP
=
re
.
compile
(
"http://iopscience.iop.org/(\d+-\d+)/(\d+)/(\d+)/([A-Z]?\d+)/?"
)
REG_ORIGIN
=
re
.
compile
(
"https?://([a-z\.]+)/record/(\d+)"
)
REG_TEL
=
re
.
compile
(
"(http://tel.archives-ouvertes.fr/tel-\d+)(?:/fr/)?"
)
# command line options
parser
=
ArgumentParser
()
args
=
parser
.
parse_args
()