Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
8280655d
Commit
8280655d
authored
Sep 11, 2014
by
LE GAC Renaud
Browse files
Add the class CheckAndFixSvc to validate record before processing.
parent
a464865d
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
661 additions
and
568 deletions
+661
-568
languages/fr-fr.py
languages/fr-fr.py
+4
-1
modules/harvest_tools.py
modules/harvest_tools.py
+56
-168
modules/invenio_tools.py
modules/invenio_tools.py
+597
-399
static/CHANGELOG
static/CHANGELOG
+4
-0
No files found.
languages/fr-fr.py
View file @
8280655d
...
...
@@ -387,12 +387,15 @@
'Registration key'
:
'Registration key'
,
'Registration successful'
:
'Registration successful'
,
'Reject'
:
'Rejeter'
,
'Reject article is not published'
:
"Rejeter l'article n'est pas publié"
,
'Reject article is not published'
:
"Rejeté l'article n'est pas publié"
,
'Reject incomplete paper reference'
:
'Rejeté la référence du papier est incomplète'
,
'Reject no author(s)'
:
"Rejeté pas d'autheur(s)"
,
'Reject no authors'
:
"Rejeté pas d'auteurs"
,
'Reject no conference information'
:
"Rejeté pas d'information sur la conférence"
,
'Reject no CPPM authors'
:
"Rejeté pas d'auteurs du CPPM"
,
'Reject no OAI identifier'
:
"Rejeté pas d'identifiant OAI"
,
'Reject no preprint number nor submission date'
:
'Rejeté pas de numéro de preprint ou de date de soumission'
,
'Reject no submission date'
:
'Rejeté pas de date de soumission'
,
'Reject not a thesis record'
:
"Rejeté cet enregistement n'est pas une thèse"
,
'Reject preprint is a conference'
:
'Rejeté ce preprint est une conférence'
,
'Reject preprint is a published paper'
:
'Rejeté ce preprint est un article publié'
,
...
...
modules/harvest_tools.py
View file @
8280655d
...
...
@@ -14,6 +14,10 @@ import re
from
gluon
import
current
from
gluon.storage
import
Storage
from
invenio_tools
import
(
OAI_URL
,
CdsSvc
,
CheckAndFixSvc
,
Marc12Svc
)
from
plugin_dbui
import
(
UNDEF_ID
,
UNKNOWN
,
get_create_id
,
...
...
@@ -24,19 +28,14 @@ DRY_RUN = current.T("dry run")
# explain message
MSG_DELETE_TALK
=
current
.
T
(
"Delete the associated talk"
,
lazy
=
False
)
MSG_FIX_ORIGIN
=
current
.
T
(
"Fixed the origin field"
,
lazy
=
False
)
MSG_FIX_PAGE
=
current
.
T
(
"Fixed the page field"
,
lazy
=
False
)
MSG_IN_DB
=
current
.
T
(
"Already in the database"
,
lazy
=
False
)
MSG_LOAD
=
current
.
T
(
"Load in the database"
,
lazy
=
False
)
MSG_MATCH
=
current
.
T
(
"Reject the talk match a proceeding"
,
lazy
=
False
)
MSG_NO_AUTHOR
=
current
.
T
(
"Reject no authors"
,
lazy
=
False
)
MSG_NO_CAT
=
current
.
T
(
'Select a "category" !!!'
,
lazy
=
False
)
MSG_NO_CPPM_AUTHOR
=
current
.
T
(
"Reject no CPPM authors"
,
lazy
=
False
)
MSG_NO_CONF_ID
=
current
.
T
(
'Missing record for conference identified by id'
,
lazy
=
False
)
MSG_NO_CONF_KEY
=
current
.
T
(
'Missing record for conference identified by key'
,
lazy
=
False
)
MSG_NO_EDITOR
=
current
.
T
(
"Reject article is not published"
,
lazy
=
False
)
MSG_NO_HARVESTER
=
current
.
T
(
'Harvester parameters not defined in the database.'
,
lazy
=
False
)
MSG_NO_LINK
=
current
.
T
(
'Record not link to a conference'
,
lazy
=
False
)
MSG_NO_OAI
=
current
.
T
(
"Reject no OAI identifier"
,
lazy
=
False
)
MSG_NO_PROJECT
=
current
.
T
(
'Select a "project" !!!'
,
lazy
=
False
)
MSG_NO_TEAM
=
current
.
T
(
'Select a "team" !!!'
,
lazy
=
False
)
MSG_NO_THESIS
=
current
.
T
(
"Reject not a thesis record"
,
lazy
=
False
)
...
...
@@ -44,7 +43,6 @@ MSG_PREPRINT_IS_PAPER = current.T("Reject preprint is a published paper", lazy=
MSG_PREPRINT_IS_CONFERENCE
=
current
.
T
(
"Reject preprint is a conference"
,
lazy
=
False
)
MSG_PREPRINT_IS_THESIS
=
current
.
T
(
"Reject preprint is a thesis"
,
lazy
=
False
)
MSG_PREPRINT_NO_NUMBER
=
current
.
T
(
"Reject no preprint number nor submission date"
,
lazy
=
False
)
MSG_SERVER_ERROR
=
current
.
T
(
"Error !!!"
,
lazy
=
False
)
MSG_TRANSFORM_PREPRINT
=
current
.
T
(
"Transform the preprint into an article"
,
lazy
=
False
)
MSG_TRANSFORM_TALK
=
current
.
T
(
"Transform the talk into a proceeding"
,
lazy
=
False
)
...
...
@@ -202,78 +200,6 @@ def format_author_fr(name):
return
r
def get_conference_record(cds, marc12, record):
    """Get the conference information associated to the talk
    or to the proceeding.

    The conference is searched by its record id first (the preferred
    method) and then, when that fails, by its conference key.

    @type cds: L{CdsSvc}
    @param cds: service to interrogate invenio store

    @type marc12: L{Marc12Svc}
    @param marc12: service to decode record encoded with the MARC format.

    @type record: L{Record}
    @param record: The record defining the talk or the proceeding.

    @rtype: L{Record} or unicode
    @return: The conference record or a string with an error message.

    """
    # message left behind by the search by id when it finds nothing
    msg = ''

    conf_id = record.reference_conference_id()
    key = record.reference_conference_key()

    # search the conference by id, the preferred method
    if conf_id:
        try:
            xml = cds.get_record(conf_id)
        except invenio_tools.CdsSvcException as error:
            return "%s. %s" % (MSG_NO_CONF_ID, error)

        for el in marc12.process(xml):
            if el.id() == conf_id:
                return el

        msg = MSG_NO_CONF_ID

    # search the conference by key if the previous method failed.
    # the method depends on the store.
    if key:
        try:
            if cds._host.startswith('inspirehep'):
                # For this store the key is searched in the field 111__g
                # with '/' replaced by '-'. The normalised key is used
                # as is: rebuilding it with '%s-%s-%s' raised a TypeError
                # whenever it did not have exactly three components.
                key = key.replace('/', '-')
                ids = cds.get_ids(cc='Conferences', p='111__g:%s' % key)
            else:
                ids = cds.get_ids(p=key)

        # the search itself can fail, report it like a failing get_record
        except invenio_tools.CdsSvcException as error:
            return "%s. %s" % (MSG_NO_CONF_KEY, error)

        for rec_id in ids or ():
            try:
                xml = cds.get_record(rec_id)
            except invenio_tools.CdsSvcException as error:
                return "%s. %s" % (MSG_NO_CONF_KEY, error)

            for el in marc12.process(xml):
                if el.conference_key() == key:
                    return el

        # nothing found by key: prepend the message of the search by id
        # only when that search was attempted (no stray leading ". ")
        if msg:
            return "%s. %s" % (msg, MSG_NO_CONF_KEY)
        return MSG_NO_CONF_KEY

    # an id was given but no conference record matches it
    if msg:
        return msg

    # no id and no key
    return MSG_NO_LINK
def
get_harvester_tool
(
controller
):
"""Get the harvester tool associated to the controller
or None if .
...
...
@@ -321,22 +247,6 @@ def get_harvester_tool(controller):
return
Tool
def is_published_paper(record):
    """C{True} when the record corresponds to a published paper.
    It should have both a review (editor) and a volume number.

    @type record: L{Record}
    @param record: the record to test; its C{paper_editor} and
    C{paper_volume} methods are called.

    @rtype: bool

    """
    # bool() keeps the strict True / False return of the original
    return bool(record.paper_editor() and record.paper_volume())
def
learn_cppm_authors
(
db
,
authors
=
None
,
id_project
=
None
,
id_team
=
None
,
...
...
@@ -432,7 +342,8 @@ class Msg(Storage):
"""
self
.
action
=
'idle'
self
.
txt
=
txt
self
.
_set_txt
(
txt
)
def
load
(
self
,
txt
):
"""Set the action as C{load} and the message as C{txt}.
...
...
@@ -442,7 +353,8 @@ class Msg(Storage):
"""
self
.
action
=
'load'
self
.
txt
=
txt
self
.
_set_txt
(
txt
)
def
modify
(
self
,
txt
):
"""Set the action as C{modify} and the message as C{txt}.
...
...
@@ -452,7 +364,8 @@ class Msg(Storage):
"""
self
.
action
=
'modify'
self
.
txt
=
txt
self
.
_set_txt
(
txt
)
def
reject
(
self
,
txt
):
"""Set the action as C{reject} set the message as C{txt}.
...
...
@@ -462,7 +375,13 @@ class Msg(Storage):
"""
self
.
action
=
'reject'
self
.
txt
=
txt
self
.
_set_txt
(
txt
)
def
_set_txt
(
self
,
value
):
if
isinstance
(
value
,
unicode
):
value
=
value
.
encode
(
"utf-8"
)
self
.
txt
=
value
class
MsgCollection
(
Storage
):
...
...
@@ -523,8 +442,8 @@ class PublicationsTool(object):
self
.
harvester
=
None
self
.
logs
=
[]
self
.
marc12
=
invenio_tools
.
Marc12
Svc
()
self
.
marc12
.
set_format_author_name
(
format_author_fr
)
self
.
check_me
=
CheckAndFix
Svc
()
self
.
marc12
=
Marc12Svc
(
)
self
.
selector
=
selector
...
...
@@ -800,7 +719,7 @@ class PublicationsTool(object):
if
self
.
dbg
:
print
"process URL search"
cds
=
invenio_tools
.
CdsSvc
(
host
=
self
.
harvester
.
host
)
cds
=
CdsSvc
(
host
=
self
.
harvester
.
host
)
# list of collections
collections
=
self
.
harvester
.
collections
...
...
@@ -830,7 +749,7 @@ class PublicationsTool(object):
try
:
ids
=
cds
.
get_ids
(
**
kwargs
)
except
invenio_tools
.
CdsSvc
Exception
as
error
:
except
Exception
as
error
:
self
.
collection_logs
[
-
1
].
url
=
cds
.
last_search_url
()
self
.
collection_logs
[
-
1
].
error
=
error
continue
...
...
@@ -851,15 +770,15 @@ class PublicationsTool(object):
try
:
xml
=
cds
.
get_record
(
id
)
self
.
process_xml
(
xml
)
except
invenio_tools
.
CdsSvc
Exception
as
error
:
url
=
invenio_tools
.
OAI_URL
%
(
self
.
harvester
.
host
,
id
)
except
Exception
as
error
:
url
=
OAI_URL
%
(
self
.
harvester
.
host
,
id
)
self
.
logs
.
append
(
Msg
(
url
=
url
))
self
.
logs
[
-
1
].
title
=
url
self
.
logs
[
-
1
].
reject
(
error
)
return
self
.
process_xml
(
xml
)
def
process_xml
(
self
,
xml
):
...
...
@@ -873,22 +792,27 @@ class PublicationsTool(object):
print
"process xml record"
li
=
self
.
marc12
.
process
(
xml
)
for
record
in
li
:
if
self
.
dbg
:
print
"record decoded"
oai_
url
=
record
.
oai_url
()
self
.
logs
.
append
(
Msg
(
url
=
oai_
url
))
url
=
OAI_URL
%
(
self
.
harvester
.
host
,
record
.
id
()
)
self
.
logs
.
append
(
Msg
(
url
=
url
))
self
.
logs
[
-
1
].
title
=
record
.
title
()
self
.
logs
[
-
1
].
collection
=
self
.
collection_logs
[
-
1
].
title
self
.
logs
[
-
1
].
year
=
record
.
year
()
if
not
oai_url
:
self
.
logs
[
-
1
].
reject
(
MSG_NO_OAI
)
self
.
check_me
(
record
,
format_author_fr
)
if
record
.
is_valid
:
self
.
logs
[
-
1
].
year
=
record
.
year
()
else
:
self
.
logs
[
-
1
].
year
=
record
.
year
()
self
.
logs
[
-
1
].
reject
(
record
.
msg
)
continue
if
not
self
.
select_record
(
record
):
continue
...
...
@@ -1192,7 +1116,7 @@ class Articles(PublicationsTool):
@rtype: bool
"""
if
not
is_published
_paper
(
record
):
if
not
record
.
is_published
(
):
self
.
logs
[
-
1
].
reject
(
MSG_NO_EDITOR
)
return
False
...
...
@@ -1343,7 +1267,7 @@ class Preprints(PublicationsTool):
"""
if
is_published
_paper
(
record
):
if
record
.
is_published
(
):
self
.
logs
[
-
1
].
reject
(
MSG_PREPRINT_IS_PAPER
)
return
False
...
...
@@ -1411,7 +1335,7 @@ class Proceedings(PublicationsTool):
db
=
self
.
db
origin_proc
=
oai_url
origin_talk
=
invenio_tools
.
OAI_URL
%
(
host
,
reference_talk
)
origin_talk
=
OAI_URL
%
(
host
,
reference_talk
)
id_proc
=
get_id
(
db
.
publications
,
origin
=
origin_proc
)
id_talk
=
get_id
(
db
.
publications
,
origin
=
origin_talk
)
...
...
@@ -1600,10 +1524,6 @@ class Proceedings(PublicationsTool):
if
isinstance
(
pages
,
list
)
and
len
(
pages
)
==
1
:
pages
=
pages
[
0
]
# try to recover missing year using the submission field
if
not
year
:
year
=
submitted
[
0
:
4
]
# check the publisher
id_publisher
=
self
.
check_publisher
(
editor
)
...
...
@@ -1625,22 +1545,10 @@ class Proceedings(PublicationsTool):
if
id
:
return
status
# get the conference record
cds
=
invenio_tools
.
CdsSvc
(
host
=
host
)
conference
=
get_conference_record
(
cds
,
self
.
marc12
,
record
)
# conference is either a Record or an error msg
if
isinstance
(
conference
,
(
str
,
unicode
)):
msg
=
conference
if
isinstance
(
msg
,
unicode
):
msg
=
msg
.
encode
(
'utf-8'
)
self
.
logs
[
-
1
].
reject
(
msg
)
return
0
# alias
conference_dates
=
conference
.
conference_dates
()
conference_title
=
conference
.
conference_title
()
country
=
conference
.
conference_country
()
# alias for the conference information
conference_dates
=
record
.
conference_dates
()
conference_title
=
record
.
conference_title
()
country
=
record
.
conference_country
()
first_author
=
record
.
first_author
()
# check conference country
...
...
@@ -1679,8 +1587,8 @@ class Proceedings(PublicationsTool):
conference_dates
=
conference_dates
,
conference_speaker
=
first_author
,
conference_title
=
conference_title
,
conference_town
=
conference
.
conference_town
(),
conference_url
=
conference
.
conference_url
(),
conference_town
=
record
.
conference_town
(),
conference_url
=
record
.
conference_url
(),
first_author
=
first_author
,
id_categories
=
self
.
harvester
.
id_categories
,
id_collaborations
=
id_collaboration
,
...
...
@@ -1716,7 +1624,7 @@ class Reports(PublicationsTool):
@type record: L{Record}
@param record:
@rtype: int
@rtype: int
@return: one when the record is inserted / updated in the database
zero otherwise.
...
...
@@ -1732,19 +1640,11 @@ class Reports(PublicationsTool):
title
=
record
.
title
()
year
=
record
.
year
()
# protection against authors not defined
if
not
authors
:
authors
=
UNKNOWN
id_status
=
get_id
(
db
.
status
,
code
=
UNKNOWN
)
# protection against cppm authors not defined
if
not
authors_cppm
:
authors_cppm
=
UNKNOWN
id_status
=
get_id
(
db
.
status
,
code
=
UNKNOWN
)
if
not
first_author
:
first_author
=
UNKNOWN
id_status
=
get_id
(
db
.
status
,
code
=
UNKNOWN
)
# check the collaboration
id_collaboration
=
self
.
check_collaboration
(
record
.
collaboration
())
...
...
@@ -1823,7 +1723,7 @@ class Talks(PublicationsTool):
db
=
self
.
db
origin_proc
=
invenio_tools
.
OAI_URL
%
(
host
,
reference_proceeding
)
origin_proc
=
OAI_URL
%
(
host
,
reference_proceeding
)
origin_talk
=
oai_url
id_proc
=
get_id
(
db
.
publications
,
origin
=
origin_proc
)
...
...
@@ -1947,22 +1847,10 @@ class Talks(PublicationsTool):
if
id
:
return
status
# get the conference record
cds
=
invenio_tools
.
CdsSvc
(
host
=
host
)
conference
=
get_conference_record
(
cds
,
self
.
marc12
,
record
)
# conference is either a Record or an error msg
if
isinstance
(
conference
,
(
str
,
unicode
)):
msg
=
conference
if
isinstance
(
msg
,
unicode
):
msg
=
msg
.
encode
(
'utf-8'
)
self
.
logs
[
-
1
].
reject
(
msg
)
return
0
# conference alias
conference_dates
=
conference
.
conference_dates
()
conference_title
=
conference
.
conference_title
()
country
=
conference
.
conference_country
()
# alias for the conference information
conference_dates
=
record
.
conference_dates
()
conference_title
=
record
.
conference_title
()
country
=
record
.
conference_country
()
first_author
=
record
.
first_author
()
submitted
=
record
.
submitted
()
title
=
record
.
title
()
...
...
@@ -1991,8 +1879,8 @@ class Talks(PublicationsTool):
conference_dates
=
conference_dates
,
conference_speaker
=
first_author
,
conference_title
=
conference_title
,
conference_town
=
conference
.
conference_town
(),
conference_url
=
conference
.
conference_url
(),
conference_town
=
record
.
conference_town
(),
conference_url
=
record
.
conference_url
(),
first_author
=
first_author
,
id_categories
=
self
.
harvester
.
id_categories
,
id_collaborations
=
id_collaboration
,
...
...
modules/invenio_tools.py
View file @
8280655d
This diff is collapsed.
Click to expand it.
static/CHANGELOG
View file @
8280655d
...
...
@@ -2,6 +2,10 @@
HEAD
- Modify the logic of the harvester by introducing the class CheckAndFixSvc.
Validation and correction of each record are performed in only one place.
This should improve code stability and maintainability.
0.8.7.2 (Sep 2014)
- Migrate to plugin_dbui 0.6.1.7.
- More robust harvester algorithms.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment