Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
df4943f4
Commit
df4943f4
authored
Sep 24, 2015
by
LE GAC Renaud
Browse files
Do not reject a record with a bad OAI, recover it.
parent
6409d1e6
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
55 additions
and
26 deletions
+55
-26
modules/harvest_tools/automaton.py
modules/harvest_tools/automaton.py
+12
-15
modules/harvest_tools/base.py
modules/harvest_tools/base.py
+27
-0
modules/invenio_tools/checkandfix.py
modules/invenio_tools/checkandfix.py
+16
-11
No files found.
modules/harvest_tools/automaton.py
View file @
df4943f4
...
...
@@ -6,7 +6,7 @@ import re
import
traceback
from
base
import
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
ToolException
from
base
import
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
recover_oai
,
ToolException
from
gluon.storage
import
Storage
from
invenio_tools
import
(
CheckAndFix
,
InvenioStore
,
...
...
@@ -392,17 +392,17 @@ class Automaton(object):
"""Decode the xml and load it in the database.
@raise Exception: the type of exception depends on what happens:
- L{ToolException} when projet, team or category identifier
- L{ToolException} when proje
c
t, team or category identifier
are not defined.
- C{StoreException} when something
s
goes wrong interrogating the
store.
- C{Marc12Exception} when something
s
goes wrong decoding the XML
- C{StoreException} when something goes wrong interrogating the
store.
- C{Marc12Exception} when something goes wrong decoding the XML
string returned by the store
- C{CheckException} if the L{Record} is not valid
- C{Exception} if the python code crash
@type xml: unicode
@keyword xml: marc12
xml
encoding of the publication record
@keyword xml: marc12
XML
encoding of the publication record
"""
if
self
.
dbg
:
...
...
@@ -417,7 +417,7 @@ class Automaton(object):
self
.
decode_xml
(
xml
)
def
process_url
(
self
,
host
,
collections
):
"""Retrieve the
xml
from the invenio store and load it in the database
"""Retrieve the
XML
from the invenio store and load it in the database
@raise Exception: depending on what happens, can be StoreException,
Marc12Exception, ...
...
...
@@ -528,17 +528,14 @@ class Automaton(object):
record_id
=
record
.
id
(),
title
=
record
.
title
()))
#
reject record with undefined OAI field
#
the OAI is not defined -- recover it
oai
=
record
.
oai
()
if
oai
is
None
:
self
.
logs
[
-
1
].
reject
(
MSG_NO_OAI
,
record
.
year
())
continue
recover_oai
(
record
,
self
.
harvester
.
host
)
# reject the record when the OAI is not well
match
=
REG_OAI
.
match
(
oai
)
if
not
match
:
self
.
logs
[
-
1
].
reject
(
MSG_WELL_FORM_OAI
,
record
.
year
())
continue
# The OAI is defined but not well formed -- recover it.
# A missing OAI (None) is handled by the preceding "oai is None" branch;
# it must not reach REG_OAI.match(), which raises a TypeError on None.
if oai is not None and not REG_OAI.match(oai):
    recover_oai(record, self.harvester.host)
# check that the record is well formed
# repair non-conformity as far as possible
...
...
modules/harvest_tools/base.py
View file @
df4943f4
...
...
@@ -14,6 +14,10 @@ MSG_FIX_ORIGIN = "Fixed the origin field"
MSG_IN_DB
=
"Already in the database"
MSG_LOAD
=
"Load in the database"
MSG_INVALID_HOST
=
"Invalid host"
OAI_INVENIO
=
"oai:%s:%s"
def
family_name_fr
(
full_name
):
"""Extract the family name when the full name is encoded as C{J. Doe}.
...
...
@@ -141,5 +145,28 @@ def learn_my_authors(db,
db
.
my_authors
[
row
.
id
]
=
dict
(
authors
=
', '
.
join
(
database_authors
))
def recover_oai(record, host):
    """Rebuild the OAI identifier of a record when it is missing or not
    well formed.

    The identifier is built from the C{OAI_INVENIO} template using the host
    and the value returned by C{record.id()}. It is stored in the record
    under a field / subfield pair which depends on the host.

    @type record: Record
    @param record: the record to repair; it is modified in place.

    @type host: str
    @param host: either C{cds.cern.ch} or C{inspirehep.net}.

    @raise ValueError: when the host is not one of the two above.

    """
    # (host, field, subfield) where the OAI is stored for each known store
    known_hosts = (("cds.cern.ch", u"0248", "a"),
                   ("inspirehep.net", u"909CO", "o"))

    for name, field, subfield in known_hosts:
        if host == name:
            break
    else:
        # no known store matches the host
        raise ValueError(MSG_INVALID_HOST)

    # create the field on the fly when the record does not have it
    if field not in record:
        record[field] = dict()

    record[field][subfield] = OAI_INVENIO % (host, record.id())
class ToolException(Exception):
    """Exception specific to the harvest tools.

    It adds nothing to C{Exception}; it only provides a dedicated type
    which callers can catch.

    """
modules/invenio_tools/checkandfix.py
View file @
df4943f4
...
...
@@ -21,18 +21,19 @@ from recordconf import RecordConf
from
recordthesis
import
RecordThesis
DECODE_ARXIV
=
re
.
compile
(
"arXiv:(\d{2})(\d{2})\."
)
DECODE_ARXIV
=
re
.
compile
(
r
"arXiv:(\d{2})(\d{2})\."
)
# Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_DD_MMM_YYYY
=
re
.
compile
(
r
"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})"
)
DECODE_DD_MM_YYYY
=
re
.
compile
(
r
"(\d{1,2}) +(\d{1,2}) +(\d{4})"
)
DECODE_YYYY
=
re
.
compile
(
"^(\d{4})$"
)
DECODE_YYYY
=
re
.
compile
(
r
"^(\d{4})$"
)
# Decode publication reference:
# Phys. Rev. Lett. 113, 032001 (2014)
# Eur. Phys. J. C (2014) 74:2883
DECODE_REF
=
[
re
.
compile
(
"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
),
re
.
compile
(
"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
)]
_ref1
=
r
"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
_ref2
=
r
"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
DECODE_REF
=
[
re
.
compile
(
_ref1
),
re
.
compile
(
_ref2
)]
MONTHS
=
{
u
'Jan'
:
'01'
,
u
'Feb'
:
'02'
,
...
...
@@ -70,11 +71,12 @@ MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
MSG_WELL_FORMED_EDITOR
=
"Reject editor is not well formed"
OAI_INVENIO
=
"oai:%s:%s"
REG_COLLABORATION
=
re
.
compile
(
regex
.
REG_COLLABORATION
)
REG_CONF_DATES_1
=
re
.
compile
(
"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})"
)
REG_CONF_DATES_2
=
re
.
compile
(
"(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})"
)
REG_CONF_DATES_2
=
\
re
.
compile
(
"(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})"
)
REG_CONF_DATES
=
re
.
compile
(
regex
.
REG_CONF_DATES
)
REG_SUBMITTED
=
re
.
compile
(
regex
.
REG_SUBMITTED
)
...
...
@@ -483,7 +485,7 @@ class CheckAndFix(object):
- INVENIO: Phys. Lett. B + volume 673
- INSPIREHEP: Phys.Lett + volume B673
Standardi
s
e the answer as Phys Lett B
Standardi
z
e the answer as Phys Lett B
@note: It is recommended to call this method when erratum are removed.
...
...
@@ -595,7 +597,8 @@ class CheckAndFix(object):
def
get_my_authors
(
self
,
record
,
cmpFct
=
None
):
"""Get authors of my institutes signing the record.
The information is append to the L{Record} via the attribute C{my_authors}.
The information is appended to the L{Record} via the attribute
C{my_authors}.
@type record: L{Record}
@param record:
...
...
@@ -764,13 +767,15 @@ class CheckAndFix(object):
# 22 Mar 2011
m
=
DECODE_DD_MMM_YYYY
.
match
(
dates
[
i
])
if
m
:
dates
[
i
]
=
'%s-%s-%02i'
%
(
m
.
group
(
3
),
MONTHS
[
m
.
group
(
2
)],
int
(
m
.
group
(
1
)))
data
=
(
m
.
group
(
3
),
MONTHS
[
m
.
group
(
2
)],
int
(
m
.
group
(
1
)))
dates
[
i
]
=
'%s-%s-%02i'
%
data
continue
# 22 03 2011
m
=
DECODE_DD_MM_YYYY
.
match
(
dates
[
i
])
if
m
:
dates
[
i
]
=
'%s-%02i-%02i'
%
(
m
.
group
(
3
),
int
(
m
.
group
(
2
)),
int
(
m
.
group
(
1
)))
# Bind the (year, month, day) tuple before formatting it. Without the
# assignment the expression would call ``data`` instead of defining it.
data = (m.group(3), int(m.group(2)), int(m.group(1)))
dates[i] = '%s-%02i-%02i' % data
continue
# 2011
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment