Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
6ae3ac92
Commit
6ae3ac92
authored
Jun 29, 2017
by
LE GAC Renaud
Browse files
Clean module invenio_tools.
parent
656ecd86
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
1 addition
and
590 deletions
+1
-590
modules/harvest_tools/automaton.py
modules/harvest_tools/automaton.py
+0
-1
modules/invenio_tools/__init__.py
modules/invenio_tools/__init__.py
+1
-1
modules/invenio_tools/iterrecord.py
modules/invenio_tools/iterrecord.py
+0
-294
modules/invenio_tools/marc12.py
modules/invenio_tools/marc12.py
+0
-286
modules/invenio_tools/recordinst.py
modules/invenio_tools/recordinst.py
+0
-1
modules/invenio_tools/recordpubli.py
modules/invenio_tools/recordpubli.py
+0
-7
No files found.
modules/harvest_tools/automaton.py
View file @
6ae3ac92
...
...
@@ -13,7 +13,6 @@ from .checkandfix import CheckAndFix
from
gluon.storage
import
Storage
from
invenio_tools
import
(
CdsException
,
InvenioStore
,
Marc12
,
OAI_URL
)
from
invenio_tools.factory
import
build_record
from
.msg
import
Msg
...
...
modules/invenio_tools/__init__.py
View file @
6ae3ac92
...
...
@@ -33,7 +33,7 @@ from .marc12 import Marc12
from
.record
import
Record
from
.recordconf
import
RecordConf
from
.recordinst
import
RecordInst
from
.recordpubli
import
DECODE_REF
,
RecordPubli
from
.recordpubli
import
RecordPubli
from
.recordthesis
import
RecordThesis
...
...
modules/invenio_tools/iterrecord.py
deleted
100644 → 0
View file @
656ecd86
""" invenio_tools.iterrecord
"""
import
re
from
.exception
import
Marc12Exception
from
.record
import
Record
from
xml.dom.minidom
import
parseString
# Message carried by the exception raised when the input string does not
# start with the XML declaration.
MSG_WELL_FORMED_XML = "Reject XML is not well formed"

# Matches a string made only of digits. Used to keep the datafield tags
# which are integers and to skip the others (e.g. "FFT").
# Raw string: "\d" in a plain literal is an invalid escape sequence.
REG_INT = re.compile(r"^\d+$")
class IterRecord(object):
    """Iterator to decode the XML string and to iterate on Record.

    The XML string is encoded using the
    `MARC <http://www.loc.gov/marc>`_ format.
    The XML string has the following structure::

        <?xml version="1.0" encoding="UTF-8"?>
        <collection xmlns="http://www.loc.gov/MARC21/slim">
            <record>
                <controlfield tag="001">1540265</controlfield>
                <controlfield tag="005">20130410235250.0</controlfield>
                <datafield tag="024" ind1="8" ind2=" ">
                    <subfield code="a">oai:cds.cern.ch:1540265</subfield>
                    <subfield code="p">cerncds:FULLTEXT</subfield>
                    <subfield code="p">cerncds:CERN:FULLTEXT</subfield>
                    <subfield code="p">cerncds:CERN</subfield>
                </datafield>
                ...
            </record>
            <record>
                ...
            </record>
        </collection>

    The iterator finds each record block and decodes it.

    """
    def __init__(self, xml):
        """
        Args:
            xml (str): the XML string. It has to start with ``<?xml``.

        Raises:
            Marc12Exception: the string does not start with the XML
                declaration.

        Note:
            Exceptions raised by ``xml.dom.minidom.parseString`` are
            not caught here.

        """
        if self._is_not_xml(xml):
            raise Marc12Exception(MSG_WELL_FORMED_XML)

        dom = parseString(xml)
        root = dom.documentElement
        nodes = root.getElementsByTagName("record")

        self.i = 0
        self.length = len(nodes)
        self.nodes = nodes

    def _clean_record(self, record):
        """Internal tool to clean the record, in place.

        Concatenate the following list of dictionaries::

            record[field] = [
                dict(subfield1=val1),
                dict(subfield2=val2),
                dict(subfield3=val3),...
            ]

            record[field] = [
                dict(subfield1=val1),
                dict(subfield2=val2,
                     subfield3=val3),...
            ]

        into a single one::

            record[field] = dict1(subfield1=val1,
                                  subfield2=val2,
                                  subfield3=val3)

        The list is kept unchanged whenever the merge would overwrite
        a value, namely when the same key appears in two dictionaries,
        or when several dictionaries share the largest number of keys.

        Args:
            record (Record): dictionary-like object modified in place.

        """
        for field in record:
            dicts = record[field]
            if not isinstance(dicts, list) or not dicts:
                continue

            nkeys = [len(di) for di in dicts]
            biggest = max(nkeys)

            # several dictionaries with more than one key
            # don't know how to treat that case
            if biggest > 1 and nkeys.count(biggest) > 1:
                continue

            # merge single entity dicts in one big dict
            # works when all the keys are different
            # otherwise don't know what to do
            if biggest == 1:
                keys = []
                for di in dicts:
                    keys.extend(di.keys())

                # in a set duplicate entries are removed
                # the next statement is true when all keys are different
                if len(keys) == len(set(keys)):
                    merged = dicts[0]
                    for other in dicts[1:]:
                        merged.update(other)
                    record[field] = merged

                continue

            # merge the small dicts into the existing big one
            # works when no key is seen twice, neither in the big one
            # nor among the small ones. Otherwise don't know what to do
            #
            # Example 1: the following list is kept unchanged
            #    [{"a": u"LHCB-PAPER-2014-047"},
            #     {"a": u"CERN-PH-EP-2014-221"},
            #     {"9": u"arXiv", "a": u"arXiv:1410.0149", "c": u"hep-ex"}]
            #
            index = nkeys.index(biggest)
            merged = dicts[index]
            others = [di for i, di in enumerate(dicts) if i != index]

            # check that every key is unique
            seen = set(merged)
            collision = False
            for other in others:
                for k in other:
                    if k in seen:
                        collision = True
                        break
                    seen.add(k)
                if collision:
                    break

            if collision:
                continue

            # copy keys
            for other in others:
                merged.update(other)

            record[field] = merged

    def _decode_record(self, node):
        """Transform the XML node *<record>* into a Record.

        Args:
            node: the DOM element *<record>*. It has the following
                structure::

                    <record>
                        <controlfield tag="001">1540265</controlfield>
                        <controlfield tag="005">20130410235250.0</controlfield>
                        <datafield tag="024" ind1="8" ind2=" ">
                            <subfield code="a">oai:cds.cern.ch:1540265</subfield>
                            <subfield code="p">cerncds:FULLTEXT</subfield>
                            <subfield code="p">cerncds:CERN:FULLTEXT</subfield>
                            <subfield code="p">cerncds:CERN</subfield>
                        </datafield>
                        ...
                    </record>

        Returns:
            Record: the keys are the *controlfield tag* and, for the
                datafields, the concatenation of the *tag*, *ind1* and
                *ind2* attributes (spaces removed). A key found several
                times is associated to a list of dictionaries.

        """
        record = Record()

        # controlfield
        for controlfield in node.getElementsByTagName("controlfield"):
            key = controlfield.getAttribute("tag")
            children = controlfield.childNodes
            # an empty element has no child, store an empty string
            # as it is done for the subfields
            record[key] = children[0].nodeValue if children else ""

        # datafield
        for datafield in node.getElementsByTagName("datafield"):
            tag = datafield.getAttribute("tag")

            # In almost all case the tag is an integer
            # but from time to time it is equal to "FFT" (inspirehep) !!
            if not REG_INT.match(tag):
                continue

            ind1 = datafield.getAttribute("ind1").replace(" ", "")
            ind2 = datafield.getAttribute("ind2").replace(" ", "")

            # build the key by concatenating all attributes
            key = "%s%s%s" % (tag, ind1, ind2)
            di = self._decode_datafield(datafield)

            # one occurrence of the key
            if key not in record:
                record[key] = di

            # several occurrences of the key - use a list of dictionaries
            elif isinstance(record[key], list):
                record[key].append(di)

            else:
                record[key] = [record[key], di]

        return record

    def _decode_datafield(self, node):
        """Transform the XML node *<datafield>* into a dictionary.

        Args:
            node: the DOM element *<datafield>*. It has the following
                structure::

                    <datafield tag="024" ind1="8" ind2=" ">
                        <subfield code="a">oai:cds.cern.ch:1540265</subfield>
                        <subfield code="p">cerncds:FULLTEXT</subfield>
                        <subfield code="p">cerncds:CERN:FULLTEXT</subfield>
                        <subfield code="p">cerncds:CERN</subfield>
                    </datafield>

        Returns:
            dict: the keys correspond to the *subfield code* while the
                values are a string or a list of strings. An empty
                subfield gives an empty string.

        """
        di = {}

        for subfield in node.getElementsByTagName("subfield"):
            code = str(subfield.getAttribute("code"))

            children = subfield.childNodes
            value = children[0].nodeValue if children else ""

            if code not in di:
                di[code] = value

            elif isinstance(di[code], list):
                di[code].append(value)

            else:
                di[code] = [di[code], value]

        return di

    def _is_not_xml(self, xml):
        """``True`` when the ``xml`` string does not start with ``<?xml``.

        Args:
            xml (str):

        Returns:
            bool:

        """
        return not xml.startswith("<?xml")

    def __iter__(self):
        return self

    def __next__(self):
        """
        Returns:
            Record: the next decoded record.

        Raises:
            StopIteration: when there is no more record.

        """
        if self.i >= self.length:
            raise StopIteration()

        node = self.nodes.item(self.i)
        record = self._decode_record(node)
        self._clean_record(record)

        # move on only when the record has been decoded
        self.i += 1
        return record
modules/invenio_tools/marc12.py
deleted
100644 → 0
View file @
656ecd86
""" invenio_tools.marc12
"""
import
re
from
.base
import
(
is_conference
,
is_institute
,
is_thesis
,
MSG_NO_CONF
,
MSG_NO_HOST
,
REG_OAI
)
from
.exception
import
Marc12Exception
from
.inveniostore
import
InvenioStore
from
.iterrecord
import
IterRecord
from
.recordconf
import
RecordConf
from
.recordinst
import
RecordInst
from
.recordpubli
import
RecordPubli
from
.recordthesis
import
RecordThesis
# Message carried by the exception raised when a deleted record
# cannot be replaced by a new one.
MSG_DECODING_FAILED = "Record decoding failed."

# Matches a conference key such as "C14-07-02" or "C14-07-02.1".
# Raw string: "\d" and "\." in a plain literal are invalid escape sequences.
REG_CONF = re.compile(r"^C\d+-\d+-\d+(\.\d+)?$")
class Marc12(object):
    """Service to decode MARC12 records embedded in the XML string.

    The main methods are :meth:`.records` which returns a list
    of :class:`.Record` object and :meth:`.iterrecords`.
    Each record behaves like a dictionary::

        record[field][subfield] = value(s)

    where the ``field`` correspond to the *datafield tag* and the
    ``subfield`` to the *subfield code*.

    Note:
        The record is upcasted to :class:`.RecordInst` when it describes
        an institute, :class:`.RecordThesis` for a thesis,
        :class:`.RecordConf` for a conference talk or proceeding
        and :class:`.RecordPubli` in all other cases.

    Note:
        The conference information are added for a talk or a proceeding.

    """
    def _add_conference_data(self, record):
        """Add the conference data to the record.

        The fields ``111`` and ``8564`` of the conference record are
        copied in the record. Nothing is copied when the conference
        record has no field ``111``.

        Args:
            record (Record): record describing a conference.

        Raises:
            Marc12Exception: when the host or the conference key is not
                defined or when the conference is not found.

        """
        # reference to host
        host = record.host()
        if host in ("", None):
            raise Marc12Exception(MSG_NO_HOST)

        # for talk or proceeding a key is always defined
        key = record.reference_conference_key()
        if not key:
            raise Marc12Exception(MSG_NO_CONF)

        # get conference information
        id_conf = record.reference_conference_id()
        conference = self._get_conference(host, id_conf, key)

        # protection id can be a reference to other object like book
        if "111" not in conference:
            return

        # copy conference information in the current record
        # the conference URL is in 8564u
        record["111"] = conference["111"]
        if "8564" in conference:
            record["8564"] = conference["8564"]

    def __call__(self, xml, **kwargs):
        """
        Note:
            * Allow the syntax ``Marc12()(xml)``.
            * Keep for backward compatibility.
            * Prefer the method :meth:`.records`.

        Args:
            xml (str): the XML string with the publication contents.

        Keyword Args:
            filter_func (reference): a function to eliminate records
                which don't satisfy functions criteria. The argument of the
                function is a Record while the return value is a boolean.

            func (reference): a function applied to each surviving record.
                The argument of the function is a Record.
                It can be used to polish the record content.

        Returns:
            list: list of :class:`.Record`.

        """
        return self.records(xml, **kwargs)

    def _get_conference(self, host, conf_id, key):
        """Get the conference data associated to the record.
        The conference is identified by its id or key.

        Args:
            host (str): possible values are ``cds.cern.ch`` or
                ``inspirehep.net``.
            conf_id (str): the conference identifier in the store.
            key (str): the conference key in the store.

        Returns:
            Record: The conference record

        Raises:
            Marc12Exception: when the conference is not found.

        """
        cds = InvenioStore(host)

        # search the conference by id the preferred method
        if conf_id:
            xml = cds.get_record(conf_id)
            for conference in IterRecord(xml):
                if conference.id() == conf_id:
                    return conference

        # search the conference by key if the previous method failed.
        # the method depends on the store.
        if key:
            ids = []

            if cds._host.startswith("inspirehep"):
                key = key.replace("/", "-")
                if REG_CONF.match(key):
                    ids = cds.get_ids(cc="Conferences", p="111__g:%s" % key)

            else:
                ids = cds.get_ids(p=key)

            for candidate_id in ids:
                xml = cds.get_record(candidate_id)
                for conference in IterRecord(xml):
                    if conference._get("111", "g") == key:
                        return conference

        raise Marc12Exception(MSG_NO_CONF)

    def _recover_deleted_record(self, record):
        """Recover a deleted record.

        From time to time a record is deleted and replace by a new one.
        In that case the record looks like::

            {
                "0248_": {"a": "oai:cds.cern.ch:1366561"},
                "001": "1366561",
                "980": {"c": "DELETED"},
                "970": {"d": "1366710"}
            }

        The method replace the old record by the new one, by using the
        OAI identifier. The store is extracted from the field 0248_
        and the identifier of the new record from the field 970.

        Note:
            It might happen that a record is deleted and not replaced by
            a new one. In that case the Marc12Exception is raised.

        Args:
            record (Record): the record to be checked

        Returns:
            Record: the input record or the new one.

        Raises:
            Marc12Exception: when the record is deleted and
                not replaced by a new one.

        """
        is_deleted = (
            "980" in record
            and "c" in record["980"]
            and record["980"]["c"] == "DELETED"
        )

        if not is_deleted:
            return record

        is_replaced = (
            "970" in record
            and "d" in record["970"]
            and "0248_" in record
            and "a" in record["0248_"]
        )

        if not is_replaced:
            raise Marc12Exception(MSG_DECODING_FAILED)

        match = REG_OAI.match(record["0248_"]["a"])
        if match:
            cds = InvenioStore(match.group(1))
            xml = cds.get_record(record["970"]["d"])

            # the default avoids leaking a bare StopIteration
            # when the store returns no record
            new_record = next(IterRecord(xml), None)
            if new_record is not None:
                return new_record

        raise Marc12Exception(MSG_DECODING_FAILED)

    def iterrecords(self, xml):
        """Return an iterator on the embedded records.

        Args:
            xml (str): the XML string with the publication contents.

        Return:
            IterRecord:

        """
        return IterRecord(xml)

    def records(self, xml, filter_func=None, func=None):
        """Transform the XML string into a list of Record.

        Args:
            xml (str): the XML string with the publication contents.
                It has the following structure:

                .. code-block:: xml

                    <?xml version="1.0" encoding="UTF-8"?>
                    <collection xmlns="http://www.loc.gov/MARC21/slim">
                        <record>
                            <controlfield tag="001">1540265</controlfield>
                            <controlfield tag="005">20130410235250.0</controlfield>
                            <datafield tag="024" ind1="8" ind2=" ">
                                <subfield code="a">oai:cds.cern.ch:1540265</subfield>
                                <subfield code="p">cerncds:FULLTEXT</subfield>
                                <subfield code="p">cerncds:CERN:FULLTEXT</subfield>
                                <subfield code="p">cerncds:CERN</subfield>
                            </datafield>
                            ...
                        </record>
                        <record>
                            ...
                        </record>
                    </collection>

            filter_func (reference): a function to eliminate records
                which don't satisfy functions criteria. The argument of the
                function is a Record while the return value is a boolean.

            func (reference): a function applied to each surviving record.
                The argument of the function is a Record.
                It can be used to polish the record content.

        Returns:
            list: list of :class:`.Record`.

        Raises:
            Marc12Exception: not well formed XML, deleted record which
                is not replaced or conference data which are not found.

        """
        li = []

        for record in IterRecord(xml):
            record = self._recover_deleted_record(record)

            if is_conference(record):
                upcast_record = RecordConf(record)
                self._add_conference_data(upcast_record)

            elif is_institute(record):
                upcast_record = RecordInst(record)

            elif is_thesis(record):
                upcast_record = RecordThesis(record)

            else:
                upcast_record = RecordPubli(record)

            if filter_func and not filter_func(upcast_record):
                continue

            if func:
                func(upcast_record)

            li.append(upcast_record)

        return li
modules/invenio_tools/recordinst.py
View file @
6ae3ac92
...
...
@@ -12,7 +12,6 @@ from record import Record
>>>>>>>
Migrate
RecordInst
.
MSG_INVALID_ARG
=
"Invalid argument record"
MSG_INVALID_HOST
=
"Invalid record host"
MSG_INVALID_RECORD
=
"Invalid record, it is not describing an institute"
...
...
modules/invenio_tools/recordpubli.py
View file @
6ae3ac92
...
...
@@ -25,13 +25,6 @@ AUTHOR_FORMATS = [
"Last, First"
,
"Last F."
]
# Regular expressions decoding a publication reference.
# Two layouts are recognised, one pattern for each of them:
#   Phys. Rev. Lett. 113, 032001 (2014)
#   Eur. Phys. J. C (2014) 74:2883
# The named groups are p, v, c and y
# (presumably periodical, volume, pages and year -- to be confirmed).
_ref1 = r"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
_ref2 = r"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
DECODE_REF = [re.compile(pattern) for pattern in (_ref1, _ref2)]
MSG_INVALID_FMT
=
"Invalid format for author"
# the keys containing paper reference
...
...
Write
Preview