Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
e92f798b
Commit
e92f798b
authored
Sep 19, 2015
by
LE GAC Renaud
Browse files
Conference information are added by the Marc12 service.
parent
420d003a
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
545 additions
and
513 deletions
+545
-513
modules/invenio_tools/base.py
modules/invenio_tools/base.py
+15
-1
modules/invenio_tools/checkandfix.py
modules/invenio_tools/checkandfix.py
+2
-76
modules/invenio_tools/marc12.py
modules/invenio_tools/marc12.py
+71
-211
modules/regex.py
modules/regex.py
+8
-8
tests/harvester/CheckAndFix/test_acti_cds1411352_fix.py
tests/harvester/CheckAndFix/test_acti_cds1411352_fix.py
+6
-102
tests/harvester/CheckAndFix/test_acti_ins1276938_fix.py
tests/harvester/CheckAndFix/test_acti_ins1276938_fix.py
+2
-115
tests/harvester/Record/test_acti_cds1411352.py
tests/harvester/Record/test_acti_cds1411352.py
+145
-0
tests/harvester/Record/test_acti_ins1276938.py
tests/harvester/Record/test_acti_ins1276938.py
+150
-0
tests/harvester/Record/test_com_cds1550918.py
tests/harvester/Record/test_com_cds1550918.py
+146
-0
No files found.
modules/invenio_tools/base.py
View file @
e92f798b
...
...
@@ -10,8 +10,22 @@ REG_OAI = re.compile('oai:([a-z\.]+):([\d]+)')
REG_YEAR
=
re
.
compile
(
"(\d{4})"
)
def
is_conference
(
record
):
"""True when the record describes a publication related to a conference.
Args:
record (Record):
Return:
bool: true when the MARC record describes a publication related
to a conference.
"""
return
u
"111"
in
record
or
record
.
reference_conference_key
()
def
is_institute
(
record
):
""" True when the record describe an institute.
""" True when the record describe
s
an institute.
Args:
record (Record):
...
...
modules/invenio_tools/checkandfix.py
View file @
e92f798b
...
...
@@ -5,7 +5,7 @@
import
re
import
regex
from
base
import
OAI_URL
,
REG_OAI
,
REG_YEAR
from
base
import
is_conference
,
OAI_URL
,
REG_OAI
,
REG_YEAR
from
exception
import
CheckException
from
filters
import
CLEAN_REVIEW
from
gluon
import
current
...
...
@@ -44,7 +44,6 @@ MONTHS = {u'Jan':'01',
u
'Dec'
:
'12'
}
MSG_NO_AUTHOR
=
"Reject no author(s)"
MSG_NO_CONF
=
"Reject no conference information"
MSG_NO_COUNTRY
=
"Reject invalid country"
MSG_NO_DATE
=
"Reject no submission date"
MSG_NO_MY_AUTHOR
=
"Reject no authors of my institute"
...
...
@@ -68,7 +67,6 @@ MSG_WELL_FORMED_OAI = "Reject OAI is not well formed"
OAI_INVENIO
=
"oai:%s:%s"
REG_COLLABORATION
=
re
.
compile
(
regex
.
REG_COLLABORATION
)
REG_CONF
=
re
.
compile
(
"^C\d+-\d+-\d+(\.\d+)?$"
)
REG_CONF_DATES_1
=
re
.
compile
(
"(\d+) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})"
)
REG_CONF_DATES_2
=
re
.
compile
(
"(\d+) *([A-Z][a-z]{2}) *-? *(\d+) *([A-Z][a-z]{2}) *(\d{4})"
)
REG_CONF_DATES
=
re
.
compile
(
regex
.
REG_CONF_DATES
)
...
...
@@ -106,56 +104,6 @@ class CheckAndFix(object):
Most of the method raise the CheckException when something went wrong.
"""
def
_get_conference
(
self
,
host
,
id
,
key
):
"""Get the conference data associated to the record.
The conference is identified by its id or key.
@type host: unicode
@param host:
@type id: unicode
@param id:
@type key: unicode
@param key:
@rtype: L{Record}
@return: The conference record
@raise CheckException:
"""
cds
=
InvenioStore
(
host
)
marc12
=
Marc12
()
# search the conference by id the preferred method
if
id
:
xml
=
cds
.
get_record
(
id
)
for
conference
in
marc12
(
xml
):
if
conference
.
id
()
==
id
:
return
conference
# search the conference by key if the previous method failed.
# the method depends on the store.
if
key
:
ids
=
[]
if
cds
.
_host
.
startswith
(
'inspirehep'
):
key
=
key
.
replace
(
'/'
,
'-'
)
if
REG_CONF
.
match
(
key
):
ids
=
cds
.
get_ids
(
cc
=
'Conferences'
,
p
=
'111__g:%s'
%
key
)
else
:
ids
=
cds
.
get_ids
(
p
=
key
)
for
id
in
ids
:
xml
=
cds
.
get_record
(
id
)
for
conference
in
marc12
(
xml
):
if
conference
.
conference_key
()
==
key
:
return
conference
raise
CheckException
(
MSG_NO_CONF
)
def
_recover_submitted
(
self
,
record
):
"""Recover submitted date using conference, preprint or thesis
information.
...
...
@@ -350,31 +298,9 @@ class CheckAndFix(object):
"""
# conference information are available, i.e proceeding
if
"111"
in
record
:
return
# alias
host
=
record
.
host
()
key
=
record
.
reference_conference_key
()
# for talk or proceeding a key is always defined
if
not
key
:
raise
CheckException
(
MSG_NO_CONF
)
# get conference information
id
=
record
.
reference_conference_id
()
conference
=
self
.
_get_conference
(
host
,
id
,
key
)
# protection id can be a reference to other object like book
if
"111"
not
in
conference
:
if
not
is_conference
(
record
):
return
# copy conference information in the current record
# the conference URL is in 8564u
record
[
u
"111"
]
=
conference
[
"111"
]
if
"8564"
in
conference
:
record
[
u
"8564"
]
=
conference
[
"8564"
]
# check country information (all valid countries have been enter once)
db
=
current
.
globalenv
[
'db'
]
id
=
get_id
(
db
.
countries
,
country
=
record
.
conference_country
())
...
...
modules/invenio_tools/marc12.py
View file @
e92f798b
...
...
@@ -5,243 +5,108 @@
import
re
from
base
import
is_institute
from
base
import
is_conference
,
is_institute
from
exception
import
Marc12Exception
from
institute
import
Institute
from
rec
or
d
import
Rec
or
d
from
xml.dom.minidom
import
parseString
from
inveniost
or
e
import
InvenioSt
or
e
from
iterrecord
import
IterRecord
MSG_WELL_FORMED_XML
=
"Reject XML is not well formed"
REG_INT
=
re
.
compile
(
"^\d+$"
)
MSG_NO_CONF
=
"Reject no conference information"
REG_CONF
=
re
.
compile
(
"^C\d+-\d+-\d+(\.\d+)?$"
)
class
Marc12
(
object
):
"""Decode the XML string encoded with the
U{MARC<http://www.loc.gov/marc>} format.
The main method L{__call__} analyses the XML string
which has the follwing structure::
<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
<record>
<controlfield tag="001">1540265</controlfield>
<controlfield tag="005">20130410235250.0</controlfield>
<datafield tag="024" ind1="8" ind2=" ">
<subfield code="a">oai:cds.cern.ch:1540265</subfield>
<subfield code="p">cerncds:FULLTEXT</subfield>
<subfield code="p">cerncds:CERN:FULLTEXT</subfield>
<subfield code="p">cerncds:CERN</subfield>
</datafield>
...
</record>
<record>
...
</record>
</collection>
It returns a L{Record} object which behave like a dictionary::
"""Decode the MARC12 records embedded in the XML string.
The main method L{__call__} returns a list of L{Record} object
which behave like a dictionary::
record[field][subfield] = value(s)
where the C{field} correspond to the I{datafield tag} and the
C{subfield} to the I{subfield code}.
"""
def
_clean_record
(
self
,
record
):
"""Internal tool to clean the record.
concatenate the following dictionary::
When the record describes an institute, it is upcasted to C{Institute}
record[field] = [dict(subfield1=val1), dict(subfield2=val2), dict(subfield3=val3),...]
record[field] = [dict(subfield1=val1), dict(subfield2=val2, subfield3=val3),...]
The conference information are added for a talk or a proceeding.
into a single one::
record[field] = dict1(subfield1=val1, subfield2=val2, subfield3=val3)
"""
def
_add_conference_data
(
self
,
record
):
"""Add the conference data to the record.
@type record: Record
@param record:
"""
for
field
in
record
:
if
not
isinstance
(
record
[
field
],
list
):
continue
nkeys
=
[
len
(
di
)
for
di
in
record
[
field
]]
# several dictionary with more than one nkeys
# don't know how to treat that case
if
max
(
nkeys
)
>
1
and
nkeys
.
count
(
max
(
nkeys
))
>
1
:
continue
# merge single entity dict in one big dict
# works when all the nkeys are different
# otherwise don't know what to do
if
max
(
nkeys
)
==
1
:
keys
=
[]
for
di
in
record
[
field
]:
keys
.
extend
(
di
.
iterkeys
())
# in a set duplicate entries are removed
# the next statement is true when all keys are different
if
len
(
keys
)
==
len
(
set
(
keys
)):
di
=
record
[
field
][
0
]
for
i
in
range
(
1
,
len
(
record
[
field
])):
for
(
k
,
v
)
in
record
[
field
][
i
].
iteritems
():
di
[
k
]
=
v
record
[
field
]
=
di
# merge a single entity one dict into an existing big one
# works when key don't exist in the big one
# otherwise don't known what to do
#
# Example 1: the following list is kept unchanged
# [{'a': u'LHCB-PAPER-2014-047'},
# {'a': u'CERN-PH-EP-2014-221'},
# {'9': u'arXiv', 'a': u'arXiv:1410.0149', 'c': u'hep-ex'}]
#
else
:
index
=
nkeys
.
index
(
max
(
nkeys
))
di
,
ko
=
record
[
field
][
index
],
False
# check that key do not exist in the big one
keys
=
di
.
keys
()
for
i
in
range
(
len
(
record
[
field
])):
if
i
==
index
:
continue
for
k
in
record
[
field
][
i
].
iterkeys
():
if
k
in
di
:
ko
=
True
break
else
:
keys
.
append
(
k
)
if
ko
:
continue
# copy keys
for
i
in
range
(
len
(
record
[
field
])):
if
i
==
index
:
continue
for
(
k
,
v
)
in
record
[
field
][
i
].
iteritems
():
di
[
k
]
=
v
record
[
field
]
=
di
def
_decode_record
(
self
,
node
):
"""Transform the XML node I{<record>} into a L{Record}.
@type node: unicode
@param node: the I{<record>} node has the following structure::
<record>
<controlfield tag="001">1540265</controlfield>
<controlfield tag="005">20130410235250.0</controlfield>
<datafield tag="024" ind1="8" ind2=" ">
<subfield key="a">oai:cds.cern.ch:1540265</subfield>
<subfield key="p">cerncds:FULLTEXT</subfield>
<subfield key="p">cerncds:CERN:FULLTEXT</subfield>
<subfield key="p">cerncds:CERN</subfield>
</datafield>
...
</record>
@rtype: Record
@return: the keys of the record correspond to the I{datafield tag}.
# alias
host
=
record
.
host
()
key
=
record
.
reference_conference_key
()
"""
record
=
Record
()
# controlfield
for
controlfield
in
node
.
getElementsByTagName
(
'controlfield'
):
key
=
controlfield
.
getAttribute
(
'tag'
)
value
=
controlfield
.
childNodes
[
0
].
nodeValue
record
[
key
]
=
value
# datafield
for
datafield
in
node
.
getElementsByTagName
(
'datafield'
):
di
=
self
.
_decode_datafield
(
datafield
)
key
=
datafield
.
getAttribute
(
'tag'
)
ind1
=
datafield
.
getAttribute
(
'ind1'
).
replace
(
' '
,
''
)
ind2
=
datafield
.
getAttribute
(
'ind2'
).
replace
(
' '
,
''
)
# In almost all case the tag is an integer
# but from time to time it is equal to "FFT" (inspirehep) !!
if
not
REG_INT
.
match
(
key
):
continue
# for talk or proceeding a key is always defined
if
not
key
:
raise
Marc12Exception
(
MSG_NO_CONF
)
# build the key by concataining all attributes
key
=
"%s%s%s"
%
(
key
,
ind1
,
ind2
)
# get conference information
id
=
record
.
reference_conference_id
()
conference
=
self
.
_get_conference
(
host
,
id
,
key
)
# one occurrence of the key
if
key
not
in
record
:
record
[
key
]
=
di
# protection id can be a reference to other object like book
if
u
"111"
not
in
conference
:
return
# several occurrence of the key - transform a list of dictionary
elif
isinstance
(
record
[
key
],
list
):
record
[
key
].
append
(
di
)
# copy conference information in the current record
# the conference URL is in 8564u
record
[
u
"111"
]
=
conference
[
u
"111"
]
if
"8564"
in
conference
:
record
[
u
"8564"
]
=
conference
[
u
"8564"
]
else
:
record
[
key
]
=
[
record
[
key
],
di
]
def
_get_conference
(
self
,
host
,
conf_id
,
key
):
"""Get the conference data associated to the record.
The conference is identified by its id or key.
return
record
@type host: unicode
@param host:
def
_decode_datafield
(
self
,
n
ode
):
"""Transform the XML node I{<datafiled>} into a dictionary.
@type id: unic
ode
@param id:
@type
node
: unicode
@param
node: the I{<datafiled>} node has the following structure:
:
@type
key
: unicode
@param
key
:
<datafield tag="024" ind1="8" ind2=" ">
<subfield code="a">oai:cds.cern.ch:1540265</subfield>
<subfield code="p">cerncds:FULLTEXT</subfield>
<subfield code="p">cerncds:CERN:FULLTEXT</subfield>
<subfield code="p">cerncds:CERN</subfield>
</datafield>
@rtype: L{Record}
@return: The conference record
@rtype: dict
@return: the keys correspond to the I{subfield code} while the values
are a string of a list of strings.
@raise CheckException:
"""
di
=
{}
for
subfield
in
node
.
getElementsByTagName
(
'subfield'
):
code
=
str
(
subfield
.
getAttribute
(
'code'
))
value
=
''
if
subfield
.
childNodes
:
value
=
subfield
.
childNodes
[
0
].
nodeValue
if
code
not
in
di
:
di
[
code
]
=
value
elif
isinstance
(
di
[
code
],
list
):
di
[
code
].
append
(
value
)
cds
=
InvenioStore
(
host
)
# search the conference by id the preferred method
if
conf_id
:
xml
=
cds
.
get_record
(
conf_id
)
for
conference
in
IterRecord
(
xml
):
if
conference
.
id
()
==
conf_id
:
return
conference
# search the conference by key if the previous method failed.
# the method depends on the store.
if
key
:
ids
=
[]
if
cds
.
_host
.
startswith
(
'inspirehep'
):
key
=
key
.
replace
(
'/'
,
'-'
)
if
REG_CONF
.
match
(
key
):
ids
=
cds
.
get_ids
(
cc
=
'Conferences'
,
p
=
'111__g:%s'
%
key
)
else
:
di
[
code
]
=
[
di
[
code
],
value
]
return
di
ids
=
cds
.
get_ids
(
p
=
key
)
def
_is_not_xml
(
self
,
xml
):
"""C{True} when the C{xml} sting is well formed.
for
conf_id
in
ids
:
xml
=
cds
.
get_record
(
conf_id
)
for
conference
in
IterRecord
(
xml
):
if
conference
.
conference_key
()
==
key
:
return
conference
@type xml: unicode
@param xml:
@rtype: bool
"""
if
xml
.
startswith
(
"<?xml"
):
return
False
return
True
raise
Marc12Exception
(
MSG_NO_CONF
)
def
__call__
(
self
,
xml
,
filter
=
None
,
func
=
None
):
"""Transform the the XML string into a list of L{Record}
...
...
@@ -284,20 +149,15 @@ class Marc12(object):
"""
li
=
[]
if
self
.
_is_not_xml
(
xml
):
raise
Marc12Exception
(
MSG_WELL_FORMED_XML
)
dom
=
parseString
(
xml
)
root
=
dom
.
documentElement
for
node
in
root
.
getElementsByTagName
(
'record'
):
record
=
self
.
_decode_record
(
node
)
self
.
_clean_record
(
record
)
for
record
in
IterRecord
(
xml
):
if
is_institute
(
record
):
record
=
Institute
(
record
)
elif
is_conference
(
record
):
self
.
_add_conference_data
(
record
)
if
filter
and
not
filter
(
record
):
continue
...
...
modules/regex.py
View file @
e92f798b
# *-* coding: utf-8 *-*
""" A collections of regular expression defining rules
to be applied
on
field content
s
.
""" A collections of regular expression defining rules
validating
field
s
content.
@author: R. Le Gac
"""
# Collaboration field:
#
Valid
Collaboration field:
# - CMS Collaboration
# - CMS and LHCb Collaborations
# - ATLAS Collaboration, CMS Collaboration
# - Heavy Flavour Averaging Group
# - Heavy Flavour Averaging Group
# - CTA Consortium
# - any mixture of the above separated by a comma
#
REG_COLLABORATION
=
r
'^[A-Za-z0-9\-/, ]+([Cc]ollaboration|[Cc]onsortium|[Gg]roup)[s]?$'
# Conference dates
#
Valid
Conference dates
# - 3 Dec 2012
# - 10-14 Dec 2012
# - 28 Jun - 4 Jul 2012
...
...
@@ -25,13 +25,13 @@ reg2 = r'\d{1,2}-\d{1,2} [A-Z][a-z]{2} \d{4}'
reg3
=
r
'\d{1,2} [A-Z][a-z]{2} - \d{1,2} [A-Z][a-z]{2} \d{4}'
REG_CONF_DATES
=
r
'%s|%s|%s'
%
(
reg1
,
reg2
,
reg3
)
# Defence date
#
Valid
Defence date
# - 30 Dec 2012
#
REG_DEFENSE
=
r
'\d{2} [A-Z][a-z]{2} \d{4}'
# Submitted field
#
Valid
Submitted field
# - 2012-12
# - 2012-12-31
#
REG_SUBMITTED
=
r
'\d{4}-\d{2}(-\d{2})?'
\ No newline at end of file
REG_SUBMITTED
=
r
'\d{4}-\d{2}(-\d{2})?'
tests/harvester/CheckAndFix/test_acti_cds1411352_fix.py
View file @
e92f798b
...
...
@@ -14,6 +14,7 @@ Allow to test the brute force decoding with its mistakes.
Note:
* Only the first authors is defined
* The submitted date is 05 Jan 2012
* conference date 6 - 11 Dec 2010
"""
import
copy
...
...
@@ -26,20 +27,17 @@ from invenio_tools import CheckAndFix, load_record
@
pytest
.
fixture
(
scope
=
"module"
)
def
record
():
rec
=
load_record
(
'cds.cern.ch'
,
1411352
)
svc
=
CheckAndFix
()
svc
.
conference
(
rec
)
return
rec
@
pytest
.
fixture
(
scope
=
"module"
)
def
recordfix
(
record
):
svc
=
CheckAndFix
()
rec
=
copy
.
deepcopy
(
record
)
svc
=
CheckAndFix
()
svc
.
authors
(
rec
)
svc
.
conference
(
rec
)
svc
.
format_authors
(
rec
,
format_author_fr
)
svc
.
format_editor
(
rec
)
svc
.
my_authors
(
rec
)
...
...
@@ -54,40 +52,9 @@ def test_authors(record, recordfix):
assert
recordfix
.
authors
()
==
"O. Leroy"
def
test_collaboration
(
record
):
assert
record
.
collaboration
()
==
""
def
test_conference_country
(
record
):
assert
record
.
conference_country
()
==
"Italy"
def
test_conference_dates
(
record
):
assert
record
.
conference_dates
()
==
"6-11 Dec 2010"
def
test_conference_key
(
record
):
assert
record
.
conference_key
()
==
"rome20101206"
def
test_conference_location
(
record
):
assert
record
.
conference_location
()
==
"Rome, Italy"
def
test_conference_title
(
record
):
assert
record
.
conference_title
()
==
"Symposium on Prospects in the Physics of Discrete Symmetries"
def
test_conference_town
(
record
):
assert
record
.
conference_town
()
==
"Rome"
def
test_conference_url
(
record
):
assert
record
.
conference_url
()
==
"http://www.roma1.infn.it/discrete10"
def
test_conference_year
(
record
):
assert
record
.
conference_year
()
==
"2010"
def
test_conference_dates
(
record
,
recordfix
):
assert
record
.
conference_dates
()
==
"6 - 11 Dec 2010"
assert
recordfix
.
conference_dates
()
==
"6-11 Dec 2010"
def
test_first_author
(
record
,
recordfix
):
...
...
@@ -100,14 +67,6 @@ def test_first_institutes(record, recordfix):
assert
recordfix
.
first_author_institutes
()
==
"Marseille, CPPM"
def
test_host
(
record
):
assert
record
.
host
()
==
"cds.cern.ch"
def
test_id
(
record
):
assert
record
.
id
()
==
"1411352"
def
test_institutes
(
record
,
recordfix
):
assert
record
.
institutes
()
==
[]
assert
record
.
is_institute_defined
()
==
False
...
...
@@ -116,61 +75,6 @@ def test_institutes(record, recordfix):
assert
recordfix
.
is_institute_defined
()
==
True
def
test_is_proceeding
(
record
):
assert
record
.
is_conference_data
()
==
True
assert
record
.
is_published
()
==
True
assert
record
.
is_thesis
()
==
False
def
test_oai
(
record
):
assert
record
.
oai
()
==
"oai:cds.cern.ch:1411352"
assert
record
.
oai_url
()
==
"http://cds.cern.ch/record/1411352"