Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
d2a8a4fb
Commit
d2a8a4fb
authored
Sep 20, 2015
by
LE GAC Renaud
Browse files
Refactor Record in RecordConf, RecordInst, RecordPubli, RecordThesis classes.
parent
0ec8ce83
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
967 additions
and
878 deletions
+967
-878
modules/invenio_tools/__init__.py
modules/invenio_tools/__init__.py
+12
-4
modules/invenio_tools/base.py
modules/invenio_tools/base.py
+34
-2
modules/invenio_tools/exception.py
modules/invenio_tools/exception.py
+0
-1
modules/invenio_tools/marc12.py
modules/invenio_tools/marc12.py
+22
-13
modules/invenio_tools/record.py
modules/invenio_tools/record.py
+17
-830
modules/invenio_tools/recordconf.py
modules/invenio_tools/recordconf.py
+213
-0
modules/invenio_tools/recordinst.py
modules/invenio_tools/recordinst.py
+9
-19
modules/invenio_tools/recordpubli.py
modules/invenio_tools/recordpubli.py
+509
-0
modules/invenio_tools/recordthesis.py
modules/invenio_tools/recordthesis.py
+92
-0
tests/harvester/Marc12/test_marc.py
tests/harvester/Marc12/test_marc.py
+59
-9
No files found.
modules/invenio_tools/__init__.py
View file @
d2a8a4fb
...
...
@@ -3,21 +3,29 @@
@note: details on the invenio API at U{http://invenio-software.org/}
"""
from
base
import
(
is_conference
,
from
base
import
(
ARXIV
,
ARXIV_PDF
,
is_conference
,
is_institute
,
is_thesis
,
OAI_URL
,
REG_ARXIV_NUMBER
,
REG_OAI
,
REG_YEAR
)
REG_YEAR
,
THESIS_DIR
)
from
exception
import
(
CdsException
,
CheckException
,
InstituteException
,
Marc12Exception
,
RecordException
,
XmlException
)
from
checkandfix
import
CheckAndFix
,
load_record
from
institute
import
Institute
from
inveniostore
import
InvenioStore
from
iterrecord
import
IterRecord
from
marc12
import
Marc12
from
record
import
Record
from
recordconf
import
RecordConf
from
recordinst
import
RecordInst
from
recordpubli
import
RecordPubli
from
recordthesis
import
RecordThesis
modules/invenio_tools/base.py
View file @
d2a8a4fb
...
...
@@ -4,11 +4,17 @@
"""
import
re
# Name of the arXiv repository and base URL of its PDF files.
ARXIV = "arXiv"
ARXIV_PDF = "http://arxiv.org/pdf/"

# URL template with two string placeholders.
# NOTE(review): presumably filled with the host and the record id
# captured by REG_OAI -- confirm against the callers.
OAI_URL = "http://%s/record/%s"

# The patterns below are raw strings: the backslashes reach the regular
# expression engine untouched and no invalid escape sequence is reported
# by the interpreter. The pattern texts themselves are unchanged.

# digits, a dot, digits (e.g. 1509.01234)
REG_ARXIV_NUMBER = re.compile(r"\d+\.\d+")

# "oai:<lowercase letters and dots>:<digits>", both parts are captured
REG_OAI = re.compile(r'oai:([a-z\.]+):([\d]+)')

# four consecutive digits, captured as group 1
REG_YEAR = re.compile(r"(\d{4})")

# NOTE(review): presumably the role label identifying a thesis director
# in an author entry -- confirm against the code using it.
THESIS_DIR = u"dir."
def
is_conference
(
record
):
"""True when the record describes a publication related to a conference.
...
...
@@ -21,11 +27,22 @@ def is_conference(record):
to a conference.
"""
return
u
"111"
in
record
or
record
.
reference_conference_key
()
if
u
"111"
in
record
:
return
True
# try with the conference key
# the location of this values depends on the store
# cds.cern.ch (962, n) and inspirehep.net (773,w).
if
record
.
host
().
startswith
(
"cds"
):
field
,
subfield
=
u
"962"
,
"n"
else
:
field
,
subfield
=
u
"773"
,
"w"
return
len
(
record
.
_get
(
field
,
subfield
))
>
0
def
is_institute
(
record
):
"""
True when the record describes an institute.
"""True when the record describes an institute.
Args:
record (Record):
...
...
@@ -51,3 +68,18 @@ def is_institute(record):
return
True
return
False
def is_thesis(record):
    """Tell whether the record describes a thesis.

    Args:
        record (Record): the MARC record to inspect.

    Return:
        bool: true when the string ``THESIS`` appears in the values
            stored in the field 980, subfield a.

    """
    collections = record._get(u"980", "a", force_list=True)
    return 'THESIS' in ", ".join(collections)
modules/invenio_tools/exception.py
View file @
d2a8a4fb
...
...
@@ -16,7 +16,6 @@ class ExceptionUTF8(Exception):
# Exception types exposed by the package. Each one derives from
# ExceptionUTF8 and adds no behaviour of its own: the distinct classes
# only let callers tell the error categories apart.
# NOTE(review): the category of each class is inferred from its name only.
class CdsException(ExceptionUTF8): pass
class CheckException(ExceptionUTF8): pass
class InstituteException(ExceptionUTF8): pass
class Marc12Exception(ExceptionUTF8): pass
class RecordException(ExceptionUTF8): pass
class XmlException(ExceptionUTF8): pass
modules/invenio_tools/marc12.py
View file @
d2a8a4fb
...
...
@@ -5,11 +5,14 @@
import
re
from
base
import
is_conference
,
is_institute
from
base
import
is_conference
,
is_institute
,
is_thesis
from
exception
import
Marc12Exception
from
institute
import
Institute
from
inveniostore
import
InvenioStore
from
iterrecord
import
IterRecord
from
recordconf
import
RecordConf
from
recordinst
import
RecordInst
from
recordpubli
import
RecordPubli
from
recordthesis
import
RecordThesis
# Message carried by the exception raised when no conference is found.
MSG_NO_CONF = "Reject no conference information"

# Conference key: "C<digits>-<digits>-<digits>" optionally followed by
# ".<digits>". Written as a raw string so that the backslashes are not
# interpreted (and reported) as invalid string escapes.
REG_CONF = re.compile(r"^C\d+-\d+-\d+(\.\d+)?$")
...
...
@@ -103,14 +106,13 @@ class Marc12(object):
for
conf_id
in
ids
:
xml
=
cds
.
get_record
(
conf_id
)
for
conference
in
IterRecord
(
xml
):
if
conference
.
conference_key
(
)
==
key
:
if
conference
.
_get
(
u
"111"
,
"g"
)
==
key
:
return
conference
raise
Marc12Exception
(
MSG_NO_CONF
)
def
__call__
(
self
,
xml
,
filter
=
None
,
func
=
None
):
"""Transform the the XML string into a list of L{Record}
or L{Institute}
"""Transform the XML string into a list of L{Record}.
@type xml: unicode
@param xml: the XML string has the following structure::
...
...
@@ -143,7 +145,7 @@ class Marc12(object):
The argument of the function is a Record.
It can be used to polish the record content.
@rtype: list of L{Record}
or L{Institute}
@rtype: list of L{Record}
@raise Marc12Exception: not well formed XML.
...
...
@@ -152,18 +154,25 @@ class Marc12(object):
for
record
in
IterRecord
(
xml
):
if
is_institute
(
record
):
record
=
Institute
(
record
)
if
is_conference
(
record
):
upcast_record
=
RecordConf
(
record
)
self
.
_add_conference_data
(
upcast_record
)
elif
is_
conferenc
e
(
record
):
self
.
_add_conference_data
(
record
)
elif
is_
institut
e
(
record
):
upcast_record
=
RecordInst
(
record
)
if
filter
and
not
filter
(
record
):
elif
is_thesis
(
record
):
upcast_record
=
RecordThesis
(
record
)
else
:
upcast_record
=
RecordPubli
(
record
)
if
filter
and
not
filter
(
upcast_record
):
continue
if
func
:
func
(
record
)
func
(
upcast_
record
)
li
.
append
(
record
)
li
.
append
(
upcast_
record
)
return
li
modules/invenio_tools/record.py
View file @
d2a8a4fb
...
...
@@ -2,21 +2,10 @@
""" invenio_tools.record
"""
import
re
import
pprint
from
base
import
OAI_URL
,
REG_YEAR
,
REG_OAI
from
filters
import
CLEAN_COLLABORATION
,
CLEAN_THESIS_DEFENSE
from
plugin_dbui
import
CLEAN_SPACES
ARXIV
=
"arXiv"
ARXIV_PDF
=
"http://arxiv.org/pdf/"
REG_ARXIV_NUMBER
=
re
.
compile
(
"\d+\.\d+"
)
THESIS_DIR
=
u
"dir."
from
base
import
OAI_URL
,
REG_OAI
class
Record
(
dict
):
...
...
@@ -32,55 +21,24 @@ class Record(dict):
dict2(subfield1=..., subfield2=...), ...]
In the MARC standard, the C{field} is a string containing at least three digit
while the C{subfield} is a letter. The type of the C{field}
and C{subfield}
is string.
while the C{subfield} is a letter. The type of the C{field}
is unicode
and C{subfield}
is string.
The class comes with a collection of methods to extract the record
information
ignor
ing the C{field} and the C{subfield} codification.
information
mask
ing the C{field} and the C{subfield} codification.
The re
a
ltion between methods and MARC field
is
the following::
The rel
a
tion between methods and MARC field
s are
the following::
| CDS | INSPIREP
----------------------+---------+----------
authors | 700 a |
collaboration | 710 g |
conference date | 111 d |
conference end | 111 z | None
conference key | 111 g |
conference location | 111 c |
conference title | 111 a |
conference start | None | 111 x
conference URL | 8564 u |
conference year | 111 f |
first author | 100 a |
id | 001 |
institutes | 700 u |
oai | 0248 a | 909CO o
paper editor | 773 p |
paper pages | 773 c |
paper reference | 773 o |
paper URL | 8564 u |
paper volume | 773 v |
paper year | 773 y |
preprint number | 037 a |
ref. conf. id | 962 b |
ref. conf. key | 962 n | 773 w
ref. conf. proceeding | 7870 w |
ref. conf. talk | 7870 w |
report number | 088 a | 037a
submitted | 269 c |
these defense | 500 a |
these level | 502 a |
these director | 700 a |
these universities | 502 b |
title | 245 a |
year | 260 c |
----------------------+---------+----------
"""
def
__init__
(
self
):
def
__init__
(
self
,
*
args
):
dict
.
__init__
(
self
)
dict
.
__init__
(
self
,
*
args
)
# private cache
self
.
__host
=
None
...
...
@@ -93,8 +51,8 @@ class Record(dict):
def
_get
(
self
,
field
,
subfield
,
force_list
=
False
):
"""Get the value associated to the key C{field} and C{subfield}.
@type field:
str
@param field: typical values are "001", "700", "909CO", ....
@type field:
unicode
@param field: typical values are
u
"001",
u
"700",
u
"909CO", ....
@type subfield: str
@param subfield: typical values are "a", "b", ....
...
...
@@ -125,290 +83,12 @@ class Record(dict):
return
val
def authors(self, cmpFct=None):
    """The author(s) signing the publication.

    @type cmpFct: reference to a function or None
    @param cmpFct: function of one argument, the author name, returning
        the value used to order the authors. It is passed as the C{key}
        of the sort. The order of L{authors_as_list} is kept when it
        is C{None}.

    @rtype: unicode
    @return:
        - Author names are separated by ", ".
        - The string is empty when there is no authors.

    """
    names = self.authors_as_list()
    ordered = sorted(names, key=cmpFct) if cmpFct else names
    return u', '.join(ordered)
def authors_as_list(self):
    """The list of author(s) signing the publication.

    The names are read from the subfield C{a} of the field 700.
    Entries whose subfield C{e} is equal to C{THESIS_DIR} are ignored,
    since for a thesis the field 700 also contains the director.

    @rtype: list
    @return:
        - The list is empty when authors are not defined.

    """
    entries = self.get("700")

    # a single author is stored as a dictionary, several as a list
    if isinstance(entries, dict):
        entries = [entries]

    elif not isinstance(entries, list):
        return []

    return [el["a"] for el in entries
            if not ("e" in el and el["e"] == THESIS_DIR)]
def collaboration(self):
    """The collaboration(s) signing the publication.

    @rtype: unicode
    @return:
        - The values of the field 710, subfield g, joined with ", ".
        - The filter L{CLEAN_COLLABORATION} is applied to the result.

    """
    names = self._get("710", 'g', force_list=True)
    joined = ', '.join(names)
    return CLEAN_COLLABORATION(joined)
def conference_dates(self):
    """The dates of the conference.

    @rtype: unicode
    @return:
        - The value of the field 111, subfield d, e.g. '6-5 March'.
        - The format is not standardized and can vary
          between records and between stores.
        - The value is not a standardized C{date}.

    """
    dates = self._get("111", 'd')
    return dates
def conference_country(self):
    """The country where the conference took place.

    The country is the text following the last comma
    of the conference location.

    @rtype: unicode
    @return:
        - Empty string when the location is not defined.
        - The filter L{CLEAN_SPACES} is applied.

    """
    location = self.conference_location()
    if not location:
        return ''

    return CLEAN_SPACES(location.rsplit(',', 1)[-1])
def conference_key(self):
    """The conference key used in the store.

    @rtype: unicode
    @return: the value of the field 111, subfield g.

    """
    key = self._get("111", 'g')
    return key
def conference_location(self):
    """The conference location.

    @rtype: unicode
    @return:
        - The value of the field 111, subfield c, after applying
          the filter L{CLEAN_SPACES}.
        - The expected format is C{'town, country'}.

    """
    place = self._get("111", 'c')

    # protection against a two-item list like
    # [u'NOW 2012', u'Conca Specchiulla, Otranto, Lecce, Italy']
    # in that case only the second item is kept
    two_items = isinstance(place, list) and len(place) == 2

    return CLEAN_SPACES(place[1] if two_items else place)
def conference_title(self):
    """The title of the conference.

    @rtype: unicode
    @return: the value of the field 111, subfield a, after applying
        the filter L{CLEAN_SPACES}.

    """
    title = self._get("111", 'a')
    return CLEAN_SPACES(title)
def conference_town(self):
    """The town where the conference took place.

    The town is the text preceding the first comma
    of the conference location.

    @rtype: unicode
    @return:
        - An empty string when the location is not defined.
        - The filter L{CLEAN_SPACES} is applied.

    """
    location = self.conference_location()
    if not location:
        return ''

    return CLEAN_SPACES(location.partition(',')[0])
def conference_url(self):
    """The URL of the conference home page.

    @rtype: unicode
    @return:
        - Select arbitrarily the first URL when more than one is found.
        - Empty string when not defined.

    """
    candidates = self._get("8564", 'u', force_list=True)

    # protection
    # from time to time this field contains the reference to the pdf file
    urls = [url for url in candidates if not url.endswith('pdf')]

    # if more than one URL is associated to the record
    # select arbitrarily the first one
    return urls[0] if urls else u''
def conference_year(self):
    """The year of the conference.

    @rtype: unicode
    @return:
        - The value of the field 111, subfield f, when it is defined.
        - Otherwise the first match of L{REG_YEAR} in the conference
          dates.
        - Empty string when both fail.

    """
    explicit = self._get("111", 'f')
    if explicit:
        return explicit

    # recovery from conference dates
    found = REG_YEAR.search(self.conference_dates())
    return found.group(1) if found else u''
def debug(self):
    """Pretty-print the record structure on the standard output.

    """
    printer = pprint.PrettyPrinter()
    printer.pprint(self)
def find_authors(self, pattern):
    """Find authors matching the regular expression C{pattern}.

    @type pattern: unicode
    @param pattern: regular expression searched in each author name.

    @rtype: unicode
    @return:
        - Matching author names, in the order of L{authors_as_list},
          separated by ", ".
        - The string is empty when nothing is found.

    """
    matcher = re.compile(pattern)
    selected = [name for name in self.authors_as_list()
                if matcher.search(name)]

    return u', '.join(selected)
def find_authors_by_institute(self, pattern, cmpFct=None):
    """Find authors belonging to a given institute(s) defined by a regular
    expression.

    The search only looks at the field 700: the subfield C{u} holds the
    affiliation(s) of an author and the subfield C{a} holds the name.

    @type pattern: unicode
    @param pattern: regular expression defining the institute name(s)

    @type cmpFct: reference to a function
    @param cmpFct: function of one argument, the author name, returning
        the value used to order the authors. It is passed as the C{key}
        of the sort and is only used when the field 700 is a list.

    @rtype: unicode or None
    @return:
        - C{None} when neither the field 100 nor the field 700 exists.
        - C{None} when an author entry has no affiliation (subfield
          C{u}), even if other entries match.
        - Field 700 is a dictionary (single author): the author name
          when the affiliation matches, C{None} otherwise.
        - Field 700 is a list: matching author names separated by ", ",
          sorted with C{cmpFct} when given. The string is empty when
          nothing matches.

    """
    # authors not defined
    if ("100" not in self) and ("700" not in self):
        return None

    # compile the searching criteria
    regex = re.compile(pattern)

    # single author in the author list
    if "700" in self and isinstance(self["700"], dict):

        if 'u' not in self["700"]:
            return None

        # several affiliations are merged in a single string
        # before applying the regular expression
        s = self["700"]['u']
        if isinstance(s, list):
            s = ', '.join(s)

        if regex.search(s):
            return self["700"]["a"]

        return None

    # list of authors
    elif "700" in self and isinstance(self["700"], list):

        authors = []
        for di in self["700"]:

            # give up as soon as one author has no affiliation
            if 'u' not in di:
                return None

            # several affiliations are merged in a single string
            s = di['u']
            if isinstance(di['u'], list):
                s = ', '.join(di['u'])

            if regex.search(s):
                authors.append(di['a'])

        # NOTE(review): the nesting of the two statements below is
        # assumed to be inside the list branch -- confirm.
        if cmpFct:
            authors.sort(key=cmpFct)

        return u', '.join(authors)
def first_author(self):
    """The name of the first author.

    @rtype: unicode
    @return: the value of the field 100, subfield a.

    """
    name = self._get("100", 'a')
    return name
def first_author_institutes(self):
    """The institute(s) associated to the first author.

    @rtype: unicode
    @return:
        - The values of the field 100, subfield u, separated by ", ".
        - The string is empty when institutes are not defined.

    """
    affiliations = self._get("100", 'u', force_list=True)
    return u', '.join(affiliations)
def
host
(
self
):
"""The host housing the record.
...
...
@@ -449,114 +129,13 @@ class Record(dict):
"""The id of the record in the store.
@rtype: unicode
@return: the unique id of the record in the store
"""
return
self
[
"001"
]
def institutes(self):
    """The list of institutes signing the publication.

    The institutes are read from the field 700, subfield u.

    @rtype: list
    @return:
        - Each institute appears once.
        - The list is sorted in alphabetic order.

    """
    unique = set()

    # each entry can be a string or a list when the author has
    # several affiliations
    for entry in self._get("700", 'u', force_list=True):
        if isinstance(entry, list):
            unique.update(entry)
        else:
            unique.add(entry)

    # duplicates are removed by the set, sort what remains
    return sorted(unique)
def is_conference_data(self):
    """C{True} when the record contains conference information.

    @rtype: bool
    @return: C{True} when the field 111 is present in the record.

    """
    has_conference = "111" in self
    return has_conference
def
is_institute_defined
(
self
):
"""C{True} when institutes are defined for all authors.
@rtype: bool
@return:
"""
if
"700"
not
in
self
:
return
False
# dict case
if
isinstance
(
self
[
"700"
],
dict
):
return
"u"
in
self
[
"700"
]
# list case
elif
isinstance
(
self
[
"700"
],
list
):
for
el
in
self
[
"700"
]:
if
isinstance
(
el
,
dict
):
if
'u'
in
el
:
continue