Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
d2a8a4fb
Commit
d2a8a4fb
authored
Sep 20, 2015
by
LE GAC Renaud
Browse files
Refactor Record in RecordConf, RecordInst, RecordPubli, RecordThesis classes.
parent
0ec8ce83
Changes
10
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
967 additions
and
878 deletions
+967
-878
modules/invenio_tools/__init__.py
modules/invenio_tools/__init__.py
+12
-4
modules/invenio_tools/base.py
modules/invenio_tools/base.py
+34
-2
modules/invenio_tools/exception.py
modules/invenio_tools/exception.py
+0
-1
modules/invenio_tools/marc12.py
modules/invenio_tools/marc12.py
+22
-13
modules/invenio_tools/record.py
modules/invenio_tools/record.py
+17
-830
modules/invenio_tools/recordconf.py
modules/invenio_tools/recordconf.py
+213
-0
modules/invenio_tools/recordinst.py
modules/invenio_tools/recordinst.py
+9
-19
modules/invenio_tools/recordpubli.py
modules/invenio_tools/recordpubli.py
+509
-0
modules/invenio_tools/recordthesis.py
modules/invenio_tools/recordthesis.py
+92
-0
tests/harvester/Marc12/test_marc.py
tests/harvester/Marc12/test_marc.py
+59
-9
No files found.
modules/invenio_tools/__init__.py
View file @
d2a8a4fb
...
...
@@ -3,21 +3,29 @@
@note: details on the invenio API at U{http://invenio-software.org/}
"""
from
base
import
(
is_conference
,
from
base
import
(
ARXIV
,
ARXIV_PDF
,
is_conference
,
is_institute
,
is_thesis
,
OAI_URL
,
REG_ARXIV_NUMBER
,
REG_OAI
,
REG_YEAR
)
REG_YEAR
,
THESIS_DIR
)
from
exception
import
(
CdsException
,
CheckException
,
InstituteException
,
Marc12Exception
,
RecordException
,
XmlException
)
from
checkandfix
import
CheckAndFix
,
load_record
from
institute
import
Institute
from
inveniostore
import
InvenioStore
from
iterrecord
import
IterRecord
from
marc12
import
Marc12
from
record
import
Record
from
recordconf
import
RecordConf
from
recordinst
import
RecordInst
from
recordpubli
import
RecordPubli
from
recordthesis
import
RecordThesis
modules/invenio_tools/base.py
View file @
d2a8a4fb
...
...
@@ -4,11 +4,17 @@
"""
import
re
# Label and base URL for arXiv preprints.
ARXIV = "arXiv"
ARXIV_PDF = "http://arxiv.org/pdf/"

# URL template with two string placeholders.
# NOTE(review): presumably filled with (host, record id) -- confirm with callers.
OAI_URL = "http://%s/record/%s"

# Regular expressions are written as raw strings so that the backslashes
# reach the regex engine untouched (no invalid escape sequence warnings).

# Two groups of digits separated by a dot.
REG_ARXIV_NUMBER = re.compile(r"\d+\.\d+")

# 'oai:<lowercase letters and dots>:<digits>', with both parts captured.
REG_OAI = re.compile(r'oai:([a-z\.]+):([\d]+)')

# Four consecutive digits, captured as group 1.
REG_YEAR = re.compile(r"(\d{4})")

# Value compared with the 700 "e" subfield to recognise a thesis director.
THESIS_DIR = u"dir."
def
is_conference
(
record
):
"""True when the record describes a publication related to a conference.
...
...
@@ -21,11 +27,22 @@ def is_conference(record):
to a conference.
"""
return
u
"111"
in
record
or
record
.
reference_conference_key
()
if
u
"111"
in
record
:
return
True
# try with the conference key
# the location of this values depends on the store
# cds.cern.ch (962, n) and inspirehep.net (773,w).
if
record
.
host
().
startswith
(
"cds"
):
field
,
subfield
=
u
"962"
,
"n"
else
:
field
,
subfield
=
u
"773"
,
"w"
return
len
(
record
.
_get
(
field
,
subfield
))
>
0
def
is_institute
(
record
):
"""
True when the record describes an institute.
"""True when the record describes an institute.
Args:
record (Record):
...
...
@@ -51,3 +68,18 @@ def is_institute(record):
return
True
return
False
def is_thesis(record):
    """Tell whether a MARC record describes a thesis.

    Args:
        record (Record): the record to inspect. Its ``_get`` method is
            queried for field 980, subfield a, with ``force_list=True``.

    Return:
        bool: true when the string 'THESIS' occurs in the 980 a values
            once they are joined with ", ".

    """
    collections = record._get(u"980", "a", force_list=True)
    return 'THESIS' in ", ".join(collections)
modules/invenio_tools/exception.py
View file @
d2a8a4fb
...
...
@@ -16,7 +16,6 @@ class ExceptionUTF8(Exception):
class
CdsException
(
ExceptionUTF8
):
pass
class
CheckException
(
ExceptionUTF8
):
pass
class
InstituteException
(
ExceptionUTF8
):
pass
class
Marc12Exception
(
ExceptionUTF8
):
pass
class
RecordException
(
ExceptionUTF8
):
pass
class
XmlException
(
ExceptionUTF8
):
pass
modules/invenio_tools/marc12.py
View file @
d2a8a4fb
...
...
@@ -5,11 +5,14 @@
import
re
from
base
import
is_conference
,
is_institute
from
base
import
is_conference
,
is_institute
,
is_thesis
from
exception
import
Marc12Exception
from
institute
import
Institute
from
inveniostore
import
InvenioStore
from
iterrecord
import
IterRecord
from
recordconf
import
RecordConf
from
recordinst
import
RecordInst
from
recordpubli
import
RecordPubli
from
recordthesis
import
RecordThesis
MSG_NO_CONF
=
"Reject no conference information"
REG_CONF
=
re
.
compile
(
"^C\d+-\d+-\d+(\.\d+)?$"
)
...
...
@@ -103,14 +106,13 @@ class Marc12(object):
for
conf_id
in
ids
:
xml
=
cds
.
get_record
(
conf_id
)
for
conference
in
IterRecord
(
xml
):
if
conference
.
conference_key
(
)
==
key
:
if
conference
.
_get
(
u
"111"
,
"g"
)
==
key
:
return
conference
raise
Marc12Exception
(
MSG_NO_CONF
)
def
__call__
(
self
,
xml
,
filter
=
None
,
func
=
None
):
"""Transform the the XML string into a list of L{Record}
or L{Institute}
"""Transform the the XML string into a list of L{Record}.
@type xml: unicode
@param xml: the XML string has the following structure::
...
...
@@ -143,7 +145,7 @@ class Marc12(object):
The argument of the function is a Record.
It can be used to polish the record content.
@rtype: list of L{Record}
or L{Institute}
@rtype: list of L{Record}
@raise Marc12Exception: not well formed XML.
...
...
@@ -152,18 +154,25 @@ class Marc12(object):
for
record
in
IterRecord
(
xml
):
if
is_institute
(
record
):
record
=
Institute
(
record
)
if
is_conference
(
record
):
upcast_record
=
RecordConf
(
record
)
self
.
_add_conference_data
(
upcast_record
)
elif
is_
conferenc
e
(
record
):
self
.
_add_conference_data
(
record
)
elif
is_
institut
e
(
record
):
upcast_record
=
RecordInst
(
record
)
if
filter
and
not
filter
(
record
):
elif
is_thesis
(
record
):
upcast_record
=
RecordThesis
(
record
)
else
:
upcast_record
=
RecordPubli
(
record
)
if
filter
and
not
filter
(
upcast_record
):
continue
if
func
:
func
(
record
)
func
(
upcast_
record
)
li
.
append
(
record
)
li
.
append
(
upcast_
record
)
return
li
modules/invenio_tools/record.py
View file @
d2a8a4fb
This diff is collapsed.
Click to expand it.
modules/invenio_tools/recordconf.py
0 → 100644
View file @
d2a8a4fb
# -*- coding: utf-8 -*-
""" invenio_tools.recordconf
"""
import
re
from
base
import
REG_YEAR
from
plugin_dbui
import
CLEAN_SPACES
from
recordpubli
import
RecordPubli
class RecordConf(RecordPubli):
    """MARC record describing a conference talk or a proceeding.

    The relation between methods and MARC fields is the following::

                              | CDS     | INSPIREHEP
        ----------------------+---------+----------
        conference date       | 111 d   |
        conference end        | 111 z   | None
        conference key        | 111 g   |
        conference location   | 111 c   |
        conference title      | 111 a   |
        conference start      | None    | 111 x
        conference URL        | 8564 u  |
        conference year       | 111 f   |
        ref. conf. id         | 962 b   |
        ref. conf. key        | 962 n   | 773 w
        ref. conf. proceeding | 7870 w  |
        ref. conf. talk       | 7870 w  |
        ----------------------+---------+----------

    """

    def conference_dates(self):
        """The dates of the conference (field 111, subfield d).

        @rtype: unicode
        @return:
            - Free text such as '6-9 March'.
            - The format is not standardized and can vary between
              records and between stores.
            - The value is not a standardized C{date}.

        """
        return self._get(u"111", "d")

    def conference_country(self):
        """The country where the conference took place.

        It is the text following the last comma of
        L{conference_location}.

        @rtype: unicode
        @return:
            - Empty string when the location is empty.
            - The filter L{CLEAN_SPACES} is applied.

        """
        location = self.conference_location()
        if not location:
            return ''

        return CLEAN_SPACES(location.split(',')[-1])

    def conference_key(self):
        """The conference key used in the store (field 111, subfield g).

        @rtype: unicode
        @return:

        """
        return self._get(u"111", "g")

    def conference_location(self):
        """The conference location (field 111, subfield c).

        @rtype: unicode
        @return:
            - The expected format is C{'town, country'}.
            - The filter L{CLEAN_SPACES} is applied.

        """
        place = self._get(u"111", "c")

        # Some records carry a pair like
        # [u'NOW 2012', u'Conca Specchiulla, Otranto, Lecce, Italy'];
        # only the second item is kept in that case.
        is_pair = isinstance(place, list) and len(place) == 2
        if is_pair:
            place = place[1]

        return CLEAN_SPACES(place)

    def conference_title(self):
        """The title of the conference (field 111, subfield a).

        @rtype: unicode
        @return:
            - The filter L{CLEAN_SPACES} is applied.

        """
        title = self._get(u"111", "a")
        return CLEAN_SPACES(title)

    def conference_town(self):
        """The town where the conference took place.

        It is the text preceding the first comma of
        L{conference_location}.

        @rtype: unicode
        @return:
            - An empty string when the location is empty.
            - The filter L{CLEAN_SPACES} is applied.

        """
        location = self.conference_location()
        if not location:
            return ''

        return CLEAN_SPACES(location.split(',')[0])

    def conference_url(self):
        """The URL of the conference home page (field 8564, subfield u).

        @rtype: unicode
        @return:
            - The first URL which does not end with 'pdf'.
            - Empty string when there is no such URL.

        """
        urls = self._get(u"8564", "u", force_list=True)

        # From time to time this field references a pdf file
        # instead of the conference home page: drop those entries.
        candidates = [url for url in urls if not url.endswith('pdf')]

        # When several URLs remain, the first one is selected arbitrarily.
        return candidates[0] if candidates else u''

    def conference_year(self):
        """The year of the conference.

        @rtype: unicode
        @return:
            - The value of field 111, subfield f when it is not empty.
            - Otherwise the first match of L{REG_YEAR} in
              L{conference_dates}.
            - Empty string when both fail.

        """
        year = self._get(u"111", "f")

        if not year:
            # recover the year from the conference dates
            found = REG_YEAR.search(self.conference_dates())
            year = found.group(1) if found else u''

        return year

    def reference_conference_id(self):
        """The C{id} of the conference when the record is a proceeding
        or a conference talk (field 962, subfield b).

        @rtype: unicode
        @return:
            - NOTE(review): presumably empty when the field is missing;
              this depends on C{_get}, to be confirmed.

        """
        return self._get(u"962", "b")

    def reference_conference_key(self):
        """The conference C{key} when the record is a proceeding
        or a conference talk.

        The location of the key depends on the store:
        (962, n) or, for a host starting with "inspirehep", (773, w).

        @rtype: unicode
        @return:
            - Empty string when the key is not found.

        """
        key = ''

        if self.host().startswith("inspirehep") and u"773" in self:
            # The 773 field is either a dictionary or a list of
            # dictionaries. Two kinds of entry exist: one for the
            # proceeding, the other holding the conference key.
            # All topologies are met: proc, conf, proc + conf.
            field = self[u"773"]

            if isinstance(field, dict) and "w" in field:
                key = field["w"]

            elif isinstance(field, list):
                # the last entry holding a "w" subfield wins
                for entry in field:
                    if "w" in entry:
                        key = entry["w"]

        elif u"962" in self and "n" in self[u"962"]:
            key = self[u"962"]["n"]

        return key

    def reference_conference_proceeding(self):
        """The id of the proceeding when the record is a conference talk
        (field 7870, subfield w).

        @rtype: unicode
        @return: record id

        """
        return self._get(u"7870", "w")

    def reference_conference_talk(self):
        """The id of the conference talk when the record is a proceeding
        (field 7870, subfield w).

        @rtype: unicode
        @return: record id

        """
        return self._get(u"7870", "w")
modules/invenio_tools/
institute
.py
→
modules/invenio_tools/
recordinst
.py
View file @
d2a8a4fb
# -*- coding: utf-8 -*-
""" invenio_tools.
institute
""" invenio_tools.
recordinst
"""
from
base
import
is_institute
from
exception
import
Institute
Exception
from
exception
import
Record
Exception
from
record
import
Record
...
...
@@ -12,11 +12,9 @@ MSG_INVALID_HOST = "Invalid record host"
MSG_INVALID_RECORD
=
"Invalid record, it is not describing an institute"
class
Institute
(
dict
):
"""MARC record representing an institute. More information on MARC
standard at U{http://www.loc.gov/marc/bibliographic/}).
The relation between methods and MARC field is the following::
class
RecordInst
(
Record
):
"""MARC record describing an institute.
The relation between methods and MARC fields are the following::
| INSPIREHEP |
----------------------+-------------+
...
...
@@ -35,15 +33,15 @@ class Institute(dict):
"""
if
not
isinstance
(
record
,
Record
):
raise
Institute
Exception
(
MSG_INVALID_ARG
)
raise
Record
Exception
(
MSG_INVALID_ARG
)
if
not
is_institute
(
record
):
raise
Institute
Exception
(
MSG_INVALID_RECORD
)
raise
Record
Exception
(
MSG_INVALID_RECORD
)
if
record
.
host
()
!=
'inspirehep.net'
:
raise
Institute
Exception
(
MSG_INVALID_INSTITUTE
)
raise
Record
Exception
(
MSG_INVALID_INSTITUTE
)
dict
.
__init__
(
self
,
record
)
Record
.
__init__
(
self
,
record
)
def
future_id
(
self
):
"""
...
...
@@ -53,14 +51,6 @@ class Institute(dict):
"""
return
self
[
u
"110"
][
"t"
]
def
id
(
self
):
"""
Returns:
unicode: the inspirehep id.
"""
return
self
[
u
"110"
][
"u"
]
def
name
(
self
):
"""
Returns:
...
...
modules/invenio_tools/recordpubli.py
0 → 100644
View file @
d2a8a4fb
# -*- coding: utf-8 -*-
""" invenio_tools.recordpubli
"""
import
re
from
base
import
ARXIV
,
ARXIV_PDF
,
REG_ARXIV_NUMBER
,
REG_YEAR
,
THESIS_DIR
from
filters
import
CLEAN_COLLABORATION
from
plugin_dbui
import
CLEAN_SPACES
from
record
import
Record
class
RecordPubli
(
Record
):
"""MARC record describing a publication.
The relation between methods and MARC fields are the following::
| CDS | INSPIREP
----------------------+---------+----------
authors | 700 a |
collaboration | 710 g |
first author | 100 a |
institutes | 700 u |
paper editor | 773 p |
paper pages | 773 c |
paper reference | 773 o |
paper URL | 8564 u |
paper volume | 773 v |
paper year | 773 y |
preprint number | 037 a |
report number | 088 a | 037a
submitted | 269 c |
title | 245 a |
year | 260 c |
----------------------+---------+----------
"""
def authors(self, cmpFct=None):
    """The author(s) signing the publication.

    @type cmpFct: reference to a function or None
    @param cmpFct: function used to order the author names.
        It is handed over as the C{key} argument of C{list.sort}:
        it receives a single author name and returns the value used
        to rank that name. It is NOT a two-argument comparison
        function returning -1, 0 or 1.
        Names keep the order of L{authors_as_list} when it is C{None}.

    @rtype: unicode
    @return:
        - Author names are separated by ", ".
        - Authors are sorted according to the key C{cmpFct}.
        - The string is empty when there are no authors.

    """
    names = self.authors_as_list()

    if cmpFct:
        names.sort(key=cmpFct)

    return u', '.join(names)
def authors_as_list(self):
    """The list of author(s) signing the publication.

    Names are read from subfield "a" of the 700 field, which is either
    a single dictionary or a list of dictionaries.

    @rtype: list
    @return:
        - The list is empty when the 700 field is missing or is neither
          a dictionary nor a list.

    """
    if u"700" not in self:
        return []

    field = self[u"700"]

    # normalise the single-author case to a list of entries
    if isinstance(field, dict):
        entries = [field]
    elif isinstance(field, list):
        entries = field
    else:
        return []

    # NOTE: the content of the 700 field depends on the record type.
    # For a thesis it also contains the name of the director,
    # flagged by the "e" subfield, which is left out.
    return [entry["a"] for entry in entries
            if not ("e" in entry and entry["e"] == THESIS_DIR)]
def collaboration(self):
    """The collaboration(s) signing the publication
    (field 710, subfield g).

    @rtype: unicode
    @return:
        - Collaboration names are separated by ", ".
        - The filter L{CLEAN_COLLABORATION} is applied.

    """
    names = self._get(u"710", 'g', force_list=True)
    joined = ', '.join(names)
    return CLEAN_COLLABORATION(joined)
def find_authors(self, pattern):
    """Find authors matching the regular expression C{pattern}.

    @type pattern: unicode
    @param pattern: regular expression defining the author names.
        It is searched anywhere in each name of L{authors_as_list}.

    @rtype: unicode
    @return:
        - Author names are separated by ", ".
        - The string is empty when nothing is found.

    """
    search = re.compile(pattern).search
    matching = [name for name in self.authors_as_list() if search(name)]
    return u', '.join(matching)
def
find_authors_by_institute
(
self
,
pattern
,
cmpFct
=
None
):
"""Find authors belonging to a given institute(s) defined by a regular
expression.
@type pattern: unicode
@param pattern: regular expression defining the institute name(s)
@type cmpFct: reference to a function
@param cmpFct: Compare author names.
The comparison function takes two items and returns -1, 0, or 1
depending on whether the first argument is considered smaller than,
equal to, or larger than the second one.
@rtype: unicode or None
@return:
- Author names are separated by ", ".
- Author are sorted according to the function C{cmpFct}.
- C{None} when authors are not found.
"""
# authors not defined
if
(
u
"100"
not
in
self
)
and
(
u
"700"
not
in
self
):
return
None
# compile the searching criteria
regex
=
re
.
compile
(
pattern
)
# single author in the author list
if
u
"700"
in
self
and
isinstance
(
self
[
u
"700"
],
dict
):
if
"u"
not
in
self
[
u
"700"
]:
return
None
s
=
self
[
u
"700"
][
"u"
]
if
isinstance
(
s
,
list
):
s
=
', '
.
join
(
s
)
if
regex
.
search
(
s
):
return
self
[
u
"700"
][
"a"
]
return
None
# list of authors
elif
u
"700"
in
self
and
isinstance
(
self
[
u
"700"
],
list
):
authors
=
[]
for
di
in
self
[
u
"700"
]:
if
'u'
not
in
di
:
return
None
s
=
di
[
'u'
]
if
isinstance
(
di
[
'u'
],
list
):
s
=
', '
.
join
(
di
[
'u'
])
if
regex
.
search
(
s
):