Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Docker-in-Docker (DinD) capabilities of public runners deactivated.
More info
Open sidebar
limbra
limbra
Commits
f530752c
Commit
f530752c
authored
Jun 28, 2017
by
LE GAC Renaud
Browse files
Migrate CheckAndFix: methods required by the Thesis harvester.
parent
e8154552
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
91 additions
and
84 deletions
+91
-84
modules/harvest_tools/checkandfix.py
modules/harvest_tools/checkandfix.py
+31
-84
tests/basis/test_12_CheckAndFix_talk.py
tests/basis/test_12_CheckAndFix_talk.py
+60
-0
No files found.
modules/harvest_tools/checkandfix.py
View file @
f530752c
...
...
@@ -10,8 +10,7 @@ from .base import search_synonym, ToolException
from
datetime
import
datetime
from
.exception
import
CheckException
from
gluon
import
current
from
invenio_tools
import
(
DECODE_REF
,
MSG_NO_CONF
,
from
invenio_tools
import
(
MSG_NO_CONF
,
MSG_NO_THESIS
,
OAI_URL
,
RecordConf
,
...
...
@@ -297,10 +296,7 @@ class CheckAndFix(object):
bool: ``True`` if *one* row is found, ``False`` otherwise.
"""
db
=
self
.
db
table
=
db
[
tablename
]
query
=
table
.
synonyms
.
contains
(
value
)
query
=
self
.
db
[
tablename
].
synonyms
.
contains
(
value
)
if
db
(
query
).
count
()
==
1
:
return
True
...
...
@@ -316,6 +312,7 @@ class CheckAndFix(object):
Returns:
unicode:
target at least YYYY-MM
empty when procedure failed
"""
...
...
@@ -323,7 +320,7 @@ class CheckAndFix(object):
if
isinstance
(
record
,
RecordConf
):
opening
,
closing
=
self
.
_get_conference_dates
(
record
)
return
opening
.
strftime
(
"%Y-%m-%d"
)
val
=
opening
.
strftime
(
"%Y-%m-%d"
)
elif
isinstance
(
record
,
RecordThesis
):
val
=
record
.
these_defense
()
...
...
@@ -335,6 +332,10 @@ class CheckAndFix(object):
if
m_arxiv
:
val
=
"20%s-%s"
%
(
m_arxiv
.
group
(
1
),
m_arxiv
.
group
(
2
))
# last change use the creation date for the record
if
val
==
u
""
or
len
(
val
)
<
7
:
val
=
record
[
u
"creation_date"
][
0
:
7
]
return
val
@
staticmethod
...
...
@@ -538,17 +539,18 @@ class CheckAndFix(object):
* Replace U. by University
Args:
record (RecordThesis): record describing a thesis.
record (RecordThesis):
record describing a thesis.
"""
# protection
if
not
isinstance
(
record
,
RecordThesis
):
return
is_cppm
=
self
.
_get_reg_institute
().
find
(
"CPPM"
)
!=
-
1
values
=
record
[
u
"dissertation_note"
][
u
"university"
]
# CPPM
:
fix the name of Aix-Marseille university
if
is_cppm
:
# CPPM
--
fix the name of Aix-Marseille university
if
self
.
_get_reg_institute
().
find
(
"CPPM"
)
!=
-
1
:
year
=
REG_YEAR
.
search
(
record
.
these_defense
()).
group
(
1
)
if
int
(
year
)
<
2012
:
...
...
@@ -556,33 +558,14 @@ class CheckAndFix(object):
else
:
university
=
"Aix Marseille Université"
if
"502"
in
record
and
"b"
in
record
[
"502"
]:
if
isinstance
(
record
[
"502"
][
"b"
],
str
):
if
"Marseille"
in
record
[
"502"
][
"b"
]:
record
[
"502"
][
"b"
]
=
university
elif
isinstance
(
record
[
"502"
][
"b"
],
list
):
for
i
in
range
(
len
(
record
[
"502"
][
"b"
])):
if
"Marseille"
in
record
[
"502"
][
"b"
][
i
]:
record
[
"502"
][
"b"
][
i
]
=
university
values
=
(
university
if
"Marseille"
in
values
else
values
)
# Other
:
replace U. by University
# Other
--
replace U. by University
else
:
university
=
current
.
T
(
UNIVERSITY
,
lazy
=
False
)
if
"502"
in
record
and
"b"
in
record
[
"502"
]:
if
isinstance
(
record
[
"502"
][
"b"
],
str
):
value
=
record
[
"502"
][
"b"
]
if
"U."
in
value
:
value
=
value
.
replace
(
'U.'
,
university
)
record
[
"502"
][
"b"
]
=
value
elif
isinstance
(
record
[
"502"
][
"b"
],
list
):
for
i
in
range
(
len
(
record
[
"502"
][
"b"
])):
value
=
record
[
"502"
][
"b"
][
i
]
if
"U."
in
value
:
value
=
value
.
replace
(
'U.'
,
university
)
record
[
"502"
][
"b"
][
i
]
=
value
university
=
current
.
T
(
UNIVERSITY
).
decode
(
"utf8"
)
values
.
replace
(
'U.'
,
university
)
record
[
u
"dissertation_note"
][
u
"university"
]
=
values
def
get_my_authors
(
self
,
record
,
sep
=
", "
,
sort
=
False
):
"""Get authors of my institutes signing the record.
...
...
@@ -647,10 +630,12 @@ class CheckAndFix(object):
"""Check that the record described a thesis.
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
Raises:
CheckException: when the record does not describe a thesis.
CheckException:
the record does not describe a thesis.
"""
if
not
isinstance
(
record
,
RecordThesis
):
...
...
@@ -819,42 +804,6 @@ class CheckAndFix(object):
except
ToolException
as
e
:
raise
CheckException
(
*
e
.
args
)
@staticmethod
def recover_oai(record, host):
    """Rebuild the OAI identifier of a record when it is missing or malformed.

    Nothing is done when ``record.oai()`` returns a value matching
    ``REG_OAI``. Otherwise the identifier is built from ``OAI_INVENIO``
    with the host and ``record.id()``, and stored in the field / subfield
    associated to the host.

    Args:
        record (RecordPubli):
            record describing a publication.

        host (str):
            possible values are ``cds.cern.ch`` or ``inspirehep.net``

    Raises:
        ValueError:
            the host is not one of the two supported values.

    """
    # Note:
    # For the record cds 1951625, possible values are:
    # oai:cds.cern.ch:1951625 (if it does not exist in inspirehep)
    # oai:cds.cern.ch:1951625, oai:inspirehep.net:1319638 (if it exists
    # in both stores)
    # In all cases the first OAI corresponds to the record.id()
    #
    current_oai = record.oai()
    if current_oai is not None and REG_OAI.match(current_oai):
        # the identifier is already well formed
        return

    # field and subfield receiving the identifier, for each known host
    locations = (("cds.cern.ch", ("0248", "a")),
                 ("inspirehep.net", ("909CO", "o")))

    for known_host, location in locations:
        if host == known_host:
            field, subfield = location
            break
    else:
        raise ValueError(MSG_INVALID_HOST)

    if field not in record:
        record[field] = {}

    identifier = OAI_INVENIO % (host, record.id())
    record[field][subfield] = identifier
def
submitted
(
self
,
record
):
"""Standardise the submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.
Look for alternative when it is not defined.
...
...
@@ -875,11 +824,14 @@ class CheckAndFix(object):
date
=
record
.
submitted
()
# recover missing date using conference, preprint, thesis information
if
len
(
date
)
==
0
:
if
len
(
date
)
<
7
:
date
=
self
.
_recover_submitted
(
record
)
if
len
(
date
)
==
0
:
raise
CheckException
(
MSG_NO_DATE
)
elif
len
(
date
)
<
7
:
raise
CheckException
(
MSG_WELL_FORMED_DATE
)
# 22 Mar 2011
m
=
DECODE_DD_MMM_YYYY
.
match
(
date
)
if
m
:
...
...
@@ -892,16 +844,11 @@ class CheckAndFix(object):
data
=
(
m
.
group
(
3
),
int
(
m
.
group
(
2
)),
int
(
m
.
group
(
1
)))
date
=
'%s-%02i-%02i'
%
data
# 2011
m_year
=
DECODE_YYYY
.
match
(
date
)
if
m_year
:
date
=
self
.
_recover_submitted
(
record
)
# check the minimum requirement is 2001-05
if
not
REG_SUBMITTED
.
match
(
date
):
raise
CheckException
(
MSG_WELL_FORMED_DATE
)
if
u
"prepublication"
in
record
:
record
[
u
"prepublication"
][
u
"date"
]
=
date
record
[
u
"prepublication"
][
u
"date"
]
=
date
else
:
record
[
u
"prepublication"
]
=
{
u
"date"
:
date
}
@
staticmethod
def
temporary_record
(
record
):
...
...
tests/basis/test_12_CheckAndFix_talk.py
0 → 100644
View file @
f530752c
# -*- coding: utf-8 -*-
"""test_12_CheckAndFix_thesis
* Test CheckAndFix methods for thesis.
Use a thesis record stored in cds.cern.ch
- is_thesis
- submitted
- format_universities
- format_authors (already tested with article)
- get_my_authors (already tested with article)
"""
import
pytest
from
harvest_tools.checkandfix
import
CheckAndFix
from
invenio_tools
import
load_record
@pytest.fixture(scope="module")
def reccds():
    """Load record 1394605 from cds.cern.ch, once for the whole module.

    The tests of this module expect this record to describe a thesis.
    """
    record = load_record("cds.cern.ch", 1394605)
    return record
@pytest.fixture(scope="module")
def svc():
    """Build the ``CheckAndFix`` instance shared by the tests of the module."""
    checker = CheckAndFix()
    return checker
def test_is_thesis(svc, reccds):
    """``is_thesis`` returns ``None`` for the thesis record."""
    outcome = svc.is_thesis(reccds)
    assert outcome is None
def test_submitted(svc, reccds):
    """``submitted`` fills the missing submission date of the record.

    The record starts with a defense date limited to the year and an
    empty submission date. After the call the date reads ``2011-11``.
    """
    # state of the record before the fix
    defense = reccds.these_defense()
    assert defense == "2011"

    before = reccds.submitted()
    assert before == ""

    # the fix modifies the record in place
    svc.submitted(reccds)

    after = reccds.submitted()
    assert after == "2011-11"
def test_format_universities(svc, reccds):
    """``format_universities`` rewrites the university of a thesis record.

    Two records are checked: the one provided by the module fixture and
    record 1632177, loaded here from cds.cern.ch.
    """
    def university(record):
        # the name is stored in the dissertation note of the record
        return record[u"dissertation_note"][u"university"]

    # Khanji in 2011 (Université de la Méditerrannée)
    assert university(reccds) == "Marseille U., Luminy"

    svc.format_universities(reccds)
    expected = u"Université de la Méditerrannée Aix-Marseille II"
    assert university(reccds) == expected

    # Chen in 2013 (Aix Marseille Université)
    second = load_record("cds.cern.ch", 1632177)
    assert university(second) == u"Shandong U. & Marseille, CPPM"

    svc.format_universities(second)
    expected = u"Aix Marseille Université"
    assert university(second) == expected
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment