Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
e8154552
Commit
e8154552
authored
Jun 28, 2017
by
LE GAC Renaud
Browse files
Migrate CheckAndFix: methods required by the harvester Proceeding.
parent
56512435
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
189 additions
and
73 deletions
+189
-73
modules/harvest_tools/checkandfix.py
modules/harvest_tools/checkandfix.py
+117
-60
tests/basis/test_10_CheckAndFix_article.py
tests/basis/test_10_CheckAndFix_article.py
+1
-13
tests/basis/test_11_CheckAndFix_proceeding.py
tests/basis/test_11_CheckAndFix_proceeding.py
+71
-0
No files found.
modules/harvest_tools/checkandfix.py
View file @
e8154552
...
...
@@ -7,6 +7,7 @@ import re
import
regex
from
.base
import
search_synonym
,
ToolException
from
datetime
import
datetime
from
.exception
import
CheckException
from
gluon
import
current
from
invenio_tools
import
(
DECODE_REF
,
...
...
@@ -81,6 +82,11 @@ REG_DOI = re.compile(r"\d+\.\d+/([a-zA-Z]+)\.(\d+)\.(\w+)")
REG_SUBMITTED
=
re
.
compile
(
regex
.
REG_SUBMITTED
)
# Conference dates already in canonical form within a single month,
# e.g. "06 - 11 Dec 2010" (two-digit days, three-letter month, year).
# Raw strings are required: "\d" in a plain literal is an invalid escape.
# NOTE: used with .match(), so only the start of the value is anchored.
REG_WELL_FORMED_CONF_DATES_1 = re.compile(
    r"\d{2} - \d{2} [A-Z][a-z]{2} \d{4}"
)

# Conference dates already in canonical form spanning two months,
# e.g. "28 Feb - 01 Mar 2014".
REG_WELL_FORMED_CONF_DATES_2 = re.compile(
    r"\d{2} [A-Z][a-z]{2} - \d{2} [A-Z][a-z]{2} \d{4}"
)
UNIVERSITY
=
"University"
...
...
@@ -101,6 +107,74 @@ class CheckAndFix(object):
# private cache for my authors list
self
.
_my_authors
=
{}
@staticmethod
def _get_conference_dates(record):
    """Return the opening and closing dates of a conference.

    The dates are read from the ``meeting_name`` field of the record,
    using the ``opening_date`` / ``closing_date`` subfields when both are
    present and the free-text ``date`` subfield otherwise.

    Args:
        record (RecordConf):
            record describing a conference proceeding or talk.

    Returns:
        tuple of datetime.datetime:
            opening and closing dates, as returned by
            ``datetime.strptime`` (time part is midnight).

    Raises:
        ToolException:
            no conference date found: the ``meeting_name`` field is
            missing or empty, the ``date`` subfield is missing, or its
            value matches neither ``REG_CONF_DATES_1`` nor
            ``REG_CONF_DATES_2``.

    """
    if u"meeting_name" not in record:
        raise ToolException(MSG_NO_CONF_DATE)

    meeting = record[u"meeting_name"]
    if isinstance(meeting, list):
        # repeated field: use the first occurrence, an empty list means
        # that there is no conference information at all
        if not meeting:
            raise ToolException(MSG_NO_CONF_DATE)
        meeting = meeting[0]

    # CDS has the opening and closing dates encoded as 20141231
    if u"opening_date" in meeting and u"closing_date" in meeting:
        fmt = "%Y%m%d"
        opening = datetime.strptime(meeting[u"opening_date"], fmt)
        closing = datetime.strptime(meeting[u"closing_date"], fmt)
        return (opening, closing)

    # both CDS and INSPIRE have the date subfield
    if u"date" not in meeting:
        raise ToolException(MSG_NO_CONF_DATE)

    val = meeting[u"date"]

    # date is encoded as 12 - 15 Mar 2014
    # groups read below: (opening day, closing day, month, year)
    m = REG_CONF_DATES_1.match(val)
    if m:
        first = (m.group(1), m.group(3), m.group(4))
        last = (m.group(2), m.group(3), m.group(4))

    else:
        # dates are encoded as 29 Feb - 1 Mar 2014
        # groups read below: (opening day, opening month,
        #                     closing day, closing month, year)
        m = REG_CONF_DATES_2.match(val)
        if not m:
            raise ToolException(MSG_NO_CONF_DATE)

        first = (m.group(1), m.group(2), m.group(5))
        last = (m.group(3), m.group(4), m.group(5))

    # NOTE(review): %b is locale dependent, month abbreviations are
    # presumably English here -- confirm the process locale.
    fmt = "%d-%b-%Y"
    opening = datetime.strptime(u"%s-%s-%s" % first, fmt)
    closing = datetime.strptime(u"%s-%s-%s" % last, fmt)

    return (opening, closing)
def
_get_reg_institute
(
self
):
"""Get the regular expression defining the affiliation of my institute.
...
...
@@ -232,8 +306,7 @@ class CheckAndFix(object):
return
False
@
staticmethod
def
_recover_submitted
(
record
):
def
_recover_submitted
(
self
,
record
):
"""Recover submitted date using conference, preprint or thesis
information.
...
...
@@ -249,25 +322,8 @@ class CheckAndFix(object):
val
=
u
""
if
isinstance
(
record
,
RecordConf
):
# CDS opening date is encoded as 20141231
if
u
"opening_date"
in
record
[
u
"meeting_name"
]:
val
=
record
[
u
"meeting_name"
][
u
"opening_date"
]
val
=
"%s-%s-%s"
%
(
val
[
0
:
4
],
val
[
4
:
6
],
val
[
6
:
8
])
# CDS / INSPIREHEP date
# date is encoded as 12 - 15 Mar 2014 or 29 Feb - 1 Mar 2014
# decode as DD-MMM-YYYY
elif
u
"date"
in
record
[
u
"meeting_name"
]:
val
=
record
[
u
"meeting_name"
][
u
"date"
]
m1
=
REG_CONF_DATES_1
.
match
(
val
)
m2
=
REG_CONF_DATES_2
.
match
(
val
)
if
m1
:
val
=
u
"%s-%s-%s"
%
(
m1
.
group
(
1
),
m1
.
group
(
3
),
m1
.
group
(
4
))
elif
m2
:
val
=
u
"%s-%s-%s"
%
(
m1
.
group
(
1
),
m1
.
group
(
2
),
m1
.
group
(
5
))
opening
,
closing
=
self
.
_get_conference_dates
(
record
)
return
opening
.
strftime
(
"%Y-%m-%d"
)
elif
isinstance
(
record
,
RecordThesis
):
val
=
record
.
these_defense
()
...
...
@@ -316,10 +372,8 @@ class CheckAndFix(object):
if
not
val
:
return
db
=
self
.
db
try
:
search_synonym
(
db
.
collaborations
,
"collaboration"
,
val
)
search_synonym
(
self
.
db
.
collaborations
,
"collaboration"
,
val
)
except
ToolException
as
e
:
raise
CheckException
(
*
e
.
args
)
...
...
@@ -329,69 +383,70 @@ class CheckAndFix(object):
Have a look to the synonyms when the country does not exist.
Args:
record (RecordConf): record describing a talk or a proceeding.
record (RecordConf):
record describing a talk or a proceeding.
Raises:
CheckException:
when the country is not defined
nor entered as a synonym.
CheckException:
the country is not defined
nor entered as a synonym.
"""
if
not
isinstance
(
record
,
RecordConf
):
return
db
=
self
.
db
val
=
record
.
conference_country
()
try
:
search_synonym
(
db
.
countries
,
"country"
,
val
)
search_synonym
(
self
.
db
.
countries
,
"country"
,
val
)
except
ToolException
as
e
:
raise
CheckException
(
*
e
.
args
)
def
conference_date
(
self
,
record
,
host
):
"""Check conference date.
def
conference_date
(
self
,
record
):
"""Check conference date
and format it properly
.
Args:
record (RecordConf): record describing a talk or a proceeding.
host (str): possible values ares ``cds.cern.ch``
or ``inspirehep.net``
record (RecordConf):
record describing a talk or a proceeding.
Raises:
CheckException: when dates are not found or not well formed.
CheckException:
dates are not found.
"""
# conference information are available, i.e proceeding
if
not
isinstance
(
record
,
RecordConf
):
return
# inspirehep.net
if
host
==
"inspirehep.net"
:
value
=
record
.
conference_dates
()
if
len
(
value
)
==
0
:
raise
CheckException
(
MSG_NO_CONF_DATE
)
val
=
record
.
conference_dates
()
if
len
(
val
)
==
0
:
raise
CheckException
(
MSG_NO_CONF_DATE
)
# is it well formed
if
REG_WELL_FORMED_CONF_DATES_1
.
match
(
val
):
return
# cds.cern.ch
if
not
(
"111"
in
record
and
"d"
in
record
[
"111"
]):
raise
CheckException
(
MSG_NO_CONF_DATE
)
value
=
record
[
"111"
][
"d"
]
m
=
REG_CONF_DATES
.
match
(
value
)
if
not
m
:
# 12 - 15 Mar 2014 or 29 Feb - 1 Mar 2014
m1
=
REG_CONF_DATES_1
.
match
(
value
)
m2
=
REG_CONF_DATES_2
.
match
(
value
)
if
REG_WELL_FORMED_CONF_DATES_2
.
match
(
val
):
return
if
m1
:
record
[
"111"
][
"d"
]
=
"%s-%s %s %s"
%
m1
.
groups
(
)
# format the date properly
opening
,
closing
=
self
.
_get_conference_dates
(
record
)
elif
m2
:
record
[
"111"
][
"d"
]
=
"%s %s - %s %s %s"
%
m2
.
groups
()
if
opening
.
month
==
closing
.
month
:
val
=
"%02i - %02i %s %i"
%
(
opening
.
day
,
closing
.
day
,
opening
.
strftime
(
"%b"
),
opening
.
year
)
else
:
val
=
"%02i %s - %02i %s %i"
%
(
opening
.
day
,
opening
.
strftime
(
"%b"
),
closing
.
day
,
closing
.
strftime
(
"%b"
),
opening
.
year
)
else
:
raise
CheckException
(
MSG_WELL_FORMED_CONF_DATES
)
meeting
=
record
[
u
"meeting_name"
]
meeting
=
(
meeting
[
0
]
if
isinstance
(
meeting
,
list
)
else
meeting
)
meeting
[
u
"date"
]
=
val
def
is_bad_oai_used
(
self
,
record
):
"""Bad OAI is when the ``id`` in the OAI field is different from
...
...
@@ -576,10 +631,12 @@ class CheckAndFix(object):
"""Check that the record described a conference talk / proceeding.
Args:
record (RecordPubli): record describing a publication.
record (RecordPubli):
record describing a publication.
Raises:
CheckException: when the record is not associated to a conference.
CheckException:
the record is not associated to a conference.
"""
if
not
isinstance
(
record
,
RecordConf
):
...
...
tests/basis/test_10_CheckAndFix_article.py
View file @
e8154552
...
...
@@ -6,7 +6,6 @@
- publisher
- paper_reference
- submitted
- year
- format_author
- get_my_authors
...
...
@@ -16,7 +15,6 @@
"""
import
pytest
from
harvest_tools.checkandfix
import
CheckAndFix
from
harvest_tools.exception
import
ToolException
from
invenio_tools
import
load_record
...
...
@@ -71,23 +69,13 @@ def test_paper_reference(svc, reccds):
assert
reccds
.
paper_reference
()
==
paper_ref
# Paper is published but there are error in the paper reference
# Correct reference is Eur. Phys. J. C 75 (2015) 158
# But volume is not defined and pagination is wrong (75)
# It is not possible to recover it from the doi data.
record
=
load_record
(
"cds.cern.ch"
,
1753190
)
with
pytest
.
raises
(
ToolException
):
svc
.
paper_reference
(
record
)
def
test_submitted
(
svc
,
reccds
,
recins
):
assert
reccds
.
submitted
()
==
"19 Jan 2017"
assert
recins
.
submitted
()
==
"2017-01-19"
assert
reccds
.
submitted
()
==
"19 Jan 2017"
svc
.
submitted
(
reccds
)
reccds
.
submitted
()
assert
reccds
.
submitted
()
==
"2017-01-19"
# test the case 19 01 2017
...
...
tests/basis/test_11_CheckAndFix_proceeding.py
0 → 100644
View file @
e8154552
# -*- coding: utf-8 -*-
"""test_11_CheckAndFix_proceeding
* Test CheckAndFix methods for proceeding.
Use the same proceeding in cds.cern.ch and inspirehep.net
- is_conference
- country
- conference_date
- submitted
- format_editor (already tested with article)
- publisher (already tested with article)
- paper_reference (already tested with article)
- format_authors (already tested with article)
- get_my_authors (already tested with article)
"""
import
pytest
from
harvest_tools.checkandfix
import
CheckAndFix
from
invenio_tools
import
load_record
@pytest.fixture(scope="module")
def reccds():
    """Proceeding record 1411352 loaded from cds.cern.ch.

    The fixture has module scope, so the same record object is shared by
    all the tests of this module.

    """
    host, rec_id = "cds.cern.ch", 1411352
    return load_record(host, rec_id)
@pytest.fixture(scope="module")
def recins():
    """Proceeding record 1089237 loaded from inspirehep.net.

    The fixture has module scope, so the same record object is shared by
    all the tests of this module.

    """
    host, rec_id = "inspirehep.net", 1089237
    return load_record(host, rec_id)
@pytest.fixture(scope="module")
def svc():
    """CheckAndFix instance shared by all the tests of this module."""
    service = CheckAndFix()
    return service
def test_is_conference(svc, reccds, recins):
    """``is_conference`` returns None for the CDS and the INSPIRE records."""
    for record in (reccds, recins):
        assert svc.is_conference(record) is None
def test_country(svc, reccds, recins):
    """``country`` returns None for the CDS and the INSPIRE records."""
    for record in (reccds, recins):
        assert svc.country(record) is None
def test_conference_date(svc, reccds, recins):
    """``conference_date`` rewrites the dates of both records.

    Each record starts with its own raw encoding and must end up with
    the same value, ``06 - 11 Dec 2010``.

    """
    expected = "06 - 11 Dec 2010"

    # (record, dates before the fix), CDS first then INSPIRE
    cases = ((reccds, "6 - 11 Dec 2010"),
             (recins, "6-11 Dec 2010"))

    for record, raw in cases:
        assert record.conference_dates() == raw
        svc.conference_date(record)
        assert record.conference_dates() == expected
def test_submitted(svc, reccds, recins):
    """``submitted`` rewrites the submission date of both records.

    The CDS value ``05 Jan 2012`` becomes ``2012-01-05`` and the INSPIRE
    value ``2011`` becomes ``2010-12-06``.

    """
    # (record, value before the fix, value after the fix)
    cases = ((reccds, "05 Jan 2012", "2012-01-05"),
             (recins, "2011", "2010-12-06"))

    for record, before, after in cases:
        assert record.submitted() == before
        svc.submitted(record)
        assert record.submitted() == after
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment