Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
294ee8a8
Commit
294ee8a8
authored
Jun 28, 2017
by
LE GAC Renaud
Browse files
Migrate the harvester Article.
parent
db4817d1
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
208 additions
and
93 deletions
+208
-93
modules/harvest_tools/articles.py
modules/harvest_tools/articles.py
+69
-36
modules/harvest_tools/automaton.py
modules/harvest_tools/automaton.py
+3
-3
modules/harvest_tools/checkandfix.py
modules/harvest_tools/checkandfix.py
+8
-1
modules/invenio_tools/recordpubli.py
modules/invenio_tools/recordpubli.py
+5
-2
modules/test_tools.py
modules/test_tools.py
+30
-51
tests/exception/test_01_acl.py
tests/exception/test_01_acl.py
+31
-0
tests/scan/test_01_Article.py
tests/scan/test_01_Article.py
+62
-0
No files found.
modules/harvest_tools/articles.py
View file @
294ee8a8
...
...
@@ -33,10 +33,12 @@ class Articles(Automaton):
"""Check the content of the article in order to fix non-conformities.
Args:
record (RecordPubli): the MARC12 record describing the article.
record (RecordPubli):
the record describing the article.
Returns:
bool: ``False`` when a non conformity is found and
bool:
``False`` when a non conformity is found and
can not be corrected.
"""
...
...
@@ -47,7 +49,6 @@ class Articles(Automaton):
print
(
"check article record"
)
try
:
self
.
check
.
clean_erratum
(
record
)
if
not
record
.
is_published
():
self
.
logs
[
-
1
].
reject
(
MSG_NO_EDITOR
,
record
=
record
)
...
...
@@ -58,7 +59,6 @@ class Articles(Automaton):
self
.
check
.
paper_reference
(
record
)
self
.
check
.
submitted
(
record
)
self
.
check
.
year
(
record
)
self
.
check
.
format_authors
(
record
,
fmt
=
"F. Last"
)
self
.
check
.
get_my_authors
(
record
,
sort
=
True
)
...
...
@@ -88,32 +88,47 @@ class Articles(Automaton):
in the keyword arguments.
Note:
This method is required deal with an article entered by hand and
This method is required
to
deal with an article entered by hand and
found later by the harvester.
Args:
oai_url (
str): the oai_url, *e.g*
``http://cds.cern.ch/record/123456``.
The origin field
of the existing database record is updated
to **oai_url**
when a match is found.
oai_url (
unicode):
the oai_url, *e.g*
``http://cds.cern.ch/record/123456``.
The origin field
of the existing database record is updated
to **oai_url**
when a match is found.
year (str): the year of the publication. It is used
year (unicode):
the year of the publication. It is used
by the search algorithm and by the logger.
Keyword Args:
id_publisher (int): identifier of the publisher in the database.
my_authors (str): authors of my institute separated by a comma.
pages (str): the page reference.
publication_url (str): the URL of the publications
preprint_number (str): the preprint number
title (str): the title of the publication.
volume (str): the volume reference.
id_publisher (int):
identifier of the publisher in the database.
my_authors (unicode):
authors of my institute separated by a comma.
pages (unicode):
the page reference.
publication_url (unicode):
the URL of the publications
preprint_number (unicode):
the preprint number
title (unicode):
the title of the publication.
volume (unicode):
the volume reference.
Returns:
tuple: ``(id, status)`` which contains the ``id`` of the record.
It is equal to ``None`` when nothing is found.
The ``status`` is equal to one when the existing preprint was
modified into article, zero otherwise
tuple:
``(id, status)`` which contains the ``id`` of the record.
It is equal to ``None`` when nothing is found.
The ``status`` is equal to one when the existing preprint was
modified into article, zero otherwise
"""
if
self
.
dbg
:
...
...
@@ -194,26 +209,42 @@ class Articles(Automaton):
All the keyword arguments are needed by the transformation.
Args:
primary_oai_url (str): the *primary* OAI identifier of the
primary_oai_url (unicode):
the *primary* OAI identifier of the
record. It is used by the search algorithm.
year (str): the year of publication which is used
year (unicode):
the year of publication which is used
by the logger.
Keyword Args:
id_publisher (int): identifier of the publisher in the database.
my_authors (str): authors of my institute separated by a comma.
oai_url (str): the full oai_url(s) of the article.
pages (str): the page reference.
publication_url (str): the URL of the publications
title (str): the title of the publication.
volume (str): the volume reference.
id_publisher (int):
identifier of the publisher in the database.
my_authors (unicode):
authors of my institute separated by a comma.
oai_url (unicode):
the full oai_url(s) of the article.
pages (unicode):
the page reference.
publication_url (unicode):
the URL of the publications
title (unicode):
the title of the publication.
volume (unicode):
the volume reference.
Returns:
tuple: ``(id, status)`` which contains the ``id`` of the record.
It is equal to ``None`` when nothing is found.
The ``status`` is equal to one when the existing preprint was
modified into article, zero otherwise
tuple:
``(id, status)`` which contains the ``id`` of the record.
It is equal to ``None`` when nothing is found.
The ``status`` is equal to one when the existing preprint was
modified into article, zero otherwise
"""
if
self
.
dbg
:
...
...
@@ -262,10 +293,12 @@ class Articles(Automaton):
The method assumes that erratum are removed.
Args:
record (RecordPubli): the MARC12 record describing the article.
record (RecordPubli):
the record describing the article.
Returns:
int: one when the record is inserted / updated in the database,
int:
one when the record is inserted / updated in the database,
zero otherwise.
"""
...
...
modules/harvest_tools/automaton.py
View file @
294ee8a8
...
...
@@ -25,8 +25,7 @@ MSG_NO_CAT = 'Select a "category" !!!'
MSG_NO_PROJECT
=
'Select a "project" !!!'
MSG_NO_TEAM
=
'Select a "team" !!!'
MSG_NSERT_FAIL
=
"Fail to insert the new record in the database."
MSG_NO_OAI
=
"Reject no OAI identifier"
MSG_INSERT_FAIL
=
"Fail to insert the new record in the database."
MSG_WELL_FORM_OAI
=
"Reject OAI is not well formed"
# search collection when using inspirehep
...
...
@@ -181,7 +180,7 @@ class Automaton(object):
# operation can be rejected by callback table._before_insert
else
:
msg
=
MSG_NSERT_FAIL
msg
=
MSG_
I
NSERT_FAIL
if
CALLBACK_ERRORS
in
db
.
publications
:
msg
=
db
.
publications
.
_callback_errors
...
...
@@ -594,6 +593,7 @@ class Automaton(object):
record_id
=
rec_id
,
title
=
url
))
logs
[
-
1
].
reject
(
e
)
return
# start the log for the record
logs
.
append
(
Msg
(
harvester
=
harvester
,
...
...
modules/harvest_tools/checkandfix.py
View file @
294ee8a8
...
...
@@ -863,8 +863,15 @@ class CheckAndFix(object):
data
=
(
m
.
group
(
3
),
int
(
m
.
group
(
2
)),
int
(
m
.
group
(
1
)))
date
=
'%s-%02i-%02i'
%
data
# in some cases we have to deal with a list (see cds 2234042)
# in some cases it is not defined (e.g. phd thesis)
if
u
"prepublication"
in
record
:
record
[
u
"prepublication"
][
u
"date"
]
=
date
prepublication
=
record
[
u
"prepublication"
]
if
isinstance
(
prepublication
,
list
):
prepublication
[
0
][
u
"date"
]
=
date
else
:
prepublication
[
u
"date"
]
=
date
else
:
record
[
u
"prepublication"
]
=
{
u
"date"
:
date
}
...
...
modules/invenio_tools/recordpubli.py
View file @
294ee8a8
...
...
@@ -778,12 +778,15 @@ class RecordPubli(Record):
"""The date of submission.
Returns:
unicode:
unicode
or list
:
* formats are "YYYY-MM", "YYYY-MM-DD", "DD MMM YYYY", *etc.*
* Empty string when not defined.
"""
return
self
.
_get
(
u
"prepublication"
,
u
"date"
)
# in some cases there is more than one date (see cds 2234042)
# select the oldest one which should be the first one
val
=
self
.
_get
(
u
"prepublication"
,
u
"date"
)
return
(
val
[
0
]
if
isinstance
(
val
,
list
)
else
val
)
def
title
(
self
):
"""The title of the publication.
...
...
tests/harvest_tools/Automaton/test_single_harvester
.py
→
modules/test_tools
.py
View file @
294ee8a8
"""test_single_harvester
# -*- coding: utf-8 -*-
"""a collection of tools to help tests procedure.
* Build the complete list of messages
which can be generated during harvesters.
"""
from
gluon
import
current
import
pytest
from
harvest_tools.articles
import
(
Articles
,
MSG_NO_EDITOR
,
MSG_TRANSFORM_PREPRINT
)
from
harvest_tools.automaton
import
(
MSG_INSERT_FAIL
,
MSG_WELL_FORM_OAI
)
from
harvest_tools.base
import
(
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
...
...
@@ -21,6 +27,7 @@ from harvest_tools.checkandfix import (
MSG_NO_CONF_DATE
,
MSG_NO_DATE
,
MSG_NO_MY_AUTHOR
,
MSG_NO_OAI
,
MSG_NO_REF
,
MSG_NO_YEAR
,
MSG_TEMPORARY_RECORD
,
...
...
@@ -31,8 +38,6 @@ from harvest_tools.checkandfix import (
MSG_WELL_FORMED_DATE
,
MSG_WELL_FORMED_EDITOR
)
from
harvest_tools.factory
import
build_harvester_tool
from
harvest_tools.preprints
import
(
MSG_PREPRINT_IS_PAPER
,
MSG_PREPRINT_IS_CONFERENCE
,
...
...
@@ -41,10 +46,17 @@ from harvest_tools.preprints import (
from
harvest_tools.reports
import
MSG_REPORT_NO_NUMBER
from
harvest_tools.thesis
import
MSG_NO_THESIS
from
invenio_tools.base
import
MSG_NO_CONF
,
MSG_NO_PUBLISHER
from
invenio_tools.base
import
(
MSG_INV_CONF
,
MSG_INV_CONF_KEY
,
MSG_NO_CONF
,
MSG_NO_CONF_ID_KEY
,
MSG_NO_COUNTRY
,
MSG_NO_PUBLISHER
,
MSG_WELL_FORMED_COLLABORATION
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
messages
():
T
=
current
.
T
...
...
@@ -53,16 +65,24 @@ def messages():
T
(
MSG_TRANSFORM_PREPRINT
),
T
(
MSG_FIX_ORIGIN
),
T
(
MSG_IN_DB
),
T
(
MSG_INV_CONF
),
T
(
MSG_INV_CONF_KEY
),
T
(
MSG_INSERT_FAIL
),
T
(
MSG_LOAD
),
T
(
MSG_NO_CONF
),
T
(
MSG_NO_CONF_ID_KEY
),
T
(
MSG_NO_COUNTRY
),
T
(
MSG_NO_ENTRY
%
"collaborations"
),
T
(
MSG_NO_ENTRY
%
"countries"
),
T
(
MSG_NO_ENTRY
%
"publishers"
),
T
(
MSG_NO_OAI
),
T
(
MSG_TOOMANY_SYNONYM
),
T
(
MSG_NO_AUTHOR
),
T
(
MSG_NO_CONF
),
T
(
MSG_NO_CONF_DATE
),
T
(
MSG_NO_DATE
),
T
(
MSG_NO_MY_AUTHOR
),
T
(
MSG_NO_OAI
),
T
(
MSG_NO_PUBLISHER
),
T
(
MSG_NO_REF
),
T
(
MSG_NO_THESIS
),
...
...
@@ -76,51 +96,10 @@ def messages():
T
(
MSG_TO_MANY_DATE
),
T
(
MSG_TO_MANY_FAUTHOR
),
T
(
MSG_TO_MANY_YEAR
),
T
(
MSG_WELL_FORMED_COLLABORATION
),
T
(
MSG_WELL_FORMED_CONF_DATES
),
T
(
MSG_WELL_FORMED_DATE
),
T
(
MSG_WELL_FORMED_EDITOR
)}
T
(
MSG_WELL_FORMED_EDITOR
),
T
(
MSG_WELL_FORM_OAI
)}
return
set_msgs
def test_lhcb_acl(messages):
    """Harvest the LHCb articles of the current year from cds.cern.ch.

    The harvester is run in dry-run mode and only the texts found in
    its logs are compared with the expected messages.

    This test is useful to:

        * debug an harvester
        * profile its performance to see where the time is spent.
        * compare different implementations to measure improvements.
        * ...

    Args:
        messages:
            the collection of messages allowed in the logs.

    """
    # The selection below only matters when records are inserted in
    # the database.
    team = 7        # LHCb
    project = 8     # LHCb
    category = 2    # ACL

    # The current year is selected in order to meet different cases.
    harvester = build_harvester_tool(
        current.db,
        team,
        project,
        "articles",
        category,
        year_start=str(current.request.now.year),
        year_end="",
        dry_run=True,
        debug=False)

    assert isinstance(harvester, Articles)

    # run the harvester
    harvester.process_url("cds.cern.ch", "LHCb Papers")

    # The number of articles evolves within a year and cannot be
    # checked: only verify that no unexpected message shows up.
    found = set()
    for entry in harvester.logs:
        found.add(entry.txt)

    assert found.issubset(messages)
tests/exception/test_01_acl.py
0 → 100644
View file @
294ee8a8
# -*- coding: utf-8 -*-
"""test_01_acl
* collection of article with exception
"""
import
pytest
from
harvest_tools.checkandfix
import
CheckAndFix
from
invenio_tools
import
load_record
@pytest.fixture(scope="module")
def svc():
    """Build the ``CheckAndFix`` service shared by the module tests."""
    checker = CheckAndFix()
    return checker
def test_acl_cds2234042(svc):
    """Check the submission date of the CDS record 2234042.

    * The field ``prepublication`` is normally a dictionary.
    * For the publication 2234042 it is a list.
    * Protections were added in ``Record.submitted`` and in
      ``CheckAndFix.submitted`` to cope with that case.

    Args:
        svc (CheckAndFix):
            the module-scoped fixture applying the fix.

    """
    reccds = load_record("cds.cern.ch", 2234042)

    # date as returned by the record before any fix
    assert reccds.submitted() == "18 Nov 2016"

    # Use the injected fixture: the former ``svc = CheckAndFix()``
    # shadowed it and made the fixture argument useless.
    assert svc.submitted(reccds) is None

    # after the fix the record is expected to report an ISO-like date
    assert reccds.submitted() == "2016-11-18"
tests/scan/test_01_Article.py
0 → 100644
View file @
294ee8a8
# -*- coding: utf-8 -*-
"""test_01_Article
* Harvester is Article
* Store is cds.cern.ch
* LHCb ACL for the current year
* Check that all error messages are expected
"""
import
pytest
from
gluon
import
current
from
harvest_tools.articles
import
Articles
from
harvest_tools.factory
import
build_harvester_tool
from
test_tools
import
messages
@pytest.fixture(scope="module")
def harvester_messages():
    """Expose the messages built by ``test_tools.messages`` to the tests.

    NOTE(review): ``messages`` appears to be declared as a pytest fixture
    in ``test_tools``; recent pytest releases refuse direct calls to
    fixture functions -- confirm against the pytest version in use.

    """
    expected = messages()
    return expected
def test_lhcb_acl(harvester_messages):
    """Harvest the LHCb articles of the current year from cds.cern.ch.

    The harvester is run in dry-run mode with debugging enabled and only
    the texts found in its logs are compared with the expected messages.

    This test is useful to:

        * debug an harvester
        * profile its performance to see where the time is spent.
        * compare different implementations to measure improvements.
        * ...

    Args:
        harvester_messages:
            the collection of messages allowed in the logs.

    """
    # The selection below only matters when records are inserted in
    # the database.
    team = 7        # LHCb
    project = 8     # LHCb
    category = 2    # ACL

    # The current year is selected in order to meet different cases.
    harvester = build_harvester_tool(
        current.db,
        team,
        project,
        "articles",
        category,
        year_start=str(current.request.now.year),
        year_end="",
        dry_run=True,
        debug=True)

    assert isinstance(harvester, Articles)

    # run the harvester
    harvester.process_url("cds.cern.ch", "LHCb Papers")

    # The number of articles evolves within a year and cannot be
    # checked: only verify that no unexpected message shows up.
    found = set()
    for entry in harvester.logs:
        found.add(entry.txt)

    assert found.issubset(harvester_messages)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment