Commit 294ee8a8 authored by LE GAC Renaud
Browse files

Migrate the harvester Article.

parent db4817d1
......@@ -33,10 +33,12 @@ class Articles(Automaton):
"""Check the content of the article in order to fix non-conformities.
Args:
record (RecordPubli): the MARC12 record describing the article.
record (RecordPubli):
the record describing the article.
Returns:
bool: ``False`` when a non conformity is found and
bool:
``False`` when a non conformity is found and
can not be corrected.
"""
......@@ -47,7 +49,6 @@ class Articles(Automaton):
print("check article record")
try:
self.check.clean_erratum(record)
if not record.is_published():
self.logs[-1].reject(MSG_NO_EDITOR, record=record)
......@@ -58,7 +59,6 @@ class Articles(Automaton):
self.check.paper_reference(record)
self.check.submitted(record)
self.check.year(record)
self.check.format_authors(record, fmt="F. Last")
self.check.get_my_authors(record, sort=True)
......@@ -88,32 +88,47 @@ class Articles(Automaton):
in the keyword arguments.
Note:
This method is required deal with an article entered by hand and
This method is required to deal with an article entered by hand and
found later by the harvester.
Args:
oai_url (str): the oai_url, *e.g*
``http://cds.cern.ch/record/123456``. The origin field
of the existing database record is update to **oai_url**
when a match is found.
oai_url (unicode):
the oai_url, *e.g* ``http://cds.cern.ch/record/123456``.
The origin field of the existing database record is updated
to **oai_url** when a match is found.
year (str): the year of the publication. It is used
year (unicode):
the year of the publication. It is used
by the search algorithm and by the logger.
Keyword Args:
id_publisher (int): identifier of the publisher in the database.
my_authors (str): authors of my institute separated by a comma.
pages (str): the page reference.
publication_url (str): the URL of the publications
preprint_number (str): the preprint number
title (str): the title of the publication.
volume (str): the volume reference.
id_publisher (int):
identifier of the publisher in the database.
my_authors (unicode):
authors of my institute separated by a comma.
pages (unicode):
the page reference.
publication_url (unicode):
the URL of the publications
preprint_number (unicode):
the preprint number
title (unicode):
the title of the publication.
volume (unicode):
the volume reference.
Returns:
tuple: ``(id, status)`` which contains the ``id`` of the record.
It is equal to ``None`` when nothing is found.
The ``status`` is equal to one when the existing preprint was
modified into article, zero otherwise
tuple:
``(id, status)`` which contains the ``id`` of the record.
It is equal to ``None`` when nothing is found.
The ``status`` is equal to one when the existing preprint was
modified into article, zero otherwise
"""
if self.dbg:
......@@ -194,26 +209,42 @@ class Articles(Automaton):
All the keyword arguments are needed by the transformation.
Args:
primary_oai_url (str): the *primary* OAI identifier of the
primary_oai_url (unicode):
the *primary* OAI identifier of the
record. It is used by the search algorithm.
year (str): the year of publication which is used
year (unicode):
the year of publication which is used
by the logger.
Keyword Args:
id_publisher (int): identifier of the publisher in the database.
my_authors (str): authors of my institute separated by a comma.
oai_url (str): the full oai_url(s) of the article.
pages (str): the page reference.
publication_url (str): the URL of the publications
title (str): the title of the publication.
volume (str): the volume reference.
id_publisher (int):
identifier of the publisher in the database.
my_authors (unicode):
authors of my institute separated by a comma.
oai_url (unicode):
the full oai_url(s) of the article.
pages (unicode):
the page reference.
publication_url (unicode):
the URL of the publications
title (unicode):
the title of the publication.
volume (unicode):
the volume reference.
Returns:
tuple: ``(id, status)`` which contains the ``id`` of the record.
It is equal to ``None`` when nothing is found.
The ``status`` is equal to one when the existing preprint was
modified into article, zero otherwise
tuple:
``(id, status)`` which contains the ``id`` of the record.
It is equal to ``None`` when nothing is found.
The ``status`` is equal to one when the existing preprint was
modified into article, zero otherwise
"""
if self.dbg:
......@@ -262,10 +293,12 @@ class Articles(Automaton):
The method assumes that erratum are removed.
Args:
record (RecordPubli): the MARC12 record describing the article.
record (RecordPubli):
the record describing the article.
Returns:
int: one when the record is inserted / updated in the database,
int:
one when the record is inserted / updated in the database,
zero otherwise.
"""
......
......@@ -25,8 +25,7 @@ MSG_NO_CAT = 'Select a "category" !!!'
MSG_NO_PROJECT = 'Select a "project" !!!'
MSG_NO_TEAM = 'Select a "team" !!!'
MSG_NSERT_FAIL = "Fail to insert the new record in the database."
MSG_NO_OAI = "Reject no OAI identifier"
MSG_INSERT_FAIL = "Fail to insert the new record in the database."
MSG_WELL_FORM_OAI = "Reject OAI is not well formed"
# search collection when using inspirehep
......@@ -181,7 +180,7 @@ class Automaton(object):
# operation can be reject by callback table._before_insert
else:
msg = MSG_NSERT_FAIL
msg = MSG_INSERT_FAIL
if CALLBACK_ERRORS in db.publications:
msg = db.publications._callback_errors
......@@ -594,6 +593,7 @@ class Automaton(object):
record_id=rec_id,
title=url))
logs[-1].reject(e)
return
# start the log for the record
logs.append(Msg(harvester=harvester,
......
......@@ -863,8 +863,15 @@ class CheckAndFix(object):
data = (m.group(3), int(m.group(2)), int(m.group(1)))
date = '%s-%02i-%02i' % data
# in some case we have to deal with a list (see cds 2234042)
# in some case it is not defined (e.g. phd thesis)
if u"prepublication" in record:
record[u"prepublication"][u"date"] = date
prepublication = record[u"prepublication"]
if isinstance(prepublication, list):
prepublication[0][u"date"] = date
else:
prepublication[u"date"] = date
else:
record[u"prepublication"] = {u"date": date}
......
......@@ -778,12 +778,15 @@ class RecordPubli(Record):
"""The date of submission.
Returns:
unicode:
unicode or list:
* formats are "YYYY-MM", "YYYY-MM-DD", "DD MMM YYYY", *etc.*
* Empty string when not defined.
"""
return self._get(u"prepublication", u"date")
# in some case there is more than one date (see cds 2234042)
# select the oldest one which should be the first one
val = self._get(u"prepublication", u"date")
return (val[0] if isinstance(val, list) else val)
def title(self):
"""The title of the publication.
......
"""test_single_harvester
# -*- coding: utf-8 -*-
"""a collection of tools to help tests procedure.
* Build the complete list of messages
which can be generated during harvesters.
"""
from gluon import current
import pytest
from harvest_tools.articles import (
Articles,
MSG_NO_EDITOR,
MSG_TRANSFORM_PREPRINT)
from harvest_tools.automaton import (
MSG_INSERT_FAIL,
MSG_WELL_FORM_OAI)
from harvest_tools.base import (
MSG_FIX_ORIGIN,
MSG_IN_DB,
......@@ -21,6 +27,7 @@ from harvest_tools.checkandfix import (
MSG_NO_CONF_DATE,
MSG_NO_DATE,
MSG_NO_MY_AUTHOR,
MSG_NO_OAI,
MSG_NO_REF,
MSG_NO_YEAR,
MSG_TEMPORARY_RECORD,
......@@ -31,8 +38,6 @@ from harvest_tools.checkandfix import (
MSG_WELL_FORMED_DATE,
MSG_WELL_FORMED_EDITOR)
from harvest_tools.factory import build_harvester_tool
from harvest_tools.preprints import (
MSG_PREPRINT_IS_PAPER,
MSG_PREPRINT_IS_CONFERENCE,
......@@ -41,10 +46,17 @@ from harvest_tools.preprints import (
from harvest_tools.reports import MSG_REPORT_NO_NUMBER
from harvest_tools.thesis import MSG_NO_THESIS
from invenio_tools.base import MSG_NO_CONF, MSG_NO_PUBLISHER
from invenio_tools.base import (
MSG_INV_CONF,
MSG_INV_CONF_KEY,
MSG_NO_CONF,
MSG_NO_CONF_ID_KEY,
MSG_NO_COUNTRY,
MSG_NO_PUBLISHER,
MSG_WELL_FORMED_COLLABORATION)
@pytest.fixture(scope="module")
def messages():
T = current.T
......@@ -53,16 +65,24 @@ def messages():
T(MSG_TRANSFORM_PREPRINT),
T(MSG_FIX_ORIGIN),
T(MSG_IN_DB),
T(MSG_INV_CONF),
T(MSG_INV_CONF_KEY),
T(MSG_INSERT_FAIL),
T(MSG_LOAD),
T(MSG_NO_CONF),
T(MSG_NO_CONF_ID_KEY),
T(MSG_NO_COUNTRY),
T(MSG_NO_ENTRY % "collaborations"),
T(MSG_NO_ENTRY % "countries"),
T(MSG_NO_ENTRY % "publishers"),
T(MSG_NO_OAI),
T(MSG_TOOMANY_SYNONYM),
T(MSG_NO_AUTHOR),
T(MSG_NO_CONF),
T(MSG_NO_CONF_DATE),
T(MSG_NO_DATE),
T(MSG_NO_MY_AUTHOR),
T(MSG_NO_OAI),
T(MSG_NO_PUBLISHER),
T(MSG_NO_REF),
T(MSG_NO_THESIS),
......@@ -76,51 +96,10 @@ def messages():
T(MSG_TO_MANY_DATE),
T(MSG_TO_MANY_FAUTHOR),
T(MSG_TO_MANY_YEAR),
T(MSG_WELL_FORMED_COLLABORATION),
T(MSG_WELL_FORMED_CONF_DATES),
T(MSG_WELL_FORMED_DATE),
T(MSG_WELL_FORMED_EDITOR)}
T(MSG_WELL_FORMED_EDITOR),
T(MSG_WELL_FORM_OAI)}
return set_msgs
def test_lhcb_acl(messages):
    """Run the article harvester on the LHCb papers of the current year.

    Besides being a regression test, this function is handy to:

        * debug a harvester;
        * profile it and see where the time is spent;
        * compare implementations and measure improvements.

    Args:
        messages (set):
            the translated messages a harvester is allowed to log,
            as supplied by the ``messages`` fixture.

    """
    database = current.db

    # The identifiers below only matter when records are inserted in the
    # database. The current year is selected to exercise different cases.
    lhcb_team = 7
    lhcb_project = 8
    acl_category = 2
    current_year = current.request.now.year

    harvester = build_harvester_tool(
        database,
        lhcb_team,
        lhcb_project,
        "articles",
        acl_category,
        year_start=str(current_year),
        year_end="",
        dry_run=True,
        debug=False)

    assert isinstance(harvester, Articles)

    harvester.process_url("cds.cern.ch", "LHCb Papers")

    # The number of articles evolves within a year, so it cannot be checked.
    # Only verify that every logged message belongs to the expected set.
    found = {entry.txt for entry in harvester.logs}
    assert found.issubset(messages)
# -*- coding: utf-8 -*-
"""test_01_acl
* collection of article with exception
"""
import pytest
from harvest_tools.checkandfix import CheckAndFix
from invenio_tools import load_record
@pytest.fixture(scope="module")
def svc():
    """Provide one ``CheckAndFix`` instance shared by the tests of the module."""
    checker = CheckAndFix()
    return checker
def test_acl_cds2234042(svc):
    """Check the handling of a ``prepublication`` field stored as a list.

    * The field ``prepublication`` is usually a dictionary.
    * For the CDS publication 2234042 it is a list.
    * Both ``RecordPubli.submitted`` and ``CheckAndFix.submitted`` have
      to cope with that case.

    Args:
        svc (CheckAndFix):
            the checker supplied by the module-scoped ``svc`` fixture.

    """
    reccds = load_record("cds.cern.ch", 2234042)

    # before the fix, the date is exposed as found in the store
    assert reccds.submitted() == "18 Nov 2016"

    # Use the injected fixture: the original re-created a CheckAndFix here,
    # which shadowed the ``svc`` argument and left the fixture unused.
    assert svc.submitted(reccds) is None

    # after the fix, the date is expressed as YYYY-MM-DD
    assert reccds.submitted() == "2016-11-18"
# -*- coding: utf-8 -*-
"""test_01_Article
* Harvester is Article
* Store is cds.cern.ch
* LHCb ACL for the current year
* Check that all error messages are expected
"""
import pytest
from gluon import current
from harvest_tools.articles import Articles
from harvest_tools.factory import build_harvester_tool
from test_tools import messages
@pytest.fixture(scope="module")
def harvester_messages(messages):
    """Expose the set of messages which a harvester is allowed to log.

    The ``messages`` fixture is imported from ``test_tools`` at module
    level, which registers it in this test module. It is requested here
    as a fixture argument rather than called as a plain function, since
    pytest rejects direct calls to fixture functions (deprecated in 3.7,
    an error since 4.0).

    Args:
        messages (set):
            the value of the ``messages`` fixture.

    Returns:
        set:
            the value of the ``messages`` fixture, unchanged.

    """
    return messages
def test_lhcb_acl(harvester_messages):
    """Harvest the LHCb articles of the current year from cds.cern.ch.

    The test is also a convenient entry point to:

        * debug a harvester;
        * profile its performance to locate where the time is spent;
        * compare different implementations to measure improvements.

    Args:
        harvester_messages (set):
            the messages which the harvester may legitimately log.

    """
    store = current.db

    # These identifiers only make sense when records are inserted in the
    # database. The current year is used in order to meet different cases.
    team_id = 7         # LHCb
    project_id = 8      # LHCb
    category_id = 2     # ACL
    this_year = current.request.now.year

    # build the harvester
    articles_tool = build_harvester_tool(
        store,
        team_id,
        project_id,
        "articles",
        category_id,
        year_start=str(this_year),
        year_end="",
        dry_run=True,
        debug=True)

    assert isinstance(articles_tool, Articles)

    # run the harvester
    articles_tool.process_url("cds.cern.ch", "LHCb Papers")

    # The number of articles cannot be checked since it evolves within a
    # year. Only test that there are no unexpected messages in the log.
    logged = {log.txt for log in articles_tool.logs}
    assert logged.issubset(harvester_messages)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment