Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit 739e4103 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Move the class CheckAndFix from invenio_tools to harvest_tools.

parent 4bd73009
......@@ -7,14 +7,14 @@ import traceback
from gluon import current
from gluon.restricted import RestrictedError
from harvest_tools import (build_harvester_tool,
CheckAndFix,
CheckException,
DRY_RUN,
format_author_fr,
family_name_fr,
search_synonym,
ToolException)
from invenio_tools import (CheckAndFix,
CheckException,
load_record,
from invenio_tools import (load_record,
OAI_URL,
RecordConf,
RecordThesis)
......
......@@ -6,11 +6,12 @@ and to push them in the database.
from base import (DRY_RUN,
family_name_fr,
format_author_fr,
search_synonym,
ToolException)
search_synonym)
from automaton import Automaton
from articles import Articles
from checkandfix import CheckAndFix
from exception import CheckException, ToolException
from factory import build_harvester_tool, get_harvester_tool
from msg import Msg
from msgcollection import MsgCollection
......
......@@ -13,7 +13,7 @@ from base import (family_name_fr,
MSG_FIX_ORIGIN,
MSG_IN_DB,
MSG_LOAD)
from invenio_tools import CheckException
from checkandfix import CheckException
from plugin_dbui import get_id, UNDEF_ID
......
......@@ -10,9 +10,9 @@ from base import (MSG_FIX_ORIGIN,
MSG_IN_DB,
search_synonym,
ToolException)
from checkandfix import CheckAndFix
from gluon.storage import Storage
from invenio_tools import (CheckAndFix,
InvenioStore,
from invenio_tools import (InvenioStore,
Marc12,
OAI_URL)
from msg import Msg
......
......@@ -2,6 +2,7 @@
""" harvest_tools.base
"""
from exception import ToolException
from invenio_tools import REG_AUTHOR
from plugin_dbui import get_id, UNDEF_ID
......@@ -207,7 +208,3 @@ def search_synonym(table, fieldname, value, create=False):
else:
msg = MSG_TOOMANY_SYNONYM % table._tablename
raise ToolException(msg)
class ToolException(Exception):
    """Error raised by the harvest tools helpers.

    It carries a plain message, e.g. the one built by ``search_synonym``
    when too many synonyms are found.

    """
# -*- coding: utf-8 -*-
""" invenio_tools.checkandfix
""" harvest_tools.checkandfix
"""
import re
import regex
from base import (MSG_NO_CONF,
MSG_NO_COUNTRY,
MSG_NO_PUBLISHER,
MSG_WELL_FORMED_COLLABORATION,
MSG_NO_THESIS,
OAI_URL,
REG_AUTHOR,
REG_OAI,
REG_YEAR)
from exception import CheckException
from invenio_tools import (load_record,
MSG_NO_COUNTRY,
MSG_NO_CONF,
MSG_NO_PUBLISHER,
MSG_WELL_FORMED_COLLABORATION,
MSG_NO_THESIS,
OAI_URL,
RecordConf,
RecordThesis,
REG_AUTHOR,
REG_OAI,
REG_YEAR)
from filters import CLEAN_REVIEW
from gluon import current
from inveniostore import InvenioStore
from marc12 import Marc12
from plugin_dbui import get_id
from recordconf import RecordConf
from recordthesis import RecordThesis
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")
......@@ -88,25 +87,6 @@ REG_SUBMITTED = re.compile(regex.REG_SUBMITTED)
UNIVERSITY = "University"
def load_record(host, record_id):
    """Fetch a single record from an invenio store and decode it.

    Args:
        host (unicode): host of the store.
            Possible values are ``cds.cern.ch`` or ``inspirehep.net``.
        record_id (integer): the record identifier in the store

    Returns:
        Record: the decoded record.

    """
    # ask the store for the XML of the requested record
    raw_xml = InvenioStore(host).get_record(record_id)

    # decode the XML and keep the first entry of the result
    decoded = Marc12()(raw_xml)
    return decoded[0]
class CheckAndFix(object):
"""A collection of tools to check and repair the content
of the Marc12 record.
......
# -*- coding: utf-8 -*-
""" harvest_tools.exception
"""
from invenio_tools import ExceptionUTF8
class CheckException(ExceptionUTF8):
    """Error type of the record checks.

    It adds nothing to ``ExceptionUTF8``; it only gives callers a
    dedicated type to catch.

    """
class ToolException(ExceptionUTF8):
    """Error type of the harvest tools.

    It adds nothing to ``ExceptionUTF8``; it only gives callers a
    dedicated type to catch.

    """
......@@ -7,7 +7,7 @@ import traceback
from automaton import Automaton
from base import family_name_fr, format_author_fr, MSG_CRASH, MSG_LOAD
from invenio_tools import CheckException
from checkandfix import CheckException
from plugin_dbui import UNDEF_ID
......
......@@ -7,7 +7,8 @@ import traceback
from automaton import Automaton
from base import family_name_fr, format_author_fr, MSG_CRASH, MSG_LOAD
from invenio_tools import CheckException, RecordConf, RecordThesis
from checkandfix import CheckException
from invenio_tools import RecordConf, RecordThesis
from plugin_dbui import UNDEF_ID
......
......@@ -7,7 +7,7 @@ import traceback
from automaton import Automaton
from base import family_name_fr, format_author_fr, MSG_CRASH, MSG_LOAD
from invenio_tools import CheckException
from checkandfix import CheckException
from plugin_dbui import UNDEF_ID
......
......@@ -7,7 +7,7 @@ import traceback
from automaton import Automaton
from base import family_name_fr, format_author_fr, MSG_CRASH, MSG_LOAD
from invenio_tools import CheckException
from checkandfix import CheckException
from plugin_dbui import get_id, UNDEF_ID, UNKNOWN
......
......@@ -7,7 +7,7 @@ import traceback
from automaton import Automaton
from base import family_name_fr, format_author_fr, MSG_CRASH, MSG_LOAD
from invenio_tools import CheckException
from checkandfix import CheckException
from plugin_dbui import UNDEF_ID
......
......@@ -8,7 +8,8 @@ import traceback
from automaton import Automaton
from base import family_name_fr, format_author_fr, MSG_CRASH, MSG_LOAD
from invenio_tools import CheckException, RecordThesis
from checkandfix import CheckException
from invenio_tools import RecordThesis
from plugin_dbui import get_id, UNDEF_ID
......
......@@ -8,8 +8,10 @@ from base import (ARXIV,
is_conference,
is_institute,
is_thesis,
MSG_NO_CONF,
MSG_NO_COUNTRY,
MSG_NO_PUBLISHER,
MSG_NO_THESIS,
MSG_WELL_FORMED_COLLABORATION,
OAI_URL,
REG_ARXIV_NUMBER,
......@@ -19,12 +21,11 @@ from base import (ARXIV,
THESIS_DIR)
from exception import (CdsException,
CheckException,
ExceptionUTF8,
Marc12Exception,
RecordException,
XmlException)
from checkandfix import CheckAndFix, load_record
from inveniostore import InvenioStore
from iterrecord import IterRecord
from marc12 import Marc12
......@@ -33,3 +34,22 @@ from recordconf import RecordConf
from recordinst import RecordInst
from recordpubli import RecordPubli
from recordthesis import RecordThesis
def load_record(host, record_id):
    """Fetch a single record from an invenio store and decode it.

    Args:
        host (unicode): host of the store.
            Possible values are ``cds.cern.ch`` or ``inspirehep.net``.
        record_id (integer): the record identifier in the store

    Returns:
        Record: the decoded record.

    """
    # ask the store for the XML of the requested record
    record_xml = InvenioStore(host).get_record(record_id)

    # the decoder returns a sequence; only its first entry is returned
    marc12_decoder = Marc12()
    return marc12_decoder(record_xml)[0]
......@@ -20,10 +20,6 @@ class CdsException(ExceptionUTF8):
pass
class CheckException(ExceptionUTF8):
    """Error type of the record checks.

    It adds nothing to ``ExceptionUTF8``; it only gives callers a
    dedicated type to catch.

    """
class Marc12Exception(ExceptionUTF8):
    """Error type related to the Marc12 decoding.

    It adds nothing to ``ExceptionUTF8``; it only gives callers a
    dedicated type to catch.

    """
......
# -*- coding: utf-8 -*-
""" NAME
fix-conference-url
SYNOPSIS
fix the publications field conference_url
DESCRIPTION
Check the field conference_url in the invenio store and update it.
From time to time, it has been forgotten.
OPTIONS
-h, --help
Display the help and exit.
EXAMPLE
> cd ...track_publications/scripts
> ./track_publications fix-conference-url
AUTHOR
R. Le Gac -- Dec 2014
"""
"""
# Script body: for publications without a conference_url, read the record
# from the invenio store given by their origin and store the URL found there.
# NOTE(review): this text is a rendered diff: indentation is lost and some
# statements appear twice (old and new version of the same line).
if __name__ == "__main__":
import re
import sys
from argparse import ArgumentParser, FileType
# NOTE(review): the next line is the old import; the two lines after it are
# its replacement, where CheckAndFix and CheckException come from harvest_tools.
from invenio_tools import CheckAndFix, CheckException, InvenioStore, Marc12
from harvest_tools import CheckAndFix, CheckException
from invenio_tools import InvenioStore, Marc12
# matches an origin URL; the two groups capture the store host and the record id
REG_ORIGIN = re.compile("http://([a-z\.]+)/record/(\d+)")
# command line options
# (no option is declared: only the default -h/--help is available)
parser = ArgumentParser()
args = parser.parse_args()
# unlock the publications update when the status is OK
# NOTE(review): db and INHIBIT_PUBLICATION_UPDATE_ON_OK are not defined in
# this script; presumably the web2py environment provides them -- confirm.
db.publications._before_update.remove(INHIBIT_PUBLICATION_UPDATE_ON_OK)
# service
check = CheckAndFix()
decode = Marc12()
# scan the publications table
# keep the rows with a non-empty origin, an empty conference_url and a
# category id equal to 7 or 9
# NOTE(review): the category filter is listed twice; and-ing the same
# condition a second time does not change the selection.
query = db.publications.origin.len() > 0
query &= db.publications.conference_url.len() == 0
query &= (db.publications.id_categories == 7) | (db.publications.id_categories == 9)
query &= (db.publications.id_categories == 7) | (db.publications.id_categories == 9)
for row in db(query).select():
# skip the rows whose origin is not a record URL
m = REG_ORIGIN.match(row.origin)
if not m:
continue
host, store_id = m.groups()
# retrieve the full record from the store
store = InvenioStore(host)
xml = store.get_record(store_id)
record = decode(xml)[0]
# run the conference check on the record; a CheckException is ignored
try:
check.conference(record)
except CheckException, e:
pass
# update the database only when the record provides a conference URL
val = record.conference_url()
if val:
print " - %s, conference url: %s" % (row.id, val)
# NOTE(review): the update is listed twice (old and new version of the line)
db(db.publications.id==row.id).update(conference_url=val)
db(db.publications.id==row.id).update(conference_url=val)
db.commit()
# close
sys.exit(0)
# -*- coding: utf-8 -*-
""" NAME
fix-page-volume
SYNOPSIS
fix the publications page and volume.
DESCRIPTION
In September 2014, the pages and volume information were
exchanged when decoding the marc12 data (commit 8280655).
This script fixes this bug.
OPTIONS
-h, --help
Display the help and exit.
EXAMPLE
> cd ...track_publications/scripts
> ./track_publications fix-page-volume
AUTHOR
R. Le Gac -- Dec 2014
"""
"""
if __name__ == "__main__":
import re
import sys
from argparse import ArgumentParser, FileType
from invenio_tools import CheckAndFix, InvenioStore, Marc12
from harvest_tools import CheckAndFix
from invenio_tools import InvenioStore, Marc12
REG_ORIGIN = re.compile("http://([a-z\.]+)/record/(\d+)")
# command line options
parser = ArgumentParser()
args = parser.parse_args()
# unlock the publications update when the status is OK
db.publications._before_update.remove(INHIBIT_PUBLICATION_UPDATE_ON_OK)
# service
decode = Marc12()
check = CheckAndFix()
# the ACL
# the ACL
query = db.publications.id_categories == 2
# scan the publications table
# scan the publications table
for row in db(query).select():
if not isinstance(row.origin, (str, unicode)):
print " - Invalid origin", row.origin
continue
m = REG_ORIGIN.match(row.origin)
if not m:
continue
host, store_id = m.groups()
# retrieve the full record from the store
store = InvenioStore(host)
xml = store.get_record(store_id)
record = decode(xml)[0]
# record might be deleted and replaced by a new one
if "980" in record and "c" in record["980"] and record["980"]["c"] == "DELETED":
if "970" in record and "d" in record["970"]:
......@@ -79,30 +82,30 @@ if __name__ == "__main__":
check.clean_erratum(record)
check.paper_reference(record)
check.format_editor(record)
pages = record.paper_pages()
volume = record.paper_volume()
if row.pages == pages and row.volume == volume:
continue
t1 = (db.publishers[row.id_publishers].abbreviation,
row.volume,
row.year,
t1 = (db.publishers[row.id_publishers].abbreviation,
row.volume,
row.year,
row.pages)
t2 = (record.paper_editor(),
record.paper_volume(),
record.paper_year(),
t2 = (record.paper_editor(),
record.paper_volume(),
record.paper_year(),
record.paper_pages())
status = db.status[row.id_status].code
print " - %s %s (%s) %s" % t1, "--> %s %s (%s) %s" % t2, status, row.origin
rep = raw_input(" - Fix it [y/N]:")
if rep.lower() == "y":
db(db.publications.id==row.id).update(pages=pages, volume=volume)
db(db.publications.id==row.id).update(pages=pages, volume=volume)
db.commit()
# close
sys.exit(0)
\ No newline at end of file
# -*- coding: utf-8 -*-
""" NAME
fix-publication-url
SYNOPSIS
fix the publications field publication_url
DESCRIPTION
The field publication_url is the URL of the pdf file.
This definition has been re-enforced in track_publications 0.8.8.
The script checks this field and tries to fix it.
OPTIONS
-h, --help
Display the help and exit.
EXAMPLE
> cd ...track_publications/scripts
> ./track_publications fix-paper-url
AUTHOR
R. Le Gac -- Dec 2014
"""
def get_record(host, record_id):
    """Retrieve the record, record_id, from the store and decode it.

    Args:
        host: host of the store, passed to ``InvenioStore``.
        record_id: identifier of the record in the store.

    Returns:
        The first decoded record, or ``None`` when a
        ``Marc12Exception`` is raised.

    """
    decode = Marc12()
    store = InvenioStore(host)

    try:
        # use the argument: the former code read ``store_id``, a name which
        # is not bound in this function, and ignored ``record_id``
        xml = store.get_record(record_id)
        record = decode(xml)[0]

    except Marc12Exception:
        record = None

    return record
def update(row, val):
    """Update the publication_url field for the record row.

    The change is printed before being applied. It is not committed here.

    Args:
        row: row of the publications table; ``row.id`` and
            ``row.publication_url`` are read.
        val: new value for the ``publication_url`` field.

    """
    # parenthesised single-argument form: same output with the Python 2
    # print statement, and still valid where print is a function
    print(u" - %s, %s → '%s'" % (row.id, row.publication_url, val))
    db(db.publications.id == row.id).update(publication_url=val)
if __name__ == "__main__":
import os
import re
import sys
from argparse import ArgumentParser, FileType
from invenio_tools import (CheckAndFix,
CheckException,
InvenioStore,
from harvest_tools import CheckAndFix, CheckException
from invenio_tools import (InvenioStore,
Marc12,
Marc12Exception)
REG_ARXIV = re.compile("http://[a-z\.]*arxiv.org/abs/(?:arXiv:)?\d+\.\d+")
REG_INDICO = re.compile("https?://indico")
REG_IOP = re.compile("http://iopscience.iop.org/(\d+-\d+)/(\d+)/(\d+)/([A-Z]?\d+)/?")
REG_ORIGIN = re.compile("https?://([a-z\.]+)/record/(\d+)")
REG_TEL = re.compile("(http://tel.archives-ouvertes.fr/tel-\d+)(?:/fr/)?")
# command line options
parser = ArgumentParser()
args = parser.parse_args()