Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit c2623351 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Apply 2to3 converter to harvest_tools.

parent 52483d05
# -*- coding: utf-8 -*-
"""a collection of tools to search for publications in the invenio store
and to push them into the database.
"""
from base import (DRY_RUN,
MSG_CRASH,
MSG_FIX_ORIGIN,
MSG_IN_DB,
MSG_LOAD,
MSG_NO_ENTRY,
MSG_TOOMANY_SYNONYM,
family_name_fr,
search_synonym)
from .base import (DRY_RUN,
MSG_CRASH,
MSG_FIX_ORIGIN,
MSG_IN_DB,
MSG_LOAD,
MSG_NO_ENTRY,
MSG_TOOMANY_SYNONYM,
family_name_fr,
search_synonym)
from automaton import Automaton
from articles import Articles
from checkandfix import CheckAndFix, MONTHS
from exception import CheckException, ToolException
from factory import build_harvester_tool, get_harvester_tool
from msg import Msg
from msgcollection import MsgCollection
from notes import Notes
from preprints import Preprints
from proceedings import Proceedings
from reports import Reports
from talks import Talks
from thesis import Thesis
from .automaton import Automaton
from .articles import Articles
from .checkandfix import CheckAndFix, MONTHS
from .exception import CheckException, ToolException
from .factory import build_harvester_tool, get_harvester_tool
from .msg import Msg
from .msgcollection import MsgCollection
from .notes import Notes
from .preprints import Preprints
from .proceedings import Proceedings
from .reports import Reports
from .talks import Talks
from .thesis import Thesis
# -*- coding: utf-8 -*-
""" harvest_tools.articles
"""
import traceback
from automaton import Automaton
from base import (learn_my_authors,
MSG_CRASH,
MSG_FIX_ORIGIN,
MSG_IN_DB,
MSG_LOAD)
from checkandfix import CheckException
from .automaton import Automaton
from .base import (learn_my_authors,
MSG_CRASH,
MSG_FIX_ORIGIN,
MSG_IN_DB,
MSG_LOAD)
from .checkandfix import CheckException
from plugin_dbui import get_id, UNDEF_ID
......@@ -45,7 +44,7 @@ class Articles(Automaton):
return False
if self.dbg:
print "check article record"
print("check article record")
try:
self.check.clean_erratum(record)
......@@ -70,7 +69,7 @@ class Articles(Automaton):
except Exception as e:
self.logs[-1].reject(MSG_CRASH % e, record=record, translate=False)
print traceback.format_exc()
print((traceback.format_exc()))
return False
return True
......@@ -118,7 +117,7 @@ class Articles(Automaton):
"""
if self.dbg:
print "get existing article by fields"
print("get existing article by fields")
# alias
db = self.db
......@@ -218,7 +217,7 @@ class Articles(Automaton):
"""
if self.dbg:
print "check existing article by origin"
print("check existing article by origin")
# alias
db = self.db
......
# -*- coding: utf-8 -*-
""" harvest_tools.automaton
"""
......@@ -6,17 +5,17 @@ import re
import traceback
from base import (MSG_FIX_ORIGIN,
MSG_IN_DB,
search_synonym,
ToolException)
from checkandfix import CheckAndFix
from .base import (MSG_FIX_ORIGIN,
MSG_IN_DB,
search_synonym,
ToolException)
from .checkandfix import CheckAndFix
from gluon.storage import Storage
from invenio_tools import (InvenioStore,
Marc12,
OAI_URL)
from msg import Msg
from msgcollection import MsgCollection
from .invenio_tools import (InvenioStore,
Marc12,
OAI_URL)
from .msg import Msg
from .msgcollection import MsgCollection
from plugin_dbui import CALLBACK_ERRORS, get_id
......@@ -358,7 +357,7 @@ class Automaton(object):
"""
if self.dbg:
print "check record"
print("check record")
try:
self.check.recover_oai(record, self.harvester.host)
......@@ -407,7 +406,7 @@ class Automaton(object):
"""
if self.dbg:
print "get existing record by fields"
print("get existing record by fields")
# alias
db = self.db
......@@ -473,11 +472,11 @@ class Automaton(object):
"""
if self.dbg:
print "start processing", self.__class__.__name__
print "decode request"
print("start processing", self.__class__.__name__)
print("decode request")
if self.dbg:
print "get harvest parameters"
print("get harvest parameters")
# decode the XML request
self.collection_logs.append(MsgCollection(found=1))
......@@ -511,7 +510,7 @@ class Automaton(object):
"""
if self.dbg:
print "process URL search"
print("process URL search")
# extend harvester for logs
self.harvester.host = host
......@@ -557,25 +556,25 @@ class Automaton(object):
return
if self.dbg:
print "%i records found in %s" % (len(rec_ids), collection)
print("%i records found in %s" % (len(rec_ids), collection))
for rec_id in rec_ids:
if self.dbg:
print "\nprocessing record", rec_id
print("\nprocessing record", rec_id)
try:
db_id = is_record_in_db(title, host=host, rec_id=rec_id)
if db_id:
if self.dbg:
print "record in db", rec_id, "->", db_id
print("record in db", rec_id, "->", db_id)
continue
xml = store.get_record(rec_id)
decode_xml(xml)
except Exception as e:
print traceback.format_exc()
print(traceback.format_exc())
url = OAI_URL % (host, rec_id)
logs.append(Msg(harvester=self.harvester,
collection=title,
......@@ -592,7 +591,7 @@ class Automaton(object):
"""
if self.dbg:
print "process xml record"
print("process xml record")
# alias
db = self.db
......@@ -610,8 +609,8 @@ class Automaton(object):
for record in records:
if self.dbg:
print "record decoded"
print record.title().encode("utf-8")
print("record decoded")
print(record.title())
# reject the record using the secondary OAI
# required to cover the case:
......@@ -627,7 +626,7 @@ class Automaton(object):
db.publications[rec_id] = dict(origin=oai_url)
if self.dbg:
print "record in db (secondary oai) ->", rec_id
print("record in db (secondary oai) ->", rec_id)
continue
......@@ -641,17 +640,17 @@ class Automaton(object):
# repair non-conformity as far as possible
if not check_record(record):
if self.dbg:
print "record rejected", logs[-1].txt
print("record rejected", logs[-1].txt)
continue
if self.dbg:
print "insert record in the database"
print("insert record in the database")
# insert the record in the database
insert_record(record)
if self.dbg:
print logs[-1].action.upper(), logs[-1].txt
print(logs[-1].action.upper(), logs[-1].txt)
def report(self):
"""Build the processing report.
......
# -*- coding: utf-8 -*-
""" harvest_tools.base
"""
from exception import ToolException
from .exception import ToolException
from plugin_dbui import get_id, UNDEF_ID
......@@ -79,8 +78,7 @@ def learn_my_authors(db,
# NOTE2: handle the case where J. Foo and J. M. Foo are the same person
elems = []
for elem in diff:
if isinstance(elem, unicode):
elem = elem.encode('utf8')
if isinstance(elem, str):
family_name = elem[elem.rfind('. ') + 2:]
if family_name not in row.authors:
......
......@@ -5,18 +5,17 @@
import re
import regex
from base import search_synonym, ToolException
from exception import CheckException
from .base import search_synonym, ToolException
from .exception import CheckException
from gluon import current
from invenio_tools import (DECODE_REF,
MSG_NO_CONF,
MSG_NO_THESIS,
OAI_URL,
RecordConf,
RecordThesis,
REG_OAI,
REG_YEAR)
from itertools import imap
from .invenio_tools import (DECODE_REF,
MSG_NO_CONF,
MSG_NO_THESIS,
OAI_URL,
RecordConf,
RecordThesis,
REG_OAI,
REG_YEAR)
from plugin_dbui import CLEAN_SPACES, get_id
......@@ -27,21 +26,21 @@ DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")
DECODE_YYYY = re.compile(r"^(\d{4})$")
MONTHS = {u'Jan': '01',
u'Feb': '02',
u'Fev': '02',
u'Mar': '03',
u'Apr': '04',
u'Avr': '04',
u'May': '05',
u'Mai': '05',
u'Jun': '06',
u'Jul': '07',
u'Aug': '08',
u'Sep': '09',
u'Oct': '10',
u'Nov': '11',
u'Dec': '12'}
MONTHS = {"Jan": "01",
"Feb": "02",
"Fev": "02",
"Mar": "03",
"Apr": "04",
"Avr": "04",
"May": "05",
"Mai": "05",
"Jun": "06",
"Jul": "07",
"Aug": "08",
"Sep": "09",
"Oct": "10",
"Nov": "11",
"Dec": "12"}
MSG_INVALID_HOST = "Invalid host"
......@@ -232,19 +231,19 @@ class CheckAndFix(object):
record (RecordPubli): record describing a publication.
Returns:
unicode: empty when procedure failed
str: empty when procedure failed
"""
val = u''
val = ''
if isinstance(record, RecordConf):
# INSPIREHEP start date encoded as 2014-12-31
if "x" in record[u"111"]:
val = record[u"111"]["x"]
if "x" in record["111"]:
val = record["111"]["x"]
# CDS end date encoded as 20141231
elif "z" in record[u"111"]:
val = record[u"111"]["z"]
elif "z" in record["111"]:
val = record["111"]["z"]
val = "%s-%s-%s" % (val[0:4], val[4:6], val[6:8])
elif isinstance(record, RecordThesis):
......@@ -276,34 +275,34 @@ class CheckAndFix(object):
"""
# standard case
if isinstance(record[u"773"], dict):
if isinstance(record["773"], dict):
if "o" in record[u"773"]:
if "o" in record["773"]:
for reg in DECODE_REF:
m = reg.match(record[u"773"]["o"])
m = reg.match(record["773"]["o"])
if m:
record[u"773"]["p"] = m.group("p")
record[u"773"]["v"] = m.group("v")
record[u"773"]["y"] = m.group("y")
record[u"773"]["c"] = m.group("c")
record["773"]["p"] = m.group("p")
record["773"]["v"] = m.group("v")
record["773"]["y"] = m.group("y")
record["773"]["c"] = m.group("c")
return
raise CheckException(MSG_NO_REF)
# list case -- paper with erratum
elif isinstance(record[u"773"], list):
elif isinstance(record["773"], list):
for i in range(len(record[u"773"])):
for i in range(len(record["773"])):
if "o" in record[u"773"][i]:
if "o" in record["773"][i]:
fixed = False
for reg in DECODE_REF:
m = reg.match(record[u"773"][i]["o"])
m = reg.match(record["773"][i]["o"])
if m:
record[u"773"][i]["p"] = m.group("p")
record[u"773"][i]["v"] = m.group("v")
record[u"773"][i]["y"] = m.group("y")
record[u"773"][i]["c"] = m.group("c")
record["773"][i]["p"] = m.group("p")
record["773"][i]["v"] = m.group("v")
record["773"][i]["y"] = m.group("y")
record["773"][i]["c"] = m.group("c")
fixed = True
break
......@@ -331,7 +330,7 @@ class CheckAndFix(object):
if not record.is_authors():
raise CheckException(MSG_NO_AUTHOR)
if len(record[u"100"]) > 1:
if len(record["100"]) > 1:
raise CheckException(MSG_TO_MANY_FAUTHOR)
def clean_erratum(self, record):
......@@ -350,10 +349,10 @@ class CheckAndFix(object):
# use the simplest algorithm by selecting the first entry in the list
# fair to assume that the article is published first.
record[u"773"] = record[u"773"][0]
record["773"] = record["773"][0]
# treat year and submitted date
for k in (u"260", u"269"):
for k in ("260", "269"):
if k in record and isinstance(record[k], list):
record[k] = record[k][0]
......@@ -428,10 +427,10 @@ class CheckAndFix(object):
return
# cds.cern.ch
if not (u"111" in record and "d" in record[u"111"]):
if not ("111" in record and "d" in record["111"]):
raise CheckException(MSG_NO_CONF_DATE)
value = record[u"111"]["d"]
value = record["111"]["d"]
m = REG_CONF_DATES.match(value)
if not m:
......@@ -441,10 +440,10 @@ class CheckAndFix(object):
m2 = REG_CONF_DATES_2.match(value)
if m1:
record[u"111"]["d"] = "%s-%s %s %s" % m1.groups()
record["111"]["d"] = "%s-%s %s %s" % m1.groups()
elif m2:
record[u"111"]["d"] = "%s %s - %s %s %s" % m2.groups()
record["111"]["d"] = "%s %s - %s %s %s" % m2.groups()
else:
raise CheckException(MSG_WELL_FORMED_CONF_DATES)
......@@ -511,11 +510,11 @@ class CheckAndFix(object):
return
# standard case
if isinstance(record[u"773"], dict):
if "p" in record[u"773"] and "v" in record[u"773"]:
if isinstance(record["773"], dict):
if "p" in record["773"] and "v" in record["773"]:
editor = record[u"773"]["p"]
volume = record[u"773"]["v"]
editor = record["773"]["p"]
volume = record["773"]["v"]
# add space after the dot Phys.Rev -> Phys. Rev
editor = re.sub(r'\.([A-Z])', r'. \1', editor)
......@@ -529,14 +528,14 @@ class CheckAndFix(object):
# remove stupid mistake
editor = CLEAN_SPACES(editor)
record[u"773"]["p"] = editor
record[u"773"]["v"] = volume
record["773"]["p"] = editor
record["773"]["v"] = volume
# list case -- publication with erratum
elif isinstance(record[u"773"], list):
elif isinstance(record["773"], list):
editors = record._get(u"773", 'p', force_list=True)
volumes = record._get(u"773", 'v', force_list=True)
editors = record._get("773", 'p', force_list=True)
volumes = record._get("773", 'v', force_list=True)
if len(editors) != len(volumes):
raise CheckException(MSG_WELL_FORMED_EDITOR)
......@@ -553,8 +552,8 @@ class CheckAndFix(object):
editor = CLEAN_SPACES(editor)
record[u"773"][i]["p"] = editor
record[u"773"][i]["v"] = volume
record["773"][i]["p"] = editor
record["773"][i]["v"] = volume
def format_universities(self, record):
"""Format the name of the university for PhD:
......@@ -577,39 +576,39 @@ class CheckAndFix(object):
year = REG_YEAR.search(record.these_defense()).group(1)
if int(year) < 2012:
university = u"Université de la Méditerrannée Aix-Marseille II"
university = "Université de la Méditerrannée Aix-Marseille II"
else:
university = u"Aix Marseille Université"
university = "Aix Marseille Université"
if u'502' in record and "b" in record[u'502']:
if isinstance(record[u'502']['b'], unicode):
if "Marseille" in record[u'502']['b']:
record[u'502']['b'] = university
if "502" in record and "b" in record["502"]:
if isinstance(record["502"]["b"], str):
if "Marseille" in record["502"]["b"]:
record["502"]["b"] = university
elif isinstance(record[u'502']['b'], list):
for i in xrange(len(record[u'502']['b'])):
if "Marseille" in record[u'502']['b'][i]:
record[u'502']['b'][i] = university
elif isinstance(record["502"]["b"], list):
for i in range(len(record["502"]["b"])):
if "Marseille" in record["502"]["b"][i]:
record["502"]["b"][i] = university
# Other: replace U. by University
else:
university = current.T(UNIVERSITY).decode("utf8")
university = current.T(UNIVERSITY)
if u'502' in record and "b" in record[u'502']:
if isinstance(record[u'502']['b'], unicode):
value = record[u'502']['b']
if "502" in record and "b" in record["502"]:
if isinstance(record["502"]["b"], str):
value = record["502"]["b"]
if "U." in value:
value = value.replace('U.', university)
record[u'502']['b'] = value
record["502"]["b"] = value
elif isinstance(record[u'502']['b'], list):
for i in xrange(len(record[u'502']['b'])):
value = record[u'502']['b'][i]
elif isinstance(record["502"]["b"], list):
for i in range(len(record["502"]["b"])):
value = record["502"]["b"][i]
if "U." in value:
value = value.replace('U.', university)
record[u'502']['b'][i] = value
record["502"]["b"][i] = value
def get_my_authors(self, record, sep=u", ", sort=False):
def get_my_authors(self, record, sep=", ", sort=False):
"""Get authors of my institutes signing the record.
The information is append to the Record object via the attribute
``my_authors``.
......@@ -721,12 +720,12 @@ class CheckAndFix(object):
record.reformat_authors(fmt_rescue)
if sort:
authors = (record[u"700"][["last_name", "fmt_name"]]
authors = (record["700"][["last_name", "fmt_name"]]
.sort_values(by="last_name")
.fmt_name)
else:
authors = (record[u"700"].fmt_name
authors = (record["700"].fmt_name
.sort_index())
# go back to the origin formatting
......@@ -741,7 +740,7 @@ class CheckAndFix(object):
# cache the result for later use
self._my_authors[record.id()] = list(intersection)
return u""
return ""
def paper_reference(self, record):
"""Check that editor, page, volume and paper year are defined
......@@ -762,8 +761,8 @@ class CheckAndFix(object):
return
# list of reference (paper with erratum)
refs = record[u"773"]
if not isinstance(record[u"773"], list):
refs = record["773"]
if not isinstance(record["773"], list):