Commit fae5d9e3 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Update to remove duplicate entries in harvester logs

parent 659afc2c
......@@ -9,6 +9,7 @@ from gluon import current
from gluon.restricted import RestrictedError
from harvest_tools import (build_harvester_tool,
DRY_RUN,
filter_logs,
get_rex_institute,
MsgCollection)
from plugin_dbui import (inline_alert,
......@@ -457,6 +458,9 @@ def run():
logger.info("-"*79)
# filter logs to remove duplicated entries
logs = filter_logs(logs)
# delegate rendering to the report view
response.view = "harvest/layout.%s" % request.extension
return dict(collection_logs=collection_logs,
......@@ -559,6 +563,9 @@ def run_all():
logger.info("-"*79)
# filter logs to remove duplicated entries
logs = filter_logs(logs)
# tune selector parameters used in the report title
if query is None:
selector.id_projects = None
......
......@@ -3,6 +3,7 @@ and to push them in the database.
"""
from .base import (DRY_RUN,
filter_logs,
MSG_CRASH,
MSG_FIX_ORIGIN,
MSG_IN_DB,
......
......@@ -478,7 +478,7 @@ class Automaton(object):
# start the log for the record
logs.append(Msg(harvester=harvester,
collection=collection_logs[-1].title,
origin=record.oai(),
oais=record.oai(),
record_id=record.id(),
title=record.title()))
......@@ -537,7 +537,7 @@ class Automaton(object):
url = OAI_URL % (harvester.host, rec_id)
logs.append(Msg(harvester=harvester,
collection=collection_logs[-1].title,
origin=OAI % (harvester.host, rec_id),
oais=OAI % (harvester.host, rec_id),
record_id=rec_id,
title=url))
logs[-1].reject(e)
......
""" harvest_tools.base
"""
import pandas as pd
import re
DRY_RUN = "dry run"
MSG_CRASH = "Crash: %s"
......@@ -9,6 +12,8 @@ MSG_IN_DB = "Already in the database"
MSG_IS = "Reject publication is a {}"
MSG_LOAD = "Load in the database"
REX_OAI_CDS = re.compile(r"oai:cds")
T4 = " "*4
T6 = " "*6
......@@ -28,6 +33,60 @@ def family_name_fr(full_name):
return full_name[full_name.find(' ') + 1:]
def order_oais(oais):
    """Normalise a two-store OAIS string so that cds comes before inspirehep.

    Args:
        oais (str):
            record identifier in stores

    Returns:
        str
    """
    if oais is None:
        return ""

    # leave the string untouched when it holds a single identifier,
    # more than two identifiers, or already starts with the cds one
    if oais.count(",") != 1 or oais.startswith("oai:cds"):
        return oais

    first, second = (part.strip() for part in oais.split(","))
    return f"{second}, {first}"
def filter_logs(logs):
    """Filter on OAI to remove duplicated entries.

    Note:
        * Entries can be duplicated when user harvest several stores.
        * Prefer entries from inspirehep

    Args:
        logs (list):
            list of message (Msg).

    Returns:
        list
    """
    # guard: an empty DataFrame has no "oais" column and the
    # attribute accesses below would raise AttributeError
    if not logs:
        return []

    data = [{"oais": dct["oais"]} for dct in logs]
    df = pd.DataFrame(data)

    # tag primary OAI as cds or ins
    df["first_oai"] = df.oais.str.extract(r"oai:(\w{3})", expand=True)

    # update oais to order identifiers as cds, ins so that duplicates
    # harvested from different stores compare equal
    df["oais"] = df.oais.apply(order_oais)

    # filter preserving inspirehep: after sorting, the "ins" entry of a
    # duplicated pair comes last and is the one kept (keep="last")
    fltr = (df
            .sort_values(["first_oai", "oais"])
            .oais
            .duplicated(keep="last"))

    # NOTE: use truthiness, not "is False" -- the series holds numpy
    # booleans which are never identical to the Python False singleton
    return [logs[idx] for idx, duplicated in fltr.items() if not duplicated]
def learn_my_authors(db,
authors=None,
id_project=None,
......
......@@ -29,7 +29,7 @@ class Msg(Storage):
harvester (gluon.dal.Row):
the database harvester used to scan the store.
origin (str):
oais (str):
identify store(s) housing the publication
record_id (int):
......@@ -43,7 +43,7 @@ class Msg(Storage):
def __init__(self,
collection=None,
harvester=None,
origin=None,
oais=None,
record_id=None,
title=None):
......@@ -55,7 +55,7 @@ class Msg(Storage):
else:
self.harvester = json.dumps(harvester.as_dict())
self.origin = origin
self.oais = oais
self.record_id = record_id
self.synonym = None
self.title = title
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment