Commit fae5d9e3 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Update to remove duplicate entries in harvester logs

parent 659afc2c
...@@ -9,6 +9,7 @@ from gluon import current ...@@ -9,6 +9,7 @@ from gluon import current
from gluon.restricted import RestrictedError from gluon.restricted import RestrictedError
from harvest_tools import (build_harvester_tool, from harvest_tools import (build_harvester_tool,
DRY_RUN, DRY_RUN,
filter_logs,
get_rex_institute, get_rex_institute,
MsgCollection) MsgCollection)
from plugin_dbui import (inline_alert, from plugin_dbui import (inline_alert,
...@@ -457,6 +458,9 @@ def run(): ...@@ -457,6 +458,9 @@ def run():
logger.info("-"*79) logger.info("-"*79)
# filter logs to remove duplicated entries
logs = filter_logs(logs)
# delegate rendering to the report view # delegate rendering to the report view
response.view = "harvest/layout.%s" % request.extension response.view = "harvest/layout.%s" % request.extension
return dict(collection_logs=collection_logs, return dict(collection_logs=collection_logs,
...@@ -559,6 +563,9 @@ def run_all(): ...@@ -559,6 +563,9 @@ def run_all():
logger.info("-"*79) logger.info("-"*79)
# filter logs to remove duplicated entries
logs = filter_logs(logs)
# tune selector parameters used in the report title # tune selector parameters used in the report title
if query is None: if query is None:
selector.id_projects = None selector.id_projects = None
......
...@@ -3,6 +3,7 @@ and to push them in the database. ...@@ -3,6 +3,7 @@ and to push them in the database.
""" """
from .base import (DRY_RUN, from .base import (DRY_RUN,
filter_logs,
MSG_CRASH, MSG_CRASH,
MSG_FIX_ORIGIN, MSG_FIX_ORIGIN,
MSG_IN_DB, MSG_IN_DB,
......
...@@ -478,7 +478,7 @@ class Automaton(object): ...@@ -478,7 +478,7 @@ class Automaton(object):
# start the log for the record # start the log for the record
logs.append(Msg(harvester=harvester, logs.append(Msg(harvester=harvester,
collection=collection_logs[-1].title, collection=collection_logs[-1].title,
origin=record.oai(), oais=record.oai(),
record_id=record.id(), record_id=record.id(),
title=record.title())) title=record.title()))
...@@ -537,7 +537,7 @@ class Automaton(object): ...@@ -537,7 +537,7 @@ class Automaton(object):
url = OAI_URL % (harvester.host, rec_id) url = OAI_URL % (harvester.host, rec_id)
logs.append(Msg(harvester=harvester, logs.append(Msg(harvester=harvester,
collection=collection_logs[-1].title, collection=collection_logs[-1].title,
origin=OAI % (harvester.host, rec_id), oais=OAI % (harvester.host, rec_id),
record_id=rec_id, record_id=rec_id,
title=url)) title=url))
logs[-1].reject(e) logs[-1].reject(e)
......
""" harvest_tools.base """ harvest_tools.base
""" """
import pandas as pd
import re
DRY_RUN = "dry run" DRY_RUN = "dry run"
MSG_CRASH = "Crash: %s" MSG_CRASH = "Crash: %s"
...@@ -9,6 +12,8 @@ MSG_IN_DB = "Already in the database" ...@@ -9,6 +12,8 @@ MSG_IN_DB = "Already in the database"
MSG_IS = "Reject publication is a {}" MSG_IS = "Reject publication is a {}"
MSG_LOAD = "Load in the database" MSG_LOAD = "Load in the database"
REX_OAI_CDS = re.compile(r"oai:cds")
T4 = " "*4 T4 = " "*4
T6 = " "*6 T6 = " "*6
...@@ -28,6 +33,60 @@ def family_name_fr(full_name): ...@@ -28,6 +33,60 @@ def family_name_fr(full_name):
return full_name[full_name.find(' ') + 1:] return full_name[full_name.find(' ') + 1:]
def order_oais(oais):
    """Order an OAI string so that the cds identifier comes first.

    The result is meant to be used as a comparison key, therefore a string
    holding two identifiers is always rebuilt as ``"first, second"``
    whatever the spacing of the input.

    Args:
        oais (str):
            record identifier(s) in the store(s), separated by a comma.
            Can be None.

    Returns:
        str:
            * an empty string when ``oais`` is None;
            * ``oais`` unchanged when it does not contain exactly one comma;
            * otherwise the two identifiers, stripped and joined by ", ",
              swapped when the first one does not start with "oai:cds".

    """
    if oais is None:
        return ""

    if oais.count(",") != 1:
        return oais

    first, second = (el.strip() for el in oais.split(","))

    # The prefix is tested on the stripped identifier (same anchored test as
    # the "oai:cds" pattern) so that stray whitespace cannot defeat it, and
    # the string is rebuilt in both branches so that "a,b" and "a, b" give
    # the same key.
    if first.startswith("oai:cds"):
        return f"{first}, {second}"

    return f"{second}, {first}"
def filter_logs(logs):
    """Filter on OAI to remove duplicated entries.

    Note:
        * Entries can be duplicated when the user harvests several stores.
        * Among entries sharing the same ordered OAI string, the one kept is
          the last after sorting on the 3-letter tag of the first OAI,
          which prefers an entry tagged "ins" over one tagged "cds".
        * The returned list follows that sort order, not the input order.

    Args:
        logs (list):
            list of messages (Msg). Each one is read through its
            "oais" key.

    Returns:
        list:
            the messages left once duplicates are removed.
            Empty when ``logs`` is empty.

    """
    # an empty list gives a DataFrame without the "oais" column and the
    # column accesses below would raise AttributeError
    if not logs:
        return []

    data = [{"oais": msg["oais"]} for msg in logs]
    df = pd.DataFrame(data)

    # tag primary OAI with the 3 characters following "oai:" (cds, ins, ...)
    # expand=False returns a Series, which is what a single column expects
    df["first_oai"] = df.oais.str.extract(r"oai:(\w{3})", expand=False)

    # update oais to order them as cds, ins
    # NOTE(review): order_oais maps None to "", so entries without OAI all
    # share the same key and only one of them survives -- confirm intended.
    df["oais"] = df.oais.apply(order_oais)

    # flag duplicates, keeping the last of each group in the sorted frame
    duplicated = (df
                  .sort_values(["first_oai", "oais"])
                  .oais
                  .duplicated(keep="last"))

    # index labels are the positions in logs (default RangeIndex)
    return [logs[idx] for idx, is_dup in duplicated.items() if not is_dup]
def learn_my_authors(db, def learn_my_authors(db,
authors=None, authors=None,
id_project=None, id_project=None,
......
...@@ -29,7 +29,7 @@ class Msg(Storage): ...@@ -29,7 +29,7 @@ class Msg(Storage):
harvester (gluon.dal.Row): harvester (gluon.dal.Row):
the database harvester used to scan the store. the database harvester used to scan the store.
origin (str): oais (str):
identify store(s) housing the publication identify store(s) housing the publication
record_id (int): record_id (int):
...@@ -43,7 +43,7 @@ class Msg(Storage): ...@@ -43,7 +43,7 @@ class Msg(Storage):
def __init__(self, def __init__(self,
collection=None, collection=None,
harvester=None, harvester=None,
origin=None, oais=None,
record_id=None, record_id=None,
title=None): title=None):
...@@ -55,7 +55,7 @@ class Msg(Storage): ...@@ -55,7 +55,7 @@ class Msg(Storage):
else: else:
self.harvester = json.dumps(harvester.as_dict()) self.harvester = json.dumps(harvester.as_dict())
self.origin = origin self.oais = oais
self.record_id = record_id self.record_id = record_id
self.synonym = None self.synonym = None
self.title = title self.title = title
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment