Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
fae5d9e3
Commit
fae5d9e3
authored
Jan 19, 2021
by
LE GAC Renaud
Browse files
Update to remove duplicate entries in harvester logs
parent
659afc2c
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
72 additions
and
5 deletions
+72
-5
controllers/harvest.py
controllers/harvest.py
+7
-0
modules/harvest_tools/__init__.py
modules/harvest_tools/__init__.py
+1
-0
modules/harvest_tools/automaton.py
modules/harvest_tools/automaton.py
+2
-2
modules/harvest_tools/base.py
modules/harvest_tools/base.py
+59
-0
modules/harvest_tools/msg.py
modules/harvest_tools/msg.py
+3
-3
No files found.
controllers/harvest.py
View file @
fae5d9e3
...
...
@@ -9,6 +9,7 @@ from gluon import current
from
gluon.restricted
import
RestrictedError
from
harvest_tools
import
(
build_harvester_tool
,
DRY_RUN
,
filter_logs
,
get_rex_institute
,
MsgCollection
)
from
plugin_dbui
import
(
inline_alert
,
...
...
@@ -457,6 +458,9 @@ def run():
logger
.
info
(
"-"
*
79
)
# filter logs to remove duplicated entries
logs
=
filter_logs
(
logs
)
# delegate rendering to the report view
response
.
view
=
"harvest/layout.%s"
%
request
.
extension
return
dict
(
collection_logs
=
collection_logs
,
...
...
@@ -559,6 +563,9 @@ def run_all():
logger
.
info
(
"-"
*
79
)
# filter logs to remove duplicated entries
logs
=
filter_logs
(
logs
)
# tune selector parameters used in the report title
if
query
is
None
:
selector
.
id_projects
=
None
...
...
modules/harvest_tools/__init__.py
View file @
fae5d9e3
...
...
@@ -3,6 +3,7 @@ and to push them in the database.
"""
from
.base
import
(
DRY_RUN
,
filter_logs
,
MSG_CRASH
,
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
...
...
modules/harvest_tools/automaton.py
View file @
fae5d9e3
...
...
@@ -478,7 +478,7 @@ class Automaton(object):
# start the log for the record
logs
.
append
(
Msg
(
harvester
=
harvester
,
collection
=
collection_logs
[
-
1
].
title
,
o
rigin
=
record
.
oai
(),
o
ais
=
record
.
oai
(),
record_id
=
record
.
id
(),
title
=
record
.
title
()))
...
...
@@ -537,7 +537,7 @@ class Automaton(object):
url
=
OAI_URL
%
(
harvester
.
host
,
rec_id
)
logs
.
append
(
Msg
(
harvester
=
harvester
,
collection
=
collection_logs
[
-
1
].
title
,
o
rigin
=
OAI
%
(
harvester
.
host
,
rec_id
),
o
ais
=
OAI
%
(
harvester
.
host
,
rec_id
),
record_id
=
rec_id
,
title
=
url
))
logs
[
-
1
].
reject
(
e
)
...
...
modules/harvest_tools/base.py
View file @
fae5d9e3
""" harvest_tools.base
"""
import
pandas
as
pd
import
re
DRY_RUN
=
"dry run"
MSG_CRASH
=
"Crash: %s"
...
...
@@ -9,6 +12,8 @@ MSG_IN_DB = "Already in the database"
MSG_IS
=
"Reject publication is a {}"
MSG_LOAD
=
"Load in the database"
REX_OAI_CDS
=
re
.
compile
(
r
"oai:cds"
)
T4
=
" "
*
4
T6
=
" "
*
6
...
...
@@ -28,6 +33,60 @@ def family_name_fr(full_name):
return
full_name
[
full_name
.
find
(
' '
)
+
1
:]
def order_oais(oais):
    """Normalise an OAI string so that the CDS identifier comes first.

    The result is meant to be used as a comparison key: two strings
    naming the same pair of identifiers yield the same value whatever
    the order of the identifiers or the spacing around the comma.

    Args:
        oais (str):
            record identifier(s) in the store(s). When a record lives in
            two stores the identifiers are separated by a comma.

    Returns:
        str:
            * an empty string when ``oais`` is ``None``
            * ``oais`` unchanged when it does not contain exactly
              two identifiers
            * otherwise both identifiers, stripped and joined with
              ``", "``, the one starting with ``oai:cds`` first

    """
    if oais is None:
        return ""

    parts = [el.strip() for el in oais.split(",")]
    if len(parts) != 2:
        return oais

    first, second = parts

    # Test the stripped identifier so that leading white spaces do not
    # hide a CDS identifier which is already in first position.
    if not first.startswith("oai:cds"):
        first, second = second, first

    # Always rebuild the string: both input orders then share one format.
    return f"{first}, {second}"
def filter_logs(logs):
    """Filter on OAI to remove duplicated entries.

    Note:
        * Entries can be duplicated when the user harvests several stores.
        * When two entries share the same identifiers, the one whose
          first OAI sorts last is kept, which prefers inspirehep ("ins")
          over cds.
        * Entries without OAI cannot be compared and are always kept.
        * The returned entries follow the sort order used to find the
          duplicates, not the order of ``logs``.

    Args:
        logs (list):
            list of message (Msg). Each message is read through
            its ``"oais"`` key.

    Returns:
        list:
            the messages of ``logs`` without the duplicated entries.

    """
    # An empty frame has no "oais" column, nothing to filter anyway.
    if not logs:
        return []

    df = pd.DataFrame({"oais": [msg["oais"] for msg in logs]})

    # tag primary OAI as cds or ins (three characters following "oai:")
    df["first_oai"] = df.oais.str.extract(r"oai:(\w{3})", expand=False)

    # normalise the identifiers in order to compare them as cds, ins
    df["oais"] = df.oais.apply(order_oais)

    # sort in such a way that the inspirehep entry is the last of its group
    ordered = df.sort_values(["first_oai", "oais"]).oais

    # missing OAI are normalised to "", never treat them as duplicates
    is_duplicate = ordered.duplicated(keep="last") & (ordered != "")

    # the index of the frame is the position of the message in logs
    return [logs[idx] for idx, dup in is_duplicate.items() if not dup]
def
learn_my_authors
(
db
,
authors
=
None
,
id_project
=
None
,
...
...
modules/harvest_tools/msg.py
View file @
fae5d9e3
...
...
@@ -29,7 +29,7 @@ class Msg(Storage):
harvester (gluon.dal.Row):
the database harvester used to scan the store.
o
rigin
(str):
o
ais
(str):
identify store(s) housing the publication
record_id (int):
...
...
@@ -43,7 +43,7 @@ class Msg(Storage):
def
__init__
(
self
,
collection
=
None
,
harvester
=
None
,
o
rigin
=
None
,
o
ais
=
None
,
record_id
=
None
,
title
=
None
):
...
...
@@ -55,7 +55,7 @@ class Msg(Storage):
else
:
self
.
harvester
=
json
.
dumps
(
harvester
.
as_dict
())
self
.
o
rigin
=
origin
self
.
o
ais
=
oais
self
.
record_id
=
record_id
self
.
synonym
=
None
self
.
title
=
title
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment