Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
a3173b67
Commit
a3173b67
authored
Jan 14, 2021
by
LE GAC Renaud
Browse files
Major upgrade to migrate check and fix methods in RecordHepPubli
parent
6cae4419
Changes
19
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
622 additions
and
166 deletions
+622
-166
modules/harvest_tools/__init__.py
modules/harvest_tools/__init__.py
+2
-4
modules/harvest_tools/articles.py
modules/harvest_tools/articles.py
+29
-21
modules/harvest_tools/automaton.py
modules/harvest_tools/automaton.py
+9
-37
modules/harvest_tools/base.py
modules/harvest_tools/base.py
+37
-63
modules/harvest_tools/checkandfix.py
modules/harvest_tools/checkandfix.py
+5
-4
modules/harvest_tools/exception.py
modules/harvest_tools/exception.py
+0
-11
modules/harvest_tools/msg.py
modules/harvest_tools/msg.py
+1
-2
modules/store_tools/__init__.py
modules/store_tools/__init__.py
+10
-2
modules/store_tools/authorsmixin.py
modules/store_tools/authorsmixin.py
+87
-1
modules/store_tools/base.py
modules/store_tools/base.py
+77
-0
modules/store_tools/exception.py
modules/store_tools/exception.py
+8
-0
modules/store_tools/publicationinfomixin.py
modules/store_tools/publicationinfomixin.py
+71
-0
modules/store_tools/recordcdspubli.py
modules/store_tools/recordcdspubli.py
+1
-1
modules/store_tools/recordheppubli.py
modules/store_tools/recordheppubli.py
+146
-1
modules/test_tools.py
modules/test_tools.py
+1
-1
tests/basis/test_03_Record.py
tests/basis/test_03_Record.py
+1
-4
tests/basis/test_14_check_fix_article_ins.py
tests/basis/test_14_check_fix_article_ins.py
+123
-0
tests/basis/test_15_CheckAndFix_proceeding.py
tests/basis/test_15_CheckAndFix_proceeding.py
+10
-10
tests/basis/test_16_CheckAndFix_thesis.py
tests/basis/test_16_CheckAndFix_thesis.py
+4
-4
No files found.
modules/harvest_tools/__init__.py
View file @
a3173b67
...
...
@@ -7,15 +7,13 @@ from .base import (DRY_RUN,
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
MSG_LOAD
,
MSG_NO_ENTRY
,
MSG_TOOMANY_SYNONYM
,
family_name_fr
,
search_synonym
)
learn_my_authors
,
get_rex_institute
)
from
.automaton
import
Automaton
from
.articles
import
Articles
from
.checkandfix
import
CheckAndFix
,
MONTHS
from
.exception
import
CheckException
,
ToolException
from
.factory
import
build_harvester_tool
,
get_harvester_tool
from
.msg
import
Msg
from
.msgcollection
import
MsgCollection
...
...
modules/harvest_tools/articles.py
View file @
a3173b67
...
...
@@ -3,14 +3,13 @@
"""
from
.automaton
import
Automaton
from
.base
import
(
learn_my_authors
,
MSG_CRASH
,
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
MSG_LOAD
)
from
.checkandfix
import
CheckException
from
plugin_dbui
import
get_id
,
UNDEF_ID
MSG_NO_EDITOR
=
"Reject article is not published"
MSG_NOT_ARTICLE
=
"Reject publication is not and article"
MSG_TRANSFORM_PREPRINT
=
"Transform the preprint into an article"
T4
=
" "
*
4
...
...
@@ -32,6 +31,17 @@ class Articles(Automaton):
def
check_record
(
self
,
record
):
"""Check the content of the article in order to fix non-conformities.
* publication is a published article
* is with authors from my institute
* standardise name of collaboration
* format authors according to my format
* extract authors from my institute signing the publication
* is submitted date well formed
* format editor according to my criteria
* resolve published synonym
* check reference paper
Args:
record (RecordPubli):
the record describing the article.
...
...
@@ -42,32 +52,30 @@ class Articles(Automaton):
can not be corrected.
"""
if
not
Automaton
.
check_record
(
self
,
record
):
return
False
self
.
logger
.
debug
(
f
"
{
T4
}
check record (article)"
)
try
:
if
not
record
.
is_published
():
self
.
logs
[
-
1
].
reject
(
MSG_NO_EDITOR
,
record
=
record
)
return
False
self
.
check
.
format_editor
(
record
)
self
.
check
.
publisher
(
record
)
if
record
.
subtype
()
==
"article"
:
self
.
logs
[
-
1
].
reject
(
MSG_NOT_ARTICLE
,
record
)
return
False
self
.
check
.
paper_reference
(
record
)
self
.
check
.
submitted
(
record
)
try
:
self
.
check
.
format_authors
(
record
,
fmt
=
"F. Last"
)
self
.
check
.
get_my_authors
(
record
,
sort
=
True
)
# is with authors form my institute
# standardise name of collaboration
# format authors according to my format
# extract authors form my institute signing the publication
# is submitted date well formed
record
.
check_and_fix
(
self
.
rex_institute
,
fmt_author
=
"F. Last"
,
sep_author
=
", "
,
sort_author
=
True
)
except
CheckException
as
e
:
self
.
logs
[
-
1
].
reject
(
e
,
record
=
record
)
re
turn
False
record
.
format_editor
()
record
.
check_publisher
(
self
.
db
)
re
cord
.
check_paper_reference
()
except
Exception
as
e
:
self
.
logs
[
-
1
].
reject
(
MSG_CRASH
%
e
,
record
=
record
,
translate
=
False
)
self
.
logs
[
-
1
].
reject
(
e
,
record
=
record
)
return
False
return
True
...
...
modules/harvest_tools/automaton.py
View file @
a3173b67
...
...
@@ -4,17 +4,16 @@
import
logging
import
re
from
.base
import
(
MSG_FIX_ORIGIN
,
MSG_IN_DB
,
ToolException
)
from
.checkandfix
import
CheckAndFix
from
.base
import
get_rex_institute
,
MSG_FIX_ORIGIN
,
MSG_IN_DB
from
gluon
import
current
from
gluon.storage
import
Storage
from
.msg
import
Msg
from
.msgcollection
import
MsgCollection
from
plugin_dbui
import
CALLBACK_ERRORS
,
get_id
from
store_tools
import
(
StoreException
,
build_store
,
OAI_URL
)
from
store_tools
import
(
build_store
,
OAI_URL
,
StoreException
,
ToolException
)
from
store_tools.factory
import
build_record
MSG_NO_CAT
=
'Select a "category" !!!'
...
...
@@ -123,7 +122,6 @@ class Automaton(object):
if
not
id_category
:
raise
ToolException
(
MSG_NO_CAT
)
self
.
check
=
CheckAndFix
()
self
.
collection_logs
=
[]
self
.
controller
=
automaton
self
.
db
=
db
...
...
@@ -133,6 +131,7 @@ class Automaton(object):
self
.
id_project
=
id_project
self
.
logs
=
[]
self
.
logger
=
logging
.
getLogger
(
"web2py.app.limbra"
)
# Regular expression defining the affiliation of my institute.
# The attribute name has to be ``rex_institute``: it is read under that
# name by Articles.check_record (``self.rex_institute``). The former
# spelling ``rex_intitute`` left that attribute undefined.
self.rex_institute = get_rex_institute(db, current.app)
self
.
store
=
None
self
.
year_start
=
year_start
self
.
year_end
=
year_end
...
...
@@ -295,13 +294,7 @@ class Automaton(object):
corrected.
Note:
Some checks depend on the type of publications and have to be
implemented in inherited class.
Note:
The order of the checks matter. It should be OAI,
temporary record, authors, my authors and then a series of checks
specific to the publication type.
To be implemented by inherited classes
Args:
record (Record):
...
...
@@ -313,28 +306,7 @@ class Automaton(object):
corrected.
"""
self
.
logger
.
debug
(
f
"
{
T4
}
check record (automaton)"
)
try
:
# fix record with a missing OAI
if
not
self
.
check
.
is_oai
(
record
):
oai
=
OAI
%
(
self
.
harvester
.
host
,
record
.
id
())
record
[
"oai"
]
=
{
"value"
:
oai
}
if
self
.
check
.
is_bad_oai_used
(
record
):
self
.
logs
[
-
1
].
idle
(
MSG_IN_DB
,
record
.
submitted
())
return
False
self
.
check
.
temporary_record
(
record
)
self
.
check
.
authors
(
record
)
self
.
check
.
my_affiliation
(
record
,
self
.
id_project
,
self
.
id_team
)
self
.
check
.
collaboration
(
record
)
except
Exception
as
e
:
self
.
logs
[
-
1
].
reject
(
e
,
record
=
record
)
return
False
return
True
return
False
def
get_record_by_fields
(
self
,
oai_url
,
year
,
**
kwargs
):
"""Get database record matching fields values defined
...
...
modules/harvest_tools/base.py
View file @
a3173b67
""" harvest_tools.base
"""
from
.exception
import
ToolException
from
plugin_dbui
import
get_id
,
UNDEF_ID
DRY_RUN
=
"dry run"
MSG_CRASH
=
"Crash: %s"
MSG_FIX_ORIGIN
=
"Fixed the origin field"
MSG_IN_DB
=
"Already in the database"
MSG_LOAD
=
"Load in the database"
MSG_NO_ENTRY
=
"Reject %s is not defined"
MSG_TOOMANY_SYNONYM
=
"Reject too many %s synonyms"
def
family_name_fr
(
full_name
):
...
...
@@ -99,68 +93,48 @@ def learn_my_authors(db,
db
.
my_authors
[
row
.
id
]
=
dict
(
authors
=
', '
.
join
(
database_authors
))
def
search_synonym
(
table
,
fieldname
,
value
,
create
=
False
):
"""Get the database identifier for the record having the database field
or the synonyms field matching the value.
def
get_rex_institute
(
db
,
app
):
"""Get the regular expression defining the affiliation of my institute.
It is obtained by concatenating the affiliation keys.
An affiliation key can contain characters like ``(``, ``)`` or ``&``.
They are replaced by ``\(`` *etc*.
Note:
The database table must have a field name *synonyms*.
It contains a list of strings.
Args:
table (gluon.DAL.Table):
database table.
db (pydal.DAL):
database connection
app (gluon.storage.Storage):
namespace defining the application
Returns:
str:
fieldname (str):
field of the database table identified by its name.
"""
# alias
reg_institute
=
app
.
reg_institute
value (str):
value to be matched.
# regular expression for the affiliation keys
# protect special character
# add start and end of string for an exact match
if
not
reg_institute
:
create(bool):
create a new entry in the database table when
it is ``True``
lst
=
[]
for
row
in
db
(
db
.
affiliation_keys
.
id
>
0
).
iterselect
():
val
=
row
.
key_u
Returns:
int:
* the id of the database record.
* UNDEF_ID if value is not defined.
val
=
(
val
.
replace
(
"("
,
"\("
)
.
replace
(
")"
,
"\)"
)
.
replace
(
"&"
,
"\&"
)
.
replace
(
"$"
,
"\$"
)
.
replace
(
"+"
,
"\+"
)
.
replace
(
"?"
,
"\?"
))
Raises:
ToolException:
* no synonym found and not allow to create a new one.
* more than one synonym is found.
val
=
r
"(^|\|){}($|\|)"
.
format
(
val
)
"""
if
not
value
:
return
UNDEF_ID
db
=
table
.
_db
kwargs
=
{}
kwargs
[
fieldname
]
=
value
id_rec
=
get_id
(
table
,
**
kwargs
)
if
id_rec
is
not
None
:
return
id_rec
# nothing found, have a look to the synonyms field
query
=
table
.
synonyms
.
contains
(
value
)
setrows
=
db
(
query
)
# no synonym found, create the entry
ncount
=
setrows
.
count
()
if
ncount
==
0
:
if
create
:
return
table
.
insert
(
**
kwargs
)
else
:
msg
=
MSG_NO_ENTRY
%
table
.
_tablename
raise
ToolException
(
msg
)
# one synonym found
elif
ncount
==
1
:
return
setrows
.
select
(
table
.
id
).
first
().
id
# more than one synonyms - don't know what to choose
else
:
msg
=
MSG_TOOMANY_SYNONYM
%
table
.
_tablename
raise
ToolException
(
msg
)
lst
.
append
(
val
)
reg_institute
=
r
"|"
.
join
(
lst
)
return
reg_institute
modules/harvest_tools/checkandfix.py
View file @
a3173b67
...
...
@@ -5,11 +5,10 @@ import logging
import
numpy
as
np
import
re
from
.base
import
search_synonym
,
ToolException
from
datetime
import
datetime
from
.exception
import
CheckException
from
gluon
import
current
from
store_tools
import
(
MSG_NO_CONF
,
from
store_tools
import
(
CheckException
,
MSG_NO_CONF
,
MSG_NO_THESIS
,
OAI_URL
,
RecordCdsConf
,
...
...
@@ -19,7 +18,9 @@ from store_tools import (MSG_NO_CONF,
RecordHepPubli
,
RecordHepThesis
,
REG_OAI
,
REG_YEAR
)
REG_YEAR
,
search_synonym
,
ToolException
)
from
store_tools.publicationinfomixin
import
PAPER_REFERENCE_KEYS
...
...
modules/harvest_tools/exception.py
deleted
100644 → 0
View file @
6cae4419
""" harvest_tools.exception
"""
class CheckException(Exception):
    """Exception signalling that a check on a record failed."""
class ToolException(Exception):
    """Exception raised by the tools, e.g. when a synonym lookup fails."""
modules/harvest_tools/msg.py
View file @
a3173b67
...
...
@@ -3,10 +3,9 @@
"""
import
json
from
.base
import
MSG_NO_ENTRY
,
MSG_TOOMANY_SYNONYM
from
gluon
import
current
from
gluon.storage
import
Storage
from
store_tools
import
OAI_URL
from
store_tools
import
MSG_NO_ENTRY
,
MSG_TOOMANY_SYNONYM
,
OAI_URL
MSGS
=
(
MSG_NO_ENTRY
,
MSG_TOOMANY_SYNONYM
)
TABLES
=
(
"collaborations"
,
"countries"
,
"publishers"
)
...
...
modules/store_tools/__init__.py
View file @
a3173b67
...
...
@@ -12,18 +12,26 @@ from .base import (ARXIV,
is_thesis
,
MSG_NO_CONF
,
MSG_NO_COUNTRY
,
MSG_NO_ENTRY
,
MSG_NO_PUBLISHER
,
MSG_NO_THESIS
,
MSG_TOOMANY_SYNONYM
,
MSG_UNKNOWN_COLLABORATION
,
MSG_WELL_FORMED_COLLABORATION
,
OAI_URL
,
REG_ARXIV_NUMBER
,
REG_AUTHOR
,
REG_DATE
,
REG_DATE_YYYYMM
,
REG_OAI
,
REG_YEAR
,
search_synonym
,
THESIS_DIR
)
from
.exception
import
(
StoreException
,
RecordException
)
from
.exception
import
(
CheckException
,
RecordException
,
StoreException
,
ToolException
)
from
.factory
import
build_record
,
build_store
from
.inspirehepstore
import
InspirehepStore
...
...
modules/store_tools/authorsmixin.py
View file @
a3173b67
...
...
@@ -3,7 +3,8 @@
"""
import
re
from
.exception
import
RecordException
from
.base
import
T6
from
.exception
import
CheckException
,
RecordException
from
numpy
import
NaN
from
pandas
import
concat
...
...
@@ -14,7 +15,10 @@ AUTHOR_FORMATS = [
"Last, First"
,
"Last F."
]
MSG_FAUTHOR_COLLABORATION
=
"Reject first author is a Collaboration"
MSG_INVALID_FMT
=
"Invalid format for author"
MSG_NO_AUTHOR
=
"Reject no author(s)"
MSG_NO_MY_AUTHOR
=
"Reject no authors of my institute"
def
to_initial
(
name
):
...
...
@@ -125,6 +129,85 @@ class AuthorsMixin(object):
return
li
def check_authors(self):
    """Verify that the record carries usable author information.

    Two conditions are tested, in this order:

        * the record has authors (``self.is_authors()``);
        * the first author does not contain the word *collaboration*
          (case insensitive), like ``ATLAS Collaboration``.

    Raises:
        CheckException:
            when there is no author or when the first author
            is a collaboration.

    """
    self.logger.debug(f"{T6}check authors")

    has_authors = self.is_authors()
    if not has_authors:
        raise CheckException(MSG_NO_AUTHOR)

    leading_author = self.first_author().lower()
    if "collaboration" in leading_author:
        raise CheckException(MSG_FAUTHOR_COLLABORATION)
def check_format_authors(self, fmt=None):
    """Reformat the author names when a format is requested.

    Nothing is done, apart from a debug message, when ``fmt`` is ``None``.
    Otherwise the work is delegated to ``self.reformat_authors``.

    Args:
        fmt (str):
            define the format for author names.
            Possible values are ``First, Last``, ``F. Last``, ``Last``,
            ``Last, First`` and ``Last F.``

    """
    if fmt is not None:
        self.logger.debug(f"{T6}format authors")
        self.reformat_authors(fmt)
        return

    self.logger.debug(f"{T6}skip format authors -- fmt is None")
def extract_my_authors(self, rex_institute, sep=", ", sort=False):
    """Find the authors of my institute signing the record.

    The result is stored in the attribute ``my_authors`` of the record
    instance; the method itself returns nothing. The search is skipped
    when ``my_authors`` is already set.

    Args:
        rex_institute (str):
            regular expression defining the affiliation of my institute

        sep (str):
            string separating author names, passed on to
            ``find_authors_by_affiliation``.

        sort (bool):
            passed on to ``find_authors_by_affiliation``: sort authors
            by family name when true, otherwise use the order of authors
            at the creation of the record.

    Raises:
        CheckException:
            no author of my institute is found.

    """
    self.logger.debug(f"{T6}get my authors")

    # might have been computed when affiliation is checked
    if self.my_authors is None:
        found = self.find_authors_by_affiliation(rex_institute, sep, sort)

        if len(found) == 0:
            raise CheckException(MSG_NO_MY_AUTHOR)

        self.my_authors = found
def
find_affiliation
(
self
,
pattern
):
"""Find affiliation matching the regular expression *pattern*.
...
...
@@ -310,6 +393,9 @@ class AuthorsMixin(object):
bool:
"""
if
getattr
(
self
,
"df_authors"
,
None
)
is
None
:
return
False
df
=
self
.
df_authors
cols
=
{
"first_name"
,
"full_name"
,
"last_name"
}
...
...
modules/store_tools/base.py
View file @
a3173b67
...
...
@@ -3,6 +3,9 @@
"""
import
re
from
.exception
import
ToolException
from
plugin_dbui
import
get_id
,
UNDEF_ID
ARXIV
=
"arXiv"
ARXIV_PDF
=
"http://arxiv.org/pdf/"
...
...
@@ -14,11 +17,15 @@ MSG_INV_CONF_KEY = "Reject invalid conference key"
MSG_NO_CONF
=
"Reject no conference information"
MSG_NO_CONF_ID_KEY
=
"Reject no conference identifier and key"
MSG_NO_COUNTRY
=
"Reject invalid country"
MSG_NO_ENTRY
=
"Reject %s is not defined"
MSG_NO_HOST
=
"Reject no host information in record"
MSG_NO_PUBLISHER
=
"Reject invalid publisher"
MSG_NO_SHELF
=
"No shelf %s for store %s"
MSG_NO_THESIS
=
"Reject no thesis information"
MSG_TOOMANY_SYNONYM
=
"Reject too many %s synonyms"
MSG_UNKNOWN_COLLABORATION
=
"Reject collaboration is unknown."
MSG_WELL_FORMED_COLLABORATION
=
"Reject collaboration is not well formed"
MSG_WELL_FORMED_DATE
=
"Reject submission date is not well formed"
OAI
=
"oai:%s:%s"
OAI_URL
=
"http://%s/record/%s"
...
...
@@ -33,10 +40,13 @@ REG_ARXIV_NUMBER = re.compile("\d+\.\d+")
REG_AUTHOR
=
re
.
compile
(
r
"^([\w\- ]+), (\w+)\.?[\- ]*(\w+)*\.?$"
,
re
.
UNICODE
)
REG_DATE
=
re
.
compile
(
r
"(\d{4}-\d{2}-\d{2})"
)
REG_DATE_YYYYMM
=
re
.
compile
(
r
"(\d{4}-\d{2})"
)
REG_CONF
=
re
.
compile
(
"^C\d+-\d+-\d+(?:\.\d+)?$"
)
REG_OAI
=
re
.
compile
(
r
"oai:([a-z\.]+):([\d]+)"
)
REG_YEAR
=
re
.
compile
(
r
"(\d{4})"
)
T2
,
T4
,
T6
=
" "
*
2
,
" "
*
4
,
" "
*
6
THESIS_DIR
=
"dir."
...
...
@@ -140,3 +150,70 @@ def is_thesis(recjson):
return
True
return
False
def search_synonym(table, fieldname, value, create=False):
    """Get the database identifier for the record having the database field
    or the synonyms field matching the value.

    Note:
        The database table must have a field name *synonyms*.
        It contains a list of strings.

    Args:
        table (gluon.DAL.Table):
            database table.

        fieldname (str):
            field of the database table identified by its name.

        value (str):
            value to be matched.

        create (bool):
            create a new entry in the database table when
            it is ``True``

    Returns:
        int:
            * the id of the database record.
            * UNDEF_ID if value is not defined (``None`` or empty).

    Raises:
        ToolException:
            * no synonym found and not allow to create a new one.
            * more than one synonym is found.

    """
    # A falsy test covers None as well as the empty string.
    # len(value) would raise TypeError when value is None.
    if not value:
        return UNDEF_ID

    db = table._db
    kwargs = {fieldname: value}

    # direct match on the field
    id_rec = get_id(table, **kwargs)
    if id_rec is not None:
        return id_rec

    # nothing found, have a look to the synonyms field
    setrows = db(table.synonyms.contains(value))
    ncount = setrows.count()

    # one synonym found
    if ncount == 1:
        return setrows.select(table.id).first().id

    # no synonym found, create the entry when it is allowed
    if ncount == 0:
        if create:
            return table.insert(**kwargs)

        raise ToolException(MSG_NO_ENTRY % table._tablename)

    # more than one synonyms - don't know what to choose
    raise ToolException(MSG_TOOMANY_SYNONYM % table._tablename)
modules/store_tools/exception.py
View file @
a3173b67
...
...
@@ -3,9 +3,17 @@
"""