Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
c8095367
Commit
c8095367
authored
Jan 14, 2021
by
LE GAC Renaud
Browse files
Migrate check and fix method to RecordCdsPubli
parent
a3173b67
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
574 additions
and
186 deletions
+574
-186
modules/store_tools/base.py
modules/store_tools/base.py
+3
-1
modules/store_tools/recordcdspubli.py
modules/store_tools/recordcdspubli.py
+348
-39
modules/store_tools/recordheppubli.py
modules/store_tools/recordheppubli.py
+13
-13
tests/basis/test_13_CheckAndFix_article.py
tests/basis/test_13_CheckAndFix_article.py
+0
-133
tests/basis/test_13_check_and_fix_article_cds.py
tests/basis/test_13_check_and_fix_article_cds.py
+210
-0
No files found.
modules/store_tools/base.py
View file @
c8095367
...
...
@@ -20,6 +20,7 @@ MSG_NO_COUNTRY = "Reject invalid country"
MSG_NO_ENTRY
=
"Reject %s is not defined"
MSG_NO_HOST
=
"Reject no host information in record"
MSG_NO_PUBLISHER
=
"Reject invalid publisher"
MSG_NO_REF
=
"Reject incomplete paper reference. Check "
MSG_NO_SHELF
=
"No shelf %s for store %s"
MSG_NO_THESIS
=
"Reject no thesis information"
MSG_TOOMANY_SYNONYM
=
"Reject too many %s synonyms"
...
...
@@ -39,9 +40,10 @@ REG_ARXIV_NUMBER = re.compile("\d+\.\d+")
# group(3) is the part of the first name after the separator (" ", "-")
REG_AUTHOR
=
re
.
compile
(
r
"^([\w\- ]+), (\w+)\.?[\- ]*(\w+)*\.?$"
,
re
.
UNICODE
)
REG_CONF
=
re
.
compile
(
"^C\d+-\d+-\d+(?:\.\d+)?$"
)
REG_DATE
=
re
.
compile
(
r
"(\d{4}-\d{2}-\d{2})"
)
REG_DATE_YYYYMM
=
re
.
compile
(
r
"(\d{4}-\d{2})"
)
REG_
CONF
=
re
.
compile
(
"
^C
\d+
-
\d+
-\d+(?:\.\d+)?$
"
)
REG_
DOI
=
re
.
compile
(
r
"\d+
\.
\d+
/([a-zA-Z]+)\.(\d+)\.(\w+)
"
)
REG_OAI
=
re
.
compile
(
r
"oai:([a-z\.]+):([\d]+)"
)
REG_YEAR
=
re
.
compile
(
r
"(\d{4})"
)
...
...
modules/store_tools/recordcdspubli.py
View file @
c8095367
...
...
@@ -2,14 +2,50 @@
"""
import
logging
import
numpy
as
np
import
pprint
from
.authorsmixin
import
AuthorsMixin
from
.base
import
ARXIV
,
OAI
,
OAI_URL
,
REG_OAI
import
re
from
.authorsmixin
import
AuthorsMixin
,
MSG_NO_MY_AUTHOR
from
.base
import
(
ARXIV
,
MSG_UNKNOWN_COLLABORATION
,
MSG_NO_REF
,
OAI
,
OAI_URL
,
search_synonym
,
REG_DOI
,
REG_OAI
,
T6
)
from
.exception
import
CheckException
from
filters
import
CLEAN_COLLABORATION
from
pandas
import
concat
,
DataFrame
from
plugin_dbui
import
CLEAN_SPACES
from
.publicationinfomixin
import
PublicationInfoMixin
from
plugin_dbui
import
CLEAN_SPACES
,
UNDEF_ID
from
.publicationinfomixin
import
PAPER_REFERENCE_KEYS
,
PublicationInfoMixin
# Capture the two pairs of digits that follow "arXiv:" and precede the dot,
# e.g. "17" and "01" in "arXiv:1701.05501". They are used as year and month.
DECODE_ARXIV = re.compile(r"arXiv:(\d{2})(\d{2})\.")

# Decode submitted date: DD MMM YYYY or DD MM YYYY
DECODE_DD_MMM_YYYY = re.compile(r"(\d{1,2}) *([A-Za-z]{3}) *(\d{4})")
DECODE_DD_MM_YYYY = re.compile(r"(\d{1,2}) +(\d{1,2}) +(\d{4})")

# Three-letter month abbreviation -> two-digit month number.
# Some months have a second spelling (Fev, Avr, Mai), presumably the
# French abbreviation -- TODO confirm.
MONTHS = {
    "Jan": "01",
    "Feb": "02",
    "Fev": "02",
    "Mar": "03",
    "Apr": "04",
    "Avr": "04",
    "May": "05",
    "Mai": "05",
    "Jun": "06",
    "Jul": "07",
    "Aug": "08",
    "Sep": "09",
    "Oct": "10",
    "Nov": "11",
    "Dec": "12"}

# Messages carried by the CheckException raised when the submission date
# is missing or cannot be standardised.
MSG_NO_DATE = "Reject no submission date"
MSG_WELL_FORMED_DATE = "Reject submission date is not well formed"
def
to_str
(
x
):
...
...
@@ -310,6 +346,257 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
# replace
self
.
df_info
=
df
def
_recover_submitted
(
self
):
"""Recover submitted date using conference, preprint or thesis
information.
Args:
record (RecordPubli):
record describing a publication.
Returns:
str:
target at least YYYY-MM
empty when procedure failed
"""
val
=
""
# try by using the preprint information
report
=
self
.
preprint_number
()
if
report
:
m_arxiv
=
DECODE_ARXIV
.
match
(
report
)
if
m_arxiv
:
val
=
"20%s-%s"
%
(
m_arxiv
.
group
(
1
),
m_arxiv
.
group
(
2
))
# last change use the creation date for the record
if
val
==
""
or
len
(
val
)
<
7
:
val
=
self
[
"creation_date"
][
0
:
7
]
return
val
def check_collaboration(self, db=None):
    """Replace a collaboration synonym by the name stored in the database.

    The check is skipped when no database connection is given or when
    the record has no collaboration.

    Args:
        db (pydal.DAL):
            database connection

    Raises:
        CheckException:
            the collaboration is unknown in the database.
            NOTE(review): ``search_synonym`` presumably also rejects the
            case where several synonyms are found -- not visible here.

    """
    if db is None:
        self.logger.debug(f"{T6}skip check collaboration -- db is None")
        return

    self.logger.debug(f"{T6}check collaboration")

    current = self.collaboration()
    if len(current) == 0:
        return

    row_id = search_synonym(db.collaborations, "collaboration", current)
    if row_id == UNDEF_ID:
        raise CheckException(MSG_UNKNOWN_COLLABORATION)

    reference = db.collaborations[row_id].collaboration

    # nothing to fix: the name is already the reference one or the record
    # does not carry the field in which it would be stored
    if reference == current or "corporate_name" not in self:
        return

    if isinstance(self["corporate_name"], dict):
        # one collaboration
        self["corporate_name"]["collaboration"] = reference
    else:
        # several collaborations:
        # replace the list of dictionaries by a single one
        self["corporate_name"] = {"collaboration": reference}
def check_my_affiliation(self, rex_institute=None):
    """Verify that at least one affiliation matches my institute.

    The check is skipped when ``rex_institute`` is ``None``.

    Args:
        rex_institute (str):
            regular expression defining my institute

    Raises:
        CheckException:
            ``find_affiliation`` returns an empty result for
            ``rex_institute``.

    """
    if rex_institute is None:
        self.logger.debug(f"{T6}skip check my affiliation -- rex is None")
        return

    self.logger.debug(f"{T6}check my affiliation")

    found = self.find_affiliation(rex_institute)
    if len(found) > 0:
        return

    raise CheckException(MSG_NO_MY_AUTHOR)
def check_paper_reference(self):
    """Complete the paper reference (title, volume, pagination) from the doi.

    Nothing is done when ``is_published()`` is true or when the record
    has no ``doi``. Otherwise the missing elements of the first row of
    ``df_info`` are filled from a doi of the form
    ``xx.yyyy/Publisher.Volume.Page``.

    Raises:
        CheckException:
            * the doi does not have the expected form
            * the year is missing (it cannot be recovered from the doi)

    """
    self.logger.debug(f"{T6}check paper reference")

    # paper reference can be incomplete or missing
    # is the paper published ? In that case the doi is defined
    if self.is_published() or "doi" not in self:
        return

    # what information is missing ?
    #   * df.columns are title, volume, year and pagination
    #   * df can contain one or more rows due to erratum
    #   * assume that the first row is the oldest one and corresponds to
    #     the first publication
    #   * the row contains empty strings when the record is not published
    #   * iloc[0] returns a series whose index holds the column names
    first_row = self.df_info.iloc[0]
    filled = first_row.replace("", np.nan).dropna().index
    missing = PAPER_REFERENCE_KEYS.difference(filled)

    # try to recover from the doi when it has the form
    # xx.yyyy/Publisher.Volume.Page
    m_doi = REG_DOI.match(self["doi"])
    if m_doi is None:
        raise CheckException(MSG_NO_REF + str(list(missing)))

    # regex group of the doi holding each recoverable subfield
    doi_group = {"volume": 2, "pagination": 3}

    for subfield in missing:
        if subfield == "year":
            raise CheckException(MSG_NO_REF + "[year]")

        if subfield == "title":
            # transform PhysRevD in Phys. Rev. D
            parts = re.split(r"([A-Z][a-z]+)", m_doi.group(1))
            self.df_info.loc[0, "title"] = ". ".join(filter(None, parts))

        elif subfield in doi_group:
            self.df_info.loc[0, subfield] = m_doi.group(doi_group[subfield])
def check_submitted_date(self):
    """Standardise the submitted date as ``YYYY-MM`` or ``YYYY-MM-DD``.

    Look for an alternative, via ``_recover_submitted``, when the date
    is shorter than ``YYYY-MM``. Dates written as ``DD MMM YYYY`` or
    ``DD MM YYYY`` are converted to ``YYYY-MM-DD``.

    Note:
        The result is stored in ``self["prepublication"]["date"]``.
        When ``prepublication`` is a list only its first entry is
        updated; when it is missing it is created.

    Raises:
        CheckException:
            * no date can be found
            * the date is not well formed, including an unknown month
              abbreviation

    """
    self.logger.debug(f"{T6}check submitted")

    date = self.submitted()

    # recover missing date using alternative information
    if len(date) < 7:
        date = self._recover_submitted()

    if len(date) == 0:
        raise CheckException(MSG_NO_DATE)

    if len(date) < 7:
        raise CheckException(MSG_WELL_FORMED_DATE)

    # 22 Mar 2011
    m = DECODE_DD_MMM_YYYY.match(date)
    if m:
        # the regex accepts any three letters: reject an unknown month
        # with the documented exception instead of leaking a KeyError.
        # The lookup is case-insensitive ("mar", "MAR" and "Mar").
        month = MONTHS.get(m.group(2).capitalize())
        if month is None:
            raise CheckException(MSG_WELL_FORMED_DATE)

        date = "%s-%s-%02i" % (m.group(3), month, int(m.group(1)))

    # 22 03 2011
    m = DECODE_DD_MM_YYYY.match(date)
    if m:
        data = (m.group(3), int(m.group(2)), int(m.group(1)))
        date = "%s-%02i-%02i" % data

    # in some case we have to deal with a list (see cds 2234042)
    # in some case it is not defined (e.g. phd thesis)
    if "prepublication" in self:
        prepublication = self["prepublication"]
        if isinstance(prepublication, list):
            prepublication[0]["date"] = date
        else:
            prepublication["date"] = date

    else:
        self["prepublication"] = {"date": date}
def check_and_fix_record(self,
                         db=None,
                         fmt_author=None,
                         rex_institute=None,
                         sep_author=", ",
                         sort_author=False):
    """Run every check on the record and repair what can be repaired.

    The steps are executed in this order:

        * check the authors
        * check that authors from my institute sign the publication
        * standardise the name of the collaboration
        * format the authors according to my format
        * extract the authors from my institute signing the publication
        * standardise the submitted date

    Args:
        db (pydal.DAL):
            database connection

        fmt_author (str):
            define the format for author names.
            Possible values are ``First, Last``, ``F. Last``, ``Last``,
            ``Last, First`` and ``Last F.``

        rex_institute (str):
            regular expression defining my institute

        sep_author (str):
            string separating author names. The default is the comma.

        sort_author (bool):
            sort authors by family name when true otherwise use the
            order of authors at the creation of the record

    Raises:
        CheckException

    """
    # authors and affiliations
    self.check_authors()
    self.check_my_affiliation(rex_institute)

    # collaboration
    self.check_collaboration(db)

    # author formatting, then extraction of my authors
    self.check_format_authors(fmt_author)
    self.extract_my_authors(rex_institute, sep_author, sort_author)

    # submission date
    self.check_submitted_date()
def
collaboration
(
self
):
"""The collaboration(s) signing the publication.
...
...
@@ -442,6 +729,40 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
recid
=
self
[
"recid"
]
return
f
"http://cds.cern.ch/record/
{
recid
}
"
def report_number(self):
    """The report number(s) associated to the publication.

    The numbers are read from the ``report_number`` entry (CDS) or,
    failing that, from ``primary_report_number`` (old INSPIRE). In the
    latter case ``None`` entries and those starting with ``ARXIV`` are
    ignored.

    Returns:
        str:
            - Numbers are separated by a comma
            - Numbers are sorted in alphabetic order.
            - Empty string when not defined.

    """
    # CDS
    if "report_number" in self:
        data = self["report_number"]
        data = (data if isinstance(data, list) else [data])

        # flatten the values of all dictionaries
        li = [val for di in data for val in di.values()]
        return ", ".join(sorted(li))

    # OLD.INSPIRE
    if "primary_report_number" in self:
        data = self["primary_report_number"]
        data = (data if isinstance(data, list) else [data])

        li = [el for el in data
              if el is not None and not el.startswith(ARXIV)]
        return ", ".join(sorted(li))

    return ""
def
secondary_oai
(
self
):
"""The secondary OAI identifier.
...
...
@@ -489,40 +810,6 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
return
""
def report_number(self):
    """The report number(s) associated to the publication.

    The numbers are read from the ``report_number`` entry (CDS) or,
    failing that, from ``primary_report_number`` (old INSPIRE). In the
    latter case ``None`` entries and those starting with ``ARXIV`` are
    ignored.

    Returns:
        str:
            - Numbers are separated by a comma
            - Numbers are sorted in alphabetic order.
            - Empty string when not defined.

    """
    # CDS
    if "report_number" in self:
        data = self["report_number"]
        data = (data if isinstance(data, list) else [data])

        # flatten the values of all dictionaries
        li = [val for di in data for val in di.values()]
        return ", ".join(sorted(li))

    # OLD.INSPIRE
    if "primary_report_number" in self:
        data = self["primary_report_number"]
        data = (data if isinstance(data, list) else [data])

        li = [el for el in data
              if el is not None and not el.startswith(ARXIV)]
        return ", ".join(sorted(li))

    return ""
def
submitted
(
self
):
"""The date of submission.
...
...
@@ -537,6 +824,28 @@ class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
val
=
self
.
_get
(
"prepublication"
,
"date"
)
return
(
val
[
0
]
if
isinstance
(
val
,
list
)
else
val
)
def subtype(self):
    """The subtype of the publication.

    It is derived from the ``primary`` value of the ``collection``
    entries, compared in lower case.

    Returns:
        str:
            * "article", "note", "report" or "preprint"
            * empty string when it is not defined

    """
    collection = self.get("collection", None)
    if collection is None:
        return ""

    # a single collection can be given as a dictionary instead of a list
    if isinstance(collection, dict):
        collection = [collection]

    # "primary" can be missing or None
    lst = [(dct.get("primary") or "").lower() for dct in collection]

    # order matter since note can have preprint+note
    for val in ("article", "note", "report", "preprint"):
        if val in lst:
            return val

    return ""
def
title
(
self
):
"""The title of the publication.
...
...
modules/store_tools/recordheppubli.py
View file @
c8095367
...
...
@@ -387,6 +387,19 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
lst
=
[
self
.
primary_oai
(),
self
.
secondary_oai
()]
return
", "
.
join
(
lst
).
strip
(
", "
)
def oai_url(self):
    """The URL(s) of the Open Archive Initiative identifier(s).

    Returns:
        str:
            * the pattern of the URL is ``http://host/record/id``
            * primary and secondary URLs are separated by a comma.
            * an empty string when it is not defined

    """
    primary = self.primary_oai_url()
    secondary = self.secondary_oai_url()

    # remove the dangling separator left when one of the URLs is empty
    joined = ", ".join((primary, secondary))
    return joined.strip(", ")
def
paper_url
(
self
):
"""The URL of the document.
...
...
@@ -458,19 +471,6 @@ class RecordHepPubli(dict, AuthorsMixin, PublicationInfoMixin):
lst
=
[
elt
[
"value"
]
for
elt
in
lst
]
return
", "
.
join
(
lst
)
def oai_url(self):
    """The URL(s) of the Open Archive Initiative identifier(s).

    Returns:
        str:
            * the pattern of the URL is ``http://host/record/id``
            * primary and secondary URLs are separated by a comma.
            * an empty string when it is not defined

    """
    primary = self.primary_oai_url()
    secondary = self.secondary_oai_url()

    # remove the dangling separator left when one of the URLs is empty
    joined = ", ".join((primary, secondary))
    return joined.strip(", ")
def
secondary_oai
(
self
):
"""The secondary OAI identifier.
...
...
tests/basis/test_13_CheckAndFix_article.py
deleted
100644 → 0
View file @
a3173b67
"""test_13_CheckAndFix_article
* Test CheckAndFix methods for article:
- format_editor
- publisher
- paper_reference
- submitted
- format_author
- get_my_authors
* Same article in cds.cern.ch and inspirehep.net
Phys. Rev. D 95 (2017) 052005
"""
import
pytest
from
harvest_tools.checkandfix
import
CheckAndFix
from
store_tools
import
load_record
@pytest.fixture(scope="module")
def reccds():
    """Record 2242641 loaded from cds.cern.ch, shared by the module."""
    record = load_record("cds.cern.ch", 2242641)
    return record
@pytest.fixture(scope="module")
def recins():
    """Record 1509922 loaded from the literature shelf of inspirehep.net."""
    record = load_record("inspirehep.net", 1509922, shelf="literature")
    return record
@pytest.fixture(scope="module")
def svc():
    """A CheckAndFix instance shared by the tests of the module."""
    service = CheckAndFix()
    return service
def test_format_editor_cds_13001(svc, reccds):
    """format_editor leaves editor and volume of the cds record unchanged."""
    # cds
    editor, volume = "Phys. Rev. D", "95"

    assert reccds.paper_editor() == editor
    assert reccds.paper_volume() == volume

    svc.format_editor(reccds)

    assert reccds.paper_editor() == editor
    assert reccds.paper_volume() == volume
def test_format_editor_ins_13002(svc, recins):
    """format_editor leaves editor and volume of the inspire record unchanged."""
    # inspire
    editor, volume = "Phys. Rev. D", "95"

    assert recins.paper_editor() == editor
    assert recins.paper_volume() == volume

    svc.format_editor(recins)

    assert recins.paper_editor() == editor
    assert recins.paper_volume() == volume
def test_publisher_cds_13003(svc, reccds):
    """The publisher check returns None for the cds record."""
    result = svc.publisher(reccds)
    assert result is None
def test_paper_reference_cds_13004(svc, reccds):
    """The paper reference is rebuilt after title and volume are erased."""
    # check recovery procedure using DOI
    expected = reccds.paper_reference()

    # remove the publisher and volume information
    reccds.df_info.loc[0, ["title", "volume"]] = ["", ""]

    svc.paper_reference(reccds)
    assert reccds.paper_reference() == expected
def test_submitted_cds_13005(svc, reccds):
    """The submitted date of the cds record is standardised."""
    assert reccds.submitted() == "19 Jan 2017"
    svc.submitted(reccds)
    assert reccds.submitted() == "2017-01-19"

    # test the case 19 01 2017, then the case 2017
    cases = (("19 01 2017", "2017-01-19"),
             ("2017", "2017-01"))

    for raw, expected in cases:
        reccds["prepublication"]["date"] = raw
        svc.submitted(reccds)
        assert reccds.submitted() == expected
def test_submitted_ins_13006(svc, recins):
    """The inspire record already has a standardised submitted date."""
    submitted = recins.submitted()
    assert submitted == "2017-01-19"
def test_format_authors_cds_13007(svc, reccds):
    """Author names are checked before and after the "F. Last" formatting."""
    positions = (0, 1, 344, -1)

    before = ("Aaij, Roel", "Adeva, Bernardo", "Koopman, Rose",
              "Zucchelli, Stefano")

    after = ("R. Aaij", "B. Adeva", "R. Koopman", "S. Zucchelli")

    authors = reccds.authors_as_list()
    assert len(authors) == reccds["number_of_authors"]

    for idx, name in zip(positions, before):
        assert authors[idx] == name

    svc.format_authors(reccds, fmt="F. Last")
    authors = reccds.authors_as_list()

    for idx, name in zip(positions, after):
        assert authors[idx] == name
def test_get_my_authors_cds_13008(svc, reccds):
    """My authors are extracted, sorted and separated by a vertical bar."""
    expected = "|".join((
        "J. Arnau Romeu", "E. Aslanides", "J. Cogan",
        "K. De Bruyn", "R. Le Gac", "O. Leroy",
        "G. Mancinelli", "M. Martin", "A. Mordà",
        "J. Serrano", "A. Tayduganov", "A. Tsaregorodtsev"))

    svc.format_authors(reccds, fmt="F. Last")
    assert svc.get_my_authors(reccds, sep="|", sort=True) is None

    my_authors = reccds.my_authors
    assert my_authors == expected
def
test_collaboration_ins_13009
(
svc
):
# require the CPPM database (test_limbra)
record
=
load_record
(
"inspirehep.net"
,
1826290
,
shelf
=
"literature"
)