Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
14d0602f
Commit
14d0602f
authored
Jan 07, 2021
by
LE GAC Renaud
Browse files
Redesign RecordPubli by using PluginAuthors and PluginPublicationInfo
parent
0a40e705
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
33 additions
and
463 deletions
+33
-463
modules/store_tools/recordpubli.py
modules/store_tools/recordpubli.py
+6
-441
tests/basis/test_04_RecordPubli.py
tests/basis/test_04_RecordPubli.py
+27
-22
No files found.
modules/store_tools/recordpubli.py
View file @
14d0602f
""" store_tools.recordpubli
"""
import
numpy
as
np
import
re
from
.base
import
to_initial
from
.exception
import
RecordException
from
filters
import
CLEAN_COLLABORATION
from
numpy
import
NaN
from
pandas
import
concat
,
DataFrame
from
plugin_dbui
import
as_list
,
CLEAN_SPACES
from
plugin_dbui
import
CLEAN_SPACES
from
.record
import
Record
from
store_tools
import
(
ARXIV
,
ARXIV_PDF
,
REG_ARXIV_NUMBER
,
REG_YEAR
)
AUTHOR_FORMATS
=
[
"First, Last"
,
"F. Last"
,
"Last"
,
"Last, First"
,
"Last F."
]
MSG_INVALID_FMT
=
"Invalid format for author"
# the keys containing paper reference
PAPER_REFERENCE_KEYS
=
{
"pagination"
,
"title"
,
"volume"
,
"year"
}
from
store_tools
import
ARXIV
from
store_tools.pluginauthors
import
PluginAuthors
from
store_tools.pluginpublicationinfo
import
PluginPublicationInfo
def to_str(x):
    """Collapse a list of strings into a single ``|``-separated string.

    Args:
        x:
            a list of strings, or any other value.

    Returns:
        The joined string when ``x`` is a list, otherwise ``x`` itself,
        unchanged.

    """
    if not isinstance(x, list):
        return x

    return "|".join(x)
class
RecordPubli
(
Record
):
class
RecordPubli
(
Record
,
PluginAuthors
,
PluginPublicationInfo
):
"""Article, preprint, proceeding, report and talk from cds.cern.ch or
old.inspirehep.net.
...
...
@@ -283,58 +264,6 @@ class RecordPubli(Record):
# replace
self
[
"publication_info"
]
=
df
def authors(self, sep=", ", sort=False):
    """Return the author(s) signing the publication as a single string.

    Args:
        sep (str):
            string placed between two author names.
            The default is a comma followed by a space.

        sort (bool):
            forwarded to :func:`authors_as_list`; when true the authors
            are ordered by family name, otherwise the order of the record
            is kept.

    Returns:
        str:
            * author names joined with ``sep``.
            * empty string when the list of authors is empty.

    """
    return sep.join(self.authors_as_list(sort=sort))
def authors_as_list(self, sort=False):
    """Return the formatted names of the author(s) as a list.

    Args:
        sort (bool):
            order the authors by the ``last_name`` column when true,
            otherwise order them by the index of the authors table.

    Returns:
        list:
            the content of the ``fmt_name`` column. The list is empty
            when the table holds a single, empty name.

    """
    frame = self["authors"]

    if sort:
        ordered = frame[["last_name", "fmt_name"]].sort_values(by="last_name")
        names = ordered.fmt_name.tolist()
    else:
        names = frame.fmt_name.sort_index().tolist()

    # a lone empty name is the placeholder for "no authors"
    if len(names) == 1 and names[0] == "":
        return []

    return names
def
collaboration
(
self
):
"""The collaboration(s) signing the publication.
...
...
@@ -347,286 +276,6 @@ class RecordPubli(Record):
li
=
self
.
_get
(
"corporate_name"
,
"collaboration"
,
force_list
=
True
)
return
CLEAN_COLLABORATION
(
", "
.
join
(
li
))
def find_affiliation(self, pattern):
    """Find the affiliation matching the regular expression *pattern*.

    Args:
        pattern (str):
            regular expression defining the affiliation keys.
            ``Series.str.match`` only anchors the start of the string, so
            the pattern has to carry its own end-of-string anchor when an
            exact match is required. This is needed to separate
            `Ecole Polytechnique` from `Ecole Polytechnique, Lausanne`.

    Returns:
        str:
            - the affiliation or the first one when several are found.
            - empty string when nothing is found.

    """
    affiliations = self["authors"].affiliation

    # na=False: authors without affiliation (NaN) never match. Without it
    # the mask contains NaN and pandas refuses to use it for selection.
    mask = affiliations.str.match(pattern, na=False)
    matched = affiliations[mask]

    if matched.empty:
        return ""

    return matched.iloc[0]
def find_authors(self, pattern, sep=", ", sort=False):
    """Find authors containing the regular expression *pattern*.

    The search is performed on the formatted name (``fmt_name`` column).

    Args:
        pattern (str):
            regular expression defining the author name(s).

        sep (str):
            string separating author names. The default is the comma.

        sort (bool):
            sort authors by family name when true otherwise use the
            order of authors at the creation of the record

    Returns:
        str:
            * Author names are separated by the ``sep`` argument.
            * The string is empty when nothing is found.

    """
    df = self["authors"]

    # na=False: a missing formatted name never matches. Without it the
    # mask contains NaN and cannot be used with .loc
    mask = df.fmt_name.str.contains(pattern, na=False)

    if sort:
        names = (df.loc[mask, ["last_name", "fmt_name"]]
                 .sort_values(by="last_name")
                 .fmt_name)
    else:
        names = df.loc[mask, "fmt_name"].sort_index()

    # joining an empty selection yields the empty string
    return sep.join(names)
def find_authors_by_affiliation(self, pattern, sep=", ", sort=False):
    """Find authors belonging to the institute(s) defined by a regular
    expression.

    Args:
        pattern (str):
            regular expression defining the affiliation keys
            for the institute(s).

        sep (str):
            string separating author names. The default is the comma.

        sort (bool):
            sort authors by family name when true otherwise use the
            order of authors at the creation of the record

    Returns:
        str:
            * Author names are separated by the ``sep`` argument.
            * Authors are sorted by family name only when ``sort`` is true.
            * Empty string when authors are not found.

    """
    df = self["authors"]

    # na=False: authors without affiliation (NaN) never match. Without it
    # the mask contains NaN and cannot be used with .loc
    mask = df.affiliation.str.contains(pattern, na=False)

    if sort:
        names = (df.loc[mask, ["last_name", "fmt_name"]]
                 .sort_values(by="last_name")
                 .fmt_name)
    else:
        names = df.loc[mask, "fmt_name"].sort_index()

    # joining an empty selection yields the empty string
    return sep.join(names)
def first_author(self):
    """Return the formatted name of the first author.

    Returns:
        str:
            the ``fmt_name`` value found in the first row of the
            authors table.

    """
    formatted_names = self["authors"].fmt_name
    return formatted_names.iloc[0]
def first_author_institutes(self):
    """The institute(s) associated to the first author.

    Returns:
        str:
            - the ``affiliation`` value of the first row of the authors
              table (several names are separated by ``|``).
            - The string is empty when the affiliation is missing (NaN).

    """
    # local import keeps the block self-contained
    from pandas import isna

    val = self["authors"].affiliation.iloc[0]

    # NaN never compares equal to anything, itself included, so an
    # equality test cannot detect it; isna() is the reliable check.
    return "" if isna(val) else val
def institutes(self):
    """Return the list of institutes signing the publication.

    Returns:
        list:
            unique affiliation names, sorted.

    """
    # one column per affiliation: "A|B" becomes two cells
    expanded = self["authors"].affiliation.str.split("|", expand=True)

    # stack every column on top of each other, ignoring the empty cells
    pieces = [expanded[name].dropna() for name in expanded.columns]
    stacked = concat(pieces, ignore_index=True)

    # sort, then remove duplicate entries
    unique_names = stacked.sort_values().unique()

    return unique_names.tolist()
def is_affiliations(self):
    """``True`` when affiliations are defined for authors.

    Note:
        Only the placeholder case is tested: a table with a single row
        whose affiliation is the empty string. To check that the
        affiliation is defined for all authors, use the method
        :func:`is_affiliation_for_all`.

    Returns:
        bool:

    """
    table = self["authors"]
    is_placeholder = len(table) == 1 and table.affiliation.iloc[0] == ""

    return not is_placeholder
def is_affiliation_for_all(self):
    """``True`` when an affiliation is defined for every author.

    An affiliation counts as undefined when it is the empty string or a
    missing value (NaN / None).

    Returns:
        bool:

    """
    affiliation = self["authors"].affiliation

    # isna() detects every flavour of missing value and avoids relying
    # on the numpy ``NaN`` alias, which is gone in NumPy 2.0
    undefined = affiliation.isna() | (affiliation == "")

    return not undefined.any()
def is_authors(self):
    """``True`` when authors are defined.

    The authors table must expose the ``first_name``, ``full_name`` and
    ``last_name`` columns and must not be reduced to a single row with
    an empty ``full_name``.

    Returns:
        bool:

    """
    table = self["authors"]
    required = {"first_name", "full_name", "last_name"}

    has_columns = len(table.columns.intersection(required)) == 3
    if not has_columns:
        return False

    # a lone row with an empty full name is the "no authors" placeholder
    is_placeholder = len(table) == 1 and table.full_name.iloc[0] == ""

    return not is_placeholder
def is_published(self):
    """``True`` if the record is published and contains a full set
    of publication information (title, volume, year and pagination).

    Returns:
        bool:

    """
    # NOTE
    # * the publication table can contain one or more rows due to erratum.
    # * assume that the first row is the oldest one and corresponds to
    #   the first publication
    # * the row contains empty strings when the record is not published.
    # * iloc[0] returns a series where the index holds the column names
    #
    first_row = self["publication_info"].iloc[0]

    # keep the name of the fields which are really filled
    filled = first_row.replace("", np.nan).dropna().index
    found = filled.intersection(PAPER_REFERENCE_KEYS)

    return len(found) == 4
def is_with_erratum(self):
    """``True`` when the record contains erratum data, namely when the
    publication table holds more than one row.

    Returns:
        bool

    """
    return len(self["publication_info"]) > 1
def paper_editor(self):
    """The abbreviated version of the review, *e.g* Phys Lett B.

    Returns:
        str:
            the ``title`` found in the first row of the publication
            table, or the empty string when that column does not exist.

    """
    info = self["publication_info"]

    if "title" not in info:
        return ""

    return info["title"].iloc[0]
def paper_pages(self):
    """The page number / range when the record is published in a review.

    Returns:
        str:
            * the ``pagination`` found in the first row of the
              publication table, *e.g.* "45-67" or "234".
            * Empty string when that column does not exist.

    """
    info = self["publication_info"]

    if "pagination" not in info:
        return ""

    return info["pagination"].iloc[0]
def paper_reference(self):
    """The full reference for a publication published in a review.

    Returns:
        str:
            * title, volume, year and pagination of the first row of the
              publication table separated by a space,
              *e.g.* "Phys Lett B 456 2010 5-6".
            * The string is empty when one of these four columns is
              missing.

    """
    info = self["publication_info"]
    fields = ("title", "volume", "year", "pagination")

    available = info.columns.intersection(set(fields))
    if len(available) != 4:
        return ""

    first = info.iloc[0]
    reference = " ".join([first[name] for name in fields])

    return reference.strip()
def
paper_url
(
self
):
"""The URL of the preprint.
...
...
@@ -651,28 +300,6 @@ class RecordPubli(Record):
return
""
def paper_volume(self):
    """The volume number when the record is published in a review.

    Returns:
        str:
            the ``volume`` found in the first row of the publication
            table, or the empty string when that column does not exist.

    """
    info = self["publication_info"]

    if "volume" not in info:
        return ""

    return info["volume"].iloc[0]
def paper_year(self):
    """The year of the publication.

    Returns:
        str:
            the ``year`` found in the first row of the publication
            table, or the empty string when that column does not exist.

    """
    info = self["publication_info"]

    if "year" not in info:
        return ""

    return info["year"].iloc[0]
def
preprint_number
(
self
):
"""The ArXiv preprint number.
...
...
@@ -693,68 +320,6 @@ class RecordPubli(Record):
return
""
def reformat_authors(self, fmt="Last, First"):
    """Reformat names of authors.

    The ``fmt_name`` column of the authors table is rewritten in place.
    The default formatting for cds/invenio record is ``Last, First``.

    Args:
        fmt (str):
            define the new format for author names.
            Possible values are ``First, Last``, ``F. Last``, ``Last``,
            ``Last, First`` and ``Last F.``.

    Raises:
        RecordException:
            the argument ``fmt`` is not valid.

    """
    if fmt not in AUTHOR_FORMATS:
        raise RecordException(MSG_INVALID_FMT)

    # nothing to do when the names already use the requested format
    if fmt == self._last_fmt_author:
        return

    self._last_fmt_author = fmt
    df = self["authors"]

    # ....................................................................
    #
    # Compute initial for the first name
    #
    # The initials live in a local series, not in a temporary column of
    # the table. Removing such a column with ``df = df.drop(...)`` only
    # rebinds the local name and leaves the column in the stored table.
    #
    if fmt in ("F. Last", "Last F."):
        initial = df.first_name.fillna("").apply(to_initial)

    # ....................................................................
    #
    # Format
    #
    if fmt == "Last, First":
        df["fmt_name"] = df.last_name + ", " + df.first_name

    elif fmt == "First, Last":
        df["fmt_name"] = df.first_name + ", " + df.last_name

    elif fmt == "F. Last":
        df["fmt_name"] = initial + " " + df.last_name

    elif fmt == "Last":
        df["fmt_name"] = df.last_name

    elif fmt == "Last F.":
        df["fmt_name"] = df.last_name + " " + initial
def
report_number
(
self
):
"""The report number(s) associated to the publication.
...
...
@@ -776,7 +341,7 @@ class RecordPubli(Record):
return
", "
.
join
(
sorted
(
li
))
# INSPIRE
#
OLD.
INSPIRE
if
"primary_report_number"
in
self
:
data
=
self
[
"primary_report_number"
]
...
...
tests/basis/test_04_RecordPubli.py
View file @
14d0602f
...
...
@@ -23,6 +23,7 @@ import pandas as pd
import
pytest
from
store_tools
import
load_record
from
store_tools.recordpubli
import
RecordPubli
@
pytest
.
fixture
(
scope
=
"module"
)
...
...
@@ -30,7 +31,11 @@ def record():
return
load_record
(
"cds.cern.ch"
,
1951625
)
def
test_constructor_cds_04001
(
record
):
def
test_upcast_cds_04001
(
record
):
assert
isinstance
(
record
,
RecordPubli
)
def
test_constructor_cds_04002
(
record
):
"""test the method _process_authors and _process_publication_info.
"""
...
...
@@ -63,11 +68,11 @@ def test_constructor_cds_04001(record):
#
# Section devoted to authors
#
def
test_is_authors_cds_0400
2
(
record
):
def
test_is_authors_cds_040
1
0
(
record
):
assert
record
.
is_authors
()
def
test_authors_as_list_cds_040
03
(
record
):
def
test_authors_as_list_cds_040
11
(
record
):
authors
=
record
.
authors_as_list
()
assert
len
(
authors
)
==
record
[
"number_of_authors"
]
...
...
@@ -77,16 +82,16 @@ def test_authors_as_list_cds_04003(record):
assert
authors
[
-
1
]
==
"Zvyagin, Alexander"
def
test_first_author_cds_040
04
(
record
):
def
test_first_author_cds_040
12
(
record
):
assert
record
.
first_author
()
==
"Aaij, Roel"
def
test_find_authors_cds_040
05
(
record
):
def
test_find_authors_cds_040
13
(
record
):
assert
record
.
find_authors
(
"Leo"
)
==
\
"Beaucourt, Leo, Kravchuk, Leonid, Leo, Sabato"
def
test_reformat_author_cds_040
06
(
record
):
def
test_reformat_author_cds_040
14
(
record
):
record
.
reformat_authors
(
"F. Last"
)
authors
=
record
.
authors_as_list
()
...
...
@@ -105,12 +110,12 @@ def test_reformat_author_cds_04006(record):
#
# Section devoted to affiliation
#
def
test_is_affiliations_cds_0400
7
(
record
):
def
test_is_affiliations_cds_040
2
0
(
record
):
assert
record
.
is_affiliations
()
assert
record
.
is_affiliation_for_all
()
def
test_institutes_cds_040
08
(
record
):
def
test_institutes_cds_040
21
(
record
):
institutes
=
record
.
institutes
()
...
...
@@ -119,7 +124,7 @@ def test_institutes_cds_04008(record):
assert
institutes
[
-
1
]
==
"Zurich U."
def
test_find_affiliation_cds_040
09
(
record
):
def
test_find_affiliation_cds_040
22
(
record
):
affiliation
=
record
.
find_affiliation
(
r
"Marseille, CPPM|CPPM, Marseille"
)
assert
affiliation
==
"Marseille, CPPM"
...
...
@@ -128,11 +133,11 @@ def test_find_affiliation_cds_04009(record):
#
# Section devoted to authors and institutes
#
def
test_first_author_institutes_cds_040
10
(
record
):
def
test_first_author_institutes_cds_040
23
(
record
):
assert
record
.
first_author_institutes
()
==
"NIKHEF, Amsterdam"
def
test_find_authors_by_affiliation_cds_040
11
(
record
):
def
test_find_authors_by_affiliation_cds_040
24
(
record
):
pattern
=
"CPPM, Marseille|Marseille, CPPM"
authors
=
record
.
find_authors_by_affiliation
(
pattern
,
sep
=
"|"
)
...
...
@@ -147,52 +152,52 @@ def test_find_authors_by_affiliation_cds_04011(record):
#
# Other methods
#
def
test_collaboration_cds_040
12
(
record
):
def
test_collaboration_cds_040
30
(
record
):
assert
record
.
collaboration
()
==
"LHCb Collaboration"
def
test_is_published_cds_040
1
3
(
record
):
def
test_is_published_cds_0403
1
(
record
):
assert
record
.
is_published
()
def
test_is_with_erratum_cds_040
14
(
record
):
def
test_is_with_erratum_cds_040
32
(
record
):
assert
not
record
.
is_with_erratum
()
def
test_paper_info_cds_040
15
(
record
):
def
test_paper_info_cds_040
33
(
record
):
assert
record
.
paper_editor
()
==
"JINST"
assert
record
.
paper_pages
()
==
"P12005"
assert
record
.
paper_volume
()
==
"9"
assert
record
.
paper_year
()
==
"2014"
def
test_paper_reference_cds_040
16
(
record
):
def
test_paper_reference_cds_040
34
(
record
):
assert
record
.
paper_reference
()
==
"JINST 9 2014 P12005"
def
test_preprint_number_cds_040
17
(
record
):
def
test_preprint_number_cds_040
35
(
record
):
assert
record
.
preprint_number
()
==
"arXiv:1410.0149"
def
test_paper_url_cds_040
18
(
record
):
def
test_paper_url_cds_040
36
(
record
):
assert
record
.
paper_url
()
==
\
"http://cds.cern.ch/record/1951625/files/arXiv:1410.0149.pdf"