Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
b7ca0e1b
Commit
b7ca0e1b
authored
May 31, 2017
by
LE GAC Renaud
Browse files
Update RecordPubli and RecordThesis in order to include author formatting/sorting.
parent
8a8147e4
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
265 additions
and
52 deletions
+265
-52
modules/invenio_tools/recordpubli.py
modules/invenio_tools/recordpubli.py
+244
-42
modules/invenio_tools/recordthesis.py
modules/invenio_tools/recordthesis.py
+17
-6
tests/invenio_tools/Record/test_acl_cds1951625.py
tests/invenio_tools/Record/test_acl_cds1951625.py
+2
-2
tests/invenio_tools/Record/test_phd_cds1632177.py
tests/invenio_tools/Record/test_phd_cds1632177.py
+1
-1
tests/invenio_tools/Record/test_record_non_conformities.py
tests/invenio_tools/Record/test_record_non_conformities.py
+1
-1
No files found.
modules/invenio_tools/recordpubli.py
View file @
b7ca0e1b
...
...
@@ -9,12 +9,22 @@ from base import (ARXIV,
ARXIV_PDF
,
REG_ARXIV_NUMBER
,
REG_YEAR
)
from
exception
import
RecordException
from
filters
import
CLEAN_COLLABORATION
from
numpy
import
NaN
from
pandas
import
concat
,
DataFrame
from
plugin_dbui
import
as_list
,
CLEAN_SPACES
from
record
import
Record
# Formats accepted for author names.
# RecordPubli.reformat_authors raises a RecordException for any other value.
AUTHOR_FORMATS = ["First, Last",
                  "F. Last",
                  "Last",
                  "Last, First",
                  "Last F."]
# Decode publication reference:
# Phys. Rev. Lett. 113, 032001 (2014)
# Eur. Phys. J. C (2014) 74:2883
...
...
@@ -22,9 +32,38 @@ _ref1 = r"(?P<p>[A-Za-z\. ]+) +(?P<v>\d+),? +(?P<c>[\d-]+) +\((?P<y>[\d]+)\)"
_ref2
=
r
"(?P<p>[A-Za-z\. ]+) +\((?P<y>\d+)\) +(?P<v>[\d]+):(?P<c>[\d-]+)"
DECODE_REF
=
[
re
.
compile
(
_ref1
),
re
.
compile
(
_ref2
)]
# Message of the RecordException raised when an author format is not
# one of AUTHOR_FORMATS
MSG_INVALID_FMT = "Invalid format for author"

# The MARC12 keys containing paper reference
PAPER_REFERENCE_KEYS = set(["c", "p", "v", "y"])

# Regular expression applied to a first name. It captures three groups:
#   1. the first part of the first name
#   2. the separator "-" (optional)
#   3. the second part of the first name (optional)
# A trailing dot after each part is tolerated, e.g. "J.-P." or "A. A."
# NOTE(review): the additional binding ``initial`` looks unintended;
# confirm that nothing imports it before removing it.
REG_INITIAL = initial = r"^(\w+)\.?(\-)* *(\w+)*\.?$"
def to_initial(x, y, z):
    """Help function to extract initial from a first name split in x, y and z:

        Albert          (x="Albert", y="", z="")
        Antonio Augusto (x="Antonio", y="", z="Augusto")
        Jean-Pierre     (x="Jean", y="-", z="Pierre")

    Args:
        x (str): first part
        y (str): separator
        z (str): second part

    Returns:
        str:
            * ``"A."`` when there is only one part.
            * ``"A. A."`` when the two parts are not separated by ``y``.
            * ``"J.-P."`` when the two parts are separated by ``y``.
            * Empty string when both parts are empty, which happens when
              the first name is missing or cannot be decoded.

    """
    # protection -- first name not defined.
    # Avoid returning a lonely "." which would end up in the formatted name.
    if x == "" and z == "":
        return ""

    # slicing, rather than indexing, is safe on empty strings
    first = x[0:1]

    if z == "":
        return "%s." % first

    second = z[0:1]

    if y == "":
        return "%s. %s." % (first, second)

    return "%s.%s%s." % (first, y[0:1], second)
def to_str(x):
    """Convert a list of strings into a single string.

    Args:
        x (list or object): the value to be converted.

    Returns:
        str or object:
            * the elements of the list joined by ``|`` when *x* is a list.
            * *x* itself, untouched, otherwise.

    """
    if not isinstance(x, list):
        return x

    return "|".join(x)
...
...
@@ -58,6 +97,8 @@ class RecordPubli(Record):
"""
def
__init__
(
self
,
*
args
):
self
.
_last_fmt_author
=
"Last, First"
Record
.
__init__
(
self
,
*
args
)
self
.
_process_authors
()
...
...
@@ -67,14 +108,29 @@ class RecordPubli(Record):
* Keep the subfield "a", "u" and "e" (phd thesis)
* Convert list of affiliation in string separated by "|"
The a
uthor a
re spread over the
100 and 700
field
.
A
uthor
s
a
nd their affiliations are defined in the fields
100 and 700.
The method deals with cases where:
* the first author is defined in 100 but it is not in 700
* first author is not defined in 100 but in 700
* thesis in which 700 contains names of director
Authors and their affiliations are stored in DataFrame with the
following structure:
+------------+---------------------------+
| column | |
+------------+---------------------------+
| a | author name (Last, First) |
| u | affiliation(s) |
| first_name | first name |
| last_name | family name |
| fmt_name | formatted name |
+------------+---------------------------+
"""
columns4names
=
[
"last_name"
,
"first_name"
]
# ....................................................................
#
# Instantiate DataFrame for field 100 and 700
...
...
@@ -91,12 +147,20 @@ class RecordPubli(Record):
df
=
DataFrame
(
data
)
columns
=
df
.
columns
#
only
keep columns:
# keep columns:
# - "a": author name
# - "e": phd director (equal to "dir.")
# - "u": affiliation(s)
df
=
df
.
drop
(
columns
.
difference
([
"a"
,
"e"
,
"u"
]),
axis
=
"columns"
)
# add columns first_name, last_name and fmt_name
# warning: in some cases split creates more than 2 columns
df
[
columns4names
]
=
df
.
a
.
str
.
split
(
u
","
,
expand
=
True
)[[
0
,
1
]]
df
[
"fmt_name"
]
=
df
.
a
df
.
first_name
=
df
.
first_name
.
str
.
strip
()
df
.
last_name
=
df
.
last_name
.
str
.
strip
()
# protection -- affiliation not defined
if
"a"
in
columns
and
"u"
not
in
columns
:
dfu
=
DataFrame
([
""
]
*
len
(
df
),
columns
=
[
"u"
])
...
...
@@ -105,8 +169,7 @@ class RecordPubli(Record):
# protection -- missing affiliation
df
.
u
=
df
.
u
.
fillna
(
""
)
# convert list of affiliation to string
# in which values are separated by |
# convert list of affiliation to string separated by |
df
.
u
=
df
.
u
.
apply
(
lambda
x
:
to_str
(
x
))
di
[
key
]
=
df
...
...
@@ -116,15 +179,33 @@ class RecordPubli(Record):
# ....................................................................
#
# protection -- more than one first author
# the case with duplicate author name (build affiliation)
# Protection -- more than one first author
#
# treat the case with duplicate author name
# by building the affiliation string
#
if
d100
is
not
None
and
len
(
d100
)
>
1
:
grouped
=
d100
.
groupby
([
"a"
],
sort
=
False
)
if
len
(
grouped
)
==
1
:
for
name
,
group
in
grouped
:
li
=
[
el
for
el
in
group
.
u
if
el
not
in
(
""
,
NaN
,
None
)]
d100
=
DataFrame
({
"a"
:
[
name
],
"u"
:
[
"|"
.
join
(
li
)]})
last_name
,
first_name
=
name
.
split
(
u
","
)
affiliations
=
\
[
el
for
el
in
group
.
u
if
el
not
in
(
""
,
NaN
,
None
)]
di
=
{
"a"
:
[
name
],
"first_name"
:
[
first_name
.
strip
()],
"fmt_name"
:
[
name
],
"last_name"
:
[
last_name
.
strip
()],
"u"
:
[
"|"
.
join
(
affiliations
)]}
d100
=
DataFrame
(
di
)
# NOTE
# The case with more than one first author is rare
# It will be detected by the CheckAndFix procedure when it is
# not fixed by the above protection
# ....................................................................
#
...
...
@@ -132,6 +213,7 @@ class RecordPubli(Record):
# deal with cases where the first author is defined in 100
# but not in 700, first author is defined in 100 and in 700
# or no author in 100
#
if
d100
is
not
None
and
d700
is
not
None
:
if
d100
.
a
.
iloc
[
0
]
!=
d700
.
a
.
iloc
[
0
]:
if
len
(
d100
)
==
1
:
...
...
@@ -146,39 +228,54 @@ class RecordPubli(Record):
else
:
d100
=
d700
=
DataFrame
({
"a"
:
[
""
],
"u"
:
[
""
]})
# ....................................................................
#
# Update
#
self
[
u
"100"
]
=
d100
self
[
u
"700"
]
=
d700
def
authors
(
self
,
cmpFct
=
Non
e
):
def
authors
(
self
,
sep
=
u
", "
,
sort
=
Fals
e
):
"""The author(s) signing the publication.
Args:
cmpFct (reference): function to compare author names.
The comparison function takes two items and returns -1, 0, or 1
depending on whether the first argument is considered smaller
than, equal to, or larger than the second one.
sep (unicode):
string separating author names. The default is the comma.
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
unicode:
* Author names are separated by comma.
* Author are sorted according to the function *cmpFct*.
* Author names are separated by the ``sep`` argument.
* The string is empty when there is no authors.
"""
li
=
self
.
authors_as_list
()
if
cmpFct
:
li
.
sort
(
key
=
cmpFct
)
return
u
", "
.
join
(
li
)
li
=
self
.
authors_as_list
(
sort
=
sort
)
return
sep
.
join
(
li
)
def
authors_as_list
(
self
):
def
authors_as_list
(
self
,
sort
=
False
):
"""The list of author(s) signing the publication.
Args:
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
list: the list is empty when authors are not defined.
"""
li
=
self
[
u
"700"
].
a
.
tolist
()
if
sort
:
li
=
(
self
[
u
"700"
][[
"last_name"
,
"fmt_name"
]]
.
sort_values
(
by
=
"last_name"
)
.
fmt_name
.
tolist
())
else
:
li
=
(
self
[
u
"700"
].
fmt_name
.
sort_index
()
.
tolist
())
if
len
(
li
)
==
1
and
li
[
0
]
==
""
:
li
=
[]
...
...
@@ -201,8 +298,9 @@ class RecordPubli(Record):
"""Find affiliation matching the regular expression *pattern*.
Args:
pattern (unicode): regular expression defining the
affiliation keys.
pattern (unicode):
regular expression defining the affiliation keys.
It should not contain groups.
Returns:
unicode:
...
...
@@ -213,39 +311,63 @@ class RecordPubli(Record):
# modify the pattern to capture group
pattern
=
"(%s)"
%
pattern
series
=
self
[
u
"700"
].
u
.
str
.
extract
(
pattern
,
expand
=
False
)
.
dropna
()
return
(
series
.
iloc
[
0
]
if
len
(
series
)
>
0
else
u
""
)
data
=
(
self
[
u
"700"
].
u
.
str
.
extract
(
pattern
,
expand
=
False
)
.
dropna
()
)
def
find_authors
(
self
,
pattern
):
return
(
data
.
iloc
[
0
]
if
len
(
data
)
>
0
else
u
""
)
def
find_authors
(
self
,
pattern
,
sep
=
u
", "
,
sort
=
False
):
"""Find authors containing the regular expression *pattern*.
The search is performed on the formatted name.
Args:
pattern (unicode): regular expression defining the author name(s).
pattern (unicode):
regular expression defining the author name(s).
sep (unicode):
string separating author names. The default is the comma.
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
unicode:
* Author names are separated by ``
|``
.
* Author names are separated by ``
sep`` argument
.
* The string is empty when nothing is found.
"""
df
=
self
[
u
"700"
]
query
=
df
.
a
.
str
.
contains
(
pattern
)
df
=
df
.
loc
[
query
,
[
"a"
]]
query
=
df
.
fmt_name
.
str
.
contains
(
pattern
)
if
sort
:
data
=
(
df
.
loc
[
query
,
[
"last_name"
,
"fmt_name"
]]
.
sort_values
(
by
=
"last_name"
)
.
fmt_name
)
return
(
u
""
if
len
(
df
)
==
0
else
u
"|"
.
join
(
df
.
a
))
else
:
data
=
(
df
.
loc
[
query
,
[
"fmt_name"
]]
.
sort_index
()
.
fmt_name
)
return
(
u
""
if
len
(
data
)
==
0
else
sep
.
join
(
data
))
def
find_authors_by_affiliation
(
self
,
pattern
):
def
find_authors_by_affiliation
(
self
,
pattern
,
sep
=
u
", "
,
sort
=
False
):
"""Find authors belonging to a given institute(s) defined by a regular
expression.
Args:
pattern (unicode): regular expression defining the affiliation keys
pattern (unicode):
regular expression defining the affiliation keys
for the institute(s).
sep (unicode):
string separating author names. The default is the comma.
sort (bool):
sort authors by family name when true otherwise use the
order of authors at the creation of the record
Returns:
unicode:
* Author names are separated by
``|``
.
* Author names are separated by
the ``sep`` argument
.
* Author are sorted according to their family name.
* Empty string when authors are not found.
...
...
@@ -253,21 +375,31 @@ class RecordPubli(Record):
df
=
self
[
u
"700"
]
query
=
df
.
u
.
str
.
contains
(
pattern
)
df
=
df
.
loc
[
query
,
[
"a"
]]
df
.
a
=
df
.
a
.
str
.
encode
(
"utf-8"
).
sort_values
()
return
(
"|"
.
join
(
df
.
a
)
if
len
(
df
)
>
0
else
""
).
decode
(
"utf-8"
)
if
sort
:
data
=
(
df
.
loc
[
query
,
[
"last_name"
,
"fmt_name"
]]
.
sort_values
(
by
=
"last_name"
)
.
fmt_name
)
else
:
data
=
(
df
.
loc
[
query
,
[
"fmt_name"
]]
.
sort_index
()
.
fmt_name
)
data
=
data
.
str
.
encode
(
"utf-8"
)
sep
=
sep
.
encode
(
"utf-8"
)
return
(
sep
.
join
(
data
)
if
len
(
data
)
>
0
else
""
).
decode
(
"utf-8"
)
def
first_author
(
self
):
"""The name of the first author.
Returns:
unicode
or list
:
unicode:
- Empty string when the first author is not defined.
- List of name when there is more than one.
"""
return
self
[
u
"700"
].
a
.
iloc
[
0
]
return
self
[
u
"700"
].
fmt_name
.
iloc
[
0
]
def
first_author_institutes
(
self
):
"""The institute(s) associated to the first author.
...
...
@@ -312,7 +444,7 @@ class RecordPubli(Record):
"""``True`` when affiliations are defined for authors.
Note:
This is a fast algorithm checking
only first and last author
s.
This is a fast algorithm checking
that the ``u`` field exist
s.
To check that the affiliation is defined for all authors,
uses the method :func:`is_affiliation_for_all`.
...
...
@@ -524,6 +656,76 @@ class RecordPubli(Record):
return
val
return
u
""
def reformat_authors(self, fmt="Last, First"):
    """Reformat names of authors.

    The default formatting for cds/invenio record is ``Last, First``.
    The formatted names are written in the column ``fmt_name`` of the
    DataFrames stored under the keys ``100`` and ``700``.
    The other columns are left untouched.

    Args:
        fmt (str):
            define the new format for author names.
            Possible values are "First, Last", "F. Last", "Last",
            "Last, First" and "Last F."

    Raises:
        RecordException: if fmt is not valid.

    """
    if fmt not in AUTHOR_FORMATS:
        raise RecordException(MSG_INVALID_FMT)

    # nothing to do when the requested format is already in place
    if fmt == self._last_fmt_author:
        return

    use_initial = fmt in ("F. Last", "Last F.")

    # the same processing is applied to both fields in order to
    # guarantee that they are always formatted in the same way
    for df in (self[u"100"], self[u"700"]):

        # The original name is already "Last, First".
        # protection -- record without authors, for which the DataFrame
        # only contains the columns "a" and "u"
        if fmt == "Last, First" or "first_name" not in df.columns:
            df["fmt_name"] = df.a
            continue

        # ................................................................
        #
        # Compute initial for the first name
        #
        # The initials are kept in a local series, sharing the index of
        # the DataFrame, so that no temporary column has to be removed
        # from the stored DataFrame afterwards.
        #
        if use_initial:
            dfm = (df.first_name.str.extract(REG_INITIAL, expand=True)
                     .fillna(""))

            initial = dfm.apply(lambda x: to_initial(x[0], x[1], x[2]),
                                axis="columns")

        # ................................................................
        #
        # Format
        #
        if fmt == "First, Last":
            df["fmt_name"] = df.first_name + ", " + df.last_name

        elif fmt == "F. Last":
            df["fmt_name"] = initial + " " + df.last_name

        elif fmt == "Last":
            df["fmt_name"] = df.last_name

        elif fmt == "Last F.":
            df["fmt_name"] = df.last_name + " " + initial

    # remember the format only once it has been applied to both fields
    self._last_fmt_author = fmt
def
report_number
(
self
):
"""The report number(s) associated to the publication.
...
...
modules/invenio_tools/recordthesis.py
View file @
b7ca0e1b
...
...
@@ -30,9 +30,12 @@ class RecordThesis(RecordPubli):
+-----------------------+---------+----------+
"""
def
authors_as_list
(
self
):
def
authors_as_list
(
self
,
sort
=
False
):
"""The list of author(s) signing the publication.
Args:
sort (bool): sort authors by family name when true.
Returns:
list: the list is empty when authors are not defined.
...
...
@@ -42,9 +45,17 @@ class RecordThesis(RecordPubli):
df
=
self
[
u
"700"
]
query
=
df
.
e
!=
THESIS_DIR
df
=
df
.
loc
[
query
]
li
=
df
.
a
.
tolist
()
if
sort
:
li
=
(
df
.
loc
[
query
,
[
"last_name"
,
"fmt_name"
]]
.
sort_values
(
by
=
"last_name"
)
.
fmt_name
.
tolist
())
else
:
li
=
(
df
.
loc
[
query
].
fmt_name
.
sort_index
()
.
tolist
())
if
len
(
li
)
==
1
and
li
[
0
]
==
""
:
li
=
[]
...
...
@@ -78,12 +89,12 @@ class RecordThesis(RecordPubli):
"""
return
self
.
_get
(
u
"502"
,
"a"
)
def
these_directors
(
self
):
def
these_directors
(
self
,
sep
=
u
", "
):
"""The list of director(s)
Returns:
unicode:
* Names are separated by
``|``
.
* Names are separated by
the ``sep`` argument
.
* Empty string when it is not defined.
"""
...
...
@@ -94,7 +105,7 @@ class RecordThesis(RecordPubli):
query
=
df
.
e
==
THESIS_DIR
df
=
df
.
loc
[
query
]
return
(
u
"|"
.
join
(
df
.
a
)
if
len
(
df
)
>
0
else
u
""
)
return
(
sep
.
join
(
df
.
fmt_name
)
if
len
(
df
)
>
0
else
u
""
)
def
these_town
(
self
):
"""The town where the thesis took place.
...
...
tests/invenio_tools/Record/test_acl_cds1951625.py
View file @
b7ca0e1b
...
...
@@ -65,12 +65,12 @@ def test_find_affiliation(record):
def
test_find_authors
(
record
):
assert
record
.
find_authors
(
"Leo"
)
==
u
"Beaucourt, Leo
|
Kravchuk, Leonid
|
Leo, Sabato"
assert
record
.
find_authors
(
"Leo"
)
==
u
"Beaucourt, Leo
,
Kravchuk, Leonid
,
Leo, Sabato"
def
test_find_authors_by_affiliation
(
record
):
pattern
=
"CPPM, Marseille|Marseille, CPPM"
authors
=
record
.
find_authors_by_affiliation
(
pattern
)
authors
=
record
.
find_authors_by_affiliation
(
pattern
,
sep
=
u
"|"
)
assert
authors
==
u
"Akar, Simon|Aslanides, Elie|Cogan, Julien|"
\
u
"Kanso, Walaa|Le Gac, Renaud|Leroy, Olivier|"
\
...
...
tests/invenio_tools/Record/test_phd_cds1632177.py
View file @
b7ca0e1b
...
...
@@ -104,7 +104,7 @@ def test_these_level(record):
def
test_these_directors
(
record
):
assert
record
.
these_directors
()
==
"He, Mao
|
Monnier, Emmanuel
|
Zhu, Chengguang"
assert
record
.
these_directors
()
==
"He, Mao
,
Monnier, Emmanuel
,
Zhu, Chengguang"
def
test_these_town
(
record
):
...
...
tests/invenio_tools/Record/test_record_non_conformities.py
View file @
b7ca0e1b
...
...
@@ -20,7 +20,7 @@ def test_protection_find_authors_by_affiliation():
"""The affiliation is not defined for one author -- skip it."""
record
=
load_record
(
'cds.cern.ch'
,
2012165
)
pattern
=
"Marseille, CPPM|CPPM, Marseille"
authors
=
record
.
find_authors_by_affiliation
(
pattern
)
authors
=
record
.
find_authors_by_affiliation
(
pattern
,
sep
=
u
"|"
)
assert
authors
==
u
"Akar, Simon|Aslanides, Elie|Cogan, Julien|"
\
u
"Kanso, Walaa|Le Gac, Renaud|Leroy, Olivier|"
\
u
"Mancinelli, Giampiero|Mordà, Alessandro|"
\
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment