Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
fe4859a0
Commit
fe4859a0
authored
May 30, 2017
by
LE GAC Renaud
Browse files
Update RecordPubli and RecordThesis to implement DataFrame for author/affiliation.
parent
28d07ec0
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
177 additions
and
76 deletions
+177
-76
modules/invenio_tools/recordpubli.py
modules/invenio_tools/recordpubli.py
+159
-69
modules/invenio_tools/recordthesis.py
modules/invenio_tools/recordthesis.py
+18
-7
No files found.
modules/invenio_tools/recordpubli.py
View file @
fe4859a0
...
...
@@ -15,6 +15,8 @@ from iterauthors import (iter_author_affiliations,
iter_author_items
,
iter_author_names
)
from
itertools
import
ifilter
,
imap
from
numpy
import
NaN
from
pandas
import
concat
,
DataFrame
,
merge
from
plugin_dbui
import
as_list
,
CLEAN_SPACES
from
record
import
Record
...
...
@@ -29,6 +31,10 @@ DECODE_REF = [re.compile(_ref1), re.compile(_ref2)]
PAPER_REFERENCE_KEYS
=
set
([
"c"
,
"p"
,
"v"
,
"y"
])
def to_str(x):
    """Flatten a list of strings into a single ``|``-separated string.

    Args:
        x: a list of strings, or any other value.

    Returns:
        The items of *x* joined by ``|`` when *x* is a list,
        otherwise *x* itself, unchanged.

    """
    # anything which is not a list (string, NaN, None, ...) goes through as is
    if not isinstance(x, list):
        return x

    return "|".join(x)
class
RecordPubli
(
Record
):
"""The MARC record describing a publication.
Usual publications are article, preprint, proceeding, report and talk.
...
...
@@ -55,6 +61,99 @@ class RecordPubli(Record):
+-----------------------+---------+----------+
"""
def __init__(self, *args):
    """Build the record and convert its author fields.

    Args:
        *args: positional arguments forwarded unchanged to
            ``Record.__init__``.

    """
    Record.__init__(self, *args)

    # has to run once the record is populated since it reads
    # the author fields and replaces them by DataFrames
    self._process_authors()
def _process_authors(self):
    """Convert the author fields 100 and 700 into DataFrames.

    Each author field found in the record is turned into a DataFrame
    in which:

        * only the subfields "a" (author name), "e" (phd director,
          equal to "dir.") and "u" (affiliations) are kept;
        * the column "u" always exists, missing values being replaced
          by an empty string;
        * a list of affiliations is converted into a string in which
          values are separated by "|".

    The authors are spread over the 100 and 700 fields.
    The method deals with cases where:

        * the first author is defined in 100 but it is not in 700
        * first author is not defined in 100 but in 700
        * thesis in which 700 contains names of director

    On return, ``self[u"100"]`` and ``self[u"700"]`` are both DataFrames.
    When the record contains no author field, both keys refer to the
    same single-row DataFrame in which "a" and "u" are empty strings.

    """
    # ....................................................................
    #
    # Instantiate DataFrame for field 100 and 700
    #
    frames = {u"100": None, u"700": None}

    # explicit tuple rather than dict iteration: the order is fixed
    # and the loop does not depend on the python 2 only iterkeys()
    for key in (u"100", u"700"):
        if key not in self:
            continue

        data = self[key]
        if not isinstance(data, list):
            data = [data]

        df = DataFrame(data)

        # only keep columns:
        #    - "a": author name
        #    - "e": phd director (equal to "dir.")
        #    - "u": affiliation(s)
        df = df.drop(df.columns.difference(["a", "e", "u"]),
                     axis="columns")

        # protection -- affiliation not defined
        # the column is added whatever the other columns are, since
        # the statements below access it unconditionally
        if "u" not in df.columns:
            df["u"] = ""

        # protection -- missing affiliation
        # then convert list of affiliation to string
        # in which values are separated by |
        df["u"] = df["u"].fillna("").apply(to_str)

        frames[key] = df

    # alias
    d100, d700 = frames[u"100"], frames[u"700"]

    # ....................................................................
    #
    # protection -- more than one first author
    # the case with duplicate author name (build affiliation)
    #
    if d100 is not None and len(d100) > 1:

        # group by a scalar key: with a list of length one, recent
        # versions of pandas return the group name as a 1-tuple
        grouped = d100.groupby("a", sort=False)

        if len(grouped) == 1:
            name, group = next(iter(grouped))

            # skip undefined affiliations
            # NaN is detected with el == el since it never equals itself
            affiliations = [el for el in group.u
                            if el not in ("", None) and el == el]

            d100 = DataFrame({"a": [name],
                              "u": ["|".join(affiliations)]})

    # ....................................................................
    #
    # the author are spread over the 100 and 700 field.
    # deal with cases where the first author is defined in 100
    # but not in 700, first author is defined in 100 and in 700
    # or no author in 100
    #
    if d100 is not None and d700 is not None:
        # prepend the first author when 700 does not start with it
        if d100.a.iloc[0] != d700.a.iloc[0] and len(d100) == 1:
            d700 = concat([d100, d700], ignore_index=True)

    elif d100 is None and d700 is not None:
        # the first author is the first row of 700
        d100 = DataFrame(d700.iloc[0]).transpose()

    elif d700 is None and d100 is not None:
        d700 = d100

    else:
        # no author at all: single row with empty name and affiliation
        d100 = d700 = DataFrame({"a": [""], "u": [""]})

    self[u"100"] = d100
    self[u"700"] = d700
def
authors
(
self
,
cmpFct
=
None
):
"""The author(s) signing the publication.
...
...
@@ -72,6 +171,7 @@ class RecordPubli(Record):
"""
li
=
self
.
authors_as_list
()
if
cmpFct
:
li
.
sort
(
key
=
cmpFct
)
return
u
", "
.
join
(
li
)
...
...
@@ -83,7 +183,12 @@ class RecordPubli(Record):
list: the list is empty when authors are not defined.
"""
return
list
(
iter_author_names
(
self
))
li
=
self
[
u
"700"
].
a
.
tolist
()
if
len
(
li
)
==
1
and
li
[
0
]
==
""
:
li
=
[]
return
li
def
collaboration
(
self
):
"""The collaboration(s) signing the publication.
...
...
@@ -100,9 +205,6 @@ class RecordPubli(Record):
def
find_affiliation
(
self
,
pattern
):
"""Find affiliation matching the regular expression *pattern*.
Affiliation keys are obtained by concatenating the "u" and "v"
keys of the author field 100 and 700.
Args:
pattern (unicode): regular expression defining the
affiliation keys.
...
...
@@ -113,74 +215,53 @@ class RecordPubli(Record):
- empty string when nothing is found.
"""
regex
=
re
.
compile
(
pattern
)
# modify the pattern to capture group
pattern
=
"(%s)"
%
pattern
# NOTE: an author can have several affiliations
for
iter_key
in
iter_author_affiliation_keys
(
self
):
li
=
list
(
ifilter
(
regex
.
match
,
iter_key
))
if
len
(
li
)
>
0
:
return
li
[
0
]
return
u
""
series
=
self
[
u
"700"
].
u
.
str
.
extract
(
pattern
,
expand
=
False
).
dropna
()
return
(
series
.
iloc
[
0
]
if
len
(
series
)
>
0
else
u
""
)
def
find_authors
(
self
,
pattern
):
"""Find authors
match
ing the regular expression *pattern*.
"""Find authors
contain
ing the regular expression *pattern*.
Args:
pattern (unicode): regular expression defining the author name(s).
Returns:
unicode:
* Author names are separated by
a comma
.
* Author names are separated by
``|``
.
* The string is empty when nothing is found.
"""
regex
=
re
.
compile
(
pattern
)
return
u
", "
.
join
(
ifilter
(
regex
.
search
,
iter_author_names
(
self
)))
df
=
self
[
u
"700"
]
def
find_authors_by_affiliation
(
self
,
pattern
,
cmpFct
=
None
):
"""Find authors belonging to a given institute(s) defined by a regular
expression. The search is performed on the affiliation keys.
query
=
df
.
a
.
str
.
contains
(
pattern
)
df
=
df
.
loc
[
query
,
[
"a"
]]
return
(
u
""
if
len
(
df
)
==
0
else
u
"|"
.
join
(
df
.
a
))
Affiliation keys are obtained by concatenating the "u" and "v" keys
of the author field 100 and 700.
def
find_authors_by_affiliation
(
self
,
pattern
):
"""Find authors belonging to a given institute(s) defined by a regular
expression.
Args:
pattern (unicode): regular expression defining the affiliation keys
for the institute(s).
cmpFct (reference): function to compare author names.
The comparison function takes two items and returns -1, 0, or 1
depending on whether the first argument is considered smaller
than, equal to, or larger than the second one.
Returns:
unicode:
* Author names are separated by
a comma
.
* Author are sorted according to the
function *cmpFct*
.
* Author names are separated by
``|``
.
* Author are sorted according to the
ir family name
.
* Empty string when authors are not found.
"""
# authors not defined
if
not
self
.
is_authors
():
return
u
""
# filter the list using affiliation key(s)
regex
=
re
.
compile
(
pattern
)
df
=
self
[
u
"700"
]
# extract author name from the item
authors
=
list
(
imap
(
lambda
x
:
x
[
0
],
ifilter
(
lambda
x
:
len
(
list
(
ifilter
(
regex
.
search
,
x
[
1
])))
>
0
,
iter_author_items
(
self
))))
# short the list
if
cmpFct
:
authors
.
sort
(
key
=
cmpFct
)
query
=
df
.
u
.
str
.
contains
(
pattern
)
df
=
df
.
loc
[
query
,
[
"a"
]]
return
u
", "
.
join
(
authors
)
df
.
a
=
df
.
a
.
str
.
encode
(
"utf-8"
).
sort_values
()
return
(
"|"
.
join
(
df
.
a
)
if
len
(
df
)
>
0
else
""
).
decode
(
"utf-8"
)
def
first_author
(
self
):
"""The name of the first author.
...
...
@@ -191,7 +272,7 @@ class RecordPubli(Record):
- List of name when there is more than one.
"""
return
iter_author_names
(
self
).
next
()
return
self
[
u
"700"
].
a
.
iloc
[
0
]
def
first_author_institutes
(
self
):
"""The institute(s) associated to the first author.
...
...
@@ -202,11 +283,12 @@ class RecordPubli(Record):
Returns:
unicode:
- names are separated by
a comma
.
- names are separated by
``|``
.
- The string is empty when institutes are not defined.
"""
return
u
", "
.
join
(
iter_author_affiliations
(
self
).
next
())
val
=
self
[
u
"700"
].
u
.
iloc
[
0
]
return
(
""
if
val
==
NaN
else
val
)
def
institutes
(
self
):
"""The list of institute signing the publication.
...
...
@@ -219,16 +301,17 @@ class RecordPubli(Record):
list: the list is sorted in alphabetical order.
"""
myset
=
set
()
# expand multi-affiliation (one per column)
df
=
self
[
u
"700"
].
u
.
str
.
split
(
"|"
,
expand
=
True
)
for
elt
in
iter_author_affiliations
(
self
):
myset
.
update
(
elt
)
# merge all columns into a single one,
# sort and remove duplicate entries
li
=
[
df
[
el
].
dropna
()
for
el
in
df
.
columns
]
df
=
(
concat
(
li
,
ignore_index
=
True
)
.
sort_values
()
.
unique
())
# sort institute in alphabetic order
myli
=
list
(
myset
)
myli
.
sort
()
return
myli
return
df
.
tolist
()
def
is_affiliations
(
self
):
"""``True`` when affiliations are defined for authors.
...
...
@@ -243,16 +326,13 @@ class RecordPubli(Record):
bool:
"""
for
field
in
(
u
"100"
,
u
"700"
):
if
field
in
self
:
if
isinstance
(
self
[
field
],
dict
):
if
"u"
not
in
self
[
field
]:
return
False
df
=
self
[
u
"700"
]
if
"u"
not
in
df
.
columns
:
return
False
elif
isinstance
(
self
[
field
],
list
):
for
i
in
(
1
,
-
1
):
if
"u"
not
in
self
[
field
][
i
]:
return
False
if
len
(
df
)
==
1
and
df
.
u
.
iloc
[
0
]
==
""
:
return
False
return
True
...
...
@@ -263,8 +343,10 @@ class RecordPubli(Record):
bool:
"""
return
len
(
list
(
ifilter
(
lambda
x
:
len
(
x
)
==
0
,
iter_author_affiliations
(
self
))))
==
0
df
=
self
[
u
"700"
]
query
=
df
.
u
.
isin
([
""
,
NaN
])
return
df
.
u
[
query
].
size
==
0
def
is_authors
(
self
):
"""``True`` when authors are defined.
...
...
@@ -273,7 +355,15 @@ class RecordPubli(Record):
bool:
"""
return
u
"100"
in
self
or
u
"700"
in
self
df
=
self
[
u
"700"
]
if
"a"
not
in
df
.
columns
:
return
False
if
len
(
df
)
==
1
and
df
.
a
.
iloc
[
0
]
==
""
:
return
False
return
True
def
is_published
(
self
):
"""``True`` is the record is published.
...
...
modules/invenio_tools/recordthesis.py
View file @
fe4859a0
...
...
@@ -39,11 +39,19 @@ class RecordThesis(RecordPubli):
list: the list is empty when authors are not defined.
"""
# for a thesis, the author field 700 contains names of director
# which have to be removed.
# for a thesis, the author field 700 contains names of author
# as well as directors. The latter have to be removed.
df
=
self
[
u
"700"
]
iter_filter
=
ifilterfalse
(
is_thesis_dir
,
iter_author_fields
(
self
))
return
list
(
imap
(
author_name
,
iter_filter
))
query
=
df
.
e
!=
THESIS_DIR
df
=
df
.
loc
[
query
]
li
=
df
.
a
.
tolist
()
if
len
(
li
)
==
1
and
li
[
0
]
==
""
:
li
=
[]
return
li
def
these_defense
(
self
):
"""The defence date for a master/phd thesis.
...
...
@@ -77,15 +85,18 @@ class RecordThesis(RecordPubli):
Returns:
unicode:
* Names are separated by
a comma
.
* Names are separated by
``|``
.
* Empty string when it is not defined.
"""
# for a thesis, the author field 700 field contains
# names of the director as well as the name of authors
df
=
self
[
u
"700"
]
query
=
df
.
e
==
THESIS_DIR
df
=
df
.
loc
[
query
]
iter_filter
=
ifilter
(
is_thesis_dir
,
iter_author_fields
(
self
))
return
u
", "
.
join
(
list
(
imap
(
author_name
,
iter_filter
)))
return
(
u
"|"
.
join
(
df
.
a
)
if
len
(
df
)
>
0
else
u
""
)
def
these_town
(
self
):
"""The town where the thesis took place.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment