Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Docker-in-Docker (DinD) capabilities of public runners deactivated.
More info
Open sidebar
limbra
limbra
Commits
38150847
Commit
38150847
authored
Nov 14, 2019
by
LE GAC Renaud
Browse files
Refactor recordpubli and recordthesis to process phd director's name.
parent
5db30cca
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
117 additions
and
60 deletions
+117
-60
modules/invenio_tools/recordpubli.py
modules/invenio_tools/recordpubli.py
+67
-47
modules/invenio_tools/recordthesis.py
modules/invenio_tools/recordthesis.py
+50
-13
No files found.
modules/invenio_tools/recordpubli.py
View file @
38150847
...
...
@@ -40,6 +40,69 @@ PAPER_REFERENCE_KEYS = set(["c", "p", "v", "y"])
REG_INITIAL
=
initial
=
r
"^(\w+)\.?(\-)* *(\w+)*\.?"
def format_names(df, fmt):
    """Helper function to format author and PhD director names.

    The formatting is applied to a copy of ``df``: the frame given by the
    caller is never modified and a new frame is always returned, whatever
    the value of ``fmt``.

    Args:
        df (pandas.DataFrame):

            +------------+---------------------------+
            | column     |                           |
            +------------+---------------------------+
            | a          | author name (Last, First) |
            | u          | affiliation(s)            |
            | first_name | first name                |
            | last_name  | family name               |
            +------------+---------------------------+

        fmt (str):
            define the new format for author names.
            Possible values are "First, Last", "F. Last", "Last",
            "Last, First" and "Last F."
            Any other value leaves the frame without the ``fmt_name``
            column; validating ``fmt`` is left to the caller.

    Returns:
        pandas.DataFrame:

            +------------+---------------------------+
            | column     |                           |
            +------------+---------------------------+
            | a          | author name (Last, First) |
            | u          | affiliation(s)            |
            | first_name | first name                |
            | last_name  | family name               |
            | fmt_name   | formatted name            |
            +------------+---------------------------+

    """
    # Work on a copy so that neither "fmt_name" nor the temporary "initial"
    # column leaks into the DataFrame owned by the caller.
    df = df.copy()

    use_initial = fmt in ("F. Last", "Last F.")

    # Compute initial for the first name
    if use_initial:
        dfm = (df.first_name.str.extract(REG_INITIAL, expand=True)
                 .fillna(""))

        # REG_INITIAL has three capture groups, hence the columns 0, 1 and 2.
        # A plain list is built instead of a row-wise DataFrame.apply since
        # the latter returns a DataFrame (not a Series) for an empty frame.
        df["initial"] = [to_initial(x, y, z)
                         for x, y, z in zip(dfm[0], dfm[1], dfm[2])]

    # Format
    if fmt == "Last, First":
        df["fmt_name"] = df.a

    elif fmt == "First, Last":
        df["fmt_name"] = df.first_name + " " + df.last_name

    elif fmt == "F. Last":
        df["fmt_name"] = df.initial + " " + df.last_name

    elif fmt == "Last":
        df["fmt_name"] = df.last_name

    elif fmt == "Last F.":
        df["fmt_name"] = df.last_name + " " + df.initial

    # Clean initial column
    if use_initial:
        df = df.drop("initial", axis="columns")

    return df
def
to_initial
(
x
,
y
,
z
):
"""Help function to extract initial from a first name split in x, y and z:
...
...
@@ -678,7 +741,8 @@ class RecordPubli(Record):
"Last, First" and "Last F."
Raises:
RecordException: if fmt is not valid.
RecordException:
when fmt is not valid.
"""
if
fmt
not
in
AUTHOR_FORMATS
:
...
...
@@ -689,52 +753,8 @@ class RecordPubli(Record):
self
.
_last_fmt_author
=
fmt
# alias
d100
,
d700
=
self
[
"100"
],
self
[
"700"
]
# ....................................................................
#
# Compute initial for the first name
#
if
fmt
in
(
"F. Last"
,
"Last F."
):
for
df
in
(
d100
,
d700
):
dfm
=
(
df
.
first_name
.
str
.
extract
(
REG_INITIAL
,
expand
=
True
)
.
fillna
(
""
))
df
[
"initial"
]
=
dfm
.
apply
(
lambda
x
:
to_initial
(
x
[
0
],
x
[
1
],
x
[
2
]),
axis
=
"columns"
)
# ....................................................................
#
# Format
#
if
fmt
==
"Last, First"
:
d100
[
"fmt_name"
]
=
d100
.
a
d700
[
"fmt_name"
]
=
d700
.
a
elif
fmt
==
"First, Last"
:
d100
[
"fmt_name"
]
=
d100
.
first_name
+
", "
+
d100
.
last_name
d700
[
"fmt_name"
]
=
d700
.
first_name
+
" "
+
d700
.
last_name
elif
fmt
==
"F. Last"
:
d100
[
"fmt_name"
]
=
d100
.
initial
+
" "
+
d100
.
last_name
d700
[
"fmt_name"
]
=
d700
.
initial
+
" "
+
d700
.
last_name
elif
fmt
==
"Last"
:
d100
[
"fmt_name"
]
=
d100
.
last_name
d700
[
"fmt_name"
]
=
d700
.
last_name
elif
fmt
==
"Last F."
:
d100
[
"fmt_name"
]
=
d100
.
last_name
+
" "
+
d100
.
initial
d700
[
"fmt_name"
]
=
d700
.
last_name
+
" "
+
d700
.
initial
# ....................................................................
#
# Clean initial column
#
if
fmt
in
(
"F. Last"
,
"Last F."
):
d100
=
d100
.
drop
(
"initial"
,
axis
=
"columns"
)
d700
=
d700
.
drop
(
"initial"
,
axis
=
"columns"
)
self
[
"100"
]
=
format_names
(
self
[
"100"
],
fmt
)
self
[
"700"
]
=
format_names
(
self
[
"700"
],
fmt
)
def
report_number
(
self
):
"""The report number(s) associated to the publication.
...
...
modules/invenio_tools/recordthesis.py
View file @
38150847
...
...
@@ -3,7 +3,7 @@
"""
from
.base
import
THESIS_DIR
from
filters
import
CLEAN_THESIS_DEFENSE
from
.recordpubli
import
RecordPubli
from
.recordpubli
import
format_names
,
RecordPubli
from
pandas
import
DataFrame
...
...
@@ -30,6 +30,26 @@ class RecordThesis(RecordPubli):
+-----------------------+---------+----------+
"""
def _process_authors(self):
    """Process author and director names.

    Authors are handled by ``RecordPubli._process_authors``. When the
    record contains the field ``701`` (PhD directors), it is converted
    into a DataFrame extended with the columns ``last_name``,
    ``first_name`` and ``fmt_name``. The latter is initialised with the
    raw name stored in the column ``a``.

    """
    RecordPubli._process_authors(self)

    # PhD directors
    if "701" not in self:
        return

    df = DataFrame(self["701"])

    # Names are split on the comma. When no name contains a comma the split
    # yields a single column, so reindex is used to guarantee that both
    # columns exist; the missing first name is then an empty string.
    # Any part beyond the second one is ignored.
    parts = (df.a.str.split(",", expand=True)
               .reindex(columns=[0, 1], fill_value=""))

    df["last_name"] = parts[0].str.strip()
    df["first_name"] = parts[1].str.strip()

    df["fmt_name"] = df.a
    self["701"] = df
def
authors_as_list
(
self
,
sort
=
False
):
"""The list of author(s) signing the publication.
...
...
@@ -63,6 +83,28 @@ class RecordThesis(RecordPubli):
return
li
def reformat_authors(self, fmt="Last, First"):
    """Apply the name format ``fmt`` to authors and PhD directors.

    Authors are reformatted by ``RecordPubli.reformat_authors``. The
    directors, stored in the field ``701``, are reformatted only when
    that field is present in the record.

    The default formatting for cds/invenio record is ``Last, First``.

    Args:
        fmt (str):
            define the new format for author names.
            Possible values are "First, Last", "F. Last", "Last",
            "Last, First" and "Last F."

    Raises:
        RecordException: if fmt is not valid.

    """
    RecordPubli.reformat_authors(self, fmt)

    # PhD directors, if any
    if "701" in self:
        directors = self["701"]
        self["701"] = format_names(directors, fmt)
def
these_defense
(
self
):
"""The defence date for a master/phd thesis.
...
...
@@ -101,18 +143,13 @@ class RecordThesis(RecordPubli):
"""
# for a thesis, the author field 700 field contains
# names of the director as well as the name of authors
df
=
self
[
"700"
]
if
"e"
in
df
.
columns
:
query
=
df
.
e
==
THESIS_DIR
df
=
df
.
loc
[
query
]
# discover in 2019, that director's name move in the field 701
elif
"701"
in
self
:
df
=
DataFrame
(
self
[
"701"
])
if
"e"
in
df
.
columns
:
query
=
df
.
e
==
THESIS_DIR
df
=
(
df
.
loc
[
query
]
.
assign
(
fmt_name
=
lambda
x
:
x
.
a
))
for
df
in
(
self
.
get
(
"700"
),
self
.
get
(
"701"
)):
if
(
df
is
None
)
or
(
"e"
not
in
df
.
columns
):
continue
df
=
df
.
loc
[
df
.
e
==
THESIS_DIR
]
break
return
(
sep
.
join
(
df
.
fmt_name
)
if
len
(
df
)
>
0
else
""
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment