Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Docker-in-Docker (DinD) capabilities of public runners deactivated.
More info
Open sidebar
limbra
limbra
Commits
4612b6c2
Commit
4612b6c2
authored
Jan 07, 2021
by
LE GAC Renaud
Browse files
Add RecordHepPubli and test_05_RecordHepPubli
parent
14d0602f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
523 additions
and
12 deletions
+523
-12
modules/store_tools/__init__.py
modules/store_tools/__init__.py
+1
-1
modules/store_tools/factory.py
modules/store_tools/factory.py
+50
-11
modules/store_tools/recordheppubli.py
modules/store_tools/recordheppubli.py
+250
-0
tests/basis/test_05_RecordHepPubli.py
tests/basis/test_05_RecordHepPubli.py
+222
-0
No files found.
modules/store_tools/__init__.py
View file @
4612b6c2
...
...
@@ -74,4 +74,4 @@ def load_record(host, record_id, shelf=None):
"""
store
=
build_store
(
host
,
shelf
=
shelf
)
recjson
=
store
.
get_record
(
record_id
)
return
build_record
(
recjson
)
return
build_record
(
recjson
,
shelf
=
shelf
)
modules/store_tools/factory.py
View file @
4612b6c2
...
...
@@ -13,10 +13,11 @@ from .base import (CDS,
REG_CONF
)
from
datetime
import
datetime
from
.exception
import
CdsException
from
.exception
import
CdsException
,
RecordException
from
.inveniostore
import
InvenioStore
from
.inspirehepstore
import
InspirehepStore
,
SHELFS
from
.recordconf
import
RecordConf
from
.recordheppubli
import
RecordHepPubli
from
.recordinst
import
RecordInst
from
.recordpubli
import
RecordPubli
from
.recordthesis
import
RecordThesis
...
...
@@ -24,6 +25,8 @@ from .recordthesis import RecordThesis
# Regular expressions capturing the text that follows a literal "$$t" / "$$u"
# marker. Raw strings are required: "\$" and "\w" are regex escapes, not
# valid Python string escapes.
REX_T = r"\$\$t([\w, ]+)"
REX_U = r"\$\$u([\w, ]+)"

# Message carried by the RecordException raised when a JSON record cannot
# be upcast to one of the record classes.
MSG_FAIL_UPCAST = "Failed to upcast the JSON record"
def
add_conference_data
(
recjson
):
"""Add the conference data to the recjson.
...
...
@@ -157,32 +160,68 @@ def add_conference_data(recjson):
"url"
:
url
}
def build_record(recjson, shelf=None):
    """Transform a JSON object into a record.

    Note:
        This tool works for JSON objects coming from cds.cern.ch,
        old.inspirehep.net as well as inspirehep. In the latter case
        the shelf has to be defined.

    Args:
        recjson (dict):
            record data in a JSON format.

        shelf (str):
            section of the inspirehep store containing records.
            Possible values are ``literature``, ``conferences``
            and ``institutions``. Leave it to ``None`` for records
            coming from cds.cern.ch or old.inspirehep.net.

    Returns:
        Record:
            either RecordConf, RecordHepPubli, RecordInst, RecordPubli
            or RecordThesis

    Raises:
        RecordException:
            when the record cannot be upcast, i.e. the shelf is unknown
            or the corresponding record class is not implemented yet.

    """
    # ........................................................................
    #
    # cds.cern.ch or old.inspirehep.net
    #
    # Return immediately: falling through to the inspirehep section would
    # reject every record since no shelf is defined here.
    if shelf is None:
        if is_conference(recjson):
            add_conference_data(recjson)
            return RecordConf(recjson)

        if is_institute(recjson):
            return RecordInst(recjson)

        if is_thesis(recjson):
            return RecordThesis(recjson)

        return RecordPubli(recjson)

    # ........................................................................
    #
    # inspirehep
    #
    if shelf == "literature":
        if is_conference(recjson):
            # TODO: add_conference_data(recjson) then RecordHepConf(recjson)
            raise RecordException(MSG_FAIL_UPCAST)

        if is_thesis(recjson):
            # TODO: RecordHepThesis(recjson)
            raise RecordException(MSG_FAIL_UPCAST)

        return RecordHepPubli(recjson)

    # TODO: shelf "institutions" -> RecordHepInst(recjson)
    # Any other shelf cannot be upcast: fail explicitly rather than
    # returning an unbound variable.
    raise RecordException(MSG_FAIL_UPCAST)
...
...
modules/store_tools/recordheppubli.py
0 → 100644
View file @
4612b6c2
""" store_tools.recordheppubli
"""
from
filters
import
CLEAN_COLLABORATION
from
pandas
import
DataFrame
from
.recordhep
import
RecordHep
from
store_tools.pluginauthors
import
PluginAuthors
from
store_tools.pluginpublicationinfo
import
PluginPublicationInfo
class RecordHepPubli(RecordHep, PluginAuthors, PluginPublicationInfo):
    """Article, preprint and proceeding from inspirehep.net version 2.

    Schema documentation is defined here:
    https://inspire-schemas.readthedocs.io/en/latest/schemas/

    """
    def __init__(self, recjson):
        """Build the record and normalise its authors and publication data.

        Args:
            recjson (dict):
                record data in a JSON format.

        """
        super().__init__(recjson)

        self._last_fmt_author = "Last, First"
        self._process_authors()
        self._process_publication_info()

    def _process_authors(self):
        """Convert authors information into DataFrame:

        Authors and their affiliations are stored in DataFrame with the
        following structure:

            +---------------+--------------------------------+
            | column        |                                |
            +===============+================================+
            | affiliation   | value separated by "|"         |
            +---------------+--------------------------------+
            | first_name    | first name                     |
            +---------------+--------------------------------+
            | fmt_name      | formatted name                 |
            +---------------+--------------------------------+
            | full_name     | Last, First                    |
            +---------------+--------------------------------+
            | last_name     | family name                    |
            +---------------+--------------------------------+

        Note:
            After running this method, the field ``authors`` is always
            defined. It contains one entry with empty strings when the
            field does not exist or is empty.

        """
        cols = ["affiliation",
                "first_name",
                "fmt_name",
                "full_name",
                "last_name"]

        authors = self.get("authors", None)
        if authors is None or len(authors) == 0:
            self["authors"] = DataFrame([[""] * len(cols)], columns=cols)
            return

        data = []
        for author in authors:
            affiliations = [elt["value"]
                            for elt in author.get("affiliations", [])]

            full_name = author["full_name"]

            # split on the first comma only: a name without comma or with
            # several commas must not break the decoding
            last_name, _, first_name = full_name.partition(",")

            data.append({"affiliation": "|".join(affiliations),
                         "first_name": first_name.strip(),
                         "fmt_name": full_name,
                         "full_name": full_name,
                         "last_name": last_name.strip()})

        # protection against duplicated entries, e.g. twice the first author
        # (data is never empty here, thus the column full_name exists)
        self["authors"] = DataFrame(data).drop_duplicates("full_name")

    def _process_publication_info(self):
        """Convert publication_info into DataFrame:

        Note:
            the field is a list, with several entries when there are errata.

        Publication information is stored in DataFrame with the
        following structure:

            +------------+--------------------------------+
            | column     |                                |
            +============+================================+
            | title      | abbreviation of the publisher  |
            +------------+--------------------------------+
            | volume     | volume                         |
            +------------+--------------------------------+
            | year       | year of publication            |
            +------------+--------------------------------+
            | pagination | page number or ranges          |
            +------------+--------------------------------+

        Note:
            * After running this method, the field ``publication_info``
              is always defined. It contains one entry with empty strings
              when the field does not exist or is empty.

            * In order to deal with errata, entries are sorted by year
              and volume.

            * Columns are only present when at least one entry of the
              JSON field defines them.

        """
        data = self.get("publication_info", None)

        if data is None or len(data) == 0:
            cols = ["title", "volume", "year", "pagination"]
            self["publication_info"] = \
                DataFrame([[""] * len(cols)], columns=cols)
            return

        df = DataFrame(data)

        # astype raises KeyError when the column is missing, which happens
        # when no entry defines the year
        if "year" in df.columns:
            df = df.astype({"year": str})

        df = df.rename(columns={"artid": "pagination",
                                "journal_title": "title",
                                "journal_volume": "volume"})

        columns = df.columns

        # erratum -- sort by year and volume
        if {"year", "volume"}.issubset(columns):
            df = df.sort_values(["year", "volume"])

        elif "year" in columns:
            df = df.sort_values("year")

        # replace
        self["publication_info"] = df

    def collaboration(self):
        """The collaboration(s) signing the publication.

        Returns:
            str:
                * collaborations are separated by a comma.
                * the word "Collaboration" is appended to names which
                  do not end with "ollaboration".
                * The filter CLEAN_COLLABORATION is applied.
                * empty string when not defined

        """
        collaborations = self.get("collaborations", None)

        if collaborations is None:
            return ""

        lst = []
        for elt in collaborations:
            val = elt["value"]
            if not val.endswith("ollaboration"):
                val = f"{val} Collaboration"
            lst.append(val)

        return CLEAN_COLLABORATION(", ".join(lst))

    def paper_url(self):
        """The URL of the document.

        Returns:
            str:
                * the string is empty when no URLs are found.
                * first URL is selected when there is more than one

        """
        documents = self.get("documents", None)

        # covers both the missing field and the empty list
        if not documents:
            return ""

        return documents[0]["url"]

    def preprint_number(self):
        """The ArXiv preprint number.

        Returns:
            str:
                * each number is prefixed by "arXiv:".
                * numbers are separated by a comma.
                * empty string when it is not defined.

        """
        lst = self.get("arxiv_eprints", None)

        if lst is None:
            return ""

        return ", ".join(f"arXiv:{elt['value']}" for elt in lst)

    def report_number(self):
        """The report number(s) associated to the publication.

        Returns:
            str:
                - Numbers are separated by a comma
                - Numbers are sorted in alphabetic order.
                - Empty string when not defined.

        """
        lst = self.get("report_numbers", None)

        if lst is None:
            return ""

        # sort in order to return a reproducible string whatever the order
        # of the entries in the JSON record
        return ", ".join(sorted(elt["value"] for elt in lst))

    def submitted(self):
        """The date of submission.

        Returns:
            str:
                * the field ``preprint_date`` is returned as it is.
                  Formats are "YYYY-MM", "YYYY-MM-DD", *etc.*
                * Empty string when not defined.

        """
        val = self.get("preprint_date", None)
        return "" if val is None else val

    def title(self):
        """The title of the publication.

        Returns:
            str:
                * Empty string when not defined.
                * The first one is selected when there is more than one.
                * The title is returned as it is stored in the record,
                  no filter is applied.

        """
        titles = self.get("titles", None)

        # covers both the missing field and the empty list
        if not titles:
            return ""

        return titles[0]["title"]
tests/basis/test_05_RecordHepPubli.py
0 → 100644
View file @
4612b6c2
"""test_05_RecordHepPubli
Test all methods of the RecordHepPubli class for a given article:
https://inspirehep.net/api/literature/1319638
Precision luminosity measurements at LHCb,
J. Instrum. 9 (2014) P12005
arXiv:1410.0149
704 authors
No corrections are applied to the record.
This allows testing the brute-force decoding, mistakes included.
Note:
* The first author is not in the author list
* LHCb collaboration
* The publication year is a list (duplicate 773y)
* The submitted date is formatted as YYYY-MM-DD: 2014-10-01
"""
import
pandas
as
pd
import
pytest
from
store_tools
import
load_record
from
store_tools.recordheppubli
import
RecordHepPubli
@pytest.fixture(scope="module")
def record():
    """Load once per module the inspirehep literature record 1319638."""
    host, record_id = "inspirehep.net", 1319638
    return load_record(host, record_id, shelf="literature")
def test_upcast_ins_05001(record):
    """The loaded record is upcast to RecordHepPubli."""
    is_hep_publi = isinstance(record, RecordHepPubli)
    assert is_hep_publi
def test_constructor_ins_05002(record):
    """Check the data frames built by the constructor.

    They are produced by the methods _process_authors and
    _process_publication_info.

    """
    # authors
    author_df = record["authors"]
    assert isinstance(author_df, pd.DataFrame)

    expected_columns = ["affiliation",
                        "first_name",
                        "fmt_name",
                        "full_name",
                        "last_name"]

    unexpected = author_df.columns.difference(expected_columns)
    assert len(unexpected) == 0
    assert len(author_df) == 704
    assert author_df.affiliation.iloc[12] == "INFN, Rome|CERN"

    # publication information
    publi_df = record["publication_info"]
    assert isinstance(publi_df, pd.DataFrame)
    assert len(publi_df) == 1

    first = publi_df.iloc[0]
    assert first.title == "JINST"
    assert first.volume == "9"
    assert first.year == "2014"
    assert first.pagination == "P12005"
# ............................................................................
#
# Section devoted to authors
#
def test_is_authors_ins_05010(record):
    """The record reports that authors are defined."""
    has_authors = record.is_authors()
    assert has_authors
def test_authors_as_list_ins_05011(record):
    """Check the size of the author list and a few of its entries."""
    names = record.authors_as_list()
    assert len(names) == 704

    expected = ((0, "Aaij, Roel"),
                (1, "Adeva, Bernardo"),
                (344, "Le Gac, Renaud"),
                (-1, "Zvyagin, Alexander"))

    for position, name in expected:
        assert names[position] == name
def test_first_author_ins_05012(record):
    """The first author is Roel Aaij."""
    first = record.first_author()
    assert first == "Aaij, Roel"
def test_find_authors_ins_05013(record):
    """Authors matching the pattern "Leo" are returned as a single string."""
    found = record.find_authors("Leo")
    assert found == "Beaucourt, Leo, Kravchuk, Leonid, Leo, Sabato"
def test_reformat_author_ins_05014(record):
    """Reformat the authors as "F. Last" and check a few entries.

    The record fixture is shared by all the tests of the module. The
    default format "Last, First" is therefore restored in all cases,
    even when an assertion fails, so that the other tests are not affected.

    """
    try:
        record.reformat_authors("F. Last")
        authors = record.authors_as_list()

        assert len(authors) == 704
        assert authors[0] == "R. Aaij"
        assert authors[1] == "B. Adeva"
        assert authors[12] == "A. A. Alves Jr"
        assert authors[344] == "R. Le Gac"
        assert authors[-1] == "A. Zvyagin"

    finally:
        record.reformat_authors("Last, First")
# ............................................................................
#
# Section devoted to affiliation
#
def test_is_affiliations_ins_05020(record):
    """Affiliations are defined, and defined for every author."""
    has_affiliations = record.is_affiliations()
    assert has_affiliations

    all_affiliated = record.is_affiliation_for_all()
    assert all_affiliated
def test_institutes_ins_05021(record):
    """Check the first, the 45th and the last institute of the list."""
    names = record.institutes()

    expected = ((0, "AGH-UST, Cracow"),
                (44, "MIT"),
                (-1, "Zurich U."))

    for position, name in expected:
        assert names[position] == name
def test_find_affiliation_ins_05022(record):
    """The CPPM affiliation is found with an alternation pattern."""
    pattern = r"Marseille, CPPM|CPPM, Marseille"
    found = record.find_affiliation(pattern)
    assert found == "Marseille, CPPM"
# ............................................................................
#
# Section devoted to authors and institutes
#
def test_first_author_institutes_ins_05030(record):
    """The institute of the first author is NIKHEF."""
    institutes = record.first_author_institutes()
    assert institutes == "NIKHEF, Amsterdam"
def test_find_authors_by_affiliation_ins_05031(record):
    """Authors affiliated to CPPM are returned separated by "|"."""
    expected = ("Akar, Simon|Aslanides, Elie|Cogan, Julien|"
                "Kanso, Walaa|Le Gac, Renaud|Leroy, Olivier|"
                "Mancinelli, Giampiero|Mordà, Alessandro|"
                "Perrin-Terrin, Mathieu|Serrano, Justine|"
                "Tsaregorodtsev, Andrei")

    found = record.find_authors_by_affiliation(
        "CPPM, Marseille|Marseille, CPPM", sep="|")

    assert found == expected
# ............................................................................
#
# Other methods
#
def test_collaboration_ins_05040(record):
    """The publication is signed by the LHCb Collaboration."""
    name = record.collaboration()
    assert name == "LHCb Collaboration"
def test_is_published_ins_05041(record):
    """The record is flagged as published."""
    published = record.is_published()
    assert published
def test_is_with_erratum_ins_05042(record):
    """The record has no erratum."""
    with_erratum = record.is_with_erratum()
    assert not with_erratum
def test_paper_info_ins_05043(record):
    """Check editor, pages, volume and year of the paper."""
    editor = record.paper_editor()
    assert editor == "JINST"

    pages = record.paper_pages()
    assert pages == "P12005"

    volume = record.paper_volume()
    assert volume == "9"

    year = record.paper_year()
    assert year == "2014"
def test_paper_reference_ins_05044(record):
    """The reference combines editor, volume, year and pages."""
    reference = record.paper_reference()
    assert reference == "JINST 9 2014 P12005"
def test_preprint_number_ins_05045(record):
    """The preprint number is prefixed by "arXiv:"."""
    number = record.preprint_number()
    assert number == "arXiv:1410.0149"
def test_paper_url_ins_05046(record):
    """The URL of the document points to the inspirehep file store."""
    expected = "https://inspirehep.net/files/d7355c9818375e62fdd3be49a2b52ae1"
    url = record.paper_url()
    assert url == expected
def test_report_number_ins_05047(record):
    """Both report numbers are returned, whatever their order."""
    expected = {"CERN-PH-EP-2014-221", "LHCB-PAPER-2014-047"}
    numbers = record.report_number().split(", ")
    assert set(numbers) == expected
def test_submitted_ins_05048(record):
    """The submission date is returned as YYYY-MM-DD."""
    date = record.submitted()
    assert date == "2014-10-01"
def test_title_ins_05049(record):
    """The title of the publication is returned."""
    title = record.title()
    assert title == "Precision luminosity measurements at LHCb"
# ............................................................................
#
# Another publication
#
def test_all_ins_05050():
    """Check several methods on another publication.

    Same article as oai:inspirehep.net:1762838 and oai:cds.cern.ch:2698323.

    """
    rec = load_record("inspirehep.net", 1762838, shelf="literature")

    expected_title = (
        r"Updated measurement of decay-time-dependent CP asymmetries "
        r"in $D^0 \to K^+K^-$ and $D^0 \to \pi^+\pi^-$ decays")
    title = rec.title()
    assert title == expected_title

    reference = rec.paper_reference()
    assert reference == "Phys. Rev. D 101 2020 012005"

    first = rec.first_author()
    assert first == "Aaij, Roel"

    primary = rec.primary_oai()
    assert primary == "oai:inspirehep.net:1762838"

    secondary = rec.secondary_oai()
    assert secondary == "oai:cds.cern.ch:2698323"

    url = rec.paper_url()
    assert url == \
        "https://inspirehep.net/files/c25e21267be950a4abb9d3e147328982"

    preprint = rec.preprint_number()
    assert preprint == "arXiv:1911.01114"

    report = rec.report_number()
    assert report == "CERN-EP-2019-225, LHCb-PAPER-2019-032"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment