Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
4612b6c2
Commit
4612b6c2
authored
Jan 07, 2021
by
LE GAC Renaud
Browse files
Add RecordHepPubli and test_05_RecordHepPubli
parent
14d0602f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
523 additions
and
12 deletions
+523
-12
modules/store_tools/__init__.py
modules/store_tools/__init__.py
+1
-1
modules/store_tools/factory.py
modules/store_tools/factory.py
+50
-11
modules/store_tools/recordheppubli.py
modules/store_tools/recordheppubli.py
+250
-0
tests/basis/test_05_RecordHepPubli.py
tests/basis/test_05_RecordHepPubli.py
+222
-0
No files found.
modules/store_tools/__init__.py
View file @
4612b6c2
...
...
@@ -74,4 +74,4 @@ def load_record(host, record_id, shelf=None):
"""
store
=
build_store
(
host
,
shelf
=
shelf
)
recjson
=
store
.
get_record
(
record_id
)
return
build_record
(
recjson
)
return
build_record
(
recjson
,
shelf
=
shelf
)
modules/store_tools/factory.py
View file @
4612b6c2
...
...
@@ -13,10 +13,11 @@ from .base import (CDS,
REG_CONF
)
from
datetime
import
datetime
from
.exception
import
CdsException
from
.exception
import
CdsException
,
RecordException
from
.inveniostore
import
InvenioStore
from
.inspirehepstore
import
InspirehepStore
,
SHELFS
from
.recordconf
import
RecordConf
from
.recordheppubli
import
RecordHepPubli
from
.recordinst
import
RecordInst
from
.recordpubli
import
RecordPubli
from
.recordthesis
import
RecordThesis
...
...
@@ -24,6 +25,8 @@ from .recordthesis import RecordThesis
REX_T
=
"\$\$t([\w, ]+)"
REX_U
=
"\$\$u([\w, ]+)"
MSG_FAIL_UPCAST
=
"Failed to upcast the JSON record"
def
add_conference_data
(
recjson
):
"""Add the conference data to the recjson.
...
...
@@ -157,32 +160,68 @@ def add_conference_data(recjson):
"url"
:
url
}
def
build_record
(
recjson
):
def
build_record
(
recjson
,
shelf
=
None
):
"""Transform a JSON object into a record
Note:
this tool is working for JSON object coming from cds.cern.ch,
old.inspirehep.net as well as inspirehep. In the latter case
the shelf has to be defined.
Args:
recjson (dict):
record data in a JSON format.
shelf (str):
section of the inspirehep store containing records.
Possible values are ``literature``, ``conferences``
and ``institutions``
Return
Record:
either RecordConf, RecordInst, RecodPubli or RecordThesis
either RecordConf, RecordHepPubli, RecordInst, RecordPubli
or RecordThesis
Raises:
"""
if
is_conference
(
recjson
):
add_conference_data
(
recjson
)
upcast_record
=
RecordConf
(
recjson
)
# ........................................................................
#
# cds.cern.ch or old.inspirehep.net
#
if
shelf
is
None
:
if
is_conference
(
recjson
):
add_conference_data
(
recjson
)
upcast_record
=
RecordConf
(
recjson
)
elif
is_institute
(
recjson
):
upcast_record
=
RecordInst
(
recjson
)
elif
is_
institute
(
recjson
):
upcast_record
=
Record
Inst
(
recjson
)
elif
is_
thesis
(
recjson
):
upcast_record
=
Record
Thesis
(
recjson
)
el
if
is_thesis
(
recjson
)
:
upcast_record
=
Record
Thesis
(
recjson
)
el
se
:
upcast_record
=
Record
Publi
(
recjson
)
else
:
upcast_record
=
RecordPubli
(
recjson
)
if
is_conference
(
recjson
)
and
shelf
==
"literature"
:
pass
# add_conference_data(recjson)
# upcast_record = RecordHepConf(recjson)
elif
shelf
==
"institutions"
:
pass
# upcast_record = RecordHepInst(recjson)
elif
is_thesis
(
recjson
)
and
shelf
==
"literature"
:
pass
# upcast_record = RecordHepThesis(recjson)
elif
shelf
==
"literature"
:
upcast_record
=
RecordHepPubli
(
recjson
)
else
:
raise
RecordException
(
MSG_FAIL_UPCAST
)
return
upcast_record
...
...
modules/store_tools/recordheppubli.py
0 → 100644
View file @
4612b6c2
""" store_tools.recordheppubli
"""
from
filters
import
CLEAN_COLLABORATION
from
pandas
import
DataFrame
from
.recordhep
import
RecordHep
from
store_tools.pluginauthors
import
PluginAuthors
from
store_tools.pluginpublicationinfo
import
PluginPublicationInfo
class RecordHepPubli(RecordHep, PluginAuthors, PluginPublicationInfo):
    """Article, preprint and proceeding from inspirehep.net version 2.

    Schema documentation is defined here:
    https://inspire-schemas.readthedocs.io/en/latest/schemas/

    """
    def __init__(self, recjson):
        """Build the record and convert its tabular fields.

        The fields ``authors`` and ``publication_info`` are replaced
        by DataFrame (see ``_process_authors`` and
        ``_process_publication_info``).

        Args:
            recjson (dict):
                record data in a JSON format.

        """
        super().__init__(recjson)

        self._last_fmt_author = "Last, First"
        self._process_authors()
        self._process_publication_info()

    def _process_authors(self):
        """Convert authors information into DataFrame:

        Authors and their affiliations are stored in DataFrame with the
        following structure:

        +---------------+--------------------------------+
        | column        |                                |
        +===============+================================+
        | affiliation   | value separated by "|"         |
        +---------------+--------------------------------+
        | first_name    | first name                     |
        +---------------+--------------------------------+
        | fmt_name      | formatted name                 |
        +---------------+--------------------------------+
        | full_name     | Last, First                    |
        +---------------+--------------------------------+
        | last_name     | family name                    |
        +---------------+--------------------------------+

        Note:
            * After running this method, the field ``authors`` is always
              defined. It contains one entry with empty strings when the
              field does not exist or is empty.
            * The full name is split on its first comma only. A name
              without comma is stored as last name, the first name
              being empty.

        """
        authors = self.get("authors", None)

        # missing or empty list: one row of empty strings
        if not authors:
            cols = ["affiliation",
                    "first_name",
                    "fmt_name",
                    "full_name",
                    "last_name"]
            self["authors"] = DataFrame([[""] * len(cols)], columns=cols)
            return

        data = []
        for author in authors:

            affiliations = \
                [elt["value"] for elt in author.get("affiliations", [])]

            full_name = author["full_name"]

            # partition never raises, whatever the number of commas is:
            # "Doe" -> ("Doe", "", ""), "Doe, J., Jr" -> ("Doe", ",", " J., Jr")
            last_name, _, first_name = full_name.partition(",")

            data.append({"affiliation": "|".join(affiliations),
                         "first_name": first_name.strip(),
                         "fmt_name": full_name,
                         "full_name": full_name,
                         "last_name": last_name.strip()})

        # protection against duplicated entries, e.g. twice the first author
        df = DataFrame(data).drop_duplicates("full_name")

        # replace
        self["authors"] = df

    def _process_publication_info(self):
        """Convert publication_info into DataFrame:

        Publication information are stored in DataFrame with the
        following structure:

        +------------+--------------------------------+
        | column     |                                |
        +============+================================+
        | title      | abbreviation of the publisher  |
        +------------+--------------------------------+
        | volume     | volume                         |
        +------------+--------------------------------+
        | year       | year of publication            |
        +------------+--------------------------------+
        | pagination | page number or ranges          |
        +------------+--------------------------------+

        Note:
            * After running this method, the field ``publication_info``
              is always defined. It contains one entry with empty strings
              when the field does not exist or is empty.
            * The year is always a string. It is an empty string for
              an entry without year, when at least one other entry
              has one.
            * In order to deal with erratum, entries are sorted by year
              and volume.

        """
        data = self.get("publication_info", None)

        # missing or empty list: one row of empty strings
        if not data:
            cols = ["title", "volume", "year", "pagination"]
            self["publication_info"] = \
                DataFrame([[""] * len(cols)], columns=cols)
            return

        # defensive: accept a single entry given as a bare dictionary
        if isinstance(data, dict):
            data = [data]

        df = DataFrame(data)

        # Convert the year entry by entry. A column-wide astype fails when
        # the column is missing, and renders missing values as "nan" and
        # integer years as "2014.0" when only some entries have a year.
        if "year" in df.columns:
            df["year"] = \
                [str(elt["year"]) if "year" in elt else "" for elt in data]

        df = df.rename(columns={"artid": "pagination",
                                "journal_title": "title",
                                "journal_volume": "volume"})

        columns = df.columns

        # erratum -- sort by year and volume
        if {"year", "volume"}.issubset(columns):
            df = df.sort_values(["year", "volume"])

        elif "year" in columns:
            df = df.sort_values("year")

        # replace
        self["publication_info"] = df

    def collaboration(self):
        """The collaboration(s) signing the publication.

        Returns:
            str:
                * collaborations are separated by a comma.
                * the word "Collaboration" is appended to each name
                  which does not already end with "ollaboration".
                * The filter CLEAN_COLLABORATION is applied.
                * empty string when not defined

        """
        collaborations = self.get("collaborations", None)

        if collaborations is None:
            return ""

        lst = []
        for elt in collaborations:
            val = elt["value"]
            if not val.endswith("ollaboration"):
                val = f"{val} Collaboration"
            lst.append(val)

        return CLEAN_COLLABORATION(", ".join(lst))

    def paper_url(self):
        """The URL of the document.

        Returns:
            str:
                * the string is empty when no URLs are found.
                * first URL is selected when there is more than one

        """
        documents = self.get("documents", None)

        if not documents:
            return ""

        # first document carrying a non-empty URL
        for document in documents:
            url = document.get("url", "")
            if url:
                return url

        return ""

    def preprint_number(self):
        """The ArXiv preprint number.

        Returns:
            str:
                * each number is prefixed by "arXiv:".
                * numbers are separated by a comma.
                * empty string when it is not defined.

        """
        lst = self.get("arxiv_eprints", None)

        if lst is None:
            return ""

        return ", ".join(f"arXiv:{elt['value']}" for elt in lst)

    def report_number(self):
        """The report number(s) associated to the publication.

        Returns:
            str:
                - Numbers are separated by a comma
                - Numbers are sorted in alphabetic order.
                - Empty string when not defined.

        """
        lst = self.get("report_numbers", None)

        if lst is None:
            return ""

        # sorted, as promised by the documentation
        return ", ".join(sorted(elt["value"] for elt in lst))

    def submitted(self):
        """The date of submission.

        Returns:
            str:
                * the content of the field ``preprint_date``, as it is.
                * format are "YYYY-MM", "YYYY-MM-DD", "DD MMM YYYY", *etc.*
                * Empty string when not defined.

        """
        val = self.get("preprint_date", None)
        return "" if val is None else val

    def title(self):
        """The title of the publication.

        Returns:
            str:
                * Empty string when not defined.
                * First one is selected when there is more than one.
                * The title is returned as it is, no filter is applied.

        """
        titles = self.get("titles", None)

        # missing field or empty list
        if not titles:
            return ""

        return titles[0].get("title", "")
tests/basis/test_05_RecordHepPubli.py
0 → 100644
View file @
4612b6c2
"""test_05_RecordHepPubli
Test all methods of the RecordHepPubli class for a given article:
https://inspirehep.net/api/literature/1319638
Precision luminosity measurements at LHCb,
J. Instrum. 9 (2014) P12005
arXiv:1410.0149
704 authors
No corrections are applied to the record.
This allows testing the brute-force decoding with its mistakes.
Note:
* The first author is not in the author list
* LHCb collaboration
* The publication year is a list (duplicate 773y)
* The submitted date is not formatted: 01 Oct 2014
"""
import
pandas
as
pd
import
pytest
from
store_tools
import
load_record
from
store_tools.recordheppubli
import
RecordHepPubli
@pytest.fixture(scope="module")
def record():
    """Load once per module the inspirehep.net literature record 1319638."""
    host = "inspirehep.net"
    record_id = 1319638
    return load_record(host, record_id, shelf="literature")
def test_upcast_ins_05001(record):
    """The loaded record is an instance of RecordHepPubli."""
    is_hep_publi = isinstance(record, RecordHepPubli)
    assert is_hep_publi
def test_constructor_ins_05002(record):
    """Check the DataFrames stored in the fields ``authors`` and
    ``publication_info``, built by _process_authors and
    _process_publication_info.

    """
    # authors
    df_authors = record["authors"]
    assert isinstance(df_authors, pd.DataFrame)

    expected_columns = ["affiliation",
                        "first_name",
                        "fmt_name",
                        "full_name",
                        "last_name"]

    # no column outside the expected ones
    unexpected = df_authors.columns.difference(expected_columns)
    assert len(unexpected) == 0

    assert len(df_authors) == 704
    assert df_authors.affiliation.iloc[12] == "INFN, Rome|CERN"

    # publication information
    df_info = record["publication_info"]
    assert isinstance(df_info, pd.DataFrame)
    assert len(df_info) == 1

    row = df_info.iloc[0]
    assert row.title == "JINST"
    assert row.volume == "9"
    assert row.year == "2014"
    assert row.pagination == "P12005"
# ............................................................................
#
# Section devoted to authors
#
def test_is_authors_ins_05010(record):
    """is_authors() is truthy for the record."""
    has_authors = record.is_authors()
    assert has_authors
def test_authors_as_list_ins_05011(record):
    """Check the length of the author list and a few of its entries."""
    names = record.authors_as_list()
    assert len(names) == 704

    # position in the list -> expected name
    expected = {0: "Aaij, Roel",
                1: "Adeva, Bernardo",
                344: "Le Gac, Renaud",
                -1: "Zvyagin, Alexander"}

    for index, name in expected.items():
        assert names[index] == name
def test_first_author_ins_05012(record):
    """first_author() returns "Aaij, Roel"."""
    name = record.first_author()
    assert name == "Aaij, Roel"
def test_find_authors_ins_05013(record):
    """find_authors("Leo") returns the three matching names."""
    found = record.find_authors("Leo")
    assert found == "Beaucourt, Leo, Kravchuk, Leonid, Leo, Sabato"
def test_reformat_author_ins_05014(record):
    """Reformat the authors as "F. Last" and check a few entries.

    The record is shared by all the tests of the module. The original
    format "Last, First" is therefore restored in all cases, even when
    an assertion fails, so that the following tests are not affected.

    """
    record.reformat_authors("F. Last")

    try:
        authors = record.authors_as_list()
        assert len(authors) == 704
        assert authors[0] == "R. Aaij"
        assert authors[1] == "B. Adeva"
        assert authors[12] == "A. A. Alves Jr"
        assert authors[344] == "R. Le Gac"
        assert authors[-1] == "A. Zvyagin"

    finally:
        # always put the shared record back to its initial format
        record.reformat_authors("Last, First")
# ............................................................................
#
# Section devoted to affiliation
#
def test_is_affiliations_ins_05020(record):
    """is_affiliations() and is_affiliation_for_all() are both truthy."""
    some_affiliations = record.is_affiliations()
    assert some_affiliations

    affiliation_for_all = record.is_affiliation_for_all()
    assert affiliation_for_all
def test_institutes_ins_05021(record):
    """Check a few entries of the list returned by institutes()."""
    names = record.institutes()

    # position in the list -> expected institute
    expected = {0: "AGH-UST, Cracow",
                44: "MIT",
                -1: "Zurich U."}

    for index, name in expected.items():
        assert names[index] == name
def test_find_affiliation_ins_05022(record):
    """find_affiliation() returns "Marseille, CPPM" for the pattern."""
    pattern = r"Marseille, CPPM|CPPM, Marseille"
    found = record.find_affiliation(pattern)
    assert found == "Marseille, CPPM"
# ............................................................................
#
# Section devoted to authors and institutes
#
def test_first_author_institutes_ins_05030(record):
    """first_author_institutes() returns "NIKHEF, Amsterdam"."""
    institutes = record.first_author_institutes()
    assert institutes == "NIKHEF, Amsterdam"
def test_find_authors_by_affiliation_ins_05031(record):
    """Check the authors found for the CPPM affiliation pattern,
    using "|" as separator.

    """
    expected = ("Akar, Simon|Aslanides, Elie|Cogan, Julien|"
                "Kanso, Walaa|Le Gac, Renaud|Leroy, Olivier|"
                "Mancinelli, Giampiero|Mordà, Alessandro|"
                "Perrin-Terrin, Mathieu|Serrano, Justine|"
                "Tsaregorodtsev, Andrei")

    found = record.find_authors_by_affiliation(
        "CPPM, Marseille|Marseille, CPPM", sep="|")

    assert found == expected
# ............................................................................
#
# Other methods
#
def test_collaboration_ins_05040(record):
    """collaboration() returns "LHCb Collaboration"."""
    name = record.collaboration()
    assert name == "LHCb Collaboration"
def test_is_published_ins_05041(record):
    """is_published() is truthy for the record."""
    published = record.is_published()
    assert published
def test_is_with_erratum_ins_05042(record):
    """is_with_erratum() is falsy for the record."""
    with_erratum = record.is_with_erratum()
    assert not with_erratum
def test_paper_info_ins_05043(record):
    """Check the editor, pages, volume and year of the paper."""
    editor = record.paper_editor()
    assert editor == "JINST"

    pages = record.paper_pages()
    assert pages == "P12005"

    volume = record.paper_volume()
    assert volume == "9"

    year = record.paper_year()
    assert year == "2014"
def test_paper_reference_ins_05044(record):
    """paper_reference() returns "JINST 9 2014 P12005"."""
    reference = record.paper_reference()
    assert reference == "JINST 9 2014 P12005"
def test_preprint_number_ins_05045(record):
    """preprint_number() returns "arXiv:1410.0149"."""
    number = record.preprint_number()
    assert number == "arXiv:1410.0149"
def test_paper_url_ins_05046(record):
    """paper_url() returns the URL of the file stored at inspirehep.net."""
    expected = "https://inspirehep.net/files/d7355c9818375e62fdd3be49a2b52ae1"
    url = record.paper_url()
    assert url == expected
def test_report_number_ins_05047(record):
    """Check the report numbers, whatever their order is."""
    numbers = record.report_number().split(", ")
    expected = {"CERN-PH-EP-2014-221", "LHCB-PAPER-2014-047"}
    assert set(numbers) == expected
def test_submitted_ins_05048(record):
    """submitted() returns "2014-10-01"."""
    date = record.submitted()
    assert date == "2014-10-01"
def test_title_ins_05049(record):
    """title() returns the title of the publication."""
    title = record.title()
    assert title == "Precision luminosity measurements at LHCb"
# ............................................................................
#
# Another publication
#
def test_all_ins_05050():
    """Check several methods on another publication, which is the same
    article as oai:inspirehep.net:1762838 and oai:cds.cern.ch:2698323.

    """
    publi = load_record("inspirehep.net", 1762838, shelf="literature")

    title = publi.title()
    assert title == \
        r"Updated measurement of decay-time-dependent CP asymmetries " \
        r"in $D^0 \to K^+K^-$ and $D^0 \to \pi^+\pi^-$ decays"

    reference = publi.paper_reference()
    assert reference == "Phys. Rev. D 101 2020 012005"

    first_author = publi.first_author()
    assert first_author == "Aaij, Roel"

    primary = publi.primary_oai()
    assert primary == "oai:inspirehep.net:1762838"

    secondary = publi.secondary_oai()
    assert secondary == "oai:cds.cern.ch:2698323"

    url = publi.paper_url()
    assert url == \
        "https://inspirehep.net/files/c25e21267be950a4abb9d3e147328982"

    preprint = publi.preprint_number()
    assert preprint == "arXiv:1911.01114"

    report = publi.report_number()
    assert report == "CERN-EP-2019-225, LHCb-PAPER-2019-032"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment