Docker-in-Docker (DinD) capabilities of public runners deactivated. More info

Commit 4612b6c2 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Add RecordHepPubli and test_05_RecordHepPubli

parent 14d0602f
......@@ -74,4 +74,4 @@ def load_record(host, record_id, shelf=None):
"""
store = build_store(host, shelf=shelf)
recjson = store.get_record(record_id)
return build_record(recjson)
return build_record(recjson, shelf=shelf)
......@@ -13,10 +13,11 @@ from .base import (CDS,
REG_CONF)
from datetime import datetime
from .exception import CdsException
from .exception import CdsException, RecordException
from .inveniostore import InvenioStore
from .inspirehepstore import InspirehepStore, SHELFS
from .recordconf import RecordConf
from .recordheppubli import RecordHepPubli
from .recordinst import RecordInst
from .recordpubli import RecordPubli
from .recordthesis import RecordThesis
......@@ -24,6 +25,8 @@ from .recordthesis import RecordThesis
# Regular expressions capturing the text which follows a literal "$$t"
# (respectively "$$u") marker. Raw strings are used so that the regex
# escapes "\$" and "\w" are not interpreted as (invalid) string escapes.
REX_T = r"\$\$t([\w, ]+)"
REX_U = r"\$\$u([\w, ]+)"

# Message of the RecordException raised when a JSON record cannot be upcast.
MSG_FAIL_UPCAST = "Failed to upcast the JSON record"
def add_conference_data(recjson):
"""Add the conference data to the recjson.
......@@ -157,32 +160,68 @@ def add_conference_data(recjson):
"url": url}
def build_record(recjson, shelf=None):
    """Transform a JSON object into a record.

    Note:
        This tool works for JSON objects coming from cds.cern.ch,
        old.inspirehep.net as well as inspirehep. In the latter case
        the shelf has to be defined.

    Args:
        recjson (dict):
            record data in a JSON format.

        shelf (str):
            section of the inspirehep store containing records.
            Possible values are ``literature``, ``conferences``
            and ``institutions``. Leave it to ``None`` for records
            coming from cds.cern.ch or old.inspirehep.net.

    Returns:
        Record:
            either RecordConf, RecordHepPubli, RecordInst, RecordPubli
            or RecordThesis

    Raises:
        RecordException:
            when the shelf is defined but no record class handles the
            JSON object yet (conference or thesis on the ``literature``
            shelf, any record of the ``institutions`` shelf, or any
            other shelf value).

    """
    # ........................................................................
    #
    # cds.cern.ch or old.inspirehep.net
    #
    if shelf is None:
        if is_conference(recjson):
            add_conference_data(recjson)
            return RecordConf(recjson)

        if is_institute(recjson):
            return RecordInst(recjson)

        if is_thesis(recjson):
            return RecordThesis(recjson)

        return RecordPubli(recjson)

    # ........................................................................
    #
    # inspirehep
    #
    if shelf == "literature":
        # TODO: dedicated classes (RecordHepConf, RecordHepThesis) do not
        # exist yet. Fail explicitly instead of reaching the end of the
        # function with an unbound result.
        if is_conference(recjson) or is_thesis(recjson):
            raise RecordException(MSG_FAIL_UPCAST)

        return RecordHepPubli(recjson)

    # TODO: the "institutions" shelf requires a RecordHepInst class which
    # does not exist yet. It is rejected like any unknown shelf.
    raise RecordException(MSG_FAIL_UPCAST)
......
""" store_tools.recordheppubli
"""
from filters import CLEAN_COLLABORATION
from pandas import DataFrame
from .recordhep import RecordHep
from store_tools.pluginauthors import PluginAuthors
from store_tools.pluginpublicationinfo import PluginPublicationInfo
class RecordHepPubli(RecordHep, PluginAuthors, PluginPublicationInfo):
    """Article, preprint and proceeding from inspirehep.net version 2.

    Schema documentation is defined here:
    https://inspire-schemas.readthedocs.io/en/latest/schemas/

    """

    def __init__(self, recjson):
        """Build the record and normalise its authors and publication data.

        Args:
            recjson (dict):
                record data in a JSON format, forwarded to the parent
                constructor.

        """
        super().__init__(recjson)

        self._last_fmt_author = "Last, First"
        self._process_authors()
        self._process_publication_info()

    def _process_authors(self):
        """Convert authors information into DataFrame:

        Authors and their affiliations are stored in DataFrame with the
        following structure:

            +---------------+--------------------------------+
            | column        |                                |
            +===============+================================+
            | affiliation   | value separated by "|"         |
            +---------------+--------------------------------+
            | first_name    | first name                     |
            +---------------+--------------------------------+
            | fmt_name      | formatted name                 |
            +---------------+--------------------------------+
            | full_name     | Last, First                    |
            +---------------+--------------------------------+
            | last_name     | family name                    |
            +---------------+--------------------------------+

        Note:
            After running this method, the field ``authors`` is always
            defined. It contains one entry with empty strings when the
            field does not exist or is empty.

        """
        authors = self.get("authors", None)

        if not authors:
            cols = ["affiliation",
                    "first_name",
                    "fmt_name",
                    "full_name",
                    "last_name"]
            self["authors"] = DataFrame([[""] * len(cols)], columns=cols)
            return

        data = []
        for author in authors:
            affiliations = \
                [elt["value"] for elt in author.get("affiliations") or []]

            full_name = author["full_name"]

            # Split on the first comma only: a name without comma ends up
            # entirely in last_name and additional commas stay in
            # first_name, instead of raising a ValueError.
            last_name, _, first_name = full_name.partition(",")

            data.append({"affiliation": "|".join(affiliations),
                         "first_name": first_name.strip(),
                         "fmt_name": full_name,
                         "full_name": full_name,
                         "last_name": last_name.strip()})

        # protection against duplicated entries, e.g. twice the first author
        df = DataFrame(data).drop_duplicates("full_name")

        # replace
        self["authors"] = df

    def _process_publication_info(self):
        """Convert publication_info into DataFrame:

        Note:
            * the field is a list when there are erratum
            * in some case the subfield year is a list (cds 1951625)

        publication information are stored in DataFrame with the
        following structure:

            +------------+--------------------------------+
            | column     |                                |
            +============+================================+
            | title      | abbreviation of the publisher  |
            +------------+--------------------------------+
            | volume     | volume                         |
            +------------+--------------------------------+
            | year       | year of publication            |
            +------------+--------------------------------+
            | pagination | page number or ranges          |
            +------------+--------------------------------+

        Note:
            * After running this method, the field ``publication_info``
              is always defined. It contains one entry with empty strings
              when the field does not exist or is empty.
            * In order to deal with erratum, entries are sorted by year
              and volume.

        """
        data = self.get("publication_info", None)

        if not data:
            cols = ["title",
                    "volume",
                    "year",
                    "pagination"]
            self["publication_info"] = \
                DataFrame([[""] * len(cols)], columns=cols)
            return

        df = DataFrame(data)

        # the year is optional: casting a missing column raises a KeyError
        if "year" in df.columns:
            df = df.astype({"year": str})

        df = df.rename(columns={"artid": "pagination",
                                "journal_title": "title",
                                "journal_volume": "volume"})

        columns = df.columns

        # erratum -- sort by year and volume
        if {"year", "volume"}.issubset(columns):
            df = df.sort_values(["year", "volume"])

        elif "year" in columns:
            df = df.sort_values("year")

        # replace
        self["publication_info"] = df

    def collaboration(self):
        """The collaboration(s) signing the publication.

        Returns:
            str:
                * collaborations are separated by a comma.
                * The filter CLEAN_COLLABORATION is applied.
                * empty string when not defined

        """
        collaborations = self.get("collaborations", None)
        if collaborations is None:
            return ""

        # append the word "Collaboration" unless the name already ends
        # with "collaboration" / "Collaboration"
        lst = [val if val.endswith("ollaboration") else f"{val} Collaboration"
               for val in (elt["value"] for elt in collaborations)]

        return CLEAN_COLLABORATION(", ".join(lst))

    def paper_url(self):
        """The URL of the document.

        Returns:
            str:
                * the string is empty when no URLs are found.
                * first URL is selected when there is more than one

        """
        documents = self.get("documents", None)
        if not documents:
            return ""

        return documents[0]["url"]

    def preprint_number(self):
        """The ArXiv preprint number.

        Returns:
            str:
                * numbers are separated by a comma.
                * empty string when it is not defined.

        """
        eprints = self.get("arxiv_eprints", None) or []
        return ", ".join(f"arXiv:{elt['value']}" for elt in eprints)

    def report_number(self):
        """The report number(s) associated to the publication.

        Returns:
            str:
                - Numbers are separated by a comma
                - Numbers are sorted in alphabetic order.
                - Empty string when not defined.

        """
        numbers = self.get("report_numbers", None) or []

        # sorted to honour the documented contract and to get an output
        # which does not depend on the order of the JSON entries
        return ", ".join(sorted(elt["value"] for elt in numbers))

    def submitted(self):
        """The date of submission.

        Returns:
            str:
                * the content of the field ``preprint_date``, unmodified.
                * Empty string when not defined.

        """
        val = self.get("preprint_date", None)
        return "" if val is None else val

    def title(self):
        """The title of the publication.

        Returns:
            str:
                * Empty string when not defined.
                * First one is selected when there is more than one.

        Note:
            No filter is applied to the title.
            NOTE(review): the previous documentation mentioned the filter
            CLEAN_SPACES but it was never applied -- confirm whether it
            should be.

        """
        titles = self.get("titles", None)
        if not titles:
            return ""

        return titles[0]["title"]
"""test_05_RecordHepPubli
Test all methods of the RecordHepPubli class for a given article:
https://inspirehep.net/api/literature/1319638
Precision luminosity measurements at LHCb,
J. Instrum. 9 (2014) P12005
arXiv:1410.0149
704 authors
No corrections are applied to the record.
This allows testing the brute-force decoding, mistakes included.
Note:
* The first author is not in the author list
* LHCb collaboration
* The publication year is a list (duplicate 773y)
* The submitted date is not formatted: 01 Oct 2014
"""
import pandas as pd
import pytest
from store_tools import load_record
from store_tools.recordheppubli import RecordHepPubli
@pytest.fixture(scope="module")
def record():
    """Load the literature record 1319638 from inspirehep.net.

    The fixture has the module scope: the record is loaded once and
    shared by all the tests of this module.

    """
    rec = load_record("inspirehep.net", 1319638, shelf="literature")
    return rec
def test_upcast_ins_05001(record):
    """The loaded record is an instance of RecordHepPubli."""
    is_hep_publi = isinstance(record, RecordHepPubli)
    assert is_hep_publi
def test_constructor_ins_05002(record):
    """Test the methods _process_authors and _process_publication_info.

    Both are run by the constructor and replace the fields ``authors``
    and ``publication_info`` by DataFrames.

    """
    authors = record["authors"]
    assert isinstance(authors, pd.DataFrame)

    refcols = {"affiliation",
               "first_name",
               "fmt_name",
               "full_name",
               "last_name"}

    # strict equality: detect extra columns as well as missing ones
    assert set(authors.columns) == refcols

    assert len(authors) == 704
    assert authors.affiliation.iloc[12] == "INFN, Rome|CERN"

    papers = record["publication_info"]
    assert isinstance(papers, pd.DataFrame)
    assert len(papers) == 1

    paper = papers.iloc[0]
    assert paper.title == "JINST"
    assert paper.volume == "9"
    assert paper.year == "2014"
    assert paper.pagination == "P12005"
# ............................................................................
#
# Section devoted to authors
#
def test_is_authors_ins_05010(record):
    """The record reports that it has authors."""
    has_authors = record.is_authors()
    assert has_authors
def test_authors_as_list_ins_05011(record):
    """Check the size of the author list and a few of its entries."""
    names = record.authors_as_list()
    assert len(names) == 704

    # spot checks: (position in the list, expected name)
    expected = ((0, "Aaij, Roel"),
                (1, "Adeva, Bernardo"),
                (344, "Le Gac, Renaud"),
                (-1, "Zvyagin, Alexander"))

    for idx, name in expected:
        assert names[idx] == name
def test_first_author_ins_05012(record):
    """The first author is returned with the format Last, First."""
    first = record.first_author()
    assert first == "Aaij, Roel"
def test_find_authors_ins_05013(record):
    """Search the authors matching the pattern Leo."""
    found = record.find_authors("Leo")
    expected = "Beaucourt, Leo, Kravchuk, Leonid, Leo, Sabato"
    assert found == expected
def test_reformat_author_ins_05014(record):
    """Reformat the author names as F. Last and check a few entries.

    The record is shared by all the tests of the module. The original
    format is therefore restored in a ``finally`` clause, so that a failing
    assertion does not leave the record modified for the following tests.

    """
    record.reformat_authors("F. Last")

    try:
        authors = record.authors_as_list()
        assert len(authors) == 704
        assert authors[0] == "R. Aaij"
        assert authors[1] == "B. Adeva"
        assert authors[12] == "A. A. Alves Jr"
        assert authors[344] == "R. Le Gac"
        assert authors[-1] == "A. Zvyagin"

    finally:
        record.reformat_authors("Last, First")
# ............................................................................
#
# Section devoted to affiliation
#
def test_is_affiliations_ins_05020(record):
    """Affiliations are defined, and defined for all the authors."""
    has_affiliations = record.is_affiliations()
    assert has_affiliations

    all_affiliated = record.is_affiliation_for_all()
    assert all_affiliated
def test_institutes_ins_05021(record):
    """Check the first, the 45th and the last institute of the list."""
    labs = record.institutes()

    # spot checks: (position in the list, expected institute)
    for idx, lab in ((0, "AGH-UST, Cracow"), (44, "MIT"), (-1, "Zurich U.")):
        assert labs[idx] == lab
def test_find_affiliation_ins_05022(record):
    """Search the affiliation with a pattern covering both spellings."""
    pattern = r"Marseille, CPPM|CPPM, Marseille"
    found = record.find_affiliation(pattern)
    assert found == "Marseille, CPPM"
# ............................................................................
#
# Section devoted to authors and institutes
#
def test_first_author_institutes_ins_05030(record):
    """Check the institute(s) of the first author."""
    labs = record.first_author_institutes()
    assert labs == "NIKHEF, Amsterdam"
def test_find_authors_by_affiliation_ins_05031(record):
    """Search the authors of an institute, names being separated by "|"."""
    expected = ("Akar, Simon|Aslanides, Elie|Cogan, Julien|"
                "Kanso, Walaa|Le Gac, Renaud|Leroy, Olivier|"
                "Mancinelli, Giampiero|Mordà, Alessandro|"
                "Perrin-Terrin, Mathieu|Serrano, Justine|"
                "Tsaregorodtsev, Andrei")

    found = record.find_authors_by_affiliation(
        "CPPM, Marseille|Marseille, CPPM", sep="|")

    assert found == expected
# ............................................................................
#
# Other methods
#
def test_collaboration_ins_05040(record):
    """Check the name of the collaboration signing the publication."""
    name = record.collaboration()
    assert name == "LHCb Collaboration"
def test_is_published_ins_05041(record):
    """The record is reported as published."""
    published = record.is_published()
    assert published
def test_is_with_erratum_ins_05042(record):
    """The record is reported without erratum."""
    with_erratum = record.is_with_erratum()
    assert not with_erratum
def test_paper_info_ins_05043(record):
    """Check the editor, the pages, the volume and the year of the paper."""
    editor = record.paper_editor()
    assert editor == "JINST"

    pages = record.paper_pages()
    assert pages == "P12005"

    volume = record.paper_volume()
    assert volume == "9"

    year = record.paper_year()
    assert year == "2014"
def test_paper_reference_ins_05044(record):
    """Check the full reference of the paper."""
    reference = record.paper_reference()
    assert reference == "JINST 9 2014 P12005"
def test_preprint_number_ins_05045(record):
    """Check the arXiv preprint number."""
    number = record.preprint_number()
    assert number == "arXiv:1410.0149"
def test_paper_url_ins_05046(record):
    """Check the URL of the document."""
    expected = "https://inspirehep.net/files/d7355c9818375e62fdd3be49a2b52ae1"
    url = record.paper_url()
    assert url == expected
def test_report_number_ins_05047(record):
    """Check the report numbers, whatever their order in the string."""
    numbers = record.report_number().split(", ")
    expected = {"CERN-PH-EP-2014-221", "LHCB-PAPER-2014-047"}
    assert set(numbers) == expected
def test_submitted_ins_05048(record):
    """Check the date of submission."""
    date = record.submitted()
    assert date == "2014-10-01"
def test_title_ins_05049(record):
    """Check the title of the publication."""
    title = record.title()
    assert title == "Precision luminosity measurements at LHCb"
# ............................................................................
#
# Another publication
#
def test_all_ins_05050():
    """Check several methods on a second publication.

    Same article as oai:inspirehep.net:1762838 and oai:cds.cern.ch:2698323.

    """
    rec = load_record("inspirehep.net", 1762838, shelf="literature")

    title = (r"Updated measurement of decay-time-dependent CP asymmetries "
             r"in $D^0 \to K^+K^-$ and $D^0 \to \pi^+\pi^-$ decays")
    assert rec.title() == title

    assert rec.paper_reference() == "Phys. Rev. D 101 2020 012005"
    assert rec.first_author() == "Aaij, Roel"

    assert rec.primary_oai() == "oai:inspirehep.net:1762838"
    assert rec.secondary_oai() == "oai:cds.cern.ch:2698323"

    url = "https://inspirehep.net/files/c25e21267be950a4abb9d3e147328982"
    assert rec.paper_url() == url

    assert rec.preprint_number() == "arXiv:1911.01114"
    assert rec.report_number() == "CERN-EP-2019-225, LHCb-PAPER-2019-032"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment