Commit ec43e308 authored by LE GAC Renaud's avatar LE GAC Renaud
Browse files

Merge RecordCds into RecordCdsPubli

parent 5eff26ac
......@@ -12,7 +12,6 @@ from gluon import current
from store_tools import (MSG_NO_CONF,
MSG_NO_THESIS,
OAI_URL,
RecordCds,
RecordCdsConf,
RecordCdsPubli,
RecordCdsThesis,
......@@ -986,7 +985,7 @@ class CheckAndFix(object):
date = '%s-%02i-%02i' % data
# update
if isinstance(record, RecordCds):
if isinstance(record, RecordCdsPubli):
# in some case we have to deal with a list (see cds 2234042)
# in some case it is not defined (e.g. phd thesis)
if "prepublication" in record:
......
......@@ -29,7 +29,6 @@ from .factory import build_record, build_store
from .inspirehepstore import InspirehepStore
from .publicationinfomixin import PublicationInfoMixin
from .cdsstore import CdsStore
from .recordcds import RecordCds
from .recordcdsconf import RecordCdsConf
from .recordcdspubli import RecordCdsPubli
from .recordcdsthesis import RecordCdsThesis
......
""" store_tools.recordcds
"""
import pprint
from .base import OAI, OAI_URL, REG_OAI
class RecordCds(dict):
    """Base class for JSON record coming from cds.cern.ch or old.inspirehep.net.

    It is a dictionary with the following structure::

        record[field][subfield] = value
        record[field][subfield] = [val1, val2, ....]
        record[field] = [dict1(subfield1=..., subfield2=...),
                         dict2(subfield1=..., subfield2=...), ...]

    For an article, typical fields are (cds 1951625, ins 1319638, *etc.*):

    +-----------------------------+-----------------------------+
    | field (cds)                 | field (inspirehep)          |
    +=============================+=============================+
    | abstract                    | abstract                    |
    +-----------------------------+-----------------------------+
    | accelerator_experiment      | accelerator_experiment      |
    +-----------------------------+-----------------------------+
    | agency_code                 |                             |
    +-----------------------------+-----------------------------+
    | authors                     | authors                     |
    +-----------------------------+-----------------------------+
    | base                        |                             |
    +-----------------------------+-----------------------------+
    | collection                  | collection                  |
    +-----------------------------+-----------------------------+
    | comment                     | comment                     |
    +-----------------------------+-----------------------------+
    | copyright_status            |                             |
    +-----------------------------+-----------------------------+
    | corporate_name              | corporate_name              |
    +-----------------------------+-----------------------------+
    | creation_date               | creation_date               |
    +-----------------------------+-----------------------------+
    | doi                         | doi                         |
    +-----------------------------+-----------------------------+
    | email_message               |                             |
    +-----------------------------+-----------------------------+
    | filenames                   | filenames                   |
    +-----------------------------+-----------------------------+
    | files                       | files                       |
    +-----------------------------+-----------------------------+
    | filetypes                   | filetypes                   |
    +-----------------------------+-----------------------------+
    | imprint                     | imprint                     |
    +-----------------------------+-----------------------------+
    | keywords                    | keywords                    |
    +-----------------------------+-----------------------------+
    | language                    |                             |
    +-----------------------------+-----------------------------+
    | license                     | license                     |
    +-----------------------------+-----------------------------+
    | number_of_authors           | number_of_authors           |
    +-----------------------------+-----------------------------+
    | number_of_citations         | number_of_citations         |
    +-----------------------------+-----------------------------+
    | number_of_comments          | number_of_comments          |
    +-----------------------------+-----------------------------+
    | number_of_reviews           | number_of_reviews           |
    +-----------------------------+-----------------------------+
    | oai                         | FIXME_OAI                   |
    +-----------------------------+-----------------------------+
    | other_report_number         |                             |
    +-----------------------------+-----------------------------+
    | persistent_identifiers_keys | persistent_identifiers_keys |
    +-----------------------------+-----------------------------+
    | physical_description        | physical_description        |
    +-----------------------------+-----------------------------+
    | prepublication              | prepublication              |
    +-----------------------------+-----------------------------+
    | primary_report_number       | primary_report_number       |
    +-----------------------------+-----------------------------+
    | publication_info            | publication_info            |
    +-----------------------------+-----------------------------+
    | recid                       | recid                       |
    +-----------------------------+-----------------------------+
    |                             | reference                   |
    +-----------------------------+-----------------------------+
    | report_number               |                             |
    +-----------------------------+-----------------------------+
    |                             | source_of_acquisition       |
    +-----------------------------+-----------------------------+
    | status_week                 |                             |
    +-----------------------------+-----------------------------+
    | subject                     | subject                     |
    +-----------------------------+-----------------------------+
    | system_control_number       | system_control_number       |
    +-----------------------------+-----------------------------+
    | thesaurus_terms             | thesaurus_terms             |
    +-----------------------------+-----------------------------+
    | title                       | title                       |
    +-----------------------------+-----------------------------+
    |                             | title_additional            |
    +-----------------------------+-----------------------------+
    | url                         |                             |
    +-----------------------------+-----------------------------+
    | version_id                  | version_id                  |
    +-----------------------------+-----------------------------+

    The class comes with a collection of methods to extract the record
    information masking the ``field`` and the ``subfield`` codification.

    """

    def __init__(self, *args):
        """Build the record from the arguments accepted by ``dict``.

        Args:
            *args:
                forwarded unchanged to ``dict.__init__``.

        """
        dict.__init__(self, *args)

        # private cache filled by host()
        self.__host = None

        # meta data
        # the authors of my institutes signing the record
        # string containing a list of name separated by a comma
        self.my_authors = ""

    def _get(self, field, subfield, force_list=False):
        """Get the value associated to the ``field`` and ``subfield``.

        The field content is either a dictionary of subfields or a list of
        such dictionaries. Any other content (string, number, ``None``) and
        any list element which is not a dictionary carries no subfield and
        is ignored instead of raising ``TypeError``.

        Args:
            field (str):
                name of the field, *e.g.* ``authors``
            subfield (str):
                name of the subfield, *e.g.* ``full_name``
            force_list (bool):
                always return a *list* when ``True``.

        Returns:
            str:
                value or an empty string when not defined.
            list:
                list of values or an empty list when not defined

        """
        val = ""
        content = self.get(field)

        if isinstance(content, dict):
            if subfield in content:
                val = content[subfield]

        elif isinstance(content, list):
            # merge the subfield of every dictionary into a flat list
            val = []
            for el in content:
                if not isinstance(el, dict) or subfield not in el:
                    continue
                if isinstance(el[subfield], list):
                    val.extend(el[subfield])
                else:
                    val.append(el[subfield])

        if force_list and not isinstance(val, list):
            # only sized objects can be "empty"; a number is always a value
            is_empty = val is None or (hasattr(val, "__len__")
                                       and len(val) == 0)
            val = ([] if is_empty else [val])

        return val

    @staticmethod
    def _oai_url(value):
        """Build the Open Archive Initiative URL.

        Args:
            value (str):
                OAI identifier, *e.g.* ``oai:host:id``

        Returns:
            str:
                the pattern of the string is ``http://host/record/id``.
                The string is empty when it is not defined or when the value
                is not well formed.

        """
        match = REG_OAI.match(value)
        if match:
            return OAI_URL % (match.group(1), match.group(2))

        return ""

    def debug(self):
        """Print the record structure on the standard output.

        """
        pprint.pprint(self)

    def host(self):
        """The store housing the record.

        Returns:
            str:
                ``inspirehep.net`` or ``cds.cern.ch`` or an empty string
                when not defined.

        """
        # The value is computed once and cached in self.__host
        if self.__host is not None:
            return self.__host

        val = self.primary_oai()
        if not val:
            self.__host = None
            return ""

        match = REG_OAI.match(val)
        if match:
            self.__host = match.group(1)
            return self.__host

        return ""

    def id(self):
        """The id of the record in the store.

        Returns:
            int:
                the unique id of the record in the store

        Raises:
            KeyError:
                when the record has no ``recid`` field.

        """
        return self["recid"]

    def oai(self):
        """The Open Archive Initiative identifier(s).

        Returns:
            str:
                the primary and secondary OAI identifier are separated
                by a comma. The pattern of the identifier is ``oai:host:id`` or
                an empty string when it is not defined.

        """
        lst = [self.primary_oai(), self.secondary_oai()]
        # strip removes the dangling separator when one identifier is missing
        return ", ".join(lst).strip(", ")

    def oai_url(self):
        """The Open Archive Initiative identifier URL(s).

        Returns:
            str:
                the primary and secondary URLs are separated by a comma.
                The pattern of the URL is ``http://host/record/id`` or
                an empty string when it is not defined or when the OAI is
                not well formed.

        """
        lst = [self.primary_oai_url(), self.secondary_oai_url()]
        # strip removes the dangling separator when one URL is missing
        return ", ".join(lst).strip(", ")

    def primary_oai(self):
        """The primary Open Archive Initiative identifier.

        The primary OAI identifier matches the record identifier.

        Returns:
            str:
                the pattern of the string is ``oai:host:id``.
                It is an empty string when not defined

        """
        # the location of the OAI information depends on the store
        if "oai" in self:
            field, subfield = "oai", "value"
        elif "FIXME_OAI" in self:
            field, subfield = "FIXME_OAI", "id"
        else:
            return ""

        # standard case
        value = self._get(field, subfield)

        # in some case OAI is a list (e.g. cds1513204)
        # select the OAI corresponding to the record identifier.
        if isinstance(value, list):
            myid = self.id()
            for el in value:
                if el.endswith(str(myid)):
                    return el
            return ""

        return value

    def primary_oai_url(self):
        """The Open Archive Initiative URL for the primary OAI.

        Note:
            A record can be deleted and replaced by a new one.
            In that case the OAI is not changed but the record has
            a new *id* and new *URL* which is returned by this method.

        Returns:
            str:
                the pattern of the string is ``http://host/record/id``.
                The string is empty when it is not defined or when the OAI
                is not well formed.

        """
        oai = self.primary_oai()
        rec_id = str(self.id())

        if oai.endswith(rec_id):
            return self._oai_url(oai)

        # the OAI refers to another id: rebuild the URL from the host.
        # Without a host the URL would be the malformed ``http:///record/id``
        host = self.host()
        if not host:
            return ""

        return OAI_URL % (host, rec_id)

    def secondary_oai(self):
        """The secondary OAI identifier.

        If the current store is *cds.cern.ch*, the secondary OAI identifier
        corresponds to the record in the other store, *inspirehep.net*.

        Returns:
            str:
                the pattern of the string is ``oai:host:id``.
                It is an empty string when not defined

        """
        if "system_control_number" not in self:
            return ""

        data = self["system_control_number"]
        data = (data if isinstance(data, list) else [data])

        # data is a list of dictionary
        # keys are `institute`, `value` or `cancelled`
        for di in data:
            institute = di.get("institute")

            if institute == "CDS":
                if "value" in di:
                    return OAI % ("cds.cern.ch", di["value"])

            elif institute == "Inspire":
                if "value" in di:
                    return OAI % ("inspirehep.net", di["value"])

        return ""

    def secondary_oai_url(self):
        """The Open Archive Initiative URL for the secondary OAI.

        Returns:
            str:
                the pattern of the string is ``http://host/record/id``.
                The string is empty when it is not defined or when the OAI
                is not well formed.

        """
        return self._oai_url(self.secondary_oai())
......@@ -2,21 +2,21 @@
"""
import logging
import pprint
from .authorsmixin import AuthorsMixin
from .base import ARXIV
from .base import ARXIV, OAI, OAI_URL, REG_OAI
from filters import CLEAN_COLLABORATION
from pandas import concat, DataFrame
from plugin_dbui import CLEAN_SPACES
from .publicationinfomixin import PublicationInfoMixin
from store_tools.recordcds import RecordCds
def to_str(x):
    """Collapse a list of strings into a single ``|``-separated string.

    Args:
        x:
            a list of strings or any other object.

    Returns:
        str:
            the items joined by ``|`` when ``x`` is a list.
        object:
            ``x`` itself, untouched, when it is not a list.

    """
    if not isinstance(x, list):
        return x

    return "|".join(x)
class RecordCdsPubli(RecordCds, AuthorsMixin, PublicationInfoMixin):
class RecordCdsPubli(dict, AuthorsMixin, PublicationInfoMixin):
"""Article, preprint, proceeding, report and talk from cds.cern.ch or
old.inspirehep.net.
......@@ -123,13 +123,79 @@ class RecordCdsPubli(RecordCds, AuthorsMixin, PublicationInfoMixin):
def __init__(self, *args):
    """Build the record and pre-process its authors and publication info.

    Args:
        *args:
            forwarded unchanged to the base class initialiser.

    """
    # private cache filled by host()
    self.__host = None

    # NOTE(review): presumably the format of the author names used by the
    # authors processing -- confirm against AuthorsMixin
    self._last_fmt_author = "Last, First"

    self.logger = logging.getLogger("web2py.app.limbra")

    # RecordCds is no longer a base class: the dictionary is initialised
    # once, through the MRO, instead of a second time via RecordCds.__init__
    super().__init__(*args)

    self._process_authors()
    self._process_publication_info()

    # the authors of my institutes signing the record
    # string containing a list of name separated by a comma
    self.my_authors = ""
def _get(self, field, subfield, force_list=False):
"""Get the value associated to the ``field`` and ``subfield``.
Args:
field (str):
name of the field, *e.g.* ``authors``
subfield (str):
name of the subfield, *e.g.* ``full_name``
force_list (bool):
always return a *list* when ``True``.
Returns:
str:
value or an empty string when not defined.
list:
list of values or an empty list when not defined
"""
val = ""
if field in self and subfield in self[field]:
val = self[field][subfield]
elif field in self and isinstance(self[field], list):
val = []
for el in self[field]:
if subfield in el:
if isinstance(el[subfield], list):
val.extend(el[subfield])
else:
val.append(el[subfield])
if force_list and not isinstance(val, list):
val = ([val] if len(val) > 0 else [])
return val
@staticmethod
def _oai_url(value):
    """Translate an OAI identifier into the URL of the record.

    Args:
        value (str):
            OAI identifier, *e.g.* ``oai:host:id``

    Returns:
        str:
            URL following the pattern ``http://host/record/id``, or an
            empty string when the identifier is missing or malformed.

    """
    found = REG_OAI.match(value)
    if not found:
        return ""

    host = found.group(1)
    rec_id = found.group(2)
    return OAI_URL % (host, rec_id)
def _process_authors(self):
"""Convert authors information into DataFrame:
......@@ -278,6 +344,74 @@ class RecordCdsPubli(RecordCds, AuthorsMixin, PublicationInfoMixin):
li = self._get("corporate_name", "collaboration", force_list=True)
return CLEAN_COLLABORATION(", ".join(li))
def debug(self):
    """Pretty-print the content of the record on the standard output.

    """
    printer = pprint.PrettyPrinter()
    printer.pprint(self)
def host(self):
    """Name of the store in which the record lives.

    The host is extracted from the primary OAI identifier and kept in a
    private cache once it has been resolved.

    Returns:
        str:
            ``inspirehep.net`` or ``cds.cern.ch`` or an empty string
            when not defined.

    """
    cached = self.__host
    if cached is not None:
        return cached

    identifier = self.primary_oai()
    if not identifier:
        # nothing to resolve: leave the cache unset
        self.__host = None
        return ""

    found = REG_OAI.match(identifier)
    if not found:
        return ""

    self.__host = found.group(1)
    return self.__host
def id(self):
    """Identifier of the record in its store.

    Returns:
        int:
            the content of the ``recid`` field.

    """
    record_id = self["recid"]
    return record_id
def oai(self):
    """Primary and secondary OAI identifiers as one string.

    Returns:
        str:
            the identifiers, each following the pattern ``oai:host:id``,
            separated by a comma. A missing identifier is left out and
            the string is empty when none is defined.

    """
    joined = ", ".join((self.primary_oai(), self.secondary_oai()))

    # drop the separator left dangling by a missing identifier
    return joined.strip(", ")
def oai_url(self):
    """Primary and secondary OAI URLs as one string.

    Returns:
        str:
            the URLs, each following the pattern ``http://host/record/id``,
            separated by a comma. A URL is left out when it is not defined
            or when its OAI is not well formed, and the string is empty
            when none is available.

    """
    joined = ", ".join((self.primary_oai_url(), self.secondary_oai_url()))

    # drop the separator left dangling by a missing URL
    return joined.strip(", ")
def paper_url(self):
"""The URL of the preprint.
......@@ -322,6 +456,110 @@ class RecordCdsPubli(RecordCds, AuthorsMixin, PublicationInfoMixin):
return ""
def primary_oai(self):
    """OAI identifier matching the identifier of the record.

    Returns:
        str:
            identifier following the pattern ``oai:host:id`` or an empty
            string when not defined.

    """
    # the OAI is stored under a different field / subfield in each store
    for field, subfield in (("oai", "value"), ("FIXME_OAI", "id")):
        if field in self:
            break
    else:
        return ""

    found = self._get(field, subfield)

    # standard case: a single identifier
    if not isinstance(found, list):
        return found

    # several identifiers (e.g. cds1513204):
    # keep the one ending with the record identifier
    suffix = str(self.id())
    return next((item for item in found if item.endswith(suffix)), "")
def primary_oai_url(self):
    """The Open Archive Initiative URL for the primary OAI.

    Note:
        A record can be deleted and replaced by a new one.
        In that case the OAI is not changed but the record has
        a new *id* and new *URL* which is returned by this method.

    Returns:
        str:
            the pattern of the string is ``http://host/record/id``.
            The string is empty when it is not defined or when the OAI
            is not well formed.

    """
    oai = self.primary_oai()
    rec_id = str(self.id())

    if oai.endswith(rec_id):
        return self._oai_url(oai)

    # the OAI refers to another id: rebuild the URL from the host.
    # Without a host the URL would be the malformed ``http:///record/id``
    host = self.host()
    if not host:
        return ""

    return OAI_URL % (host, rec_id)