Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
limbra
limbra
Commits
ec43e308
Commit
ec43e308
authored
Jan 13, 2021
by
LE GAC Renaud
Browse files
Merge RecordCds into RecordCdsPubli
parent
5eff26ac
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
245 additions
and
365 deletions
+245
-365
modules/harvest_tools/checkandfix.py
modules/harvest_tools/checkandfix.py
+1
-2
modules/store_tools/__init__.py
modules/store_tools/__init__.py
+0
-1
modules/store_tools/recordcds.py
modules/store_tools/recordcds.py
+0
-356
modules/store_tools/recordcdspubli.py
modules/store_tools/recordcdspubli.py
+242
-4
tests/basis/test_03_Record.py
tests/basis/test_03_Record.py
+2
-2
No files found.
modules/harvest_tools/checkandfix.py
View file @
ec43e308
...
...
@@ -12,7 +12,6 @@ from gluon import current
from
store_tools
import
(
MSG_NO_CONF
,
MSG_NO_THESIS
,
OAI_URL
,
RecordCds
,
RecordCdsConf
,
RecordCdsPubli
,
RecordCdsThesis
,
...
...
@@ -986,7 +985,7 @@ class CheckAndFix(object):
date
=
'%s-%02i-%02i'
%
data
# update
if
isinstance
(
record
,
RecordCds
):
if
isinstance
(
record
,
RecordCds
Publi
):
# in some case we have to deal with a list (see cds 2234042)
# in some case it is not defined (e.g. phd thesis)
if
"prepublication"
in
record
:
...
...
modules/store_tools/__init__.py
View file @
ec43e308
...
...
@@ -29,7 +29,6 @@ from .factory import build_record, build_store
from
.inspirehepstore
import
InspirehepStore
from
.publicationinfomixin
import
PublicationInfoMixin
from
.cdsstore
import
CdsStore
from
.recordcds
import
RecordCds
from
.recordcdsconf
import
RecordCdsConf
from
.recordcdspubli
import
RecordCdsPubli
from
.recordcdsthesis
import
RecordCdsThesis
...
...
modules/store_tools/recordcds.py
deleted
100644 → 0
View file @
5eff26ac
""" store_tools.recordcds
"""
import
pprint
from
.base
import
OAI
,
OAI_URL
,
REG_OAI
class RecordCds(dict):
    """Base class for JSON record coming from cds.cern.ch or old.inspirehep.net.

    It is a dictionary with the following structure::

        record[field][subfield] = value
        record[field][subfield] = [val1, val2, ....]
        record[field] = [dict1(subfield1=..., subfield2=...),
                         dict2(subfield1=..., subfield2=...), ...]

    For an article, typical fields are (cds 1951625, ins 1319638, *etc.*):

        +-----------------------------+-----------------------------+
        | field (cds)                 | field (inspirehep)          |
        +=============================+=============================+
        | abstract                    | abstract                    |
        +-----------------------------+-----------------------------+
        | accelerator_experiment      | accelerator_experiment      |
        +-----------------------------+-----------------------------+
        | agency_code                 |                             |
        +-----------------------------+-----------------------------+
        | authors                     | authors                     |
        +-----------------------------+-----------------------------+
        | base                        |                             |
        +-----------------------------+-----------------------------+
        | collection                  | collection                  |
        +-----------------------------+-----------------------------+
        | comment                     | comment                     |
        +-----------------------------+-----------------------------+
        | copyright_status            |                             |
        +-----------------------------+-----------------------------+
        | corporate_name              | corporate_name              |
        +-----------------------------+-----------------------------+
        | creation_date               | creation_date               |
        +-----------------------------+-----------------------------+
        | doi                         | doi                         |
        +-----------------------------+-----------------------------+
        | email_message               |                             |
        +-----------------------------+-----------------------------+
        | filenames                   | filenames                   |
        +-----------------------------+-----------------------------+
        | files                       | files                       |
        +-----------------------------+-----------------------------+
        | filetypes                   | filetypes                   |
        +-----------------------------+-----------------------------+
        | imprint                     | imprint                     |
        +-----------------------------+-----------------------------+
        | keywords                    | keywords                    |
        +-----------------------------+-----------------------------+
        | language                    |                             |
        +-----------------------------+-----------------------------+
        | license                     | license                     |
        +-----------------------------+-----------------------------+
        | number_of_authors           | number_of_authors           |
        +-----------------------------+-----------------------------+
        | number_of_citations         | number_of_citations         |
        +-----------------------------+-----------------------------+
        | number_of_comments          | number_of_comments          |
        +-----------------------------+-----------------------------+
        | number_of_reviews           | number_of_reviews           |
        +-----------------------------+-----------------------------+
        | oai                         | FIXME_OAI                   |
        +-----------------------------+-----------------------------+
        | other_report_number         |                             |
        +-----------------------------+-----------------------------+
        | persistent_identifiers_keys | persistent_identifiers_keys |
        +-----------------------------+-----------------------------+
        | physical_description        | physical_description        |
        +-----------------------------+-----------------------------+
        | prepublication              | prepublication              |
        +-----------------------------+-----------------------------+
        | primary_report_number       | primary_report_number       |
        +-----------------------------+-----------------------------+
        | publication_info            | publication_info            |
        +-----------------------------+-----------------------------+
        | recid                       | recid                       |
        +-----------------------------+-----------------------------+
        |                             | reference                   |
        +-----------------------------+-----------------------------+
        | report_number               |                             |
        +-----------------------------+-----------------------------+
        |                             | source_of_acquisition       |
        +-----------------------------+-----------------------------+
        | status_week                 |                             |
        +-----------------------------+-----------------------------+
        | subject                     | subject                     |
        +-----------------------------+-----------------------------+
        | system_control_number       | system_control_number       |
        +-----------------------------+-----------------------------+
        | thesaurus_terms             | thesaurus_terms             |
        +-----------------------------+-----------------------------+
        | title                       | title                       |
        +-----------------------------+-----------------------------+
        |                             | title_additional            |
        +-----------------------------+-----------------------------+
        | url                         |                             |
        +-----------------------------+-----------------------------+
        | version_id                  | version_id                  |
        +-----------------------------+-----------------------------+

    The class comes with a collection of methods to extract the record
    information masking the ``field`` and the ``subfield`` codification.

    """

    def __init__(self, *args):
        """Build the record from the arguments accepted by ``dict``.

        Args:
            *args:
                positional arguments forwarded to the ``dict`` constructor.

        """
        dict.__init__(self, *args)

        # private cache for the host name, filled by host() on first success
        self.__host = None

        # meta data
        # the authors of my institutes signing the record
        # string containing a list of names separated by a comma
        self.my_authors = ""

    def _get(self, field, subfield, force_list=False):
        """Get the value associated to the ``field`` and ``subfield``.

        Args:
            field (str):
                name of the field, *e.g.* ``authors``

            subfield (str):
                name of the subfield, *e.g.* ``full_name``

            force_list (bool):
                always return a *list* when ``True``.

        Returns:
            str:
                value or an empty string when not defined.
            list:
                list of values or an empty list when not defined

        """
        val = ""
        data = self.get(field)

        if isinstance(data, dict):
            if subfield in data:
                val = data[subfield]

        elif isinstance(data, list):
            val = []
            for el in data:
                # skip elements which are not dictionaries: a membership
                # test on a string is a substring test, not a key lookup
                if not (isinstance(el, dict) and subfield in el):
                    continue

                item = el[subfield]
                if isinstance(item, list):
                    val.extend(item)
                else:
                    val.append(item)

        # any other type (int, str, None, ...) has no subfield: val stays ""

        if force_list and not isinstance(val, list):
            # scalars without a length (e.g. integers) are never "empty"
            is_empty = val is None or (
                hasattr(val, "__len__") and len(val) == 0)
            val = [] if is_empty else [val]

        return val

    @staticmethod
    def _oai_url(value):
        """Build the Open Archive Initiative URL.

        Args:
            value (str):
                OAI identifier, *e.g.* ``oai:host:id``

        Returns:
            str:
                the pattern of the string is ``http://host/record/id``.
                The string is empty when it is not defined or when the value
                is not well formed.

        """
        if not value:
            return ""

        match = REG_OAI.match(value)
        if not match:
            return ""

        return OAI_URL % (match.group(1), match.group(2))

    def debug(self):
        """Print the record structure on the standard output.

        """
        pprint.pprint(self)

    def host(self):
        """The store housing the record.

        Returns:
            str:
                ``inspirehep.net`` or ``cds.cern.ch`` or an empty string
                when not defined.

        """
        # The value is computed once and cached in self.__host
        if self.__host is not None:
            return self.__host

        oai = self.primary_oai()
        if not oai:
            return ""

        match = REG_OAI.match(oai)
        if not match:
            return ""

        self.__host = match.group(1)
        return self.__host

    def id(self):
        """The id of the record in the store.

        Returns:
            the value stored under the ``recid`` key.

        Raises:
            KeyError:
                when the record has no ``recid`` key.

        """
        return self["recid"]

    def oai(self):
        """The Open Archive Initiative identifier(s).

        Returns:
            str:
                the primary and secondary OAI identifier are separated
                by a comma. The pattern of the identifier is ``oai:host:id`` or
                an empty string when it is not defined.

        """
        lst = [self.primary_oai(), self.secondary_oai()]
        # strip removes the dangling separator when an identifier is missing
        return ", ".join(lst).strip(", ")

    def oai_url(self):
        """The Open Archive Initiative identifier URL(s).

        Returns:
            str:
                the primary and secondary URLs are separated by a comma.
                The pattern of the URL is ``http://host/record/id`` or
                an empty string when it is not defined or when the OAI is
                not well formed.

        """
        lst = [self.primary_oai_url(), self.secondary_oai_url()]
        # strip removes the dangling separator when a URL is missing
        return ", ".join(lst).strip(", ")

    def primary_oai(self):
        """The primary Open Archive Initiative identifier.

        The primary OAI identifier matches the record identifier.

        Returns:
            str:
                the pattern of the string is ``oai:host:id``.
                It is an empty string when not defined

        """
        # the location of the OAI information depends on the store
        if "oai" in self:
            field, subfield = "oai", "value"

        elif "FIXME_OAI" in self:
            field, subfield = "FIXME_OAI", "id"

        else:
            return ""

        # standard case
        value = self._get(field, subfield)
        if not isinstance(value, list):
            return value

        # in some case OAI is a list (e.g. cds1513204)
        # select the OAI corresponding to the record identifier.
        # The separator is part of the test otherwise the id 204 would
        # match the identifier oai:host:1513204
        suffix = ":%s" % self.id()
        for el in value:
            if isinstance(el, str) and el.endswith(suffix):
                return el

        return ""

    def primary_oai_url(self):
        """The Open Archive Initiative URL for the primary OAI.

        Note:
            A record can be deleted and replaced by a new one.
            In that case the OAI is not changed but the record has
            a new *id* and new *URL* which is returned by this method.

        Returns:
            str:
                the pattern of the string is ``http://host/record/id``.
                The string is empty when it is not defined or when the OAI
                is not well formed.

        """
        oai = self.primary_oai()
        rec_id = str(self.id())

        # the OAI still designates this record
        if oai.endswith(":" + rec_id):
            return self._oai_url(oai)

        # the record has been replaced: build the URL from the current id.
        # Without a host the URL would be malformed (http:///record/id)
        host = self.host()
        if not host:
            return ""

        return OAI_URL % (host, rec_id)

    def secondary_oai(self):
        """The secondary OAI identifier.

        If the current store is *cds.cern.ch*, the secondary OAI identifier
        corresponds to the record in the other store, *inspirehep.net*.

        Returns:
            str:
                the pattern of the string is ``oai:host:id``.
                It is an empty string when not defined

        """
        if "system_control_number" not in self:
            return ""

        data = self["system_control_number"]
        if not isinstance(data, list):
            data = [data]

        # institute code found in the record -> host name of the store
        hosts = {"CDS": "cds.cern.ch", "Inspire": "inspirehep.net"}

        # data is a list of dictionary
        # keys are `institute`, `value` or `cancelled`
        for di in data:
            institute = di.get("institute")
            host = hosts.get(institute) if isinstance(institute, str) else None
            if host and "value" in di:
                return OAI % (host, di["value"])

        return ""

    def secondary_oai_url(self):
        """The Open Archive Initiative URL for the secondary OAI.

        Returns:
            str:
                the pattern of the string is ``http://host/record/id``.
                The string is empty when it is not defined or when the OAI
                is not well formed.

        """
        return self._oai_url(self.secondary_oai())
modules/store_tools/recordcdspubli.py
View file @
ec43e308
...
...
@@ -2,21 +2,21 @@
"""
import
logging
import
pprint
from
.authorsmixin
import
AuthorsMixin
from
.base
import
ARXIV
from
.base
import
ARXIV
,
OAI
,
OAI_URL
,
REG_OAI
from
filters
import
CLEAN_COLLABORATION
from
pandas
import
concat
,
DataFrame
from
plugin_dbui
import
CLEAN_SPACES
from
.publicationinfomixin
import
PublicationInfoMixin
from
store_tools.recordcds
import
RecordCds
def to_str(x):
    """Collapse a list into a single string using ``|`` as separator.

    Args:
        x:
            a list of strings or any other object.

    Returns:
        the joined string when ``x`` is a list, otherwise ``x`` unchanged.

    """
    if not isinstance(x, list):
        return x

    return "|".join(x)
class
RecordCdsPubli
(
RecordCds
,
AuthorsMixin
,
PublicationInfoMixin
):
class
RecordCdsPubli
(
dict
,
AuthorsMixin
,
PublicationInfoMixin
):
"""Article, preprint, proceeding, report and talk from cds.cern.ch or
old.inspirehep.net.
...
...
@@ -123,13 +123,79 @@ class RecordCdsPubli(RecordCds, AuthorsMixin, PublicationInfoMixin):
def __init__(self, *args):
    """Build the record and pre-process its authors and publication data.

    Args:
        *args:
            positional arguments forwarded to the parent constructor.

    """
    # private cache for the host name, filled by host()
    self.__host = None

    self._last_fmt_author = "Last, First"
    self.logger = logging.getLogger("web2py.app.limbra")

    # RecordCds is merged into this class: its constructor is not called
    # any more, the attributes it used to define are set explicitly here
    super().__init__(*args)

    self._process_authors()
    self._process_publication_info()

    # the authors of my institutes signing the record
    # string containing a list of names separated by a comma
    self.my_authors = ""
def _get(self, field, subfield, force_list=False):
    """Get the value associated to the ``field`` and ``subfield``.

    Args:
        field (str):
            name of the field, *e.g.* ``authors``

        subfield (str):
            name of the subfield, *e.g.* ``full_name``

        force_list (bool):
            always return a *list* when ``True``.

    Returns:
        str:
            value or an empty string when not defined.
        list:
            list of values or an empty list when not defined

    """
    val = ""
    data = self.get(field)

    if isinstance(data, dict):
        if subfield in data:
            val = data[subfield]

    elif isinstance(data, list):
        val = []
        for el in data:
            # skip elements which are not dictionaries: a membership
            # test on a string is a substring test, not a key lookup
            if not (isinstance(el, dict) and subfield in el):
                continue

            item = el[subfield]
            if isinstance(item, list):
                val.extend(item)
            else:
                val.append(item)

    # any other type (int, str, None, ...) has no subfield: val stays ""

    if force_list and not isinstance(val, list):
        # scalars without a length (e.g. integers) are never "empty"
        is_empty = val is None or (
            hasattr(val, "__len__") and len(val) == 0)
        val = [] if is_empty else [val]

    return val
@staticmethod
def _oai_url(value):
    """Translate an OAI identifier into the URL of the record.

    Args:
        value (str):
            OAI identifier, *e.g.* ``oai:host:id``

    Returns:
        str:
            ``OAI_URL`` filled with the first and second groups captured by
            ``REG_OAI``, documented pattern ``http://host/record/id``.
            An empty string when the value does not match ``REG_OAI``.

    """
    found = REG_OAI.match(value)
    if not found:
        return ""

    host = found.group(1)
    rec_id = found.group(2)
    return OAI_URL % (host, rec_id)
def
_process_authors
(
self
):
"""Convert authors information into DataFrame:
...
...
@@ -278,6 +344,74 @@ class RecordCdsPubli(RecordCds, AuthorsMixin, PublicationInfoMixin):
li
=
self
.
_get
(
"corporate_name"
,
"collaboration"
,
force_list
=
True
)
return
CLEAN_COLLABORATION
(
", "
.
join
(
li
))
def debug(self):
    """Pretty-print the content of the record on the standard output.

    """
    printer = pprint.pprint
    printer(self)
def host(self):
    """Name of the store housing the record.

    The name is the first group captured by ``REG_OAI`` in the primary
    OAI identifier. It is kept in ``self.__host`` once found.

    Returns:
        str:
            ``inspirehep.net`` or ``cds.cern.ch`` or an empty string
            when not defined.

    """
    cached = self.__host
    if cached is not None:
        return cached

    identifier = self.primary_oai()
    if not identifier:
        # nothing to remember, the next call tries again
        self.__host = None
        return ""

    found = REG_OAI.match(identifier)
    if not found:
        return ""

    self.__host = found.group(1)
    return self.__host
def id(self):
    """Identifier of the record in the store.

    Returns:
        the value stored under the ``recid`` key.

    Raises:
        KeyError:
            when the record has no ``recid`` key.

    """
    recid = self["recid"]
    return recid
def oai(self):
    """Primary and secondary Open Archive Initiative identifiers.

    Returns:
        str:
            both identifiers separated by a comma, a single identifier
            when the other one is empty, or an empty string when none is
            defined. The pattern of an identifier is ``oai:host:id``.

    """
    primary = self.primary_oai()
    secondary = self.secondary_oai()

    joined = ", ".join((primary, secondary))

    # remove the dangling separator left by an empty identifier
    return joined.strip(", ")
def oai_url(self):
    """URLs of the primary and secondary OAI identifiers.

    Returns:
        str:
            both URLs separated by a comma, a single URL when the other
            one is empty, or an empty string when none is defined.
            The documented pattern of a URL is ``http://host/record/id``.

    """
    primary = self.primary_oai_url()
    secondary = self.secondary_oai_url()

    joined = ", ".join((primary, secondary))

    # remove the dangling separator left by an empty URL
    return joined.strip(", ")
def
paper_url
(
self
):
"""The URL of the preprint.
...
...
@@ -322,6 +456,110 @@ class RecordCdsPubli(RecordCds, AuthorsMixin, PublicationInfoMixin):
return
""
def primary_oai(self):
    """The primary Open Archive Initiative identifier.

    The primary OAI identifier matches the record identifier.

    Returns:
        str:
            the pattern of the string is ``oai:host:id``.
            It is an empty string when not defined

    """
    # the location of the OAI information depends on the store
    if "oai" in self:
        field, subfield = "oai", "value"

    elif "FIXME_OAI" in self:
        field, subfield = "FIXME_OAI", "id"

    else:
        return ""

    # standard case
    value = self._get(field, subfield)
    if not isinstance(value, list):
        return value

    # in some case OAI is a list (e.g. cds1513204)
    # select the OAI corresponding to the record identifier.
    # The separator is part of the test otherwise the id 204 would
    # match the identifier oai:host:1513204
    suffix = ":%s" % self.id()
    for el in value:
        if isinstance(el, str) and el.endswith(suffix):
            return el

    return ""
def primary_oai_url(self):
    """The Open Archive Initiative URL for the primary OAI.

    Note:
        A record can be deleted and replaced by a new one.
        In that case the OAI is not changed but the record has
        a new *id* and new *URL* which is returned by this method.

    Returns:
        str:
            the pattern of the string is ``http://host/record/id``.
            The string is empty when it is not defined or when the OAI
            is not well formed.

    """
    oai = self.primary_oai()
    rec_id = str(self.id())

    # the OAI still designates this record.
    # The separator is part of the test otherwise the id 204 would
    # match the identifier oai:host:1513204
    if oai.endswith(":" + rec_id):
        return self._oai_url(oai)

    # the record has been replaced: build the URL from the current id.
    # Without a host the URL would be malformed (http:///record/id)
    host = self.host()
    if not host:
        return ""

    return OAI_URL % (host, rec_id)