Skip to content
Snippets Groups Projects
Commit 5451fc58 authored by Enrique Garcia's avatar Enrique Garcia
Browse files

Merge branch 'codemeta_zip' into 'master'

Get codemeta from zip archive in records

See merge request !55
parents 242a8638 809773e1
No related branches found
No related tags found
1 merge request!55Get codemeta from zip archive in records
Pipeline #139590 passed
......@@ -4,15 +4,17 @@ import sys
import json
import pprint
import requests
from os.path import abspath
from pathlib import Path
from urllib.parse import urlencode
from urllib.request import urlopen
import warnings
from bs4 import BeautifulSoup
from zipfile import ZipFile
from ...metadata.codemeta2zenodo import parse_codemeta_and_write_zenodo_metadata_file, converter
from . import http_status
from bs4 import BeautifulSoup
from ...utils import get_codemeta_from_zipurl
__all__ = [
'ZenodoAPI',
......@@ -21,6 +23,7 @@ __all__ = [
'http_status',
]
zenodo_api_url = "https://zenodo.org/api"
zenodo_sandbox_api_url = "https://sandbox.zenodo.org/api"
......@@ -524,6 +527,10 @@ class Record:
def title(self):
    """Return the `title` entry of the record metadata."""
    metadata = self.data['metadata']
    return metadata['title']
@property
def filelist(self):
    """Return the `self` link (URL) of every file listed in the record data."""
    urls = []
    for entry in self.data['files']:
        urls.append(entry['links']['self'])
    return urls
@property
def last_version_id(self):
    """Return the `pid_value` of the `last_child` of the first version relation."""
    version_relations = self.data['metadata']['relations']['version']
    latest = version_relations[0]['last_child']
    return latest['pid_value']
......@@ -536,6 +543,7 @@ class Record:
url = Path(self.data['links']['self']).parent.joinpath(str(record_id))
return Record(requests.get(url).json())
def print_info(self):
metadata = self.data['metadata']
descrp = BeautifulSoup(metadata['description'], features='html.parser').get_text()
......@@ -554,11 +562,21 @@ class Record:
def get_codemeta(self):
    """
    Return the CodeMeta metadata of the record.

    A `codemeta.json` file attached directly to the record takes precedence.
    Otherwise, every zip archive of the record is searched in turn and the
    first `codemeta.json` found is used.

    :return: dict
        parsed content of the `codemeta.json` file
    :raises FileNotFoundError:
        if the record has no files, or if no `codemeta.json` could be found
        either directly in the record or in any of its zip archives
    """
    if 'files' not in self.data:
        raise FileNotFoundError(f'The record {self.id} does not contain any file')

    codemeta_paths = [s for s in self.filelist if Path(s).name == 'codemeta.json']
    if codemeta_paths:
        # note: there can't be more than one file named `codemeta.json` in a record
        with urlopen(codemeta_paths[0]) as response:
            return json.loads(response.read())

    for zipurl in (s for s in self.filelist if s.endswith('.zip')):
        try:
            return get_codemeta_from_zipurl(zipurl)
        except FileNotFoundError:
            # this archive holds no codemeta.json: try the next one
            continue

    # reached when there is neither a direct file nor an archive containing one;
    # raising here avoids silently returning None
    raise FileNotFoundError(f"No `codemeta.json` file found in record {self.id}")
@property
def doi(self):
......
......@@ -147,9 +147,17 @@ def test_get_record_sandbox():
record = get_record(520735, sandbox=True)
assert record.data['doi'] == '10.5072/zenodo.520735'
def test_codemeta_in_zip():
    """The codemeta of sandbox record 927064 is reachable through `get_codemeta`."""
    metadata = get_record(927064, sandbox=True).get_codemeta()
    assert metadata['name'] == 'eossr-testlib'
def test_write_record_zenodo(test_get_record_4923992, tmpdir):
    """Write the record as a `.zenodo.json` file and check the stored concept DOI."""
    target = tmpdir/'.zenodo.json'
    test_get_record_4923992.write_zenodo(filename=target)
    with open(target) as stream:
        written = json.load(stream)
    assert written['conceptdoi'] == '10.5281/zenodo.3572654'
from eossr.utils import get_codemeta_from_zipurl, ZipUrl
from pathlib import Path
_testurl = 'https://zenodo.org/record/5524913/files/eossr-v0.2.zip'
def test_ZipUrl(tmp_path, monkeypatch):
    """Check file lookup and single-file extraction on the reference archive."""
    # `extract_file` writes relative to the current working directory: run from a
    # throw-away directory so that a stale file left by a previous run cannot make
    # the `exists()` assertion pass, and so that the test leaves nothing behind
    monkeypatch.chdir(tmp_path)

    zipurl = ZipUrl(_testurl)
    codemeta_paths = zipurl.find_files('codemeta.json')
    assert 'eossr-v0.2/codemeta.json' in codemeta_paths
    assert 'eossr-v0.2/eossr/metadata/schema/codemeta.json' in codemeta_paths

    extracted = zipurl.extract_file('eossr-v0.2/codemeta.json')
    assert Path(extracted).exists()
    assert Path('eossr-v0.2/codemeta.json').exists()
def test_get_codemeta_from_zipurl():
    """The root `codemeta.json` of the reference archive describes `eossr`."""
    metadata = get_codemeta_from_zipurl(_testurl)
    software_name = metadata['name']
    assert software_name == 'eossr'
import json
import requests
from io import BytesIO
from zipfile import ZipFile
from pathlib import Path
__all__ = [
'ZipUrl',
'get_codemeta_from_zipurl',
]
class ZipUrl:
    """
    Handle on a zip archive fetched from a url.

    The archive is requested once, when the object is created, and the
    response is kept in `self.content`; listing and extracting files read
    from that stored response.
    """

    def __init__(self, url):
        """
        :param url: string
            url of the zip archive
        :raises requests.HTTPError:
            if the server answers with an error status
        """
        self.url = url
        self.content = requests.get(self.url)
        # fail early on an HTTP error instead of later with an obscure BadZipFile
        self.content.raise_for_status()

    def _open(self):
        """Open the downloaded bytes as a `ZipFile` (to be used as a context manager)."""
        return ZipFile(BytesIO(self.content.content))

    @property
    def filelist(self):
        """
        :return: list[str]
            names of all the members of the archive
        """
        with self._open() as zipobj:
            return zipobj.namelist()

    def find_files(self, filename):
        """
        return the path of files in the archive matching `filename`

        :param filename: string
        :return: list[str]
        :raises FileNotFoundError: if no member of the archive is named `filename`
        """
        matching_files = [f for f in self.filelist if Path(f).name == filename]
        if not matching_files:
            raise FileNotFoundError(f"No file named {filename} in {self.url}")
        return matching_files

    def extract_file(self, filepath, path=None):
        """
        Extract a single member of the archive to disk.

        :param filepath: string
            path of the member inside the archive
        :param path: string or path-like, optional
            directory to extract into; defaults to the current working directory
        :return: string
            path of the extracted file
        """
        with self._open() as zipobj:
            return zipobj.extract(filepath, path=path)
def get_codemeta_from_zipurl(url):
    """
    Read the codemeta metadata from a zip url.
    A codemeta.json file must be present in the zip archive.

    :param url: string
        url to a zip file
    :return: dictionary
        metadata in the codemeta.json file in the zip archive
    :raises FileNotFoundError:
        if the archive does not contain any file named `codemeta.json`
    """
    zipurl = ZipUrl(url)
    codemeta_paths = zipurl.find_files('codemeta.json')
    # if there is more than one codemeta file in the archive, we consider the one
    # closest to the root directory: fewest path components first, then the
    # shortest path as a tie-breaker
    codemeta_path = min(codemeta_paths, key=lambda p: (len(Path(p).parts), len(p)))
    # read the member straight from the downloaded bytes: extracting it to disk
    # would leave files in the current working directory and decode them with
    # the locale encoding
    with ZipFile(BytesIO(zipurl.content.content)) as zipobj:
        return json.loads(zipobj.read(codemeta_path))
%% Cell type:markdown id:d2bcb61a tags:
# How to use ZipUrl
%% Cell type:markdown id:00eeeef5 tags:
ZipUrl can be used to access a zip archive stored online.
One can peek into the archive and extract a single file without downloading or extracting the whole archive.
%% Cell type:code id:658cb424 tags:
``` python
from eossr.utils import ZipUrl
```
%% Cell type:code id:4ad996cd tags:
``` python
zipurl = ZipUrl('https://zenodo.org/record/5524913/files/eossr-v0.2.zip')
```
%% Cell type:code id:0dd38cf3 tags:
``` python
zipurl.filelist
```
%% Cell type:code id:503e292a tags:
``` python
zipurl.extract_file('eossr-v0.2/README.md')
```
%% Cell type:code id:680dea59 tags:
``` python
zipurl.find_files('codemeta.json')
```
%% Cell type:code id:13c33da1 tags:
``` python
```
%% Cell type:markdown id:ba648264 tags:
## How it is used in eossr
%% Cell type:markdown id:b8906723 tags:
A repository is zipped before being uploaded to Zenodo (by the GitHub hook as well as by the eossr GitLab CI).
ZipUrl is used in eossr to read the metadata from the `codemeta.json` file contained in that zip archive.
%% Cell type:code id:51fd2eb4 tags:
``` python
from eossr.api import Record
```
%% Cell type:code id:4201cd65 tags:
``` python
record = Record.from_id(933320, sandbox=True)
```
%% Cell type:code id:c6eb8d25 tags:
``` python
record.print_info()
```
%% Cell type:markdown id:dad7fd90 tags:
This test record has been generated using the official [Zenodo-GitHub hook](https://guides.github.com/activities/citable-code/). It contains a single zip file with the entire content of the repository:
%% Cell type:code id:2147c3cf tags:
``` python
record.filelist
```
%% Cell type:markdown id:de033fbd tags:
The codemeta metadata can still be retrieved with a single command:
%% Cell type:code id:8762a46c tags:
``` python
record.get_codemeta()
```
%% Cell type:code id:c9f2e80e tags:
``` python
```
%% Cell type:markdown id:4a25feec tags:
# Find ESCAPE OSSR records
**Please note**: to fetch information from the OSSR using the EOSSR API, **NO Zenodo token is needed**.
The EOSSR API will get the public information that is available in the repository.
%% Cell type:markdown id:5e008b43 tags:
## Getting all the records
%% Cell type:code id:dbde9b19 tags:
``` python
from eossr.api import get_ossr_records
```
%% Cell type:code id:2fe017bc tags:
``` python
ossr_records = get_ossr_records()
```
%% Cell type:code id:5eb34293 tags:
``` python
len(ossr_records)
```
%% Cell type:markdown id:93b033a0 tags:
Records are objects containing the data and metadata sent by the Zenodo API.
%% Cell type:code id:f88ebfd3 tags:
``` python
record = ossr_records[0]
```
%% Cell type:code id:988b95c7 tags:
``` python
print(record)
```
%% Cell type:code id:6bbe7db1 tags:
``` python
record
```
%% Cell type:code id:3aad2e35 tags:
``` python
```
%% Cell type:markdown id:cb5f905e tags:
You can use `print_info` to display minimal information about a `Record`:
%% Cell type:code id:a9ec3d38 tags:
``` python
record.print_info()
```
%% Cell type:code id:28ad4039 tags:
``` python
```
%% Cell type:markdown id:21384a7d tags:
## Specific search
%% Cell type:markdown id:7f1a9209 tags:
### Using strings
%% Cell type:code id:3a3dc135 tags:
``` python
escape_records = get_ossr_records('escape')
```
%% Cell type:code id:0aa0deec tags:
``` python
for r in escape_records:
print(r)
```
%% Cell type:markdown id:c4b08fde tags:
### Using keywords
%% Cell type:code id:f5cb8f77 tags:
``` python
cta_records = get_ossr_records(keywords='CTA')
len(cta_records)
```
%% Cell type:code id:019f21eb tags:
``` python
for record in cta_records:
print(record)
```
%% Cell type:code id:5f304419 tags:
``` python
```
%% Cell type:markdown id:da77c97b tags:
### Directly from its id
If you happen to know exactly which record you are looking for:
%% Cell type:code id:eb048c8f tags:
``` python
from eossr.api.zenodo import get_record
```
%% Cell type:code id:8a162ccb tags:
``` python
record = get_record(4923992)
```
%% Cell type:code id:81b76847 tags:
``` python
print(record)
```
%% Cell type:code id:25ec9b5a tags:
``` python
record.data.keys()
```
%% Cell type:code id:6f4de66b tags:
``` python
```
%% Cell type:markdown id:a85d8056 tags:
## `Record` methods
The `Record` class provides other useful methods.
%% Cell type:markdown id:9083a824 tags:
## Getting CodeMeta metadata
If a `codemeta.json` file has been added to the record, one can retrieve it directly:
If a `codemeta.json` file has been added to the record, one can retrieve it directly.
The method also looks into any zip archives that are part of the record.
%% Cell type:code id:1431eb41 tags:
``` python
record.get_codemeta()
```
%% Cell type:code id:7a1d0f77 tags:
``` python
```
%% Cell type:markdown id:64c43dd6 tags:
## MyBinder integration
You can get a mybinder URL directly from a record:
%% Cell type:code id:7cf9bcdb tags:
``` python
record.get_mybinder_url()
```
%% Cell type:code id:622e44d9 tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment