Merge branch 'codemeta_zip' into 'master'

Get codemeta from zip archive in records See merge request !55

Merge branch 'codemeta_zip' into 'master'
5451fc58 · Enrique Garcia · 242a8638 · 809773e1 · 5451fc58 · 5451fc58
Commit 5451fc58 authored 3 years ago by Enrique Garcia
--- a/eossr/api/zenodo/__init__.py
+++ b/eossr/api/zenodo/__init__.py
@@ -4,15 +4,17 @@ import sys
 import json
 import pprint
 import requests
-from os.path import abspath
 from pathlib import Path
 from urllib.parse import urlencode
 from urllib.request import urlopen
 import warnings
+from bs4 import BeautifulSoup
 from zipfile import ZipFile
 from ...metadata.codemeta2zenodo import parse_codemeta_and_write_zenodo_metadata_file, converter
 from . import http_status
-from bs4 import BeautifulSoup
+from ...utils import get_codemeta_from_zipurl
+
+

 __all__ = [
    'ZenodoAPI',
@@ -21,6 +23,7 @@ __all__ = [
    'http_status',
 ]

+
 zenodo_api_url = "https://zenodo.org/api"
 zenodo_sandbox_api_url = "https://sandbox.zenodo.org/api"

@@ -524,6 +527,10 @@ class Record:
    def title(self):
        return self.data['metadata']['title']

+    @property
+    def filelist(self):
+        """
+        List the `self` link of every file entry found in the record data.
+
+        :return: list[str]
+        """
+        return [f['links']['self'] for f in self.data['files']]
+
    @property
    def last_version_id(self):
        return self.data['metadata']['relations']['version'][0]['last_child']['pid_value']
@@ -536,6 +543,7 @@ class Record:
            url = Path(self.data['links']['self']).parent.joinpath(str(record_id))
            return Record(requests.get(url).json())

+
    def print_info(self):
        metadata = self.data['metadata']
        descrp = BeautifulSoup(metadata['description'], features='html.parser').get_text()
@@ -554,11 +562,21 @@ class Record:
    def get_codemeta(self):
+        """
+        Return the content of the `codemeta.json` file of the record as a dictionary.
+
+        A `codemeta.json` file present directly in the record is used first;
+        otherwise the zip archives of the record are searched one after the other.
+
+        :return: dict
+        :raises FileNotFoundError: if the record has no file or if no `codemeta.json` can be found
+        """
        if 'files' not in self.data:
            raise FileNotFoundError(f'The record {self.id} does not contain any file')
-        for file in self.data['files']:
-            if file['key'] == 'codemeta.json':
-                url = file['links']['self']
-                return json.loads(urlopen(url).read())
-        raise FileNotFoundError(f"No `codemeta.json` file found in record {self.id}")
+
+        codemeta_paths = [s for s in self.filelist if Path(s).name == 'codemeta.json']
+        ziparchives = [s for s in self.filelist if s.endswith('.zip')]
+        if codemeta_paths:
+            # note: there can't be more than one file named `codemeta.json` in a record
+            return json.loads(urlopen(codemeta_paths[0]).read())
+
+        for zipurl in ziparchives:
+            try:
+                return get_codemeta_from_zipurl(zipurl)
+            except FileNotFoundError:
+                # no `codemeta.json` in this archive: try the next one
+                continue
+
+        # reached when there is no zip archive, or when none of them contains a `codemeta.json`
+        # (the latter case used to return None silently)
+        raise FileNotFoundError(f"No `codemeta.json` file found in record {self.id}")

    @property
    def doi(self):

--- a/eossr/api/zenodo/tests/test_zenodo.py
+++ b/eossr/api/zenodo/tests/test_zenodo.py
@@ -147,9 +147,17 @@ def test_get_record_sandbox():
    record = get_record(520735, sandbox=True)
    assert record.data['doi'] == '10.5072/zenodo.520735'

+
+def test_codemeta_in_zip():
+    # presumably sandbox record 927064 only provides its codemeta.json inside a zip archive - TODO confirm
+    record = get_record(927064, sandbox=True)
+    codemeta = record.get_codemeta()
+    assert codemeta['name'] == 'eossr-testlib'
+
+
 def test_write_record_zenodo(test_get_record_4923992, tmpdir):
    record = test_get_record_4923992
    record.write_zenodo(filename=tmpdir/'.zenodo.json')
    with open(tmpdir/'.zenodo.json') as file:
        json_dict = json.load(file)
    assert json_dict['conceptdoi'] == '10.5281/zenodo.3572654'
+
--- a/eossr/tests/test_utils.py
+++ b/eossr/tests/test_utils.py
+from eossr.utils import get_codemeta_from_zipurl, ZipUrl
+from pathlib import Path
+
+_testurl = 'https://zenodo.org/record/5524913/files/eossr-v0.2.zip'
+
+def test_ZipUrl():
+    # checks the search of files by name and the extraction of a single member of the archive at `_testurl`
+    zipurl = ZipUrl(_testurl)
+    codemeta_paths = zipurl.find_files('codemeta.json')
+
+    assert 'eossr-v0.2/codemeta.json' in codemeta_paths
+    assert 'eossr-v0.2/eossr/metadata/schema/codemeta.json' in codemeta_paths
+    # extract_file is called without a target directory, hence the relative path checked below
+    zipurl.extract_file('eossr-v0.2/codemeta.json')
+    assert Path('eossr-v0.2/codemeta.json').exists()
+
+def test_get_codemeta_from_zipurl():
+    # the metadata read from the archive at `_testurl` is expected to have `eossr` as name
+    codemeta = get_codemeta_from_zipurl(_testurl)
+    assert codemeta['name'] == 'eossr'
--- a/eossr/utils.py
+++ b/eossr/utils.py
+import json
+import requests
+from io import BytesIO
+from zipfile import ZipFile
+from pathlib import Path
+
+__all__ = [
+    'ZipUrl',
+    'get_codemeta_from_zipurl',
+]
+
+class ZipUrl:
+    """
+    Access the content of a zip archive located at a url.
+
+    The url is fetched with `requests.get` when the object is created and the response is kept in `self.content`.
+    Each method re-opens the archive from the bytes of that response.
+    """
+
+    def __init__(self, url):
+        """
+        :param url: string
+            url to a zip file
+        """
+        self.url = url
+        # NOTE(review): the HTTP status of the response is not checked here
+        self.content = requests.get(self.url)
+
+    @property
+    def filelist(self):
+        """
+        Names of all the members of the archive.
+
+        :return: list[str]
+        """
+        with ZipFile(BytesIO(self.content.content)) as zipobj:
+            return zipobj.namelist()
+
+    def find_files(self, filename):
+        """
+        return the path of files in the archive matching `filename`
+        (only the last component of each path is compared to `filename`)
+
+        :param filename: string
+        :return: list[str]
+        :raises FileNotFoundError: if no file in the archive is named `filename`
+        """
+        matching_files = [f for f in self.filelist if Path(f).name == filename]
+        if len(matching_files) == 0:
+            raise FileNotFoundError(f"No file named {filename} in {self.url}")
+        else:
+            return matching_files
+
+    def extract_file(self, filepath):
+        """
+        Extract a single member of the archive and return the value of `ZipFile.extract`.
+
+        :param filepath: string
+            path of the member inside the archive
+        :return: path of the extracted file, as returned by `ZipFile.extract`
+        """
+        with ZipFile(BytesIO(self.content.content)) as zipobj:
+            # NOTE(review): no target directory is given, so the file should land under the current
+            # working directory (default of `ZipFile.extract`) - confirm this is intended
+            return zipobj.extract(filepath)
+
+
+def get_codemeta_from_zipurl(url):
+    """
+    Read the codemeta metadata from a zip archive located at a url.
+    A codemeta.json file must be present in the zip archive.
+    The file is read directly from the downloaded archive: nothing is written to disk.
+
+    :param url: string
+        url to a zip file
+    :return: dict
+        metadata in the codemeta.json file in the zip archive
+    :raises FileNotFoundError: if the archive contains no file named codemeta.json
+    """
+    zipurl = ZipUrl(url)
+    codemeta_paths = zipurl.find_files('codemeta.json')
+    # if there are more than one codemeta file in the archive, we consider the one in the root directory, hence the
+    # one with the shortest path
+    codemeta_path = min(codemeta_paths, key=len)
+    # read the member from the in-memory archive instead of extracting it into the working directory;
+    # json.loads accepts bytes and detects the UTF-8/16/32 encoding by itself
+    with ZipFile(BytesIO(zipurl.content.content)) as zipobj:
+        return json.loads(zipobj.read(codemeta_path))
+
--- a/examples/notebooks/ZipUrl.ipynb
+++ b/examples/notebooks/ZipUrl.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d2bcb61a",
+   "metadata": {},
+   "source": [
+    "# How to use ZipUrl"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "00eeeef5",
+   "metadata": {},
+   "source": [
+    "ZipUrl can be used to access a zip archive stored online.    \n",
+    "One can peek into the archive and extract a single file without extracting the whole archive (the archive itself is downloaded into memory)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "658cb424",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from eossr.utils import ZipUrl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ad996cd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "zipurl = ZipUrl('https://zenodo.org/record/5524913/files/eossr-v0.2.zip')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0dd38cf3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "zipurl.filelist"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "503e292a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "zipurl.extract_file('eossr-v0.2/README.md')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "680dea59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "zipurl.find_files('codemeta.json')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13c33da1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba648264",
+   "metadata": {},
+   "source": [
+    "## How it is used in eossr"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b8906723",
+   "metadata": {},
+   "source": [
+    "A repository is zipped before being uploaded to Zenodo (by the GitHub hook as well as by the eossr GitLab CI).    \n",
+    "ZipUrl is used in eossr to get the metadata from the `codemeta.json` file stored in that zip archive."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "51fd2eb4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from eossr.api import Record"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4201cd65",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "record = Record.from_id(933320, sandbox=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6eb8d25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "record.print_info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dad7fd90",
+   "metadata": {},
+   "source": [
+    "This test record has been generated using the official [Zenodo-GitHub hook](https://guides.github.com/activities/citable-code/). It contains a single zip file with the entire content of the repository:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2147c3cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "record.filelist"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "de033fbd",
+   "metadata": {},
+   "source": [
+    "The codemeta metadata can still be retrieved with a single command:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8762a46c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "record.get_codemeta()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9f2e80e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:markdown id:d2bcb61a tags:
+
+# How to use ZipUrl
+
+%% Cell type:markdown id:00eeeef5 tags:
+
+ZipUrl can be used to access a zip archive stored online.
+One can peek into the archive and extract a single file without extracting the whole archive (the archive itself is downloaded into memory).
+
+%% Cell type:code id:658cb424 tags:
+
+``` python
+from eossr.utils import ZipUrl
+```
+
+%% Cell type:code id:4ad996cd tags:
+
+``` python
+zipurl = ZipUrl('https://zenodo.org/record/5524913/files/eossr-v0.2.zip')
+```
+
+%% Cell type:code id:0dd38cf3 tags:
+
+``` python
+zipurl.filelist
+```
+
+%% Cell type:code id:503e292a tags:
+
+``` python
+zipurl.extract_file('eossr-v0.2/README.md')
+```
+
+%% Cell type:code id:680dea59 tags:
+
+``` python
+zipurl.find_files('codemeta.json')
+```
+
+%% Cell type:code id:13c33da1 tags:
+
+``` python
+```
+
+%% Cell type:markdown id:ba648264 tags:
+
+## How it is used in eossr
+
+%% Cell type:markdown id:b8906723 tags:
+
+A repository is zipped before being uploaded to Zenodo (by the GitHub hook as well as by the eossr GitLab CI).
+ZipUrl is used in eossr to get the metadata from the `codemeta.json` file stored in that zip archive.
+
+%% Cell type:code id:51fd2eb4 tags:
+
+``` python
+from eossr.api import Record
+```
+
+%% Cell type:code id:4201cd65 tags:
+
+``` python
+record = Record.from_id(933320, sandbox=True)
+```
+
+%% Cell type:code id:c6eb8d25 tags:
+
+``` python
+record.print_info()
+```
+
+%% Cell type:markdown id:dad7fd90 tags:
+
+This test record has been generated using the official [Zenodo-GitHub hook](https://guides.github.com/activities/citable-code/). It contains a single zip file with the entire content of the repository:
+
+%% Cell type:code id:2147c3cf tags:
+
+``` python
+record.filelist
+```
+
+%% Cell type:markdown id:de033fbd tags:
+
+The codemeta metadata can still be retrieved with a single command:
+
+%% Cell type:code id:8762a46c tags:
+
+``` python
+record.get_codemeta()
+```
+
+%% Cell type:code id:c9f2e80e tags:
+
+``` python
+```
--- a/examples/notebooks/ossr_api-Explore_the_OSSR.ipynb
+++ b/examples/notebooks/ossr_api-Explore_the_OSSR.ipynb
@@ -243,6 +243,16 @@
   "id": "25ec9b5a",
   "metadata": {},
   "outputs": [],
+   "source": [
+    "record.data.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f4de66b",
+   "metadata": {},
+   "outputs": [],
   "source": []
  },
  {
@@ -260,7 +270,8 @@
   "metadata": {},
   "source": [
    "## Getting CodeMeta metadata\n",
-    "If a `codemeta.json` file has been added to the record, one can retrieve it directly:"
+    "If a `codemeta.json` file has been added to the record, one can retrieve it directly.    \n",
+    "The method also looks into the zip archives that may be part of the record."
   ]
  },
  {
@@ -325,7 +336,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.6.10"
+   "version": "3.9.2"
  }
 },
 "nbformat": 4,

 %% Cell type:markdown id:4a25feec tags:

 # Find ESCAPE OSSR records


 **Please note**; to fetch information from the OSSR using the EOSSR API, **NO Zenodo token is needed**.

 The EOSSR API will get the public information that is available in the repository.

 %% Cell type:markdown id:5e008b43 tags:

 ## Getting all the records

 %% Cell type:code id:dbde9b19 tags:

 ``` python
 from eossr.api import get_ossr_records
 ```

 %% Cell type:code id:2fe017bc tags:

 ``` python
 ossr_records = get_ossr_records()
 ```

 %% Cell type:code id:5eb34293 tags:

 ``` python
 len(ossr_records)
 ```

 %% Cell type:markdown id:93b033a0 tags:

 Records are objects containing data and metadata sent by Zenodo API

 %% Cell type:code id:f88ebfd3 tags:

 ``` python
 record = ossr_records[0]
 ```

 %% Cell type:code id:988b95c7 tags:

 ``` python
 print(record)
 ```

 %% Cell type:code id:6bbe7db1 tags:

 ``` python
 record
 ```

 %% Cell type:code id:3aad2e35 tags:

 ``` python
 ```

 %% Cell type:markdown id:cb5f905e tags:

 You can use `print_info` to display minimal information about a `Record`:

 %% Cell type:code id:a9ec3d38 tags:

 ``` python
 record.print_info()
 ```

 %% Cell type:code id:28ad4039 tags:

 ``` python
 ```

 %% Cell type:markdown id:21384a7d tags:

 ## Specific search

 %% Cell type:markdown id:7f1a9209 tags:

 ### Using strings

 %% Cell type:code id:3a3dc135 tags:

 ``` python
 escape_records = get_ossr_records('escape')
 ```

 %% Cell type:code id:0aa0deec tags:

 ``` python
 for r in escape_records:
    print(r)
 ```

 %% Cell type:markdown id:c4b08fde tags:

 ### Using keywords

 %% Cell type:code id:f5cb8f77 tags:

 ``` python
 cta_records = get_ossr_records(keywords='CTA')
 len(cta_records)
 ```

 %% Cell type:code id:019f21eb tags:

 ``` python
 for record in cta_records:
    print(record)
 ```

 %% Cell type:code id:5f304419 tags:

 ``` python
 ```

 %% Cell type:markdown id:da77c97b tags:

 ### Directly from its id
 if you happen to know exactly the record you are looking for

 %% Cell type:code id:eb048c8f tags:

 ``` python
 from eossr.api.zenodo import get_record
 ```

 %% Cell type:code id:8a162ccb tags:

 ``` python
 record = get_record(4923992)
 ```

 %% Cell type:code id:81b76847 tags:

 ``` python
 print(record)
 ```

 %% Cell type:code id:25ec9b5a tags:

 ``` python
+record.data.keys()
+```
+
+%% Cell type:code id:6f4de66b tags:
+
+``` python
 ```

 %% Cell type:markdown id:a85d8056 tags:

 ## `Record` methods
 There are other useful methods to a Record class.

 %% Cell type:markdown id:9083a824 tags:

 ## Getting CodeMeta metadata
-If a `codemeta.json` file has been added to the record, one can retrieve it directly:
+If a `codemeta.json` file has been added to the record, one can retrieve it directly.
+The method also looks into the zip archives that may be part of the record.

 %% Cell type:code id:1431eb41 tags:

 ``` python
 record.get_codemeta()
 ```

 %% Cell type:code id:7a1d0f77 tags:

 ``` python
 ```

 %% Cell type:markdown id:64c43dd6 tags:

 ## MyBinder integration
 You can get a mybinder URL directly from a record:

 %% Cell type:code id:7cf9bcdb tags:

 ``` python
 record.get_mybinder_url()
 ```

 %% Cell type:code id:622e44d9 tags:

 ``` python
 ```