Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug: add support for old and new Zenodo APIs #375

Merged
merged 8 commits into from
Oct 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions env/requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
pytest
pytest-cov
pytest-localftpserver
pytest-httpserver
coverage
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies:
- pytest
- pytest-cov
- pytest-localftpserver
- pytest-httpserver
- coverage
# Documentation
- sphinx==4.4.*
Expand Down
81 changes: 75 additions & 6 deletions pooch/downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -748,10 +748,13 @@ def populate_registry(self, pooch):


class ZenodoRepository(DataRepository): # pylint: disable=missing-class-docstring
base_api_url = "https://zenodo.org/api/records"

def __init__(self, doi, archive_url):
self.archive_url = archive_url
self.doi = doi
self._api_response = None
self._api_version = None

@classmethod
def initialize(cls, doi, archive_url):
Expand Down Expand Up @@ -788,11 +791,43 @@ def api_response(self):

article_id = self.archive_url.split("/")[-1]
self._api_response = requests.get(
f"https://zenodo.org/api/records/{article_id}"
f"{self.base_api_url}/{article_id}"
).json()

return self._api_response

@property
def api_version(self):
    """
    Version of the Zenodo API we are interacting with.

    The version is inferred from the fields of the file entries in the API
    response and cached after the first successful detection. It can be
    either:

    - ``"legacy"``: corresponds to the Zenodo API that was supported until
      2023-10-12 (before the migration to InvenioRDM). Every file entry
      has a ``"key"`` field.
    - ``"new"``: corresponds to the new API that went online on 2023-10-13
      after the migration to InvenioRDM. Every file entry has a
      ``"filename"`` field.

    The ``"new"`` API breaks backward compatibility with the ``"legacy"``
    one and could probably be replaced by an updated version that restores
    the behaviour of the ``"legacy"`` one.

    Returns
    -------
    str
        Either ``"legacy"`` or ``"new"``.

    Raises
    ------
    ValueError
        If the file entries neither all have a ``"key"`` field nor all have
        a ``"filename"`` field (e.g. a mix of both kinds of entries).
    """
    if self._api_version is None:
        # Look the file list up once and reuse it for both checks.
        files = self.api_response["files"]
        # Generator expressions let ``all`` stop at the first mismatch
        # instead of building a throwaway list. Note that an empty file
        # list is reported as "legacy" because ``all`` of an empty
        # iterable is True.
        if all("key" in file for file in files):
            self._api_version = "legacy"
        elif all("filename" in file for file in files):
            self._api_version = "new"
        else:
            raise ValueError(
                "Couldn't determine the version of the Zenodo API for "
                f"{self.archive_url} (doi:{self.doi})."
            )
    return self._api_version

def download_url(self, file_name):
"""
Use the repository API to get the download URL for a file given
Expand All @@ -807,13 +842,35 @@ def download_url(self, file_name):
-------
download_url : str
The HTTP URL that can be used to download the file.

Notes
-----
After Zenodo migrated to InvenioRDM in October 2023, their API changed. The
link to the desired file that appears in the API response leads to a 404
error (as of 2023-10-17). The files are available at the following URL:
``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.

This method supports both the legacy and the new API.
"""
files = {item["key"]: item for item in self.api_response["files"]}
# Create list of files in the repository
if self.api_version == "legacy":
files = {item["key"]: item for item in self.api_response["files"]}
else:
files = [item["filename"] for item in self.api_response["files"]]
# Check if file exists in the repository
if file_name not in files:
raise ValueError(
f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})."
f"File '{file_name}' not found in data archive "
f"{self.archive_url} (doi:{self.doi})."
)
# Build download url
if self.api_version == "legacy":
download_url = files[file_name]["links"]["self"]
else:
article_id = self.api_response["id"]
download_url = (
f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1"
)
download_url = files[file_name]["links"]["self"]
return download_url

def populate_registry(self, pooch):
Expand All @@ -824,10 +881,22 @@ def populate_registry(self, pooch):
----------
pooch : Pooch
The pooch instance that the registry will be added to.
"""

Notes
-----
After Zenodo migrated to InvenioRDM in October 2023, their API changed. The
checksum of each file listed in the API response is now an md5 hash
without the ``md5:`` prefix.

This method supports both the legacy and the new API.
"""
for filedata in self.api_response["files"]:
pooch.registry[filedata["key"]] = filedata["checksum"]
checksum = filedata["checksum"]
if self.api_version == "legacy":
key = "key"
else:
key = "filename"
checksum = f"md5:{checksum}"
pooch.registry[filedata[key]] = checksum


class FigshareRepository(DataRepository): # pylint: disable=missing-class-docstring
Expand Down
155 changes: 155 additions & 0 deletions pooch/tests/test_downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
except ImportError:
paramiko = None

from .. import Pooch
from ..downloaders import (
HTTPDownloader,
FTPDownloader,
Expand Down Expand Up @@ -384,3 +385,157 @@ def close():

# Check that the downloaded file has the right content
check_large_data(outfile)


class TestZenodoAPISupport:
    """
    Test support for different Zenodo APIs

    Every test serves one of the canned API responses defined below from a
    local HTTP server and points a ``ZenodoRepository`` at that server, so
    no request is sent to the real Zenodo API.
    """

    # Identifiers of the fake Zenodo record shared by all the responses
    article_id = 123456
    doi = f"10.0001/zenodo.{article_id}"
    doi_url = f"https://doi.org/{doi}"
    file_name = "my-file.zip"
    file_url = (
        "https://zenodo.org/api/files/513d7033-93a2-4eeb-821c-2fb0bbab0012/my-file.zip"
    )
    file_checksum = "2942bfabb3d05332b66eb128e0842cff"

    # Legacy response: file entries use "key" and a "md5:"-prefixed checksum
    legacy_api_response = dict(
        created="2021-20-19T08:00:00.000000+00:00",
        modified="2021-20-19T08:00:00.000000+00:00",
        id=article_id,
        doi=doi,
        doi_url=doi_url,
        files=[
            {
                "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
                "key": file_name,
                "checksum": f"md5:{file_checksum}",
                "links": {
                    "self": file_url,
                },
            }
        ],
    )

    # New response: file entries use "filename" and an unprefixed checksum
    new_api_response = dict(
        created="2021-20-19T08:00:00.000000+00:00",
        modified="2021-20-19T08:00:00.000000+00:00",
        id=article_id,
        doi=doi,
        doi_url=doi_url,
        files=[
            {
                "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
                "filename": file_name,
                "checksum": file_checksum,
                "links": {
                    "self": file_url,
                },
            }
        ],
    )

    # Invalid response: mixes a new-style and a legacy-style file entry, so
    # the API version cannot be determined
    invalid_api_response = dict(
        created="2021-20-19T08:00:00.000000+00:00",
        modified="2021-20-19T08:00:00.000000+00:00",
        id=article_id,
        doi=doi,
        doi_url=doi_url,
        files=[
            {
                "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
                "filename": file_name,
                "checksum": file_checksum,
                "links": {
                    "self": file_url,
                },
            },
            {
                "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
                "key": file_name,
                "checksum": f"md5:{file_checksum}",
                "links": {
                    "self": file_url,
                },
            },
        ],
    )

    def _make_repository(self, httpserver, api_response):
        """
        Build a ZenodoRepository that talks to the local HTTP server.

        Parameters
        ----------
        httpserver
            The local HTTP server fixture used to fake the Zenodo API.
        api_response : dict
            The JSON payload the server should return for the record.

        Returns
        -------
        ZenodoRepository
            Repository whose ``base_api_url`` points to the local server.
        """
        # Make the local http server answer with the canned response
        httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json(
            api_response
        )
        # Create Zenodo downloader
        downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url)
        # Override base url for the API of the downloader
        downloader.base_api_url = httpserver.url_for("")
        return downloader

    @pytest.mark.parametrize(
        "api_version, api_response",
        [
            ("legacy", legacy_api_response),
            ("new", new_api_response),
            ("invalid", invalid_api_response),
        ],
    )
    def test_api_version(self, httpserver, api_version, api_response):
        """
        Test if the API version is correctly detected.
        """
        downloader = self._make_repository(httpserver, api_response)
        # Check if the API version is correctly identified
        if api_version != "invalid":
            assert downloader.api_version == api_version
        else:
            msg = "Couldn't determine the version of the Zenodo API"
            with pytest.raises(ValueError, match=msg):
                # Accessing the property is what triggers the detection;
                # the value is discarded instead of rebinding the
                # parametrized ``api_version`` argument.
                _ = downloader.api_version

    @pytest.mark.parametrize(
        "api_version, api_response",
        [("legacy", legacy_api_response), ("new", new_api_response)],
    )
    def test_download_url(self, httpserver, api_version, api_response):
        """
        Test if the download url is correct for each API version.
        """
        downloader = self._make_repository(httpserver, api_response)
        # Check if the download url is correct
        download_url = downloader.download_url(file_name=self.file_name)
        if api_version == "legacy":
            assert download_url == self.file_url
        else:
            expected_url = (
                "https://zenodo.org/records/"
                f"{self.article_id}/files/{self.file_name}?download=1"
            )
            assert download_url == expected_url

    @pytest.mark.parametrize(
        "api_response",
        [legacy_api_response, new_api_response],
    )
    def test_populate_registry(self, httpserver, tmp_path, api_response):
        """
        Test if population of registry is correctly done for each API version.
        """
        downloader = self._make_repository(httpserver, api_response)
        # Create sample pooch object
        puppy = Pooch(base_url="", path=tmp_path)
        # Populate registry
        downloader.populate_registry(puppy)
        # Both API versions must produce a "md5:"-prefixed hash
        assert puppy.registry == {self.file_name: f"md5:{self.file_checksum}"}
Loading