diff --git a/env/requirements-test.txt b/env/requirements-test.txt index 3476db60..87eedb97 100644 --- a/env/requirements-test.txt +++ b/env/requirements-test.txt @@ -2,4 +2,5 @@ pytest pytest-cov pytest-localftpserver +pytest-httpserver coverage diff --git a/environment.yml b/environment.yml index 29dfafb9..0dd03145 100644 --- a/environment.yml +++ b/environment.yml @@ -15,6 +15,7 @@ dependencies: - pytest - pytest-cov - pytest-localftpserver + - pytest-httpserver - coverage # Documentation - sphinx==4.4.* diff --git a/pooch/downloaders.py b/pooch/downloaders.py index 1cae6b1e..e34a5890 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -748,10 +748,13 @@ def populate_registry(self, pooch): class ZenodoRepository(DataRepository): # pylint: disable=missing-class-docstring + base_api_url = "https://zenodo.org/api/records" + def __init__(self, doi, archive_url): self.archive_url = archive_url self.doi = doi self._api_response = None + self._api_version = None @classmethod def initialize(cls, doi, archive_url): @@ -788,11 +791,43 @@ def api_response(self): article_id = self.archive_url.split("/")[-1] self._api_response = requests.get( - f"https://zenodo.org/api/records/{article_id}" + f"{self.base_api_url}/{article_id}" ).json() return self._api_response + @property + def api_version(self): + """ + Version of the Zenodo API we are interacting with + + The versions can either be : + + - ``"legacy"``: corresponds to the Zenodo API that was supported until + 2023-10-12 (before the migration to InvenioRDM). + - ``"new"``: corresponds to the new API that went online on 2023-10-13 + after the migration to InvenioRDM. + + The ``"new"`` API breaks backward compatibility with the ``"legacy"`` + one and could probably be replaced by an updated version that restores + the behaviour of the ``"legacy"`` one. + + Returns + ------- + str + """ + if self._api_version is None: + if all(["key" in file for file in self.api_response["files"]]): + self._api_version = "legacy" + elif all(["filename" in file for file in self.api_response["files"]]): + self._api_version = "new" + else: + raise ValueError( + "Couldn't determine the version of the Zenodo API for " + f"{self.archive_url} (doi:{self.doi})." + ) + return self._api_version + def download_url(self, file_name): """ Use the repository API to get the download URL for a file given @@ -807,13 +842,35 @@ def download_url(self, file_name): ------- download_url : str The HTTP URL that can be used to download the file. + + Notes + ----- + After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The + link to the desired files that appears in the API response leads to 404 + errors (by 2023-10-17). The files are available in the following url: + ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``. + + This method supports both the legacy and the new API. """ - files = {item["key"]: item for item in self.api_response["files"]} + # Create list of files in the repository + if self.api_version == "legacy": + files = {item["key"]: item for item in self.api_response["files"]} + else: + files = [item["filename"] for item in self.api_response["files"]] + # Check if file exists in the repository if file_name not in files: raise ValueError( - f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})." + f"File '{file_name}' not found in data archive " + f"{self.archive_url} (doi:{self.doi})." + ) + # Build download url + if self.api_version == "legacy": + download_url = files[file_name]["links"]["self"] + else: + article_id = self.api_response["id"] + download_url = ( + f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1" ) - download_url = files[file_name]["links"]["self"] return download_url def populate_registry(self, pooch): @@ -824,10 +881,22 @@ def populate_registry(self, pooch): ---------- pooch : Pooch The pooch instance that the registry will be added to. - """ + Notes + ----- + After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The + checksums for each file listed in the API reference is now an md5 sum. + + This method supports both the legacy and the new API. + """ for filedata in self.api_response["files"]: - pooch.registry[filedata["key"]] = filedata["checksum"] + checksum = filedata["checksum"] + if self.api_version == "legacy": + key = "key" + else: + key = "filename" + checksum = f"md5:{checksum}" + pooch.registry[filedata[key]] = checksum class FigshareRepository(DataRepository): # pylint: disable=missing-class-docstring diff --git a/pooch/tests/test_downloaders.py b/pooch/tests/test_downloaders.py index ec85f91a..d1777293 100644 --- a/pooch/tests/test_downloaders.py +++ b/pooch/tests/test_downloaders.py @@ -23,6 +23,7 @@ except ImportError: paramiko = None +from .. import Pooch from ..downloaders import ( HTTPDownloader, FTPDownloader, @@ -384,3 +385,157 @@ def close(): # Check that the downloaded file has the right content check_large_data(outfile) + + +class TestZenodoAPISupport: + """ + Test support for different Zenodo APIs + """ + + article_id = 123456 + doi = f"10.0001/zenodo.{article_id}" + doi_url = f"https://doi.org/{doi}" + file_name = "my-file.zip" + file_url = ( + "https://zenodo.org/api/files/513d7033-93a2-4eeb-821c-2fb0bbab0012/my-file.zip" + ) + file_checksum = "2942bfabb3d05332b66eb128e0842cff" + + legacy_api_response = dict( + created="2021-20-19T08:00:00.000000+00:00", + modified="2021-20-19T08:00:00.000000+00:00", + id=article_id, + doi=doi, + doi_url=doi_url, + files=[ + { + "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012", + "key": file_name, + "checksum": f"md5:{file_checksum}", + "links": { + "self": file_url, + }, + } + ], + ) + + new_api_response = dict( + created="2021-20-19T08:00:00.000000+00:00", + modified="2021-20-19T08:00:00.000000+00:00", + id=article_id, + doi=doi, + doi_url=doi_url, + files=[ + { + "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012", + "filename": file_name, + "checksum": file_checksum, + "links": { + "self": file_url, + }, + } + ], + ) + + invalid_api_response = dict( + created="2021-20-19T08:00:00.000000+00:00", + modified="2021-20-19T08:00:00.000000+00:00", + id=article_id, + doi=doi, + doi_url=doi_url, + files=[ + { + "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012", + "filename": file_name, + "checksum": file_checksum, + "links": { + "self": file_url, + }, + }, + { + "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012", + "key": file_name, + "checksum": f"md5:{file_checksum}", + "links": { + "self": file_url, + }, + }, + ], + ) + + @pytest.mark.parametrize( + "api_version, api_response", + [ + ("legacy", legacy_api_response), + ("new", new_api_response), + ("invalid", invalid_api_response), + ], + ) + def test_api_version(self, httpserver, api_version, api_response): + """ + Test if the API version is correctly detected. + """ + # Create a local http server + httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json( + api_response + ) + # Create Zenodo downloader + downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url) + # Override base url for the API of the downloader + downloader.base_api_url = httpserver.url_for("") + # Check if the API version is correctly identified + if api_version != "invalid": + assert downloader.api_version == api_version + else: + msg = "Couldn't determine the version of the Zenodo API" + with pytest.raises(ValueError, match=msg): + api_version = downloader.api_version + + @pytest.mark.parametrize( + "api_version, api_response", + [("legacy", legacy_api_response), ("new", new_api_response)], + ) + def test_download_url(self, httpserver, api_version, api_response): + """ + Test if the download url is correct for each API version. + """ + # Create a local http server + httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json( + api_response + ) + # Create Zenodo downloader + downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url) + # Override base url for the API of the downloader + downloader.base_api_url = httpserver.url_for("") + # Check if the download url is correct + download_url = downloader.download_url(file_name=self.file_name) + if api_version == "legacy": + assert download_url == self.file_url + else: + expected_url = ( + "https://zenodo.org/records/" + f"{self.article_id}/files/{self.file_name}?download=1" + ) + assert download_url == expected_url + + @pytest.mark.parametrize( + "api_response", + [legacy_api_response, new_api_response], + ) + def test_populate_registry(self, httpserver, tmp_path, api_response): + """ + Test if population of registry is correctly done for each API version. + """ + # Create a local http server + httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json( + api_response + ) + # Create sample pooch object + puppy = Pooch(base_url="", path=tmp_path) + # Create Zenodo downloader + downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url) + # Override base url for the API of the downloader + downloader.base_api_url = httpserver.url_for("") + # Populate registry + downloader.populate_registry(puppy) + assert puppy.registry == {self.file_name: f"md5:{self.file_checksum}"}