From ddfc2b244a4ac06ffdf9a1f3635f2f1f45c7e894 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Tue, 17 Oct 2023 11:03:55 -0700 Subject: [PATCH 1/8] Fix bug: update Zenodo downloader for new API Update the Zenodo downloader to work with the new Zenodo API. Use the `filename` key for getting the filenames of all files in a repository. Update the generated download url based on heuristics: the link present in the API reference doesn't work at the moment). Update the method that populates the registry: use the new `filename` key and specify that the checksums are now md5. --- pooch/downloaders.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index 1cae6b1e..aee4b98b 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -807,13 +807,25 @@ def download_url(self, file_name): ------- download_url : str The HTTP URL that can be used to download the file. + + Notes + ----- + After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The + link to the desired files that appears in the API response leads to 404 + errors (by 2023-10-17). The files are available in the following url: + ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``. """ - files = {item["key"]: item for item in self.api_response["files"]} - if file_name not in files: + # Check if file exists in the repository + filenames = [item["filename"] for item in self.api_response["files"]] + if file_name not in filenames: raise ValueError( f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})." ) - download_url = files[file_name]["links"]["self"] + # Build download url + article_id = self.api_response["id"] + download_url = ( + f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1" + ) return download_url def populate_registry(self, pooch): @@ -824,10 +836,15 @@ def populate_registry(self, pooch): ---------- pooch : Pooch The pooch instance that the registry will be added to. + + Notes + ----- + After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The + checksums for each file listed in the API reference is now an md5 sum. """ for filedata in self.api_response["files"]: - pooch.registry[filedata["key"]] = filedata["checksum"] + pooch.registry[filedata["filename"]] = "md5:" + filedata["checksum"] class FigshareRepository(DataRepository): # pylint: disable=missing-class-docstring @@ -938,7 +955,8 @@ def download_url(self, file_name): files = {item["name"]: item for item in self.api_response} if file_name not in files: raise ValueError( - f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})." + f"File '{file_name}' not found in data archive " + f"{self.archive_url} (doi:{self.doi})." ) download_url = files[file_name]["download_url"] return download_url From c6be96a51409efde95e20bb9460875e763c2a6bd Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Tue, 17 Oct 2023 11:08:53 -0700 Subject: [PATCH 2/8] Shorten line with error message --- pooch/downloaders.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index aee4b98b..288c6999 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -819,7 +819,8 @@ def download_url(self, file_name): filenames = [item["filename"] for item in self.api_response["files"]] if file_name not in filenames: raise ValueError( - f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})." + f"File '{file_name}' not found in data archive " + f"{self.archive_url} (doi:{self.doi})." ) # Build download url article_id = self.api_response["id"] From de956d5001c8ddb843e7b58f79755f52bef40808 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Tue, 17 Oct 2023 11:09:49 -0700 Subject: [PATCH 3/8] Restore unrelated line --- pooch/downloaders.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index 288c6999..50ed6a8b 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -956,8 +956,7 @@ def download_url(self, file_name): files = {item["name"]: item for item in self.api_response} if file_name not in files: raise ValueError( - f"File '{file_name}' not found in data archive " - f"{self.archive_url} (doi:{self.doi})." + f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})." ) download_url = files[file_name]["download_url"] return download_url From 23e205784a24804bc0247c391e64c2acd12a71f8 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Tue, 17 Oct 2023 11:11:05 -0700 Subject: [PATCH 4/8] Use f-string instead of str concatenation --- pooch/downloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index 50ed6a8b..47cdb5fc 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -845,7 +845,7 @@ def populate_registry(self, pooch): """ for filedata in self.api_response["files"]: - pooch.registry[filedata["filename"]] = "md5:" + filedata["checksum"] + pooch.registry[filedata["filename"]] = f"md5:{filedata['checksum']}" class FigshareRepository(DataRepository): # pylint: disable=missing-class-docstring From 6db2a997db9b52a43ac6ddc9a6551c4fc71ee4d1 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Fri, 20 Oct 2023 11:43:15 -0700 Subject: [PATCH 5/8] Support both the legacy and the new Zenodo API Detect which API are we interacting with and generate the download url and/or populate the registry accordingly. --- pooch/downloaders.py | 65 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index 47cdb5fc..c3b266fc 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -752,6 +752,7 @@ def __init__(self, doi, archive_url): self.archive_url = archive_url self.doi = doi self._api_response = None + self._api_version = None @classmethod def initialize(cls, doi, archive_url): @@ -793,6 +794,38 @@ def api_response(self): return self._api_response + @property + def api_version(self): + """ + Version of the Zenodo API we are interacting with + + The versions can either be : + + - ``"legacy"``: corresponds to the Zenodo API that was supported until + 2023-10-12 (before the migration to InvenioRDM). + - ``"new"``: corresponds to the new API that went online on 2023-10-13 + after the migration to InvenioRDM. + + The ``"new"`` API breaks backward compatibility with the ``"legacy"`` + one and could probably be replaced by an updated version that restores + the behaviour of the ``"legacy"`` one. + + Returns + ------- + str + """ + if self._api_version is None: + if all(["key" in file for file in self.api_response["files"]]): + self._api_version = "legacy" + elif all(["filename" in file for file in self.api_response["files"]]): + self._api_version = "new" + else: + raise ValueError( + "Couldn't determine the version of the Zenodo API for " + f"{self.archive_url} (doi:{self.doi})." + ) + return self._api_version + def download_url(self, file_name): """ Use the repository API to get the download URL for a file given @@ -814,19 +847,28 @@ def download_url(self, file_name): link to the desired files that appears in the API response leads to 404 errors (by 2023-10-17). The files are available in the following url: ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``. + + This method supports both the legacy and the new API. """ + # Create list of files in the repository + if self.api_version == "legacy": + files = {item["key"]: item for item in self.api_response["files"]} + else: + files = [item["filename"] for item in self.api_response["files"]] # Check if file exists in the repository - filenames = [item["filename"] for item in self.api_response["files"]] - if file_name not in filenames: + if file_name not in files: raise ValueError( f"File '{file_name}' not found in data archive " f"{self.archive_url} (doi:{self.doi})." ) # Build download url - article_id = self.api_response["id"] - download_url = ( - f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1" - ) + if self.api_version == "legacy": + download_url = files[file_name]["links"]["self"] + else: + article_id = self.api_response["id"] + download_url = ( + f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1" + ) return download_url def populate_registry(self, pooch): @@ -842,10 +884,17 @@ def populate_registry(self, pooch): ----- After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The checksums for each file listed in the API reference is now an md5 sum. - """ + This method supports both the legacy and the new API. + """ for filedata in self.api_response["files"]: - pooch.registry[filedata["filename"]] = f"md5:{filedata['checksum']}" + checksum = filedata["checksum"] + if self.api_version == "legacy": + key = "key" + else: + key = "filename" + checksum = f"md5:{checksum}" + pooch.registry[filedata[key]] = checksum class FigshareRepository(DataRepository): # pylint: disable=missing-class-docstring From c5159adb7d30f07f3ba9b5a0ecf6e9e033128c9f Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Fri, 20 Oct 2023 13:12:01 -0700 Subject: [PATCH 6/8] Use pytest-httpserver to test support for old and new APIs Test if `ZenodoRepository` correctly supports both the old and new Zenodo APIs. Use `pytest-httpserver` to run a local server that returns API response that mimics the Zenodo ones. Define a `base_api_url` class attribute for `ZenodoRepository` so we can override it in tests. --- env/requirements-test.txt | 1 + environment.yml | 1 + pooch/downloaders.py | 5 +- pooch/tests/test_downloaders.py | 155 ++++++++++++++++++++++++++++++++ 4 files changed, 161 insertions(+), 1 deletion(-) diff --git a/env/requirements-test.txt b/env/requirements-test.txt index 3476db60..87eedb97 100644 --- a/env/requirements-test.txt +++ b/env/requirements-test.txt @@ -2,4 +2,5 @@ pytest pytest-cov pytest-localftpserver +pytest-httpserver coverage diff --git a/environment.yml b/environment.yml index 29dfafb9..0dd03145 100644 --- a/environment.yml +++ b/environment.yml @@ -15,6 +15,7 @@ dependencies: - pytest - pytest-cov - pytest-localftpserver + - pytest-httpserver - coverage # Documentation - sphinx==4.4.* diff --git a/pooch/downloaders.py b/pooch/downloaders.py index c3b266fc..8c8e665f 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -748,6 +748,9 @@ def populate_registry(self, pooch): class ZenodoRepository(DataRepository): # pylint: disable=missing-class-docstring + + base_api_url = "https://zenodo.org/api/records" + def __init__(self, doi, archive_url): self.archive_url = archive_url self.doi = doi @@ -789,7 +792,7 @@ def api_response(self): article_id = self.archive_url.split("/")[-1] self._api_response = requests.get( - f"https://zenodo.org/api/records/{article_id}" + f"{self.base_api_url}/{article_id}" ).json() return self._api_response diff --git a/pooch/tests/test_downloaders.py b/pooch/tests/test_downloaders.py index ec85f91a..1b9a812c 100644 --- a/pooch/tests/test_downloaders.py +++ b/pooch/tests/test_downloaders.py @@ -23,6 +23,7 @@ except ImportError: paramiko = None +from .. import Pooch from ..downloaders import ( HTTPDownloader, FTPDownloader, @@ -384,3 +385,157 @@ def close(): # Check that the downloaded file has the right content check_large_data(outfile) + + +class TestZenodoAPISupport: + """ + Test support for different Zenodo APIs + """ + + article_id = 123456 + doi = f"10.0001/zenodo.{article_id}" + doi_url = f"https://doi.org/{doi}" + file_name = "my-file.zip" + file_url = ( + "https://zenodo.org/api/files/513d7033-93a2-4eeb-821c-2fb0bbab0012/my-file.zip" + ) + file_checksum = "2942bfabb3d05332b66eb128e0842cff" + + legacy_api_response = dict( + created="2021-20-19T08:00:00.000000+00:00", + modified="2021-20-19T08:00:00.000000+00:00", + id=article_id, + doi=doi, + doi_url=doi_url, + files=[ + { + "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012", + "key": file_name, + "checksum": f"md5:{file_checksum}", + "links": { + "self": file_url, + }, + } + ], + ) + + new_api_response = dict( + created="2021-20-19T08:00:00.000000+00:00", + modified="2021-20-19T08:00:00.000000+00:00", + id=article_id, + doi=doi, + doi_url=doi_url, + files=[ + { + "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012", + "filename": file_name, + "checksum": file_checksum, + "links": { + "self": file_url, + }, + } + ], + ) + + invalid_api_response = dict( + created="2021-20-19T08:00:00.000000+00:00", + modified="2021-20-19T08:00:00.000000+00:00", + id=article_id, + doi=doi, + doi_url=doi_url, + files=[ + { + "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012", + "filename": file_name, + "checksum": file_checksum, + "links": { + "self": file_url, + }, + }, + { + "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012", + "key": file_name, + "checksum": f"md5:{file_checksum}", + "links": { + "self": file_url, + }, + }, + ], + ) + + @pytest.mark.parametrize( + "api_version, api_response", + [ + ("legacy", legacy_api_response), + ("new", new_api_response), + ("invalid", invalid_api_response), + ], + ) + def test_api_version(self, httpserver, api_version, api_response): + """ + Test if the API version is correctly detected. + """ + # Create a local http server + httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json( + api_response + ) + # Create Zenodo downloader + downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url) + # Override base url for the API of the downloader + downloader.base_api_url = httpserver.url_for("") + # Check if the API version is correctly identified + if api_version != "invalid": + assert downloader.api_version == api_version + else: + msg = "Couldn't determine the version of the Zenodo API" + with pytest.raises(ValueError, match=msg): + downloader.api_version + + @pytest.mark.parametrize( + "api_version, api_response", + [("legacy", legacy_api_response), ("new", new_api_response)], + ) + def test_download_url(self, httpserver, api_version, api_response): + """ + Test if the download url is correct for each API version. + """ + # Create a local http server + httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json( + api_response + ) + # Create Zenodo downloader + downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url) + # Override base url for the API of the downloader + downloader.base_api_url = httpserver.url_for("") + # Check if the download url is correct + download_url = downloader.download_url(file_name=self.file_name) + if api_version == "legacy": + assert download_url == self.file_url + else: + expected_url = ( + "https://zenodo.org/records/" + f"{self.article_id}/files/{self.file_name}?download=1" + ) + assert download_url == expected_url + + @pytest.mark.parametrize( + "api_version, api_response", + [("legacy", legacy_api_response), ("new", new_api_response)], + ) + def test_populate_registry(self, httpserver, tmp_path, api_version, api_response): + """ + Test if population of registry is correctly done for each API version. + """ + # Create a local http server + httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json( + api_response + ) + # Create sample pooch object + puppy = Pooch(base_url="", path=tmp_path) + # Create Zenodo downloader + downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url) + # Override base url for the API of the downloader + downloader.base_api_url = httpserver.url_for("") + # Populate registry + downloader.populate_registry(puppy) + assert puppy.registry == {self.file_name: f"md5:{self.file_checksum}"} From 3634bb3e9e36c7408435289820e3d4ee78c4b9d8 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Fri, 20 Oct 2023 13:22:00 -0700 Subject: [PATCH 7/8] Run black --- pooch/downloaders.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pooch/downloaders.py b/pooch/downloaders.py index 8c8e665f..e34a5890 100644 --- a/pooch/downloaders.py +++ b/pooch/downloaders.py @@ -748,7 +748,6 @@ def populate_registry(self, pooch): class ZenodoRepository(DataRepository): # pylint: disable=missing-class-docstring - base_api_url = "https://zenodo.org/api/records" def __init__(self, doi, archive_url): From d5bfc84e107b3038a9b23d0913ac15ead63f45f3 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Fri, 20 Oct 2023 14:18:33 -0700 Subject: [PATCH 8/8] Fix style to solve flake8 errors --- pooch/tests/test_downloaders.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pooch/tests/test_downloaders.py b/pooch/tests/test_downloaders.py index 1b9a812c..d1777293 100644 --- a/pooch/tests/test_downloaders.py +++ b/pooch/tests/test_downloaders.py @@ -489,7 +489,7 @@ def test_api_version(self, httpserver, api_version, api_response): else: msg = "Couldn't determine the version of the Zenodo API" with pytest.raises(ValueError, match=msg): - downloader.api_version + api_version = downloader.api_version @pytest.mark.parametrize( "api_version, api_response", @@ -519,10 +519,10 @@ def test_download_url(self, httpserver, api_version, api_response): assert download_url == expected_url @pytest.mark.parametrize( - "api_version, api_response", - [("legacy", legacy_api_response), ("new", new_api_response)], + "api_response", + [legacy_api_response, new_api_response], ) - def test_populate_registry(self, httpserver, tmp_path, api_version, api_response): + def test_populate_registry(self, httpserver, tmp_path, api_response): """ Test if population of registry is correctly done for each API version. """