From 099e87fcf47e59b5e911f72f50ca3c94aed7444f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20K=C3=B6ster?=
Date: Mon, 19 Aug 2024 13:06:05 +0200
Subject: [PATCH] fix: allow both gs:// and gcs:// as query scheme; internally
 normalize to gs:// for compatibility with google cloud tools (#52)

## Summary by CodeRabbit

- **New Features**
  - Enhanced compatibility with Google Cloud Storage URLs by accepting both "gcs://" and "gs://".
  - Introduced a method to normalize "gcs://" URLs to "gs://".

- **Improvements**
  - Updated error messages for clearer guidance on valid URL schemes.
  - Revised example queries to highlight the "gs://" format.
---
 pyproject.toml                           |  2 +-
 snakemake_storage_plugin_gcs/__init__.py | 28 ++++++++++++++++++------
 tests/tests.py                           | 12 +++++-----
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 896cc58..5bd15a5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,7 +15,7 @@ keywords = ["snakemake", "storage", "plugin", "google cloud storage"]
 [tool.poetry.dependencies]
 python = "^3.11"
 snakemake-interface-common = "^1.14.2"
-snakemake-interface-storage-plugins = "^3.0.0"
+snakemake-interface-storage-plugins = "^3.3.0"
 google-cloud-storage = "^2.12.0"
 google-crc32c = "^1.1.2"
 
diff --git a/snakemake_storage_plugin_gcs/__init__.py b/snakemake_storage_plugin_gcs/__init__.py
index 2b1c988..ecaff1c 100644
--- a/snakemake_storage_plugin_gcs/__init__.py
+++ b/snakemake_storage_plugin_gcs/__init__.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass, field
+import re
 from typing import Any, Iterable, List, Optional
 from snakemake_interface_common.utils import lazy_property
 from snakemake_interface_storage_plugins.settings import StorageProviderSettingsBase
@@ -32,6 +33,8 @@
 from google.api_core import retry
 from google_crc32c import Checksum
 
+_RE_GCS_SCHEME = re.compile(r"^gcs://")
+
 
 # Optional:
 # Settings for the Google Storage plugin (e.g. host url, credentials).
@@ -195,11 +198,12 @@ def is_valid_query(cls, query: str) -> StorageQueryValidationResult:
                 valid=False,
                 reason=f"cannot be parsed as URL ({e})",
             )
-        if parsed.scheme != "gcs":
+
+        if parsed.scheme != "gcs" and parsed.scheme != "gs":
             return StorageQueryValidationResult(
                 query=query,
                 valid=False,
-                reason="must start with gcs scheme (gcs://...)",
+                reason="must start with gcs or gs scheme (gs://... or gcs://...)",
             )
         return StorageQueryValidationResult(
             query=query,
@@ -213,10 +217,16 @@ def example_queries(cls) -> List[ExampleQuery]:
         """
         return [
             ExampleQuery(
-                query="gcs://mybucket/myfile.txt",
+                query="gs://mybucket/myfile.txt",
                 type=QueryType.ANY,
                 description="A file in an google storage (GCS) bucket",
-            )
+            ),
+            ExampleQuery(
+                query="gcs://mybucket/myfile.txt",
+                type=QueryType.ANY,
+                description="A file in an google storage (GCS) bucket (alternative "
+                "query scheme)",
+            ),
         ]
 
     def use_rate_limiter(self) -> bool:
@@ -248,6 +258,10 @@ def list_objects(self, query: Any) -> Iterable[str]:
         b = self.client.bucket(bucket_name, user_project=self.settings.project)
         return [k.name for k in b.list_blobs()]
 
+    def postprocess_query(self, query: str) -> str:
+        # normalize gcs:// to gs:// (the official scheme for google storage tools)
+        return _RE_GCS_SCHEME.sub("gs://", query)
+
 
 # Required:
 # Implementation of storage object. If certain methods cannot be supported by your
@@ -467,16 +481,16 @@ def list_candidate_matches(self) -> Iterable[str]:
         """Return a list of candidate matches in the storage for the query."""
         # This is used by glob_wildcards() to find matches for wildcards in the query.
         prefix = get_constant_prefix(self.query)
-        if prefix.startswith(f"gcs://{self.bucket.name}"):
+        if prefix.startswith(f"gs://{self.bucket.name}"):
             prefix = prefix[6 + len(self.bucket.name) :].lstrip("/")
             return (
-                f"gcs://{self.bucket.name}/{item.name}"
+                f"gs://{self.bucket.name}/{item.name}"
                 for item in self.bucket.list_blobs(prefix=prefix)
             )
         else:
             raise WorkflowError(
-                f"GCS storage object {self.query} must start with gcs://"
+                f"GCS storage object {self.query} must start with gs:// or gcs://"
             )
 
     # Helper functions and properties not part of standard interface
diff --git a/tests/tests.py b/tests/tests.py
index f915d54..6bba4db 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -22,12 +22,12 @@ class TestStorage(TestStorageBase):
     files_only = True
 
     def get_query(self, tmp_path) -> str:
-        return "gcs://snakemake-test-bucket/test-file.txt"
+        return "gs://snakemake-test-bucket/test-file.txt"
 
     def get_query_not_existing(self, tmp_path) -> str:
         bucket = uuid.uuid4().hex
         key = uuid.uuid4().hex
-        return f"gcs://{bucket}/{key}"
+        return f"gs://{bucket}/{key}"
 
     def get_storage_provider_cls(self) -> Type[StorageProviderBase]:
         # Return the StorageProvider class of this plugin
@@ -46,7 +46,7 @@ def test_storage_nonempty_directory(self, tmp_path):
         tmpdir = "test_nonemptydir"
 
         # store the directory
-        obj = self._get_obj(tmp_path, f"gcs://snakemake-test-bucket/{tmpdir}")
+        obj = self._get_obj(tmp_path, f"gs://snakemake-test-bucket/{tmpdir}")
 
         stored = False
         try:
@@ -83,11 +83,11 @@ def test_storage_nonempty_directory(self, tmp_path):
             shutil.rmtree(obj.local_path())
 
     def test_list_candidate_matches(self, tmp_path):
-        obj = self._get_obj(tmp_path, "gcs://snakemake-test-bucket/")
+        obj = self._get_obj(tmp_path, "gs://snakemake-test-bucket/")
         candidates = list(obj.list_candidate_matches())
         # I think the previous test deletes the first test_object
         expected_matches = [
-            "gcs://snakemake-test-bucket/test-file_2.txt",
-            "gcs://snakemake-test-bucket/test-file_3.txt",
+            "gs://snakemake-test-bucket/test-file_2.txt",
+            "gs://snakemake-test-bucket/test-file_3.txt",
         ]
         assert candidates == expected_matches
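Note (illustration only, not part of the patch): the sketch below shows the query-scheme handling this change introduces. The standalone `is_valid_scheme` and `normalize_query` helpers are hypothetical stand-ins for the plugin's `is_valid_query` and `postprocess_query` methods; the regex mirrors `_RE_GCS_SCHEME` from the diff.

```python
import re
from urllib.parse import urlparse

# Mirrors _RE_GCS_SCHEME from the patched module.
_RE_GCS_SCHEME = re.compile(r"^gcs://")


def is_valid_scheme(query: str) -> bool:
    # After this patch, both the gcs and gs schemes are accepted as valid queries.
    return urlparse(query).scheme in ("gcs", "gs")


def normalize_query(query: str) -> str:
    # The patch rewrites gcs:// to gs://, the scheme used by Google's own
    # tooling (gsutil, gcloud storage); gs:// queries pass through unchanged.
    return _RE_GCS_SCHEME.sub("gs://", query)


assert is_valid_scheme("gcs://mybucket/myfile.txt")
assert is_valid_scheme("gs://mybucket/myfile.txt")
assert normalize_query("gcs://mybucket/myfile.txt") == "gs://mybucket/myfile.txt"
assert normalize_query("gs://mybucket/myfile.txt") == "gs://mybucket/myfile.txt"
```

Normalizing internally to gs:// keeps the plugin's queries aligned with what Google Cloud tools expect, while workflows may continue to use the older gcs:// spelling.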