Skip to content

Commit

Permalink
fix: allow both gs:// and gcs:// as query scheme; internally normaliz…
Browse files Browse the repository at this point in the history
…e to gs:// for compatibility with google cloud tools (#52)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **New Features**
- Enhanced compatibility with Google Cloud Storage URLs by accepting
both "gcs://" and "gs://".
  - Introduced a method to normalize "gcs://" URLs to "gs://".

- **Improvements**
  - Updated error messages for clearer guidance on valid URL schemes.
  - Revised example queries to highlight the "gs://" format.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
  • Loading branch information
johanneskoester authored Aug 19, 2024
1 parent 13bde6b commit 099e87f
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 14 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ keywords = ["snakemake", "storage", "plugin", "google cloud storage"]
[tool.poetry.dependencies]
python = "^3.11"
snakemake-interface-common = "^1.14.2"
snakemake-interface-storage-plugins = "^3.0.0"
snakemake-interface-storage-plugins = "^3.3.0"
google-cloud-storage = "^2.12.0"
google-crc32c = "^1.1.2"

Expand Down
28 changes: 21 additions & 7 deletions snakemake_storage_plugin_gcs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from dataclasses import dataclass, field
import re
from typing import Any, Iterable, List, Optional
from snakemake_interface_common.utils import lazy_property
from snakemake_interface_storage_plugins.settings import StorageProviderSettingsBase
Expand Down Expand Up @@ -32,6 +33,8 @@
from google.api_core import retry
from google_crc32c import Checksum

_RE_GCS_SCHEME = re.compile(r"^gcs://")


# Optional:
# Settings for the Google Storage plugin (e.g. host url, credentials).
Expand Down Expand Up @@ -195,11 +198,12 @@ def is_valid_query(cls, query: str) -> StorageQueryValidationResult:
valid=False,
reason=f"cannot be parsed as URL ({e})",
)
if parsed.scheme != "gcs":

if parsed.scheme != "gcs" and parsed.scheme != "gs":
return StorageQueryValidationResult(
query=query,
valid=False,
reason="must start with gcs scheme (gcs://...)",
reason="must start with gcs or gs scheme (gs://... or gcs://...)",
)
return StorageQueryValidationResult(
query=query,
Expand All @@ -213,10 +217,16 @@ def example_queries(cls) -> List[ExampleQuery]:
"""
return [
ExampleQuery(
query="gcs://mybucket/myfile.txt",
query="gs://mybucket/myfile.txt",
type=QueryType.ANY,
description="A file in an google storage (GCS) bucket",
)
),
ExampleQuery(
query="gcs://mybucket/myfile.txt",
type=QueryType.ANY,
description="A file in an google storage (GCS) bucket (alternative "
"query scheme)",
),
]

def use_rate_limiter(self) -> bool:
Expand Down Expand Up @@ -248,6 +258,10 @@ def list_objects(self, query: Any) -> Iterable[str]:
b = self.client.bucket(bucket_name, user_project=self.settings.project)
return [k.name for k in b.list_blobs()]

def postprocess_query(self, query: str) -> str:
# normalize gcs:// to gs:// (the official scheme for google storage tools)
return _RE_GCS_SCHEME.sub("gs://", query)


# Required:
# Implementation of storage object. If certain methods cannot be supported by your
Expand Down Expand Up @@ -467,16 +481,16 @@ def list_candidate_matches(self) -> Iterable[str]:
"""Return a list of candidate matches in the storage for the query."""
# This is used by glob_wildcards() to find matches for wildcards in the query.
prefix = get_constant_prefix(self.query)
if prefix.startswith(f"gcs://{self.bucket.name}"):
if prefix.startswith(f"gs://{self.bucket.name}"):
prefix = prefix[6 + len(self.bucket.name) :].lstrip("/")

return (
f"gcs://{self.bucket.name}/{item.name}"
f"gs://{self.bucket.name}/{item.name}"
for item in self.bucket.list_blobs(prefix=prefix)
)
else:
raise WorkflowError(
f"GCS storage object {self.query} must start with gcs://"
f"GCS storage object {self.query} must start with gs:// or gcs://"
)

# Helper functions and properties not part of standard interface
Expand Down
12 changes: 6 additions & 6 deletions tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ class TestStorage(TestStorageBase):
files_only = True #

def get_query(self, tmp_path) -> str:
return "gcs://snakemake-test-bucket/test-file.txt"
return "gs://snakemake-test-bucket/test-file.txt"

def get_query_not_existing(self, tmp_path) -> str:
bucket = uuid.uuid4().hex
key = uuid.uuid4().hex
return f"gcs://{bucket}/{key}"
return f"gs://{bucket}/{key}"

def get_storage_provider_cls(self) -> Type[StorageProviderBase]:
# Return the StorageProvider class of this plugin
Expand All @@ -46,7 +46,7 @@ def test_storage_nonempty_directory(self, tmp_path):
tmpdir = "test_nonemptydir"

# store the directory
obj = self._get_obj(tmp_path, f"gcs://snakemake-test-bucket/{tmpdir}")
obj = self._get_obj(tmp_path, f"gs://snakemake-test-bucket/{tmpdir}")

stored = False
try:
Expand Down Expand Up @@ -83,11 +83,11 @@ def test_storage_nonempty_directory(self, tmp_path):
shutil.rmtree(obj.local_path())

def test_list_candidate_matches(self, tmp_path):
obj = self._get_obj(tmp_path, "gcs://snakemake-test-bucket/")
obj = self._get_obj(tmp_path, "gs://snakemake-test-bucket/")
candidates = list(obj.list_candidate_matches())
# I think the previous test deletes the first test_object
expected_matches = [
"gcs://snakemake-test-bucket/test-file_2.txt",
"gcs://snakemake-test-bucket/test-file_3.txt",
"gs://snakemake-test-bucket/test-file_2.txt",
"gs://snakemake-test-bucket/test-file_3.txt",
]
assert candidates == expected_matches

0 comments on commit 099e87f

Please sign in to comment.