Skip to content

Commit

Permalink
Feat: Easy persistent cache with new ab.get_colab_cache helper func…
Browse files Browse the repository at this point in the history
…tion (#361)
  • Loading branch information
aaronsteers authored Sep 15, 2024
1 parent c5fea25 commit 2f27c87
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 3 deletions.
7 changes: 5 additions & 2 deletions airbyte/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@
from airbyte import (
caches,
cloud,
constants,
datasets,
destinations,
documents,
Expand All @@ -139,7 +140,7 @@
)
from airbyte.caches.bigquery import BigQueryCache
from airbyte.caches.duckdb import DuckDBCache
from airbyte.caches.util import get_default_cache, new_local_cache
from airbyte.caches.util import get_colab_cache, get_default_cache, new_local_cache
from airbyte.datasets import CachedDataset
from airbyte.destinations.base import Destination
from airbyte.destinations.util import get_destination
Expand All @@ -154,8 +155,9 @@

__all__ = [
# Modules
"cloud",
"caches",
"cloud",
"constants",
"datasets",
"destinations",
"documents",
Expand All @@ -169,6 +171,7 @@
"sources",
# Factories
"get_available_connectors",
"get_colab_cache",
"get_default_cache",
"get_destination",
"get_secret",
Expand Down
3 changes: 2 additions & 1 deletion airbyte/caches/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from airbyte_protocol.models import ConfiguredAirbyteCatalog

from airbyte import constants
from airbyte._writers.base import AirbyteWriterInterface
from airbyte.caches._catalog_backend import CatalogBackendBase, SqlCatalogBackend
from airbyte.caches._state_backend import SqlStateBackend
Expand Down Expand Up @@ -50,7 +51,7 @@ class CacheBase(SqlConfig, AirbyteWriterInterface):
to the SQL backend specified in the `SqlConfig` class.
"""

cache_dir: Path = Field(default=Path(".cache"))
cache_dir: Path = Field(default=Path(constants.DEFAULT_CACHE_ROOT))
"""The directory to store the cache in."""

cleanup: bool = TEMP_FILE_CLEANUP
Expand Down
97 changes: 97 additions & 0 deletions airbyte/caches/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@
from airbyte.caches.duckdb import DuckDBCache


# Google drive constants:
# These are the default values for the `drive_name` and `mount_path` arguments
# of `get_colab_cache`.

# `get_colab_cache` resolves this drive name directly under the mount path; any
# other drive name is resolved under the mount path's "Shareddrives" folder.
_MY_DRIVE = "MyDrive"
"""The default name of the user's personal Google Drive."""

# Passed as-is to `drive.mount()` by `get_colab_cache`.
_GOOGLE_DRIVE_DEFAULT_MOUNT_PATH = "/content/drive"
"""The recommended path to mount Google Drive to."""


# Utility functions:


def get_default_cache() -> DuckDBCache:
"""Get a local cache for storing data, using the default database path.
Expand Down Expand Up @@ -63,3 +75,88 @@ def new_local_cache(
cache_dir=cache_dir,
cleanup=cleanup,
)


def get_colab_cache(
    cache_name: str = "default_cache",
    sub_dir: str = "Airbyte/cache",
    schema_name: str = "main",
    table_prefix: str | None = "",
    drive_name: str = _MY_DRIVE,
    mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH,
) -> DuckDBCache:
    """Get a DuckDB cache that is stored in Google Drive, for use in Google Colab.

    Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple
    Colab sessions.

    Please note that Google Colab may prompt you to authenticate with your Google account to access
    your Google Drive. When prompted, click the link and follow the instructions.

    Colab will require access to read and write files in your Google Drive, so please be sure to
    grant the necessary permissions when prompted.

    All arguments are optional and have default values that are suitable for most use cases.

    Args:
        cache_name: The name to use for the cache. Defaults to "default_cache". Override this if
            you want to use a different database for different projects. The database file is
            named `{cache_name}.duckdb`.
        sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this
            if you want to store the cache in a different subdirectory than the default. The
            directory is created if it does not already exist.
        schema_name: The name of the schema to write to. Defaults to "main". Override this if you
            want to write to a different schema.
        table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this
            if you want to use a different prefix for all tables.
        drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you
            want to store data in a shared drive instead of your personal drive.
        mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this
            if you want to mount Google Drive to a different path (not recommended).

    Returns:
        A `DuckDBCache` whose database file and cache directory are both located in the
        mounted Google Drive folder.

    Raises:
        ImportError: If the `google.colab` package cannot be imported, i.e. when this function
            is called outside of a Google Colab notebook.

    ## Usage Examples

    The default `get_colab_cache` arguments are suitable for most use cases:

    ```python
    from airbyte.caches.util import get_colab_cache

    colab_cache = get_colab_cache()
    ```

    Or you can call `get_colab_cache` with custom arguments:

    ```python
    custom_cache = get_colab_cache(
        cache_name="my_custom_cache",
        sub_dir="Airbyte/custom_cache",
        drive_name="My Company Drive",
    )
    ```
    """
    # Imported lazily because `google.colab` is only importable inside a Colab runtime.
    try:
        from google.colab import drive  # noqa: PLC0415  # type: ignore[reportMissingImports]
    except ImportError:
        msg = (
            "The `google.colab` interface is only available in Google Colab. "
            "Please run this code in a Google Colab notebook."
        )
        # `from None` hides the low-level import traceback in favor of the message above.
        raise ImportError(msg) from None

    drive.mount(mount_path)

    # The personal drive sits directly under the mount point; every other drive name is
    # treated as a shared drive and looked up under the "Shareddrives" folder.
    mount_root = Path(mount_path)
    if drive_name == _MY_DRIVE:
        drive_root = mount_root / drive_name
    else:
        drive_root = mount_root / "Shareddrives" / drive_name

    cache_dir = drive_root / sub_dir
    cache_dir.mkdir(parents=True, exist_ok=True)
    db_file_path = cache_dir / f"{cache_name}.duckdb"

    print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.")
    return DuckDBCache(
        db_path=db_file_path,
        cache_dir=cache_dir,
        schema_name=schema_name,
        table_prefix=table_prefix,
    )
15 changes: 15 additions & 0 deletions airbyte/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import annotations

import os
from pathlib import Path


DEBUG_MODE = False # Set to True to enable additional debug logging.
Expand Down Expand Up @@ -41,6 +42,20 @@
Specific caches may override this value with a different schema name.
"""

DEFAULT_CACHE_ROOT: Path = Path(os.environ.get("AIRBYTE_CACHE_ROOT", ".cache"))
"""Root directory for cache files; `.cache` in the current working directory by default.

Set the `AIRBYTE_CACHE_ROOT` environment variable to use a different location.

This is handy when cache files should always land in one specific place. For instance, in
ephemeral environments such as Google Colab, you could point it at your mounted Google Drive
with a path like `/content/drive/MyDrive/Airbyte/cache`.
"""

DEFAULT_ARROW_MAX_CHUNK_SIZE = 100_000
"""The default number of records to include in each batch of an Arrow dataset."""

Expand Down

0 comments on commit 2f27c87

Please sign in to comment.