diff --git a/airbyte/__init__.py b/airbyte/__init__.py index ab927734..93037fea 100644 --- a/airbyte/__init__.py +++ b/airbyte/__init__.py @@ -126,6 +126,7 @@ from airbyte import ( caches, cloud, + constants, datasets, destinations, documents, @@ -139,7 +140,7 @@ ) from airbyte.caches.bigquery import BigQueryCache from airbyte.caches.duckdb import DuckDBCache -from airbyte.caches.util import get_default_cache, new_local_cache +from airbyte.caches.util import get_colab_cache, get_default_cache, new_local_cache from airbyte.datasets import CachedDataset from airbyte.destinations.base import Destination from airbyte.destinations.util import get_destination @@ -154,8 +155,9 @@ __all__ = [ # Modules - "cloud", "caches", + "cloud", + "constants", "datasets", "destinations", "documents", @@ -169,6 +171,7 @@ "sources", # Factories "get_available_connectors", + "get_colab_cache", "get_default_cache", "get_destination", "get_secret", diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index cc168c71..ce917358 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -13,6 +13,7 @@ from airbyte_protocol.models import ConfiguredAirbyteCatalog +from airbyte import constants from airbyte._writers.base import AirbyteWriterInterface from airbyte.caches._catalog_backend import CatalogBackendBase, SqlCatalogBackend from airbyte.caches._state_backend import SqlStateBackend @@ -50,7 +51,7 @@ class CacheBase(SqlConfig, AirbyteWriterInterface): to the SQL backend specified in the `SqlConfig` class. 
""" - cache_dir: Path = Field(default=Path(".cache")) + cache_dir: Path = Field(default=Path(constants.DEFAULT_CACHE_ROOT)) """The directory to store the cache in.""" cleanup: bool = TEMP_FILE_CLEANUP diff --git a/airbyte/caches/util.py b/airbyte/caches/util.py index d1cf2128..1d192406 100644 --- a/airbyte/caches/util.py +++ b/airbyte/caches/util.py @@ -11,6 +11,18 @@ from airbyte.caches.duckdb import DuckDBCache +# Google drive constants: + +_MY_DRIVE = "MyDrive" +"""The default name of the user's personal Google Drive.""" + +_GOOGLE_DRIVE_DEFAULT_MOUNT_PATH = "/content/drive" +"""The recommended path to mount Google Drive to.""" + + +# Utility functions: + + def get_default_cache() -> DuckDBCache: """Get a local cache for storing data, using the default database path. @@ -63,3 +75,88 @@ def new_local_cache( cache_dir=cache_dir, cleanup=cleanup, ) + + +def get_colab_cache( + cache_name: str = "default_cache", + sub_dir: str = "Airbyte/cache", + schema_name: str = "main", + table_prefix: str | None = "", + drive_name: str = _MY_DRIVE, + mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH, +) -> DuckDBCache: + """Get a local cache for storing data, using the default database path. + + Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple + Colab sessions. + + Please note that Google Colab may prompt you to authenticate with your Google account to access + your Google Drive. When prompted, click the link and follow the instructions. + + Colab will require access to read and write files in your Google Drive, so please be sure to + grant the necessary permissions when prompted. + + All arguments are optional and have default values that are suitable for most use cases. + + Args: + cache_name: The name to use for the cache. Defaults to "colab_cache". Override this if you + want to use a different database for different projects. + sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". 
Override this + if you want to store the cache in a different subdirectory than the default. + schema_name: The name of the schema to write to. Defaults to "main". Override this if you + want to write to a different schema. + table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this + if you want to use a different prefix for all tables. + drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you + want to store data in a shared drive instead of your personal drive. + mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this + if you want to mount Google Drive to a different path (not recommended). + + ## Usage Examples + + The default `get_colab_cache` arguments are suitable for most use cases: + + ```python + from airbyte.caches.util import get_colab_cache + + colab_cache = get_colab_cache() + ``` + + Or you can call `get_colab_cache` with custom arguments: + + ```python + custom_cache = get_colab_cache( + cache_name="my_custom_cache", + sub_dir="Airbyte/custom_cache", + drive_name="My Company Drive", + ) + ``` + """ + try: + from google.colab import drive # noqa: PLC0415 # type: ignore[reportMissingImports] + except ImportError: + drive = None + msg = ( + "The `google.colab` interface is only available in Google Colab. " + "Please run this code in a Google Colab notebook." 
+ ) + raise ImportError(msg) from None + + drive.mount(mount_path) + drive_root = ( + Path(mount_path) / drive_name + if drive_name == _MY_DRIVE + else Path(mount_path) / "Shareddrives" / drive_name + ) + + cache_dir = drive_root / sub_dir + cache_dir.mkdir(parents=True, exist_ok=True) + db_file_path = cache_dir / f"{cache_name}.duckdb" + + print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.") + return DuckDBCache( + db_path=db_file_path, + cache_dir=cache_dir, + schema_name=schema_name, + table_prefix=table_prefix, + ) diff --git a/airbyte/constants.py b/airbyte/constants.py index f4f4b969..94548d03 100644 --- a/airbyte/constants.py +++ b/airbyte/constants.py @@ -4,6 +4,7 @@ from __future__ import annotations import os +from pathlib import Path DEBUG_MODE = False # Set to True to enable additional debug logging. @@ -41,6 +42,20 @@ Specific caches may override this value with a different schema name. """ +DEFAULT_CACHE_ROOT: Path = ( + Path() / ".cache" + if "AIRBYTE_CACHE_ROOT" not in os.environ + else Path(os.environ["AIRBYTE_CACHE_ROOT"]) +) +"""Default cache root is `.cache` in the current working directory. + +The default location can be overridden by setting the `AIRBYTE_CACHE_ROOT` environment variable. + +Overriding this can be useful if you always want to store cache files in a specific location. +For example, in ephemeral environments like Google Colab, you might want to store cache files in +your mounted Google Drive by setting this to a path like `/content/drive/MyDrive/Airbyte/cache`. +""" + DEFAULT_ARROW_MAX_CHUNK_SIZE = 100_000 """The default number of records to include in each batch of an Arrow dataset."""