airbyte.caches.util

Utility functions for working with caches.

  1# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
  2"""Utility functions for working with caches."""
  3
  4from __future__ import annotations
  5
  6from pathlib import Path
  7
  8import ulid
  9
 10from airbyte import exceptions as exc
 11from airbyte.caches.duckdb import DuckDBCache
 12from airbyte.constants import (
 13    DEFAULT_CACHE_ROOT,
 14    DEFAULT_GOOGLE_DRIVE_MOUNT_PATH,
 15)
 16
 17
# Google drive constants:

# Default `drive_name` for `get_colab_cache`. Any other drive name is resolved
# under the "Shareddrives" folder of the mount path instead.
_MY_DRIVE = "MyDrive"
"""The default name of the user's personal Google Drive."""

# Alias of `airbyte.constants.DEFAULT_GOOGLE_DRIVE_MOUNT_PATH`; used as the
# default `mount_path` for `get_colab_cache`.
_GOOGLE_DRIVE_DEFAULT_MOUNT_PATH = DEFAULT_GOOGLE_DRIVE_MOUNT_PATH
"""The recommended path to mount Google Drive to."""
 25
 26
 27# Utility functions:
 28
 29
 30def get_default_cache() -> DuckDBCache:
 31    """Get a local cache for storing data, using the default database path.
 32
 33    Cache files are stored in the `.cache` directory, relative to the current
 34    working directory.
 35    """
 36    cache_dir = DEFAULT_CACHE_ROOT / "default_cache"
 37    return DuckDBCache(
 38        db_path=cache_dir / "default_cache.duckdb",
 39        cache_dir=cache_dir,
 40    )
 41
 42
 43def new_local_cache(
 44    cache_name: str | None = None,
 45    cache_dir: str | Path | None = None,
 46    *,
 47    cleanup: bool = True,
 48) -> DuckDBCache:
 49    """Get a local cache for storing data, using a name string to seed the path.
 50
 51    Args:
 52        cache_name: Name to use for the cache. Defaults to None.
 53        cache_dir: Root directory to store the cache in. Defaults to None.
 54        cleanup: Whether to clean up temporary files. Defaults to True.
 55
 56    Cache files are stored in the `.cache` directory, relative to the current
 57    working directory.
 58    """
 59    if cache_name:
 60        if " " in cache_name:
 61            raise exc.PyAirbyteInputError(
 62                message="Cache name cannot contain spaces.",
 63                input_value=cache_name,
 64            )
 65
 66        if not cache_name.replace("_", "").isalnum():
 67            raise exc.PyAirbyteInputError(
 68                message="Cache name can only contain alphanumeric characters and underscores.",
 69                input_value=cache_name,
 70            )
 71
 72    cache_name = cache_name or str(ulid.ULID())
 73    cache_dir = cache_dir or (DEFAULT_CACHE_ROOT / cache_name)
 74    if not isinstance(cache_dir, Path):
 75        cache_dir = Path(cache_dir)
 76
 77    return DuckDBCache(
 78        db_path=cache_dir / f"db_{cache_name}.duckdb",
 79        cache_dir=cache_dir,
 80        cleanup=cleanup,
 81    )
 82
 83
 84def get_colab_cache(
 85    cache_name: str = "default_cache",
 86    sub_dir: str = "Airbyte/cache",
 87    schema_name: str = "main",
 88    table_prefix: str | None = "",
 89    drive_name: str = _MY_DRIVE,
 90    mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH,
 91) -> DuckDBCache:
 92    """Get a local cache for storing data, using the default database path.
 93
 94    Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple
 95    Colab sessions.
 96
 97    Please note that Google Colab may prompt you to authenticate with your Google account to access
 98    your Google Drive. When prompted, click the link and follow the instructions.
 99
100    Colab will require access to read and write files in your Google Drive, so please be sure to
101    grant the necessary permissions when prompted.
102
103    All arguments are optional and have default values that are suitable for most use cases.
104
105    Args:
106        cache_name: The name to use for the cache. Defaults to "colab_cache". Override this if you
107            want to use a different database for different projects.
108        sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this
109            if you want to store the cache in a different subdirectory than the default.
110        schema_name: The name of the schema to write to. Defaults to "main". Override this if you
111            want to write to a different schema.
112        table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this
113            if you want to use a different prefix for all tables.
114        drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you
115            want to store data in a shared drive instead of your personal drive.
116        mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this
117            if you want to mount Google Drive to a different path (not recommended).
118
119    ## Usage Examples
120
121    The default `get_colab_cache` arguments are suitable for most use cases:
122
123    ```python
124    from airbyte.caches.colab import get_colab_cache
125
126    colab_cache = get_colab_cache()
127    ```
128
129    Or you can call `get_colab_cache` with custom arguments:
130
131    ```python
132    custom_cache = get_colab_cache(
133        cache_name="my_custom_cache",
134        sub_dir="Airbyte/custom_cache",
135        drive_name="My Company Drive",
136    )
137    ```
138    """
139    try:
140        from google.colab import drive  # noqa: PLC0415 # type: ignore[reportMissingImports]
141    except ImportError:
142        drive = None
143        msg = (
144            "The `google.colab` interface is only available in Google Colab. "
145            "Please run this code in a Google Colab notebook."
146        )
147        raise ImportError(msg) from None
148
149    drive.mount(mount_path)
150    drive_root = (
151        Path(mount_path) / drive_name
152        if drive_name == _MY_DRIVE
153        else Path(mount_path) / "Shareddrives" / drive_name
154    )
155
156    cache_dir = drive_root / sub_dir
157    cache_dir.mkdir(parents=True, exist_ok=True)
158    db_file_path = cache_dir / f"{cache_name}.duckdb"
159
160    print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.")
161    return DuckDBCache(
162        db_path=db_file_path,
163        cache_dir=cache_dir,
164        schema_name=schema_name,
165        table_prefix=table_prefix,
166    )
def get_default_cache() -> airbyte.DuckDBCache:
def get_default_cache() -> DuckDBCache:
    """Return the default local DuckDB cache.

    The cache directory is `DEFAULT_CACHE_ROOT / "default_cache"` and the database
    file inside it is named `default_cache.duckdb`.
    """
    default_dir = DEFAULT_CACHE_ROOT / "default_cache"
    database_file = default_dir / "default_cache.duckdb"
    return DuckDBCache(db_path=database_file, cache_dir=default_dir)

Get a local cache for storing data, using the default database path.

Cache files are stored in the .cache directory, relative to the current working directory.

def new_local_cache( cache_name: str | None = None, cache_dir: str | pathlib.Path | None = None, *, cleanup: bool = True) -> airbyte.DuckDBCache:
def new_local_cache(
    cache_name: str | None = None,
    cache_dir: str | Path | None = None,
    *,
    cleanup: bool = True,
) -> DuckDBCache:
    """Create a local DuckDB cache whose file path is derived from a name.

    When `cache_name` is omitted (or empty), a newly generated ULID string is used
    as the name. When `cache_dir` is omitted (or empty), the cache lives in a
    directory named after the cache under `DEFAULT_CACHE_ROOT`. The database file
    is always `db_<cache_name>.duckdb` inside the cache directory.

    Args:
        cache_name: Name to use for the cache. Must not contain spaces and may only
            consist of alphanumeric characters and underscores. Defaults to None.
        cache_dir: Root directory to store the cache in. Defaults to None.
        cleanup: Whether to clean up temporary files (forwarded to `DuckDBCache`).
            Defaults to True.

    Raises:
        exc.PyAirbyteInputError: If `cache_name` contains a space, or contains
            anything other than alphanumeric characters and underscores.
    """
    # Spaces are checked first so they get their own, more specific, message.
    if cache_name and " " in cache_name:
        raise exc.PyAirbyteInputError(
            message="Cache name cannot contain spaces.",
            input_value=cache_name,
        )

    if cache_name and not cache_name.replace("_", "").isalnum():
        raise exc.PyAirbyteInputError(
            message="Cache name can only contain alphanumeric characters and underscores.",
            input_value=cache_name,
        )

    resolved_name = cache_name if cache_name else str(ulid.ULID())
    target_dir = cache_dir if cache_dir else DEFAULT_CACHE_ROOT / resolved_name
    target_dir = target_dir if isinstance(target_dir, Path) else Path(target_dir)

    database_file = target_dir / f"db_{resolved_name}.duckdb"
    return DuckDBCache(
        db_path=database_file,
        cache_dir=target_dir,
        cleanup=cleanup,
    )

Get a local cache for storing data, using a name string to seed the path.

Arguments:
  • cache_name: Name to use for the cache. Defaults to None.
  • cache_dir: Root directory to store the cache in. Defaults to None.
  • cleanup: Whether to clean up temporary files. Defaults to True.

Unless cache_dir is provided, cache files are stored in the .cache directory, relative to the current working directory.

def get_colab_cache( cache_name: str = 'default_cache', sub_dir: str = 'Airbyte/cache', schema_name: str = 'main', table_prefix: str | None = '', drive_name: str = 'MyDrive', mount_path: str = '/content/drive') -> airbyte.DuckDBCache:
 85def get_colab_cache(
 86    cache_name: str = "default_cache",
 87    sub_dir: str = "Airbyte/cache",
 88    schema_name: str = "main",
 89    table_prefix: str | None = "",
 90    drive_name: str = _MY_DRIVE,
 91    mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH,
 92) -> DuckDBCache:
 93    """Get a local cache for storing data, using the default database path.
 94
 95    Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple
 96    Colab sessions.
 97
 98    Please note that Google Colab may prompt you to authenticate with your Google account to access
 99    your Google Drive. When prompted, click the link and follow the instructions.
100
101    Colab will require access to read and write files in your Google Drive, so please be sure to
102    grant the necessary permissions when prompted.
103
104    All arguments are optional and have default values that are suitable for most use cases.
105
106    Args:
107        cache_name: The name to use for the cache. Defaults to "colab_cache". Override this if you
108            want to use a different database for different projects.
109        sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this
110            if you want to store the cache in a different subdirectory than the default.
111        schema_name: The name of the schema to write to. Defaults to "main". Override this if you
112            want to write to a different schema.
113        table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this
114            if you want to use a different prefix for all tables.
115        drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you
116            want to store data in a shared drive instead of your personal drive.
117        mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this
118            if you want to mount Google Drive to a different path (not recommended).
119
120    ## Usage Examples
121
122    The default `get_colab_cache` arguments are suitable for most use cases:
123
124    ```python
125    from airbyte.caches.colab import get_colab_cache
126
127    colab_cache = get_colab_cache()
128    ```
129
130    Or you can call `get_colab_cache` with custom arguments:
131
132    ```python
133    custom_cache = get_colab_cache(
134        cache_name="my_custom_cache",
135        sub_dir="Airbyte/custom_cache",
136        drive_name="My Company Drive",
137    )
138    ```
139    """
140    try:
141        from google.colab import drive  # noqa: PLC0415 # type: ignore[reportMissingImports]
142    except ImportError:
143        drive = None
144        msg = (
145            "The `google.colab` interface is only available in Google Colab. "
146            "Please run this code in a Google Colab notebook."
147        )
148        raise ImportError(msg) from None
149
150    drive.mount(mount_path)
151    drive_root = (
152        Path(mount_path) / drive_name
153        if drive_name == _MY_DRIVE
154        else Path(mount_path) / "Shareddrives" / drive_name
155    )
156
157    cache_dir = drive_root / sub_dir
158    cache_dir.mkdir(parents=True, exist_ok=True)
159    db_file_path = cache_dir / f"{cache_name}.duckdb"
160
161    print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.")
162    return DuckDBCache(
163        db_path=db_file_path,
164        cache_dir=cache_dir,
165        schema_name=schema_name,
166        table_prefix=table_prefix,
167    )

Get a DuckDB cache stored in Google Drive, for persisting data when running in Google Colab.

Unlike the default DuckDBCache, this implementation will easily persist data across multiple Colab sessions.

Please note that Google Colab may prompt you to authenticate with your Google account to access your Google Drive. When prompted, click the link and follow the instructions.

Colab will require access to read and write files in your Google Drive, so please be sure to grant the necessary permissions when prompted.

All arguments are optional and have default values that are suitable for most use cases.

Arguments:
  • cache_name: The name to use for the cache. Defaults to "default_cache". Override this if you want to use a different database for different projects.
  • sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this if you want to store the cache in a different subdirectory than the default.
  • schema_name: The name of the schema to write to. Defaults to "main". Override this if you want to write to a different schema.
  • table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this if you want to use a different prefix for all tables.
  • drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you want to store data in a shared drive instead of your personal drive.
  • mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this if you want to mount Google Drive to a different path (not recommended).

Usage Examples

The default get_colab_cache arguments are suitable for most use cases:

from airbyte.caches.util import get_colab_cache

colab_cache = get_colab_cache()

Or you can call get_colab_cache with custom arguments:

custom_cache = get_colab_cache(
    cache_name="my_custom_cache",
    sub_dir="Airbyte/custom_cache",
    drive_name="My Company Drive",
)