airbyte.caches.util

Utility functions for working with caches.

  1# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
  2"""Utility functions for working with caches."""
  3
  4from __future__ import annotations
  5
  6from pathlib import Path
  7
  8import ulid
  9
 10from airbyte import exceptions as exc
 11from airbyte.caches.duckdb import DuckDBCache
 12
 13
 14# Google Drive constants:
 15
 16_MY_DRIVE = "MyDrive"
 17"""The default name of the user's personal Google Drive."""
 18
 19_GOOGLE_DRIVE_DEFAULT_MOUNT_PATH = "/content/drive"
 20"""The recommended path to mount Google Drive to."""
 21
 22
 23# Utility functions:
 24
 25
 26def get_default_cache() -> DuckDBCache:
 27    """Get a local DuckDB cache for storing data, using the default database path.
 28
 29    Cache files are stored in `./.cache/default_cache`, relative to the current
 30    working directory; the database file there is named `default_cache.duckdb`.
 31    """
 32    cache_dir = Path("./.cache/default_cache")
 33    return DuckDBCache(
 34        db_path=cache_dir / "default_cache.duckdb",
 35        cache_dir=cache_dir,
 36    )
 37
 38
 39def new_local_cache(
 40    cache_name: str | None = None,
 41    cache_dir: str | Path | None = None,
 42    *,
 43    cleanup: bool = True,
 44) -> DuckDBCache:
 45    """Get a local DuckDB cache for storing data, using a name string to seed the path.
 46
 47    Args:
 48        cache_name: Alphanumeric/underscore name for the cache. Defaults to a new ULID.
 49        cache_dir: Root directory to store the cache in. Defaults to `./.cache/{cache_name}`.
 50        cleanup: Whether to clean up temporary files. Defaults to True.
 51
 52    Unless `cache_dir` is given, cache files are stored in the `.cache` directory,
 53    relative to the current working directory.
 54    """
 55    if cache_name:
 56        if " " in cache_name:  # spaces get their own, more specific message
 57            raise exc.PyAirbyteInputError(
 58                message="Cache name cannot contain spaces.",
 59                input_value=cache_name,
 60            )
 61
 62        if not cache_name.replace("_", "").isalnum():
 63            raise exc.PyAirbyteInputError(
 64                message="Cache name can only contain alphanumeric characters and underscores.",
 65                input_value=cache_name,
 66            )
 67
 68    cache_name = cache_name or str(ulid.ULID())  # no name given: generate a ULID-based one
 69    cache_dir = cache_dir or Path(f"./.cache/{cache_name}")
 70    if not isinstance(cache_dir, Path):
 71        cache_dir = Path(cache_dir)
 72
 73    return DuckDBCache(
 74        db_path=cache_dir / f"db_{cache_name}.duckdb",
 75        cache_dir=cache_dir,
 76        cleanup=cleanup,
 77    )
 78
 79
 80def get_colab_cache(
 81    cache_name: str = "default_cache",
 82    sub_dir: str = "Airbyte/cache",
 83    schema_name: str = "main",
 84    table_prefix: str | None = "",
 85    drive_name: str = _MY_DRIVE,
 86    mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH,
 87) -> DuckDBCache:
 88    """Get a DuckDB cache stored in Google Drive, for use in Google Colab notebooks.
 89
 90    Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple
 91    Colab sessions.
 92
 93    Please note that Google Colab may prompt you to authenticate with your Google account to access
 94    your Google Drive. When prompted, click the link and follow the instructions.
 95
 96    Colab will require access to read and write files in your Google Drive, so please be sure to
 97    grant the necessary permissions when prompted.
 98
 99    All arguments are optional and have default values that are suitable for most use cases.
100
101    Args:
102        cache_name: The name to use for the cache. Defaults to "default_cache". Override this if you
103            want to use a different database for different projects.
104        sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this
105            if you want to store the cache in a different subdirectory than the default.
106        schema_name: The name of the schema to write to. Defaults to "main". Override this if you
107            want to write to a different schema.
108        table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this
109            if you want to use a different prefix for all tables.
110        drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you
111            want to store data in a shared drive instead of your personal drive.
112        mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this
113            if you want to mount Google Drive to a different path (not recommended).
114
115    ## Usage Examples
116
117    The default `get_colab_cache` arguments are suitable for most use cases:
118
119    ```python
120    from airbyte.caches.util import get_colab_cache
121
122    colab_cache = get_colab_cache()
123    ```
124
125    Or you can call `get_colab_cache` with custom arguments:
126
127    ```python
128    custom_cache = get_colab_cache(
129        cache_name="my_custom_cache",
130        sub_dir="Airbyte/custom_cache",
131        drive_name="My Company Drive",
132    )
133    ```
134    """
135    try:
136        from google.colab import drive  # noqa: PLC0415 # type: ignore[reportMissingImports]
137    except ImportError:
138        drive = None  # NOTE(review): never read (the raise below always exits); presumably for type checkers
139        msg = (
140            "The `google.colab` interface is only available in Google Colab. "
141            "Please run this code in a Google Colab notebook."
142        )
143        raise ImportError(msg) from None
144
145    drive.mount(mount_path)
146    drive_root = (
147        Path(mount_path) / drive_name
148        if drive_name == _MY_DRIVE
149        else Path(mount_path) / "Shareddrives" / drive_name  # any other name is looked up under "Shareddrives"
150    )
151
152    cache_dir = drive_root / sub_dir
153    cache_dir.mkdir(parents=True, exist_ok=True)  # create the cache folder if it is missing
154    db_file_path = cache_dir / f"{cache_name}.duckdb"
155
156    print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.")
157    return DuckDBCache(
158        db_path=db_file_path,
159        cache_dir=cache_dir,
160        schema_name=schema_name,
161        table_prefix=table_prefix,
162    )
def get_default_cache() -> airbyte.DuckDBCache:
27def get_default_cache() -> DuckDBCache:
28    """Get a local DuckDB cache for storing data, using the default database path.
29
30    Cache files are stored in `./.cache/default_cache`, relative to the current
31    working directory; the database file there is named `default_cache.duckdb`.
32    """
33    cache_dir = Path("./.cache/default_cache")
34    return DuckDBCache(
35        db_path=cache_dir / "default_cache.duckdb",
36        cache_dir=cache_dir,
37    )

Get a local cache for storing data, using the default database path.

Cache files are stored in the .cache directory, relative to the current working directory.

def new_local_cache( cache_name: str | None = None, cache_dir: str | pathlib.Path | None = None, *, cleanup: bool = True) -> airbyte.DuckDBCache:
40def new_local_cache(
41    cache_name: str | None = None,
42    cache_dir: str | Path | None = None,
43    *,
44    cleanup: bool = True,
45) -> DuckDBCache:
46    """Get a local DuckDB cache for storing data, using a name string to seed the path.
47
48    Args:
49        cache_name: Alphanumeric/underscore name for the cache. Defaults to a new ULID.
50        cache_dir: Root directory to store the cache in. Defaults to `./.cache/{cache_name}`.
51        cleanup: Whether to clean up temporary files. Defaults to True.
52
53    Unless `cache_dir` is given, cache files are stored in the `.cache` directory,
54    relative to the current working directory.
55    """
56    if cache_name:
57        if " " in cache_name:  # spaces get their own, more specific message
58            raise exc.PyAirbyteInputError(
59                message="Cache name cannot contain spaces.",
60                input_value=cache_name,
61            )
62
63        if not cache_name.replace("_", "").isalnum():
64            raise exc.PyAirbyteInputError(
65                message="Cache name can only contain alphanumeric characters and underscores.",
66                input_value=cache_name,
67            )
68
69    cache_name = cache_name or str(ulid.ULID())  # no name given: generate a ULID-based one
70    cache_dir = cache_dir or Path(f"./.cache/{cache_name}")
71    if not isinstance(cache_dir, Path):
72        cache_dir = Path(cache_dir)
73
74    return DuckDBCache(
75        db_path=cache_dir / f"db_{cache_name}.duckdb",
76        cache_dir=cache_dir,
77        cleanup=cleanup,
78    )

Get a local cache for storing data, using a name string to seed the path.

Arguments:
  • cache_name: Name to use for the cache; may contain only alphanumeric characters and underscores. Defaults to None, in which case a ULID is generated.
  • cache_dir: Root directory to store the cache in. Defaults to None, in which case ./.cache/<cache_name> is used.
  • cleanup: Whether to clean up temporary files. Defaults to True.

Unless cache_dir is given, cache files are stored in the .cache directory, relative to the current working directory.

def get_colab_cache( cache_name: str = 'default_cache', sub_dir: str = 'Airbyte/cache', schema_name: str = 'main', table_prefix: str | None = '', drive_name: str = 'MyDrive', mount_path: str = '/content/drive') -> airbyte.DuckDBCache:
 81def get_colab_cache(
 82    cache_name: str = "default_cache",
 83    sub_dir: str = "Airbyte/cache",
 84    schema_name: str = "main",
 85    table_prefix: str | None = "",
 86    drive_name: str = _MY_DRIVE,
 87    mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH,
 88) -> DuckDBCache:
 89    """Get a DuckDB cache stored in Google Drive, for use in Google Colab notebooks.
 90
 91    Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple
 92    Colab sessions.
 93
 94    Please note that Google Colab may prompt you to authenticate with your Google account to access
 95    your Google Drive. When prompted, click the link and follow the instructions.
 96
 97    Colab will require access to read and write files in your Google Drive, so please be sure to
 98    grant the necessary permissions when prompted.
 99
100    All arguments are optional and have default values that are suitable for most use cases.
101
102    Args:
103        cache_name: The name to use for the cache. Defaults to "default_cache". Override this if you
104            want to use a different database for different projects.
105        sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this
106            if you want to store the cache in a different subdirectory than the default.
107        schema_name: The name of the schema to write to. Defaults to "main". Override this if you
108            want to write to a different schema.
109        table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this
110            if you want to use a different prefix for all tables.
111        drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you
112            want to store data in a shared drive instead of your personal drive.
113        mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this
114            if you want to mount Google Drive to a different path (not recommended).
115
116    ## Usage Examples
117
118    The default `get_colab_cache` arguments are suitable for most use cases:
119
120    ```python
121    from airbyte.caches.util import get_colab_cache
122
123    colab_cache = get_colab_cache()
124    ```
125
126    Or you can call `get_colab_cache` with custom arguments:
127
128    ```python
129    custom_cache = get_colab_cache(
130        cache_name="my_custom_cache",
131        sub_dir="Airbyte/custom_cache",
132        drive_name="My Company Drive",
133    )
134    ```
135    """
136    try:
137        from google.colab import drive  # noqa: PLC0415 # type: ignore[reportMissingImports]
138    except ImportError:
139        drive = None  # NOTE(review): never read (the raise below always exits); presumably for type checkers
140        msg = (
141            "The `google.colab` interface is only available in Google Colab. "
142            "Please run this code in a Google Colab notebook."
143        )
144        raise ImportError(msg) from None
145
146    drive.mount(mount_path)
147    drive_root = (
148        Path(mount_path) / drive_name
149        if drive_name == _MY_DRIVE
150        else Path(mount_path) / "Shareddrives" / drive_name  # any other name is looked up under "Shareddrives"
151    )
152
153    cache_dir = drive_root / sub_dir
154    cache_dir.mkdir(parents=True, exist_ok=True)  # create the cache folder if it is missing
155    db_file_path = cache_dir / f"{cache_name}.duckdb"
156
157    print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.")
158    return DuckDBCache(
159        db_path=db_file_path,
160        cache_dir=cache_dir,
161        schema_name=schema_name,
162        table_prefix=table_prefix,
163    )

Get a DuckDB cache stored in Google Drive, for use in Google Colab notebooks.

Unlike the default DuckDBCache, this implementation will easily persist data across multiple Colab sessions.

Please note that Google Colab may prompt you to authenticate with your Google account to access your Google Drive. When prompted, click the link and follow the instructions.

Colab will require access to read and write files in your Google Drive, so please be sure to grant the necessary permissions when prompted.

All arguments are optional and have default values that are suitable for most use cases.

Arguments:
  • cache_name: The name to use for the cache. Defaults to "default_cache". Override this if you want to use a different database for different projects.
  • sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this if you want to store the cache in a different subdirectory than the default.
  • schema_name: The name of the schema to write to. Defaults to "main". Override this if you want to write to a different schema.
  • table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this if you want to use a different prefix for all tables.
  • drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you want to store data in a shared drive instead of your personal drive.
  • mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this if you want to mount Google Drive to a different path (not recommended).

Usage Examples

The default get_colab_cache arguments are suitable for most use cases:

from airbyte.caches.util import get_colab_cache

colab_cache = get_colab_cache()

Or you can call get_colab_cache with custom arguments:

custom_cache = get_colab_cache(
    cache_name="my_custom_cache",
    sub_dir="Airbyte/custom_cache",
    drive_name="My Company Drive",
)