airbyte.caches.util
Utility functions for working with caches.
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
"""Utility functions for working with caches."""

from __future__ import annotations

from pathlib import Path

import ulid

from airbyte import exceptions as exc
from airbyte.caches.duckdb import DuckDBCache
from airbyte.constants import (
    DEFAULT_CACHE_ROOT,
    DEFAULT_GOOGLE_DRIVE_MOUNT_PATH,
)


# Google drive constants:

_MY_DRIVE = "MyDrive"
"""The default name of the user's personal Google Drive."""

_GOOGLE_DRIVE_DEFAULT_MOUNT_PATH = DEFAULT_GOOGLE_DRIVE_MOUNT_PATH
"""The recommended path to mount Google Drive to."""


# Utility functions:


def get_default_cache() -> DuckDBCache:
    """Get a local cache for storing data, using the default database path.

    Cache files are stored in the `.cache` directory, relative to the current
    working directory.
    """
    cache_dir = DEFAULT_CACHE_ROOT / "default_cache"
    return DuckDBCache(
        db_path=cache_dir / "default_cache.duckdb",
        cache_dir=cache_dir,
    )


def new_local_cache(
    cache_name: str | None = None,
    cache_dir: str | Path | None = None,
    *,
    cleanup: bool = True,
) -> DuckDBCache:
    """Get a local cache for storing data, using a name string to seed the path.

    Args:
        cache_name: Name to use for the cache. Defaults to None, in which case a
            new ULID string is generated and used as the name.
        cache_dir: Root directory to store the cache in. Defaults to None, in which
            case a directory named after the cache under `DEFAULT_CACHE_ROOT` is used.
        cleanup: Whether to clean up temporary files. Defaults to True.

    Raises:
        PyAirbyteInputError: If `cache_name` contains spaces or any character other
            than alphanumerics and underscores.

    Unless `cache_dir` is provided, cache files are stored in the `.cache` directory,
    relative to the current working directory.
    """
    if cache_name:
        # Spaces get their own, more specific error message before the general check.
        if " " in cache_name:
            raise exc.PyAirbyteInputError(
                message="Cache name cannot contain spaces.",
                input_value=cache_name,
            )

        if not cache_name.replace("_", "").isalnum():
            raise exc.PyAirbyteInputError(
                message="Cache name can only contain alphanumeric characters and underscores.",
                input_value=cache_name,
            )

    cache_name = cache_name or str(ulid.ULID())
    cache_dir = cache_dir or (DEFAULT_CACHE_ROOT / cache_name)
    if not isinstance(cache_dir, Path):
        cache_dir = Path(cache_dir)

    return DuckDBCache(
        db_path=cache_dir / f"db_{cache_name}.duckdb",
        cache_dir=cache_dir,
        cleanup=cleanup,
    )


def get_colab_cache(
    cache_name: str = "default_cache",
    sub_dir: str = "Airbyte/cache",
    schema_name: str = "main",
    table_prefix: str | None = "",
    drive_name: str = _MY_DRIVE,
    mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH,
) -> DuckDBCache:
    """Get a DuckDB cache that is persisted in Google Drive, for use in Google Colab.

    Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple
    Colab sessions.

    Please note that Google Colab may prompt you to authenticate with your Google account to access
    your Google Drive. When prompted, click the link and follow the instructions.

    Colab will require access to read and write files in your Google Drive, so please be sure to
    grant the necessary permissions when prompted.

    All arguments are optional and have default values that are suitable for most use cases.

    Args:
        cache_name: The name to use for the cache. Defaults to "default_cache". Override this if
            you want to use a different database for different projects.
        sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this
            if you want to store the cache in a different subdirectory than the default.
        schema_name: The name of the schema to write to. Defaults to "main". Override this if you
            want to write to a different schema.
        table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this
            if you want to use a different prefix for all tables.
        drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you
            want to store data in a shared drive instead of your personal drive.
        mount_path: The path to mount Google Drive to. Defaults to the recommended mount path.
            Override this if you want to mount Google Drive to a different path (not recommended).

    Raises:
        ImportError: If the `google.colab` package cannot be imported, i.e. the code is not
            running inside Google Colab.

    ## Usage Examples

    The default `get_colab_cache` arguments are suitable for most use cases:

    ```python
    from airbyte.caches.util import get_colab_cache

    colab_cache = get_colab_cache()
    ```

    Or you can call `get_colab_cache` with custom arguments:

    ```python
    custom_cache = get_colab_cache(
        cache_name="my_custom_cache",
        sub_dir="Airbyte/custom_cache",
        drive_name="My Company Drive",
    )
    ```
    """
    try:
        from google.colab import drive  # noqa: PLC0415  # type: ignore[reportMissingImports]
    except ImportError:
        msg = (
            "The `google.colab` interface is only available in Google Colab. "
            "Please run this code in a Google Colab notebook."
        )
        raise ImportError(msg) from None

    drive.mount(mount_path)
    # The personal drive sits directly under the mount point; any other drive name
    # is looked up under the "Shareddrives" folder.
    drive_root = (
        Path(mount_path) / drive_name
        if drive_name == _MY_DRIVE
        else Path(mount_path) / "Shareddrives" / drive_name
    )

    cache_dir = drive_root / sub_dir
    cache_dir.mkdir(parents=True, exist_ok=True)
    db_file_path = cache_dir / f"{cache_name}.duckdb"

    print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.")
    return DuckDBCache(
        db_path=db_file_path,
        cache_dir=cache_dir,
        schema_name=schema_name,
        table_prefix=table_prefix,
    )
def get_default_cache() -> DuckDBCache:
    """Return the default local DuckDB cache.

    The cache lives in the `default_cache` folder beneath `DEFAULT_CACHE_ROOT`
    (documented as the `.cache` directory, relative to the current working
    directory), and its database file is `default_cache.duckdb` in that folder.
    """
    default_root = DEFAULT_CACHE_ROOT / "default_cache"
    database_file = default_root / "default_cache.duckdb"
    return DuckDBCache(db_path=database_file, cache_dir=default_root)
Get a local cache for storing data, using the default database path.
Cache files are stored in the `.cache` directory, relative to the current working directory.
def new_local_cache(
    cache_name: str | None = None,
    cache_dir: str | Path | None = None,
    *,
    cleanup: bool = True,
) -> DuckDBCache:
    """Create a local DuckDB cache whose file paths are derived from a name.

    Args:
        cache_name: Name to use for the cache. When omitted (or empty), a freshly
            generated ULID string is used instead.
        cache_dir: Root directory to store the cache in. When omitted (or empty),
            a directory named after the cache under `DEFAULT_CACHE_ROOT` is used.
        cleanup: Whether to clean up temporary files. Defaults to True.

    Raises:
        PyAirbyteInputError: If `cache_name` contains a space, or contains any
            character other than alphanumerics and underscores.
    """
    # Validation only applies to caller-supplied names; spaces are reported with
    # their own message before the general character check.
    if cache_name and " " in cache_name:
        raise exc.PyAirbyteInputError(
            message="Cache name cannot contain spaces.",
            input_value=cache_name,
        )

    if cache_name and not cache_name.replace("_", "").isalnum():
        raise exc.PyAirbyteInputError(
            message="Cache name can only contain alphanumeric characters and underscores.",
            input_value=cache_name,
        )

    resolved_name = cache_name if cache_name else str(ulid.ULID())
    chosen_root = cache_dir if cache_dir else DEFAULT_CACHE_ROOT / resolved_name
    root_path = chosen_root if isinstance(chosen_root, Path) else Path(chosen_root)

    return DuckDBCache(
        db_path=root_path / f"db_{resolved_name}.duckdb",
        cache_dir=root_path,
        cleanup=cleanup,
    )
Get a local cache for storing data, using a name string to seed the path.
Arguments:
- cache_name: Name to use for the cache. Defaults to None.
- cache_dir: Root directory to store the cache in. Defaults to None.
- cleanup: Whether to clean up temporary files. Defaults to True.
Cache files are stored in the `.cache` directory, relative to the current working directory.
def get_colab_cache(
    cache_name: str = "default_cache",
    sub_dir: str = "Airbyte/cache",
    schema_name: str = "main",
    table_prefix: str | None = "",
    drive_name: str = _MY_DRIVE,
    mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH,
) -> DuckDBCache:
    """Get a DuckDB cache that is persisted in Google Drive, for use in Google Colab.

    Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple
    Colab sessions.

    Please note that Google Colab may prompt you to authenticate with your Google account to access
    your Google Drive. When prompted, click the link and follow the instructions.

    Colab will require access to read and write files in your Google Drive, so please be sure to
    grant the necessary permissions when prompted.

    All arguments are optional and have default values that are suitable for most use cases.

    Args:
        cache_name: The name to use for the cache. Defaults to "default_cache". Override this if
            you want to use a different database for different projects.
        sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this
            if you want to store the cache in a different subdirectory than the default.
        schema_name: The name of the schema to write to. Defaults to "main". Override this if you
            want to write to a different schema.
        table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this
            if you want to use a different prefix for all tables.
        drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you
            want to store data in a shared drive instead of your personal drive.
        mount_path: The path to mount Google Drive to. Defaults to the recommended mount path.
            Override this if you want to mount Google Drive to a different path (not recommended).

    Raises:
        ImportError: If the `google.colab` package cannot be imported, i.e. the code is not
            running inside Google Colab.

    ## Usage Examples

    The default `get_colab_cache` arguments are suitable for most use cases:

    ```python
    from airbyte.caches.util import get_colab_cache

    colab_cache = get_colab_cache()
    ```

    Or you can call `get_colab_cache` with custom arguments:

    ```python
    custom_cache = get_colab_cache(
        cache_name="my_custom_cache",
        sub_dir="Airbyte/custom_cache",
        drive_name="My Company Drive",
    )
    ```
    """
    try:
        from google.colab import drive  # noqa: PLC0415  # type: ignore[reportMissingImports]
    except ImportError:
        msg = (
            "The `google.colab` interface is only available in Google Colab. "
            "Please run this code in a Google Colab notebook."
        )
        raise ImportError(msg) from None

    drive.mount(mount_path)
    # The personal drive sits directly under the mount point; any other drive name
    # is looked up under the "Shareddrives" folder.
    drive_root = (
        Path(mount_path) / drive_name
        if drive_name == _MY_DRIVE
        else Path(mount_path) / "Shareddrives" / drive_name
    )

    cache_dir = drive_root / sub_dir
    cache_dir.mkdir(parents=True, exist_ok=True)
    db_file_path = cache_dir / f"{cache_name}.duckdb"

    print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.")
    return DuckDBCache(
        db_path=db_file_path,
        cache_dir=cache_dir,
        schema_name=schema_name,
        table_prefix=table_prefix,
    )
Get a local cache for storing data, using the default database path.
Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple Colab sessions.
Please note that Google Colab may prompt you to authenticate with your Google account to access your Google Drive. When prompted, click the link and follow the instructions.
Colab will require access to read and write files in your Google Drive, so please be sure to grant the necessary permissions when prompted.
All arguments are optional and have default values that are suitable for most use cases.
Arguments:
- cache_name: The name to use for the cache. Defaults to "default_cache". Override this if you want to use a different database for different projects.
- sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this if you want to store the cache in a different subdirectory than the default.
- schema_name: The name of the schema to write to. Defaults to "main". Override this if you want to write to a different schema.
- table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this if you want to use a different prefix for all tables.
- drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you want to store data in a shared drive instead of your personal drive.
- mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this if you want to mount Google Drive to a different path (not recommended).
Usage Examples
The default `get_colab_cache` arguments are suitable for most use cases:
from airbyte.caches.util import get_colab_cache
colab_cache = get_colab_cache()
Or you can call `get_colab_cache` with custom arguments:
custom_cache = get_colab_cache(
cache_name="my_custom_cache",
sub_dir="Airbyte/custom_cache",
drive_name="My Company Drive",
)