airbyte.caches.util
Utility functions for working with caches.
1# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2"""Utility functions for working with caches.""" 3 4from __future__ import annotations 5 6from pathlib import Path 7 8import ulid 9 10from airbyte import exceptions as exc 11from airbyte.caches.duckdb import DuckDBCache 12 13 14# Google drive constants: 15 16_MY_DRIVE = "MyDrive" 17"""The default name of the user's personal Google Drive.""" 18 19_GOOGLE_DRIVE_DEFAULT_MOUNT_PATH = "/content/drive" 20"""The recommended path to mount Google Drive to.""" 21 22 23# Utility functions: 24 25 26def get_default_cache() -> DuckDBCache: 27 """Get a local cache for storing data, using the default database path. 28 29 Cache files are stored in the `.cache` directory, relative to the current 30 working directory. 31 """ 32 cache_dir = Path("./.cache/default_cache") 33 return DuckDBCache( 34 db_path=cache_dir / "default_cache.duckdb", 35 cache_dir=cache_dir, 36 ) 37 38 39def new_local_cache( 40 cache_name: str | None = None, 41 cache_dir: str | Path | None = None, 42 *, 43 cleanup: bool = True, 44) -> DuckDBCache: 45 """Get a local cache for storing data, using a name string to seed the path. 46 47 Args: 48 cache_name: Name to use for the cache. Defaults to None. 49 cache_dir: Root directory to store the cache in. Defaults to None. 50 cleanup: Whether to clean up temporary files. Defaults to True. 51 52 Cache files are stored in the `.cache` directory, relative to the current 53 working directory. 
54 """ 55 if cache_name: 56 if " " in cache_name: 57 raise exc.PyAirbyteInputError( 58 message="Cache name cannot contain spaces.", 59 input_value=cache_name, 60 ) 61 62 if not cache_name.replace("_", "").isalnum(): 63 raise exc.PyAirbyteInputError( 64 message="Cache name can only contain alphanumeric characters and underscores.", 65 input_value=cache_name, 66 ) 67 68 cache_name = cache_name or str(ulid.ULID()) 69 cache_dir = cache_dir or Path(f"./.cache/{cache_name}") 70 if not isinstance(cache_dir, Path): 71 cache_dir = Path(cache_dir) 72 73 return DuckDBCache( 74 db_path=cache_dir / f"db_{cache_name}.duckdb", 75 cache_dir=cache_dir, 76 cleanup=cleanup, 77 ) 78 79 80def get_colab_cache( 81 cache_name: str = "default_cache", 82 sub_dir: str = "Airbyte/cache", 83 schema_name: str = "main", 84 table_prefix: str | None = "", 85 drive_name: str = _MY_DRIVE, 86 mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH, 87) -> DuckDBCache: 88 """Get a local cache for storing data, using the default database path. 89 90 Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple 91 Colab sessions. 92 93 Please note that Google Colab may prompt you to authenticate with your Google account to access 94 your Google Drive. When prompted, click the link and follow the instructions. 95 96 Colab will require access to read and write files in your Google Drive, so please be sure to 97 grant the necessary permissions when prompted. 98 99 All arguments are optional and have default values that are suitable for most use cases. 100 101 Args: 102 cache_name: The name to use for the cache. Defaults to "colab_cache". Override this if you 103 want to use a different database for different projects. 104 sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this 105 if you want to store the cache in a different subdirectory than the default. 106 schema_name: The name of the schema to write to. Defaults to "main". 
Override this if you 107 want to write to a different schema. 108 table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this 109 if you want to use a different prefix for all tables. 110 drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you 111 want to store data in a shared drive instead of your personal drive. 112 mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this 113 if you want to mount Google Drive to a different path (not recommended). 114 115 ## Usage Examples 116 117 The default `get_colab_cache` arguments are suitable for most use cases: 118 119 ```python 120 from airbyte.caches.colab import get_colab_cache 121 122 colab_cache = get_colab_cache() 123 ``` 124 125 Or you can call `get_colab_cache` with custom arguments: 126 127 ```python 128 custom_cache = get_colab_cache( 129 cache_name="my_custom_cache", 130 sub_dir="Airbyte/custom_cache", 131 drive_name="My Company Drive", 132 ) 133 ``` 134 """ 135 try: 136 from google.colab import drive # noqa: PLC0415 # type: ignore[reportMissingImports] 137 except ImportError: 138 drive = None 139 msg = ( 140 "The `google.colab` interface is only available in Google Colab. " 141 "Please run this code in a Google Colab notebook." 142 ) 143 raise ImportError(msg) from None 144 145 drive.mount(mount_path) 146 drive_root = ( 147 Path(mount_path) / drive_name 148 if drive_name == _MY_DRIVE 149 else Path(mount_path) / "Shareddrives" / drive_name 150 ) 151 152 cache_dir = drive_root / sub_dir 153 cache_dir.mkdir(parents=True, exist_ok=True) 154 db_file_path = cache_dir / f"{cache_name}.duckdb" 155 156 print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.") 157 return DuckDBCache( 158 db_path=db_file_path, 159 cache_dir=cache_dir, 160 schema_name=schema_name, 161 table_prefix=table_prefix, 162 )
def get_default_cache() -> DuckDBCache:
    """Return the default local DuckDB cache.

    The database file is `default_cache.duckdb`, located in the `.cache/default_cache`
    directory relative to the current working directory. That same directory is also
    passed to the cache as its `cache_dir`.
    """
    default_dir = Path("./.cache/default_cache")
    db_file = default_dir / "default_cache.duckdb"
    return DuckDBCache(db_path=db_file, cache_dir=default_dir)
Get a local cache for storing data, using the default database path.
Cache files are stored in the `.cache` directory, relative to the current working directory.
def new_local_cache(
    cache_name: str | None = None,
    cache_dir: str | Path | None = None,
    *,
    cleanup: bool = True,
) -> DuckDBCache:
    """Create a local DuckDB cache whose file paths are derived from a name.

    Args:
        cache_name: Name to use for the cache. Only alphanumeric characters and underscores
            are accepted. When omitted (or empty), a new ULID is generated and used instead.
        cache_dir: Root directory to store the cache in. When omitted, the directory
            `./.cache/<cache_name>` (relative to the current working directory) is used.
        cleanup: Whether to clean up temporary files. Defaults to True.

    Returns:
        A `DuckDBCache` whose database file is `db_<cache_name>.duckdb` inside the cache
        directory.

    Raises:
        PyAirbyteInputError: If `cache_name` contains a space, or contains any character
            other than alphanumerics and underscores.
    """
    # Spaces are rejected first so that this common mistake gets its own message.
    if cache_name and " " in cache_name:
        raise exc.PyAirbyteInputError(
            message="Cache name cannot contain spaces.",
            input_value=cache_name,
        )

    if cache_name and not cache_name.replace("_", "").isalnum():
        raise exc.PyAirbyteInputError(
            message="Cache name can only contain alphanumeric characters and underscores.",
            input_value=cache_name,
        )

    resolved_name = cache_name if cache_name else str(ulid.ULID())

    if cache_dir:
        # Accept both plain strings and ready-made `Path` objects.
        root_dir = cache_dir if isinstance(cache_dir, Path) else Path(cache_dir)
    else:
        root_dir = Path(f"./.cache/{resolved_name}")

    db_file = root_dir / f"db_{resolved_name}.duckdb"
    return DuckDBCache(db_path=db_file, cache_dir=root_dir, cleanup=cleanup)
Get a local cache for storing data, using a name string to seed the path.
Arguments:
- cache_name: Name to use for the cache. Defaults to None.
- cache_dir: Root directory to store the cache in. Defaults to None.
- cleanup: Whether to clean up temporary files. Defaults to True.
Unless `cache_dir` is given, cache files are stored in the `.cache` directory, relative to the current working directory.
def get_colab_cache(
    cache_name: str = "default_cache",
    sub_dir: str = "Airbyte/cache",
    schema_name: str = "main",
    table_prefix: str | None = "",
    drive_name: str = _MY_DRIVE,
    mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH,
) -> DuckDBCache:
    """Get a DuckDB cache stored in Google Drive, for use in Google Colab notebooks.

    Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple
    Colab sessions.

    Please note that Google Colab may prompt you to authenticate with your Google account to access
    your Google Drive. When prompted, click the link and follow the instructions.

    Colab will require access to read and write files in your Google Drive, so please be sure to
    grant the necessary permissions when prompted.

    All arguments are optional and have default values that are suitable for most use cases.

    Args:
        cache_name: The name to use for the cache. Defaults to "default_cache". Override this if
            you want to use a different database for different projects.
        sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this
            if you want to store the cache in a different subdirectory than the default.
        schema_name: The name of the schema to write to. Defaults to "main". Override this if you
            want to write to a different schema.
        table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this
            if you want to use a different prefix for all tables.
        drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you
            want to store data in a shared drive instead of your personal drive. Any other value
            is looked up under the `Shareddrives` folder of the mount path.
        mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this
            if you want to mount Google Drive to a different path (not recommended).

    Returns:
        A `DuckDBCache` whose database file is `<cache_name>.duckdb` inside `sub_dir` on the
        selected drive.

    Raises:
        ImportError: If `google.colab` cannot be imported, i.e. when not running in Google Colab.

    ## Usage Examples

    The default `get_colab_cache` arguments are suitable for most use cases:

    ```python
    from airbyte.caches.util import get_colab_cache

    colab_cache = get_colab_cache()
    ```

    Or you can call `get_colab_cache` with custom arguments:

    ```python
    custom_cache = get_colab_cache(
        cache_name="my_custom_cache",
        sub_dir="Airbyte/custom_cache",
        drive_name="My Company Drive",
    )
    ```
    """
    try:
        from google.colab import drive  # noqa: PLC0415  # type: ignore[reportMissingImports]
    except ImportError:
        msg = (
            "The `google.colab` interface is only available in Google Colab. "
            "Please run this code in a Google Colab notebook."
        )
        # `from None` hides the low-level import traceback in favor of the friendlier message.
        raise ImportError(msg) from None

    drive.mount(mount_path)
    # The personal drive sits directly under the mount point; shared drives are nested
    # under the "Shareddrives" folder.
    drive_root = (
        Path(mount_path) / drive_name
        if drive_name == _MY_DRIVE
        else Path(mount_path) / "Shareddrives" / drive_name
    )

    cache_dir = drive_root / sub_dir
    cache_dir.mkdir(parents=True, exist_ok=True)
    db_file_path = cache_dir / f"{cache_name}.duckdb"

    print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.")
    return DuckDBCache(
        db_path=db_file_path,
        cache_dir=cache_dir,
        schema_name=schema_name,
        table_prefix=table_prefix,
    )
Get a DuckDB cache stored in Google Drive, for use in Google Colab notebooks.
Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple Colab sessions.
Please note that Google Colab may prompt you to authenticate with your Google account to access your Google Drive. When prompted, click the link and follow the instructions.
Colab will require access to read and write files in your Google Drive, so please be sure to grant the necessary permissions when prompted.
All arguments are optional and have default values that are suitable for most use cases.
Arguments:
- cache_name: The name to use for the cache. Defaults to "default_cache". Override this if you want to use a different database for different projects.
- sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this if you want to store the cache in a different subdirectory than the default.
- schema_name: The name of the schema to write to. Defaults to "main". Override this if you want to write to a different schema.
- table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this if you want to use a different prefix for all tables.
- drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you want to store data in a shared drive instead of your personal drive.
- mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this if you want to mount Google Drive to a different path (not recommended).
Usage Examples
The default `get_colab_cache` arguments are suitable for most use cases:
from airbyte.caches.util import get_colab_cache
colab_cache = get_colab_cache()
Or you can call `get_colab_cache` with custom arguments:
custom_cache = get_colab_cache(
cache_name="my_custom_cache",
sub_dir="Airbyte/custom_cache",
drive_name="My Company Drive",
)