airbyte.constants
Constants shared across the PyAirbyte codebase.
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
"""Constants shared across the PyAirbyte codebase."""

from __future__ import annotations

import logging
import os
from pathlib import Path


logger = logging.getLogger("airbyte")


DEBUG_MODE = False  # Set to True to enable additional debug logging.

AB_EXTRACTED_AT_COLUMN = "_airbyte_extracted_at"
"""A column that stores the timestamp when the record was extracted."""

AB_META_COLUMN = "_airbyte_meta"
"""A column that stores metadata about the record."""

AB_RAW_ID_COLUMN = "_airbyte_raw_id"
"""A column that stores a unique identifier for each row in the source data.

Note: The interpretation of this column is slightly different from in Airbyte Dv2 destinations.
In Airbyte Dv2 destinations, this column points to a row in a separate 'raw' table. In PyAirbyte,
this column is simply used as a unique identifier for each record as it is received.

PyAirbyte uses ULIDs for this column, which are identifiers that can be sorted by time
received. This allows us to determine and debug the order of records as they are received, even
if the source provides records that are tied or received out of order from the perspective of
their `emitted_at` (`_airbyte_extracted_at`) timestamps.
"""

AB_INTERNAL_COLUMNS = {
    AB_RAW_ID_COLUMN,
    AB_EXTRACTED_AT_COLUMN,
    AB_META_COLUMN,
}
"""A set of internal columns that are reserved for PyAirbyte's internal use."""


def _try_create_dir_if_missing(path: Path, desc: str = "specified") -> Path:
    """Try to create a directory if it does not exist.

    This is a best-effort helper: problems found while checking or creating the directory are
    logged as warnings rather than raised. It is called at module import time below, so a
    directory that cannot be created does not by itself break the import.

    Args:
        path: The directory to check or create. A leading `~` is expanded and the path is
            resolved before use.
        desc: A short label for the directory (e.g. "project"), used only in log messages.

    Returns:
        The expanded, resolved path, regardless of whether the directory exists afterwards.
    """
    resolved_path = path.expanduser().resolve()
    try:
        if resolved_path.exists():
            if not resolved_path.is_dir():
                # Something else (e.g. a regular file) occupies the path; warn but do not fail.
                logger.warning(
                    "The %s path exists but is not a directory: '%s'", desc, resolved_path
                )
            return resolved_path
        resolved_path.mkdir(parents=True, exist_ok=True)
    except Exception as ex:
        # Deliberately broad: directory creation is optional and must not abort the caller.
        logger.warning(
            "Could not auto-create missing %s directory at '%s': %s", desc, resolved_path, ex
        )
    return resolved_path


DEFAULT_PROJECT_DIR: Path = _try_create_dir_if_missing(
    Path(os.getenv("AIRBYTE_PROJECT_DIR", "") or Path.cwd()).expanduser().absolute(),
    desc="project",
)
"""Default project directory.

Can be overridden by setting the `AIRBYTE_PROJECT_DIR` environment variable.

If not set, defaults to the current working directory.

This serves as the parent directory for both cache and install directories when not explicitly
configured.

If a path is specified that does not yet exist, PyAirbyte will attempt to create it.
"""


DEFAULT_INSTALL_DIR: Path = _try_create_dir_if_missing(
    Path(os.getenv("AIRBYTE_INSTALL_DIR", "") or DEFAULT_PROJECT_DIR).expanduser().absolute(),
    desc="install",
)
"""Default install directory for connectors.

Can be overridden by setting the `AIRBYTE_INSTALL_DIR` environment variable.

If not set, defaults to `DEFAULT_PROJECT_DIR` (`AIRBYTE_PROJECT_DIR` env var) or the current
working directory if neither is set.

If a path is specified that does not yet exist, PyAirbyte will attempt to create it.
"""


DEFAULT_CACHE_ROOT: Path = (
    (Path(os.getenv("AIRBYTE_CACHE_ROOT", "") or (DEFAULT_PROJECT_DIR / ".cache")))
    .expanduser()
    .absolute()
)
"""Default cache root is `.cache` inside `DEFAULT_PROJECT_DIR`.

`DEFAULT_PROJECT_DIR` is the current working directory unless `AIRBYTE_PROJECT_DIR` is set.

The default location can be overridden by setting the `AIRBYTE_CACHE_ROOT` environment variable.

Overriding this can be useful if you always want to store cache files in a specific location.
For example, in ephemeral environments like Google Colab, you might want to store cache files in
your mounted Google Drive by setting this to a path like `/content/drive/MyDrive/Airbyte/cache`.

Unlike the project and install directories, this path is only computed here; this module does
not attempt to create it.
"""

DEFAULT_CACHE_SCHEMA_NAME = "airbyte_raw"
"""The default schema name to use for caches.

Specific caches may override this value with a different schema name.
"""

DEFAULT_GOOGLE_DRIVE_MOUNT_PATH = "/content/drive"
"""Default path to mount Google Drive in Google Colab environments."""

DEFAULT_ARROW_MAX_CHUNK_SIZE = 100_000
"""The default number of records to include in each batch of an Arrow dataset."""


def _str_to_bool(value: str) -> bool:
    """Convert the string value of an environment variable to a boolean.

    Surrounding whitespace and letter case are ignored. Empty (or whitespace-only) values and
    the strings "0", "false", "f", "no", "n" and "off" are treated as `False`; any other
    non-empty value is treated as `True`.

    Args:
        value: The raw string to interpret.

    Returns:
        `True` unless the value is empty or one of the recognized "false" spellings.
    """
    if not value:
        return False

    # Strip first so that stray whitespace (e.g. "false ") is not mistaken for a truthy value.
    normalized = value.strip().lower()
    return normalized not in {"", "0", "false", "f", "no", "n", "off"}


TEMP_DIR_OVERRIDE: Path | None = (
    Path(os.environ["AIRBYTE_TEMP_DIR"]) if os.getenv("AIRBYTE_TEMP_DIR") else None
)
"""The directory to use for temporary files.

This value is read from the `AIRBYTE_TEMP_DIR` environment variable. If the variable is not set,
Tempfile will use the system's default temporary directory.

This can be useful if you want to store temporary files in a specific location, or when you
need your temporary files to exist in user-level directories rather than system-level
directories for permissions reasons.
"""

TEMP_FILE_CLEANUP = _str_to_bool(
    os.getenv(
        key="AIRBYTE_TEMP_FILE_CLEANUP",
        default="true",
    )
)
"""Whether to clean up temporary files after use.

This value is read from the `AIRBYTE_TEMP_FILE_CLEANUP` environment variable. If the variable is
not set, the default value is `True`.
"""

AIRBYTE_OFFLINE_MODE = _str_to_bool(
    os.getenv(
        key="AIRBYTE_OFFLINE_MODE",
        default="false",
    )
)
"""Enable or disable offline mode.

When offline mode is enabled, PyAirbyte will attempt to fetch metadata for connectors from the
Airbyte registry but will not raise an error if the registry is unavailable. This can be useful in
environments without internet access or with air-gapped networks.

Offline mode also disables telemetry, similar to a `DO_NOT_TRACK` setting, ensuring no usage data
is sent from your environment. You may also specify a custom registry URL via the
`_REGISTRY_ENV_VAR` environment variable if you prefer to use a different registry source for
metadata.

This setting helps you make informed choices about data privacy and operation in restricted and
air-gapped environments.
"""

AIRBYTE_PRINT_FULL_ERROR_LOGS: bool = _str_to_bool(
    os.getenv(
        key="AIRBYTE_PRINT_FULL_ERROR_LOGS",
        default=os.getenv("CI", "false"),
    )
)
"""Whether to print full error logs when an error occurs.

This setting helps in debugging by providing detailed logs when errors occur. This is especially
helpful in ephemeral environments like CI/CD pipelines where log files may not be persisted after
the pipeline run.

If `AIRBYTE_PRINT_FULL_ERROR_LOGS` is not set, the default is taken from the `CI` environment
variable: `False` when `CI` is unset (non-CI environments), and `True` when `CI` is set to a
truthy value.
"""

# NOTE(review): Despite the `NO_` prefix, this constant is `True` when uv is *enabled* (i.e. when
# `AIRBYTE_NO_UV` is not one of "1", "true", "yes"), which matches the docstring below but not the
# name. The value is left unchanged here; confirm the polarity expected by call sites before
# renaming or inverting it.
NO_UV: bool = os.getenv("AIRBYTE_NO_UV", "").lower() not in {"1", "true", "yes"}
"""Whether to use uv for Python package management.

This value is determined by the `AIRBYTE_NO_UV` environment variable. When `AIRBYTE_NO_UV`
is set to "1", "true", or "yes", uv will be disabled and pip will be used instead.

If the variable is not set or set to any other value, uv will be used by default.
This provides a safe fallback mechanism for environments where uv is not available
or causes issues.
"""

SECRETS_HYDRATION_PREFIX = "secret_reference::"
"""Use this prefix to indicate a secret reference in configuration.

For example, this snippet will populate the `personal_access_token` field with the value of the
secret named `GITHUB_PERSONAL_ACCESS_TOKEN`, for instance from an environment variable.

```json
{
  "credentials": {
    "personal_access_token": "secret_reference::GITHUB_PERSONAL_ACCESS_TOKEN"
  }
}
```

For more information, see the `airbyte.secrets` module documentation.
"""

# Cloud Constants

CLOUD_CLIENT_ID_ENV_VAR: str = "AIRBYTE_CLOUD_CLIENT_ID"
"""The environment variable name for the Airbyte Cloud client ID."""

CLOUD_CLIENT_SECRET_ENV_VAR: str = "AIRBYTE_CLOUD_CLIENT_SECRET"
"""The environment variable name for the Airbyte Cloud client secret."""

CLOUD_API_ROOT_ENV_VAR: str = "AIRBYTE_CLOUD_API_URL"
"""The environment variable name for the Airbyte Cloud API URL."""

CLOUD_WORKSPACE_ID_ENV_VAR: str = "AIRBYTE_CLOUD_WORKSPACE_ID"
"""The environment variable name for the Airbyte Cloud workspace ID."""

CLOUD_API_ROOT: str = "https://api.airbyte.com/v1"
"""The Airbyte Cloud API root URL.

This is the root URL for the Airbyte Cloud API. It is used to interact with the Airbyte Cloud API
and is the default API root for the `CloudWorkspace` class.
- https://reference.airbyte.com/reference/getting-started
"""

CLOUD_CONFIG_API_ROOT: str = "https://cloud.airbyte.com/api/v1"
"""Internal-Use API Root, aka Airbyte "Config API".

Documentation:
- https://docs.airbyte.com/api-documentation#configuration-api-deprecated
- https://github.com/airbytehq/airbyte-platform-internal/blob/master/oss/airbyte-api/server-api/src/main/openapi/config.yaml
"""
A column that stores the timestamp when the record was extracted.
A column that stores metadata about the record.
A column that stores a unique identifier for each row in the source data.
Note: The interpretation of this column is slightly different from in Airbyte Dv2 destinations. In Airbyte Dv2 destinations, this column points to a row in a separate 'raw' table. In PyAirbyte, this column is simply used as a unique identifier for each record as it is received.
PyAirbyte uses ULIDs for this column, which are identifiers that can be sorted by time
received. This allows us to determine and debug the order of records as they are received, even if
the source provides records that are tied or received out of order from the perspective of their
emitted_at
(_airbyte_extracted_at
) timestamps.
A set of internal columns that are reserved for PyAirbyte's internal use.
Default project directory.
Can be overridden by setting the AIRBYTE_PROJECT_DIR
environment variable.
If not set, defaults to the current working directory.
This serves as the parent directory for both cache and install directories when not explicitly configured.
If a path is specified that does not yet exist, PyAirbyte will attempt to create it.
Default install directory for connectors.
If not set, defaults to DEFAULT_PROJECT_DIR
(AIRBYTE_PROJECT_DIR
env var) or the current
working directory if neither is set.
If a path is specified that does not yet exist, PyAirbyte will attempt to create it.
Default cache root is .cache
in the default project directory (DEFAULT_PROJECT_DIR), which is the current working directory unless AIRBYTE_PROJECT_DIR is set.
The default location can be overridden by setting the AIRBYTE_CACHE_ROOT
environment variable.
Overriding this can be useful if you always want to store cache files in a specific location.
For example, in ephemeral environments like Google Colab, you might want to store cache files in
your mounted Google Drive by setting this to a path like /content/drive/MyDrive/Airbyte/cache
.
The default schema name to use for caches.
Specific caches may override this value with a different schema name.
Default path to mount Google Drive in Google Colab environments.
The default number of records to include in each batch of an Arrow dataset.
The directory to use for temporary files.
This value is read from the AIRBYTE_TEMP_DIR
environment variable. If the variable is not set,
Tempfile will use the system's default temporary directory.
This can be useful if you want to store temporary files in a specific location, or when you need your temporary files to exist in user-level directories rather than system-level directories for permissions reasons.
Whether to clean up temporary files after use.
This value is read from the AIRBYTE_TEMP_FILE_CLEANUP
environment variable. If the variable is
not set, the default value is True
.
Enable or disable offline mode.
When offline mode is enabled, PyAirbyte will attempt to fetch metadata for connectors from the Airbyte registry but will not raise an error if the registry is unavailable. This can be useful in environments without internet access or with air-gapped networks.
Offline mode also disables telemetry, similar to a DO_NOT_TRACK
setting, ensuring no usage data
is sent from your environment. You may also specify a custom registry URL via the _REGISTRY_ENV_VAR
environment variable if you prefer to use a different registry source for metadata.
This setting helps you make informed choices about data privacy and operation in restricted and air-gapped environments.
Whether to print full error logs when an error occurs. This setting helps in debugging by providing detailed logs when errors occur. This is especially helpful in ephemeral environments like CI/CD pipelines where log files may not be persisted after the pipeline run.
If not set, the default value is False
for non-CI environments.
If running in a CI environment ("CI" env var is set), then the default value is True
.
Whether to use uv for Python package management.
This value is determined by the AIRBYTE_NO_UV
environment variable. When AIRBYTE_NO_UV
is set to "1", "true", or "yes", uv will be disabled and pip will be used instead.
If the variable is not set or set to any other value, uv will be used by default. This provides a safe fallback mechanism for environments where uv is not available or causes issues.
Use this prefix to indicate a secret reference in configuration.
For example, this snippet will populate the personal_access_token
field with the value of the
secret named GITHUB_PERSONAL_ACCESS_TOKEN
, for instance from an environment variable.
{
"credentials": {
"personal_access_token": "secret_reference::GITHUB_PERSONAL_ACCESS_TOKEN"
}
}
For more information, see the airbyte.secrets
module documentation.
The environment variable name for the Airbyte Cloud client ID.
The environment variable name for the Airbyte Cloud client secret.
The environment variable name for the Airbyte Cloud API URL.
The environment variable name for the Airbyte Cloud workspace ID.
The Airbyte Cloud API root URL.
This is the root URL for the Airbyte Cloud API. It is used to interact with the Airbyte Cloud API
and is the default API root for the CloudWorkspace
class.
Internal-Use API Root, aka Airbyte "Config API".
Documentation: