airbyte.constants

Constants shared across the PyAirbyte codebase.

  1# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
  2"""Constants shared across the PyAirbyte codebase."""
  3
  4from __future__ import annotations
  5
  6import logging
  7import os
  8from pathlib import Path
  9
 10
 11logger = logging.getLogger("airbyte")
 12
 13
 14DEBUG_MODE = False  # Set to True to enable additional debug logging.
 15
 16AB_EXTRACTED_AT_COLUMN = "_airbyte_extracted_at"
 17"""A column that stores the timestamp when the record was extracted."""
 18
 19AB_META_COLUMN = "_airbyte_meta"
 20"""A column that stores metadata about the record."""
 21
 22AB_RAW_ID_COLUMN = "_airbyte_raw_id"
 23"""A column that stores a unique identifier for each row in the source data.
 24
 25Note: The interpretation of this column is slightly different from in Airbyte Dv2 destinations.
 26In Airbyte Dv2 destinations, this column points to a row in a separate 'raw' table. In PyAirbyte,
 27this column is simply used as a unique identifier for each record as it is received.
 28
 29PyAirbyte uses ULIDs for this column, which are identifiers that can be sorted by time
 30received. This allows us to determine the order of records as they are received, even if
 31the source provides records that are tied or received out of order from the perspective of their
 32`emitted_at` (`_airbyte_extracted_at`) timestamps.
 33"""
 34
 35AB_INTERNAL_COLUMNS = {
 36    AB_RAW_ID_COLUMN,
 37    AB_EXTRACTED_AT_COLUMN,
 38    AB_META_COLUMN,
 39}
 40"""A set of internal columns that are reserved for PyAirbyte's internal use."""
 41
 42
 43def _try_create_dir_if_missing(path: Path, desc: str = "specified") -> Path:
 44    """Try to create a directory if it does not exist."""
 45    resolved_path = path.expanduser().resolve()
 46    try:
 47        if resolved_path.exists():
 48            if not resolved_path.is_dir():
 49                logger.warning(
 50                    "The %s path exists but is not a directory: '%s'", desc, resolved_path
 51                )
 52            return resolved_path
 53        resolved_path.mkdir(parents=True, exist_ok=True)
 54    except Exception as ex:
 55        logger.warning(
 56            "Could not auto-create missing %s directory at '%s': %s", desc, resolved_path, ex
 57        )
 58    return resolved_path
 59
 60
 61DEFAULT_PROJECT_DIR: Path = _try_create_dir_if_missing(
 62    Path(os.getenv("AIRBYTE_PROJECT_DIR", "") or Path.cwd()).expanduser().absolute(),
 63    desc="project",
 64)
 65"""Default project directory.
 66
 67Can be overridden by setting the `AIRBYTE_PROJECT_DIR` environment variable.
 68
 69If not set, defaults to the current working directory.
 70
 71This serves as the parent directory for both cache and install directories when not explicitly
 72configured.
 73
 74If a path is specified that does not yet exist, PyAirbyte will attempt to create it.
 75"""
 76
 77
 78DEFAULT_INSTALL_DIR: Path = _try_create_dir_if_missing(
 79    Path(os.getenv("AIRBYTE_INSTALL_DIR", "") or DEFAULT_PROJECT_DIR).expanduser().absolute(),
 80    desc="install",
 81)
 82"""Default install directory for connectors.
 83
 84If not set, defaults to `DEFAULT_PROJECT_DIR` (`AIRBYTE_PROJECT_DIR` env var) or the current
 85working directory if neither is set.
 86
 87If a path is specified that does not yet exist, PyAirbyte will attempt to create it.
 88"""
 89
 90
 91DEFAULT_CACHE_ROOT: Path = (
 92    (Path(os.getenv("AIRBYTE_CACHE_ROOT", "") or (DEFAULT_PROJECT_DIR / ".cache")))
 93    .expanduser()
 94    .absolute()
 95)
 96"""Default cache root is `.cache` under `DEFAULT_PROJECT_DIR` (the working directory by default).
 97
 98The default location can be overridden by setting the `AIRBYTE_CACHE_ROOT` environment variable.
 99
100Overriding this can be useful if you always want to store cache files in a specific location.
101For example, in ephemeral environments like Google Colab, you might want to store cache files in
102your mounted Google Drive by setting this to a path like `/content/drive/MyDrive/Airbyte/cache`.
103"""
104
105DEFAULT_CACHE_SCHEMA_NAME = "airbyte_raw"
106"""The default schema name to use for caches.
107
108Specific caches may override this value with a different schema name.
109"""
110
111DEFAULT_GOOGLE_DRIVE_MOUNT_PATH = "/content/drive"
112"""Default path to mount Google Drive in Google Colab environments."""
113
114DEFAULT_ARROW_MAX_CHUNK_SIZE = 100_000
115"""The default number of records to include in each batch of an Arrow dataset."""
116
117
def _str_to_bool(value: str) -> bool:
    """Convert the string value of an environment variable to a boolean.

    Letter case and surrounding whitespace are ignored. An empty (or whitespace-only) string
    and the values "0", "false", "f", "no", "n", and "off" yield `False`; any other value
    yields `True`.
    """
    if not value:
        return False
    # Strip before comparing so that values such as "false " or " 0" are not mistaken for a
    # truthy setting.
    return value.strip().lower() not in {"", "0", "false", "f", "no", "n", "off"}
121
122
123TEMP_DIR_OVERRIDE: Path | None = (
124    Path(os.environ["AIRBYTE_TEMP_DIR"]) if os.getenv("AIRBYTE_TEMP_DIR") else None
125)
126"""The directory to use for temporary files.
127
128This value is read from the `AIRBYTE_TEMP_DIR` environment variable. If the variable is not set,
129Tempfile will use the system's default temporary directory.
130
131This can be useful if you want to store temporary files in a specific location (or) when you
132need your temporary files to exist in user level directories, and not in system level
133directories for permissions reasons.
134"""
135
136TEMP_FILE_CLEANUP = _str_to_bool(
137    os.getenv(
138        key="AIRBYTE_TEMP_FILE_CLEANUP",
139        default="true",
140    )
141)
142"""Whether to clean up temporary files after use.
143
144This value is read from the `AIRBYTE_TEMP_FILE_CLEANUP` environment variable. If the variable is
145not set, the default value is `True`.
146"""
147
148AIRBYTE_OFFLINE_MODE = _str_to_bool(
149    os.getenv(
150        key="AIRBYTE_OFFLINE_MODE",
151        default="false",
152    )
153)
154"""Enable or disable offline mode.
155
156When offline mode is enabled, PyAirbyte will attempt to fetch metadata for connectors from the
157Airbyte registry but will not raise an error if the registry is unavailable. This can be useful in
158environments without internet access or with air-gapped networks.
159
160Offline mode also disables telemetry, similar to a `DO_NOT_TRACK` setting, ensuring no usage data
161is sent from your environment. You may also set a custom registry URL via the `_REGISTRY_ENV_VAR`
162environment variable if you prefer to use a different registry source for metadata.
163
164This setting helps you make informed choices about data privacy and operation in restricted and
165air-gapped environments.
166"""
167
168AIRBYTE_PRINT_FULL_ERROR_LOGS: bool = _str_to_bool(
169    os.getenv(
170        key="AIRBYTE_PRINT_FULL_ERROR_LOGS",
171        default=os.getenv("CI", "false"),
172    )
173)
174"""Whether to print full error logs when an error occurs.
175This setting helps in debugging by providing detailed logs when errors occur. This is especially
176helpful in ephemeral environments like CI/CD pipelines where log files may not be persisted after
177the pipeline run.
178
179If not set, the default value is `False` for non-CI environments.
180If running in a CI environment ("CI" env var is set), then the default value is `True`.
181"""
182
# NOTE(review): the name and the value appear to disagree. The expression below is True when
# `AIRBYTE_NO_UV` is unset (or holds anything other than "1", "true", or "yes") and False when
# the variable explicitly disables uv -- i.e. the value reads as "uv is enabled", not "no uv".
# Confirm how callers interpret this constant before renaming it or inverting the expression.
NO_UV: bool = os.getenv("AIRBYTE_NO_UV", "").lower() not in {"1", "true", "yes"}
184"""Whether to use uv for Python package management.
185
186This value is determined by the `AIRBYTE_NO_UV` environment variable. When `AIRBYTE_NO_UV`
187is set to "1", "true", or "yes", uv will be disabled and pip will be used instead.
188
189If the variable is not set or set to any other value, uv will be used by default.
190This provides a safe fallback mechanism for environments where uv is not available
191or causes issues.
192"""
193
194SECRETS_HYDRATION_PREFIX = "secret_reference::"
195"""Use this prefix to indicate a secret reference in configuration.
196
197For example, this snippet will populate the `personal_access_token` field with the value of the
198secret named `GITHUB_PERSONAL_ACCESS_TOKEN`, for instance from an environment variable.
199
200```json
201{
202  "credentials": {
203    "personal_access_token": "secret_reference::GITHUB_PERSONAL_ACCESS_TOKEN"
204  }
205}
206```
207
208For more information, see the `airbyte.secrets` module documentation.
209"""
210
211# Cloud Constants
212
213CLOUD_CLIENT_ID_ENV_VAR: str = "AIRBYTE_CLOUD_CLIENT_ID"
214"""The environment variable name for the Airbyte Cloud client ID."""
215
216CLOUD_CLIENT_SECRET_ENV_VAR: str = "AIRBYTE_CLOUD_CLIENT_SECRET"
217"""The environment variable name for the Airbyte Cloud client secret."""
218
219CLOUD_API_ROOT_ENV_VAR: str = "AIRBYTE_CLOUD_API_URL"
220"""The environment variable name for the Airbyte Cloud API URL."""
221
222CLOUD_WORKSPACE_ID_ENV_VAR: str = "AIRBYTE_CLOUD_WORKSPACE_ID"
223"""The environment variable name for the Airbyte Cloud workspace ID."""
224
225CLOUD_API_ROOT: str = "https://api.airbyte.com/v1"
226"""The Airbyte Cloud API root URL.
227
228This is the root URL for the Airbyte Cloud API. It is used to interact with the Airbyte Cloud API
229and is the default API root for the `CloudWorkspace` class.
230- https://reference.airbyte.com/reference/getting-started
231"""
232
233CLOUD_CONFIG_API_ROOT: str = "https://cloud.airbyte.com/api/v1"
234"""Internal-Use API Root, aka Airbyte "Config API".
235
236Documentation:
237- https://docs.airbyte.com/api-documentation#configuration-api-deprecated
238- https://github.com/airbytehq/airbyte-platform-internal/blob/master/oss/airbyte-api/server-api/src/main/openapi/config.yaml
239"""
logger = <Logger airbyte (INFO)>
DEBUG_MODE = False
AB_EXTRACTED_AT_COLUMN = '_airbyte_extracted_at'

A column that stores the timestamp when the record was extracted.

AB_META_COLUMN = '_airbyte_meta'

A column that stores metadata about the record.

AB_RAW_ID_COLUMN = '_airbyte_raw_id'

A column that stores a unique identifier for each row in the source data.

Note: The interpretation of this column is slightly different from in Airbyte Dv2 destinations. In Airbyte Dv2 destinations, this column points to a row in a separate 'raw' table. In PyAirbyte, this column is simply used as a unique identifier for each record as it is received.

PyAirbyte uses ULIDs for this column, which are identifiers that can be sorted by time received. This allows us to determine the order of records as they are received, even if the source provides records that are tied or received out of order from the perspective of their emitted_at (_airbyte_extracted_at) timestamps.

AB_INTERNAL_COLUMNS = {'_airbyte_meta', '_airbyte_extracted_at', '_airbyte_raw_id'}

A set of internal columns that are reserved for PyAirbyte's internal use.

DEFAULT_PROJECT_DIR: pathlib.Path = PosixPath('/home/runner/work/PyAirbyte/PyAirbyte')

Default project directory.

Can be overridden by setting the AIRBYTE_PROJECT_DIR environment variable.

If not set, defaults to the current working directory.

This serves as the parent directory for both cache and install directories when not explicitly configured.

If a path is specified that does not yet exist, PyAirbyte will attempt to create it.

DEFAULT_INSTALL_DIR: pathlib.Path = PosixPath('/home/runner/work/PyAirbyte/PyAirbyte')

Default install directory for connectors.

If not set, defaults to DEFAULT_PROJECT_DIR (AIRBYTE_PROJECT_DIR env var) or the current working directory if neither is set.

If a path is specified that does not yet exist, PyAirbyte will attempt to create it.

DEFAULT_CACHE_ROOT: pathlib.Path = PosixPath('/home/runner/work/PyAirbyte/PyAirbyte/.cache')

Default cache root is .cache under DEFAULT_PROJECT_DIR (the working directory by default).

The default location can be overridden by setting the AIRBYTE_CACHE_ROOT environment variable.

Overriding this can be useful if you always want to store cache files in a specific location. For example, in ephemeral environments like Google Colab, you might want to store cache files in your mounted Google Drive by setting this to a path like /content/drive/MyDrive/Airbyte/cache.

DEFAULT_CACHE_SCHEMA_NAME = 'airbyte_raw'

The default schema name to use for caches.

Specific caches may override this value with a different schema name.

DEFAULT_GOOGLE_DRIVE_MOUNT_PATH = '/content/drive'

Default path to mount Google Drive in Google Colab environments.

DEFAULT_ARROW_MAX_CHUNK_SIZE = 100000

The default number of records to include in each batch of an Arrow dataset.

TEMP_DIR_OVERRIDE: pathlib.Path | None = None

The directory to use for temporary files.

This value is read from the AIRBYTE_TEMP_DIR environment variable. If the variable is not set, Tempfile will use the system's default temporary directory.

This can be useful if you want to store temporary files in a specific location (or) when you need your temporary files to exist in user level directories, and not in system level directories for permissions reasons.

TEMP_FILE_CLEANUP = True

Whether to clean up temporary files after use.

This value is read from the AIRBYTE_TEMP_FILE_CLEANUP environment variable. If the variable is not set, the default value is True.

AIRBYTE_OFFLINE_MODE = False

Enable or disable offline mode.

When offline mode is enabled, PyAirbyte will attempt to fetch metadata for connectors from the Airbyte registry but will not raise an error if the registry is unavailable. This can be useful in environments without internet access or with air-gapped networks.

Offline mode also disables telemetry, similar to a DO_NOT_TRACK setting, ensuring no usage data is sent from your environment. You may also set a custom registry URL via the _REGISTRY_ENV_VAR environment variable if you prefer to use a different registry source for metadata.

This setting helps you make informed choices about data privacy and operation in restricted and air-gapped environments.

AIRBYTE_PRINT_FULL_ERROR_LOGS: bool = True

Whether to print full error logs when an error occurs. This setting helps in debugging by providing detailed logs when errors occur. This is especially helpful in ephemeral environments like CI/CD pipelines where log files may not be persisted after the pipeline run.

If not set, the default value is False for non-CI environments. If running in a CI environment ("CI" env var is set), then the default value is True.

NO_UV: bool = True

Whether to use uv for Python package management.

This value is determined by the AIRBYTE_NO_UV environment variable. When AIRBYTE_NO_UV is set to "1", "true", or "yes", uv will be disabled and pip will be used instead.

If the variable is not set or set to any other value, uv will be used by default. This provides a safe fallback mechanism for environments where uv is not available or causes issues.

SECRETS_HYDRATION_PREFIX = 'secret_reference::'

Use this prefix to indicate a secret reference in configuration.

For example, this snippet will populate the personal_access_token field with the value of the secret named GITHUB_PERSONAL_ACCESS_TOKEN, for instance from an environment variable.

{
  "credentials": {
    "personal_access_token": "secret_reference::GITHUB_PERSONAL_ACCESS_TOKEN"
  }
}

For more information, see the airbyte.secrets module documentation.

CLOUD_CLIENT_ID_ENV_VAR: str = 'AIRBYTE_CLOUD_CLIENT_ID'

The environment variable name for the Airbyte Cloud client ID.

CLOUD_CLIENT_SECRET_ENV_VAR: str = 'AIRBYTE_CLOUD_CLIENT_SECRET'

The environment variable name for the Airbyte Cloud client secret.

CLOUD_API_ROOT_ENV_VAR: str = 'AIRBYTE_CLOUD_API_URL'

The environment variable name for the Airbyte Cloud API URL.

CLOUD_WORKSPACE_ID_ENV_VAR: str = 'AIRBYTE_CLOUD_WORKSPACE_ID'

The environment variable name for the Airbyte Cloud workspace ID.

CLOUD_API_ROOT: str = 'https://api.airbyte.com/v1'

The Airbyte Cloud API root URL.

This is the root URL for the Airbyte Cloud API. It is used to interact with the Airbyte Cloud API and is the default API root for the CloudWorkspace class.

CLOUD_CONFIG_API_ROOT: str = 'https://cloud.airbyte.com/api/v1'