airbyte.constants

Constants shared across the PyAirbyte codebase.

  1# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
  2"""Constants shared across the PyAirbyte codebase."""
  3
  4from __future__ import annotations
  5
  6import os
  7from pathlib import Path
  8
  9
 10DEBUG_MODE = False  # Set to True to enable additional debug logging.
 11
 12
 13AB_EXTRACTED_AT_COLUMN = "_airbyte_extracted_at"
 14"""A column that stores the timestamp when the record was extracted."""
 15
 16AB_META_COLUMN = "_airbyte_meta"
 17"""A column that stores metadata about the record."""
 18
 19AB_RAW_ID_COLUMN = "_airbyte_raw_id"
 20"""A column that stores a unique identifier for each row in the source data.
 21
 22Note: The interpretation of this column is slightly different from in Airbyte Dv2 destinations.
 23In Airbyte Dv2 destinations, this column points to a row in a separate 'raw' table. In PyAirbyte,
 24this column is simply used as a unique identifier for each record as it is received.
 25
 26PyAirbyte uses ULIDs for this column, which are identifiers that can be sorted by time
 27received. This allows us to determine the debug the order of records as they are received, even if
 28the source provides records that are tied or received out of order from the perspective of their
 29`emitted_at` (`_airbyte_extracted_at`) timestamps.
 30"""
 31
 32AB_INTERNAL_COLUMNS = {
 33    AB_RAW_ID_COLUMN,
 34    AB_EXTRACTED_AT_COLUMN,
 35    AB_META_COLUMN,
 36}
 37"""A set of internal columns that are reserved for PyAirbyte's internal use."""
 38
 39DEFAULT_CACHE_SCHEMA_NAME = "airbyte_raw"
 40"""The default schema name to use for caches.
 41
 42Specific caches may override this value with a different schema name.
 43"""
 44
 45DEFAULT_CACHE_ROOT: Path = (
 46    Path() / ".cache"
 47    if "AIRBYTE_CACHE_ROOT" not in os.environ
 48    else Path(os.environ["AIRBYTE_CACHE_ROOT"])
 49)
 50"""Default cache root is `.cache` in the current working directory.
 51
 52The default location can be overridden by setting the `AIRBYTE_CACHE_ROOT` environment variable.
 53
 54Overriding this can be useful if you always want to store cache files in a specific location.
 55For example, in ephemeral environments like Google Colab, you might want to store cache files in
 56your mounted Google Drive by setting this to a path like `/content/drive/MyDrive/Airbyte/cache`.
 57"""
 58
 59DEFAULT_PROJECT_DIR: Path = (
 60    Path(os.getenv("AIRBYTE_PROJECT_DIR", "") or Path.cwd()).expanduser().absolute()
 61)
 62"""Default project directory.
 63
 64Can be overridden by setting the `AIRBYTE_PROJECT_DIR` environment variable.
 65
 66If not set, defaults to the current working directory.
 67
 68This serves as the parent directory for both cache and install directories when not explicitly
 69configured.
 70"""
 71
 72DEFAULT_INSTALL_DIR: Path = (
 73    Path(os.getenv("AIRBYTE_INSTALL_DIR", "") or DEFAULT_PROJECT_DIR).expanduser().absolute()
 74)
 75"""Default install directory for connectors.
 76
 77If not set, defaults to `DEFAULT_PROJECT_DIR` (`AIRBYTE_PROJECT_DIR` env var) or the current
 78working directory if neither is set.
 79"""
 80
 81
 82DEFAULT_GOOGLE_DRIVE_MOUNT_PATH = "/content/drive"
 83"""Default path to mount Google Drive in Google Colab environments."""
 84
 85DEFAULT_ARROW_MAX_CHUNK_SIZE = 100_000
 86"""The default number of records to include in each batch of an Arrow dataset."""
 87
 88
 89def _str_to_bool(value: str) -> bool:
 90    """Convert a string value of an environment values to a boolean value."""
 91    return bool(value) and value.lower() not in {"", "0", "false", "f", "no", "n", "off"}
 92
 93
 94TEMP_DIR_OVERRIDE: Path | None = (
 95    Path(os.environ["AIRBYTE_TEMP_DIR"]) if os.getenv("AIRBYTE_TEMP_DIR") else None
 96)
 97"""The directory to use for temporary files.
 98
 99This value is read from the `AIRBYTE_TEMP_DIR` environment variable. If the variable is not set,
100Tempfile will use the system's default temporary directory.
101
102This can be useful if you want to store temporary files in a specific location (or) when you
103need your temporary files to exist in user level directories, and not in system level
104directories for permissions reasons.
105"""
106
TEMP_FILE_CLEANUP = _str_to_bool(os.getenv("AIRBYTE_TEMP_FILE_CLEANUP", "true"))
"""Whether temporary files are cleaned up after use.

The value comes from the `AIRBYTE_TEMP_FILE_CLEANUP` environment variable. When the variable
is not set, the default value is `True`.
"""

AIRBYTE_OFFLINE_MODE = _str_to_bool(os.getenv("AIRBYTE_OFFLINE_MODE", "false"))
"""Enable or disable offline mode.

The value comes from the `AIRBYTE_OFFLINE_MODE` environment variable. When the variable is
not set, the default value is `False`.

When offline mode is enabled, PyAirbyte will attempt to fetch metadata for connectors from the
Airbyte registry but will not raise an error if the registry is unavailable. This can be useful in
environments without internet access or with air-gapped networks.

Offline mode also disables telemetry, similar to a `DO_NOT_TRACK` setting, ensuring no usage data
is sent from your environment. You may also specify a custom registry URL via the
`_REGISTRY_ENV_VAR` environment variable if you prefer to use a different registry source for
metadata.

This setting helps you make informed choices about data privacy and operation in restricted and
air-gapped environments.
"""
138
AIRBYTE_PRINT_FULL_ERROR_LOGS: bool = _str_to_bool(
    os.getenv("AIRBYTE_PRINT_FULL_ERROR_LOGS", os.getenv("CI", "false"))
)
"""Whether to print full error logs when an error occurs.

This setting helps in debugging by providing detailed logs when errors occur. This is especially
helpful in ephemeral environments like CI/CD pipelines where log files may not be persisted after
the pipeline run.

The value comes from the `AIRBYTE_PRINT_FULL_ERROR_LOGS` environment variable. When that
variable is not set, the value of the `CI` environment variable is used instead, so the default
is `True` in CI environments (where `CI` holds a truthy value) and `False` elsewhere.
"""
153
NO_UV: bool = os.environ.get("AIRBYTE_NO_UV", "").lower() not in ("1", "true", "yes")
"""Whether to use uv for Python package management.

NOTE: Despite its name, this constant is `True` when uv is enabled and `False` when uv is
disabled.

The value is derived from the `AIRBYTE_NO_UV` environment variable. When `AIRBYTE_NO_UV` is
set to "1", "true", or "yes" (case-insensitive), this constant is `False`: uv will be disabled
and pip will be used instead.

If the variable is not set or set to any other value, this constant is `True` and uv will be
used by default. This provides a safe fallback mechanism for environments where uv is not
available or causes issues.
"""
164
SECRETS_HYDRATION_PREFIX = "secret_reference::"
"""Prefix that marks a configuration value as a reference to a secret.

For example, the snippet below populates the `personal_access_token` field with the value of
the secret named `GITHUB_PERSONAL_ACCESS_TOKEN`, for instance from an environment variable.

```json
{
  "credentials": {
    "personal_access_token": "secret_reference::GITHUB_PERSONAL_ACCESS_TOKEN"
  }
}
```

For more information, see the `airbyte.secrets` module documentation.
"""
DEBUG_MODE = False
AB_EXTRACTED_AT_COLUMN = '_airbyte_extracted_at'

A column that stores the timestamp when the record was extracted.

AB_META_COLUMN = '_airbyte_meta'

A column that stores metadata about the record.

AB_RAW_ID_COLUMN = '_airbyte_raw_id'

A column that stores a unique identifier for each row in the source data.

Note: The interpretation of this column is slightly different from in Airbyte Dv2 destinations. In Airbyte Dv2 destinations, this column points to a row in a separate 'raw' table. In PyAirbyte, this column is simply used as a unique identifier for each record as it is received.

PyAirbyte uses ULIDs for this column, which are identifiers that can be sorted by time received. This allows us to determine and debug the order of records as they are received, even if the source provides records that are tied or received out of order from the perspective of their emitted_at (_airbyte_extracted_at) timestamps.

AB_INTERNAL_COLUMNS = {'_airbyte_extracted_at', '_airbyte_meta', '_airbyte_raw_id'}

A set of internal columns that are reserved for PyAirbyte's internal use.

DEFAULT_CACHE_SCHEMA_NAME = 'airbyte_raw'

The default schema name to use for caches.

Specific caches may override this value with a different schema name.

DEFAULT_CACHE_ROOT: pathlib.Path = PosixPath('.cache')

Default cache root is .cache in the current working directory.

The default location can be overridden by setting the AIRBYTE_CACHE_ROOT environment variable.

Overriding this can be useful if you always want to store cache files in a specific location. For example, in ephemeral environments like Google Colab, you might want to store cache files in your mounted Google Drive by setting this to a path like /content/drive/MyDrive/Airbyte/cache.

DEFAULT_PROJECT_DIR: pathlib.Path = PosixPath('/home/runner/work/PyAirbyte/PyAirbyte')

Default project directory.

Can be overridden by setting the AIRBYTE_PROJECT_DIR environment variable.

If not set, defaults to the current working directory.

This serves as the parent directory for both cache and install directories when not explicitly configured.

DEFAULT_INSTALL_DIR: pathlib.Path = PosixPath('/home/runner/work/PyAirbyte/PyAirbyte')

Default install directory for connectors.

If not set, defaults to DEFAULT_PROJECT_DIR (AIRBYTE_PROJECT_DIR env var) or the current working directory if neither is set.

DEFAULT_GOOGLE_DRIVE_MOUNT_PATH = '/content/drive'

Default path to mount Google Drive in Google Colab environments.

DEFAULT_ARROW_MAX_CHUNK_SIZE = 100000

The default number of records to include in each batch of an Arrow dataset.

TEMP_DIR_OVERRIDE: pathlib.Path | None = None

The directory to use for temporary files.

This value is read from the AIRBYTE_TEMP_DIR environment variable. If the variable is not set, Tempfile will use the system's default temporary directory.

This can be useful if you want to store temporary files in a specific location, or when you need your temporary files to exist in user-level directories rather than system-level directories for permissions reasons.

TEMP_FILE_CLEANUP = True

Whether to clean up temporary files after use.

This value is read from the AIRBYTE_TEMP_FILE_CLEANUP environment variable. If the variable is not set, the default value is True.

AIRBYTE_OFFLINE_MODE = False

Enable or disable offline mode.

When offline mode is enabled, PyAirbyte will attempt to fetch metadata for connectors from the Airbyte registry but will not raise an error if the registry is unavailable. This can be useful in environments without internet access or with air-gapped networks.

Offline mode also disables telemetry, similar to a DO_NOT_TRACK setting, ensuring no usage data is sent from your environment. You may also specify a custom registry URL via the _REGISTRY_ENV_VAR environment variable if you prefer to use a different registry source for metadata.

This setting helps you make informed choices about data privacy and operation in restricted and air-gapped environments.

AIRBYTE_PRINT_FULL_ERROR_LOGS: bool = True

Whether to print full error logs when an error occurs. This setting helps in debugging by providing detailed logs when errors occur. This is especially helpful in ephemeral environments like CI/CD pipelines where log files may not be persisted after the pipeline run.

If not set, the default value is False for non-CI environments. If running in a CI environment ("CI" env var is set), then the default value is True.

NO_UV: bool = True

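Whether to use uv for Python package management. Note that, despite its name, this constant is True when uv is enabled and False when uv is disabled.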

This value is determined by the AIRBYTE_NO_UV environment variable. When AIRBYTE_NO_UV is set to "1", "true", or "yes", uv will be disabled and pip will be used instead.

If the variable is not set or set to any other value, uv will be used by default. This provides a safe fallback mechanism for environments where uv is not available or causes issues.

SECRETS_HYDRATION_PREFIX = 'secret_reference::'

Use this prefix to indicate a secret reference in configuration.

For example, this snippet will populate the personal_access_token field with the value of the secret named GITHUB_PERSONAL_ACCESS_TOKEN, for instance from an environment variable.

{
  "credentials": {
    "personal_access_token": "secret_reference::GITHUB_PERSONAL_ACCESS_TOKEN"
  }
}

For more information, see the airbyte.secrets module documentation.