airbyte.constants

Constants shared across the PyAirbyte codebase.

  1# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
  2"""Constants shared across the PyAirbyte codebase."""
  3
  4from __future__ import annotations
  5
  6import os
  7from pathlib import Path
  8
  9
 10DEBUG_MODE = False  # Set to True to enable additional debug logging.
 11
 12
 13AB_EXTRACTED_AT_COLUMN = "_airbyte_extracted_at"
 14"""A column that stores the timestamp when the record was extracted."""
 15
 16AB_META_COLUMN = "_airbyte_meta"
 17"""A column that stores metadata about the record."""
 18
 19AB_RAW_ID_COLUMN = "_airbyte_raw_id"
 20"""A column that stores a unique identifier for each row in the source data.
 21
 22Note: The interpretation of this column is slightly different from in Airbyte Dv2 destinations.
 23In Airbyte Dv2 destinations, this column points to a row in a separate 'raw' table. In PyAirbyte,
 24this column is simply used as a unique identifier for each record as it is received.
 25
 26PyAirbyte uses ULIDs for this column, which are identifiers that can be sorted by time
 27received. This allows us to determine the debug the order of records as they are received, even if
 28the source provides records that are tied or received out of order from the perspective of their
 29`emitted_at` (`_airbyte_extracted_at`) timestamps.
 30"""
 31
 32AB_INTERNAL_COLUMNS = {
 33    AB_RAW_ID_COLUMN,
 34    AB_EXTRACTED_AT_COLUMN,
 35    AB_META_COLUMN,
 36}
 37"""A set of internal columns that are reserved for PyAirbyte's internal use."""
 38
 39DEFAULT_CACHE_SCHEMA_NAME = "airbyte_raw"
 40"""The default schema name to use for caches.
 41
 42Specific caches may override this value with a different schema name.
 43"""
 44
 45DEFAULT_CACHE_ROOT: Path = (
 46    Path() / ".cache"
 47    if "AIRBYTE_CACHE_ROOT" not in os.environ
 48    else Path(os.environ["AIRBYTE_CACHE_ROOT"])
 49)
 50"""Default cache root is `.cache` in the current working directory.
 51
 52The default location can be overridden by setting the `AIRBYTE_CACHE_ROOT` environment variable.
 53
 54Overriding this can be useful if you always want to store cache files in a specific location.
 55For example, in ephemeral environments like Google Colab, you might want to store cache files in
 56your mounted Google Drive by setting this to a path like `/content/drive/MyDrive/Airbyte/cache`.
 57"""
 58
 59DEFAULT_ARROW_MAX_CHUNK_SIZE = 100_000
 60"""The default number of records to include in each batch of an Arrow dataset."""
 61
 62
 63def _str_to_bool(value: str) -> bool:
 64    """Convert a string value of an environment values to a boolean value."""
 65    return bool(value) and value.lower() not in {"", "0", "false", "f", "no", "n", "off"}
 66
 67
 68TEMP_DIR_OVERRIDE: Path | None = (
 69    Path(os.environ["AIRBYTE_TEMP_DIR"]) if os.getenv("AIRBYTE_TEMP_DIR") else None
 70)
 71"""The directory to use for temporary files.
 72
 73This value is read from the `AIRBYTE_TEMP_DIR` environment variable. If the variable is not set,
 74Tempfile will use the system's default temporary directory.
 75
 76This can be useful if you want to store temporary files in a specific location (or) when you
 77need your temporary files to exist in user level directories, and not in system level
 78directories for permissions reasons.
 79"""
 80
 81TEMP_FILE_CLEANUP = _str_to_bool(
 82    os.getenv(
 83        key="AIRBYTE_TEMP_FILE_CLEANUP",
 84        default="true",
 85    )
 86)
 87"""Whether to clean up temporary files after use.
 88
 89This value is read from the `AIRBYTE_TEMP_FILE_CLEANUP` environment variable. If the variable is
 90not set, the default value is `True`.
 91"""
 92
 93AIRBYTE_OFFLINE_MODE = _str_to_bool(
 94    os.getenv(
 95        key="AIRBYTE_OFFLINE_MODE",
 96        default="false",
 97    )
 98)
 99"""Enable or disable offline mode.
100
101When offline mode is enabled, PyAirbyte will attempt to fetch metadata for connectors from the
102Airbyte registry but will not raise an error if the registry is unavailable. This can be useful in
103environments without internet access or with air-gapped networks.
104
105Offline mode also disables telemetry, similar to a `DO_NOT_TRACK` setting, ensuring no usage data
106is sent from your environment. You may also specify a custom registry URL via the`_REGISTRY_ENV_VAR`
107environment variable if you prefer to use a different registry source for metadata.
108
109This setting helps you make informed choices about data privacy and operation in restricted and
110air-gapped environments.
111"""
DEBUG_MODE = False
AB_EXTRACTED_AT_COLUMN = '_airbyte_extracted_at'

A column that stores the timestamp when the record was extracted.

AB_META_COLUMN = '_airbyte_meta'

A column that stores metadata about the record.

AB_RAW_ID_COLUMN = '_airbyte_raw_id'

A column that stores a unique identifier for each row in the source data.

Note: This column is interpreted slightly differently than in Airbyte Dv2 destinations. In Airbyte Dv2 destinations, this column points to a row in a separate 'raw' table. In PyAirbyte, this column is simply used as a unique identifier for each record as it is received.

PyAirbyte uses ULIDs for this column, which are identifiers that can be sorted by time received. This allows us to determine and debug the order of records as they are received, even if the source provides records that are tied or received out of order from the perspective of their emitted_at (_airbyte_extracted_at) timestamps.

AB_INTERNAL_COLUMNS = {'_airbyte_extracted_at', '_airbyte_raw_id', '_airbyte_meta'}

A set of internal columns that are reserved for PyAirbyte's internal use.

DEFAULT_CACHE_SCHEMA_NAME = 'airbyte_raw'

The default schema name to use for caches.

Specific caches may override this value with a different schema name.

DEFAULT_CACHE_ROOT: pathlib.Path = PosixPath('.cache')

Default cache root is .cache in the current working directory.

The default location can be overridden by setting the AIRBYTE_CACHE_ROOT environment variable.

Overriding this can be useful if you always want to store cache files in a specific location. For example, in ephemeral environments like Google Colab, you might want to store cache files in your mounted Google Drive by setting this to a path like /content/drive/MyDrive/Airbyte/cache.

DEFAULT_ARROW_MAX_CHUNK_SIZE = 100000

The default number of records to include in each batch of an Arrow dataset.

TEMP_DIR_OVERRIDE: pathlib.Path | None = None

The directory to use for temporary files.

This value is read from the AIRBYTE_TEMP_DIR environment variable. If the variable is not set, Tempfile will use the system's default temporary directory.

This can be useful if you want to store temporary files in a specific location, or when you need your temporary files to exist in user-level directories rather than system-level directories for permissions reasons.

TEMP_FILE_CLEANUP = True

Whether to clean up temporary files after use.

This value is read from the AIRBYTE_TEMP_FILE_CLEANUP environment variable. If the variable is not set, the default value is True.

AIRBYTE_OFFLINE_MODE = False

Enable or disable offline mode.

When offline mode is enabled, PyAirbyte will attempt to fetch metadata for connectors from the Airbyte registry but will not raise an error if the registry is unavailable. This can be useful in environments without internet access or with air-gapped networks.

Offline mode also disables telemetry, similar to a DO_NOT_TRACK setting, ensuring no usage data is sent from your environment. You may also specify a custom registry URL via the _REGISTRY_ENV_VAR environment variable if you prefer to use a different registry source for metadata.

This setting helps you make informed choices about data privacy and operation in restricted and air-gapped environments.