airbyte.caches

Base module for all caches.

View Source

 1# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 2"""Base module for all caches."""
 3
 4from __future__ import annotations
 5
 6from typing import TYPE_CHECKING
 7
 8from airbyte.caches.base import CacheBase
 9from airbyte.caches.bigquery import BigQueryCache
10from airbyte.caches.duckdb import DuckDBCache
11from airbyte.caches.motherduck import MotherDuckCache
12from airbyte.caches.postgres import PostgresCache
13from airbyte.caches.snowflake import SnowflakeCache
14from airbyte.caches.util import get_default_cache, new_local_cache
15
16
17# Submodules imported here for documentation reasons: https://github.com/mitmproxy/pdoc/issues/757
18if TYPE_CHECKING:
19    # ruff: noqa: TC004
20    from airbyte.caches import base, bigquery, duckdb, motherduck, postgres, snowflake, util
21
22# We export these classes for easy access: `airbyte.caches...`
23__all__ = [
24    # Factories
25    "get_default_cache",
26    "new_local_cache",
27    # Classes
28    "BigQueryCache",
29    "CacheBase",
30    "DuckDBCache",
31    "MotherDuckCache",
32    "PostgresCache",
33    "SnowflakeCache",
34    # Submodules,
35    "util",
36    "bigquery",
37    "duckdb",
38    "motherduck",
39    "postgres",
40    "snowflake",
41    "base",
42]

def get_default_cache() -> DuckDBCache: View Source

31def get_default_cache() -> DuckDBCache:
32    """Get a local cache for storing data, using the default database path.
33
34    Cache files are stored in the `.cache` directory, relative to the current
35    working directory.
36    """
37    cache_dir = DEFAULT_CACHE_ROOT / "default_cache"
38    return DuckDBCache(
39        db_path=cache_dir / "default_cache.duckdb",
40        cache_dir=cache_dir,
41    )

Get a local cache for storing data, using the default database path.

Cache files are stored in the .cache directory, relative to the current working directory.

def new_local_cache( cache_name: str | None = None, cache_dir: str | pathlib.Path | None = None, *, cleanup: bool = True) -> DuckDBCache: View Source

44def new_local_cache(
45    cache_name: str | None = None,
46    cache_dir: str | Path | None = None,
47    *,
48    cleanup: bool = True,
49) -> DuckDBCache:
50    """Get a local cache for storing data, using a name string to seed the path.
51
52    Args:
53        cache_name: Name to use for the cache. Defaults to None.
54        cache_dir: Root directory to store the cache in. Defaults to None.
55        cleanup: Whether to clean up temporary files. Defaults to True.
56
57    Cache files are stored in the `.cache` directory, relative to the current
58    working directory.
59    """
60    if cache_name:
61        if " " in cache_name:
62            raise exc.PyAirbyteInputError(
63                message="Cache name cannot contain spaces.",
64                input_value=cache_name,
65            )
66
67        if not cache_name.replace("_", "").isalnum():
68            raise exc.PyAirbyteInputError(
69                message="Cache name can only contain alphanumeric characters and underscores.",
70                input_value=cache_name,
71            )
72
73    cache_name = cache_name or str(ulid.ULID())
74    cache_dir = cache_dir or (DEFAULT_CACHE_ROOT / cache_name)
75    if not isinstance(cache_dir, Path):
76        cache_dir = Path(cache_dir)
77
78    return DuckDBCache(
79        db_path=cache_dir / f"db_{cache_name}.duckdb",
80        cache_dir=cache_dir,
81        cleanup=cleanup,
82    )

Get a local cache for storing data, using a name string to seed the path.

Arguments:

cache_name: Name to use for the cache. Defaults to None.
cache_dir: Root directory to store the cache in. Defaults to None.
cleanup: Whether to clean up temporary files. Defaults to True.

Cache files are stored in the .cache directory, relative to the current working directory.

class BigQueryCache(airbyte._processors.sql.bigquery.BigQueryConfig, airbyte.caches.CacheBase): View Source

39class BigQueryCache(BigQueryConfig, CacheBase):
40    """The BigQuery cache implementation."""
41
42    _sql_processor_class: ClassVar[type[SqlProcessorBase]] = BigQuerySqlProcessor
43
44    paired_destination_name: ClassVar[str | None] = "destination-bigquery"
45    paired_destination_config_class: ClassVar[type | None] = DestinationBigquery
46
47    @property
48    def paired_destination_config(self) -> DestinationBigquery:
49        """Return a dictionary of destination configuration values."""
50        return bigquery_cache_to_destination_configuration(cache=self)
51
52    def get_arrow_dataset(
53        self,
54        stream_name: str,
55        *,
56        max_chunk_size: int = DEFAULT_ARROW_MAX_CHUNK_SIZE,
57    ) -> NoReturn:
58        """Raises NotImplementedError; BigQuery doesn't support `pd.read_sql_table`.
59
60        See: https://github.com/airbytehq/PyAirbyte/issues/165
61        """
62        raise NotImplementedError(
63            "BigQuery doesn't currently support to_arrow"
64            "Please consider using a different cache implementation for these functionalities."
65        )

The BigQuery cache implementation.

paired_destination_name: ClassVar[str | None] = 'destination-bigquery'

paired_destination_config_class: ClassVar[type | None] = <class 'airbyte_api.models.destination_bigquery.DestinationBigquery'>

paired_destination_config: airbyte_api.models.destination_bigquery.DestinationBigquery View Source

47    @property
48    def paired_destination_config(self) -> DestinationBigquery:
49        """Return a dictionary of destination configuration values."""
50        return bigquery_cache_to_destination_configuration(cache=self)

Return a dictionary of destination configuration values.

def get_arrow_dataset(self, stream_name: str, *, max_chunk_size: int = 100000) -> NoReturn: View Source

52    def get_arrow_dataset(
53        self,
54        stream_name: str,
55        *,
56        max_chunk_size: int = DEFAULT_ARROW_MAX_CHUNK_SIZE,
57    ) -> NoReturn:
58        """Raises NotImplementedError; BigQuery doesn't support `pd.read_sql_table`.
59
60        See: https://github.com/airbytehq/PyAirbyte/issues/165
61        """
62        raise NotImplementedError(
63            "BigQuery doesn't currently support to_arrow"
64            "Please consider using a different cache implementation for these functionalities."
65        )

Raises NotImplementedError; BigQuery doesn't support pd.read_sql_table.

See: https://github.com/airbytehq/PyAirbyte/issues/165

model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None: View Source

337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Arguments:

self: The BaseModel instance.
context: The context.

Inherited Members

CacheBase: CacheBase; cache_dir; cleanup; close; config_hash; execute_sql; processor; run_sql_query; get_record_processor; get_records; get_pandas_dataframe; streams; get_state_provider; get_state_writer; register_source; create_source_tables
airbyte._processors.sql.bigquery.BigQueryConfig: database_name; schema_name; credentials_path; dataset_location; project_name; dataset_name; get_sql_alchemy_url; get_database_name; get_vendor_client
airbyte.shared.sql_processor.SqlConfig: table_prefix; get_create_table_extra_clauses; get_sql_alchemy_connect_args; get_sql_engine; dispose_engine
pydantic.main.BaseModel: model_fields; model_computed_fields; model_extra; model_fields_set; model_construct; model_copy; model_dump; model_dump_json; model_json_schema; model_parametrized_name; model_rebuild; model_validate; model_validate_json; model_validate_strings; dict; json; parse_obj; parse_raw; parse_file; from_orm; construct; copy; schema; schema_json; validate; update_forward_refs
airbyte._writers.base.AirbyteWriterInterface: name

class DuckDBCache(airbyte._processors.sql.duckdb.DuckDBConfig, airbyte.caches.CacheBase): View Source

44class DuckDBCache(DuckDBConfig, CacheBase):
45    """A DuckDB cache."""
46
47    _sql_processor_class: ClassVar[type[SqlProcessorBase]] = DuckDBSqlProcessor
48
49    paired_destination_name: ClassVar[str | None] = "destination-duckdb"
50    paired_destination_config_class: ClassVar[type | None] = DestinationDuckdb
51
52    @property
53    def paired_destination_config(self) -> DestinationDuckdb:
54        """Return a dictionary of destination configuration values."""
55        return duckdb_cache_to_destination_configuration(cache=self)

A DuckDB cache.

paired_destination_name: ClassVar[str | None] = 'destination-duckdb'

paired_destination_config_class: ClassVar[type | None] = <class 'airbyte_api.models.destination_duckdb.DestinationDuckdb'>

paired_destination_config: airbyte_api.models.destination_duckdb.DestinationDuckdb View Source

52    @property
53    def paired_destination_config(self) -> DestinationDuckdb:
54        """Return a dictionary of destination configuration values."""
55        return duckdb_cache_to_destination_configuration(cache=self)

Return a dictionary of destination configuration values.

model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None: View Source

337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Arguments:

self: The BaseModel instance.
context: The context.

Inherited Members

CacheBase: CacheBase; cache_dir; cleanup; close; config_hash; execute_sql; processor; run_sql_query; get_record_processor; get_records; get_pandas_dataframe; get_arrow_dataset; streams; get_state_provider; get_state_writer; register_source; create_source_tables
airbyte._processors.sql.duckdb.DuckDBConfig: db_path; schema_name; get_sql_alchemy_url; get_database_name; get_sql_engine
airbyte.shared.sql_processor.SqlConfig: table_prefix; get_create_table_extra_clauses; get_sql_alchemy_connect_args; dispose_engine; get_vendor_client
pydantic.main.BaseModel: model_fields; model_computed_fields; model_extra; model_fields_set; model_construct; model_copy; model_dump; model_dump_json; model_json_schema; model_parametrized_name; model_rebuild; model_validate; model_validate_json; model_validate_strings; dict; json; parse_obj; parse_raw; parse_file; from_orm; construct; copy; schema; schema_json; validate; update_forward_refs
airbyte._writers.base.AirbyteWriterInterface: name

class MotherDuckCache(airbyte.caches.motherduck.MotherDuckConfig, airbyte.caches.DuckDBCache): View Source

72class MotherDuckCache(MotherDuckConfig, DuckDBCache):
73    """Cache that uses MotherDuck for external persistent storage."""
74
75    _sql_processor_class: ClassVar[type[SqlProcessorBase]] = MotherDuckSqlProcessor
76
77    paired_destination_name: ClassVar[str | None] = "destination-bigquery"
78    paired_destination_config_class: ClassVar[type | None] = DestinationDuckdb
79
80    @property
81    def paired_destination_config(self) -> DestinationDuckdb:
82        """Return a dictionary of destination configuration values."""
83        return motherduck_cache_to_destination_configuration(cache=self)

Cache that uses MotherDuck for external persistent storage.

paired_destination_name: ClassVar[str | None] = 'destination-bigquery'

paired_destination_config_class: ClassVar[type | None] = <class 'airbyte_api.models.destination_duckdb.DestinationDuckdb'>

paired_destination_config: airbyte_api.models.destination_duckdb.DestinationDuckdb View Source

80    @property
81    def paired_destination_config(self) -> DestinationDuckdb:
82        """Return a dictionary of destination configuration values."""
83        return motherduck_cache_to_destination_configuration(cache=self)

Return a dictionary of destination configuration values.

model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None: View Source

337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Arguments:

self: The BaseModel instance.
context: The context.

Inherited Members

CacheBase: CacheBase; cache_dir; cleanup; close; config_hash; execute_sql; processor; run_sql_query; get_record_processor; get_records; get_pandas_dataframe; get_arrow_dataset; streams; get_state_provider; get_state_writer; register_source; create_source_tables
airbyte.caches.motherduck.MotherDuckConfig: database; api_key; db_path; get_sql_alchemy_url; get_database_name
airbyte._processors.sql.duckdb.DuckDBConfig: schema_name; get_sql_engine
airbyte.shared.sql_processor.SqlConfig: table_prefix; get_create_table_extra_clauses; get_sql_alchemy_connect_args; dispose_engine; get_vendor_client
pydantic.main.BaseModel: model_fields; model_computed_fields; model_extra; model_fields_set; model_construct; model_copy; model_dump; model_dump_json; model_json_schema; model_parametrized_name; model_rebuild; model_validate; model_validate_json; model_validate_strings; dict; json; parse_obj; parse_raw; parse_file; from_orm; construct; copy; schema; schema_json; validate; update_forward_refs
airbyte._writers.base.AirbyteWriterInterface: name

class PostgresCache(airbyte._processors.sql.postgres.PostgresConfig, airbyte.caches.CacheBase): View Source

38class PostgresCache(PostgresConfig, CacheBase):
39    """Configuration for the Postgres cache.
40
41    Also inherits config from the JsonlWriter, which is responsible for writing files to disk.
42    """
43
44    _sql_processor_class: ClassVar[type[SqlProcessorBase]] = PostgresSqlProcessor
45
46    paired_destination_name: ClassVar[str | None] = "destination-bigquery"
47    paired_destination_config_class: ClassVar[type | None] = DestinationPostgres
48
49    @property
50    def paired_destination_config(self) -> DestinationPostgres:
51        """Return a dictionary of destination configuration values."""
52        return postgres_cache_to_destination_configuration(cache=self)
53
54    def clone_as_cloud_destination_config(self) -> DestinationPostgres:
55        """Return a DestinationPostgres instance with the same configuration."""
56        return DestinationPostgres(
57            host=self.host,
58            port=self.port,
59            username=self.username,
60            password=self.password,
61            database=self.database,
62        )

Configuration for the Postgres cache.

Also inherits config from the JsonlWriter, which is responsible for writing files to disk.

paired_destination_name: ClassVar[str | None] = 'destination-bigquery'

paired_destination_config_class: ClassVar[type | None] = <class 'airbyte_api.models.destination_postgres.DestinationPostgres'>

paired_destination_config: airbyte_api.models.destination_postgres.DestinationPostgres View Source

49    @property
50    def paired_destination_config(self) -> DestinationPostgres:
51        """Return a dictionary of destination configuration values."""
52        return postgres_cache_to_destination_configuration(cache=self)

Return a dictionary of destination configuration values.

def clone_as_cloud_destination_config(self) -> airbyte_api.models.destination_postgres.DestinationPostgres: View Source

54    def clone_as_cloud_destination_config(self) -> DestinationPostgres:
55        """Return a DestinationPostgres instance with the same configuration."""
56        return DestinationPostgres(
57            host=self.host,
58            port=self.port,
59            username=self.username,
60            password=self.password,
61            database=self.database,
62        )

Return a DestinationPostgres instance with the same configuration.

model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None: View Source

337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Arguments:

self: The BaseModel instance.
context: The context.

Inherited Members

CacheBase: CacheBase; cache_dir; cleanup; close; config_hash; execute_sql; processor; run_sql_query; get_record_processor; get_records; get_pandas_dataframe; get_arrow_dataset; streams; get_state_provider; get_state_writer; register_source; create_source_tables
airbyte._processors.sql.postgres.PostgresConfig: host; port; database; username; password; get_sql_alchemy_url; get_database_name
airbyte.shared.sql_processor.SqlConfig: schema_name; table_prefix; get_create_table_extra_clauses; get_sql_alchemy_connect_args; get_sql_engine; dispose_engine; get_vendor_client
pydantic.main.BaseModel: model_fields; model_computed_fields; model_extra; model_fields_set; model_construct; model_copy; model_dump; model_dump_json; model_json_schema; model_parametrized_name; model_rebuild; model_validate; model_validate_json; model_validate_strings; dict; json; parse_obj; parse_raw; parse_file; from_orm; construct; copy; schema; schema_json; validate; update_forward_refs
airbyte._writers.base.AirbyteWriterInterface: name

class SnowflakeCache(airbyte._processors.sql.snowflake.SnowflakeConfig, airbyte.caches.CacheBase): View Source

75class SnowflakeCache(SnowflakeConfig, CacheBase):
76    """Configuration for the Snowflake cache."""
77
78    dedupe_mode: RecordDedupeMode = RecordDedupeMode.APPEND
79
80    _sql_processor_class: ClassVar[type[SqlProcessorBase]] = SnowflakeSqlProcessor
81
82    paired_destination_name: ClassVar[str | None] = "destination-bigquery"
83    paired_destination_config_class: ClassVar[type | None] = DestinationSnowflake
84
85    @property
86    def paired_destination_config(self) -> DestinationSnowflake:
87        """Return a dictionary of destination configuration values."""
88        return snowflake_cache_to_destination_configuration(cache=self)

Configuration for the Snowflake cache.

dedupe_mode: airbyte.shared.sql_processor.RecordDedupeMode

paired_destination_name: ClassVar[str | None] = 'destination-bigquery'

paired_destination_config_class: ClassVar[type | None] = <class 'airbyte_api.models.destination_snowflake.DestinationSnowflake'>

paired_destination_config: airbyte_api.models.destination_snowflake.DestinationSnowflake View Source

85    @property
86    def paired_destination_config(self) -> DestinationSnowflake:
87        """Return a dictionary of destination configuration values."""
88        return snowflake_cache_to_destination_configuration(cache=self)

Return a dictionary of destination configuration values.

model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None: View Source

337def init_private_attributes(self: BaseModel, context: Any, /) -> None:
338    """This function is meant to behave like a BaseModel method to initialise private attributes.
339
340    It takes context as an argument since that's what pydantic-core passes when calling it.
341
342    Args:
343        self: The BaseModel instance.
344        context: The context.
345    """
346    if getattr(self, '__pydantic_private__', None) is None:
347        pydantic_private = {}
348        for name, private_attr in self.__private_attributes__.items():
349            default = private_attr.get_default()
350            if default is not PydanticUndefined:
351                pydantic_private[name] = default
352        object_setattr(self, '__pydantic_private__', pydantic_private)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that's what pydantic-core passes when calling it.

Arguments:

self: The BaseModel instance.
context: The context.

Inherited Members

CacheBase: CacheBase; cache_dir; cleanup; close; config_hash; execute_sql; processor; run_sql_query; get_record_processor; get_records; get_pandas_dataframe; get_arrow_dataset; streams; get_state_provider; get_state_writer; register_source; create_source_tables
airbyte._processors.sql.snowflake.SnowflakeConfig: account; username; password; private_key; private_key_path; private_key_passphrase; warehouse; database; role; schema_name; data_retention_time_in_days; get_sql_alchemy_connect_args; get_create_table_extra_clauses; get_database_name; get_sql_alchemy_url; get_vendor_client
airbyte.shared.sql_processor.SqlConfig: table_prefix; get_sql_engine; dispose_engine
pydantic.main.BaseModel: model_fields; model_computed_fields; model_extra; model_fields_set; model_construct; model_copy; model_dump; model_dump_json; model_json_schema; model_parametrized_name; model_rebuild; model_validate; model_validate_json; model_validate_strings; dict; json; parse_obj; parse_raw; parse_file; from_orm; construct; copy; schema; schema_json; validate; update_forward_refs
airbyte._writers.base.AirbyteWriterInterface: name

airbyte.caches

Arguments:

Arguments:

Inherited Members

Raises:

Arguments:

Returns:

Arguments:

Arguments:

Inherited Members

Arguments:

Inherited Members

Arguments:

Inherited Members

Arguments:

Inherited Members

Arguments:

Inherited Members