airbyte.caches.base
SQL Cache implementation.
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
"""SQL Cache implementation."""

from __future__ import annotations

import contextlib
from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, final

import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
from pydantic import Field, PrivateAttr
from sqlalchemy import exc as sqlalchemy_exc
from sqlalchemy import text
from typing_extensions import Self

from airbyte_protocol.models import ConfiguredAirbyteCatalog

from airbyte import constants
from airbyte._writers.base import AirbyteWriterInterface
from airbyte.caches._catalog_backend import CatalogBackendBase, SqlCatalogBackend
from airbyte.caches._state_backend import SqlStateBackend
from airbyte.constants import DEFAULT_ARROW_MAX_CHUNK_SIZE, TEMP_FILE_CLEANUP
from airbyte.datasets._sql import CachedDataset
from airbyte.shared.catalog_providers import CatalogProvider
from airbyte.shared.sql_processor import SqlConfig
from airbyte.shared.state_writers import StdOutStateWriter


if TYPE_CHECKING:
    from collections.abc import Iterator
    from types import TracebackType

    from airbyte._message_iterators import AirbyteMessageIterator
    from airbyte.caches._state_backend_base import StateBackendBase
    from airbyte.progress import ProgressTracker
    from airbyte.shared.sql_processor import SqlProcessorBase
    from airbyte.shared.state_providers import StateProviderBase
    from airbyte.shared.state_writers import StateWriterBase
    from airbyte.sources.base import Source
    from airbyte.strategies import WriteStrategy
class CacheBase(SqlConfig, AirbyteWriterInterface):  # noqa: PLR0904
    """Base configuration for a cache.

    Caches inherit from the matching `SqlConfig` class, which provides the SQL config settings
    and basic connectivity to the SQL database.

    The cache is responsible for managing the state of the data synced to the cache, including the
    stream catalog and stream state. The cache also provides the mechanism to read and write data
    to the SQL backend specified in the `SqlConfig` class.
    """

    cache_dir: Path = Field(default=Path(constants.DEFAULT_CACHE_ROOT))
    """The directory to store the cache in."""

    cleanup: bool = TEMP_FILE_CLEANUP
    """Whether to clean up the cache after use."""

    _name: str = PrivateAttr()

    _sql_processor_class: ClassVar[type[SqlProcessorBase]]
    _read_processor: SqlProcessorBase = PrivateAttr()

    _catalog_backend: CatalogBackendBase = PrivateAttr()
    _state_backend: StateBackendBase = PrivateAttr()

    paired_destination_name: ClassVar[str | None] = None
    paired_destination_config_class: ClassVar[type | None] = None

    @property
    def paired_destination_config(self) -> Any | dict[str, Any]:  # noqa: ANN401  # Allow Any return type
        """Return a dictionary of destination configuration values."""
        raise NotImplementedError(
            f"The type '{type(self).__name__}' does not define an equivalent destination "
            "configuration."
        )

    def __init__(self, **data: Any) -> None:  # noqa: ANN401
        """Initialize the cache and backends."""
        super().__init__(**data)

        # Create a temporary processor to do the work of ensuring the schema exists
        temp_processor = self._sql_processor_class(
            sql_config=self,
            catalog_provider=CatalogProvider(ConfiguredAirbyteCatalog(streams=[])),
            state_writer=StdOutStateWriter(),
            temp_dir=self.cache_dir,
            temp_file_cleanup=self.cleanup,
        )
        temp_processor._ensure_schema_exists()  # noqa: SLF001  # Accessing non-public member

        # Initialize the catalog and state backends
        self._catalog_backend = SqlCatalogBackend(
            sql_config=self,
            table_prefix=self.table_prefix or "",
        )
        self._state_backend = SqlStateBackend(
            sql_config=self,
            table_prefix=self.table_prefix or "",
        )

        # Now we can create the SQL read processor
        self._read_processor = self._sql_processor_class(
            sql_config=self,
            catalog_provider=self._catalog_backend.get_full_catalog_provider(),
            state_writer=StdOutStateWriter(),  # Shouldn't be needed for the read-only processor
            temp_dir=self.cache_dir,
            temp_file_cleanup=self.cleanup,
        )

    def close(self) -> None:
        """Close all database connections and dispose of connection pools.

        This method ensures that all SQLAlchemy engines created by this cache
        and its processors are properly disposed, releasing all database connections.
        This is especially important for file-based databases like DuckDB, which
        lock the database file until all connections are closed.

        This method is idempotent and can be called multiple times safely.

        Raises:
            Exception: If any engine disposal fails, the exception will propagate
                to the caller. This ensures callers are aware of cleanup failures.
        """
        if self._read_processor is not None:
            self._read_processor.sql_config.dispose_engine()

        if self._catalog_backend is not None:
            self._catalog_backend._sql_config.dispose_engine()  # noqa: SLF001

        if self._state_backend is not None:
            self._state_backend._sql_config.dispose_engine()  # noqa: SLF001

        self.dispose_engine()

    def __enter__(self) -> Self:
        """Enter context manager."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Exit context manager and clean up resources."""
        self.close()

    def __del__(self) -> None:
        """Clean up resources when cache is garbage collected."""
        with contextlib.suppress(Exception):
            self.close()

    @property
    def config_hash(self) -> str | None:
        """Return a hash of the cache configuration.

        This is the same as the SQLConfig hash from the superclass.
        """
        return super(SqlConfig, self).config_hash

    def execute_sql(self, sql: str | list[str]) -> None:
        """Execute one or more SQL statements against the cache's SQL backend.

        If multiple SQL statements are given, they are executed in order,
        within the same transaction.

        This method is useful for creating tables, indexes, and other
        schema objects in the cache. It does not return any results and it
        automatically closes the connection after executing all statements.

        This method is not intended for querying data. For that, use the `get_records`
        method - or for a low-level interface, use the `get_sql_engine` method.

        If any of the statements fail, the transaction is canceled and an exception
        is raised. Most databases will rollback the transaction in this case.
        """
        if isinstance(sql, str):
            # Coerce to a list if a single string is given
            sql = [sql]

        with self.processor.get_sql_connection() as connection:
            for sql_statement in sql:
                connection.execute(text(sql_statement))

    @final
    @property
    def processor(self) -> SqlProcessorBase:
        """Return the SQL processor instance."""
        return self._read_processor

    def run_sql_query(
        self,
        sql_query: str,
        *,
        max_records: int | None = None,
    ) -> list[dict[str, Any]]:
        """Run a SQL query against the cache and return results as a list of dictionaries.

        This method is designed for single DML statements like SELECT, SHOW, or DESCRIBE.
        For DDL statements or multiple statements, use the processor directly.

        Args:
            sql_query: The SQL query to execute
            max_records: Maximum number of records to return. If None, returns all records.

        Returns:
            List of dictionaries representing the query results
        """
        # Execute the SQL within a connection context to ensure the connection stays open
        # while we fetch the results
        sql_text = text(sql_query) if isinstance(sql_query, str) else sql_query

        with self.processor.get_sql_connection() as conn:
            try:
                result = conn.execute(sql_text)
            except (
                sqlalchemy_exc.ProgrammingError,
                sqlalchemy_exc.SQLAlchemyError,
            ) as ex:
                msg = f"Error when executing SQL:\n{sql_query}\n{type(ex).__name__}{ex!s}"
                raise RuntimeError(msg) from ex

            # Convert the result to a list of dictionaries while connection is still open
            if result.returns_rows:
                # Get column names
                columns = list(result.keys()) if result.keys() else []

                # Fetch rows efficiently based on limit
                if max_records is not None:
                    rows = result.fetchmany(max_records)
                else:
                    rows = result.fetchall()

                return [dict(zip(columns, row, strict=True)) for row in rows]

            # For non-SELECT queries (INSERT, UPDATE, DELETE, etc.)
            return []

    def get_record_processor(
        self,
        source_name: str,
        catalog_provider: CatalogProvider,
        state_writer: StateWriterBase | None = None,
    ) -> SqlProcessorBase:
        """Return a record processor for the specified source name and catalog.

        We first register the source and its catalog with the catalog manager. Then we create a new
        SQL processor instance with (only) the given input catalog.

        For the state writer, we use a state writer which stores state in an internal SQL table.
        """
        # First register the source and catalog into durable storage. This is necessary to ensure
        # that we can later retrieve the catalog information.
        self.register_source(
            source_name=source_name,
            incoming_source_catalog=catalog_provider.configured_catalog,
            stream_names=set(catalog_provider.stream_names),
        )

        # Next create a new SQL processor instance with the given catalog - and a state writer
        # that writes state to the internal SQL table and associates with the given source name.
        return self._sql_processor_class(
            sql_config=self,
            catalog_provider=catalog_provider,
            state_writer=state_writer or self.get_state_writer(source_name=source_name),
            temp_dir=self.cache_dir,
            temp_file_cleanup=self.cleanup,
        )

    # Read methods:

    def get_records(
        self,
        stream_name: str,
    ) -> CachedDataset:
        """Uses SQLAlchemy to select all rows from the table."""
        return CachedDataset(self, stream_name)

    def get_pandas_dataframe(
        self,
        stream_name: str,
    ) -> pd.DataFrame:
        """Return a Pandas data frame with the stream's data."""
        table_name = self._read_processor.get_sql_table_name(stream_name)
        engine = self.get_sql_engine()
        return pd.read_sql_table(table_name, engine, schema=self.schema_name)

    def get_arrow_dataset(
        self,
        stream_name: str,
        *,
        max_chunk_size: int = DEFAULT_ARROW_MAX_CHUNK_SIZE,
    ) -> ds.Dataset:
        """Return an Arrow Dataset with the stream's data."""
        table_name = self._read_processor.get_sql_table_name(stream_name)
        engine = self.get_sql_engine()

        # Read the table in chunks to handle large tables which does not fits in memory
        pandas_chunks = pd.read_sql_table(
            table_name=table_name,
            con=engine,
            schema=self.schema_name,
            chunksize=max_chunk_size,
        )

        arrow_batches_list = []
        arrow_schema = None

        for pandas_chunk in pandas_chunks:
            if arrow_schema is None:
                # Initialize the schema with the first chunk
                arrow_schema = pa.Schema.from_pandas(pandas_chunk)

            # Convert each pandas chunk to an Arrow Table
            arrow_table = pa.RecordBatch.from_pandas(pandas_chunk, schema=arrow_schema)
            arrow_batches_list.append(arrow_table)

        return ds.dataset(arrow_batches_list)

    @final
    @property
    def streams(self) -> dict[str, CachedDataset]:
        """Return a temporary table name."""
        result = {}
        stream_names = set(self._catalog_backend.stream_names)

        for stream_name in stream_names:
            result[stream_name] = CachedDataset(self, stream_name)

        return result

    @final
    def __len__(self) -> int:
        """Gets the number of streams."""
        return len(self._catalog_backend.stream_names)

    @final
    def __bool__(self) -> bool:
        """Always True.

        This is needed so that caches with zero streams are not falsey (None-like).
        """
        return True

    def get_state_provider(
        self,
        source_name: str,
        *,
        refresh: bool = True,
        destination_name: str | None = None,
    ) -> StateProviderBase:
        """Return a state provider for the specified source name."""
        return self._state_backend.get_state_provider(
            source_name=source_name,
            table_prefix=self.table_prefix or "",
            refresh=refresh,
            destination_name=destination_name,
        )

    def get_state_writer(
        self,
        source_name: str,
        destination_name: str | None = None,
    ) -> StateWriterBase:
        """Return a state writer for the specified source name.

        If syncing to the cache, `destination_name` should be `None`.
        If syncing to a destination, `destination_name` should be the destination name.
        """
        return self._state_backend.get_state_writer(
            source_name=source_name,
            destination_name=destination_name,
        )

    def register_source(
        self,
        source_name: str,
        incoming_source_catalog: ConfiguredAirbyteCatalog,
        stream_names: set[str],
    ) -> None:
        """Register the source name and catalog."""
        self._catalog_backend.register_source(
            source_name=source_name,
            incoming_source_catalog=incoming_source_catalog,
            incoming_stream_names=stream_names,
        )

    def create_source_tables(
        self,
        source: Source,
        streams: Literal["*"] | list[str] | None = None,
    ) -> None:
        """Create tables in the cache for the provided source if they do not exist already.

        Tables are created based upon the Source's catalog.

        Args:
            source: The source to create tables for.
            streams: Stream names to create tables for. If None, use the Source's selected_streams
                or "*" if neither is set. If "*", all available streams will be used.
        """
        if streams is None:
            streams = source.get_selected_streams() or "*"

        catalog_provider = CatalogProvider(source.get_configured_catalog(streams=streams))

        # Register the incoming source catalog
        self.register_source(
            source_name=source.name,
            incoming_source_catalog=catalog_provider.configured_catalog,
            stream_names=set(catalog_provider.stream_names),
        )

        # Ensure schema exists
        self.processor._ensure_schema_exists()  # noqa: SLF001  # Accessing non-public member

        # Create tables for each stream if they don't exist
        for stream_name in catalog_provider.stream_names:
            self.processor._ensure_final_table_exists(  # noqa: SLF001
                stream_name=stream_name,
                create_if_missing=True,
            )

    def __getitem__(self, stream: str) -> CachedDataset:
        """Return a dataset by stream name."""
        return self.streams[stream]

    def __contains__(self, stream: str) -> bool:
        """Return whether a stream is in the cache."""
        return stream in (self._catalog_backend.stream_names)

    def __iter__(  # type: ignore [override]  # Overriding Pydantic model method
        self,
    ) -> Iterator[tuple[str, Any]]:
        """Iterate over the streams in the cache."""
        return ((name, dataset) for name, dataset in self.streams.items())

    def _write_airbyte_message_stream(
        self,
        stdin: IO[str] | AirbyteMessageIterator,
        *,
        catalog_provider: CatalogProvider,
        write_strategy: WriteStrategy,
        state_writer: StateWriterBase | None = None,
        progress_tracker: ProgressTracker,
    ) -> None:
        """Read from the connector and write to the cache."""
        cache_processor = self.get_record_processor(
            source_name=self.name,
            catalog_provider=catalog_provider,
            state_writer=state_writer,
        )
        cache_processor.process_airbyte_messages(
            messages=stdin,
            write_strategy=write_strategy,
            progress_tracker=progress_tracker,
        )
        progress_tracker.log_cache_processing_complete()
Base configuration for a cache.

Caches inherit from the matching SqlConfig class, which provides the SQL config settings and basic connectivity to the SQL database.

The cache is responsible for managing the state of the data synced to the cache, including the stream catalog and stream state. The cache also provides the mechanism to read and write data to the SQL backend specified in the SqlConfig class.
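For orientation, here is a minimal usage sketch. It assumes the DuckDBCache subclass and the source-faker connector are available; the file path and config values are illustrative.

import airbyte as ab
from airbyte.caches import DuckDBCache

# DuckDBCache is one concrete CacheBase subclass; db_path is an arbitrary local file.
cache = DuckDBCache(db_path="./example_cache.duckdb")

# Reading a source into the cache writes records to the SQL backend
# configured by the cache's SqlConfig settings.
source = ab.get_source("source-faker", config={"count": 1_000})
source.select_all_streams()
source.read(cache=cache)

print(list(cache.streams))  # stream names now backed by tables in the cache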
def __init__(self, **data: Any) -> None:
Initialize the cache and backends.
@property
def paired_destination_config(self) -> Any | dict[str, Any]:
Return a dictionary of destination configuration values.
def close(self) -> None:
Close all database connections and dispose of connection pools.
This method ensures that all SQLAlchemy engines created by this cache and its processors are properly disposed, releasing all database connections. This is especially important for file-based databases like DuckDB, which lock the database file until all connections are closed.
This method is idempotent and can be called multiple times safely.
Raises:
- Exception: If any engine disposal fails, the exception will propagate to the caller. This ensures callers are aware of cleanup failures.
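Because close() is wired into __enter__ and __exit__, the usual pattern is to use the cache as a context manager so engines are disposed even if an error occurs; a brief sketch (the file path is illustrative):

from airbyte.caches import DuckDBCache

with DuckDBCache(db_path="./example_cache.duckdb") as cache:
    print(len(cache))  # number of known streams
# On exiting the block, __exit__ calls close(), releasing the DuckDB file lock.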
@property
def config_hash(self) -> str | None:
Return a hash of the cache configuration.
This is the same as the SQLConfig hash from the superclass.
def execute_sql(self, sql: str | list[str]) -> None:
Execute one or more SQL statements against the cache's SQL backend.
If multiple SQL statements are given, they are executed in order, within the same transaction.
This method is useful for creating tables, indexes, and other schema objects in the cache. It does not return any results and it automatically closes the connection after executing all statements.
This method is not intended for querying data. For that, use the get_records method - or for a low-level interface, use the get_sql_engine method.
If any of the statements fail, the transaction is canceled and an exception is raised. Most databases will rollback the transaction in this case.
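For example, a list of statements can be run in one call within a single transaction; the table name and columns below are hypothetical, and cache refers to any CacheBase instance such as the one created in the earlier sketch.

cache.execute_sql(
    [
        "CREATE TABLE IF NOT EXISTS my_notes (id INTEGER, note VARCHAR)",
        "INSERT INTO my_notes VALUES (1, 'hello')",
    ]
)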
@final
@property
def processor(self) -> SqlProcessorBase:
Return the SQL processor instance.
def run_sql_query(self, sql_query: str, *, max_records: int | None = None) -> list[dict[str, Any]]:
Run a SQL query against the cache and return results as a list of dictionaries.
This method is designed for single DML statements like SELECT, SHOW, or DESCRIBE. For DDL statements or multiple statements, use the processor directly.
Arguments:
- sql_query: The SQL query to execute
- max_records: Maximum number of records to return. If None, returns all records.
Returns:
List of dictionaries representing the query results
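A short sketch, assuming a users stream has already been synced into the cache (the stream/table name is illustrative):

rows = cache.run_sql_query(
    "SELECT id, name FROM users ORDER BY id",
    max_records=10,
)
for row in rows:
    print(row["id"], row["name"])  # each row is a plain dict keyed by column name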
def get_record_processor(self, source_name: str, catalog_provider: CatalogProvider, state_writer: StateWriterBase | None = None) -> SqlProcessorBase:
Return a record processor for the specified source name and catalog.
We first register the source and its catalog with the catalog manager. Then we create a new SQL processor instance with (only) the given input catalog.
For the state writer, we use a state writer which stores state in an internal SQL table.
def get_records(self, stream_name: str) -> CachedDataset:
Uses SQLAlchemy to select all rows from the table.
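The returned CachedDataset is evaluated against the cache's SQL backend; in a typical session it is simply iterated, for example (stream name assumed):

dataset = cache.get_records("users")

# Iterating yields one record (a mapping of column name to value) at a time.
for record in dataset:
    print(record)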
def get_pandas_dataframe(self, stream_name: str) -> pd.DataFrame:
Return a Pandas data frame with the stream's data.
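For example, assuming a users stream exists in the cache:

df = cache.get_pandas_dataframe("users")
print(df.shape)
print(df.head())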
def get_arrow_dataset(self, stream_name: str, *, max_chunk_size: int = DEFAULT_ARROW_MAX_CHUNK_SIZE) -> ds.Dataset:
Return an Arrow Dataset with the stream's data.
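The result is a pyarrow.dataset.Dataset built from record batches, so it can be scanned lazily or materialized; a sketch (stream name assumed):

arrow_ds = cache.get_arrow_dataset("users", max_chunk_size=50_000)

print(arrow_ds.count_rows())   # scan without materializing everything
table = arrow_ds.to_table()    # materialize into a pyarrow.Table
df = table.to_pandas()         # or hand off to pandas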
@final
@property
def streams(self) -> dict[str, CachedDataset]:
Return a mapping of stream names to CachedDataset objects.
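Together with __len__, __contains__, __getitem__, and __iter__ (shown in the class source above), this property gives the cache a read-only, mapping-like interface over its streams; for example (stream name assumed):

print(len(cache))        # number of streams in the cache
print("users" in cache)  # membership test by stream name

if "users" in cache:
    users = cache["users"]  # same as cache.streams["users"]

for name, dataset in cache:  # iterate (stream_name, CachedDataset) pairs
    print(name, type(dataset).__name__)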
def get_state_provider(self, source_name: str, *, refresh: bool = True, destination_name: str | None = None) -> StateProviderBase:
Return a state provider for the specified source name.
def get_state_writer(self, source_name: str, destination_name: str | None = None) -> StateWriterBase:
Return a state writer for the specified source name.
If syncing to the cache, destination_name should be None. If syncing to a destination, destination_name should be the destination name.
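A sketch of how the two state interfaces pair up for cache syncs versus destination syncs; the source and destination names are placeholders.

# State tracked for records synced into the cache itself:
cache_state_reader = cache.get_state_provider(source_name="source-faker")
cache_state_writer = cache.get_state_writer(source_name="source-faker")

# State tracked separately for records forwarded to a destination:
dest_state_reader = cache.get_state_provider(
    source_name="source-faker",
    destination_name="destination-bigquery",
)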
def register_source(self, source_name: str, incoming_source_catalog: ConfiguredAirbyteCatalog, stream_names: set[str]) -> None:
Register the source name and catalog.
def create_source_tables(self, source: Source, streams: Literal["*"] | list[str] | None = None) -> None:
Create tables in the cache for the provided source if they do not exist already.
Tables are created based upon the Source's catalog.
Arguments:
- source: The source to create tables for.
- streams: Stream names to create tables for. If None, use the Source's selected_streams or "*" if neither is set. If "*", all available streams will be used.
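For example, tables can be created up front, before any records are read, so that downstream SQL objects can be defined against them; the connector name and stream list are illustrative, and cache is any CacheBase instance.

import airbyte as ab

source = ab.get_source("source-faker", config={"count": 1_000})
source.select_streams(["users", "products"])

# Create empty, correctly-typed tables for the selected streams.
cache.create_source_tables(source)

# Or create tables for every stream the source offers:
cache.create_source_tables(source, streams="*")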
model_config: Configuration for the model; should be a dictionary conforming to pydantic.config.ConfigDict.
def init_private_attributes(self: BaseModel, context: Any, /) -> None:
    """This function is meant to behave like a BaseModel method to initialise private attributes.

    It takes context as an argument since that's what pydantic-core passes when calling it.

    Args:
        self: The BaseModel instance.
        context: The context.
    """
    if getattr(self, '__pydantic_private__', None) is None:
        pydantic_private = {}
        for name, private_attr in self.__private_attributes__.items():
            default = private_attr.get_default()
            if default is not PydanticUndefined:
                pydantic_private[name] = default
        object_setattr(self, '__pydantic_private__', pydantic_private)
This function is meant to behave like a BaseModel method to initialise private attributes.
It takes context as an argument since that's what pydantic-core passes when calling it.
Arguments:
- self: The BaseModel instance.
- context: The context.
Inherited Members
- airbyte.shared.sql_processor.SqlConfig
- schema_name
- table_prefix
- get_sql_alchemy_url
- get_database_name
- get_create_table_extra_clauses
- get_sql_alchemy_connect_args
- get_sql_engine
- dispose_engine
- get_vendor_client
- pydantic.main.BaseModel
- model_fields
- model_computed_fields
- model_extra
- model_fields_set
- model_construct
- model_copy
- model_dump
- model_dump_json
- model_json_schema
- model_parametrized_name
- model_rebuild
- model_validate
- model_validate_json
- model_validate_strings
- dict
- json
- parse_obj
- parse_raw
- parse_file
- from_orm
- construct
- copy
- schema
- schema_json
- validate
- update_forward_refs
- airbyte._writers.base.AirbyteWriterInterface
- name