airbyte_cdk.sources.declarative.declarative_stream

View Source
  1#
  2# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
  3#
  4import logging
  5from dataclasses import InitVar, dataclass, field
  6from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
  7
  8from airbyte_cdk.models import SyncMode
  9from airbyte_cdk.sources.declarative.incremental import (
 10    GlobalSubstreamCursor,
 11    PerPartitionCursor,
 12    PerPartitionWithGlobalCursor,
 13)
 14from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 15from airbyte_cdk.sources.declarative.migrations.state_migration import StateMigration
 16from airbyte_cdk.sources.declarative.retrievers import SimpleRetriever
 17from airbyte_cdk.sources.declarative.retrievers.async_retriever import AsyncRetriever
 18from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever
 19from airbyte_cdk.sources.declarative.schema import DefaultSchemaLoader
 20from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader
 21from airbyte_cdk.sources.streams.checkpoint import (
 22    CheckpointMode,
 23    CheckpointReader,
 24    Cursor,
 25    CursorBasedCheckpointReader,
 26)
 27from airbyte_cdk.sources.streams.core import Stream
 28from airbyte_cdk.sources.types import Config, StreamSlice
 29
 30
 31@dataclass
 32class DeclarativeStream(Stream):
 33    """
 34    DeclarativeStream is a Stream that delegates most of its logic to its schema_load and retriever
 35
 36    Attributes:
 37        name (str): stream name
 38        primary_key (Optional[Union[str, List[str], List[List[str]]]]): the primary key of the stream
 39        schema_loader (SchemaLoader): The schema loader
 40        retriever (Retriever): The retriever
 41        config (Config): The user-provided configuration as specified by the source's spec
 42        stream_cursor_field (Optional[Union[InterpolatedString, str]]): The cursor field
 43        stream. Transformations are applied in the order in which they are defined.
 44    """
 45
 46    retriever: Retriever
 47    config: Config
 48    parameters: InitVar[Mapping[str, Any]]
 49    name: str
 50    primary_key: Optional[Union[str, List[str], List[List[str]]]]
 51    state_migrations: List[StateMigration] = field(repr=True, default_factory=list)
 52    schema_loader: Optional[SchemaLoader] = None
 53    _name: str = field(init=False, repr=False, default="")
 54    _primary_key: str = field(init=False, repr=False, default="")
 55    stream_cursor_field: Optional[Union[InterpolatedString, str]] = None
 56
 57    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 58        self._stream_cursor_field = (
 59            InterpolatedString.create(self.stream_cursor_field, parameters=parameters)
 60            if isinstance(self.stream_cursor_field, str)
 61            else self.stream_cursor_field
 62        )
 63        self._schema_loader = (
 64            self.schema_loader
 65            if self.schema_loader
 66            else DefaultSchemaLoader(config=self.config, parameters=parameters)
 67        )
 68
 69    @property  # type: ignore
 70    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
 71        return self._primary_key
 72
 73    @primary_key.setter
 74    def primary_key(self, value: str) -> None:
 75        if not isinstance(value, property):
 76            self._primary_key = value
 77
 78    @property
 79    def exit_on_rate_limit(self) -> bool:
 80        if isinstance(self.retriever, AsyncRetriever):
 81            return self.retriever.exit_on_rate_limit
 82
 83        return self.retriever.requester.exit_on_rate_limit  # type: ignore # abstract Retriever class has not requester attribute
 84
 85    @exit_on_rate_limit.setter
 86    def exit_on_rate_limit(self, value: bool) -> None:
 87        if isinstance(self.retriever, AsyncRetriever):
 88            self.retriever.exit_on_rate_limit = value
 89        else:
 90            self.retriever.requester.exit_on_rate_limit = value  # type: ignore[attr-defined]
 91
 92    @property  # type: ignore
 93    def name(self) -> str:
 94        """
 95        :return: Stream name. By default this is the implementing class name, but it can be overridden as needed.
 96        """
 97        return self._name
 98
 99    @name.setter
100    def name(self, value: str) -> None:
101        if not isinstance(value, property):
102            self._name = value
103
104    @property
105    def state(self) -> MutableMapping[str, Any]:
106        return self.retriever.state  # type: ignore
107
108    @state.setter
109    def state(self, value: MutableMapping[str, Any]) -> None:
110        """State setter, accept state serialized by state getter."""
111        state: Mapping[str, Any] = value
112        if self.state_migrations:
113            for migration in self.state_migrations:
114                if migration.should_migrate(state):
115                    state = migration.migrate(state)
116        self.retriever.state = state
117
118    def get_updated_state(
119        self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
120    ) -> MutableMapping[str, Any]:
121        return self.state
122
123    @property
124    def cursor_field(self) -> Union[str, List[str]]:
125        """
126        Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field.
127        :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
128        """
129        cursor = self._stream_cursor_field.eval(self.config)  # type: ignore # _stream_cursor_field is always cast to interpolated string
130        return cursor if cursor else []
131
132    @property
133    def is_resumable(self) -> bool:
134        # Declarative sources always implement state getter/setter, but whether it supports checkpointing is based on
135        # if the retriever has a cursor defined.
136        return self.retriever.cursor is not None if hasattr(self.retriever, "cursor") else False
137
138    def read_records(
139        self,
140        sync_mode: SyncMode,
141        cursor_field: Optional[List[str]] = None,
142        stream_slice: Optional[Mapping[str, Any]] = None,
143        stream_state: Optional[Mapping[str, Any]] = None,
144    ) -> Iterable[Mapping[str, Any]]:
145        """
146        :param: stream_state We knowingly avoid using stream_state as we want cursors to manage their own state.
147        """
148        if stream_slice is None or (
149            not isinstance(stream_slice, StreamSlice) and stream_slice == {}
150        ):
151            # As the parameter is Optional, many would just call `read_records(sync_mode)` during testing without specifying the field
152            # As part of the declarative model without custom components, this should never happen as the CDK would wire up a
153            # SinglePartitionRouter that would create this StreamSlice properly
154            # As part of the declarative model with custom components, a user that would return a `None` slice would now have the default
155            # empty slice which seems to make sense.
156            stream_slice = StreamSlice(partition={}, cursor_slice={})
157        if not isinstance(stream_slice, StreamSlice):
158            raise ValueError(
159                f"DeclarativeStream does not support stream_slices that are not StreamSlice. Got {stream_slice}"
160            )
161        yield from self.retriever.read_records(self.get_json_schema(), stream_slice)  # type: ignore # records are of the correct type
162
163    def get_json_schema(self) -> Mapping[str, Any]:  # type: ignore
164        """
165        :return: A dict of the JSON schema representing this stream.
166
167        The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property.
168        Override as needed.
169        """
170        return self._schema_loader.get_json_schema()
171
172    def stream_slices(
173        self,
174        *,
175        sync_mode: SyncMode,
176        cursor_field: Optional[List[str]] = None,
177        stream_state: Optional[Mapping[str, Any]] = None,
178    ) -> Iterable[Optional[StreamSlice]]:
179        """
180        Override to define the slices for this stream. See the stream slicing section of the docs for more information.
181
182        :param sync_mode:
183        :param cursor_field:
184        :param stream_state: we knowingly avoid using stream_state as we want cursors to manage their own state
185        :return:
186        """
187        return self.retriever.stream_slices()
188
189    @property
190    def state_checkpoint_interval(self) -> Optional[int]:
191        """
192        We explicitly disable checkpointing here. There are a couple reasons for that and not all are documented here but:
193        * In the case where records are not ordered, the granularity of what is ordered is the slice. Therefore, we will only update the
194            cursor value once at the end of every slice.
195        * Updating the state once every record would generate issues for data feed stop conditions or semi-incremental syncs where the
196            important state is the one at the beginning of the slice
197        """
198        return None
199
200    def get_cursor(self) -> Optional[Cursor]:
201        if self.retriever and isinstance(self.retriever, SimpleRetriever):
202            return self.retriever.cursor
203        return None
204
205    def _get_checkpoint_reader(
206        self,
207        logger: logging.Logger,
208        cursor_field: Optional[List[str]],
209        sync_mode: SyncMode,
210        stream_state: MutableMapping[str, Any],
211    ) -> CheckpointReader:
212        """
213        This method is overridden to prevent issues with stream slice classification for incremental streams that have parent streams.
214
215        The classification logic, when used with `itertools.tee`, creates a copy of the stream slices. When `stream_slices` is called
216        the second time, the parent records generated during the classification phase are lost. This occurs because `itertools.tee`
217        only buffers the results, meaning the logic in `simple_retriever` that observes and updates the cursor isn't executed again.
218
219        By overriding this method, we ensure that the stream slices are processed correctly and parent records are not lost,
220        allowing the cursor to function as expected.
221        """
222        mappings_or_slices = self.stream_slices(
223            cursor_field=cursor_field,
224            sync_mode=sync_mode,  # todo: change this interface to no longer rely on sync_mode for behavior
225            stream_state=stream_state,
226        )
227
228        cursor = self.get_cursor()
229        checkpoint_mode = self._checkpoint_mode
230
231        if isinstance(
232            cursor, (GlobalSubstreamCursor, PerPartitionCursor, PerPartitionWithGlobalCursor)
233        ):
234            self.has_multiple_slices = True
235            return CursorBasedCheckpointReader(
236                stream_slices=mappings_or_slices,
237                cursor=cursor,
238                read_state_from_cursor=checkpoint_mode == CheckpointMode.RESUMABLE_FULL_REFRESH,
239            )
240
241        return super()._get_checkpoint_reader(logger, cursor_field, sync_mode, stream_state)
airbyte_cdk.sources.declarative.declarative_stream

Attributes:

Returns

Returns

Parameters

Returns

Returns

Returns

Parameters

Returns

Parameters

Returns

Inherited Members