airbyte_cdk.legacy.sources.declarative.declarative_stream
```python
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import logging
from dataclasses import InitVar, dataclass, field
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union

from typing_extensions import deprecated

from airbyte_cdk.legacy.sources.declarative.incremental import (
    GlobalSubstreamCursor,
    PerPartitionCursor,
    PerPartitionWithGlobalCursor,
)
from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
from airbyte_cdk.sources.declarative.migrations.state_migration import StateMigration
from airbyte_cdk.sources.declarative.retrievers.async_retriever import AsyncRetriever
from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever
from airbyte_cdk.sources.declarative.schema import DefaultSchemaLoader
from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader
from airbyte_cdk.sources.streams.checkpoint import (
    CheckpointMode,
    CheckpointReader,
    Cursor,
    CursorBasedCheckpointReader,
)
from airbyte_cdk.sources.streams.core import Stream
from airbyte_cdk.sources.types import Config, StreamSlice


@deprecated("DeclarativeStream has been deprecated in favor of the concurrent DefaultStream")
@dataclass
class DeclarativeStream(Stream):
    """
    DeclarativeStream is a Stream that delegates most of its logic to its schema_loader and retriever

    Attributes:
        name (str): stream name
        primary_key (Optional[Union[str, List[str], List[List[str]]]]): the primary key of the stream
        schema_loader (SchemaLoader): The schema loader
        retriever (Retriever): The retriever
        config (Config): The user-provided configuration as specified by the source's spec
        stream_cursor_field (Optional[Union[InterpolatedString, str]]): The cursor field of the stream
    """

    retriever: Retriever
    config: Config
    parameters: InitVar[Mapping[str, Any]]
    name: str
    primary_key: Optional[Union[str, List[str], List[List[str]]]]
    state_migrations: List[StateMigration] = field(repr=True, default_factory=list)
    schema_loader: Optional[SchemaLoader] = None
    _name: str = field(init=False, repr=False, default="")
    _primary_key: str = field(init=False, repr=False, default="")
    stream_cursor_field: Optional[Union[InterpolatedString, str]] = None

    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
        self._stream_cursor_field = (
            InterpolatedString.create(self.stream_cursor_field, parameters=parameters)
            if isinstance(self.stream_cursor_field, str)
            else self.stream_cursor_field
        )
        self._schema_loader = (
            self.schema_loader
            if self.schema_loader
            else DefaultSchemaLoader(config=self.config, parameters=parameters)
        )

    @property  # type: ignore
    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
        return self._primary_key

    @primary_key.setter
    def primary_key(self, value: str) -> None:
        if not isinstance(value, property):
            self._primary_key = value

    @property
    def exit_on_rate_limit(self) -> bool:
        if isinstance(self.retriever, AsyncRetriever):
            return self.retriever.exit_on_rate_limit

        return self.retriever.requester.exit_on_rate_limit  # type: ignore # the abstract Retriever class has no requester attribute

    @exit_on_rate_limit.setter
    def exit_on_rate_limit(self, value: bool) -> None:
        if isinstance(self.retriever, AsyncRetriever):
            self.retriever.exit_on_rate_limit = value
        else:
            self.retriever.requester.exit_on_rate_limit = value  # type: ignore[attr-defined]

    @property  # type: ignore
    def name(self) -> str:
        """
        :return: Stream name. By default this is the implementing class name, but it can be overridden as needed.
        """
        return self._name

    @name.setter
    def name(self, value: str) -> None:
        if not isinstance(value, property):
            self._name = value

    @property
    def state(self) -> MutableMapping[str, Any]:
        return self.retriever.state  # type: ignore

    @state.setter
    def state(self, value: MutableMapping[str, Any]) -> None:
        """State setter, accepts state serialized by the state getter."""
        state: Mapping[str, Any] = value
        if self.state_migrations:
            for migration in self.state_migrations:
                if migration.should_migrate(state):
                    state = migration.migrate(state)
        self.retriever.state = state

    def get_updated_state(
        self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
    ) -> MutableMapping[str, Any]:
        return self.state

    @property
    def cursor_field(self) -> Union[str, List[str]]:
        """
        Override to return the default cursor field used by this stream, e.g. an API entity might always use created_at as the cursor field.
        :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
        """
        cursor = self._stream_cursor_field.eval(self.config)  # type: ignore # _stream_cursor_field is always cast to an interpolated string
        return cursor if cursor else []

    @property
    def is_resumable(self) -> bool:
        # Declarative sources always implement the state getter/setter, but whether a stream supports
        # checkpointing depends on whether the retriever has a cursor defined.
        return self.retriever.cursor is not None if hasattr(self.retriever, "cursor") else False

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: Optional[List[str]] = None,
        stream_slice: Optional[Mapping[str, Any]] = None,
        stream_state: Optional[Mapping[str, Any]] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """
        :param: stream_state We knowingly avoid using stream_state as we want cursors to manage their own state.
        """
        if stream_slice is None or (
            not isinstance(stream_slice, StreamSlice) and stream_slice == {}
        ):
            # As the parameter is Optional, many would just call `read_records(sync_mode)` during testing without specifying the field
            # As part of the declarative model without custom components, this should never happen as the CDK would wire up a
            # SinglePartitionRouter that would create this StreamSlice properly
            # As part of the declarative model with custom components, a user that would return a `None` slice would now have the default
            # empty slice which seems to make sense.
            stream_slice = StreamSlice(partition={}, cursor_slice={})
        if not isinstance(stream_slice, StreamSlice):
            raise ValueError(
                f"DeclarativeStream does not support stream_slices that are not StreamSlice. Got {stream_slice}"
            )
        yield from self.retriever.read_records(self.get_json_schema(), stream_slice)  # type: ignore # records are of the correct type

    def get_json_schema(self) -> Mapping[str, Any]:  # type: ignore
        """
        :return: A dict of the JSON schema representing this stream.

        The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property.
        Override as needed.
        """
        return self._schema_loader.get_json_schema()

    def stream_slices(
        self,
        *,
        sync_mode: SyncMode,
        cursor_field: Optional[List[str]] = None,
        stream_state: Optional[Mapping[str, Any]] = None,
    ) -> Iterable[Optional[StreamSlice]]:
        """
        Override to define the slices for this stream. See the stream slicing section of the docs for more information.

        :param sync_mode:
        :param cursor_field:
        :param stream_state: we knowingly avoid using stream_state as we want cursors to manage their own state
        :return:
        """
        return self.retriever.stream_slices()

    @property
    def state_checkpoint_interval(self) -> Optional[int]:
        """
        We explicitly disable checkpointing here. There are a couple of reasons for that; not all of them are documented here, but:
        * In the case where records are not ordered, the granularity of what is ordered is the slice. Therefore, we will
          only update the cursor value once at the end of every slice.
        * Updating the state after every record would generate issues for data feed stop conditions or semi-incremental
          syncs where the important state is the one at the beginning of the slice.
        """
        return None

    def get_cursor(self) -> Optional[Cursor]:
        return None

    def _get_checkpoint_reader(
        self,
        logger: logging.Logger,
        cursor_field: Optional[List[str]],
        sync_mode: SyncMode,
        stream_state: MutableMapping[str, Any],
    ) -> CheckpointReader:
        """
        This method is overridden to prevent issues with stream slice classification for incremental streams that have parent streams.

        The classification logic, when used with `itertools.tee`, creates a copy of the stream slices. When `stream_slices` is called
        the second time, the parent records generated during the classification phase are lost. This occurs because `itertools.tee`
        only buffers the results, meaning the logic in `simple_retriever` that observes and updates the cursor isn't executed again.

        By overriding this method, we ensure that the stream slices are processed correctly and parent records are not lost,
        allowing the cursor to function as expected.
        """
        mappings_or_slices = self.stream_slices(
            cursor_field=cursor_field,
            sync_mode=sync_mode,  # todo: change this interface to no longer rely on sync_mode for behavior
            stream_state=stream_state,
        )

        cursor = self.get_cursor()
        checkpoint_mode = self._checkpoint_mode

        if isinstance(
            cursor, (GlobalSubstreamCursor, PerPartitionCursor, PerPartitionWithGlobalCursor)
        ):
            self.has_multiple_slices = True
            return CursorBasedCheckpointReader(
                stream_slices=mappings_or_slices,
                cursor=cursor,
                read_state_from_cursor=checkpoint_mode == CheckpointMode.RESUMABLE_FULL_REFRESH,
            )

        return super()._get_checkpoint_reader(logger, cursor_field, sync_mode, stream_state)
```
DeclarativeStream is a Stream that delegates most of its logic to its schema_loader and retriever.
Attributes:
- name (str): stream name
- primary_key (Optional[Union[str, List[str], List[List[str]]]]): the primary key of the stream
- schema_loader (SchemaLoader): The schema loader
- retriever (Retriever): The retriever
- config (Config): The user-provided configuration as specified by the source's spec
- stream_cursor_field (Optional[Union[InterpolatedString, str]]): The cursor field of the stream
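For illustration, here is a minimal sketch of constructing the stream directly. In practice the declarative framework builds it from a YAML manifest; `my_retriever` is a hypothetical, fully configured `Retriever` instance:

```python
# Hedged sketch: `my_retriever` is assumed to be a configured Retriever
# (e.g. a SimpleRetriever); real connectors wire this up from the manifest.
stream = DeclarativeStream(
    retriever=my_retriever,            # assumption: any concrete Retriever
    config={"api_key": "..."},         # user-provided config per the source's spec
    parameters={},                     # InitVar consumed by __post_init__ for interpolation
    name="customers",
    primary_key="id",
    stream_cursor_field="updated_at",  # may also be an interpolated string
)
```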
```python
@property  # type: ignore
def name(self) -> str:
    """
    :return: Stream name. By default this is the implementing class name, but it can be overridden as needed.
    """
    return self._name
```
Returns
Stream name. By default this is the implementing class name, but it can be overridden as needed.
```python
@property  # type: ignore
def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
    return self._primary_key
```
Returns
string if single primary key, list of strings if composite primary key, list of list of strings if composite primary key consisting of nested fields. If the stream has no primary keys, return None.
```python
@property
def exit_on_rate_limit(self) -> bool:
    if isinstance(self.retriever, AsyncRetriever):
        return self.retriever.exit_on_rate_limit

    return self.retriever.requester.exit_on_rate_limit  # type: ignore # the abstract Retriever class has no requester attribute
```
Exit-on-rate-limit getter; returns a bool. False means the stream will retry endlessly when rate limited.
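As a quick illustration (a sketch, assuming a requester-based retriever), the property simply proxies to the retriever or its requester:

```python
# Sketch: assignment is forwarded to retriever.requester (or to an
# AsyncRetriever directly), per the getter/setter shown above.
stream.exit_on_rate_limit = True   # fail fast instead of retrying endlessly on rate limits
assert stream.exit_on_rate_limit is True
```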
```python
@property
def state(self) -> MutableMapping[str, Any]:
    return self.retriever.state  # type: ignore
```
State getter/setter. The setter accepts state serialized by the state getter and applies any configured state migrations before handing the state to the retriever.
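A hedged sketch of a `StateMigration` that the setter above would apply; the class and key names are hypothetical:

```python
from typing import Any, Mapping

from airbyte_cdk.sources.declarative.migrations.state_migration import StateMigration


class RenameCursorKeyMigration(StateMigration):
    """Hypothetical migration: renames a legacy state key before the retriever sees it."""

    def should_migrate(self, stream_state: Mapping[str, Any]) -> bool:
        return "updated" in stream_state

    def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
        return {"updated_at": stream_state["updated"]}
```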
```python
def get_updated_state(
    self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
) -> MutableMapping[str, Any]:
    return self.state
```
DEPRECATED. Please use explicit state property instead, see IncrementalMixin docs.
Override to extract state from the latest record. Needed to implement incremental sync.
Inspects the latest record extracted from the data source and the current state object and return an updated state object.
For example: if the state object is based on created_at timestamp, and the current state is {'created_at': 10}, and the latest_record is {'name': 'octavia', 'created_at': 20 } then this method would return {'created_at': 20} to indicate state should be updated to this object.
Parameters
- current_stream_state: The stream's current state object
- latest_record: The latest record extracted from the stream
Returns
An updated state object
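Note that in this class the override simply returns the cursor-managed state, so the latest record does not influence the result; a sketch:

```python
# Sketch: the retriever's cursor already owns state management, so the
# latest record passed in here is effectively ignored.
updated = stream.get_updated_state(current_stream_state={}, latest_record={"created_at": 20})
assert updated == stream.state
```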
```python
@property
def cursor_field(self) -> Union[str, List[str]]:
    """
    Override to return the default cursor field used by this stream, e.g. an API entity might always use created_at as the cursor field.
    :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
    """
    cursor = self._stream_cursor_field.eval(self.config)  # type: ignore # _stream_cursor_field is always cast to an interpolated string
    return cursor if cursor else []
```
Override to return the default cursor field used by this stream, e.g. an API entity might always use created_at as the cursor field.
Returns
The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
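Because `stream_cursor_field` may be an interpolated string, the evaluated value can come from config; a sketch assuming a hypothetical `cursor_field` config key:

```python
# Sketch: with stream_cursor_field="{{ config['cursor_field'] }}" and
# config={"cursor_field": "updated_at"}, the property evaluates against config.
print(stream.cursor_field)  # -> "updated_at"; an unset cursor yields []
```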
```python
@property
def is_resumable(self) -> bool:
    # Declarative sources always implement the state getter/setter, but whether a stream supports
    # checkpointing depends on whether the retriever has a cursor defined.
    return self.retriever.cursor is not None if hasattr(self.retriever, "cursor") else False
```
Returns
True if this stream allows the checkpointing of sync progress and can resume from it on subsequent attempts. This differs from supports_incremental because certain kinds of streams like those supporting resumable full refresh can checkpoint progress in between attempts for improved fault tolerance. However, they will start from the beginning on the next sync job.
```python
def read_records(
    self,
    sync_mode: SyncMode,
    cursor_field: Optional[List[str]] = None,
    stream_slice: Optional[Mapping[str, Any]] = None,
    stream_state: Optional[Mapping[str, Any]] = None,
) -> Iterable[Mapping[str, Any]]:
    """
    :param: stream_state We knowingly avoid using stream_state as we want cursors to manage their own state.
    """
    if stream_slice is None or (
        not isinstance(stream_slice, StreamSlice) and stream_slice == {}
    ):
        # As the parameter is Optional, many would just call `read_records(sync_mode)` during testing without specifying the field
        # As part of the declarative model without custom components, this should never happen as the CDK would wire up a
        # SinglePartitionRouter that would create this StreamSlice properly
        # As part of the declarative model with custom components, a user that would return a `None` slice would now have the default
        # empty slice which seems to make sense.
        stream_slice = StreamSlice(partition={}, cursor_slice={})
    if not isinstance(stream_slice, StreamSlice):
        raise ValueError(
            f"DeclarativeStream does not support stream_slices that are not StreamSlice. Got {stream_slice}"
        )
    yield from self.retriever.read_records(self.get_json_schema(), stream_slice)  # type: ignore # records are of the correct type
```
Parameters
- stream_state: We knowingly avoid using stream_state, as we want cursors to manage their own state.
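A sketch of calling the method directly, as one might in a test; the guard above normalizes a missing or empty slice to exactly this empty `StreamSlice`:

```python
from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.types import StreamSlice

# Sketch: an explicit empty slice; passing stream_slice=None or {} is
# normalized to the same value by the guard above.
empty_slice = StreamSlice(partition={}, cursor_slice={})
for record in stream.read_records(SyncMode.full_refresh, stream_slice=empty_slice):
    print(record)
```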
```python
def get_json_schema(self) -> Mapping[str, Any]:  # type: ignore
    """
    :return: A dict of the JSON schema representing this stream.

    The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property.
    Override as needed.
    """
    return self._schema_loader.get_json_schema()
```
Returns
A dict of the JSON schema representing this stream.
The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property. Override as needed.
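For example (a sketch, assuming the default loader resolves a `schemas/<stream name>.json` file bundled with the connector):

```python
# Sketch: the schema is whatever the configured SchemaLoader returns; with
# DefaultSchemaLoader it is typically loaded from e.g. schemas/customers.json.
schema = stream.get_json_schema()
assert isinstance(schema, dict)
```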
```python
def stream_slices(
    self,
    *,
    sync_mode: SyncMode,
    cursor_field: Optional[List[str]] = None,
    stream_state: Optional[Mapping[str, Any]] = None,
) -> Iterable[Optional[StreamSlice]]:
    """
    Override to define the slices for this stream. See the stream slicing section of the docs for more information.

    :param sync_mode:
    :param cursor_field:
    :param stream_state: we knowingly avoid using stream_state as we want cursors to manage their own state
    :return:
    """
    return self.retriever.stream_slices()
```
Override to define the slices for this stream. See the stream slicing section of the docs for more information.
Parameters
- sync_mode: unused here; slicing is fully delegated to the retriever
- cursor_field: unused here; cursors manage their own cursor field
- stream_state: we knowingly avoid using stream_state, as we want cursors to manage their own state
Returns
The stream slices produced by the retriever.
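Putting slicing and reading together, continuing the construction sketch above, a typical read loop might look like this:

```python
from airbyte_cdk.models import SyncMode

# Sketch: iterate the retriever-produced slices and read each one.
for stream_slice in stream.stream_slices(sync_mode=SyncMode.incremental):
    for record in stream.read_records(SyncMode.incremental, stream_slice=stream_slice):
        process(record)  # hypothetical downstream handler
```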
```python
@property
def state_checkpoint_interval(self) -> Optional[int]:
    """
    We explicitly disable checkpointing here. There are a couple of reasons for that; not all of them are documented here, but:
    * In the case where records are not ordered, the granularity of what is ordered is the slice. Therefore, we will
      only update the cursor value once at the end of every slice.
    * Updating the state after every record would generate issues for data feed stop conditions or semi-incremental
      syncs where the important state is the one at the beginning of the slice.
    """
    return None
```
We explicitly disable checkpointing here. There are a couple of reasons for that; not all of them are documented here, but:
- In the case where records are not ordered, the granularity of what is ordered is the slice. Therefore, we will only update the cursor value once at the end of every slice.
- Updating the state after every record would generate issues for data feed stop conditions or semi-incremental syncs where the important state is the one at the beginning of the slice.
A Cursor is an interface that a stream can implement to manage how its internal state is read and updated while reading records. Historically, Python connectors had no concept of a cursor to manage state. Python streams need to define a cursor implementation and override this method to manage state through a Cursor.
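For a Python (non-declarative) stream, that means something like the following hedged sketch, where `MyCursor` is a hypothetical `Cursor` implementation and the stream's other abstract members are omitted for brevity:

```python
from typing import Optional

from airbyte_cdk.sources.streams.checkpoint import Cursor
from airbyte_cdk.sources.streams.core import Stream


class MyPythonStream(Stream):
    """Hypothetical stream that manages state through a custom Cursor."""
    # (abstract members such as read_records and primary_key omitted for brevity)

    def get_cursor(self) -> Optional[Cursor]:
        return MyCursor()  # hypothetical Cursor implementation
```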