airbyte_cdk.sources.declarative.incremental
#
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#

from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import (
    ConcurrentCursorFactory,
    ConcurrentPerPartitionCursor,
)
from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
    GlobalSubstreamCursor,
)
from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import (
    CursorFactory,
    PerPartitionCursor,
)
from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
    PerPartitionWithGlobalCursor,
)
from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor import (
    ChildPartitionResumableFullRefreshCursor,
    ResumableFullRefreshCursor,
)

__all__ = [
    "CursorFactory",
    "ConcurrentCursorFactory",
    "ConcurrentPerPartitionCursor",
    "DatetimeBasedCursor",
    "DeclarativeCursor",
    "GlobalSubstreamCursor",
    "PerPartitionCursor",
    "PerPartitionWithGlobalCursor",
    "ResumableFullRefreshCursor",
    "ChildPartitionResumableFullRefreshCursor",
]
class CursorFactory:
    def __init__(self, create_function: Callable[[], DeclarativeCursor]):
        self._create_function = create_function

    def create(self) -> DeclarativeCursor:
        return self._create_function()
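For illustration, the factory simply defers to a zero-argument closure, so every call to create() yields a fresh, independent cursor for a new partition. In this sketch, make_datetime_cursor is a hypothetical helper that builds a configured DeclarativeCursor:

# A minimal sketch, assuming make_datetime_cursor (hypothetical) returns a
# configured DeclarativeCursor; any zero-argument callable works.
factory = CursorFactory(create_function=lambda: make_datetime_cursor())

cursor_a = factory.create()
cursor_b = factory.create()
assert cursor_a is not cursor_b  # each partition gets independent state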
class ConcurrentCursorFactory:
    def __init__(self, create_function: Callable[..., ConcurrentCursor]):
        self._create_function = create_function

    def create(
        self, stream_state: Mapping[str, Any], runtime_lookback_window: Optional[timedelta]
    ) -> ConcurrentCursor:
        return self._create_function(
            stream_state=stream_state, runtime_lookback_window=runtime_lookback_window
        )
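A hedged usage sketch: the factory wraps a callable that accepts stream_state and runtime_lookback_window keyword arguments. Here build_concurrent_cursor is hypothetical and stands in for the closure a stream builder would supply:

from datetime import timedelta
from functools import partial

# build_concurrent_cursor (hypothetical) must accept stream_state and
# runtime_lookback_window keyword arguments and return a ConcurrentCursor.
factory = ConcurrentCursorFactory(
    create_function=partial(build_concurrent_cursor, stream_name="orders")
)
cursor = factory.create(
    stream_state={"updated_at": "2024-01-01T00:00:00Z"},
    runtime_lookback_window=timedelta(hours=1),
)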
class ConcurrentPerPartitionCursor(Cursor):
    """
    Manages state per partition when a stream has many partitions, preventing data loss or duplication.

    Attributes:
        DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 25,000).

    - **Partition Limitation Logic**
      Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. The oldest partitions are removed when the limit is reached.

    - **Global Cursor Fallback**
      New partitions are initialized from the global state so that state still progresses for deleted or newly created partitions. Records older than the global cursor that land in such partitions after the initial sync will be skipped.

    ConcurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}.
    """

    DEFAULT_MAX_PARTITIONS_NUMBER = 25_000
    SWITCH_TO_GLOBAL_LIMIT = 10_000
    _NO_STATE: Mapping[str, Any] = {}
    _NO_CURSOR_STATE: Mapping[str, Any] = {}
    _GLOBAL_STATE_KEY = "state"
    _PERPARTITION_STATE_KEY = "states"
    _KEY = 0
    _VALUE = 1

    def __init__(
        self,
        cursor_factory: ConcurrentCursorFactory,
        partition_router: PartitionRouter,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: Any,
        message_repository: MessageRepository,
        connector_state_manager: ConnectorStateManager,
        connector_state_converter: AbstractStreamStateConverter,
        cursor_field: CursorField,
    ) -> None:
        self._global_cursor: Optional[StreamState] = {}
        self._stream_name = stream_name
        self._stream_namespace = stream_namespace
        self._message_repository = message_repository
        self._connector_state_manager = connector_state_manager
        self._connector_state_converter = connector_state_converter
        self._cursor_field = cursor_field

        self._cursor_factory = cursor_factory
        self._partition_router = partition_router

        # The dict is ordered to ensure that once the maximum number of partitions is reached,
        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
        self._cursor_per_partition: OrderedDict[str, ConcurrentCursor] = OrderedDict()
        self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()

        # Parent-state tracking: store each partition's parent state in creation order
        self._partition_parent_state_map: OrderedDict[str, Mapping[str, Any]] = OrderedDict()

        self._finished_partitions: set[str] = set()
        self._lock = threading.Lock()
        self._timer = Timer()
        self._new_global_cursor: Optional[StreamState] = None
        self._lookback_window: int = 0
        self._parent_state: Optional[StreamState] = None
        self._number_of_partitions: int = 0
        self._use_global_cursor: bool = False
        self._partition_serializer = PerPartitionKeySerializer()
        # Track the last time a state message was emitted
        self._last_emission_time: float = 0.0

        self._set_initial_state(stream_state)

    @property
    def cursor_field(self) -> CursorField:
        return self._cursor_field

    @property
    def state(self) -> MutableMapping[str, Any]:
        state: dict[str, Any] = {"use_global_cursor": self._use_global_cursor}
        if not self._use_global_cursor:
            states = []
            for partition_tuple, cursor in self._cursor_per_partition.items():
                if cursor.state:
                    states.append(
                        {
                            "partition": self._to_dict(partition_tuple),
                            "cursor": copy.deepcopy(cursor.state),
                        }
                    )
            state[self._PERPARTITION_STATE_KEY] = states

        if self._global_cursor:
            state[self._GLOBAL_STATE_KEY] = self._global_cursor
        if self._lookback_window is not None:
            state["lookback_window"] = self._lookback_window
        if self._parent_state is not None:
            state["parent_state"] = self._parent_state
        return state

    def close_partition(self, partition: Partition) -> None:
        # Attempt to retrieve the stream slice
        stream_slice: Optional[StreamSlice] = partition.to_slice()  # type: ignore[assignment]

        # Ensure stream_slice is not None
        if stream_slice is None:
            raise ValueError("stream_slice cannot be None")

        partition_key = self._to_partition_key(stream_slice.partition)
        with self._lock:
            self._semaphore_per_partition[partition_key].acquire()
            if not self._use_global_cursor:
                self._cursor_per_partition[partition_key].close_partition(partition=partition)
                cursor = self._cursor_per_partition[partition_key]
                if (
                    partition_key in self._finished_partitions
                    and self._semaphore_per_partition[partition_key]._value == 0
                ):
                    self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key])

            self._check_and_update_parent_state()

        self._emit_state_message()

    def _check_and_update_parent_state(self) -> None:
        """
        Pop the leftmost partition state from _partition_parent_state_map only if
        *all partitions* up to (and including) that partition key in _semaphore_per_partition
        are fully finished (i.e. in _finished_partitions and semaphore._value == 0).
        Additionally, delete finished semaphores with a value of 0 to free up memory,
        as they are only needed to track errors and completion status.
175 """ 176 last_closed_state = None 177 178 while self._partition_parent_state_map: 179 # Look at the earliest partition key in creation order 180 earliest_key = next(iter(self._partition_parent_state_map)) 181 182 # Verify ALL partitions from the left up to earliest_key are finished 183 all_left_finished = True 184 for p_key, sem in list( 185 self._semaphore_per_partition.items() 186 ): # Use list to allow modification during iteration 187 # If any earlier partition is still not finished, we must stop 188 if p_key not in self._finished_partitions or sem._value != 0: 189 all_left_finished = False 190 break 191 # Once we've reached earliest_key in the semaphore order, we can stop checking 192 if p_key == earliest_key: 193 break 194 195 # If the partitions up to earliest_key are not all finished, break the while-loop 196 if not all_left_finished: 197 break 198 199 # Pop the leftmost entry from parent-state map 200 _, closed_parent_state = self._partition_parent_state_map.popitem(last=False) 201 last_closed_state = closed_parent_state 202 203 # Clean up finished semaphores with value 0 up to and including earliest_key 204 for p_key in list(self._semaphore_per_partition.keys()): 205 sem = self._semaphore_per_partition[p_key] 206 if p_key in self._finished_partitions and sem._value == 0: 207 del self._semaphore_per_partition[p_key] 208 logger.debug(f"Deleted finished semaphore for partition {p_key} with value 0") 209 if p_key == earliest_key: 210 break 211 212 # Update _parent_state if we popped at least one partition 213 if last_closed_state is not None: 214 self._parent_state = last_closed_state 215 216 def ensure_at_least_one_state_emitted(self) -> None: 217 """ 218 The platform expects at least one state message on successful syncs. Hence, whatever happens, we expect this method to be 219 called. 220 """ 221 if not any( 222 semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items() 223 ): 224 self._global_cursor = self._new_global_cursor 225 self._lookback_window = self._timer.finish() 226 self._parent_state = self._partition_router.get_stream_state() 227 self._emit_state_message(throttle=False) 228 229 def _throttle_state_message(self) -> Optional[float]: 230 """ 231 Throttles the state message emission to once every 60 seconds. 
232 """ 233 current_time = time.time() 234 if current_time - self._last_emission_time <= 60: 235 return None 236 return current_time 237 238 def _emit_state_message(self, throttle: bool = True) -> None: 239 if throttle: 240 current_time = self._throttle_state_message() 241 if current_time is None: 242 return 243 self._last_emission_time = current_time 244 self._connector_state_manager.update_state_for_stream( 245 self._stream_name, 246 self._stream_namespace, 247 self.state, 248 ) 249 state_message = self._connector_state_manager.create_state_message( 250 self._stream_name, self._stream_namespace 251 ) 252 self._message_repository.emit_message(state_message) 253 254 def stream_slices(self) -> Iterable[StreamSlice]: 255 if self._timer.is_running(): 256 raise RuntimeError("stream_slices has been executed more than once.") 257 258 slices = self._partition_router.stream_slices() 259 self._timer.start() 260 for partition, last, parent_state in iterate_with_last_flag_and_state( 261 slices, self._partition_router.get_stream_state 262 ): 263 yield from self._generate_slices_from_partition(partition, parent_state) 264 265 def _generate_slices_from_partition( 266 self, partition: StreamSlice, parent_state: Mapping[str, Any] 267 ) -> Iterable[StreamSlice]: 268 # Ensure the maximum number of partitions is not exceeded 269 self._ensure_partition_limit() 270 271 partition_key = self._to_partition_key(partition.partition) 272 273 cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition)) 274 if not cursor: 275 cursor = self._create_cursor( 276 self._global_cursor, 277 self._lookback_window if self._global_cursor else 0, 278 ) 279 with self._lock: 280 self._number_of_partitions += 1 281 self._cursor_per_partition[partition_key] = cursor 282 self._semaphore_per_partition[partition_key] = threading.Semaphore(0) 283 284 with self._lock: 285 if ( 286 len(self._partition_parent_state_map) == 0 287 or self._partition_parent_state_map[ 288 next(reversed(self._partition_parent_state_map)) 289 ] 290 != parent_state 291 ): 292 self._partition_parent_state_map[partition_key] = deepcopy(parent_state) 293 294 for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state( 295 cursor.stream_slices(), 296 lambda: None, 297 ): 298 self._semaphore_per_partition[partition_key].release() 299 if is_last_slice: 300 self._finished_partitions.add(partition_key) 301 yield StreamSlice( 302 partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields 303 ) 304 305 def _ensure_partition_limit(self) -> None: 306 """ 307 Ensure the maximum number of partitions does not exceed the predefined limit. 308 309 Steps: 310 1. Attempt to remove partitions that are marked as finished in `_finished_partitions`. 311 These partitions are considered processed and safe to delete. 312 2. If the limit is still exceeded and no finished partitions are available for removal, 313 remove the oldest partition unconditionally. We expect failed partitions to be removed. 314 315 Logging: 316 - Logs a warning each time a partition is removed, indicating whether it was finished 317 or removed due to being the oldest. 318 """ 319 if not self._use_global_cursor and self.limit_reached(): 320 logger.info( 321 f"Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of {self.SWITCH_TO_GLOBAL_LIMIT}. " 322 f"Switching to global cursor for {self._stream_name}." 
            )
            self._use_global_cursor = True

        with self._lock:
            while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
                # Try removing finished partitions first
                for partition_key in list(self._cursor_per_partition.keys()):
                    if partition_key in self._finished_partitions and (
                        partition_key not in self._semaphore_per_partition
                        or self._semaphore_per_partition[partition_key]._value == 0
                    ):
                        oldest_partition = self._cursor_per_partition.pop(
                            partition_key
                        )  # Remove the oldest partition
                        logger.warning(
                            f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
                        )
                        break
                else:
                    # If no finished partitions can be removed, fall back to removing the oldest partition
                    oldest_partition = self._cursor_per_partition.popitem(last=False)[
                        1
                    ]  # Remove the oldest partition
                    logger.warning(
                        f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
                    )

    def _set_initial_state(self, stream_state: StreamState) -> None:
        """
        Initialize the cursor's state using the provided `stream_state`.

        This method supports global and per-partition state initialization.

        - **Global State**: If `states` is missing, the `state` is treated as global and applied to all partitions.
          The `global state` holds a single cursor position representing the latest processed record across all partitions.

        - **Lookback Window**: Configured via `lookback_window`, it defines the period (in seconds) for reprocessing records.
          This ensures robustness in case of upstream data delays or reordering. If not specified, it defaults to 0.

        - **Per-Partition State**: If `states` is present, each partition's cursor state is initialized separately.

        - **Parent State**: (if available) Used to initialize partition routers based on parent streams.

        Args:
            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
                {
                    "states": [
                        {
                            "partition": {
                                "partition_key": "value"
                            },
                            "cursor": {
                                "last_updated": "2023-05-27T00:00:00Z"
                            }
                        }
                    ],
                    "state": {
                        "last_updated": "2023-05-27T00:00:00Z"
                    },
                    "lookback_window": 10,
                    "parent_state": {
                        "parent_stream_name": {
                            "last_updated": "2023-05-27T00:00:00Z"
                        }
                    }
                }
        """
        if not stream_state:
            return

        if (
            self._PERPARTITION_STATE_KEY not in stream_state
            and self._GLOBAL_STATE_KEY not in stream_state
        ):
            # We assume that `stream_state` is in a global format that can be applied to all partitions.
            # Example: {"global_state_format_key": "global_state_format_value"}
            self._set_global_state(stream_state)

        else:
            self._use_global_cursor = stream_state.get("use_global_cursor", False)

            self._lookback_window = int(stream_state.get("lookback_window", 0))

            for state in stream_state.get(self._PERPARTITION_STATE_KEY, []):
                self._number_of_partitions += 1
                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
                    self._create_cursor(state["cursor"])
                )

            # set default state for missing partitions if it is per partition with fallback to global
            if self._GLOBAL_STATE_KEY in stream_state:
                self._set_global_state(stream_state[self._GLOBAL_STATE_KEY])

        # Set initial parent state
        if stream_state.get("parent_state"):
            self._parent_state = stream_state["parent_state"]

        # Set parent state for partition routers based on parent streams
        self._partition_router.set_initial_state(stream_state)

    def _set_global_state(self, stream_state: Mapping[str, Any]) -> None:
        """
        Initializes the global cursor state from the provided stream state.

        If the cursor field key is present in the stream state, its value is parsed,
        formatted, and stored as the global cursor. This ensures consistency in state
        representation across partitions.
        """
        if self.cursor_field.cursor_field_key in stream_state:
            global_state_value = stream_state[self.cursor_field.cursor_field_key]
            final_format_global_state_value = self._connector_state_converter.output_format(
                self._connector_state_converter.parse_value(global_state_value)
            )

            fixed_global_state = {
                self.cursor_field.cursor_field_key: final_format_global_state_value
            }

            self._global_cursor = deepcopy(fixed_global_state)
            self._new_global_cursor = deepcopy(fixed_global_state)

    def observe(self, record: Record) -> None:
        if not record.associated_slice:
            raise ValueError(
                "Invalid state as stream slices that are emitted should refer to an existing cursor"
            )

        record_cursor = self._connector_state_converter.output_format(
            self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
        )
        self._update_global_cursor(record_cursor)
        if not self._use_global_cursor:
            self._cursor_per_partition[
                self._to_partition_key(record.associated_slice.partition)
            ].observe(record)

    def _update_global_cursor(self, value: Any) -> None:
        if (
            self._new_global_cursor is None
            or self._new_global_cursor[self.cursor_field.cursor_field_key] < value
        ):
            self._new_global_cursor = {self.cursor_field.cursor_field_key: copy.deepcopy(value)}

    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
        return self._partition_serializer.to_partition_key(partition)

    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
        return self._partition_serializer.to_partition(partition_key)

    def _create_cursor(
        self, cursor_state: Any, runtime_lookback_window: int = 0
    ) -> ConcurrentCursor:
        cursor = self._cursor_factory.create(
            stream_state=deepcopy(cursor_state),
            runtime_lookback_window=timedelta(seconds=runtime_lookback_window),
        )
        return cursor

    def should_be_synced(self, record: Record) -> bool:
        return self._get_cursor(record).should_be_synced(record)

    def _get_cursor(self, record: Record) -> ConcurrentCursor:
        if not record.associated_slice:
            raise ValueError(
                "Invalid state as stream slices that are emitted should refer to an existing cursor"
            )
        partition_key = self._to_partition_key(record.associated_slice.partition)
        if partition_key not in self._cursor_per_partition:
            raise ValueError(
                "Invalid state as stream slices that are emitted should refer to an existing cursor"
            )
        cursor = self._cursor_per_partition[partition_key]
        return cursor

    def limit_reached(self) -> bool:
        return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT
Manages state per partition when a stream has many partitions, preventing data loss or duplication.
Attributes:
DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 25,000).
Partition Limitation Logic: ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. The oldest partitions are removed when the limit is reached.
Global Cursor Fallback: new partitions are initialized from the global state so that state still progresses for deleted or newly created partitions. Records older than the global cursor that land in such partitions after the initial sync will be skipped.
ConcurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}.
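Concretely, the state tracked and emitted by this cursor combines the per-partition states, the global fallback cursor, the lookback window, and the parent state. The keys below come from the source (`states` is _PERPARTITION_STATE_KEY, `state` is _GLOBAL_STATE_KEY); the partition and cursor values are illustrative:

state = {
    "use_global_cursor": False,
    # One entry per tracked partition
    "states": [
        {
            "partition": {"partition_key": "value"},
            "cursor": {"last_updated": "2023-05-27T00:00:00Z"},
        }
    ],
    # Global fallback cursor used to seed new partitions
    "state": {"last_updated": "2023-05-27T00:00:00Z"},
    "lookback_window": 10,  # seconds
    "parent_state": {"parent_stream_name": {"last_updated": "2023-05-27T00:00:00Z"}},
}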
    def __init__(
        self,
        cursor_factory: ConcurrentCursorFactory,
        partition_router: PartitionRouter,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: Any,
        message_repository: MessageRepository,
        connector_state_manager: ConnectorStateManager,
        connector_state_converter: AbstractStreamStateConverter,
        cursor_field: CursorField,
    ) -> None:
        self._global_cursor: Optional[StreamState] = {}
        self._stream_name = stream_name
        self._stream_namespace = stream_namespace
        self._message_repository = message_repository
        self._connector_state_manager = connector_state_manager
        self._connector_state_converter = connector_state_converter
        self._cursor_field = cursor_field

        self._cursor_factory = cursor_factory
        self._partition_router = partition_router

        # The dict is ordered to ensure that once the maximum number of partitions is reached,
        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
        self._cursor_per_partition: OrderedDict[str, ConcurrentCursor] = OrderedDict()
        self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()

        # Parent-state tracking: store each partition's parent state in creation order
        self._partition_parent_state_map: OrderedDict[str, Mapping[str, Any]] = OrderedDict()

        self._finished_partitions: set[str] = set()
        self._lock = threading.Lock()
        self._timer = Timer()
        self._new_global_cursor: Optional[StreamState] = None
        self._lookback_window: int = 0
        self._parent_state: Optional[StreamState] = None
        self._number_of_partitions: int = 0
        self._use_global_cursor: bool = False
        self._partition_serializer = PerPartitionKeySerializer()
        # Track the last time a state message was emitted
        self._last_emission_time: float = 0.0

        self._set_initial_state(stream_state)
    @property
    def state(self) -> MutableMapping[str, Any]:
        state: dict[str, Any] = {"use_global_cursor": self._use_global_cursor}
        if not self._use_global_cursor:
            states = []
            for partition_tuple, cursor in self._cursor_per_partition.items():
                if cursor.state:
                    states.append(
                        {
                            "partition": self._to_dict(partition_tuple),
                            "cursor": copy.deepcopy(cursor.state),
                        }
                    )
            state[self._PERPARTITION_STATE_KEY] = states

        if self._global_cursor:
            state[self._GLOBAL_STATE_KEY] = self._global_cursor
        if self._lookback_window is not None:
            state["lookback_window"] = self._lookback_window
        if self._parent_state is not None:
            state["parent_state"] = self._parent_state
        return state
    def close_partition(self, partition: Partition) -> None:
        # Attempt to retrieve the stream slice
        stream_slice: Optional[StreamSlice] = partition.to_slice()  # type: ignore[assignment]

        # Ensure stream_slice is not None
        if stream_slice is None:
            raise ValueError("stream_slice cannot be None")

        partition_key = self._to_partition_key(stream_slice.partition)
        with self._lock:
            self._semaphore_per_partition[partition_key].acquire()
            if not self._use_global_cursor:
                self._cursor_per_partition[partition_key].close_partition(partition=partition)
                cursor = self._cursor_per_partition[partition_key]
                if (
                    partition_key in self._finished_partitions
                    and self._semaphore_per_partition[partition_key]._value == 0
                ):
                    self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key])

            self._check_and_update_parent_state()

        self._emit_state_message()
Indicate to the cursor that the partition has been successfully processed
    def ensure_at_least_one_state_emitted(self) -> None:
        """
        The platform expects at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
        called.
        """
        if not any(
            semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
        ):
            self._global_cursor = self._new_global_cursor
            self._lookback_window = self._timer.finish()
            self._parent_state = self._partition_router.get_stream_state()
        self._emit_state_message(throttle=False)
The platform expects at least one state message on successful syncs. Hence, whatever happens, we expect this method to be called.
    def stream_slices(self) -> Iterable[StreamSlice]:
        if self._timer.is_running():
            raise RuntimeError("stream_slices has been executed more than once.")

        slices = self._partition_router.stream_slices()
        self._timer.start()
        for partition, last, parent_state in iterate_with_last_flag_and_state(
            slices, self._partition_router.get_stream_state
        ):
            yield from self._generate_slices_from_partition(partition, parent_state)
Generates stream slices by iterating over the partition router's partitions and, for each partition, the slices of its per-partition cursor. Raises a RuntimeError if executed more than once.
    def observe(self, record: Record) -> None:
        if not record.associated_slice:
            raise ValueError(
                "Invalid state as stream slices that are emitted should refer to an existing cursor"
            )

        record_cursor = self._connector_state_converter.output_format(
            self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
        )
        self._update_global_cursor(record_cursor)
        if not self._use_global_cursor:
            self._cursor_per_partition[
                self._to_partition_key(record.associated_slice.partition)
            ].observe(record)
Indicate to the cursor that the record has been emitted
@dataclass
class DatetimeBasedCursor(DeclarativeCursor):
    """
    Slices the stream over a datetime range and creates a state with format {<cursor_field>: <datetime>}

    Given a start time, end time, a step function, and an optional lookback window,
    the stream slicer will partition the date range from start time - lookback window to end time.

    The step is defined as an ISO 8601 duration string.

    The timestamp format accepts the same format codes as datetime.strptime, which are
    all the format codes required by the 1989 C standard.
    Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html

    Attributes:
        start_datetime (Union[MinMaxDatetime, str]): the datetime that determines the earliest record that should be synced
        end_datetime (Optional[Union[MinMaxDatetime, str]]): the datetime that determines the last record that should be synced
        cursor_field (Union[InterpolatedString, str]): record's cursor field
        datetime_format (str): format of the datetime
        step (Optional[str]): size of the timewindow (ISO8601 duration)
        cursor_granularity (Optional[str]): smallest increment the datetime_format has (ISO 8601 duration) that will be used to
            ensure that the start of a slice does not overlap with the end of the previous one
        config (Config): connection config
        start_time_option (Optional[RequestOption]): request option for start time
        end_time_option (Optional[RequestOption]): request option for end time
        partition_field_start (Optional[str]): partition start time field
        partition_field_end (Optional[str]): stream slice end time field
        lookback_window (Optional[InterpolatedString]): how many days before start_datetime to read data for (ISO8601 duration)
    """

    start_datetime: Union[MinMaxDatetime, str]
    cursor_field: Union[InterpolatedString, str]
    datetime_format: str
    config: Config
    parameters: InitVar[Mapping[str, Any]]
    _highest_observed_cursor_field_value: Optional[str] = field(
        repr=False, default=None
    )  # tracks the latest observed datetime, which may not be safe to emit in the case of out-of-order records
    _cursor: Optional[str] = field(
        repr=False, default=None
    )  # tracks the latest observed datetime that is appropriate to emit as stream state
    end_datetime: Optional[Union[MinMaxDatetime, str]] = None
    step: Optional[Union[InterpolatedString, str]] = None
    cursor_granularity: Optional[str] = None
    start_time_option: Optional[RequestOption] = None
    end_time_option: Optional[RequestOption] = None
    partition_field_start: Optional[str] = None
    partition_field_end: Optional[str] = None
    lookback_window: Optional[Union[InterpolatedString, str]] = None
    message_repository: Optional[MessageRepository] = None
    is_compare_strictly: Optional[bool] = False
    cursor_datetime_formats: List[str] = field(default_factory=lambda: [])

    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
        if (self.step and not self.cursor_granularity) or (
            not self.step and self.cursor_granularity
        ):
            raise ValueError(
                f"If step is defined, cursor_granularity should be as well and vice-versa. "
" 86 f"Right now, step is `{self.step}` and cursor_granularity is `{self.cursor_granularity}`" 87 ) 88 self._start_datetime = MinMaxDatetime.create(self.start_datetime, parameters) 89 self._end_datetime = ( 90 None if not self.end_datetime else MinMaxDatetime.create(self.end_datetime, parameters) 91 ) 92 93 self._timezone = datetime.timezone.utc 94 self._interpolation = JinjaInterpolation() 95 96 self._step = ( 97 self._parse_timedelta( 98 InterpolatedString.create(self.step, parameters=parameters).eval(self.config) 99 ) 100 if self.step 101 else datetime.timedelta.max 102 ) 103 self._cursor_granularity = self._parse_timedelta(self.cursor_granularity) 104 self.cursor_field = InterpolatedString.create(self.cursor_field, parameters=parameters) 105 self._lookback_window = ( 106 InterpolatedString.create(self.lookback_window, parameters=parameters) 107 if self.lookback_window 108 else None 109 ) 110 self._partition_field_start = InterpolatedString.create( 111 self.partition_field_start or "start_time", parameters=parameters 112 ) 113 self._partition_field_end = InterpolatedString.create( 114 self.partition_field_end or "end_time", parameters=parameters 115 ) 116 self._parser = DatetimeParser() 117 118 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 119 if not self._start_datetime.datetime_format: 120 self._start_datetime.datetime_format = self.datetime_format 121 if self._end_datetime and not self._end_datetime.datetime_format: 122 self._end_datetime.datetime_format = self.datetime_format 123 124 if not self.cursor_datetime_formats: 125 self.cursor_datetime_formats = [self.datetime_format] 126 127 _validate_component_request_option_paths( 128 self.config, self.start_time_option, self.end_time_option 129 ) 130 131 def get_stream_state(self) -> StreamState: 132 return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {} # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 133 134 def set_initial_state(self, stream_state: StreamState) -> None: 135 """ 136 Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called 137 before calling anything else 138 139 :param stream_state: The state of the stream as returned by get_stream_state 140 """ 141 self._cursor = ( 142 stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None # type: ignore [union-attr] 143 ) 144 145 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 146 """ 147 Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read. 148 149 :param stream_slice: The current slice, which may or may not contain the most recently observed record 150 :param record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the 151 stream state may need to be deferred depending on whether the source reliably orders records by the cursor field. 
152 """ 153 record_cursor_value = record.get(self.cursor_field.eval(self.config)) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 154 # if the current record has no cursor value, we cannot meaningfully update the state based on it, so there is nothing more to do 155 if not record_cursor_value: 156 return 157 158 start_field = self._partition_field_start.eval(self.config) 159 end_field = self._partition_field_end.eval(self.config) 160 is_highest_observed_cursor_value = ( 161 not self._highest_observed_cursor_field_value 162 or self.parse_date(record_cursor_value) 163 > self.parse_date(self._highest_observed_cursor_field_value) 164 ) 165 if ( 166 self._is_within_daterange_boundaries( 167 record, 168 stream_slice.get(start_field), # type: ignore [arg-type] 169 stream_slice.get(end_field), # type: ignore [arg-type] 170 ) 171 and is_highest_observed_cursor_value 172 ): 173 self._highest_observed_cursor_field_value = record_cursor_value 174 175 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 176 if stream_slice.partition: 177 raise ValueError( 178 f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}." 179 ) 180 cursor_value_str_by_cursor_value_datetime = dict( 181 map( 182 # we need to ensure the cursor value is preserved as is in the state else the CATs might complain of something like 183 # 2023-01-04T17:30:19.000Z' <= '2023-01-04T17:30:19.000000Z' 184 lambda datetime_str: (self.parse_date(datetime_str), datetime_str), # type: ignore # because of the filter on the next line, this will only be called with a str 185 filter( 186 lambda item: item, [self._cursor, self._highest_observed_cursor_field_value] 187 ), 188 ) 189 ) 190 self._cursor = ( 191 cursor_value_str_by_cursor_value_datetime[ 192 max(cursor_value_str_by_cursor_value_datetime.keys()) 193 ] 194 if cursor_value_str_by_cursor_value_datetime 195 else None 196 ) 197 198 def stream_slices(self) -> Iterable[StreamSlice]: 199 """ 200 Partition the daterange into slices of size = step. 201 202 The start of the window is the minimum datetime between start_datetime - lookback_window and the stream_state's datetime 203 The end of the window is the minimum datetime between the start of the window and end_datetime. 204 205 :return: 206 """ 207 end_datetime = self.select_best_end_datetime() 208 start_datetime = self._calculate_earliest_possible_value(self.select_best_end_datetime()) 209 return self._partition_daterange(start_datetime, end_datetime, self._step) 210 211 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 212 # Datetime based cursors operate over slices made up of datetime ranges. Stream state is based on the progress 213 # through each slice and does not belong to a specific slice. We just return stream state as it is. 214 return self.get_stream_state() 215 216 def _calculate_earliest_possible_value( 217 self, end_datetime: datetime.datetime 218 ) -> datetime.datetime: 219 lookback_delta = self._parse_timedelta( 220 self._lookback_window.eval(self.config) if self._lookback_window else "P0D" 221 ) 222 earliest_possible_start_datetime = min( 223 self._start_datetime.get_datetime(self.config), end_datetime 224 ) 225 try: 226 cursor_datetime = ( 227 self._calculate_cursor_datetime_from_state(self.get_stream_state()) - lookback_delta 228 ) 229 except OverflowError: 230 # cursor_datetime defers to the minimum date if it does not exist in the state. 
            # a timedelta from the minimum datetime results in an OverflowError
            cursor_datetime = self._calculate_cursor_datetime_from_state(self.get_stream_state())
        return max(earliest_possible_start_datetime, cursor_datetime)

    def select_best_end_datetime(self) -> datetime.datetime:
        """
        Returns the optimal end datetime.
        This method compares the current datetime with a pre-configured end datetime
        and returns the earlier of the two. If no pre-configured end datetime is set,
        the current datetime is returned.

        :return datetime.datetime: The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier.
        """
        now = datetime.datetime.now(tz=self._timezone)
        if not self._end_datetime:
            return now
        return min(self._end_datetime.get_datetime(self.config), now)

    def _calculate_cursor_datetime_from_state(
        self, stream_state: Mapping[str, Any]
    ) -> datetime.datetime:
        if self.cursor_field.eval(self.config, stream_state=stream_state) in stream_state:  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
            return self.parse_date(stream_state[self.cursor_field.eval(self.config)])  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
        return datetime.datetime.min.replace(tzinfo=datetime.timezone.utc)

    def _format_datetime(self, dt: datetime.datetime) -> str:
        return self._parser.format(dt, self.datetime_format)

    def _partition_daterange(
        self,
        start: datetime.datetime,
        end: datetime.datetime,
        step: Union[datetime.timedelta, Duration],
    ) -> List[StreamSlice]:
        start_field = self._partition_field_start.eval(self.config)
        end_field = self._partition_field_end.eval(self.config)
        dates = []

        while self._is_within_date_range(start, end):
            next_start = self._evaluate_next_start_date_safely(start, step)
            end_date = self._get_date(next_start - self._cursor_granularity, end, min)
            dates.append(
                StreamSlice(
                    partition={},
                    cursor_slice={
                        start_field: self._format_datetime(start),
                        end_field: self._format_datetime(end_date),
                    },
                )
            )
            start = next_start
        return dates

    def _is_within_date_range(self, start: datetime.datetime, end: datetime.datetime) -> bool:
        if self.is_compare_strictly:
            return start < end
        return start <= end

    def _evaluate_next_start_date_safely(
        self, start: datetime.datetime, step: datetime.timedelta
    ) -> datetime.datetime:
        """
        Given that we set the default step at datetime.timedelta.max, we will generate an OverflowError when evaluating the next start_date
        This method assumes that users would never enter a step that would generate an overflow. Given that would be the case, the code
        would have broken anyway.
296 """ 297 try: 298 return start + step 299 except OverflowError: 300 return datetime.datetime.max.replace(tzinfo=datetime.timezone.utc) 301 302 def _get_date( 303 self, 304 cursor_value: datetime.datetime, 305 default_date: datetime.datetime, 306 comparator: Callable[[datetime.datetime, datetime.datetime], datetime.datetime], 307 ) -> datetime.datetime: 308 cursor_date = cursor_value or default_date 309 return comparator(cursor_date, default_date) 310 311 def parse_date(self, date: str) -> datetime.datetime: 312 for datetime_format in self.cursor_datetime_formats + [self.datetime_format]: 313 try: 314 return self._parser.parse(date, datetime_format) 315 except ValueError: 316 pass 317 raise ValueError(f"No format in {self.cursor_datetime_formats} matching {date}") 318 319 @classmethod 320 def _parse_timedelta(cls, time_str: Optional[str]) -> Union[datetime.timedelta, Duration]: 321 """ 322 :return Parses an ISO 8601 durations into datetime.timedelta or Duration objects. 323 """ 324 if not time_str: 325 return datetime.timedelta(0) 326 return parse_duration(time_str) 327 328 def get_request_params( 329 self, 330 *, 331 stream_state: Optional[StreamState] = None, 332 stream_slice: Optional[StreamSlice] = None, 333 next_page_token: Optional[Mapping[str, Any]] = None, 334 ) -> Mapping[str, Any]: 335 return self._get_request_options(RequestOptionType.request_parameter, stream_slice) 336 337 def get_request_headers( 338 self, 339 *, 340 stream_state: Optional[StreamState] = None, 341 stream_slice: Optional[StreamSlice] = None, 342 next_page_token: Optional[Mapping[str, Any]] = None, 343 ) -> Mapping[str, Any]: 344 return self._get_request_options(RequestOptionType.header, stream_slice) 345 346 def get_request_body_data( 347 self, 348 *, 349 stream_state: Optional[StreamState] = None, 350 stream_slice: Optional[StreamSlice] = None, 351 next_page_token: Optional[Mapping[str, Any]] = None, 352 ) -> Mapping[str, Any]: 353 return self._get_request_options(RequestOptionType.body_data, stream_slice) 354 355 def get_request_body_json( 356 self, 357 *, 358 stream_state: Optional[StreamState] = None, 359 stream_slice: Optional[StreamSlice] = None, 360 next_page_token: Optional[Mapping[str, Any]] = None, 361 ) -> Mapping[str, Any]: 362 return self._get_request_options(RequestOptionType.body_json, stream_slice) 363 364 def request_kwargs(self) -> Mapping[str, Any]: 365 # Never update kwargs 366 return {} 367 368 def _get_request_options( 369 self, option_type: RequestOptionType, stream_slice: Optional[StreamSlice] 370 ) -> Mapping[str, Any]: 371 options: MutableMapping[str, Any] = {} 372 if not stream_slice: 373 return options 374 375 if self.start_time_option and self.start_time_option.inject_into == option_type: 376 start_time_value = stream_slice.get(self._partition_field_start.eval(self.config)) 377 self.start_time_option.inject_into_request(options, start_time_value, self.config) 378 379 if self.end_time_option and self.end_time_option.inject_into == option_type: 380 end_time_value = stream_slice.get(self._partition_field_end.eval(self.config)) 381 self.end_time_option.inject_into_request(options, end_time_value, self.config) 382 383 return options 384 385 def should_be_synced(self, record: Record) -> bool: 386 cursor_field = self.cursor_field.eval(self.config) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 387 record_cursor_value = record.get(cursor_field) 388 if not record_cursor_value: 389 self._send_log( 390 Level.WARN, 391 f"Could not find cursor field 
            )
            return True
        latest_possible_cursor_value = self.select_best_end_datetime()
        earliest_possible_cursor_value = self._calculate_earliest_possible_value(
            latest_possible_cursor_value
        )
        return self._is_within_daterange_boundaries(
            record, earliest_possible_cursor_value, latest_possible_cursor_value
        )

    def _is_within_daterange_boundaries(
        self,
        record: Record,
        start_datetime_boundary: Union[datetime.datetime, str],
        end_datetime_boundary: Union[datetime.datetime, str],
    ) -> bool:
        cursor_field = self.cursor_field.eval(self.config)  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
        record_cursor_value = record.get(cursor_field)
        if not record_cursor_value:
            self._send_log(
                Level.WARN,
                f"Could not find cursor field `{cursor_field}` in record. The record will not be considered when emitting sync state",
            )
            return False
        if isinstance(start_datetime_boundary, str):
            start_datetime_boundary = self.parse_date(start_datetime_boundary)
        if isinstance(end_datetime_boundary, str):
            end_datetime_boundary = self.parse_date(end_datetime_boundary)
        return (
            start_datetime_boundary <= self.parse_date(record_cursor_value) <= end_datetime_boundary
        )

    def _send_log(self, level: Level, message: str) -> None:
        if self.message_repository:
            self.message_repository.emit_message(
                AirbyteMessage(
                    type=Type.LOG,
                    log=AirbyteLogMessage(level=level, message=message),
                )
            )

    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
        cursor_field = self.cursor_field.eval(self.config)  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
        first_cursor_value = first.get(cursor_field)
        second_cursor_value = second.get(cursor_field)
        if first_cursor_value and second_cursor_value:
            return self.parse_date(first_cursor_value) >= self.parse_date(second_cursor_value)
        elif first_cursor_value:
            return True
        else:
            return False

    def set_runtime_lookback_window(self, lookback_window_in_seconds: int) -> None:
        """
        Updates the lookback window based on a given number of seconds if the new duration
        is greater than the currently configured lookback window.

        :param lookback_window_in_seconds: The lookback duration in seconds to potentially update to.
        """
        runtime_lookback_window = duration_isoformat(timedelta(seconds=lookback_window_in_seconds))
        config_lookback = parse_duration(
            self._lookback_window.eval(self.config) if self._lookback_window else "P0D"
        )

        # Check if the new runtime lookback window is greater than the current config lookback
        if parse_duration(runtime_lookback_window) > config_lookback:
            self._lookback_window = InterpolatedString.create(
                runtime_lookback_window, parameters={}
            )
Slices the stream over a datetime range and creates a state with format {<cursor_field>: <datetime>}.
Given a start time, end time, a step function, and an optional lookback window, the stream slicer will partition the date range from start time - lookback window to end time.
The step is defined as an ISO 8601 duration string.
The timestamp format accepts the same format codes as datetime.strptime, which are all the format codes required by the 1989 C standard. Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html
Attributes:
- start_datetime (Union[MinMaxDatetime, str]): the datetime that determines the earliest record that should be synced
- end_datetime (Optional[Union[MinMaxDatetime, str]]): the datetime that determines the last record that should be synced
- cursor_field (Union[InterpolatedString, str]): record's cursor field
- datetime_format (str): format of the datetime
- step (Optional[str]): size of the timewindow (ISO8601 duration)
- cursor_granularity (Optional[str]): smallest increment the datetime_format has (ISO 8601 duration) that will be used to ensure that the start of a slice does not overlap with the end of the previous one
- config (Config): connection config
- start_time_option (Optional[RequestOption]): request option for start time
- end_time_option (Optional[RequestOption]): request option for end time
- partition_field_start (Optional[str]): partition start time field
- partition_field_end (Optional[str]): stream slice end time field
- lookback_window (Optional[InterpolatedString]): how many days before start_datetime to read data for (ISO8601 duration)
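For example, a cursor that slices a stream into one-day windows can be constructed directly from the dataclass fields listed above. All values here are an illustrative configuration, not CDK defaults:

# Illustrative configuration: daily windows over a fixed date range.
cursor = DatetimeBasedCursor(
    start_datetime="2023-01-01T00:00:00Z",
    end_datetime="2023-01-05T00:00:00Z",
    cursor_field="last_updated",
    datetime_format="%Y-%m-%dT%H:%M:%SZ",
    step="P1D",  # one-day time windows
    cursor_granularity="PT1S",  # keeps consecutive windows from overlapping
    config={},
    parameters={},
)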
    def get_stream_state(self) -> StreamState:
        return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {}  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
Returns the current stream state. We would like to restrict its usage since it exposes internals of the state. As of 2023-06-14, it is used for the following:
- Interpolation of the requests
- Transformation of records
- Saving the state
For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.
    def set_initial_state(self, stream_state: StreamState) -> None:
        """
        Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called
        before calling anything else

        :param stream_state: The state of the stream as returned by get_stream_state
        """
        self._cursor = (
            stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None  # type: ignore [union-attr]
        )
Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called before calling anything else
Parameters
- stream_state: The state of the stream as returned by get_stream_state
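A short round-trip sketch using the illustrative cursor configured above:

# The state key is the evaluated cursor field ("last_updated" here).
cursor.set_initial_state({"last_updated": "2023-01-02T00:00:00Z"})
assert cursor.get_stream_state() == {"last_updated": "2023-01-02T00:00:00Z"}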
    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
        """
        Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.

        :param stream_slice: The current slice, which may or may not contain the most recently observed record
        :param record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the
            stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
        """
        record_cursor_value = record.get(self.cursor_field.eval(self.config))  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
        # if the current record has no cursor value, we cannot meaningfully update the state based on it, so there is nothing more to do
        if not record_cursor_value:
            return

        start_field = self._partition_field_start.eval(self.config)
        end_field = self._partition_field_end.eval(self.config)
        is_highest_observed_cursor_value = (
            not self._highest_observed_cursor_field_value
            or self.parse_date(record_cursor_value)
            > self.parse_date(self._highest_observed_cursor_field_value)
        )
        if (
            self._is_within_daterange_boundaries(
                record,
                stream_slice.get(start_field),  # type: ignore [arg-type]
                stream_slice.get(end_field),  # type: ignore [arg-type]
            )
            and is_highest_observed_cursor_value
        ):
            self._highest_observed_cursor_field_value = record_cursor_value
Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.
Parameters
- stream_slice: The current slice, which may or may not contain the most recently observed record
- record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
        if stream_slice.partition:
            raise ValueError(
                f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}."
            )
        cursor_value_str_by_cursor_value_datetime = dict(
            map(
                # we need to ensure the cursor value is preserved as is in the state else the CATs might complain of something like
                # 2023-01-04T17:30:19.000Z' <= '2023-01-04T17:30:19.000000Z'
                lambda datetime_str: (self.parse_date(datetime_str), datetime_str),  # type: ignore # because of the filter on the next line, this will only be called with a str
                filter(
                    lambda item: item, [self._cursor, self._highest_observed_cursor_field_value]
                ),
            )
        )
        self._cursor = (
            cursor_value_str_by_cursor_value_datetime[
                max(cursor_value_str_by_cursor_value_datetime.keys())
            ]
            if cursor_value_str_by_cursor_value_datetime
            else None
        )
Update state based on the stream slice. Note that stream_slice.cursor_slice and most_recent_record.associated_slice are expected to be the same, but we make it explicit here that stream_slice should be leveraged to update the state. We do not pass in the latest record, since cursor instances should maintain the relevant internal state on their own.
Parameters
- stream_slice: slice to close
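Putting observe and close_slice together, a hedged sketch of one slice lifecycle; read_records_for is a hypothetical stand-in for the retriever, which normally drives these calls:

for stream_slice in cursor.stream_slices():
    for record in read_records_for(stream_slice):  # hypothetical reader
        cursor.observe(stream_slice, record)  # track the highest in-window value
    cursor.close_slice(stream_slice)  # commit the observed value to state
print(cursor.get_stream_state())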
    def stream_slices(self) -> Iterable[StreamSlice]:
        """
        Partition the daterange into slices of size = step.

        The start of the window is the later of start_datetime and the stream state's cursor datetime minus lookback_window.
        The end of the window is the earlier of end_datetime and the current time.

        :return: the list of stream slices partitioning the date range
        """
        end_datetime = self.select_best_end_datetime()
        start_datetime = self._calculate_earliest_possible_value(self.select_best_end_datetime())
        return self._partition_daterange(start_datetime, end_datetime, self._step)
Partition the daterange into slices of size = step.
The start of the window is the later of start_datetime and the stream state's cursor datetime minus lookback_window. The end of the window is the earlier of end_datetime and the current time.
Returns
The list of stream slices, one per step, partitioning the date range.
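With the illustrative one-day configuration above, each generated slice carries the default partition fields start_time and end_time, and consecutive windows are separated by the cursor granularity:

slices = list(cursor.stream_slices())
# Each StreamSlice's cursor_slice holds the window bounds, e.g.:
#   {"start_time": "2023-01-01T00:00:00Z", "end_time": "2023-01-01T23:59:59Z"}
#   {"start_time": "2023-01-02T00:00:00Z", "end_time": "2023-01-02T23:59:59Z"}
#   ...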
    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
        # Datetime based cursors operate over slices made up of datetime ranges. Stream state is based on the progress
        # through each slice and does not belong to a specific slice. We just return stream state as it is.
        return self.get_stream_state()
Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
    def select_best_end_datetime(self) -> datetime.datetime:
        """
        Returns the optimal end datetime.
        This method compares the current datetime with a pre-configured end datetime
        and returns the earlier of the two. If no pre-configured end datetime is set,
        the current datetime is returned.

        :return datetime.datetime: The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier.
        """
        now = datetime.datetime.now(tz=self._timezone)
        if not self._end_datetime:
            return now
        return min(self._end_datetime.get_datetime(self.config), now)
Returns the optimal end datetime. This method compares the current datetime with a pre-configured end datetime and returns the earlier of the two. If no pre-configured end datetime is set, the current datetime is returned.
Returns
The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier.
    def parse_date(self, date: str) -> datetime.datetime:
        for datetime_format in self.cursor_datetime_formats + [self.datetime_format]:
            try:
                return self._parser.parse(date, datetime_format)
            except ValueError:
                pass
        raise ValueError(f"No format in {self.cursor_datetime_formats} matching {date}")
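For instance, each configured format is tried in order before the method gives up. Values are illustrative, and the attribute is mutated after construction only for brevity:

cursor.cursor_datetime_formats = ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%d"]
cursor.parse_date("2023-05-27T00:00:00Z")  # matched by the first format
cursor.parse_date("2023-05-27")  # first format fails, second matches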
    def get_request_params(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return self._get_request_options(RequestOptionType.request_parameter, stream_slice)
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g: you might want to define query parameters for paging if next_page_token is not None.
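A hedged sketch of the injection: assuming RequestOption accepts field_name, inject_into, and parameters arguments (treat the exact signature as an assumption), the slice's window bounds are mapped onto outgoing query parameters:

# Assumed RequestOption signature; "since"/"until" are illustrative API
# parameter names, not CDK defaults.
cursor.start_time_option = RequestOption(
    field_name="since", inject_into=RequestOptionType.request_parameter, parameters={}
)
cursor.end_time_option = RequestOption(
    field_name="until", inject_into=RequestOptionType.request_parameter, parameters={}
)
params = cursor.get_request_params(stream_slice=slices[0])
# e.g. {"since": "2023-01-01T00:00:00Z", "until": "2023-01-01T23:59:59Z"}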
    def get_request_headers(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return self._get_request_options(RequestOptionType.header, stream_slice)
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
    def get_request_body_data(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return self._get_request_options(RequestOptionType.body_data, stream_slice)
Specifies how to populate the body of the request with a non-JSON payload.
If it returns plain text, the text is sent as-is. If it returns a dict, the dict is converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at the same time.
    def get_request_body_json(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return self._get_request_options(RequestOptionType.body_json, stream_slice)
Specifies how to populate the body of the request with a JSON payload.
Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at the same time.
    def should_be_synced(self, record: Record) -> bool:
        cursor_field = self.cursor_field.eval(self.config)  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
        record_cursor_value = record.get(cursor_field)
        if not record_cursor_value:
            self._send_log(
                Level.WARN,
                f"Could not find cursor field `{cursor_field}` in record. The incremental sync will assume it needs to be synced",
            )
            return True
        latest_possible_cursor_value = self.select_best_end_datetime()
        earliest_possible_cursor_value = self._calculate_earliest_possible_value(
            latest_possible_cursor_value
        )
        return self._is_within_daterange_boundaries(
            record, earliest_possible_cursor_value, latest_possible_cursor_value
        )
Evaluating whether a record should be synced allows for filtering and for stop conditions on pagination.
433 def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: 434 cursor_field = self.cursor_field.eval(self.config) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 435 first_cursor_value = first.get(cursor_field) 436 second_cursor_value = second.get(cursor_field) 437 if first_cursor_value and second_cursor_value: 438 return self.parse_date(first_cursor_value) >= self.parse_date(second_cursor_value) 439 elif first_cursor_value: 440 return True 441 else: 442 return False
Evaluates which record is greater in terms of its cursor value. This is used to avoid having to capture all the records in order to close a slice.
444 def set_runtime_lookback_window(self, lookback_window_in_seconds: int) -> None: 445 """ 446 Updates the lookback window based on a given number of seconds if the new duration 447 is greater than the currently configured lookback window. 448 449 :param lookback_window_in_seconds: The lookback duration in seconds to potentially update to. 450 """ 451 runtime_lookback_window = duration_isoformat(timedelta(seconds=lookback_window_in_seconds)) 452 config_lookback = parse_duration( 453 self._lookback_window.eval(self.config) if self._lookback_window else "P0D" 454 ) 455 456 # Check if the new runtime lookback window is greater than the current config lookback 457 if parse_duration(runtime_lookback_window) > config_lookback: 458 self._lookback_window = InterpolatedString.create( 459 runtime_lookback_window, parameters={} 460 )
Updates the lookback window based on a given number of seconds if the new duration is greater than the currently configured lookback window.
Parameters
- lookback_window_in_seconds: The lookback duration in seconds to potentially update to.
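The comparison relies on ISO-8601 durations. A small sketch of the widening rule, assuming the `duration_isoformat` and `parse_duration` helpers come from the `isodate` package:

```python
from datetime import timedelta
from isodate import duration_isoformat, parse_duration  # assumed source of these helpers

# A 300-second runtime lookback rendered as an ISO-8601 duration...
runtime = duration_isoformat(timedelta(seconds=300))  # e.g. "PT5M"
# ...compares as greater than the default zero lookback "P0D",
# so the configured lookback window would be widened.
print(parse_duration(runtime) > parse_duration("P0D"))  # True
```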
10class DeclarativeCursor(Cursor, StreamSlicer, ABC): 11 """ 12 DeclarativeCursors are components that allow for checkpointing syncs. In addition to managing the fetching and updating of 13 state, declarative cursors also manage stream slicing and injecting slice values into outbound requests. 14 """
DeclarativeCursors are components that allow for checkpointing syncs. In addition to managing the fetching and updating of state, declarative cursors also manage stream slicing and injecting slice values into outbound requests.
72class GlobalSubstreamCursor(DeclarativeCursor): 73 """ 74 The GlobalSubstreamCursor is designed to track the state of substreams using a single global cursor. 75 This class is beneficial for streams with many partitions, as it allows the state to be managed globally 76 instead of per partition, simplifying state management and reducing the size of state messages. 77 78 This cursor is activated by setting the `global_substream_cursor` parameter for incremental sync. 79 80 Warnings: 81 - This class enforces a minimal lookback window for substream based on the duration of the previous sync to avoid losing records. This lookback ensures that any records added or updated during the sync are captured in subsequent syncs. 82 - The global cursor is updated only at the end of the sync. If the sync ends prematurely (e.g., due to an exception), the state will not be updated. 83 - When using the `incremental_dependency` option, the sync will progress through parent records, preventing the sync from getting infinitely stuck. However, it is crucial to understand the requirements for both the `global_substream_cursor` and `incremental_dependency` options to avoid data loss. 84 """ 85 86 def __init__(self, stream_cursor: DatetimeBasedCursor, partition_router: PartitionRouter): 87 self._stream_cursor = stream_cursor 88 self._partition_router = partition_router 89 self._timer = Timer() 90 self._lock = threading.Lock() 91 self._slice_semaphore = threading.Semaphore( 92 0 93 ) # Start with 0, indicating no slices being tracked 94 self._all_slices_yielded = False 95 self._lookback_window: Optional[int] = None 96 self._current_partition: Optional[Mapping[str, Any]] = None 97 self._last_slice: bool = False 98 self._parent_state: Optional[Mapping[str, Any]] = None 99 100 def start_slices_generation(self) -> None: 101 self._timer.start() 102 103 def stream_slices(self) -> Iterable[StreamSlice]: 104 """ 105 Generates stream slices, ensuring the last slice is properly flagged and processed. 106 107 This method creates a sequence of stream slices by iterating over partitions and cursor slices. 108 It holds onto one slice in memory to set `_all_slices_yielded` to `True` before yielding the 109 final slice. A semaphore is used to track the processing of slices, ensuring that `close_slice` 110 is called only after all slices have been processed. 111 112 We expect the following events: 113 * Yields all the slices except the last one. At this point, `close_slice` won't actually close the global slice as `self._all_slices_yielded == False` 114 * Release the semaphore one last time before setting `self._all_slices_yielded = True`. This will cause `close_slice` to know about all the slices before we indicate that all slices have been yielded so the left side of `if self._all_slices_yielded and self._slice_semaphore._value == 0` will be false if not everything is closed 115 * Setting `self._all_slices_yielded = True`. We do that before actually yielding the last slice as the caller of `stream_slices` might stop iterating at any point and hence the code after `yield` might not be executed 116 * Yield the last slice. 
At that point, once there are as many slices yielded as closes, the global slice will be closed too 117 """ 118 slice_generator = ( 119 StreamSlice( 120 partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields 121 ) 122 for partition in self._partition_router.stream_slices() 123 for cursor_slice in self._stream_cursor.stream_slices() 124 ) 125 126 self.start_slices_generation() 127 for slice, last, state in iterate_with_last_flag_and_state( 128 slice_generator, self._partition_router.get_stream_state 129 ): 130 self._parent_state = state 131 self.register_slice(last) 132 yield slice 133 self._parent_state = self._partition_router.get_stream_state() 134 135 def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]: 136 slice_generator = ( 137 StreamSlice( 138 partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields 139 ) 140 for cursor_slice in self._stream_cursor.stream_slices() 141 ) 142 143 yield from slice_generator 144 145 def register_slice(self, last: bool) -> None: 146 """ 147 Tracks the processing of a stream slice. 148 149 Releases the semaphore for each slice. If it's the last slice (`last=True`), 150 sets `_all_slices_yielded` to `True` to indicate no more slices will be processed. 151 152 Args: 153 last (bool): True if the current slice is the last in the sequence. 154 """ 155 self._slice_semaphore.release() 156 if last: 157 self._all_slices_yielded = True 158 159 def set_initial_state(self, stream_state: StreamState) -> None: 160 """ 161 Set the initial state for the cursors. 162 163 This method initializes the state for the global cursor using the provided stream state. 164 165 Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router 166 does not have parent streams, this step will be skipped due to the default PartitionRouter implementation. 167 168 Args: 169 stream_state (StreamState): The state of the streams to be set. The format of the stream state should be: 170 { 171 "state": { 172 "last_updated": "2023-05-27T00:00:00Z" 173 }, 174 "parent_state": { 175 "parent_stream_name": { 176 "last_updated": "2023-05-27T00:00:00Z" 177 } 178 }, 179 "lookback_window": 132 180 } 181 """ 182 if not stream_state: 183 return 184 185 if "lookback_window" in stream_state: 186 self._lookback_window = stream_state["lookback_window"] 187 self._inject_lookback_into_stream_cursor(stream_state["lookback_window"]) 188 189 if "state" in stream_state: 190 self._stream_cursor.set_initial_state(stream_state["state"]) 191 elif "states" not in stream_state: 192 # We assume that `stream_state` is in the old global format 193 # Example: {"global_state_format_key": "global_state_format_value"} 194 self._stream_cursor.set_initial_state(stream_state) 195 196 # Set parent state for partition routers based on parent streams 197 self._partition_router.set_initial_state(stream_state) 198 199 def _inject_lookback_into_stream_cursor(self, lookback_window: int) -> None: 200 """ 201 Modifies the stream cursor's lookback window based on the duration of the previous sync. 202 This adjustment ensures the cursor is set to the minimal lookback window necessary for 203 avoiding missing data. 204 205 Parameters: 206 lookback_window (int): The lookback duration in seconds to be set, derived from 207 the previous sync. 208 209 Raises: 210 ValueError: If the cursor does not support dynamic lookback window adjustments. 
211 """ 212 if hasattr(self._stream_cursor, "set_runtime_lookback_window"): 213 self._stream_cursor.set_runtime_lookback_window(lookback_window) 214 else: 215 raise ValueError( 216 "The cursor class for Global Substream Cursor does not have a set_runtime_lookback_window method" 217 ) 218 219 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 220 self._stream_cursor.observe( 221 StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), record 222 ) 223 224 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 225 """ 226 Close the current stream slice. 227 228 This method is called when a stream slice is completed. For the global parent cursor, we close the child cursor 229 only after reading all slices. This ensures that we do not miss any child records from a later parent record 230 if the child cursor is earlier than a record from the first parent record. 231 232 Args: 233 stream_slice (StreamSlice): The stream slice to be closed. 234 *args (Any): Additional arguments. 235 """ 236 with self._lock: 237 self._slice_semaphore.acquire() 238 if self._all_slices_yielded and self._slice_semaphore._value == 0: 239 self._lookback_window = self._timer.finish() 240 self._stream_cursor.close_slice( 241 StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args 242 ) 243 244 def get_stream_state(self) -> StreamState: 245 state: dict[str, Any] = {"state": self._stream_cursor.get_stream_state()} 246 247 if self._parent_state: 248 state["parent_state"] = self._parent_state 249 250 if self._lookback_window is not None: 251 state["lookback_window"] = self._lookback_window 252 253 return state 254 255 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 256 # stream_slice is ignored as cursor is global 257 return self._stream_cursor.get_stream_state() 258 259 def get_request_params( 260 self, 261 *, 262 stream_state: Optional[StreamState] = None, 263 stream_slice: Optional[StreamSlice] = None, 264 next_page_token: Optional[Mapping[str, Any]] = None, 265 ) -> Mapping[str, Any]: 266 if stream_slice: 267 return self._partition_router.get_request_params( # type: ignore # this always returns a mapping 268 stream_state=stream_state, 269 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 270 next_page_token=next_page_token, 271 ) | self._stream_cursor.get_request_params( 272 stream_state=stream_state, 273 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 274 next_page_token=next_page_token, 275 ) 276 else: 277 raise ValueError("A partition needs to be provided in order to get request params") 278 279 def get_request_headers( 280 self, 281 *, 282 stream_state: Optional[StreamState] = None, 283 stream_slice: Optional[StreamSlice] = None, 284 next_page_token: Optional[Mapping[str, Any]] = None, 285 ) -> Mapping[str, Any]: 286 if stream_slice: 287 return self._partition_router.get_request_headers( # type: ignore # this always returns a mapping 288 stream_state=stream_state, 289 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 290 next_page_token=next_page_token, 291 ) | self._stream_cursor.get_request_headers( 292 stream_state=stream_state, 293 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 294 next_page_token=next_page_token, 295 ) 296 else: 297 raise ValueError("A partition needs to be provided in order to get request headers") 298 299 def get_request_body_data( 300 self, 301 *, 302 stream_state: 
Optional[StreamState] = None, 303 stream_slice: Optional[StreamSlice] = None, 304 next_page_token: Optional[Mapping[str, Any]] = None, 305 ) -> Union[Mapping[str, Any], str]: 306 if stream_slice: 307 return self._partition_router.get_request_body_data( # type: ignore # this always returns a mapping 308 stream_state=stream_state, 309 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 310 next_page_token=next_page_token, 311 ) | self._stream_cursor.get_request_body_data( 312 stream_state=stream_state, 313 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 314 next_page_token=next_page_token, 315 ) 316 else: 317 raise ValueError("A partition needs to be provided in order to get request body data") 318 319 def get_request_body_json( 320 self, 321 *, 322 stream_state: Optional[StreamState] = None, 323 stream_slice: Optional[StreamSlice] = None, 324 next_page_token: Optional[Mapping[str, Any]] = None, 325 ) -> Mapping[str, Any]: 326 if stream_slice: 327 return self._partition_router.get_request_body_json( # type: ignore # this always returns a mapping 328 stream_state=stream_state, 329 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 330 next_page_token=next_page_token, 331 ) | self._stream_cursor.get_request_body_json( 332 stream_state=stream_state, 333 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 334 next_page_token=next_page_token, 335 ) 336 else: 337 raise ValueError("A partition needs to be provided in order to get request body json") 338 339 def should_be_synced(self, record: Record) -> bool: 340 return self._stream_cursor.should_be_synced(self._convert_record_to_cursor_record(record)) 341 342 def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: 343 return self._stream_cursor.is_greater_than_or_equal( 344 self._convert_record_to_cursor_record(first), 345 self._convert_record_to_cursor_record(second), 346 ) 347 348 @staticmethod 349 def _convert_record_to_cursor_record(record: Record) -> Record: 350 return Record( 351 data=record.data, 352 stream_name=record.stream_name, 353 associated_slice=StreamSlice( 354 partition={}, cursor_slice=record.associated_slice.cursor_slice 355 ) 356 if record.associated_slice 357 else None, 358 )
The GlobalSubstreamCursor is designed to track the state of substreams using a single global cursor. This class is beneficial for streams with many partitions, as it allows the state to be managed globally instead of per partition, simplifying state management and reducing the size of state messages.

This cursor is activated by setting the `global_substream_cursor` parameter for incremental sync.

Warnings:

- This class enforces a minimal lookback window for the substream based on the duration of the previous sync to avoid losing records. This lookback ensures that any records added or updated during the sync are captured in subsequent syncs.
- The global cursor is updated only at the end of the sync. If the sync ends prematurely (e.g., due to an exception), the state will not be updated.
- When using the `incremental_dependency` option, the sync will progress through parent records, preventing the sync from getting infinitely stuck. However, it is crucial to understand the requirements for both the `global_substream_cursor` and `incremental_dependency` options to avoid data loss.
86 def __init__(self, stream_cursor: DatetimeBasedCursor, partition_router: PartitionRouter): 87 self._stream_cursor = stream_cursor 88 self._partition_router = partition_router 89 self._timer = Timer() 90 self._lock = threading.Lock() 91 self._slice_semaphore = threading.Semaphore( 92 0 93 ) # Start with 0, indicating no slices being tracked 94 self._all_slices_yielded = False 95 self._lookback_window: Optional[int] = None 96 self._current_partition: Optional[Mapping[str, Any]] = None 97 self._last_slice: bool = False 98 self._parent_state: Optional[Mapping[str, Any]] = None
103 def stream_slices(self) -> Iterable[StreamSlice]: 104 """ 105 Generates stream slices, ensuring the last slice is properly flagged and processed. 106 107 This method creates a sequence of stream slices by iterating over partitions and cursor slices. 108 It holds onto one slice in memory to set `_all_slices_yielded` to `True` before yielding the 109 final slice. A semaphore is used to track the processing of slices, ensuring that `close_slice` 110 is called only after all slices have been processed. 111 112 We expect the following events: 113 * Yields all the slices except the last one. At this point, `close_slice` won't actually close the global slice as `self._all_slices_yielded == False` 114 * Release the semaphore one last time before setting `self._all_slices_yielded = True`. This will cause `close_slice` to know about all the slices before we indicate that all slices have been yielded so the left side of `if self._all_slices_yielded and self._slice_semaphore._value == 0` will be false if not everything is closed 115 * Setting `self._all_slices_yielded = True`. We do that before actually yielding the last slice as the caller of `stream_slices` might stop iterating at any point and hence the code after `yield` might not be executed 116 * Yield the last slice. At that point, once there are as many slices yielded as closes, the global slice will be closed too 117 """ 118 slice_generator = ( 119 StreamSlice( 120 partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields 121 ) 122 for partition in self._partition_router.stream_slices() 123 for cursor_slice in self._stream_cursor.stream_slices() 124 ) 125 126 self.start_slices_generation() 127 for slice, last, state in iterate_with_last_flag_and_state( 128 slice_generator, self._partition_router.get_stream_state 129 ): 130 self._parent_state = state 131 self.register_slice(last) 132 yield slice 133 self._parent_state = self._partition_router.get_stream_state()
Generates stream slices, ensuring the last slice is properly flagged and processed.

This method creates a sequence of stream slices by iterating over partitions and cursor slices. It holds onto one slice in memory to set `_all_slices_yielded` to `True` before yielding the final slice (see the sketch after this list). A semaphore is used to track the processing of slices, ensuring that `close_slice` is called only after all slices have been processed.

We expect the following events:

- Yields all the slices except the last one. At this point, `close_slice` won't actually close the global slice as `self._all_slices_yielded == False`.
- Release the semaphore one last time before setting `self._all_slices_yielded = True`. This lets `close_slice` account for all the slices before we indicate that all slices have been yielded, so the left side of `if self._all_slices_yielded and self._slice_semaphore._value == 0` will be false if not everything is closed.
- Setting `self._all_slices_yielded = True`. We do that before actually yielding the last slice, as the caller of `stream_slices` might stop iterating at any point, and hence the code after `yield` might not be executed.
- Yield the last slice. At that point, once there are as many slices yielded as closed, the global slice will be closed too.
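The "hold one slice in memory" behavior is the classic look-ahead iteration pattern. A minimal sketch, independent of the CDK's `iterate_with_last_flag_and_state` helper:

```python
from typing import Iterable, Iterator, Tuple, TypeVar

T = TypeVar("T")

def iterate_with_last_flag(iterable: Iterable[T]) -> Iterator[Tuple[T, bool]]:
    # Hold one item back so the final element can be flagged before it is yielded.
    it = iter(iterable)
    try:
        previous = next(it)
    except StopIteration:
        return
    for current in it:
        yield previous, False
        previous = current
    yield previous, True

print(list(iterate_with_last_flag(["slice-1", "slice-2", "slice-3"])))
# [('slice-1', False), ('slice-2', False), ('slice-3', True)]
```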
135 def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]: 136 slice_generator = ( 137 StreamSlice( 138 partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields 139 ) 140 for cursor_slice in self._stream_cursor.stream_slices() 141 ) 142 143 yield from slice_generator
145 def register_slice(self, last: bool) -> None: 146 """ 147 Tracks the processing of a stream slice. 148 149 Releases the semaphore for each slice. If it's the last slice (`last=True`), 150 sets `_all_slices_yielded` to `True` to indicate no more slices will be processed. 151 152 Args: 153 last (bool): True if the current slice is the last in the sequence. 154 """ 155 self._slice_semaphore.release() 156 if last: 157 self._all_slices_yielded = True
Tracks the processing of a stream slice.

Releases the semaphore for each slice. If it's the last slice (`last=True`), sets `_all_slices_yielded` to `True` to indicate no more slices will be processed.

Arguments:

- last (bool): True if the current slice is the last in the sequence.
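The semaphore bookkeeping pairs one `release` per yielded slice with one `acquire` per closed slice. A toy sketch of the accounting:

```python
import threading

sem = threading.Semaphore(0)  # start at 0: no slices tracked yet

for _ in range(3):   # one release per yielded slice (register_slice)
    sem.release()

for _ in range(3):   # one acquire per closed slice (close_slice)
    sem.acquire()

# When the internal counter is back at zero, every yielded slice has been
# closed; this mirrors the `self._slice_semaphore._value == 0` check above.
print(sem._value == 0)  # True
```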
159 def set_initial_state(self, stream_state: StreamState) -> None: 160 """ 161 Set the initial state for the cursors. 162 163 This method initializes the state for the global cursor using the provided stream state. 164 165 Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router 166 does not have parent streams, this step will be skipped due to the default PartitionRouter implementation. 167 168 Args: 169 stream_state (StreamState): The state of the streams to be set. The format of the stream state should be: 170 { 171 "state": { 172 "last_updated": "2023-05-27T00:00:00Z" 173 }, 174 "parent_state": { 175 "parent_stream_name": { 176 "last_updated": "2023-05-27T00:00:00Z" 177 } 178 }, 179 "lookback_window": 132 180 } 181 """ 182 if not stream_state: 183 return 184 185 if "lookback_window" in stream_state: 186 self._lookback_window = stream_state["lookback_window"] 187 self._inject_lookback_into_stream_cursor(stream_state["lookback_window"]) 188 189 if "state" in stream_state: 190 self._stream_cursor.set_initial_state(stream_state["state"]) 191 elif "states" not in stream_state: 192 # We assume that `stream_state` is in the old global format 193 # Example: {"global_state_format_key": "global_state_format_value"} 194 self._stream_cursor.set_initial_state(stream_state) 195 196 # Set parent state for partition routers based on parent streams 197 self._partition_router.set_initial_state(stream_state)
Set the initial state for the cursors.
This method initializes the state for the global cursor using the provided stream state.
Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
Arguments:
- stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:

      {
          "state": {
              "last_updated": "2023-05-27T00:00:00Z"
          },
          "parent_state": {
              "parent_stream_name": {
                  "last_updated": "2023-05-27T00:00:00Z"
              }
          },
          "lookback_window": 132
      }
219 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 220 self._stream_cursor.observe( 221 StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), record 222 )
Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.
Parameters
- stream_slice: The current slice, which may or may not contain the most recently observed record
- record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
224 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 225 """ 226 Close the current stream slice. 227 228 This method is called when a stream slice is completed. For the global parent cursor, we close the child cursor 229 only after reading all slices. This ensures that we do not miss any child records from a later parent record 230 if the child cursor is earlier than a record from the first parent record. 231 232 Args: 233 stream_slice (StreamSlice): The stream slice to be closed. 234 *args (Any): Additional arguments. 235 """ 236 with self._lock: 237 self._slice_semaphore.acquire() 238 if self._all_slices_yielded and self._slice_semaphore._value == 0: 239 self._lookback_window = self._timer.finish() 240 self._stream_cursor.close_slice( 241 StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args 242 )
Close the current stream slice.
This method is called when a stream slice is completed. For the global parent cursor, we close the child cursor only after reading all slices. This ensures that we do not miss any child records from a later parent record if the child cursor is earlier than a record from the first parent record.
Arguments:
- stream_slice (StreamSlice): The stream slice to be closed.
- *args (Any): Additional arguments.
244 def get_stream_state(self) -> StreamState: 245 state: dict[str, Any] = {"state": self._stream_cursor.get_stream_state()} 246 247 if self._parent_state: 248 state["parent_state"] = self._parent_state 249 250 if self._lookback_window is not None: 251 state["lookback_window"] = self._lookback_window 252 253 return state
Returns the current stream state. We would like to restrict its usage since it exposes the internals of the state. As of 2023-06-14, it is used for the following:

- Interpolation of the requests
- Transformation of records
- Saving the state

For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.
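For illustration, the state assembled by this method might look like the following (values hypothetical):

```python
state = {
    "state": {"last_updated": "2023-05-27T00:00:00Z"},  # global stream cursor
    "parent_state": {                                   # per parent stream
        "parent_stream_name": {"last_updated": "2023-05-27T00:00:00Z"}
    },
    "lookback_window": 132,  # seconds, derived from the previous sync's duration
}
```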
255 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 256 # stream_slice is ignored as cursor is global 257 return self._stream_cursor.get_stream_state()
Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
259 def get_request_params( 260 self, 261 *, 262 stream_state: Optional[StreamState] = None, 263 stream_slice: Optional[StreamSlice] = None, 264 next_page_token: Optional[Mapping[str, Any]] = None, 265 ) -> Mapping[str, Any]: 266 if stream_slice: 267 return self._partition_router.get_request_params( # type: ignore # this always returns a mapping 268 stream_state=stream_state, 269 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 270 next_page_token=next_page_token, 271 ) | self._stream_cursor.get_request_params( 272 stream_state=stream_state, 273 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 274 next_page_token=next_page_token, 275 ) 276 else: 277 raise ValueError("A partition needs to be provided in order to get request params")
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g., you might want to define query parameters for paging if next_page_token is not None.
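Note that the partition router's options and the stream cursor's options are combined with the dict union operator, so on a key collision the stream cursor's value wins:

```python
router_params = {"account_id": "123", "source": "router"}
cursor_params = {"updated_after": "2023-05-27T00:00:00Z", "source": "cursor"}

# Equivalent to the `router | cursor` merge used above; the right-hand side wins.
print(router_params | cursor_params)
# {'account_id': '123', 'source': 'cursor', 'updated_after': '2023-05-27T00:00:00Z'}
```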
279 def get_request_headers( 280 self, 281 *, 282 stream_state: Optional[StreamState] = None, 283 stream_slice: Optional[StreamSlice] = None, 284 next_page_token: Optional[Mapping[str, Any]] = None, 285 ) -> Mapping[str, Any]: 286 if stream_slice: 287 return self._partition_router.get_request_headers( # type: ignore # this always returns a mapping 288 stream_state=stream_state, 289 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 290 next_page_token=next_page_token, 291 ) | self._stream_cursor.get_request_headers( 292 stream_state=stream_state, 293 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 294 next_page_token=next_page_token, 295 ) 296 else: 297 raise ValueError("A partition needs to be provided in order to get request headers")
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
299 def get_request_body_data( 300 self, 301 *, 302 stream_state: Optional[StreamState] = None, 303 stream_slice: Optional[StreamSlice] = None, 304 next_page_token: Optional[Mapping[str, Any]] = None, 305 ) -> Union[Mapping[str, Any], str]: 306 if stream_slice: 307 return self._partition_router.get_request_body_data( # type: ignore # this always returns a mapping 308 stream_state=stream_state, 309 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 310 next_page_token=next_page_token, 311 ) | self._stream_cursor.get_request_body_data( 312 stream_state=stream_state, 313 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 314 next_page_token=next_page_token, 315 ) 316 else: 317 raise ValueError("A partition needs to be provided in order to get request body data")
Specifies how to populate the body of the request with a non-JSON payload.
If it returns text, the text will be sent as-is. If it returns a dict, it will be converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".
Note that only one of 'request_body_data' and 'request_body_json' may be overridden.
319 def get_request_body_json( 320 self, 321 *, 322 stream_state: Optional[StreamState] = None, 323 stream_slice: Optional[StreamSlice] = None, 324 next_page_token: Optional[Mapping[str, Any]] = None, 325 ) -> Mapping[str, Any]: 326 if stream_slice: 327 return self._partition_router.get_request_body_json( # type: ignore # this always returns a mapping 328 stream_state=stream_state, 329 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 330 next_page_token=next_page_token, 331 ) | self._stream_cursor.get_request_body_json( 332 stream_state=stream_state, 333 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 334 next_page_token=next_page_token, 335 ) 336 else: 337 raise ValueError("A partition needs to be provided in order to get request body json")
Specifies how to populate the body of the request with a JSON payload.
Note that only one of 'request_body_data' and 'request_body_json' may be overridden.
339 def should_be_synced(self, record: Record) -> bool: 340 return self._stream_cursor.should_be_synced(self._convert_record_to_cursor_record(record))
Evaluating whether a record should be synced allows for filtering and for stop conditions on pagination.
342 def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: 343 return self._stream_cursor.is_greater_than_or_equal( 344 self._convert_record_to_cursor_record(first), 345 self._convert_record_to_cursor_record(second), 346 )
Evaluates which record is greater in terms of its cursor value. This is used to avoid having to capture all the records in order to close a slice.
28class PerPartitionCursor(DeclarativeCursor): 29 """ 30 Manages state per partition when a stream has many partitions, to prevent data loss or duplication. 31 32 **Partition Limitation and Limit Reached Logic** 33 34 - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000). 35 - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition. 36 - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded. 37 38 The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage. 39 40 - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly. 41 - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors. 42 43 This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed. 44 """ 45 46 DEFAULT_MAX_PARTITIONS_NUMBER = 10000 47 _NO_STATE: Mapping[str, Any] = {} 48 _NO_CURSOR_STATE: Mapping[str, Any] = {} 49 _KEY = 0 50 _VALUE = 1 51 _state_to_migrate_from: Mapping[str, Any] = {} 52 53 def __init__(self, cursor_factory: CursorFactory, partition_router: PartitionRouter): 54 self._cursor_factory = cursor_factory 55 self._partition_router = partition_router 56 # The dict is ordered to ensure that once the maximum number of partitions is reached, 57 # the oldest partitions can be efficiently removed, maintaining the most recent partitions. 58 self._cursor_per_partition: OrderedDict[str, DeclarativeCursor] = OrderedDict() 59 self._over_limit = 0 60 self._partition_serializer = PerPartitionKeySerializer() 61 62 def stream_slices(self) -> Iterable[StreamSlice]: 63 slices = self._partition_router.stream_slices() 64 for partition in slices: 65 yield from self.generate_slices_from_partition(partition) 66 67 def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]: 68 # Ensure the maximum number of partitions is not exceeded 69 self._ensure_partition_limit() 70 71 cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition)) 72 if not cursor: 73 partition_state = ( 74 self._state_to_migrate_from 75 if self._state_to_migrate_from 76 else self._NO_CURSOR_STATE 77 ) 78 cursor = self._create_cursor(partition_state) 79 self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor 80 81 for cursor_slice in cursor.stream_slices(): 82 yield StreamSlice( 83 partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields 84 ) 85 86 def _ensure_partition_limit(self) -> None: 87 """ 88 Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped. 89 """ 90 while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1: 91 self._over_limit += 1 92 oldest_partition = self._cursor_per_partition.popitem(last=False)[ 93 0 94 ] # Remove the oldest partition 95 logger.warning( 96 f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}." 
97 ) 98 99 def limit_reached(self) -> bool: 100 return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER 101 102 def set_initial_state(self, stream_state: StreamState) -> None: 103 """ 104 Set the initial state for the cursors. 105 106 This method initializes the state for each partition cursor using the provided stream state. 107 If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state. 108 109 Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router 110 does not have parent streams, this step will be skipped due to the default PartitionRouter implementation. 111 112 Args: 113 stream_state (StreamState): The state of the streams to be set. The format of the stream state should be: 114 { 115 "states": [ 116 { 117 "partition": { 118 "partition_key": "value" 119 }, 120 "cursor": { 121 "last_updated": "2023-05-27T00:00:00Z" 122 } 123 } 124 ], 125 "parent_state": { 126 "parent_stream_name": { 127 "last_updated": "2023-05-27T00:00:00Z" 128 } 129 } 130 } 131 """ 132 if not stream_state: 133 return 134 135 if "states" not in stream_state: 136 # We assume that `stream_state` is in a global format that can be applied to all partitions. 137 # Example: {"global_state_format_key": "global_state_format_value"} 138 self._state_to_migrate_from = stream_state 139 140 else: 141 for state in stream_state["states"]: 142 self._cursor_per_partition[self._to_partition_key(state["partition"])] = ( 143 self._create_cursor(state["cursor"]) 144 ) 145 146 # set default state for missing partitions if it is per partition with fallback to global 147 if "state" in stream_state: 148 self._state_to_migrate_from = stream_state["state"] 149 150 # Set parent state for partition routers based on parent streams 151 self._partition_router.set_initial_state(stream_state) 152 153 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 154 self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe( 155 StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), record 156 ) 157 158 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 159 try: 160 self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].close_slice( 161 StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args 162 ) 163 except KeyError as exception: 164 raise ValueError( 165 f"Partition {str(exception)} could not be found in current state based on the record. 
This is unexpected because " 166 f"we should only update state for partitions that were emitted during `stream_slices`" 167 ) 168 169 def get_stream_state(self) -> StreamState: 170 states = [] 171 for partition_tuple, cursor in self._cursor_per_partition.items(): 172 cursor_state = cursor.get_stream_state() 173 if cursor_state: 174 states.append( 175 { 176 "partition": self._to_dict(partition_tuple), 177 "cursor": cursor_state, 178 } 179 ) 180 state: dict[str, Any] = {"states": states} 181 182 parent_state = self._partition_router.get_stream_state() 183 if parent_state: 184 state["parent_state"] = parent_state 185 return state 186 187 def _get_state_for_partition(self, partition: Mapping[str, Any]) -> Optional[StreamState]: 188 cursor = self._cursor_per_partition.get(self._to_partition_key(partition)) 189 if cursor: 190 return cursor.get_stream_state() 191 192 return None 193 194 @staticmethod 195 def _is_new_state(stream_state: Mapping[str, Any]) -> bool: 196 return not bool(stream_state) 197 198 def _to_partition_key(self, partition: Mapping[str, Any]) -> str: 199 return self._partition_serializer.to_partition_key(partition) 200 201 def _to_dict(self, partition_key: str) -> Mapping[str, Any]: 202 return self._partition_serializer.to_partition(partition_key) 203 204 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 205 if not stream_slice: 206 raise ValueError("A partition needs to be provided in order to extract a state") 207 208 if not stream_slice: 209 return None 210 211 return self._get_state_for_partition(stream_slice.partition) 212 213 def _create_cursor(self, cursor_state: Any) -> DeclarativeCursor: 214 cursor = self._cursor_factory.create() 215 cursor.set_initial_state(cursor_state) 216 return cursor 217 218 def get_request_params( 219 self, 220 *, 221 stream_state: Optional[StreamState] = None, 222 stream_slice: Optional[StreamSlice] = None, 223 next_page_token: Optional[Mapping[str, Any]] = None, 224 ) -> Mapping[str, Any]: 225 if stream_slice: 226 if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition: 227 self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition)) 228 return self._partition_router.get_request_params( # type: ignore # this always returns a mapping 229 stream_state=stream_state, 230 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 231 next_page_token=next_page_token, 232 ) | self._cursor_per_partition[ 233 self._to_partition_key(stream_slice.partition) 234 ].get_request_params( 235 stream_state=stream_state, 236 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 237 next_page_token=next_page_token, 238 ) 239 else: 240 raise ValueError("A partition needs to be provided in order to get request params") 241 242 def get_request_headers( 243 self, 244 *, 245 stream_state: Optional[StreamState] = None, 246 stream_slice: Optional[StreamSlice] = None, 247 next_page_token: Optional[Mapping[str, Any]] = None, 248 ) -> Mapping[str, Any]: 249 if stream_slice: 250 if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition: 251 self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition)) 252 return self._partition_router.get_request_headers( # type: ignore # this always returns a mapping 253 stream_state=stream_state, 254 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 255 next_page_token=next_page_token, 256 ) | self._cursor_per_partition[ 257 
self._to_partition_key(stream_slice.partition) 258 ].get_request_headers( 259 stream_state=stream_state, 260 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 261 next_page_token=next_page_token, 262 ) 263 else: 264 raise ValueError("A partition needs to be provided in order to get request headers") 265 266 def get_request_body_data( 267 self, 268 *, 269 stream_state: Optional[StreamState] = None, 270 stream_slice: Optional[StreamSlice] = None, 271 next_page_token: Optional[Mapping[str, Any]] = None, 272 ) -> Union[Mapping[str, Any], str]: 273 if stream_slice: 274 if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition: 275 self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition)) 276 return self._partition_router.get_request_body_data( # type: ignore # this always returns a mapping 277 stream_state=stream_state, 278 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 279 next_page_token=next_page_token, 280 ) | self._cursor_per_partition[ 281 self._to_partition_key(stream_slice.partition) 282 ].get_request_body_data( 283 stream_state=stream_state, 284 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 285 next_page_token=next_page_token, 286 ) 287 else: 288 raise ValueError("A partition needs to be provided in order to get request body data") 289 290 def get_request_body_json( 291 self, 292 *, 293 stream_state: Optional[StreamState] = None, 294 stream_slice: Optional[StreamSlice] = None, 295 next_page_token: Optional[Mapping[str, Any]] = None, 296 ) -> Mapping[str, Any]: 297 if stream_slice: 298 if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition: 299 self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition)) 300 return self._partition_router.get_request_body_json( # type: ignore # this always returns a mapping 301 stream_state=stream_state, 302 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 303 next_page_token=next_page_token, 304 ) | self._cursor_per_partition[ 305 self._to_partition_key(stream_slice.partition) 306 ].get_request_body_json( 307 stream_state=stream_state, 308 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 309 next_page_token=next_page_token, 310 ) 311 else: 312 raise ValueError("A partition needs to be provided in order to get request body json") 313 314 def should_be_synced(self, record: Record) -> bool: 315 return self._get_cursor(record).should_be_synced( 316 self._convert_record_to_cursor_record(record) 317 ) 318 319 def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: 320 if not first.associated_slice or not second.associated_slice: 321 raise ValueError( 322 f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}" 323 ) 324 if first.associated_slice.partition != second.associated_slice.partition: 325 raise ValueError( 326 f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}" 327 ) 328 329 return self._get_cursor(first).is_greater_than_or_equal( 330 self._convert_record_to_cursor_record(first), 331 self._convert_record_to_cursor_record(second), 332 ) 333 334 @staticmethod 335 def _convert_record_to_cursor_record(record: Record) -> Record: 336 return Record( 337 data=record.data, 338 stream_name=record.stream_name, 339 associated_slice=StreamSlice( 340 partition={}, 
cursor_slice=record.associated_slice.cursor_slice 341 ) 342 if record.associated_slice 343 else None, 344 ) 345 346 def _get_cursor(self, record: Record) -> DeclarativeCursor: 347 if not record.associated_slice: 348 raise ValueError( 349 "Invalid state as stream slices that are emitted should refer to an existing cursor" 350 ) 351 partition_key = self._to_partition_key(record.associated_slice.partition) 352 if partition_key not in self._cursor_per_partition: 353 self._create_cursor_for_partition(partition_key) 354 cursor = self._cursor_per_partition[partition_key] 355 return cursor 356 357 def _create_cursor_for_partition(self, partition_key: str) -> None: 358 """ 359 Dynamically creates and initializes a cursor for the specified partition. 360 361 This method is required for `ConcurrentPerPartitionCursor`. For concurrent cursors, 362 stream_slices is executed only for the concurrent cursor, so cursors per partition 363 are not created for the declarative cursor. This method ensures that a cursor is available 364 to create requests for the specified partition. The cursor is initialized 365 with the per-partition state if present in the initial state, or with the global state 366 adjusted by the lookback window, or with the state to migrate from. 367 368 Note: 369 This is a temporary workaround and should be removed once the declarative cursor 370 is decoupled from the concurrent cursor implementation. 371 372 Args: 373 partition_key (str): The unique identifier for the partition for which the cursor 374 needs to be created. 375 """ 376 partition_state = ( 377 self._state_to_migrate_from if self._state_to_migrate_from else self._NO_CURSOR_STATE 378 ) 379 cursor = self._create_cursor(partition_state) 380 381 self._cursor_per_partition[partition_key] = cursor
Manages state per partition when a stream has many partitions, to prevent data loss or duplication.
Partition Limitation and Limit Reached Logic
- DEFAULT_MAX_PARTITIONS_NUMBER: The maximum number of partitions to keep in memory (default is 10,000).
- _cursor_per_partition: An ordered dictionary that stores cursors for each partition.
- _over_limit: A counter that increments each time an oldest partition is removed when the limit is exceeded.
The class ensures that the number of partitions tracked does not exceed `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.

- When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
- The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.
This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
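A compact sketch of the eviction and limit-tracking logic described above, with a tiny limit for demonstration:

```python
from collections import OrderedDict

MAX_PARTITIONS = 3  # stand-in for DEFAULT_MAX_PARTITIONS_NUMBER
cursors: "OrderedDict[str, str]" = OrderedDict()
over_limit = 0

def ensure_partition_limit() -> None:
    global over_limit
    # Drop oldest partitions until there is room for one more entry.
    while len(cursors) > MAX_PARTITIONS - 1:
        over_limit += 1
        cursors.popitem(last=False)  # FIFO: evicts the oldest partition

for key in ["p1", "p2", "p3", "p4"]:
    ensure_partition_limit()
    cursors[key] = f"cursor-for-{key}"

print(list(cursors))                # ['p2', 'p3', 'p4'] -- p1 was evicted
print(over_limit > MAX_PARTITIONS)  # limit_reached() analogue: False (no sustained spike)
```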
53 def __init__(self, cursor_factory: CursorFactory, partition_router: PartitionRouter): 54 self._cursor_factory = cursor_factory 55 self._partition_router = partition_router 56 # The dict is ordered to ensure that once the maximum number of partitions is reached, 57 # the oldest partitions can be efficiently removed, maintaining the most recent partitions. 58 self._cursor_per_partition: OrderedDict[str, DeclarativeCursor] = OrderedDict() 59 self._over_limit = 0 60 self._partition_serializer = PerPartitionKeySerializer()
62 def stream_slices(self) -> Iterable[StreamSlice]: 63 slices = self._partition_router.stream_slices() 64 for partition in slices: 65 yield from self.generate_slices_from_partition(partition)
Defines stream slices
Returns
An iterable of stream slices
67 def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]: 68 # Ensure the maximum number of partitions is not exceeded 69 self._ensure_partition_limit() 70 71 cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition)) 72 if not cursor: 73 partition_state = ( 74 self._state_to_migrate_from 75 if self._state_to_migrate_from 76 else self._NO_CURSOR_STATE 77 ) 78 cursor = self._create_cursor(partition_state) 79 self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor 80 81 for cursor_slice in cursor.stream_slices(): 82 yield StreamSlice( 83 partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields 84 )
102 def set_initial_state(self, stream_state: StreamState) -> None: 103 """ 104 Set the initial state for the cursors. 105 106 This method initializes the state for each partition cursor using the provided stream state. 107 If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state. 108 109 Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router 110 does not have parent streams, this step will be skipped due to the default PartitionRouter implementation. 111 112 Args: 113 stream_state (StreamState): The state of the streams to be set. The format of the stream state should be: 114 { 115 "states": [ 116 { 117 "partition": { 118 "partition_key": "value" 119 }, 120 "cursor": { 121 "last_updated": "2023-05-27T00:00:00Z" 122 } 123 } 124 ], 125 "parent_state": { 126 "parent_stream_name": { 127 "last_updated": "2023-05-27T00:00:00Z" 128 } 129 } 130 } 131 """ 132 if not stream_state: 133 return 134 135 if "states" not in stream_state: 136 # We assume that `stream_state` is in a global format that can be applied to all partitions. 137 # Example: {"global_state_format_key": "global_state_format_value"} 138 self._state_to_migrate_from = stream_state 139 140 else: 141 for state in stream_state["states"]: 142 self._cursor_per_partition[self._to_partition_key(state["partition"])] = ( 143 self._create_cursor(state["cursor"]) 144 ) 145 146 # set default state for missing partitions if it is per partition with fallback to global 147 if "state" in stream_state: 148 self._state_to_migrate_from = stream_state["state"] 149 150 # Set parent state for partition routers based on parent streams 151 self._partition_router.set_initial_state(stream_state)
Set the initial state for the cursors.
This method initializes the state for each partition cursor using the provided stream state. If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.
Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
Arguments:
- stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:

      {
          "states": [
              {
                  "partition": {
                      "partition_key": "value"
                  },
                  "cursor": {
                      "last_updated": "2023-05-27T00:00:00Z"
                  }
              }
          ],
          "parent_state": {
              "parent_stream_name": {
                  "last_updated": "2023-05-27T00:00:00Z"
              }
          }
      }
153 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 154 self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe( 155 StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), record 156 )
Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.
Parameters
- stream_slice: The current slice, which may or may not contain the most recently observed record
- record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
158 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 159 try: 160 self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].close_slice( 161 StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args 162 ) 163 except KeyError as exception: 164 raise ValueError( 165 f"Partition {str(exception)} could not be found in current state based on the record. This is unexpected because " 166 f"we should only update state for partitions that were emitted during `stream_slices`" 167 )
Update state based on the stream slice. Note that `stream_slice.cursor_slice` and `most_recent_record.associated_slice` are expected to be the same, but we make it explicit here that `stream_slice` should be leveraged to update the state. We do not pass in the latest record, since cursor instances should maintain the relevant internal state on their own.

Parameters

- stream_slice: slice to close
169 def get_stream_state(self) -> StreamState: 170 states = [] 171 for partition_tuple, cursor in self._cursor_per_partition.items(): 172 cursor_state = cursor.get_stream_state() 173 if cursor_state: 174 states.append( 175 { 176 "partition": self._to_dict(partition_tuple), 177 "cursor": cursor_state, 178 } 179 ) 180 state: dict[str, Any] = {"states": states} 181 182 parent_state = self._partition_router.get_stream_state() 183 if parent_state: 184 state["parent_state"] = parent_state 185 return state
Returns the current stream state. We would like to restrict its usage since it exposes the internals of the state. As of 2023-06-14, it is used for the following:

- Interpolation of the requests
- Transformation of records
- Saving the state

For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.
204 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 205 if not stream_slice: 206 raise ValueError("A partition needs to be provided in order to extract a state") 207 208 if not stream_slice: 209 return None 210 211 return self._get_state_for_partition(stream_slice.partition)
Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
218 def get_request_params( 219 self, 220 *, 221 stream_state: Optional[StreamState] = None, 222 stream_slice: Optional[StreamSlice] = None, 223 next_page_token: Optional[Mapping[str, Any]] = None, 224 ) -> Mapping[str, Any]: 225 if stream_slice: 226 if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition: 227 self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition)) 228 return self._partition_router.get_request_params( # type: ignore # this always returns a mapping 229 stream_state=stream_state, 230 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 231 next_page_token=next_page_token, 232 ) | self._cursor_per_partition[ 233 self._to_partition_key(stream_slice.partition) 234 ].get_request_params( 235 stream_state=stream_state, 236 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 237 next_page_token=next_page_token, 238 ) 239 else: 240 raise ValueError("A partition needs to be provided in order to get request params")
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g., you might want to define query parameters for paging if next_page_token is not None.
242 def get_request_headers( 243 self, 244 *, 245 stream_state: Optional[StreamState] = None, 246 stream_slice: Optional[StreamSlice] = None, 247 next_page_token: Optional[Mapping[str, Any]] = None, 248 ) -> Mapping[str, Any]: 249 if stream_slice: 250 if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition: 251 self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition)) 252 return self._partition_router.get_request_headers( # type: ignore # this always returns a mapping 253 stream_state=stream_state, 254 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 255 next_page_token=next_page_token, 256 ) | self._cursor_per_partition[ 257 self._to_partition_key(stream_slice.partition) 258 ].get_request_headers( 259 stream_state=stream_state, 260 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 261 next_page_token=next_page_token, 262 ) 263 else: 264 raise ValueError("A partition needs to be provided in order to get request headers")
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
266 def get_request_body_data( 267 self, 268 *, 269 stream_state: Optional[StreamState] = None, 270 stream_slice: Optional[StreamSlice] = None, 271 next_page_token: Optional[Mapping[str, Any]] = None, 272 ) -> Union[Mapping[str, Any], str]: 273 if stream_slice: 274 if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition: 275 self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition)) 276 return self._partition_router.get_request_body_data( # type: ignore # this always returns a mapping 277 stream_state=stream_state, 278 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 279 next_page_token=next_page_token, 280 ) | self._cursor_per_partition[ 281 self._to_partition_key(stream_slice.partition) 282 ].get_request_body_data( 283 stream_state=stream_state, 284 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 285 next_page_token=next_page_token, 286 ) 287 else: 288 raise ValueError("A partition needs to be provided in order to get request body data")
Specifies how to populate the body of the request with a non-JSON payload.
If this method returns a string, it is sent as-is. If it returns a dict, it is converted to a URL-encoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".
Note that only one of the 'request_body_data' and 'request_body_json' methods can be overridden.
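The dict-to-form conversion described above can be reproduced with the standard library; this is only an illustration of the serialization, not CDK code:

```python
from urllib.parse import urlencode

# Reproduces the docstring's example: a dict body becomes a urlencoded form.
body = {"key1": "value1", "key2": "value2"}
assert urlencode(body) == "key1=value1&key2=value2"
```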
290 def get_request_body_json( 291 self, 292 *, 293 stream_state: Optional[StreamState] = None, 294 stream_slice: Optional[StreamSlice] = None, 295 next_page_token: Optional[Mapping[str, Any]] = None, 296 ) -> Mapping[str, Any]: 297 if stream_slice: 298 if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition: 299 self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition)) 300 return self._partition_router.get_request_body_json( # type: ignore # this always returns a mapping 301 stream_state=stream_state, 302 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}), 303 next_page_token=next_page_token, 304 ) | self._cursor_per_partition[ 305 self._to_partition_key(stream_slice.partition) 306 ].get_request_body_json( 307 stream_state=stream_state, 308 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), 309 next_page_token=next_page_token, 310 ) 311 else: 312 raise ValueError("A partition needs to be provided in order to get request body json")
Specifies how to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' methods can be overridden.
314 def should_be_synced(self, record: Record) -> bool: 315 return self._get_cursor(record).should_be_synced( 316 self._convert_record_to_cursor_record(record) 317 )
Evaluates whether a record should be synced; this enables record filtering and acts as a stop condition for pagination.
319 def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: 320 if not first.associated_slice or not second.associated_slice: 321 raise ValueError( 322 f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}" 323 ) 324 if first.associated_slice.partition != second.associated_slice.partition: 325 raise ValueError( 326 f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}" 327 ) 328 329 return self._get_cursor(first).is_greater_than_or_equal( 330 self._convert_record_to_cursor_record(first), 331 self._convert_record_to_cursor_record(second), 332 )
Evaluates which record is greater in terms of its cursor value. This is used to avoid having to hold on to every record in order to close a slice.
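A minimal sketch of such a comparison, assuming a date-based cursor field named `updated_at` (illustrative, not the CDK implementation):

```python
from datetime import date

# Compare two records by their cursor value; the field name is hypothetical.
def is_greater_than_or_equal(first: dict, second: dict, cursor_field: str = "updated_at") -> bool:
    return date.fromisoformat(first[cursor_field]) >= date.fromisoformat(second[cursor_field])

assert is_greater_than_or_equal({"updated_at": "2021-02-14"}, {"updated_at": "2021-01-15"})
```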
21class PerPartitionWithGlobalCursor(DeclarativeCursor): 22 """ 23 Manages state for streams with multiple partitions, with an optional fallback to a global cursor when specific conditions are met. 24 25 This cursor handles partitioned streams by maintaining individual state per partition using `PerPartitionCursor`. If the number of partitions exceeds a defined limit, it switches to a global cursor (`GlobalSubstreamCursor`) to manage state more efficiently. 26 27 **Overview** 28 29 - **Partition-Based State**: Initially manages state per partition to ensure accurate processing of each partition's data. 30 - **Global Fallback**: Switches to a global cursor when the partition limit is exceeded to handle state management more effectively. 31 32 **Switching Logic** 33 34 - Monitors the number of partitions. 35 - If `PerPartitionCursor.limit_reached()` returns `True`, sets `_use_global_cursor` to `True`, activating the global cursor. 36 37 **Active Cursor Selection** 38 39 - Uses the `_get_active_cursor()` helper method to select the active cursor based on the `_use_global_cursor` flag. 40 - This simplifies the logic and ensures consistent cursor usage across methods. 41 42 **State Structure Example** 43 44 ```json 45 { 46 "states": [ 47 { 48 "partition": {"partition_key": "partition_1"}, 49 "cursor": {"cursor_field": "2021-01-15"} 50 }, 51 { 52 "partition": {"partition_key": "partition_2"}, 53 "cursor": {"cursor_field": "2021-02-14"} 54 } 55 ], 56 "state": { 57 "cursor_field": "2021-02-15" 58 }, 59 "use_global_cursor": false 60 } 61 ``` 62 63 In this example, the cursor is using partition-based state management (`"use_global_cursor": false`), maintaining separate cursor states for each partition. 64 65 **Usage Scenario** 66 67 Suitable for streams where the number of partitions may vary significantly, requiring dynamic switching between per-partition and global state management to ensure data consistency and efficient synchronization. 
68 """ 69 70 def __init__( 71 self, 72 cursor_factory: CursorFactory, 73 partition_router: PartitionRouter, 74 stream_cursor: DatetimeBasedCursor, 75 ): 76 self._partition_router = partition_router 77 self._per_partition_cursor = PerPartitionCursor(cursor_factory, partition_router) 78 self._global_cursor = GlobalSubstreamCursor(stream_cursor, partition_router) 79 self._use_global_cursor = False 80 self._current_partition: Optional[Mapping[str, Any]] = None 81 self._last_slice: bool = False 82 self._parent_state: Optional[Mapping[str, Any]] = None 83 84 def _get_active_cursor(self) -> Union[PerPartitionCursor, GlobalSubstreamCursor]: 85 return self._global_cursor if self._use_global_cursor else self._per_partition_cursor 86 87 def stream_slices(self) -> Iterable[StreamSlice]: 88 self._global_cursor.start_slices_generation() 89 90 # Iterate through partitions and process slices 91 for partition, is_last_partition, parent_state in iterate_with_last_flag_and_state( 92 self._partition_router.stream_slices(), self._partition_router.get_stream_state 93 ): 94 # Generate slices for the current cursor and handle the last slice using the flag 95 self._parent_state = parent_state 96 for slice, is_last_slice, _ in iterate_with_last_flag_and_state( 97 self._get_active_cursor().generate_slices_from_partition(partition=partition), 98 lambda: None, 99 ): 100 self._global_cursor.register_slice(is_last_slice and is_last_partition) 101 yield slice 102 self._parent_state = self._partition_router.get_stream_state() 103 104 def set_initial_state(self, stream_state: StreamState) -> None: 105 """ 106 Set the initial state for the cursors. 107 """ 108 self._use_global_cursor = stream_state.get("use_global_cursor", False) 109 110 self._parent_state = stream_state.get("parent_state", {}) 111 112 self._global_cursor.set_initial_state(stream_state) 113 if not self._use_global_cursor: 114 self._per_partition_cursor.set_initial_state(stream_state) 115 116 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 117 if not self._use_global_cursor and self._per_partition_cursor.limit_reached(): 118 self._use_global_cursor = True 119 120 if not self._use_global_cursor: 121 self._per_partition_cursor.observe(stream_slice, record) 122 self._global_cursor.observe(stream_slice, record) 123 124 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 125 if not self._use_global_cursor: 126 self._per_partition_cursor.close_slice(stream_slice, *args) 127 self._global_cursor.close_slice(stream_slice, *args) 128 129 def get_stream_state(self) -> StreamState: 130 final_state: MutableMapping[str, Any] = {"use_global_cursor": self._use_global_cursor} 131 132 final_state.update(self._global_cursor.get_stream_state()) 133 if not self._use_global_cursor: 134 final_state.update(self._per_partition_cursor.get_stream_state()) 135 136 final_state["parent_state"] = self._parent_state 137 if not final_state.get("parent_state"): 138 del final_state["parent_state"] 139 140 return final_state 141 142 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 143 return self._get_active_cursor().select_state(stream_slice) 144 145 def get_request_params( 146 self, 147 *, 148 stream_state: Optional[StreamState] = None, 149 stream_slice: Optional[StreamSlice] = None, 150 next_page_token: Optional[Mapping[str, Any]] = None, 151 ) -> Mapping[str, Any]: 152 return self._get_active_cursor().get_request_params( 153 stream_state=stream_state, 154 stream_slice=stream_slice, 155 
next_page_token=next_page_token, 156 ) 157 158 def get_request_headers( 159 self, 160 *, 161 stream_state: Optional[StreamState] = None, 162 stream_slice: Optional[StreamSlice] = None, 163 next_page_token: Optional[Mapping[str, Any]] = None, 164 ) -> Mapping[str, Any]: 165 return self._get_active_cursor().get_request_headers( 166 stream_state=stream_state, 167 stream_slice=stream_slice, 168 next_page_token=next_page_token, 169 ) 170 171 def get_request_body_data( 172 self, 173 *, 174 stream_state: Optional[StreamState] = None, 175 stream_slice: Optional[StreamSlice] = None, 176 next_page_token: Optional[Mapping[str, Any]] = None, 177 ) -> Union[Mapping[str, Any], str]: 178 return self._get_active_cursor().get_request_body_data( 179 stream_state=stream_state, 180 stream_slice=stream_slice, 181 next_page_token=next_page_token, 182 ) 183 184 def get_request_body_json( 185 self, 186 *, 187 stream_state: Optional[StreamState] = None, 188 stream_slice: Optional[StreamSlice] = None, 189 next_page_token: Optional[Mapping[str, Any]] = None, 190 ) -> Mapping[str, Any]: 191 return self._get_active_cursor().get_request_body_json( 192 stream_state=stream_state, 193 stream_slice=stream_slice, 194 next_page_token=next_page_token, 195 ) 196 197 def should_be_synced(self, record: Record) -> bool: 198 return self._get_active_cursor().should_be_synced(record) 199 200 def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: 201 return self._global_cursor.is_greater_than_or_equal(first, second)
Manages state for streams with multiple partitions, with an optional fallback to a global cursor when specific conditions are met.
This cursor handles partitioned streams by maintaining individual state per partition using `PerPartitionCursor`. If the number of partitions exceeds a defined limit, it switches to a global cursor (`GlobalSubstreamCursor`) to manage state more efficiently.
Overview
- Partition-Based State: Initially manages state per partition to ensure accurate processing of each partition's data.
- Global Fallback: Switches to a global cursor when the partition limit is exceeded to handle state management more effectively.
Switching Logic
- Monitors the number of partitions.
- If `PerPartitionCursor.limit_reached()` returns `True`, sets `_use_global_cursor` to `True`, activating the global cursor.
Active Cursor Selection
- Uses the `_get_active_cursor()` helper method to select the active cursor based on the `_use_global_cursor` flag.
- This simplifies the logic and ensures consistent cursor usage across methods.
State Structure Example
{
"states": [
{
"partition": {"partition_key": "partition_1"},
"cursor": {"cursor_field": "2021-01-15"}
},
{
"partition": {"partition_key": "partition_2"},
"cursor": {"cursor_field": "2021-02-14"}
}
],
"state": {
"cursor_field": "2021-02-15"
},
"use_global_cursor": false
}
In this example, the cursor is using partition-based state management (`"use_global_cursor": false`), maintaining separate cursor states for each partition.
Usage Scenario
Suitable for streams where the number of partitions may vary significantly, requiring dynamic switching between per-partition and global state management to ensure data consistency and efficient synchronization.
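A small hypothetical reader for the state structure shown above, picking the per-partition states unless the global fallback flag is set (plain dicts, no CDK types):

```python
# Uses the state document from the example above.
stream_state = {
    "states": [
        {"partition": {"partition_key": "partition_1"}, "cursor": {"cursor_field": "2021-01-15"}},
        {"partition": {"partition_key": "partition_2"}, "cursor": {"cursor_field": "2021-02-14"}},
    ],
    "state": {"cursor_field": "2021-02-15"},
    "use_global_cursor": False,
}

if stream_state["use_global_cursor"]:
    active = stream_state["state"]    # one global cursor value for every partition
else:
    active = stream_state["states"]   # one cursor value per tracked partition

assert len(active) == 2
```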
70 def __init__( 71 self, 72 cursor_factory: CursorFactory, 73 partition_router: PartitionRouter, 74 stream_cursor: DatetimeBasedCursor, 75 ): 76 self._partition_router = partition_router 77 self._per_partition_cursor = PerPartitionCursor(cursor_factory, partition_router) 78 self._global_cursor = GlobalSubstreamCursor(stream_cursor, partition_router) 79 self._use_global_cursor = False 80 self._current_partition: Optional[Mapping[str, Any]] = None 81 self._last_slice: bool = False 82 self._parent_state: Optional[Mapping[str, Any]] = None
87 def stream_slices(self) -> Iterable[StreamSlice]: 88 self._global_cursor.start_slices_generation() 89 90 # Iterate through partitions and process slices 91 for partition, is_last_partition, parent_state in iterate_with_last_flag_and_state( 92 self._partition_router.stream_slices(), self._partition_router.get_stream_state 93 ): 94 # Generate slices for the current cursor and handle the last slice using the flag 95 self._parent_state = parent_state 96 for slice, is_last_slice, _ in iterate_with_last_flag_and_state( 97 self._get_active_cursor().generate_slices_from_partition(partition=partition), 98 lambda: None, 99 ): 100 self._global_cursor.register_slice(is_last_slice and is_last_partition) 101 yield slice 102 self._parent_state = self._partition_router.get_stream_state()
Defines stream slices
Returns
An iterable of stream slices
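The slice generation above leans on a last-flag iterator. A minimal standalone sketch of that pattern, assuming only that the helper yields `(item, is_last, state)` triples as the loop in the source suggests (this is not the CDK's implementation):

```python
from typing import Any, Callable, Iterable, Iterator, Tuple

def iterate_with_last_flag(
    items: Iterable[Any], get_state: Callable[[], Any]
) -> Iterator[Tuple[Any, bool, Any]]:
    # Buffer one element so we can tell which item is the final one.
    iterator = iter(items)
    try:
        current = next(iterator)
    except StopIteration:
        return
    for upcoming in iterator:
        yield current, False, get_state()
        current = upcoming
    yield current, True, get_state()

assert list(iterate_with_last_flag(["a", "b"], lambda: None)) == [
    ("a", False, None),
    ("b", True, None),
]
```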
104 def set_initial_state(self, stream_state: StreamState) -> None: 105 """ 106 Set the initial state for the cursors. 107 """ 108 self._use_global_cursor = stream_state.get("use_global_cursor", False) 109 110 self._parent_state = stream_state.get("parent_state", {}) 111 112 self._global_cursor.set_initial_state(stream_state) 113 if not self._use_global_cursor: 114 self._per_partition_cursor.set_initial_state(stream_state)
Set the initial state for the cursors.
116 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 117 if not self._use_global_cursor and self._per_partition_cursor.limit_reached(): 118 self._use_global_cursor = True 119 120 if not self._use_global_cursor: 121 self._per_partition_cursor.observe(stream_slice, record) 122 self._global_cursor.observe(stream_slice, record)
Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.
Parameters
- stream_slice: The current slice, which may or may not contain the most recently observed record
- record: the most recently read record, which the cursor can use to update the stream state. Outwardly-visible changes to the stream state may need to be deferred depending on whether the source reliably orders records by the cursor field (see the sketch below).
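A minimal sketch of that deferral, with an illustrative ordering flag and cursor field (not the CDK API):

```python
class ObservingCursor:
    """Illustrative only: advance visible state per record when the source is
    ordered by the cursor field; otherwise defer until the slice closes."""

    def __init__(self, source_is_ordered: bool) -> None:
        self._source_is_ordered = source_is_ordered
        self._most_recent: str = ""
        self._state: dict = {}

    def observe(self, record: dict) -> None:
        self._most_recent = max(self._most_recent, record["updated_at"])
        if self._source_is_ordered:
            # Nothing earlier can still arrive, so exposing state now is safe.
            self._state = {"updated_at": self._most_recent}

    def close_slice(self) -> None:
        if self._most_recent:
            self._state = {"updated_at": self._most_recent}

cursor = ObservingCursor(source_is_ordered=False)
cursor.observe({"updated_at": "2021-02-14"})
assert cursor._state == {}  # deferred: records may arrive out of order
cursor.close_slice()
assert cursor._state == {"updated_at": "2021-02-14"}
```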
124 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 125 if not self._use_global_cursor: 126 self._per_partition_cursor.close_slice(stream_slice, *args) 127 self._global_cursor.close_slice(stream_slice, *args)
Update state based on the stream slice. Note that `stream_slice.cursor_slice` and `most_recent_record.associated_slice` are expected to be the same, but we make it explicit here that `stream_slice` should be leveraged to update the state. We do not pass in the latest record, since cursor instances should maintain the relevant internal state on their own.
Parameters
- stream_slice: slice to close
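The contract above, sketched with plain dicts standing in for `StreamSlice` (illustrative, not CDK code): the state after closing comes from the slice itself, never from the last record.

```python
class SketchCursor:
    def __init__(self) -> None:
        self._state: dict = {}

    def close_slice(self, stream_slice: dict) -> None:
        # Per the contract above: the slice's cursor portion becomes the state.
        self._state = stream_slice["cursor_slice"]

cursor = SketchCursor()
cursor.close_slice({"partition": {}, "cursor_slice": {"cursor_field": "2021-02-15"}})
assert cursor._state == {"cursor_field": "2021-02-15"}
```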
129 def get_stream_state(self) -> StreamState: 130 final_state: MutableMapping[str, Any] = {"use_global_cursor": self._use_global_cursor} 131 132 final_state.update(self._global_cursor.get_stream_state()) 133 if not self._use_global_cursor: 134 final_state.update(self._per_partition_cursor.get_stream_state()) 135 136 final_state["parent_state"] = self._parent_state 137 if not final_state.get("parent_state"): 138 del final_state["parent_state"] 139 140 return final_state
Returns the current stream state. We would like to restrict its usage since it exposes the internals of the state. As of 2023-06-14, it is used for three things:
- Interpolation of the requests
- Transformation of records
- Saving the state
For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.
142 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 143 return self._get_active_cursor().select_state(stream_slice)
Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
145 def get_request_params( 146 self, 147 *, 148 stream_state: Optional[StreamState] = None, 149 stream_slice: Optional[StreamSlice] = None, 150 next_page_token: Optional[Mapping[str, Any]] = None, 151 ) -> Mapping[str, Any]: 152 return self._get_active_cursor().get_request_params( 153 stream_state=stream_state, 154 stream_slice=stream_slice, 155 next_page_token=next_page_token, 156 )
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g., you might want to define query parameters for paging if next_page_token is not None.
158 def get_request_headers( 159 self, 160 *, 161 stream_state: Optional[StreamState] = None, 162 stream_slice: Optional[StreamSlice] = None, 163 next_page_token: Optional[Mapping[str, Any]] = None, 164 ) -> Mapping[str, Any]: 165 return self._get_active_cursor().get_request_headers( 166 stream_state=stream_state, 167 stream_slice=stream_slice, 168 next_page_token=next_page_token, 169 )
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
171 def get_request_body_data( 172 self, 173 *, 174 stream_state: Optional[StreamState] = None, 175 stream_slice: Optional[StreamSlice] = None, 176 next_page_token: Optional[Mapping[str, Any]] = None, 177 ) -> Union[Mapping[str, Any], str]: 178 return self._get_active_cursor().get_request_body_data( 179 stream_state=stream_state, 180 stream_slice=stream_slice, 181 next_page_token=next_page_token, 182 )
Specifies how to populate the body of the request with a non-JSON payload.
If this method returns a string, it is sent as-is. If it returns a dict, it is converted to a URL-encoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".
Note that only one of the 'request_body_data' and 'request_body_json' methods can be overridden.
184 def get_request_body_json( 185 self, 186 *, 187 stream_state: Optional[StreamState] = None, 188 stream_slice: Optional[StreamSlice] = None, 189 next_page_token: Optional[Mapping[str, Any]] = None, 190 ) -> Mapping[str, Any]: 191 return self._get_active_cursor().get_request_body_json( 192 stream_state=stream_state, 193 stream_slice=stream_slice, 194 next_page_token=next_page_token, 195 )
Specifies how to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' methods can be overridden.
197 def should_be_synced(self, record: Record) -> bool: 198 return self._get_active_cursor().should_be_synced(record)
Evaluates whether a record should be synced; this enables record filtering and acts as a stop condition for pagination.
12@dataclass 13class ResumableFullRefreshCursor(DeclarativeCursor): 14 parameters: InitVar[Mapping[str, Any]] 15 16 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 17 self._cursor: StreamState = {} 18 19 def get_stream_state(self) -> StreamState: 20 return self._cursor 21 22 def set_initial_state(self, stream_state: StreamState) -> None: 23 self._cursor = stream_state 24 25 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 26 """ 27 Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records. 28 """ 29 pass 30 31 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 32 # The ResumableFullRefreshCursor doesn't support nested streams yet so receiving a partition is unexpected 33 if stream_slice.partition: 34 raise ValueError( 35 f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}." 36 ) 37 self._cursor = stream_slice.cursor_slice 38 39 def should_be_synced(self, record: Record) -> bool: 40 """ 41 Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages 42 that don't have filterable bounds. We should always return them. 43 """ 44 return True 45 46 def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: 47 """ 48 RFR record don't have ordering to be compared between one another. 49 """ 50 return False 51 52 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 53 # A top-level RFR cursor only manages the state of a single partition 54 return self._cursor 55 56 def stream_slices(self) -> Iterable[StreamSlice]: 57 """ 58 Resumable full refresh cursors only return a single slice and can't perform partitioning because iteration is done per-page 59 along an unbounded set. 60 """ 61 yield from [StreamSlice(cursor_slice=self._cursor, partition={})] 62 63 # This is an interesting pattern that might not seem obvious at first glance. This cursor itself has no functional need to 64 # inject any request values into the outbound response because the up-to-date pagination state is already loaded and 65 # maintained by the paginator component 66 def get_request_params( 67 self, 68 *, 69 stream_state: Optional[StreamState] = None, 70 stream_slice: Optional[StreamSlice] = None, 71 next_page_token: Optional[Mapping[str, Any]] = None, 72 ) -> Mapping[str, Any]: 73 return {} 74 75 def get_request_headers( 76 self, 77 *, 78 stream_state: Optional[StreamState] = None, 79 stream_slice: Optional[StreamSlice] = None, 80 next_page_token: Optional[Mapping[str, Any]] = None, 81 ) -> Mapping[str, Any]: 82 return {} 83 84 def get_request_body_data( 85 self, 86 *, 87 stream_state: Optional[StreamState] = None, 88 stream_slice: Optional[StreamSlice] = None, 89 next_page_token: Optional[Mapping[str, Any]] = None, 90 ) -> Mapping[str, Any]: 91 return {} 92 93 def get_request_body_json( 94 self, 95 *, 96 stream_state: Optional[StreamState] = None, 97 stream_slice: Optional[StreamSlice] = None, 98 next_page_token: Optional[Mapping[str, Any]] = None, 99 ) -> Mapping[str, Any]: 100 return {}
Returns the current stream state. We would like to restrict its usage since it exposes the internals of the state. As of 2023-06-14, it is used for three things:
- Interpolation of the requests
- Transformation of records
- Saving the state
For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.
Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called before anything else.
Parameters
- stream_state: The state of the stream as returned by get_stream_state
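Hypothetical usage against the class shown above; the `next_page_token` key is illustrative, since RFR state is whatever page state the paginator persisted on a previous attempt:

```python
from airbyte_cdk.sources.declarative.incremental import ResumableFullRefreshCursor

# Inject persisted state before using the cursor, per the contract above.
cursor = ResumableFullRefreshCursor(parameters={})
cursor.set_initial_state({"next_page_token": 3})
assert cursor.get_stream_state() == {"next_page_token": 3}
```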
25 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 26 """ 27 Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records. 28 """ 29 pass
Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records.
31 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 32 # The ResumableFullRefreshCursor doesn't support nested streams yet so receiving a partition is unexpected 33 if stream_slice.partition: 34 raise ValueError( 35 f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}." 36 ) 37 self._cursor = stream_slice.cursor_slice
Update state based on the stream slice. Note that stream_slice.cursor_slice
and most_recent_record.associated_slice
are expected
to be the same but we make it explicit here that stream_slice
should be leveraged to update the state. We do not pass in the
latest record, since cursor instances should maintain the relevant internal state on their own.
Parameters
- stream_slice: slice to close
39 def should_be_synced(self, record: Record) -> bool: 40 """ 41 Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages 42 that don't have filterable bounds. We should always return them. 43 """ 44 return True
Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages that don't have filterable bounds. We should always return them.
46 def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: 47 """ 48 RFR record don't have ordering to be compared between one another. 49 """ 50 return False
RFR records don't have an ordering that allows them to be compared to one another.
52 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 53 # A top-level RFR cursor only manages the state of a single partition 54 return self._cursor
Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
56 def stream_slices(self) -> Iterable[StreamSlice]: 57 """ 58 Resumable full refresh cursors only return a single slice and can't perform partitioning because iteration is done per-page 59 along an unbounded set. 60 """ 61 yield from [StreamSlice(cursor_slice=self._cursor, partition={})]
Resumable full refresh cursors only return a single slice and can't perform partitioning because iteration is done per-page along an unbounded set.
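Continuing the hypothetical usage sketched earlier: with no prior state, the single yielded slice carries an empty cursor and an empty partition.

```python
from airbyte_cdk.sources.declarative.incremental import ResumableFullRefreshCursor

cursor = ResumableFullRefreshCursor(parameters={})
slices = list(cursor.stream_slices())

# Exactly one slice, carrying the (currently empty) page state.
assert len(slices) == 1
assert slices[0].partition == {} and slices[0].cursor_slice == {}
```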
66 def get_request_params( 67 self, 68 *, 69 stream_state: Optional[StreamState] = None, 70 stream_slice: Optional[StreamSlice] = None, 71 next_page_token: Optional[Mapping[str, Any]] = None, 72 ) -> Mapping[str, Any]: 73 return {}
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g., you might want to define query parameters for paging if next_page_token is not None.
75 def get_request_headers( 76 self, 77 *, 78 stream_state: Optional[StreamState] = None, 79 stream_slice: Optional[StreamSlice] = None, 80 next_page_token: Optional[Mapping[str, Any]] = None, 81 ) -> Mapping[str, Any]: 82 return {}
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
84 def get_request_body_data( 85 self, 86 *, 87 stream_state: Optional[StreamState] = None, 88 stream_slice: Optional[StreamSlice] = None, 89 next_page_token: Optional[Mapping[str, Any]] = None, 90 ) -> Mapping[str, Any]: 91 return {}
Specifies how to populate the body of the request with a non-JSON payload.
If this method returns a string, it is sent as-is. If it returns a dict, it is converted to a URL-encoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".
Note that only one of the 'request_body_data' and 'request_body_json' methods can be overridden.
93 def get_request_body_json( 94 self, 95 *, 96 stream_state: Optional[StreamState] = None, 97 stream_slice: Optional[StreamSlice] = None, 98 next_page_token: Optional[Mapping[str, Any]] = None, 99 ) -> Mapping[str, Any]: 100 return {}
Specifies how to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' methods can be overridden.
103@dataclass 104class ChildPartitionResumableFullRefreshCursor(ResumableFullRefreshCursor): 105 """ 106 The Sub-stream Resumable Cursor for Full-Refresh substreams. 107 Follows the parent type `ResumableFullRefreshCursor` with a small override, 108 to provide the ability to close the substream's slice once it has finished processing. 109 110 Check the `close_slice` method overide for more info about the actual behaviour of this cursor. 111 """ 112 113 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 114 """ 115 Once the current slice has finished syncing: 116 - paginator returns None 117 - no more slices to process 118 119 we assume that the records are processed and emitted already, 120 thus we have to set the cursor to ` __ab_full_refresh_sync_complete: true `, 121 otherwise there is a risk of Inf. Loop processing the same slice. 122 """ 123 self._cursor = FULL_REFRESH_COMPLETE_STATE
The Sub-stream Resumable Cursor for Full-Refresh substreams. Follows the parent type `ResumableFullRefreshCursor` with a small override to provide the ability to close the substream's slice once it has finished processing.
Check the `close_slice` method override for more info about the actual behaviour of this cursor.
113 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 114 """ 115 Once the current slice has finished syncing: 116 - paginator returns None 117 - no more slices to process 118 119 we assume that the records are processed and emitted already, 120 thus we have to set the cursor to ` __ab_full_refresh_sync_complete: true `, 121 otherwise there is a risk of Inf. Loop processing the same slice. 122 """ 123 self._cursor = FULL_REFRESH_COMPLETE_STATE
Once the current slice has finished syncing (the paginator returns None and there are no more slices to process), we assume that the records have already been processed and emitted. We therefore have to set the cursor to `__ab_full_refresh_sync_complete: true`; otherwise there is a risk of an infinite loop re-processing the same slice.
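For illustration, the sentinel described above presumably looks like the mapping below; the real constant lives in the CDK source.

```python
# Hedged sketch of the terminal state: after close_slice on the last slice,
# get_stream_state() returns this sentinel, so the finished slice is not
# re-run on the next attempt.
FULL_REFRESH_COMPLETE_STATE = {"__ab_full_refresh_sync_complete": True}
```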