airbyte_cdk.legacy.sources.declarative.incremental
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.

from airbyte_cdk.legacy.sources.declarative.incremental.datetime_based_cursor import (
    DatetimeBasedCursor,
)
from airbyte_cdk.legacy.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
from airbyte_cdk.legacy.sources.declarative.incremental.global_substream_cursor import (
    GlobalSubstreamCursor,
)
from airbyte_cdk.legacy.sources.declarative.incremental.per_partition_cursor import (
    CursorFactory,
    PerPartitionCursor,
)
from airbyte_cdk.legacy.sources.declarative.incremental.per_partition_with_global import (
    PerPartitionWithGlobalCursor,
)
from airbyte_cdk.legacy.sources.declarative.incremental.resumable_full_refresh_cursor import (
    ChildPartitionResumableFullRefreshCursor,
    ResumableFullRefreshCursor,
)

__all__ = [
    "CursorFactory",
    "DatetimeBasedCursor",
    "DeclarativeCursor",
    "GlobalSubstreamCursor",
    "PerPartitionCursor",
    "PerPartitionWithGlobalCursor",
    "ResumableFullRefreshCursor",
    "ChildPartitionResumableFullRefreshCursor",
]
class CursorFactory:
    def __init__(self, create_function: Callable[[], DeclarativeCursor]):
        self._create_function = create_function

    def create(self) -> DeclarativeCursor:
        return self._create_function()
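A minimal usage sketch; the builder function is hypothetical. A factory like this lets a per-partition cursor create an independent cursor instance for each partition:

def build_cursor() -> DeclarativeCursor:
    # hypothetical helper returning a fully configured DatetimeBasedCursor
    ...

factory = CursorFactory(build_cursor)
cursor_a = factory.create()  # a fresh cursor instance
cursor_b = factory.create()  # another, independent instance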
@dataclass
class DatetimeBasedCursor(DeclarativeCursor):
    """
    Slices the stream over a datetime range and creates a state with format {<cursor_field>: <datetime>}

    Given a start time, end time, a step function, and an optional lookback window,
    the stream slicer will partition the date range from start time - lookback window to end time.

    The step is defined as an ISO 8601 duration string.

    The timestamp format accepts the same format codes as datetime.strftime, which are
    all the format codes required by the 1989 C standard.
    Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html

    Attributes:
        start_datetime (Union[MinMaxDatetime, str]): the datetime that determines the earliest record that should be synced
        end_datetime (Optional[Union[MinMaxDatetime, str]]): the datetime that determines the last record that should be synced
        cursor_field (Union[InterpolatedString, str]): record's cursor field
        datetime_format (str): format of the datetime
        step (Optional[str]): size of the time window (ISO 8601 duration)
        cursor_granularity (Optional[str]): smallest increment the datetime_format has (ISO 8601 duration) that will be used to ensure that the start of a slice does not overlap with the end of the previous one
        config (Config): connection config
        start_time_option (Optional[RequestOption]): request option for start time
        end_time_option (Optional[RequestOption]): request option for end time
        partition_field_start (Optional[str]): partition start time field
        partition_field_end (Optional[str]): stream slice end time field
        lookback_window (Optional[InterpolatedString]): how far before start_datetime to read data for (ISO 8601 duration)
    """

    start_datetime: Union[MinMaxDatetime, str]
    cursor_field: Union[InterpolatedString, str]
    datetime_format: str
    config: Config
    parameters: InitVar[Mapping[str, Any]]
    _highest_observed_cursor_field_value: Optional[str] = field(
        repr=False, default=None
    )  # tracks the latest observed datetime, which may not be safe to emit in the case of out-of-order records
    _cursor: Optional[str] = field(
        repr=False, default=None
    )  # tracks the latest observed datetime that is appropriate to emit as stream state
    end_datetime: Optional[Union[MinMaxDatetime, str]] = None
    step: Optional[Union[InterpolatedString, str]] = None
    cursor_granularity: Optional[str] = None
    start_time_option: Optional[RequestOption] = None
    end_time_option: Optional[RequestOption] = None
    partition_field_start: Optional[str] = None
    partition_field_end: Optional[str] = None
    lookback_window: Optional[Union[InterpolatedString, str]] = None
    message_repository: Optional[MessageRepository] = None
    is_compare_strictly: Optional[bool] = False
    cursor_datetime_formats: List[str] = field(default_factory=lambda: [])

    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
        if (self.step and not self.cursor_granularity) or (
            not self.step and self.cursor_granularity
        ):
            raise ValueError(
                f"If step is defined, cursor_granularity should be as well and vice-versa. "
                f"Right now, step is `{self.step}` and cursor_granularity is `{self.cursor_granularity}`"
            )
        self._start_datetime = MinMaxDatetime.create(self.start_datetime, parameters)
        self._end_datetime = (
            None if not self.end_datetime else MinMaxDatetime.create(self.end_datetime, parameters)
        )

        self._timezone = datetime.timezone.utc
        self._interpolation = JinjaInterpolation()

        self._step = (
            self._parse_timedelta(
                InterpolatedString.create(self.step, parameters=parameters).eval(self.config)
            )
            if self.step
            else datetime.timedelta.max
        )
        self._cursor_granularity = self._parse_timedelta(self.cursor_granularity)
        self.cursor_field = InterpolatedString.create(self.cursor_field, parameters=parameters)
        self._lookback_window = (
            InterpolatedString.create(self.lookback_window, parameters=parameters)
            if self.lookback_window
            else None
        )
        self._partition_field_start = InterpolatedString.create(
            self.partition_field_start or "start_time", parameters=parameters
        )
        self._partition_field_end = InterpolatedString.create(
            self.partition_field_end or "end_time", parameters=parameters
        )
        self._parser = DatetimeParser()

        # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
        if not self._start_datetime.datetime_format:
            self._start_datetime.datetime_format = self.datetime_format
        if self._end_datetime and not self._end_datetime.datetime_format:
            self._end_datetime.datetime_format = self.datetime_format

        if not self.cursor_datetime_formats:
            self.cursor_datetime_formats = [self.datetime_format]

        _validate_component_request_option_paths(
            self.config, self.start_time_option, self.end_time_option
        )

    def get_stream_state(self) -> StreamState:
        return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {}  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__

    def set_initial_state(self, stream_state: StreamState) -> None:
        """
        Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called
        before calling anything else

        :param stream_state: The state of the stream as returned by get_stream_state
        """
        self._cursor = (
            stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None  # type: ignore [union-attr]
        )

    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
        """
        Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.

        :param stream_slice: The current slice, which may or may not contain the most recently observed record
        :param record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the
          stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
        """
        record_cursor_value = record.get(self.cursor_field.eval(self.config))  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
        # if the current record has no cursor value, we cannot meaningfully update the state based on it, so there is nothing more to do
        if not record_cursor_value:
            return

        start_field = self._partition_field_start.eval(self.config)
        end_field = self._partition_field_end.eval(self.config)
        is_highest_observed_cursor_value = (
            not self._highest_observed_cursor_field_value
            or self.parse_date(record_cursor_value)
            > self.parse_date(self._highest_observed_cursor_field_value)
        )
        if (
            self._is_within_daterange_boundaries(
                record,
                stream_slice.get(start_field),  # type: ignore [arg-type]
                stream_slice.get(end_field),  # type: ignore [arg-type]
            )
            and is_highest_observed_cursor_value
        ):
            self._highest_observed_cursor_field_value = record_cursor_value

    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
        if stream_slice.partition:
            raise ValueError(
                f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}."
            )
        cursor_value_str_by_cursor_value_datetime = dict(
            map(
                # we need to ensure the cursor value is preserved as is in the state else the CATs might complain of something like
                # 2023-01-04T17:30:19.000Z' <= '2023-01-04T17:30:19.000000Z'
                lambda datetime_str: (self.parse_date(datetime_str), datetime_str),  # type: ignore # because of the filter on the next line, this will only be called with a str
                filter(
                    lambda item: item, [self._cursor, self._highest_observed_cursor_field_value]
                ),
            )
        )
        self._cursor = (
            cursor_value_str_by_cursor_value_datetime[
                max(cursor_value_str_by_cursor_value_datetime.keys())
            ]
            if cursor_value_str_by_cursor_value_datetime
            else None
        )

    def stream_slices(self) -> Iterable[StreamSlice]:
        """
        Partition the daterange into slices of size = step.

        The start of the window is the maximum datetime between start_datetime and the stream_state's datetime - lookback_window.
        The end of the window is the minimum datetime between the current datetime and end_datetime.

        :return: the list of stream slices, one per step-sized window
        """
        end_datetime = self.select_best_end_datetime()
        start_datetime = self._calculate_earliest_possible_value(self.select_best_end_datetime())
        return self._partition_daterange(start_datetime, end_datetime, self._step)

    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
        # Datetime based cursors operate over slices made up of datetime ranges. Stream state is based on the progress
        # through each slice and does not belong to a specific slice. We just return stream state as it is.
        return self.get_stream_state()

    def _calculate_earliest_possible_value(
        self, end_datetime: datetime.datetime
    ) -> datetime.datetime:
        lookback_delta = self._parse_timedelta(
            self._lookback_window.eval(self.config) if self._lookback_window else "P0D"
        )
        earliest_possible_start_datetime = min(
            self._start_datetime.get_datetime(self.config), end_datetime
        )
        try:
            cursor_datetime = (
                self._calculate_cursor_datetime_from_state(self.get_stream_state()) - lookback_delta
            )
        except OverflowError:
            # cursor_datetime defers to the minimum date if it does not exist in the state. Trying to subtract
            # a timedelta from the minimum datetime results in an OverflowError
            cursor_datetime = self._calculate_cursor_datetime_from_state(self.get_stream_state())
        return max(earliest_possible_start_datetime, cursor_datetime)

    def select_best_end_datetime(self) -> datetime.datetime:
        """
        Returns the optimal end datetime.
        This method compares the current datetime with a pre-configured end datetime
        and returns the earlier of the two. If no pre-configured end datetime is set,
        the current datetime is returned.

        :return datetime.datetime: The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier.
        """
        now = datetime.datetime.now(tz=self._timezone)
        if not self._end_datetime:
            return now
        return min(self._end_datetime.get_datetime(self.config), now)

    def _calculate_cursor_datetime_from_state(
        self, stream_state: Mapping[str, Any]
    ) -> datetime.datetime:
        if self.cursor_field.eval(self.config, stream_state=stream_state) in stream_state:  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
            return self.parse_date(stream_state[self.cursor_field.eval(self.config)])  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
        return datetime.datetime.min.replace(tzinfo=datetime.timezone.utc)

    def _format_datetime(self, dt: datetime.datetime) -> str:
        return self._parser.format(dt, self.datetime_format)

    def _partition_daterange(
        self,
        start: datetime.datetime,
        end: datetime.datetime,
        step: Union[datetime.timedelta, Duration],
    ) -> List[StreamSlice]:
        start_field = self._partition_field_start.eval(self.config)
        end_field = self._partition_field_end.eval(self.config)
        dates = []

        while self._is_within_date_range(start, end):
            next_start = self._evaluate_next_start_date_safely(start, step)
            end_date = self._get_date(next_start - self._cursor_granularity, end, min)
            dates.append(
                StreamSlice(
                    partition={},
                    cursor_slice={
                        start_field: self._format_datetime(start),
                        end_field: self._format_datetime(end_date),
                    },
                )
            )
            start = next_start
        return dates

    def _is_within_date_range(self, start: datetime.datetime, end: datetime.datetime) -> bool:
        if self.is_compare_strictly:
            return start < end
        return start <= end

    def _evaluate_next_start_date_safely(
        self, start: datetime.datetime, step: datetime.timedelta
    ) -> datetime.datetime:
        """
        Given that we set the default step at datetime.timedelta.max, we will generate an OverflowError when evaluating the next start_date
        This method assumes that users would never enter a step that would generate an overflow. Given that would be the case, the code
        would have broken anyway.
        """
        try:
            return start + step
        except OverflowError:
            return datetime.datetime.max.replace(tzinfo=datetime.timezone.utc)

    def _get_date(
        self,
        cursor_value: datetime.datetime,
        default_date: datetime.datetime,
        comparator: Callable[[datetime.datetime, datetime.datetime], datetime.datetime],
    ) -> datetime.datetime:
        cursor_date = cursor_value or default_date
        return comparator(cursor_date, default_date)

    def parse_date(self, date: str) -> datetime.datetime:
        for datetime_format in self.cursor_datetime_formats + [self.datetime_format]:
            try:
                return self._parser.parse(date, datetime_format)
            except ValueError:
                pass
        raise ValueError(f"No format in {self.cursor_datetime_formats} matching {date}")

    @classmethod
    def _parse_timedelta(cls, time_str: Optional[str]) -> Union[datetime.timedelta, Duration]:
        """
        :return: the parsed ISO 8601 duration as a datetime.timedelta or Duration object.
        """
        if not time_str:
            return datetime.timedelta(0)
        return parse_duration(time_str)

    def get_request_params(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return self._get_request_options(RequestOptionType.request_parameter, stream_slice)

    def get_request_headers(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return self._get_request_options(RequestOptionType.header, stream_slice)

    def get_request_body_data(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return self._get_request_options(RequestOptionType.body_data, stream_slice)

    def get_request_body_json(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return self._get_request_options(RequestOptionType.body_json, stream_slice)

    def request_kwargs(self) -> Mapping[str, Any]:
        # Never update kwargs
        return {}

    def _get_request_options(
        self, option_type: RequestOptionType, stream_slice: Optional[StreamSlice]
    ) -> Mapping[str, Any]:
        options: MutableMapping[str, Any] = {}
        if not stream_slice:
            return options

        if self.start_time_option and self.start_time_option.inject_into == option_type:
            start_time_value = stream_slice.get(self._partition_field_start.eval(self.config))
            self.start_time_option.inject_into_request(options, start_time_value, self.config)

        if self.end_time_option and self.end_time_option.inject_into == option_type:
            end_time_value = stream_slice.get(self._partition_field_end.eval(self.config))
            self.end_time_option.inject_into_request(options, end_time_value, self.config)

        return options

    def should_be_synced(self, record: Record) -> bool:
        cursor_field = self.cursor_field.eval(self.config)  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
        record_cursor_value = record.get(cursor_field)
        if not record_cursor_value:
            self._send_log(
                Level.WARN,
                f"Could not find cursor field `{cursor_field}` in record. The incremental sync will assume it needs to be synced",
            )
            return True
        latest_possible_cursor_value = self.select_best_end_datetime()
        earliest_possible_cursor_value = self._calculate_earliest_possible_value(
            latest_possible_cursor_value
        )
        return self._is_within_daterange_boundaries(
            record, earliest_possible_cursor_value, latest_possible_cursor_value
        )

    def _is_within_daterange_boundaries(
        self,
        record: Record,
        start_datetime_boundary: Union[datetime.datetime, str],
        end_datetime_boundary: Union[datetime.datetime, str],
    ) -> bool:
        cursor_field = self.cursor_field.eval(self.config)  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
        record_cursor_value = record.get(cursor_field)
        if not record_cursor_value:
            self._send_log(
                Level.WARN,
                f"Could not find cursor field `{cursor_field}` in record. The record will not be considered when emitting sync state",
            )
            return False
        if isinstance(start_datetime_boundary, str):
            start_datetime_boundary = self.parse_date(start_datetime_boundary)
        if isinstance(end_datetime_boundary, str):
            end_datetime_boundary = self.parse_date(end_datetime_boundary)
        return (
            start_datetime_boundary <= self.parse_date(record_cursor_value) <= end_datetime_boundary
        )

    def _send_log(self, level: Level, message: str) -> None:
        if self.message_repository:
            self.message_repository.emit_message(
                AirbyteMessage(
                    type=Type.LOG,
                    log=AirbyteLogMessage(level=level, message=message),
                )
            )

    def set_runtime_lookback_window(self, lookback_window_in_seconds: int) -> None:
        """
        Updates the lookback window based on a given number of seconds if the new duration
        is greater than the currently configured lookback window.

        :param lookback_window_in_seconds: The lookback duration in seconds to potentially update to.
        """
        runtime_lookback_window = duration_isoformat(timedelta(seconds=lookback_window_in_seconds))
        config_lookback = parse_duration(
            self._lookback_window.eval(self.config) if self._lookback_window else "P0D"
        )

        # Check if the new runtime lookback window is greater than the current config lookback
        if parse_duration(runtime_lookback_window) > config_lookback:
            self._lookback_window = InterpolatedString.create(
                runtime_lookback_window, parameters={}
            )
Slices the stream over a datetime range and creates a state with format {<cursor_field>: <datetime>}.
Given a start time, end time, a step function, and an optional lookback window, the stream slicer will partition the date range from start time - lookback window to end time.
The step is defined as an ISO 8601 duration string.
The timestamp format accepts the same format codes as datetime.strftime, which are all the format codes required by the 1989 C standard. Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html
Attributes:
- start_datetime (Union[MinMaxDatetime, str]): the datetime that determines the earliest record that should be synced
- end_datetime (Optional[Union[MinMaxDatetime, str]]): the datetime that determines the last record that should be synced
- cursor_field (Union[InterpolatedString, str]): record's cursor field
- datetime_format (str): format of the datetime
- step (Optional[str]): size of the time window (ISO 8601 duration)
- cursor_granularity (Optional[str]): smallest increment the datetime_format has (ISO 8601 duration) that will be used to ensure that the start of a slice does not overlap with the end of the previous one
- config (Config): connection config
- start_time_option (Optional[RequestOption]): request option for start time
- end_time_option (Optional[RequestOption]): request option for end time
- partition_field_start (Optional[str]): partition start time field
- partition_field_end (Optional[str]): stream slice end time field
- lookback_window (Optional[InterpolatedString]): how far before start_datetime to read data for (ISO 8601 duration)
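As an illustration, a hedged sketch of constructing the cursor directly; in practice the declarative framework builds this component from a connector manifest, and the values below are hypothetical:

cursor = DatetimeBasedCursor(
    start_datetime="2024-01-01T00:00:00Z",
    cursor_field="updated_at",
    datetime_format="%Y-%m-%dT%H:%M:%SZ",
    config={},      # connection config; empty for illustration
    parameters={},
    step="P1D",                 # one-day windows; must be set together with
    cursor_granularity="PT1S",  # cursor_granularity, or __post_init__ raises
    lookback_window="P3D",
)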
def get_stream_state(self) -> StreamState:
    return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {}  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
Returns the current stream state. We would like to restrict its usage since it exposes internals of the state. As of 2023-06-14, it is used for three things:
- Interpolation of the requests
- Transformation of records
- Saving the state
For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.
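Assuming the cursor field evaluates to updated_at, the returned mapping looks like this sketch (values illustrative):

cursor.get_stream_state()
# => {"updated_at": "2024-01-31T23:59:59Z"}  once a cursor value has been committed
# => {}                                      before any slice has been closed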
def set_initial_state(self, stream_state: StreamState) -> None:
    """
    Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called
    before calling anything else

    :param stream_state: The state of the stream as returned by get_stream_state
    """
    self._cursor = (
        stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None  # type: ignore [union-attr]
    )
Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called before calling anything else
Parameters
- stream_state: The state of the stream as returned by get_stream_state
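For example, when resuming a sync, the state emitted by a previous run is fed back in before slicing begins (the value is illustrative):

cursor.set_initial_state({"updated_at": "2024-01-15T00:00:00Z"})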
def observe(self, stream_slice: StreamSlice, record: Record) -> None:
    """
    Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.

    :param stream_slice: The current slice, which may or may not contain the most recently observed record
    :param record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the
      stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
    """
    record_cursor_value = record.get(self.cursor_field.eval(self.config))  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
    # if the current record has no cursor value, we cannot meaningfully update the state based on it, so there is nothing more to do
    if not record_cursor_value:
        return

    start_field = self._partition_field_start.eval(self.config)
    end_field = self._partition_field_end.eval(self.config)
    is_highest_observed_cursor_value = (
        not self._highest_observed_cursor_field_value
        or self.parse_date(record_cursor_value)
        > self.parse_date(self._highest_observed_cursor_field_value)
    )
    if (
        self._is_within_daterange_boundaries(
            record,
            stream_slice.get(start_field),  # type: ignore [arg-type]
            stream_slice.get(end_field),  # type: ignore [arg-type]
        )
        and is_highest_observed_cursor_value
    ):
        self._highest_observed_cursor_field_value = record_cursor_value
Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.
Parameters
- stream_slice: The current slice, which may or may not contain the most recently observed record
- record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
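A sketch of the ordering behavior, with illustrative values: only records that fall inside the slice's window and move the cursor forward update the internally tracked highest value.

daily_slice = StreamSlice(
    partition={},
    cursor_slice={"start_time": "2024-01-01T00:00:00Z", "end_time": "2024-01-01T23:59:59Z"},
)
cursor.observe(
    daily_slice,
    Record(data={"updated_at": "2024-01-01T12:00:00Z"}, stream_name="users", associated_slice=daily_slice),
)
cursor.observe(
    daily_slice,
    Record(data={"updated_at": "2024-01-01T03:00:00Z"}, stream_name="users", associated_slice=daily_slice),
)
# The second record is older than the first, so the highest observed value stays at 12:00.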
def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
    if stream_slice.partition:
        raise ValueError(
            f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}."
        )
    cursor_value_str_by_cursor_value_datetime = dict(
        map(
            # we need to ensure the cursor value is preserved as is in the state else the CATs might complain of something like
            # 2023-01-04T17:30:19.000Z' <= '2023-01-04T17:30:19.000000Z'
            lambda datetime_str: (self.parse_date(datetime_str), datetime_str),  # type: ignore # because of the filter on the next line, this will only be called with a str
            filter(
                lambda item: item, [self._cursor, self._highest_observed_cursor_field_value]
            ),
        )
    )
    self._cursor = (
        cursor_value_str_by_cursor_value_datetime[
            max(cursor_value_str_by_cursor_value_datetime.keys())
        ]
        if cursor_value_str_by_cursor_value_datetime
        else None
    )
Update state based on the stream slice. Note that stream_slice.cursor_slice and most_recent_record.associated_slice are expected to be the same, but we make it explicit here that stream_slice should be leveraged to update the state. We do not pass in the latest record, since cursor instances should maintain the relevant internal state on their own.
Parameters
- stream_slice: slice to close
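Continuing the sketch from observe: once the slice's records have been observed, closing the slice promotes the highest observed value into the emittable state.

cursor.close_slice(daily_slice)  # daily_slice has an empty partition, as required
cursor.get_stream_state()
# => {"updated_at": "2024-01-01T12:00:00Z"}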
def stream_slices(self) -> Iterable[StreamSlice]:
    """
    Partition the daterange into slices of size = step.

    The start of the window is the maximum datetime between start_datetime and the stream_state's datetime - lookback_window.
    The end of the window is the minimum datetime between the current datetime and end_datetime.

    :return: the list of stream slices, one per step-sized window
    """
    end_datetime = self.select_best_end_datetime()
    start_datetime = self._calculate_earliest_possible_value(self.select_best_end_datetime())
    return self._partition_daterange(start_datetime, end_datetime, self._step)
Partition the daterange into slices of size = step.
The start of the window is the maximum datetime between start_datetime and the stream_state's datetime - lookback_window. The end of the window is the minimum datetime between the current datetime and end_datetime.
Returns
The list of stream slices, one per step-sized window.
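As an illustration of the partitioning, a cursor configured with step "P1D", granularity "PT1S", and a window from 2024-01-01 to 2024-01-03 would produce slices along these lines:

for s in cursor.stream_slices():
    print(s.cursor_slice)
# {"start_time": "2024-01-01T00:00:00Z", "end_time": "2024-01-01T23:59:59Z"}
# {"start_time": "2024-01-02T00:00:00Z", "end_time": "2024-01-02T23:59:59Z"}
# {"start_time": "2024-01-03T00:00:00Z", "end_time": "2024-01-03T00:00:00Z"}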
def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
    # Datetime based cursors operate over slices made up of datetime ranges. Stream state is based on the progress
    # through each slice and does not belong to a specific slice. We just return stream state as it is.
    return self.get_stream_state()
Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
def select_best_end_datetime(self) -> datetime.datetime:
    """
    Returns the optimal end datetime.
    This method compares the current datetime with a pre-configured end datetime
    and returns the earlier of the two. If no pre-configured end datetime is set,
    the current datetime is returned.

    :return datetime.datetime: The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier.
    """
    now = datetime.datetime.now(tz=self._timezone)
    if not self._end_datetime:
        return now
    return min(self._end_datetime.get_datetime(self.config), now)
Returns the optimal end datetime. This method compares the current datetime with a pre-configured end datetime and returns the earlier of the two. If no pre-configured end datetime is set, the current datetime is returned.
Returns
The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier.
def parse_date(self, date: str) -> datetime.datetime:
    for datetime_format in self.cursor_datetime_formats + [self.datetime_format]:
        try:
            return self._parser.parse(date, datetime_format)
        except ValueError:
            pass
    raise ValueError(f"No format in {self.cursor_datetime_formats} matching {date}")
def get_request_params(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    return self._get_request_options(RequestOptionType.request_parameter, stream_slice)
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g.: you might want to define query parameters for paging if next_page_token is not None.
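For example, configuring the start and end time options to inject into request parameters makes the slice boundaries appear as query parameters. The field names "since" and "until" below are hypothetical, but the wiring mirrors what _get_request_options expects:

start_option = RequestOption(
    inject_into=RequestOptionType.request_parameter, field_name="since", parameters={}
)
end_option = RequestOption(
    inject_into=RequestOptionType.request_parameter, field_name="until", parameters={}
)
# With these set as start_time_option/end_time_option on the cursor:
cursor.get_request_params(stream_slice=daily_slice)
# => {"since": "2024-01-01T00:00:00Z", "until": "2024-01-01T23:59:59Z"}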
def get_request_headers(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    return self._get_request_options(RequestOptionType.header, stream_slice)
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
def get_request_body_data(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    return self._get_request_options(RequestOptionType.body_data, stream_slice)
Specifies how to populate the body of the request with a non-JSON payload.
If it returns plain text, the text will be sent as is. If it returns a dict, it will be converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
def get_request_body_json(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    return self._get_request_options(RequestOptionType.body_json, stream_slice)
Specifies how to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
def should_be_synced(self, record: Record) -> bool:
    cursor_field = self.cursor_field.eval(self.config)  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
    record_cursor_value = record.get(cursor_field)
    if not record_cursor_value:
        self._send_log(
            Level.WARN,
            f"Could not find cursor field `{cursor_field}` in record. The incremental sync will assume it needs to be synced",
        )
        return True
    latest_possible_cursor_value = self.select_best_end_datetime()
    earliest_possible_cursor_value = self._calculate_earliest_possible_value(
        latest_possible_cursor_value
    )
    return self._is_within_daterange_boundaries(
        record, earliest_possible_cursor_value, latest_possible_cursor_value
    )
Evaluating whether a record should be synced allows for record filtering and for stop conditions during pagination.
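Illustrative behavior: records outside the computed [earliest, latest] window are filtered out, while records missing the cursor field pass through with a warning.

stale = Record(data={"updated_at": "1999-01-01T00:00:00Z"}, stream_name="users", associated_slice=daily_slice)
cursor.should_be_synced(stale)
# => False when 1999-01-01 falls before the computed earliest possible cursor value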
def set_runtime_lookback_window(self, lookback_window_in_seconds: int) -> None:
    """
    Updates the lookback window based on a given number of seconds if the new duration
    is greater than the currently configured lookback window.

    :param lookback_window_in_seconds: The lookback duration in seconds to potentially update to.
    """
    runtime_lookback_window = duration_isoformat(timedelta(seconds=lookback_window_in_seconds))
    config_lookback = parse_duration(
        self._lookback_window.eval(self.config) if self._lookback_window else "P0D"
    )

    # Check if the new runtime lookback window is greater than the current config lookback
    if parse_duration(runtime_lookback_window) > config_lookback:
        self._lookback_window = InterpolatedString.create(
            runtime_lookback_window, parameters={}
        )
Updates the lookback window based on a given number of seconds if the new duration is greater than the currently configured lookback window.
Parameters
- lookback_window_in_seconds: The lookback duration in seconds to potentially update to.
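For example, after a sync that ran for two hours:

cursor.set_runtime_lookback_window(7200)  # 7200 seconds, formatted as an ISO 8601 duration (PT2H)
# The runtime value replaces the configured lookback only if it is longer,
# e.g. it would not override a configured lookback_window of "P1D".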
class DeclarativeCursor(Cursor, StreamSlicer, ABC):
    """
    DeclarativeCursors are components that allow for checkpointing syncs. In addition to managing the fetching and updating of
    state, declarative cursors also manage stream slicing and injecting slice values into outbound requests.
    """
DeclarativeCursors are components that allow for checkpointing syncs. In addition to managing the fetching and updating of state, declarative cursors also manage stream slicing and injecting slice values into outbound requests.
class GlobalSubstreamCursor(DeclarativeCursor):
    """
    The GlobalSubstreamCursor is designed to track the state of substreams using a single global cursor.
    This class is beneficial for streams with many partitions, as it allows the state to be managed globally
    instead of per partition, simplifying state management and reducing the size of state messages.

    This cursor is activated by setting the `global_substream_cursor` parameter for incremental sync.

    Warnings:
    - This class enforces a minimal lookback window for substreams based on the duration of the previous sync to avoid losing records. This lookback ensures that any records added or updated during the sync are captured in subsequent syncs.
    - The global cursor is updated only at the end of the sync. If the sync ends prematurely (e.g., due to an exception), the state will not be updated.
    - When using the `incremental_dependency` option, the sync will progress through parent records, preventing the sync from getting infinitely stuck. However, it is crucial to understand the requirements for both the `global_substream_cursor` and `incremental_dependency` options to avoid data loss.
    """

    def __init__(self, stream_cursor: DatetimeBasedCursor, partition_router: PartitionRouter):
        self._stream_cursor = stream_cursor
        self._partition_router = partition_router
        self._timer = Timer()
        self._lock = threading.Lock()
        self._slice_semaphore = threading.Semaphore(
            0
        )  # Start with 0, indicating no slices being tracked
        self._all_slices_yielded = False
        self._lookback_window: Optional[int] = None
        self._current_partition: Optional[Mapping[str, Any]] = None
        self._last_slice: bool = False
        self._parent_state: Optional[Mapping[str, Any]] = None

    def start_slices_generation(self) -> None:
        self._timer.start()

    def stream_slices(self) -> Iterable[StreamSlice]:
        """
        Generates stream slices, ensuring the last slice is properly flagged and processed.

        This method creates a sequence of stream slices by iterating over partitions and cursor slices.
        It holds onto one slice in memory to set `_all_slices_yielded` to `True` before yielding the
        final slice. A semaphore is used to track the processing of slices, ensuring that `close_slice`
        is called only after all slices have been processed.

        We expect the following events:
        * Yields all the slices except the last one. At this point, `close_slice` won't actually close the global slice as `self._all_slices_yielded == False`
        * Release the semaphore one last time before setting `self._all_slices_yielded = True`. This will cause `close_slice` to know about all the slices before we indicate that all slices have been yielded, so the left side of `if self._all_slices_yielded and self._slice_semaphore._value == 0` will be false if not everything is closed
        * Setting `self._all_slices_yielded = True`. We do that before actually yielding the last slice as the caller of `stream_slices` might stop iterating at any point and hence the code after `yield` might not be executed
        * Yield the last slice. At that point, once there are as many slices yielded as closes, the global slice will be closed too
        """
        slice_generator = (
            StreamSlice(
                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
            )
            for partition in self._partition_router.stream_slices()
            for cursor_slice in self._stream_cursor.stream_slices()
        )

        self.start_slices_generation()
        for slice, last, state in iterate_with_last_flag_and_state(
            slice_generator, self._partition_router.get_stream_state
        ):
            self._parent_state = state
            self.register_slice(last)
            yield slice
        self._parent_state = self._partition_router.get_stream_state()

    def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
        slice_generator = (
            StreamSlice(
                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
            )
            for cursor_slice in self._stream_cursor.stream_slices()
        )

        yield from slice_generator

    def register_slice(self, last: bool) -> None:
        """
        Tracks the processing of a stream slice.

        Releases the semaphore for each slice. If it's the last slice (`last=True`),
        sets `_all_slices_yielded` to `True` to indicate no more slices will be processed.

        Args:
            last (bool): True if the current slice is the last in the sequence.
        """
        self._slice_semaphore.release()
        if last:
            self._all_slices_yielded = True

    def set_initial_state(self, stream_state: StreamState) -> None:
        """
        Set the initial state for the cursors.

        This method initializes the state for the global cursor using the provided stream state.

        Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
        does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.

        Args:
            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
                {
                    "state": {
                        "last_updated": "2023-05-27T00:00:00Z"
                    },
                    "parent_state": {
                        "parent_stream_name": {
                            "last_updated": "2023-05-27T00:00:00Z"
                        }
                    },
                    "lookback_window": 132
                }
        """
        if not stream_state:
            return

        if "lookback_window" in stream_state:
            self._lookback_window = stream_state["lookback_window"]
            self._inject_lookback_into_stream_cursor(stream_state["lookback_window"])

        if "state" in stream_state:
            self._stream_cursor.set_initial_state(stream_state["state"])
        elif "states" not in stream_state:
            # We assume that `stream_state` is in the old global format
            # Example: {"global_state_format_key": "global_state_format_value"}
            self._stream_cursor.set_initial_state(stream_state)

        # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
        # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
        # We are still keeping this line as a comment to be explicit about the past behavior.
        # self._partition_router.set_initial_state(stream_state)

    def _inject_lookback_into_stream_cursor(self, lookback_window: int) -> None:
        """
        Modifies the stream cursor's lookback window based on the duration of the previous sync.
        This adjustment ensures the cursor is set to the minimal lookback window necessary for
        avoiding missing data.

        Parameters:
            lookback_window (int): The lookback duration in seconds to be set, derived from
                the previous sync.

        Raises:
            ValueError: If the cursor does not support dynamic lookback window adjustments.
        """
        if hasattr(self._stream_cursor, "set_runtime_lookback_window"):
            self._stream_cursor.set_runtime_lookback_window(lookback_window)
        else:
            raise ValueError(
                "The cursor class for Global Substream Cursor does not have a set_runtime_lookback_window method"
            )

    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
        self._stream_cursor.observe(
            StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), record
        )

    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
        """
        Close the current stream slice.

        This method is called when a stream slice is completed. For the global parent cursor, we close the child cursor
        only after reading all slices. This ensures that we do not miss any child records from a later parent record
        if the child cursor is earlier than a record from the first parent record.

        Args:
            stream_slice (StreamSlice): The stream slice to be closed.
            *args (Any): Additional arguments.
        """
        with self._lock:
            self._slice_semaphore.acquire()
            if self._all_slices_yielded and self._slice_semaphore._value == 0:
                self._lookback_window = self._timer.finish()
                self._stream_cursor.close_slice(
                    StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args
                )

    def get_stream_state(self) -> StreamState:
        state: dict[str, Any] = {"state": self._stream_cursor.get_stream_state()}

        if self._parent_state:
            state["parent_state"] = self._parent_state

        if self._lookback_window is not None:
            state["lookback_window"] = self._lookback_window

        return state

    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
        # stream_slice is ignored as cursor is global
        return self._stream_cursor.get_stream_state()

    def get_request_params(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        if stream_slice:
            return self._partition_router.get_request_params(  # type: ignore # this always returns a mapping
                stream_state=stream_state,
                stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
                next_page_token=next_page_token,
            ) | self._stream_cursor.get_request_params(
                stream_state=stream_state,
                stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
                next_page_token=next_page_token,
            )
        else:
            raise ValueError("A partition needs to be provided in order to get request params")

    def get_request_headers(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        if stream_slice:
            return self._partition_router.get_request_headers(  # type: ignore # this always returns a mapping
                stream_state=stream_state,
                stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
                next_page_token=next_page_token,
            ) | self._stream_cursor.get_request_headers(
                stream_state=stream_state,
                stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
                next_page_token=next_page_token,
            )
        else:
            raise ValueError("A partition needs to be provided in order to get request headers")

    def get_request_body_data(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Union[Mapping[str, Any], str]:
        if stream_slice:
            return self._partition_router.get_request_body_data(  # type: ignore # this always returns a mapping
                stream_state=stream_state,
                stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
                next_page_token=next_page_token,
            ) | self._stream_cursor.get_request_body_data(
                stream_state=stream_state,
                stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
                next_page_token=next_page_token,
            )
        else:
            raise ValueError("A partition needs to be provided in order to get request body data")

    def get_request_body_json(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        if stream_slice:
            return self._partition_router.get_request_body_json(  # type: ignore # this always returns a mapping
                stream_state=stream_state,
                stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
                next_page_token=next_page_token,
            ) | self._stream_cursor.get_request_body_json(
                stream_state=stream_state,
                stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
                next_page_token=next_page_token,
            )
        else:
            raise ValueError("A partition needs to be provided in order to get request body json")

    def should_be_synced(self, record: Record) -> bool:
        return self._stream_cursor.should_be_synced(self._convert_record_to_cursor_record(record))

    @staticmethod
    def _convert_record_to_cursor_record(record: Record) -> Record:
        return Record(
            data=record.data,
            stream_name=record.stream_name,
            associated_slice=StreamSlice(
                partition={}, cursor_slice=record.associated_slice.cursor_slice
            )
            if record.associated_slice
            else None,
        )
The GlobalSubstreamCursor is designed to track the state of substreams using a single global cursor. This class is beneficial for streams with many partitions, as it allows the state to be managed globally instead of per partition, simplifying state management and reducing the size of state messages.
This cursor is activated by setting the global_substream_cursor parameter for incremental sync.
Warnings:
- This class enforces a minimal lookback window for substreams based on the duration of the previous sync to avoid losing records. This lookback ensures that any records added or updated during the sync are captured in subsequent syncs.
- The global cursor is updated only at the end of the sync. If the sync ends prematurely (e.g., due to an exception), the state will not be updated.
- When using the incremental_dependency option, the sync will progress through parent records, preventing the sync from getting infinitely stuck. However, it is crucial to understand the requirements for both the global_substream_cursor and incremental_dependency options to avoid data loss.
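A hedged construction sketch; both collaborators are assumed to be configured elsewhere:

global_cursor = GlobalSubstreamCursor(
    stream_cursor=datetime_cursor,      # a configured DatetimeBasedCursor
    partition_router=partition_router,  # e.g. a SubstreamPartitionRouter over a parent stream
)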
def __init__(self, stream_cursor: DatetimeBasedCursor, partition_router: PartitionRouter):
    self._stream_cursor = stream_cursor
    self._partition_router = partition_router
    self._timer = Timer()
    self._lock = threading.Lock()
    self._slice_semaphore = threading.Semaphore(
        0
    )  # Start with 0, indicating no slices being tracked
    self._all_slices_yielded = False
    self._lookback_window: Optional[int] = None
    self._current_partition: Optional[Mapping[str, Any]] = None
    self._last_slice: bool = False
    self._parent_state: Optional[Mapping[str, Any]] = None
def stream_slices(self) -> Iterable[StreamSlice]:
    """
    Generates stream slices, ensuring the last slice is properly flagged and processed.

    This method creates a sequence of stream slices by iterating over partitions and cursor slices.
    It holds onto one slice in memory to set `_all_slices_yielded` to `True` before yielding the
    final slice. A semaphore is used to track the processing of slices, ensuring that `close_slice`
    is called only after all slices have been processed.

    We expect the following events:
    * Yields all the slices except the last one. At this point, `close_slice` won't actually close the global slice as `self._all_slices_yielded == False`
    * Release the semaphore one last time before setting `self._all_slices_yielded = True`. This will cause `close_slice` to know about all the slices before we indicate that all slices have been yielded, so the left side of `if self._all_slices_yielded and self._slice_semaphore._value == 0` will be false if not everything is closed
    * Setting `self._all_slices_yielded = True`. We do that before actually yielding the last slice as the caller of `stream_slices` might stop iterating at any point and hence the code after `yield` might not be executed
    * Yield the last slice. At that point, once there are as many slices yielded as closes, the global slice will be closed too
    """
    slice_generator = (
        StreamSlice(
            partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
        )
        for partition in self._partition_router.stream_slices()
        for cursor_slice in self._stream_cursor.stream_slices()
    )

    self.start_slices_generation()
    for slice, last, state in iterate_with_last_flag_and_state(
        slice_generator, self._partition_router.get_stream_state
    ):
        self._parent_state = state
        self.register_slice(last)
        yield slice
    self._parent_state = self._partition_router.get_stream_state()
Generates stream slices, ensuring the last slice is properly flagged and processed.
This method creates a sequence of stream slices by iterating over partitions and cursor slices.
It holds onto one slice in memory to set _all_slices_yielded to True before yielding the final slice. A semaphore is used to track the processing of slices, ensuring that close_slice is called only after all slices have been processed.
We expect the following events:
- Yields all the slices except the last one. At this point, close_slice won't actually close the global slice as self._all_slices_yielded == False
- Release the semaphore one last time before setting self._all_slices_yielded = True. This will cause close_slice to know about all the slices before we indicate that all slices have been yielded, so the left side of if self._all_slices_yielded and self._slice_semaphore._value == 0 will be false if not everything is closed
- Setting self._all_slices_yielded = True. We do that before actually yielding the last slice, as the caller of stream_slices might stop iterating at any point and hence the code after yield might not be executed
- Yield the last slice. At that point, once there are as many slices yielded as closes, the global slice will be closed too
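A standalone sketch of the bookkeeping described above (not the CDK code itself): the semaphore counts yielded-but-unclosed slices, and the last-slice flag is flipped before the final yield so that the final close can detect completion.

import threading
from typing import Iterable, Iterator

semaphore = threading.Semaphore(0)
all_slices_yielded = False

def yield_slices(slices: Iterable[str]) -> Iterator[str]:
    global all_slices_yielded
    items = list(slices)  # assumes a non-empty sequence
    for item in items[:-1]:
        semaphore.release()        # one release per yielded slice
        yield item
    semaphore.release()            # release for the last slice...
    all_slices_yielded = True      # ...then flag completion before yielding it
    yield items[-1]

def close_slice() -> None:
    semaphore.acquire()            # one acquire per closed slice
    if all_slices_yielded and semaphore._value == 0:
        print("every yielded slice is closed -> commit the global cursor state")

for s in yield_slices(["a", "b", "c"]):
    close_slice()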
def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
    slice_generator = (
        StreamSlice(
            partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
        )
        for cursor_slice in self._stream_cursor.stream_slices()
    )

    yield from slice_generator
def register_slice(self, last: bool) -> None:
    """
    Tracks the processing of a stream slice.

    Releases the semaphore for each slice. If it's the last slice (`last=True`),
    sets `_all_slices_yielded` to `True` to indicate no more slices will be processed.

    Args:
        last (bool): True if the current slice is the last in the sequence.
    """
    self._slice_semaphore.release()
    if last:
        self._all_slices_yielded = True
Tracks the processing of a stream slice.
Releases the semaphore for each slice. If it's the last slice (last=True), sets _all_slices_yielded to True to indicate no more slices will be processed.
Arguments:
- last (bool): True if the current slice is the last in the sequence.
def set_initial_state(self, stream_state: StreamState) -> None:
    """
    Set the initial state for the cursors.

    This method initializes the state for the global cursor using the provided stream state.

    Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
    does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.

    Args:
        stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
            {
                "state": {
                    "last_updated": "2023-05-27T00:00:00Z"
                },
                "parent_state": {
                    "parent_stream_name": {
                        "last_updated": "2023-05-27T00:00:00Z"
                    }
                },
                "lookback_window": 132
            }
    """
    if not stream_state:
        return

    if "lookback_window" in stream_state:
        self._lookback_window = stream_state["lookback_window"]
        self._inject_lookback_into_stream_cursor(stream_state["lookback_window"])

    if "state" in stream_state:
        self._stream_cursor.set_initial_state(stream_state["state"])
    elif "states" not in stream_state:
        # We assume that `stream_state` is in the old global format
        # Example: {"global_state_format_key": "global_state_format_value"}
        self._stream_cursor.set_initial_state(stream_state)

    # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
    # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
    # We are still keeping this line as a comment to be explicit about the past behavior.
    # self._partition_router.set_initial_state(stream_state)
Set the initial state for the cursors.
This method initializes the state for the global cursor using the provided stream state.
Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
Arguments:
- stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
    {
        "state": {
            "last_updated": "2023-05-27T00:00:00Z"
        },
        "parent_state": {
            "parent_stream_name": {
                "last_updated": "2023-05-27T00:00:00Z"
            }
        },
        "lookback_window": 132
    }
def observe(self, stream_slice: StreamSlice, record: Record) -> None:
    self._stream_cursor.observe(
        StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), record
    )
Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.
Parameters
- stream_slice: The current slice, which may or may not contain the most recently observed record
- record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
    """
    Close the current stream slice.

    This method is called when a stream slice is completed. For the global parent cursor, we close the child cursor
    only after reading all slices. This ensures that we do not miss any child records from a later parent record
    if the child cursor is earlier than a record from the first parent record.

    Args:
        stream_slice (StreamSlice): The stream slice to be closed.
        *args (Any): Additional arguments.
    """
    with self._lock:
        self._slice_semaphore.acquire()
        if self._all_slices_yielded and self._slice_semaphore._value == 0:
            self._lookback_window = self._timer.finish()
            self._stream_cursor.close_slice(
                StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args
            )
Close the current stream slice.
This method is called when a stream slice is completed. For the global parent cursor, we close the child cursor only after reading all slices. This ensures that we do not miss any child records from a later parent record if the child cursor is earlier than a record from the first parent record.
Arguments:
- stream_slice (StreamSlice): The stream slice to be closed.
- *args (Any): Additional arguments.
def get_stream_state(self) -> StreamState:
    state: dict[str, Any] = {"state": self._stream_cursor.get_stream_state()}

    if self._parent_state:
        state["parent_state"] = self._parent_state

    if self._lookback_window is not None:
        state["lookback_window"] = self._lookback_window

    return state
Returns the current stream state. We would like to restrict its usage since it exposes internals of the state. As of 2023-06-14, it is used for three things:
- Interpolation of the requests
- Transformation of records
- Saving the state
For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.
def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
    # stream_slice is ignored as cursor is global
    return self._stream_cursor.get_stream_state()
Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
def get_request_params(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    if stream_slice:
        return self._partition_router.get_request_params(  # type: ignore # this always returns a mapping
            stream_state=stream_state,
            stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
            next_page_token=next_page_token,
        ) | self._stream_cursor.get_request_params(
            stream_state=stream_state,
            stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
            next_page_token=next_page_token,
        )
    else:
        raise ValueError("A partition needs to be provided in order to get request params")
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g., you might want to define query parameters for paging if next_page_token is not None.
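The implementation above combines the partition router's options with the stream cursor's options using Python's dict-merge operator, where the right-hand side wins on key collisions. A quick illustration with made-up keys:

```python
router_params = {"account_id": "123", "page_size": 50}
cursor_params = {"updated_after": "2023-05-27T00:00:00Z", "page_size": 100}

merged = router_params | cursor_params
# {'account_id': '123', 'page_size': 100, 'updated_after': '2023-05-27T00:00:00Z'}
# On a collision ("page_size"), the stream cursor's value takes precedence.
```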
def get_request_headers(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    if stream_slice:
        return self._partition_router.get_request_headers(  # type: ignore # this always returns a mapping
            stream_state=stream_state,
            stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
            next_page_token=next_page_token,
        ) | self._stream_cursor.get_request_headers(
            stream_state=stream_state,
            stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
            next_page_token=next_page_token,
        )
    else:
        raise ValueError("A partition needs to be provided in order to get request headers")
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
def get_request_body_data(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Union[Mapping[str, Any], str]:
    if stream_slice:
        return self._partition_router.get_request_body_data(  # type: ignore # this always returns a mapping
            stream_state=stream_state,
            stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
            next_page_token=next_page_token,
        ) | self._stream_cursor.get_request_body_data(
            stream_state=stream_state,
            stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
            next_page_token=next_page_token,
        )
    else:
        raise ValueError("A partition needs to be provided in order to get request body data")
Specifies how to populate the body of the request with a non-JSON payload.
If it returns a string, the text will be sent as-is; if it returns a dict, it will be converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
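The dict-to-form conversion described above behaves like urllib.parse.urlencode from the standard library; a small illustration:

```python
from urllib.parse import urlencode

body = {"key1": "value1", "key2": "value2"}
print(urlencode(body))  # key1=value1&key2=value2
```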
def get_request_body_json(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    if stream_slice:
        return self._partition_router.get_request_body_json(  # type: ignore # this always returns a mapping
            stream_state=stream_state,
            stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
            next_page_token=next_page_token,
        ) | self._stream_cursor.get_request_body_json(
            stream_state=stream_state,
            stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
            next_page_token=next_page_token,
        )
    else:
        raise ValueError("A partition needs to be provided in order to get request body json")
Specifies how to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
class PerPartitionCursor(DeclarativeCursor):
    """
    Manages state per partition when a stream has many partitions, to prevent data loss or duplication.

    **Partition Limitation and Limit Reached Logic**

    - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
    - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
    - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded.

    The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.

    - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
    - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.

    This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
    """

    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
    _NO_STATE: Mapping[str, Any] = {}
    _NO_CURSOR_STATE: Mapping[str, Any] = {}
    _KEY = 0
    _VALUE = 1
    _state_to_migrate_from: Mapping[str, Any] = {}

    def __init__(self, cursor_factory: CursorFactory, partition_router: PartitionRouter):
        self._cursor_factory = cursor_factory
        self._partition_router = partition_router
        # The dict is ordered to ensure that once the maximum number of partitions is reached,
        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
        self._cursor_per_partition: OrderedDict[str, DeclarativeCursor] = OrderedDict()
        self._over_limit = 0
        self._partition_serializer = PerPartitionKeySerializer()

    def stream_slices(self) -> Iterable[StreamSlice]:
        slices = self._partition_router.stream_slices()
        for partition in slices:
            yield from self.generate_slices_from_partition(partition)

    def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
        # Ensure the maximum number of partitions is not exceeded
        self._ensure_partition_limit()

        cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
        if not cursor:
            partition_state = (
                self._state_to_migrate_from
                if self._state_to_migrate_from
                else self._NO_CURSOR_STATE
            )
            cursor = self._create_cursor(partition_state)
            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor

        for cursor_slice in cursor.stream_slices():
            yield StreamSlice(
                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
            )

    def _ensure_partition_limit(self) -> None:
        """
        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
        """
        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
            self._over_limit += 1
            oldest_partition = self._cursor_per_partition.popitem(last=False)[
                0
            ]  # Remove the oldest partition
            logger.warning(
                f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
            )

    def limit_reached(self) -> bool:
        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER

    def set_initial_state(self, stream_state: StreamState) -> None:
        """
        Set the initial state for the cursors.

        This method initializes the state for each partition cursor using the provided stream state.
        If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.

        Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
        does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.

        Args:
            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
                {
                    "states": [
                        {
                            "partition": {
                                "partition_key": "value"
                            },
                            "cursor": {
                                "last_updated": "2023-05-27T00:00:00Z"
                            }
                        }
                    ],
                    "parent_state": {
                        "parent_stream_name": {
                            "last_updated": "2023-05-27T00:00:00Z"
                        }
                    }
                }
        """
        if not stream_state:
            return

        if "states" not in stream_state:
            # We assume that `stream_state` is in a global format that can be applied to all partitions.
            # Example: {"global_state_format_key": "global_state_format_value"}
            self._state_to_migrate_from = stream_state

        else:
            for state in stream_state["states"]:
                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
                    self._create_cursor(state["cursor"])
                )

            # set default state for missing partitions if it is per partition with fallback to global
            if "state" in stream_state:
                self._state_to_migrate_from = stream_state["state"]

        # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
        # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
        # We are still keeping this line as a comment to be explicit about the past behavior.
        # self._partition_router.set_initial_state(stream_state)

    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
        self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe(
            StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), record
        )

    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
        try:
            self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].close_slice(
                StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args
            )
        except KeyError as exception:
            raise ValueError(
                f"Partition {str(exception)} could not be found in current state based on the record. This is unexpected because "
                f"we should only update state for partitions that were emitted during `stream_slices`"
            )

    def get_stream_state(self) -> StreamState:
        states = []
        for partition_tuple, cursor in self._cursor_per_partition.items():
            cursor_state = cursor.get_stream_state()
            if cursor_state:
                states.append(
                    {
                        "partition": self._to_dict(partition_tuple),
                        "cursor": cursor_state,
                    }
                )
        state: dict[str, Any] = {"states": states}

        parent_state = self._partition_router.get_stream_state()
        if parent_state:
            state["parent_state"] = parent_state
        return state

    def _get_state_for_partition(self, partition: Mapping[str, Any]) -> Optional[StreamState]:
        cursor = self._cursor_per_partition.get(self._to_partition_key(partition))
        if cursor:
            return cursor.get_stream_state()

        return None

    @staticmethod
    def _is_new_state(stream_state: Mapping[str, Any]) -> bool:
        return not bool(stream_state)

    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
        return self._partition_serializer.to_partition_key(partition)

    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
        return self._partition_serializer.to_partition(partition_key)

    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
        if not stream_slice:
            raise ValueError("A partition needs to be provided in order to extract a state")

        return self._get_state_for_partition(stream_slice.partition)

    def _create_cursor(self, cursor_state: Any) -> DeclarativeCursor:
        cursor = self._cursor_factory.create()
        cursor.set_initial_state(cursor_state)
        return cursor

    def get_request_params(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        if stream_slice:
            if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition:
                self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition))
            return self._partition_router.get_request_params(  # type: ignore # this always returns a mapping
                stream_state=stream_state,
                stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
                next_page_token=next_page_token,
            ) | self._cursor_per_partition[
                self._to_partition_key(stream_slice.partition)
            ].get_request_params(
                stream_state=stream_state,
                stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
                next_page_token=next_page_token,
            )
        else:
            raise ValueError("A partition needs to be provided in order to get request params")

    def get_request_headers(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        if stream_slice:
            if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition:
                self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition))
            return self._partition_router.get_request_headers(  # type: ignore # this always returns a mapping
                stream_state=stream_state,
                stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
                next_page_token=next_page_token,
            ) | self._cursor_per_partition[
                self._to_partition_key(stream_slice.partition)
            ].get_request_headers(
                stream_state=stream_state,
                stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
                next_page_token=next_page_token,
            )
        else:
            raise ValueError("A partition needs to be provided in order to get request headers")

    def get_request_body_data(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Union[Mapping[str, Any], str]:
        if stream_slice:
            if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition:
                self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition))
            return self._partition_router.get_request_body_data(  # type: ignore # this always returns a mapping
                stream_state=stream_state,
                stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
                next_page_token=next_page_token,
            ) | self._cursor_per_partition[
                self._to_partition_key(stream_slice.partition)
            ].get_request_body_data(
                stream_state=stream_state,
                stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
                next_page_token=next_page_token,
            )
        else:
            raise ValueError("A partition needs to be provided in order to get request body data")

    def get_request_body_json(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        if stream_slice:
            if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition:
                self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition))
            return self._partition_router.get_request_body_json(  # type: ignore # this always returns a mapping
                stream_state=stream_state,
                stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
                next_page_token=next_page_token,
            ) | self._cursor_per_partition[
                self._to_partition_key(stream_slice.partition)
            ].get_request_body_json(
                stream_state=stream_state,
                stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
                next_page_token=next_page_token,
            )
        else:
            raise ValueError("A partition needs to be provided in order to get request body json")

    def should_be_synced(self, record: Record) -> bool:
        return self._get_cursor(record).should_be_synced(
            self._convert_record_to_cursor_record(record)
        )

    @staticmethod
    def _convert_record_to_cursor_record(record: Record) -> Record:
        return Record(
            data=record.data,
            stream_name=record.stream_name,
            associated_slice=StreamSlice(
                partition={}, cursor_slice=record.associated_slice.cursor_slice
            )
            if record.associated_slice
            else None,
        )

    def _get_cursor(self, record: Record) -> DeclarativeCursor:
        if not record.associated_slice:
            raise ValueError(
                "Invalid state as stream slices that are emitted should refer to an existing cursor"
            )
        partition_key = self._to_partition_key(record.associated_slice.partition)
        if partition_key not in self._cursor_per_partition:
            self._create_cursor_for_partition(partition_key)
        cursor = self._cursor_per_partition[partition_key]
        return cursor

    def _create_cursor_for_partition(self, partition_key: str) -> None:
        """
        Dynamically creates and initializes a cursor for the specified partition.

        This method is required for `ConcurrentPerPartitionCursor`. For concurrent cursors,
        stream_slices is executed only for the concurrent cursor, so cursors per partition
        are not created for the declarative cursor. This method ensures that a cursor is available
        to create requests for the specified partition. The cursor is initialized
        with the per-partition state if present in the initial state, or with the global state
        adjusted by the lookback window, or with the state to migrate from.

        Note:
            This is a temporary workaround and should be removed once the declarative cursor
            is decoupled from the concurrent cursor implementation.

        Args:
            partition_key (str): The unique identifier for the partition for which the cursor
                needs to be created.
        """
        partition_state = (
            self._state_to_migrate_from if self._state_to_migrate_from else self._NO_CURSOR_STATE
        )
        cursor = self._create_cursor(partition_state)

        self._cursor_per_partition[partition_key] = cursor
Manages state per partition when a stream has many partitions, to prevent data loss or duplication.
Partition Limitation and Limit Reached Logic
- DEFAULT_MAX_PARTITIONS_NUMBER: The maximum number of partitions to keep in memory (default is 10,000).
- _cursor_per_partition: An ordered dictionary that stores cursors for each partition.
- _over_limit: A counter that increments each time an oldest partition is removed when the limit is exceeded.
The class ensures that the number of partitions tracked does not exceed the DEFAULT_MAX_PARTITIONS_NUMBER to prevent excessive memory usage.
- When the number of partitions exceeds the limit, the oldest partitions are removed from _cursor_per_partition, and _over_limit is incremented accordingly.
- The limit_reached method returns True when _over_limit exceeds DEFAULT_MAX_PARTITIONS_NUMBER, indicating that the global cursor should be used instead of per-partition cursors.
This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
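A self-contained sketch of the eviction and over-limit bookkeeping described above, using a small limit so the behavior is visible; this mirrors _ensure_partition_limit and limit_reached but is not the CDK code itself:

```python
from collections import OrderedDict

MAX_PARTITIONS = 3  # the real default is 10,000

cursors = OrderedDict()  # partition key -> cursor (stubbed as a dict here)
over_limit = 0

def track_partition(key: str) -> None:
    global over_limit
    # Evict oldest partitions until there is room for the new one.
    while len(cursors) > MAX_PARTITIONS - 1:
        over_limit += 1
        oldest, _ = cursors.popitem(last=False)
        print(f"Dropping oldest partition: {oldest} (over limit: {over_limit})")
    cursors[key] = {}

def limit_reached() -> bool:
    # Only a sustained overflow (more evictions than the limit itself)
    # triggers the switch to a global cursor.
    return over_limit > MAX_PARTITIONS

for i in range(8):
    track_partition(f"partition_{i}")

print(limit_reached())  # True once over_limit exceeds MAX_PARTITIONS
```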
def __init__(self, cursor_factory: CursorFactory, partition_router: PartitionRouter):
    self._cursor_factory = cursor_factory
    self._partition_router = partition_router
    # The dict is ordered to ensure that once the maximum number of partitions is reached,
    # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
    self._cursor_per_partition: OrderedDict[str, DeclarativeCursor] = OrderedDict()
    self._over_limit = 0
    self._partition_serializer = PerPartitionKeySerializer()
def stream_slices(self) -> Iterable[StreamSlice]:
    slices = self._partition_router.stream_slices()
    for partition in slices:
        yield from self.generate_slices_from_partition(partition)
Defines stream slices
Returns
An iterable of stream slices
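Schematically, the slices produced here are a nested product: each partition from the partition router is paired with the slices of that partition's own cursor. A toy illustration with made-up fields (in the real class, each partition's cursor can produce different windows):

```python
partitions = [{"account_id": "a"}, {"account_id": "b"}]
windows = [
    {"start_time": "2023-01-01", "end_time": "2023-01-31"},
    {"start_time": "2023-02-01", "end_time": "2023-02-28"},
]

slices = [
    {"partition": partition, "cursor_slice": window}
    for partition in partitions
    for window in windows
]
# 4 slices: each partition is paired with each datetime window.
```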
def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
    # Ensure the maximum number of partitions is not exceeded
    self._ensure_partition_limit()

    cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
    if not cursor:
        partition_state = (
            self._state_to_migrate_from
            if self._state_to_migrate_from
            else self._NO_CURSOR_STATE
        )
        cursor = self._create_cursor(partition_state)
        self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor

    for cursor_slice in cursor.stream_slices():
        yield StreamSlice(
            partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
        )
def set_initial_state(self, stream_state: StreamState) -> None:
    """
    Set the initial state for the cursors.

    This method initializes the state for each partition cursor using the provided stream state.
    If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.

    Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
    does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.

    Args:
        stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
            {
                "states": [
                    {
                        "partition": {
                            "partition_key": "value"
                        },
                        "cursor": {
                            "last_updated": "2023-05-27T00:00:00Z"
                        }
                    }
                ],
                "parent_state": {
                    "parent_stream_name": {
                        "last_updated": "2023-05-27T00:00:00Z"
                    }
                }
            }
    """
    if not stream_state:
        return

    if "states" not in stream_state:
        # We assume that `stream_state` is in a global format that can be applied to all partitions.
        # Example: {"global_state_format_key": "global_state_format_value"}
        self._state_to_migrate_from = stream_state

    else:
        for state in stream_state["states"]:
            self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
                self._create_cursor(state["cursor"])
            )

        # set default state for missing partitions if it is per partition with fallback to global
        if "state" in stream_state:
            self._state_to_migrate_from = stream_state["state"]

    # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
    # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
    # We are still keeping this line as a comment to be explicit about the past behavior.
    # self._partition_router.set_initial_state(stream_state)
Set the initial state for the cursors.
This method initializes the state for each partition cursor using the provided stream state. If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.
Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
Arguments:
- stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:

      {
          "states": [
              {
                  "partition": {"partition_key": "value"},
                  "cursor": {"last_updated": "2023-05-27T00:00:00Z"}
              }
          ],
          "parent_state": {
              "parent_stream_name": {"last_updated": "2023-05-27T00:00:00Z"}
          }
      }
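To make the migration path above concrete: when state arrives without a "states" list, the whole mapping becomes the default applied to every newly seen partition; when a top-level "state" key accompanies "states", it fills in for partitions that are missing. A sketch with made-up values:

```python
# Legacy global state: no "states" key, so every partition seen for the
# first time starts from this state.
legacy_state = {"last_updated": "2023-05-27T00:00:00Z"}

# Per-partition with global fallback: known partitions get their own cursor,
# and "state" seeds any partition absent from "states".
per_partition_with_fallback = {
    "states": [
        {
            "partition": {"partition_key": "value"},
            "cursor": {"last_updated": "2023-05-27T00:00:00Z"},
        }
    ],
    "state": {"last_updated": "2023-05-20T00:00:00Z"},
}
```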
def observe(self, stream_slice: StreamSlice, record: Record) -> None:
    self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe(
        StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), record
    )
Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.
Parameters
- stream_slice: The current slice, which may or may not contain the most recently observed record
- record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
    try:
        self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].close_slice(
            StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args
        )
    except KeyError as exception:
        raise ValueError(
            f"Partition {str(exception)} could not be found in current state based on the record. This is unexpected because "
            f"we should only update state for partitions that were emitted during `stream_slices`"
        )
Update state based on the stream slice. Note that stream_slice.cursor_slice and most_recent_record.associated_slice are expected to be the same, but we make it explicit here that stream_slice should be leveraged to update the state. We do not pass in the latest record, since cursor instances should maintain the relevant internal state on their own.
Parameters
- stream_slice: slice to close
def get_stream_state(self) -> StreamState:
    states = []
    for partition_tuple, cursor in self._cursor_per_partition.items():
        cursor_state = cursor.get_stream_state()
        if cursor_state:
            states.append(
                {
                    "partition": self._to_dict(partition_tuple),
                    "cursor": cursor_state,
                }
            )
    state: dict[str, Any] = {"states": states}

    parent_state = self._partition_router.get_stream_state()
    if parent_state:
        state["parent_state"] = parent_state
    return state
Returns the current stream state. We would like to restrict its usage since it exposes the internals of the state. As of 2023-06-14, it is used for the following:
- Interpolation of the requests
- Transformation of records
- Saving the state
For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.
def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
    if not stream_slice:
        raise ValueError("A partition needs to be provided in order to extract a state")

    return self._get_state_for_partition(stream_slice.partition)
Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
def get_request_params(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    if stream_slice:
        if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition:
            self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition))
        return self._partition_router.get_request_params(  # type: ignore # this always returns a mapping
            stream_state=stream_state,
            stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
            next_page_token=next_page_token,
        ) | self._cursor_per_partition[
            self._to_partition_key(stream_slice.partition)
        ].get_request_params(
            stream_state=stream_state,
            stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
            next_page_token=next_page_token,
        )
    else:
        raise ValueError("A partition needs to be provided in order to get request params")
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g., you might want to define query parameters for paging if next_page_token is not None.
def get_request_headers(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    if stream_slice:
        if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition:
            self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition))
        return self._partition_router.get_request_headers(  # type: ignore # this always returns a mapping
            stream_state=stream_state,
            stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
            next_page_token=next_page_token,
        ) | self._cursor_per_partition[
            self._to_partition_key(stream_slice.partition)
        ].get_request_headers(
            stream_state=stream_state,
            stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
            next_page_token=next_page_token,
        )
    else:
        raise ValueError("A partition needs to be provided in order to get request headers")
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
def get_request_body_data(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Union[Mapping[str, Any], str]:
    if stream_slice:
        if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition:
            self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition))
        return self._partition_router.get_request_body_data(  # type: ignore # this always returns a mapping
            stream_state=stream_state,
            stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
            next_page_token=next_page_token,
        ) | self._cursor_per_partition[
            self._to_partition_key(stream_slice.partition)
        ].get_request_body_data(
            stream_state=stream_state,
            stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
            next_page_token=next_page_token,
        )
    else:
        raise ValueError("A partition needs to be provided in order to get request body data")
Specifies how to populate the body of the request with a non-JSON payload.
If it returns a string, the text will be sent as-is; if it returns a dict, it will be converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
def get_request_body_json(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    if stream_slice:
        if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition:
            self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition))
        return self._partition_router.get_request_body_json(  # type: ignore # this always returns a mapping
            stream_state=stream_state,
            stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
            next_page_token=next_page_token,
        ) | self._cursor_per_partition[
            self._to_partition_key(stream_slice.partition)
        ].get_request_body_json(
            stream_state=stream_state,
            stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
            next_page_token=next_page_token,
        )
    else:
        raise ValueError("A partition needs to be provided in order to get request body json")
Specifies how to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
class PerPartitionWithGlobalCursor(DeclarativeCursor):
    """
    Manages state for streams with multiple partitions, with an optional fallback to a global cursor when specific conditions are met.

    This cursor handles partitioned streams by maintaining individual state per partition using `PerPartitionCursor`. If the number of partitions exceeds a defined limit, it switches to a global cursor (`GlobalSubstreamCursor`) to manage state more efficiently.

    **Overview**

    - **Partition-Based State**: Initially manages state per partition to ensure accurate processing of each partition's data.
    - **Global Fallback**: Switches to a global cursor when the partition limit is exceeded to handle state management more effectively.

    **Switching Logic**

    - Monitors the number of partitions.
    - If `PerPartitionCursor.limit_reached()` returns `True`, sets `_use_global_cursor` to `True`, activating the global cursor.

    **Active Cursor Selection**

    - Uses the `_get_active_cursor()` helper method to select the active cursor based on the `_use_global_cursor` flag.
    - This simplifies the logic and ensures consistent cursor usage across methods.

    **State Structure Example**

    ```json
    {
        "states": [
            {
                "partition": {"partition_key": "partition_1"},
                "cursor": {"cursor_field": "2021-01-15"}
            },
            {
                "partition": {"partition_key": "partition_2"},
                "cursor": {"cursor_field": "2021-02-14"}
            }
        ],
        "state": {
            "cursor_field": "2021-02-15"
        },
        "use_global_cursor": false
    }
    ```

    In this example, the cursor is using partition-based state management (`"use_global_cursor": false`), maintaining separate cursor states for each partition.

    **Usage Scenario**

    Suitable for streams where the number of partitions may vary significantly, requiring dynamic switching between per-partition and global state management to ensure data consistency and efficient synchronization.
    """

    def __init__(
        self,
        cursor_factory: CursorFactory,
        partition_router: PartitionRouter,
        stream_cursor: DatetimeBasedCursor,
    ):
        self._partition_router = partition_router
        self._per_partition_cursor = PerPartitionCursor(cursor_factory, partition_router)
        self._global_cursor = GlobalSubstreamCursor(stream_cursor, partition_router)
        self._use_global_cursor = False
        self._current_partition: Optional[Mapping[str, Any]] = None
        self._last_slice: bool = False
        self._parent_state: Optional[Mapping[str, Any]] = None

    def _get_active_cursor(self) -> Union[PerPartitionCursor, GlobalSubstreamCursor]:
        return self._global_cursor if self._use_global_cursor else self._per_partition_cursor

    def stream_slices(self) -> Iterable[StreamSlice]:
        self._global_cursor.start_slices_generation()

        # Iterate through partitions and process slices
        for partition, is_last_partition, parent_state in iterate_with_last_flag_and_state(
            self._partition_router.stream_slices(), self._partition_router.get_stream_state
        ):
            # Generate slices for the current cursor and handle the last slice using the flag
            self._parent_state = parent_state
            for slice, is_last_slice, _ in iterate_with_last_flag_and_state(
                self._get_active_cursor().generate_slices_from_partition(partition=partition),
                lambda: None,
            ):
                self._global_cursor.register_slice(is_last_slice and is_last_partition)
                yield slice
        self._parent_state = self._partition_router.get_stream_state()

    def set_initial_state(self, stream_state: StreamState) -> None:
        """
        Set the initial state for the cursors.
        """
        self._use_global_cursor = stream_state.get("use_global_cursor", False)

        self._parent_state = stream_state.get("parent_state", {})

        self._global_cursor.set_initial_state(stream_state)
        if not self._use_global_cursor:
            self._per_partition_cursor.set_initial_state(stream_state)

    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
        if not self._use_global_cursor and self._per_partition_cursor.limit_reached():
            self._use_global_cursor = True

        if not self._use_global_cursor:
            self._per_partition_cursor.observe(stream_slice, record)
        self._global_cursor.observe(stream_slice, record)

    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
        if not self._use_global_cursor:
            self._per_partition_cursor.close_slice(stream_slice, *args)
        self._global_cursor.close_slice(stream_slice, *args)

    def get_stream_state(self) -> StreamState:
        final_state: MutableMapping[str, Any] = {"use_global_cursor": self._use_global_cursor}

        final_state.update(self._global_cursor.get_stream_state())
        if not self._use_global_cursor:
            final_state.update(self._per_partition_cursor.get_stream_state())

        final_state["parent_state"] = self._parent_state
        if not final_state.get("parent_state"):
            del final_state["parent_state"]

        return final_state

    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
        return self._get_active_cursor().select_state(stream_slice)

    def get_request_params(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return self._get_active_cursor().get_request_params(
            stream_state=stream_state,
            stream_slice=stream_slice,
            next_page_token=next_page_token,
        )

    def get_request_headers(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return self._get_active_cursor().get_request_headers(
            stream_state=stream_state,
            stream_slice=stream_slice,
            next_page_token=next_page_token,
        )

    def get_request_body_data(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Union[Mapping[str, Any], str]:
        return self._get_active_cursor().get_request_body_data(
            stream_state=stream_state,
            stream_slice=stream_slice,
            next_page_token=next_page_token,
        )

    def get_request_body_json(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return self._get_active_cursor().get_request_body_json(
            stream_state=stream_state,
            stream_slice=stream_slice,
            next_page_token=next_page_token,
        )

    def should_be_synced(self, record: Record) -> bool:
        return self._get_active_cursor().should_be_synced(record)
Manages state for streams with multiple partitions, with an optional fallback to a global cursor when specific conditions are met.
This cursor handles partitioned streams by maintaining individual state per partition using PerPartitionCursor. If the number of partitions exceeds a defined limit, it switches to a global cursor (GlobalSubstreamCursor) to manage state more efficiently.
Overview
- Partition-Based State: Initially manages state per partition to ensure accurate processing of each partition's data.
- Global Fallback: Switches to a global cursor when the partition limit is exceeded to handle state management more effectively.
Switching Logic
- Monitors the number of partitions.
- If PerPartitionCursor.limit_reached() returns True, sets _use_global_cursor to True, activating the global cursor.
Active Cursor Selection
- Uses the _get_active_cursor() helper method to select the active cursor based on the _use_global_cursor flag.
- This simplifies the logic and ensures consistent cursor usage across methods.
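A compact sketch of the one-way switch and the active-cursor dispatch described above; the two cursor objects here are simple placeholders, not the CDK classes:

```python
class _StubCursor:
    def __init__(self, name: str) -> None:
        self.name = name

per_partition = _StubCursor("per-partition")
global_cursor = _StubCursor("global")
use_global_cursor = False

def on_record(limit_reached: bool) -> None:
    global use_global_cursor
    # The switch is one-way: once flipped, the sync stays on the global cursor
    # and per-partition state stops being updated.
    if not use_global_cursor and limit_reached:
        use_global_cursor = True

def get_active_cursor() -> _StubCursor:
    return global_cursor if use_global_cursor else per_partition

on_record(limit_reached=True)
print(get_active_cursor().name)  # global
```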
State Structure Example
{
"states": [
{
"partition": {"partition_key": "partition_1"},
"cursor": {"cursor_field": "2021-01-15"}
},
{
"partition": {"partition_key": "partition_2"},
"cursor": {"cursor_field": "2021-02-14"}
}
],
"state": {
"cursor_field": "2021-02-15"
},
"use_global_cursor": false
}
In this example, the cursor is using partition-based state management ("use_global_cursor": false), maintaining separate cursor states for each partition.
Usage Scenario
Suitable for streams where the number of partitions may vary significantly, requiring dynamic switching between per-partition and global state management to ensure data consistency and efficient synchronization.
def __init__(
    self,
    cursor_factory: CursorFactory,
    partition_router: PartitionRouter,
    stream_cursor: DatetimeBasedCursor,
):
    self._partition_router = partition_router
    self._per_partition_cursor = PerPartitionCursor(cursor_factory, partition_router)
    self._global_cursor = GlobalSubstreamCursor(stream_cursor, partition_router)
    self._use_global_cursor = False
    self._current_partition: Optional[Mapping[str, Any]] = None
    self._last_slice: bool = False
    self._parent_state: Optional[Mapping[str, Any]] = None
def stream_slices(self) -> Iterable[StreamSlice]:
    self._global_cursor.start_slices_generation()

    # Iterate through partitions and process slices
    for partition, is_last_partition, parent_state in iterate_with_last_flag_and_state(
        self._partition_router.stream_slices(), self._partition_router.get_stream_state
    ):
        # Generate slices for the current cursor and handle the last slice using the flag
        self._parent_state = parent_state
        for slice, is_last_slice, _ in iterate_with_last_flag_and_state(
            self._get_active_cursor().generate_slices_from_partition(partition=partition),
            lambda: None,
        ):
            self._global_cursor.register_slice(is_last_slice and is_last_partition)
            yield slice
    self._parent_state = self._partition_router.get_stream_state()
Defines stream slices
Returns
An iterable of stream slices
def set_initial_state(self, stream_state: StreamState) -> None:
    """
    Set the initial state for the cursors.
    """
    self._use_global_cursor = stream_state.get("use_global_cursor", False)

    self._parent_state = stream_state.get("parent_state", {})

    self._global_cursor.set_initial_state(stream_state)
    if not self._use_global_cursor:
        self._per_partition_cursor.set_initial_state(stream_state)
Set the initial state for the cursors.
def observe(self, stream_slice: StreamSlice, record: Record) -> None:
    if not self._use_global_cursor and self._per_partition_cursor.limit_reached():
        self._use_global_cursor = True

    if not self._use_global_cursor:
        self._per_partition_cursor.observe(stream_slice, record)
    self._global_cursor.observe(stream_slice, record)
Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.
Parameters
- stream_slice: The current slice, which may or may not contain the most recently observed record
- record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
    if not self._use_global_cursor:
        self._per_partition_cursor.close_slice(stream_slice, *args)
    self._global_cursor.close_slice(stream_slice, *args)
Update state based on the stream slice. Note that stream_slice.cursor_slice and most_recent_record.associated_slice are expected to be the same, but we make it explicit here that stream_slice should be leveraged to update the state. We do not pass in the latest record, since cursor instances should maintain the relevant internal state on their own.
Parameters
- stream_slice: slice to close
def get_stream_state(self) -> StreamState:
    final_state: MutableMapping[str, Any] = {"use_global_cursor": self._use_global_cursor}

    final_state.update(self._global_cursor.get_stream_state())
    if not self._use_global_cursor:
        final_state.update(self._per_partition_cursor.get_stream_state())

    final_state["parent_state"] = self._parent_state
    if not final_state.get("parent_state"):
        del final_state["parent_state"]

    return final_state
Returns the current stream state. We would like to restrict its usage since it exposes the internals of the state. As of 2023-06-14, it is used for the following:
- Interpolation of the requests
- Transformation of records
- Saving the state
For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.
def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
    return self._get_active_cursor().select_state(stream_slice)
Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
def get_request_params(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    return self._get_active_cursor().get_request_params(
        stream_state=stream_state,
        stream_slice=stream_slice,
        next_page_token=next_page_token,
    )
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g., you might want to define query parameters for paging if next_page_token is not None.
def get_request_headers(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    return self._get_active_cursor().get_request_headers(
        stream_state=stream_state,
        stream_slice=stream_slice,
        next_page_token=next_page_token,
    )
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
def get_request_body_data(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Union[Mapping[str, Any], str]:
    return self._get_active_cursor().get_request_body_data(
        stream_state=stream_state,
        stream_slice=stream_slice,
        next_page_token=next_page_token,
    )
Specifies how to populate the body of the request with a non-JSON payload.
If it returns a string, the text will be sent as-is; if it returns a dict, it will be converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
def get_request_body_json(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    return self._get_active_cursor().get_request_body_json(
        stream_state=stream_state,
        stream_slice=stream_slice,
        next_page_token=next_page_token,
    )
Specifies how to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
@dataclass
class ResumableFullRefreshCursor(DeclarativeCursor):
    parameters: InitVar[Mapping[str, Any]]

    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
        self._cursor: StreamState = {}

    def get_stream_state(self) -> StreamState:
        return self._cursor

    def set_initial_state(self, stream_state: StreamState) -> None:
        self._cursor = stream_state

    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
        """
        Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records.
        """
        pass

    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
        # The ResumableFullRefreshCursor doesn't support nested streams yet so receiving a partition is unexpected
        if stream_slice.partition:
            raise ValueError(
                f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}."
            )
        self._cursor = stream_slice.cursor_slice

    def should_be_synced(self, record: Record) -> bool:
        """
        Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages
        that don't have filterable bounds. We should always return them.
        """
        return True

    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
        # A top-level RFR cursor only manages the state of a single partition
        return self._cursor

    def stream_slices(self) -> Iterable[StreamSlice]:
        """
        Resumable full refresh cursors only return a single slice and can't perform partitioning because iteration is done per-page
        along an unbounded set.
        """
        yield from [StreamSlice(cursor_slice=self._cursor, partition={})]

    # This is an interesting pattern that might not seem obvious at first glance. This cursor itself has no functional need to
    # inject any request values into the outbound response because the up-to-date pagination state is already loaded and
    # maintained by the paginator component
    def get_request_params(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return {}

    def get_request_headers(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return {}

    def get_request_body_data(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return {}

    def get_request_body_json(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        return {}
Returns the current stream state. We would like to restrict its usage since it exposes the internals of the state. As of 2023-06-14, it is used for the following:
- Interpolation of the requests
- Transformation of records
- Saving the state
For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.
Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called before calling anything else.
Parameters
- stream_state: The state of the stream as returned by get_stream_state
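For example, with the ResumableFullRefreshCursor documented here, state must be installed before slicing begins so the sync resumes from the checkpoint. A minimal sketch; the state payload is a hypothetical page checkpoint, and the import path is this module's:

```python
from airbyte_cdk.legacy.sources.declarative.incremental import ResumableFullRefreshCursor

cursor = ResumableFullRefreshCursor(parameters={})
cursor.set_initial_state({"next_page_token": 3})  # hypothetical checkpoint shape

# Only after the state is set does stream_slices() reflect the checkpoint:
slices = list(cursor.stream_slices())
# -> a single slice whose cursor_slice is the restored checkpoint
```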
def observe(self, stream_slice: StreamSlice, record: Record) -> None:
    """
    Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records.
    """
    pass
Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records.
def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
    # The ResumableFullRefreshCursor doesn't support nested streams yet so receiving a partition is unexpected
    if stream_slice.partition:
        raise ValueError(
            f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}."
        )
    self._cursor = stream_slice.cursor_slice
Update state based on the stream slice. Note that stream_slice.cursor_slice and most_recent_record.associated_slice are expected to be the same, but we make it explicit here that stream_slice should be leveraged to update the state. We do not pass in the latest record, since cursor instances should maintain the relevant internal state on their own.
Parameters
- stream_slice: slice to close
def should_be_synced(self, record: Record) -> bool:
    """
    Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages
    that don't have filterable bounds. We should always return them.
    """
    return True
Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages that don't have filterable bounds. We should always return them.
def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
    # A top-level RFR cursor only manages the state of a single partition
    return self._cursor
Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
def stream_slices(self) -> Iterable[StreamSlice]:
    """
    Resumable full refresh cursors only return a single slice and can't perform partitioning because iteration is done per-page
    along an unbounded set.
    """
    yield from [StreamSlice(cursor_slice=self._cursor, partition={})]
Resumable full refresh cursors only return a single slice and can't perform partitioning because iteration is done per-page along an unbounded set.
def get_request_params(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    return {}
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g., you might want to define query parameters for paging if next_page_token is not None.
def get_request_headers(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    return {}
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
def get_request_body_data(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    return {}
Specifies how to populate the body of the request with a non-JSON payload.
If it returns a string, the text will be sent as-is; if it returns a dict, it will be converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
def get_request_body_json(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    return {}
Specifies how to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
@dataclass
class ChildPartitionResumableFullRefreshCursor(ResumableFullRefreshCursor):
    """
    The Sub-stream Resumable Cursor for Full-Refresh substreams.
    Follows the parent type `ResumableFullRefreshCursor` with a small override,
    to provide the ability to close the substream's slice once it has finished processing.

    Check the `close_slice` method override for more info about the actual behaviour of this cursor.
    """

    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
        """
        Once the current slice has finished syncing:
         - paginator returns None
         - no more slices to process

        we assume that the records have already been processed and emitted,
        thus we have to set the cursor to `__ab_full_refresh_sync_complete: true`,
        otherwise there is a risk of an infinite loop processing the same slice.
        """
        self._cursor = FULL_REFRESH_COMPLETE_STATE
The Sub-stream Resumable Cursor for Full-Refresh substreams. Follows the parent type ResumableFullRefreshCursor with a small override to provide the ability to close the substream's slice once it has finished processing.
Check the close_slice method override for more info about the actual behaviour of this cursor.
def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
    """
    Once the current slice has finished syncing:
     - paginator returns None
     - no more slices to process

    we assume that the records have already been processed and emitted,
    thus we have to set the cursor to `__ab_full_refresh_sync_complete: true`,
    otherwise there is a risk of an infinite loop processing the same slice.
    """
    self._cursor = FULL_REFRESH_COMPLETE_STATE
Once the current slice has finished syncing:
- the paginator returns None
- there are no more slices to process

we assume that the records have already been processed and emitted, so we have to set the cursor to __ab_full_refresh_sync_complete: true; otherwise there is a risk of an infinite loop processing the same slice.
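A short sketch of this terminal-state behavior; it assumes FULL_REFRESH_COMPLETE_STATE resolves to the completion marker named in the docstring, and it uses a plain dict rather than the CDK classes:

```python
# Illustrative stand-in for the CDK's FULL_REFRESH_COMPLETE_STATE constant,
# assumed to be the marker named in the docstring above.
FULL_REFRESH_COMPLETE_STATE = {"__ab_full_refresh_sync_complete": True}

cursor_state: dict = {}

def close_final_slice() -> None:
    """Mimics ChildPartitionResumableFullRefreshCursor.close_slice."""
    global cursor_state
    # Mark the substream slice as done so a resumed sync skips it instead of
    # looping over the same pages forever.
    cursor_state = FULL_REFRESH_COMPLETE_STATE

close_final_slice()
print(cursor_state)  # {'__ab_full_refresh_sync_complete': True}
```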