airbyte_cdk

Welcome to the Airbyte Python CDK!

The Airbyte Python CDK is a Python library that provides a set of tools to help you build connectors for the Airbyte platform.

Building Source Connectors

To build a source connector, you will want to refer to the following classes and modules:

  • airbyte_cdk.sources
  • airbyte_cdk.sources.concurrent_source
  • airbyte_cdk.sources.config
  • airbyte_cdk.sources.file_based
  • airbyte_cdk.sources.streams

Building Destination Connectors

To build a destination connector, you will want to refer to the following classes and modules:

  • airbyte_cdk.destinations
  • airbyte_cdk.destinations.Destination
  • airbyte_cdk.destinations.vector_db_based

Working with Airbyte Protocol Models

The Airbyte CDK provides a set of classes that help you work with the Airbyte protocol models:

  • airbyte_cdk.models.airbyte_protocol
  • airbyte_cdk.models.airbyte_protocol_serializers

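For example, here is a minimal sketch of emitting a record with these models; the stream name and payload below are placeholders, not part of the CDK:

import time

from airbyte_cdk import AirbyteMessage, AirbyteRecordMessage, Type

# Wrap a record payload in the protocol envelope expected by the Airbyte platform.
message = AirbyteMessage(
    type=Type.RECORD,
    record=AirbyteRecordMessage(
        stream="customers",                  # placeholder stream name
        data={"id": 1, "name": "Ada"},       # placeholder record payload
        emitted_at=int(time.time() * 1000),  # protocol expects epoch milliseconds
    ),
)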

API Reference


  1# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
  2"""
  3# Welcome to the Airbyte Python CDK!
  4
  5The Airbyte Python CDK is a Python library that provides a set of tools to help you build
  6connectors for the Airbyte platform.
  7
  8## Building Source Connectors
  9
 10To build a source connector, you will want to refer to
 11the following classes and modules:
 12
 13- `airbyte_cdk.sources`
 14- `airbyte_cdk.sources.concurrent_source`
 15- `airbyte_cdk.sources.config`
 16- `airbyte_cdk.sources.file_based`
 17- `airbyte_cdk.sources.streams`
 18
 19## Building Destination Connectors
 20
 21To build a destination connector, you will want to refer to
 22the following classes and modules:
 23
 24- `airbyte_cdk.destinations`
 25- `airbyte_cdk.destinations.Destination`
 26- `airbyte_cdk.destinations.vector_db_based`
 27
 28## Working with Airbyte Protocol Models
 29
 30The Airbyte CDK provides a set of classes that help you work with the Airbyte protocol models:
 31
 32- `airbyte_cdk.models.airbyte_protocol`
 33- `airbyte_cdk.models.airbyte_protocol_serializers`
 34
 35---
 36
 37API Reference
 38
 39---
 40
 41"""
 42
 43# Warning: The below imports are not stable and will cause circular
 44# dependencies if auto-sorted with isort. Please keep them in the same order.
 45# TODO: Submodules should import from lower-level modules, rather than importing from here.
 46# Imports should also be placed in `if TYPE_CHECKING` blocks if they are only used as type
 47# hints - again, to avoid circular dependencies.
 48# Once those issues are resolved, the below can be sorted with isort.
 49import dunamai as _dunamai
 50
 51from .config_observation import (
 52    create_connector_config_control_message,
 53    emit_configuration_as_airbyte_control_message,
 54)
 55from .connector import BaseConnector, Connector
 56from .destinations import Destination
 57from .entrypoint import AirbyteEntrypoint, launch
 58from .logger import AirbyteLogFormatter, init_logger
 59from .models import (
 60    AdvancedAuth,
 61    AirbyteConnectionStatus,
 62    AirbyteLogMessage,
 63    AirbyteMessage,
 64    AirbyteRecordMessage,
 65    AirbyteStream,
 66    ConfiguredAirbyteCatalog,
 67    ConfiguredAirbyteStream,
 68    ConnectorSpecification,
 69    DestinationSyncMode,
 70    FailureType,
 71    Level,
 72    OAuthConfigSpecification,
 73    OrchestratorType,
 74    Status,
 75    SyncMode,
 76    Type,
 77)
 78from .sources import AbstractSource, Source
 79from .sources.concurrent_source.concurrent_source import ConcurrentSource
 80from .sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
 81from .sources.config import BaseConfig
 82from .sources.connector_state_manager import ConnectorStateManager
 83from .sources.declarative.auth import DeclarativeOauth2Authenticator
 84from .sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator, NoAuth
 85from .sources.declarative.auth.oauth import DeclarativeSingleUseRefreshTokenOauth2Authenticator
 86from .sources.declarative.auth.token import (
 87    ApiKeyAuthenticator,
 88    BasicHttpAuthenticator,
 89    BearerAuthenticator,
 90)
 91from .sources.declarative.datetime.min_max_datetime import MinMaxDatetime
 92from .sources.declarative.declarative_stream import DeclarativeStream
 93from .sources.declarative.decoders import Decoder, JsonDecoder
 94from .sources.declarative.exceptions import ReadException
 95from .sources.declarative.extractors import DpathExtractor, RecordSelector
 96from .sources.declarative.extractors.record_extractor import RecordExtractor
 97from .sources.declarative.extractors.record_filter import RecordFilter
 98from .sources.declarative.incremental import DatetimeBasedCursor
 99from .sources.declarative.interpolation import InterpolatedBoolean, InterpolatedString
100from .sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
101from .sources.declarative.migrations.legacy_to_per_partition_state_migration import (
102    LegacyToPerPartitionStateMigration,
103)
104from .sources.declarative.partition_routers import (
105    CartesianProductStreamSlicer,
106    SinglePartitionRouter,
107    SubstreamPartitionRouter,
108)
109from .sources.declarative.partition_routers.substream_partition_router import ParentStreamConfig
110from .sources.declarative.requesters import HttpRequester, Requester
111from .sources.declarative.requesters.error_handlers import BackoffStrategy
112from .sources.declarative.requesters.paginators import DefaultPaginator, PaginationStrategy
113from .sources.declarative.requesters.paginators.strategies import (
114    CursorPaginationStrategy,
115    OffsetIncrement,
116    PageIncrement,
117    StopConditionPaginationStrategyDecorator,
118)
119from .sources.declarative.requesters.request_option import RequestOption, RequestOptionType
120from .sources.declarative.requesters.request_options.default_request_options_provider import (
121    DefaultRequestOptionsProvider,
122)
123from .sources.declarative.requesters.request_options.interpolated_request_input_provider import (
124    InterpolatedRequestInputProvider,
125)
126from .sources.declarative.requesters.requester import HttpMethod
127from .sources.declarative.retrievers import SimpleRetriever
128from .sources.declarative.schema import JsonFileSchemaLoader
129from .sources.declarative.transformations.add_fields import AddedFieldDefinition, AddFields
130from .sources.declarative.transformations.transformation import RecordTransformation
131from .sources.declarative.types import FieldPointer
132from .sources.declarative.yaml_declarative_source import YamlDeclarativeSource
133from .sources.message import InMemoryMessageRepository, MessageRepository
134from .sources.source import TState
135from .sources.streams.availability_strategy import AvailabilityStrategy
136from .sources.streams.call_rate import (
137    AbstractAPIBudget,
138    CachedLimiterSession,
139    HttpAPIBudget,
140    HttpRequestMatcher,
141    LimiterSession,
142    MovingWindowCallRatePolicy,
143    Rate,
144)
145from .sources.streams.checkpoint import Cursor as LegacyCursor
146from .sources.streams.checkpoint import ResumableFullRefreshCursor
147from .sources.streams.concurrent.adapters import StreamFacade
148from .sources.streams.concurrent.cursor import (
149    ConcurrentCursor,
150    Cursor,
151    CursorField,
152    FinalStateCursor,
153)
154from .sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
155    EpochValueConcurrentStreamStateConverter,
156    IsoMillisConcurrentStreamStateConverter,
157)
158from .sources.streams.core import IncrementalMixin, Stream, package_name_from_class
159from .sources.streams.http import HttpStream, HttpSubStream
160from .sources.streams.http.availability_strategy import HttpAvailabilityStrategy
161from .sources.streams.http.exceptions import (
162    BaseBackoffException,
163    DefaultBackoffException,
164    UserDefinedBackoffException,
165)
166from .sources.streams.http.rate_limiting import default_backoff_handler
167from .sources.streams.http.requests_native_auth import (
168    Oauth2Authenticator,
169    SingleUseRefreshTokenOauth2Authenticator,
170    TokenAuthenticator,
171)
172from .sources.streams.http.requests_native_auth.abstract_token import AbstractHeaderAuthenticator
173from .sources.types import Config, Record, StreamSlice
174from .sources.utils import casing
175from .sources.utils.schema_helpers import (
176    InternalConfig,
177    ResourceSchemaLoader,
178    check_config_against_spec_or_exit,
179    expand_refs,
180    split_config,
181)
182from .sources.utils.transform import TransformConfig, TypeTransformer
183from .utils import AirbyteTracedException, is_cloud_environment
184from .utils.constants import ENV_REQUEST_CACHE_PATH
185from .utils.event_timing import create_timer
186from .utils.oneof_option_config import OneOfOptionConfig
187from .utils.spec_schema_transformations import resolve_refs
188from .utils.stream_status_utils import as_airbyte_message
189
190__all__ = [
191    # Availability strategy
192    "AvailabilityStrategy",
193    "HttpAvailabilityStrategy",
194    # Checkpoint
195    "LegacyCursor",
196    "ResumableFullRefreshCursor",
197    # Concurrent
198    "ConcurrentCursor",
199    "ConcurrentSource",
200    "ConcurrentSourceAdapter",
201    "Cursor",
202    "CursorField",
203    "DEFAULT_CONCURRENCY",
204    "EpochValueConcurrentStreamStateConverter",
205    "FinalStateCursor",
206    "IsoMillisConcurrentStreamStateConverter",
207    "StreamFacade",
208    # Config observation
209    "create_connector_config_control_message",
210    "emit_configuration_as_airbyte_control_message",
211    # Connector
212    "AbstractSource",
213    "BaseConfig",
214    "BaseConnector",
215    "Connector",
216    "Destination",
217    "Source",
218    "TState",
219    # Declarative
220    "AddFields",
221    "AddedFieldDefinition",
222    "ApiKeyAuthenticator",
223    "BackoffStrategy",
224    "BasicHttpAuthenticator",
225    "BearerAuthenticator",
226    "CartesianProductStreamSlicer",
227    "CursorPaginationStrategy",
228    "DatetimeBasedCursor",
229    "DeclarativeAuthenticator",
230    "DeclarativeOauth2Authenticator",
231    "DeclarativeSingleUseRefreshTokenOauth2Authenticator",
232    "DeclarativeStream",
233    "Decoder",
234    "DefaultPaginator",
235    "DefaultRequestOptionsProvider",
236    "DpathExtractor",
237    "FieldPointer",
238    "HttpMethod",
239    "HttpRequester",
240    "InterpolatedBoolean",
241    "InterpolatedRequestInputProvider",
242    "InterpolatedString",
243    "JsonDecoder",
244    "JsonFileSchemaLoader",
245    "LegacyToPerPartitionStateMigration",
246    "ManifestDeclarativeSource",
247    "MinMaxDatetime",
248    "NoAuth",
249    "OffsetIncrement",
250    "PageIncrement",
251    "PaginationStrategy",
252    "ParentStreamConfig",
253    "ReadException",
254    "RecordExtractor",
255    "RecordFilter",
256    "RecordSelector",
257    "RecordTransformation",
258    "RequestOption",
259    "RequestOptionType",
260    "Requester",
261    "ResponseStatus",
262    "SimpleRetriever",
263    "SinglePartitionRouter",
264    "StopConditionPaginationStrategyDecorator",
265    "StreamSlice",
266    "SubstreamPartitionRouter",
267    "YamlDeclarativeSource",
268    # Entrypoint
269    "launch",
270    "AirbyteEntrypoint",
271    # HTTP
272    "AbstractAPIBudget",
273    "AbstractHeaderAuthenticator",
274    "BaseBackoffException",
275    "CachedLimiterSession",
276    "DefaultBackoffException",
277    "default_backoff_handler",
278    "HttpAPIBudget",
279    "HttpAuthenticator",
280    "HttpRequestMatcher",
281    "HttpStream",
282    "HttpSubStream",
283    "LimiterSession",
284    "MovingWindowCallRatePolicy",
285    "MultipleTokenAuthenticator",
286    "Oauth2Authenticator",
287    "Rate",
288    "SingleUseRefreshTokenOauth2Authenticator",
289    "TokenAuthenticator",
290    "UserDefinedBackoffException",
291    # Logger
292    "AirbyteLogFormatter",
293    "init_logger",
294    # Protocol classes
295    "AirbyteStream",
296    "AirbyteConnectionStatus",
297    "AirbyteMessage",
298    "ConfiguredAirbyteCatalog",
299    "Status",
300    "Type",
301    "OrchestratorType",
302    "ConfiguredAirbyteStream",
303    "DestinationSyncMode",
304    "SyncMode",
305    "FailureType",
306    "AdvancedAuth",
307    "AirbyteLogMessage",
308    "OAuthConfigSpecification",
309    "ConnectorSpecification",
310    "Level",
311    "AirbyteRecordMessage",
312    # Repository
313    "InMemoryMessageRepository",
314    "MessageRepository",
315    # State management
316    "ConnectorStateManager",
317    # Stream
318    "IncrementalMixin",
319    "Stream",
320    "StreamData",
321    "package_name_from_class",
322    # Utils
323    "AirbyteTracedException",
324    "is_cloud_environment",
325    "casing",
326    "InternalConfig",
327    "ResourceSchemaLoader",
328    "check_config_against_spec_or_exit",
329    "split_config",
330    "TransformConfig",
331    "TypeTransformer",
332    "ENV_REQUEST_CACHE_PATH",
333    "create_timer",
334    "OneOfOptionConfig",
335    "resolve_refs",
336    "as_airbyte_message",
337    # Types
338    "Config",
339    "Record",
340    "Source",
341    "StreamSlice",
342]
343
344__version__: str
345"""Version generated by poetry dynamic versioning during publish.
346
347When running in development, dunamai will calculate a new prerelease version
348from existing git release tag info.
349"""
350
351try:
352    __version__ = _dunamai.get_version(
353        "airbyte-cdk",
354        third_choice=_dunamai.Version.from_any_vcs,
355        fallback=_dunamai.Version("0.0.0+dev"),
356    ).serialize()
357except:
358    __version__ = "0.0.0+dev"
class AvailabilityStrategy(abc.ABC):
18class AvailabilityStrategy(ABC):
19    """
20    Abstract base class for checking stream availability.
21    """
22
23    @abstractmethod
24    def check_availability(
25        self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None
26    ) -> Tuple[bool, Optional[str]]:
27        """
28        Checks stream availability.
29
30        :param stream: stream
31        :param logger: source logger
32        :param source: (optional) source
33        :return: A tuple of (boolean, str). If boolean is true, then the stream
34          is available, and no str is required. Otherwise, the stream is unavailable
35          for some reason and the str should describe what went wrong and how to
36          resolve the unavailability, if possible.
37        """
38
39    @staticmethod
40    def get_first_stream_slice(stream: Stream) -> Optional[Mapping[str, Any]]:
41        """
42        Gets the first stream_slice from a given stream's stream_slices.
43        :param stream: stream
44        :raises StopIteration: if there is no first slice to return (the stream_slices generator is empty)
45        :return: first stream slice from 'stream_slices' generator (`None` is a valid stream slice)
46        """
47        # We wrap the return output of stream_slices() because some implementations return types that are iterable,
48        # but not iterators such as lists or tuples
49        slices = iter(
50            stream.stream_slices(
51                cursor_field=stream.cursor_field,  # type: ignore[arg-type]
52                sync_mode=SyncMode.full_refresh,
53            )
54        )
55        return next(slices)
56
57    @staticmethod
58    def get_first_record_for_slice(
59        stream: Stream, stream_slice: Optional[Mapping[str, Any]]
60    ) -> StreamData:
61        """
62        Gets the first record for a stream_slice of a stream.
63
64        :param stream: stream instance from which to read records
65        :param stream_slice: stream_slice parameters for slicing the stream
66        :raises StopIteration: if there is no first record to return (the read_records generator is empty)
67        :return: StreamData containing the first record in the slice
68        """
69        # Store the original value of exit_on_rate_limit
70        original_exit_on_rate_limit = stream.exit_on_rate_limit
71
72        try:
73            # Ensure exit_on_rate_limit is safely set to True if possible
74            stream.exit_on_rate_limit = True
75
76            # We wrap the return output of read_records() because some implementations return types that are iterable,
77            # but not iterators such as lists or tuples
78            records_for_slice = iter(
79                stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice)
80            )
81
82            return next(records_for_slice)
83        finally:
84            # Restore the original exit_on_rate_limit value
85            stream.exit_on_rate_limit = original_exit_on_rate_limit

Abstract base class for checking stream availability.

@abstractmethod
def check_availability( self, stream: Stream, logger: logging.Logger, source: Optional[Source] = None) -> Tuple[bool, Optional[str]]:
23    @abstractmethod
24    def check_availability(
25        self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None
26    ) -> Tuple[bool, Optional[str]]:
27        """
28        Checks stream availability.
29
30        :param stream: stream
31        :param logger: source logger
32        :param source: (optional) source
33        :return: A tuple of (boolean, str). If boolean is true, then the stream
34          is available, and no str is required. Otherwise, the stream is unavailable
35          for some reason and the str should describe what went wrong and how to
36          resolve the unavailability, if possible.
37        """

Checks stream availability.

Parameters
  • stream: stream
  • logger: source logger
  • source: (optional) source
Returns

A tuple of (boolean, str). If boolean is true, then the stream is available, and no str is required. Otherwise, the stream is unavailable for some reason and the str should describe what went wrong and how to resolve the unavailability, if possible.
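A minimal sketch of a concrete strategy follows; the class name is hypothetical, and a real implementation would probe the API instead of returning unconditionally:

import logging
from typing import Optional, Tuple

from airbyte_cdk import AvailabilityStrategy, Source, Stream


class AlwaysAvailableStrategy(AvailabilityStrategy):
    """Hypothetical strategy that reports every stream as available."""

    def check_availability(
        self, stream: Stream, logger: logging.Logger, source: Optional[Source] = None
    ) -> Tuple[bool, Optional[str]]:
        # A real strategy would attempt a cheap read here and return (False, reason) on failure.
        return True, None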

@staticmethod
def get_first_stream_slice( stream: Stream) -> Optional[Mapping[str, Any]]:
39    @staticmethod
40    def get_first_stream_slice(stream: Stream) -> Optional[Mapping[str, Any]]:
41        """
42        Gets the first stream_slice from a given stream's stream_slices.
43        :param stream: stream
44        :raises StopIteration: if there is no first slice to return (the stream_slices generator is empty)
45        :return: first stream slice from 'stream_slices' generator (`None` is a valid stream slice)
46        """
47        # We wrap the return output of stream_slices() because some implementations return types that are iterable,
48        # but not iterators such as lists or tuples
49        slices = iter(
50            stream.stream_slices(
51                cursor_field=stream.cursor_field,  # type: ignore[arg-type]
52                sync_mode=SyncMode.full_refresh,
53            )
54        )
55        return next(slices)

Gets the first stream_slice from a given stream's stream_slices.

Parameters
  • stream: stream
Raises
  • StopIteration: if there is no first slice to return (the stream_slices generator is empty)
Returns

first stream slice from 'stream_slices' generator (None is a valid stream slice)

@staticmethod
def get_first_record_for_slice( stream: Stream, stream_slice: Optional[Mapping[str, Any]]) -> Union[Mapping[str, Any], AirbyteMessage]:
57    @staticmethod
58    def get_first_record_for_slice(
59        stream: Stream, stream_slice: Optional[Mapping[str, Any]]
60    ) -> StreamData:
61        """
62        Gets the first record for a stream_slice of a stream.
63
64        :param stream: stream instance from which to read records
65        :param stream_slice: stream_slice parameters for slicing the stream
66        :raises StopIteration: if there is no first record to return (the read_records generator is empty)
67        :return: StreamData containing the first record in the slice
68        """
69        # Store the original value of exit_on_rate_limit
70        original_exit_on_rate_limit = stream.exit_on_rate_limit
71
72        try:
73            # Ensure exit_on_rate_limit is safely set to True if possible
74            stream.exit_on_rate_limit = True
75
76            # We wrap the return output of read_records() because some implementations return types that are iterable,
77            # but not iterators such as lists or tuples
78            records_for_slice = iter(
79                stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice)
80            )
81
82            return next(records_for_slice)
83        finally:
84            # Restore the original exit_on_rate_limit value
85            stream.exit_on_rate_limit = original_exit_on_rate_limit

Gets the first record for a stream_slice of a stream.

Parameters
  • stream: stream instance from which to read records
  • stream_slice: stream_slice parameters for slicing the stream
Raises
  • StopIteration: if there is no first record to return (the read_records generator is empty)
Returns

StreamData containing the first record in the slice

class HttpAvailabilityStrategy(airbyte_cdk.AvailabilityStrategy):
18class HttpAvailabilityStrategy(AvailabilityStrategy):
19    def check_availability(
20        self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None
21    ) -> Tuple[bool, Optional[str]]:
22        """
23        Check stream availability by attempting to read the first record of the
24        stream.
25
26        :param stream: stream
27        :param logger: source logger
28        :param source: (optional) source
29        :return: A tuple of (boolean, str). If boolean is true, then the stream
30          is available, and no str is required. Otherwise, the stream is unavailable
31          for some reason and the str should describe what went wrong and how to
32          resolve the unavailability, if possible.
33        """
34        reason: Optional[str]
35        try:
36            # Some streams need a stream slice to read records (e.g. if they have a SubstreamPartitionRouter)
37            # Streams that don't need a stream slice will return `None` as their first stream slice.
38            stream_slice = self.get_first_stream_slice(stream)
39        except StopIteration:
40            # If stream_slices has no `next()` item (Note - this is different from stream_slices returning [None]!)
41            # This can happen when a substream's `stream_slices` method does a `for record in parent_records: yield <something>`
42            # without accounting for the case in which the parent stream is empty.
43            reason = f"Cannot attempt to connect to stream {stream.name} - no stream slices were found, likely because the parent stream is empty."
44            return False, reason
45        except AirbyteTracedException as error:
46            return False, error.message
47
48        try:
49            self.get_first_record_for_slice(stream, stream_slice)
50            return True, None
51        except StopIteration:
52            logger.info(f"Successfully connected to stream {stream.name}, but got 0 records.")
53            return True, None
54        except AirbyteTracedException as error:
55            return False, error.message

Abstract base class for checking stream availability.

def check_availability( self, stream: Stream, logger: logging.Logger, source: Optional[Source] = None) -> Tuple[bool, Optional[str]]:
19    def check_availability(
20        self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None
21    ) -> Tuple[bool, Optional[str]]:
22        """
23        Check stream availability by attempting to read the first record of the
24        stream.
25
26        :param stream: stream
27        :param logger: source logger
28        :param source: (optional) source
29        :return: A tuple of (boolean, str). If boolean is true, then the stream
30          is available, and no str is required. Otherwise, the stream is unavailable
31          for some reason and the str should describe what went wrong and how to
32          resolve the unavailability, if possible.
33        """
34        reason: Optional[str]
35        try:
36            # Some streams need a stream slice to read records (e.g. if they have a SubstreamPartitionRouter)
37            # Streams that don't need a stream slice will return `None` as their first stream slice.
38            stream_slice = self.get_first_stream_slice(stream)
39        except StopIteration:
40            # If stream_slices has no `next()` item (Note - this is different from stream_slices returning [None]!)
41            # This can happen when a substream's `stream_slices` method does a `for record in parent_records: yield <something>`
42            # without accounting for the case in which the parent stream is empty.
43            reason = f"Cannot attempt to connect to stream {stream.name} - no stream slices were found, likely because the parent stream is empty."
44            return False, reason
45        except AirbyteTracedException as error:
46            return False, error.message
47
48        try:
49            self.get_first_record_for_slice(stream, stream_slice)
50            return True, None
51        except StopIteration:
52            logger.info(f"Successfully connected to stream {stream.name}, but got 0 records.")
53            return True, None
54        except AirbyteTracedException as error:
55            return False, error.message

Check stream availability by attempting to read the first record of the stream.

Parameters
  • stream: stream
  • logger: source logger
  • source: (optional) source
Returns

A tuple of (boolean, str). If boolean is true, then the stream is available, and no str is required. Otherwise, the stream is unavailable for some reason and the str should describe what went wrong and how to resolve the unavailability, if possible.
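A sketch of running the strategy against a stream; the stream class below, its endpoint, and its fields are assumptions made for illustration only:

import logging
from typing import Any, Iterable, Mapping, Optional

import requests

from airbyte_cdk import HttpAvailabilityStrategy, HttpStream


class ExampleStream(HttpStream):
    # Hypothetical stream: the endpoint, path, and primary key are placeholders.
    url_base = "https://api.example.com/v1/"
    primary_key = "id"

    def path(self, **kwargs: Any) -> str:
        return "items"

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        return None  # a single page is enough for an availability probe

    def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable[Mapping[str, Any]]:
        yield from response.json().get("items", [])


logger = logging.getLogger("airbyte")
is_available, reason = HttpAvailabilityStrategy().check_availability(ExampleStream(), logger)
if not is_available:
    logger.warning(f"Stream unavailable: {reason}")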

@dataclass
class ResumableFullRefreshCursor(airbyte_cdk.sources.streams.checkpoint.cursor.Cursor):
11@dataclass
12class ResumableFullRefreshCursor(Cursor):
13    """
14    Cursor that allows for the checkpointing of sync progress according to a synthetic cursor based on the pagination state
15    of the stream. Resumable full refresh syncs are only intended to retain state in between sync attempts of the same job
16    with the platform responsible for removing said state.
17    """
18
19    def __init__(self) -> None:
20        self._cursor: StreamState = {}
21
22    def get_stream_state(self) -> StreamState:
23        return self._cursor
24
25    def set_initial_state(self, stream_state: StreamState) -> None:
26        self._cursor = stream_state
27
28    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
29        """
30        Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records.
31        """
32        pass
33
34    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
35        self._cursor = stream_slice.cursor_slice
36
37    def should_be_synced(self, record: Record) -> bool:
38        """
39        Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages
40        that don't have filterable bounds. We should always return them.
41        """
42        return True
43
44    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
45        """
46        RFR record don't have ordering to be compared between one another.
47        """
48        return False
49
50    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
51        # A top-level RFR cursor only manages the state of a single partition
52        return self._cursor

Cursor that allows for the checkpointing of sync progress according to a synthetic cursor based on the pagination state of the stream. Resumable full refresh syncs are only intended to retain state in between sync attempts of the same job, with the platform responsible for removing said state.

def get_stream_state(self) -> Mapping[str, Any]:
22    def get_stream_state(self) -> StreamState:
23        return self._cursor

Returns the current stream state. We would like to restrict its usage since it exposes the internals of the state. As of 2023-06-14, it is used for three things:

  • Interpolation of the requests
  • Transformation of records
  • Saving the state

For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.

def set_initial_state(self, stream_state: Mapping[str, Any]) -> None:
25    def set_initial_state(self, stream_state: StreamState) -> None:
26        self._cursor = stream_state

Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called before calling anything else.

Parameters
  • stream_state: The state of the stream as returned by get_stream_state
def observe( self, stream_slice: StreamSlice, record: Record) -> None:
28    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
29        """
30        Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records.
31        """
32        pass

Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records.

def close_slice( self, stream_slice: StreamSlice, *args: Any) -> None:
34    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
35        self._cursor = stream_slice.cursor_slice

Update state based on the stream slice. Note that stream_slice.cursor_slice and most_recent_record.associated_slice are expected to be the same but we make it explicit here that stream_slice should be leveraged to update the state. We do not pass in the latest record, since cursor instances should maintain the relevant internal state on their own.

Parameters
  • stream_slice: slice to close
def should_be_synced(self, record: Record) -> bool:
37    def should_be_synced(self, record: Record) -> bool:
38        """
39        Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages
40        that don't have filterable bounds. We should always return them.
41        """
42        return True

Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages that don't have filterable bounds. We should always return them.

def is_greater_than_or_equal( self, first: Record, second: Record) -> bool:
44    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
45        """
46        RFR record don't have ordering to be compared between one another.
47        """
48        return False

RFR records don't have an ordering that can be compared to one another.

def select_state( self, stream_slice: Optional[StreamSlice] = None) -> Optional[Mapping[str, Any]]:
50    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
51        # A top-level RFR cursor only manages the state of a single partition
52        return self._cursor

Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
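A minimal sketch of the checkpointing lifecycle; the page-based cursor_slice contents are an assumption for illustration:

from airbyte_cdk import ResumableFullRefreshCursor, StreamSlice

cursor = ResumableFullRefreshCursor()
cursor.set_initial_state({})  # first attempt: no prior state

# After a page has been read, the slice is closed and the cursor retains the
# pagination state so a failed job can resume from this point on the next attempt.
cursor.close_slice(StreamSlice(partition={}, cursor_slice={"next_page_token": 2}))
print(cursor.get_stream_state())  # {'next_page_token': 2}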

class ConcurrentCursor(airbyte_cdk.Cursor):
128class ConcurrentCursor(Cursor):
129    _START_BOUNDARY = 0
130    _END_BOUNDARY = 1
131
132    def __init__(
133        self,
134        stream_name: str,
135        stream_namespace: Optional[str],
136        stream_state: Any,
137        message_repository: MessageRepository,
138        connector_state_manager: ConnectorStateManager,
139        connector_state_converter: AbstractStreamStateConverter,
140        cursor_field: CursorField,
141        slice_boundary_fields: Optional[Tuple[str, str]],
142        start: Optional[CursorValueType],
143        end_provider: Callable[[], CursorValueType],
144        lookback_window: Optional[GapType] = None,
145        slice_range: Optional[GapType] = None,
146        cursor_granularity: Optional[GapType] = None,
147        clamping_strategy: ClampingStrategy = NoClamping(),
148    ) -> None:
149        self._stream_name = stream_name
150        self._stream_namespace = stream_namespace
151        self._message_repository = message_repository
152        self._connector_state_converter = connector_state_converter
153        self._connector_state_manager = connector_state_manager
154        self._cursor_field = cursor_field
155        # To see some example where the slice boundaries might not be defined, check https://github.com/airbytehq/airbyte/blob/1ce84d6396e446e1ac2377362446e3fb94509461/airbyte-integrations/connectors/source-stripe/source_stripe/streams.py#L363-L379
156        self._slice_boundary_fields = slice_boundary_fields
157        self._start = start
158        self._end_provider = end_provider
159        self.start, self._concurrent_state = self._get_concurrent_state(stream_state)
160        self._lookback_window = lookback_window
161        self._slice_range = slice_range
162        self._most_recent_cursor_value_per_partition: MutableMapping[
163            Union[StreamSlice, Mapping[str, Any], None], Any
164        ] = {}
165        self._has_closed_at_least_one_slice = False
166        self._cursor_granularity = cursor_granularity
167        # Flag to track if the logger has been triggered (per stream)
168        self._should_be_synced_logger_triggered = False
169        self._clamping_strategy = clamping_strategy
170
171    @property
172    def state(self) -> MutableMapping[str, Any]:
173        return self._connector_state_converter.convert_to_state_message(
174            self.cursor_field, self._concurrent_state
175        )
176
177    @property
178    def cursor_field(self) -> CursorField:
179        return self._cursor_field
180
181    @property
182    def _slice_boundary_fields_wrapper(self) -> Tuple[str, str]:
183        return (
184            self._slice_boundary_fields
185            if self._slice_boundary_fields
186            else (
187                self._connector_state_converter.START_KEY,
188                self._connector_state_converter.END_KEY,
189            )
190        )
191
192    def _get_concurrent_state(
193        self, state: MutableMapping[str, Any]
194    ) -> Tuple[CursorValueType, MutableMapping[str, Any]]:
195        if self._connector_state_converter.is_state_message_compatible(state):
196            return (
197                self._start or self._connector_state_converter.zero_value,
198                self._connector_state_converter.deserialize(state),
199            )
200        return self._connector_state_converter.convert_from_sequential_state(
201            self._cursor_field, state, self._start
202        )
203
204    def observe(self, record: Record) -> None:
205        most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get(
206            record.associated_slice
207        )
208        try:
209            cursor_value = self._extract_cursor_value(record)
210
211            if most_recent_cursor_value is None or most_recent_cursor_value < cursor_value:
212                self._most_recent_cursor_value_per_partition[record.associated_slice] = cursor_value
213        except ValueError:
214            self._log_for_record_without_cursor_value()
215
216    def _extract_cursor_value(self, record: Record) -> Any:
217        return self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
218
219    def close_partition(self, partition: Partition) -> None:
220        slice_count_before = len(self._concurrent_state.get("slices", []))
221        self._add_slice_to_state(partition)
222        if slice_count_before < len(
223            self._concurrent_state["slices"]
224        ):  # only emit if at least one slice has been processed
225            self._merge_partitions()
226            self._emit_state_message()
227        self._has_closed_at_least_one_slice = True
228
229    def _add_slice_to_state(self, partition: Partition) -> None:
230        most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get(
231            partition.to_slice()
232        )
233
234        if self._slice_boundary_fields:
235            if "slices" not in self._concurrent_state:
236                raise RuntimeError(
237                    f"The state for stream {self._stream_name} should have at least one slice to delineate the sync start time, but no slices are present. This is unexpected. Please contact Support."
238                )
239            self._concurrent_state["slices"].append(
240                {
241                    self._connector_state_converter.START_KEY: self._extract_from_slice(
242                        partition, self._slice_boundary_fields[self._START_BOUNDARY]
243                    ),
244                    self._connector_state_converter.END_KEY: self._extract_from_slice(
245                        partition, self._slice_boundary_fields[self._END_BOUNDARY]
246                    ),
247                    self._connector_state_converter.MOST_RECENT_RECORD_KEY: most_recent_cursor_value,
248                }
249            )
250        elif most_recent_cursor_value:
251            if self._has_closed_at_least_one_slice:
252                # If we track state value using records cursor field, we can only do that if there is one partition. This is because we save
253                # the state every time we close a partition. We assume that if there are multiple slices, they need to be providing
254                # boundaries. There are cases where partitions could not have boundaries:
255                # * The cursor should be per-partition
256                # * The stream state is actually the parent stream state
257                # There might be other cases not listed above. Those are not supported today hence the stream should not use this cursor for
258                # state management. For the specific user that was affected with this issue, we need to:
259                # * Fix state tracking (which is currently broken)
260                # * Make the new version available
261                # * (Probably) ask the user to reset the stream to avoid data loss
262                raise ValueError(
263                    "Given that slice_boundary_fields is not defined and that per-partition state is not supported, only one slice is "
264                    "expected. Please contact the Airbyte team."
265                )
266
267            self._concurrent_state["slices"].append(
268                {
269                    self._connector_state_converter.START_KEY: self.start,
270                    self._connector_state_converter.END_KEY: most_recent_cursor_value,
271                    self._connector_state_converter.MOST_RECENT_RECORD_KEY: most_recent_cursor_value,
272                }
273            )
274
275    def _emit_state_message(self) -> None:
276        self._connector_state_manager.update_state_for_stream(
277            self._stream_name,
278            self._stream_namespace,
279            self.state,
280        )
281        state_message = self._connector_state_manager.create_state_message(
282            self._stream_name, self._stream_namespace
283        )
284        self._message_repository.emit_message(state_message)
285
286    def _merge_partitions(self) -> None:
287        self._concurrent_state["slices"] = self._connector_state_converter.merge_intervals(
288            self._concurrent_state["slices"]
289        )
290
291    def _extract_from_slice(self, partition: Partition, key: str) -> CursorValueType:
292        try:
293            _slice = partition.to_slice()
294            if not _slice:
295                raise KeyError(f"Could not find key `{key}` in empty slice")
296            return self._connector_state_converter.parse_value(_slice[key])  # type: ignore  # we expect the devs to specify a key that would return a CursorValueType
297        except KeyError as exception:
298            raise KeyError(
299                f"Partition is expected to have key `{key}` but could not be found"
300            ) from exception
301
302    def ensure_at_least_one_state_emitted(self) -> None:
303        """
304        The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
305        called.
306        """
307        self._emit_state_message()
308
309    def stream_slices(self) -> Iterable[StreamSlice]:
310        """
311        Generating slices based on a few parameters:
312        * lookback_window: Buffer to remove from END_KEY of the highest slice
313        * slice_range: Max difference between two slices. If the difference between two slices is greater, multiple slices will be created
314        * start: `_split_per_slice_range` will clip any value to `self._start which means that:
315          * if upper is less than self._start, no slices will be generated
316          * if lower is less than self._start, self._start will be used as the lower boundary (lookback_window will not be considered in that case)
317
318        Note that the slices will overlap at their boundaries. We therefore expect to have at least the lower or the upper boundary to be
319        inclusive in the API that is queried.
320        """
321        self._merge_partitions()
322
323        if self._start is not None and self._is_start_before_first_slice():
324            yield from self._split_per_slice_range(
325                self._start,
326                self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY],
327                False,
328            )
329
330        if len(self._concurrent_state["slices"]) == 1:
331            yield from self._split_per_slice_range(
332                self._calculate_lower_boundary_of_last_slice(
333                    self._concurrent_state["slices"][0][self._connector_state_converter.END_KEY]
334                ),
335                self._end_provider(),
336                True,
337            )
338        elif len(self._concurrent_state["slices"]) > 1:
339            for i in range(len(self._concurrent_state["slices"]) - 1):
340                if self._cursor_granularity:
341                    yield from self._split_per_slice_range(
342                        self._concurrent_state["slices"][i][self._connector_state_converter.END_KEY]
343                        + self._cursor_granularity,
344                        self._concurrent_state["slices"][i + 1][
345                            self._connector_state_converter.START_KEY
346                        ],
347                        False,
348                    )
349                else:
350                    yield from self._split_per_slice_range(
351                        self._concurrent_state["slices"][i][
352                            self._connector_state_converter.END_KEY
353                        ],
354                        self._concurrent_state["slices"][i + 1][
355                            self._connector_state_converter.START_KEY
356                        ],
357                        False,
358                    )
359            yield from self._split_per_slice_range(
360                self._calculate_lower_boundary_of_last_slice(
361                    self._concurrent_state["slices"][-1][self._connector_state_converter.END_KEY]
362                ),
363                self._end_provider(),
364                True,
365            )
366        else:
367            raise ValueError("Expected at least one slice")
368
369    def _is_start_before_first_slice(self) -> bool:
370        return (
371            self._start is not None
372            and self._start
373            < self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY]
374        )
375
376    def _calculate_lower_boundary_of_last_slice(
377        self, lower_boundary: CursorValueType
378    ) -> CursorValueType:
379        if self._lookback_window:
380            return lower_boundary - self._lookback_window
381        return lower_boundary
382
383    def _split_per_slice_range(
384        self, lower: CursorValueType, upper: CursorValueType, upper_is_end: bool
385    ) -> Iterable[StreamSlice]:
386        if lower >= upper:
387            return
388
389        if self._start and upper < self._start:
390            return
391
392        lower = max(lower, self._start) if self._start else lower
393        if not self._slice_range or self._evaluate_upper_safely(lower, self._slice_range) >= upper:
394            clamped_lower = self._clamping_strategy.clamp(lower)
395            clamped_upper = self._clamping_strategy.clamp(upper)
396            start_value, end_value = (
397                (clamped_lower, clamped_upper - self._cursor_granularity)
398                if self._cursor_granularity and not upper_is_end
399                else (clamped_lower, clamped_upper)
400            )
401            yield StreamSlice(
402                partition={},
403                cursor_slice={
404                    self._slice_boundary_fields_wrapper[
405                        self._START_BOUNDARY
406                    ]: self._connector_state_converter.output_format(start_value),
407                    self._slice_boundary_fields_wrapper[
408                        self._END_BOUNDARY
409                    ]: self._connector_state_converter.output_format(end_value),
410                },
411            )
412        else:
413            stop_processing = False
414            current_lower_boundary = lower
415            while not stop_processing:
416                current_upper_boundary = min(
417                    self._evaluate_upper_safely(current_lower_boundary, self._slice_range), upper
418                )
419                has_reached_upper_boundary = current_upper_boundary >= upper
420
421                clamped_upper = (
422                    self._clamping_strategy.clamp(current_upper_boundary)
423                    if current_upper_boundary != upper
424                    else current_upper_boundary
425                )
426                clamped_lower = self._clamping_strategy.clamp(current_lower_boundary)
427                if clamped_lower >= clamped_upper:
428                    # clamping collapsed both values which means that it is time to stop processing
429                    # FIXME should this be replace by proper end_provider
430                    break
431                start_value, end_value = (
432                    (clamped_lower, clamped_upper - self._cursor_granularity)
433                    if self._cursor_granularity
434                    and (not upper_is_end or not has_reached_upper_boundary)
435                    else (clamped_lower, clamped_upper)
436                )
437                yield StreamSlice(
438                    partition={},
439                    cursor_slice={
440                        self._slice_boundary_fields_wrapper[
441                            self._START_BOUNDARY
442                        ]: self._connector_state_converter.output_format(start_value),
443                        self._slice_boundary_fields_wrapper[
444                            self._END_BOUNDARY
445                        ]: self._connector_state_converter.output_format(end_value),
446                    },
447                )
448                current_lower_boundary = clamped_upper
449                if current_upper_boundary >= upper:
450                    stop_processing = True
451
452    def _evaluate_upper_safely(self, lower: CursorValueType, step: GapType) -> CursorValueType:
453        """
454        Given that we set the default step at datetime.timedelta.max, we will generate an OverflowError when evaluating the next start_date
455        This method assumes that users would never enter a step that would generate an overflow. Given that would be the case, the code
456        would have broken anyway.
457        """
458        try:
459            return lower + step
460        except OverflowError:
461            return self._end_provider()
462
463    def should_be_synced(self, record: Record) -> bool:
464        """
465        Determines if a record should be synced based on its cursor value.
466        :param record: The record to evaluate
467
468        :return: True if the record's cursor value falls within the sync boundaries
469        """
470        try:
471            record_cursor_value: CursorValueType = self._extract_cursor_value(record)
472        except ValueError:
473            self._log_for_record_without_cursor_value()
474            return True
475        return self.start <= record_cursor_value <= self._end_provider()
476
477    def _log_for_record_without_cursor_value(self) -> None:
478        if not self._should_be_synced_logger_triggered:
479            LOGGER.warning(
480                f"Could not find cursor field `{self.cursor_field.cursor_field_key}` in record for stream {self._stream_name}. The incremental sync will assume it needs to be synced"
481            )
482            self._should_be_synced_logger_triggered = True

Slices the stream into chunks that can be fetched independently. Slices enable state checkpointing and data retrieval parallelization.

ConcurrentCursor( stream_name: str, stream_namespace: Optional[str], stream_state: Any, message_repository: MessageRepository, connector_state_manager: ConnectorStateManager, connector_state_converter: airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter.AbstractStreamStateConverter, cursor_field: CursorField, slice_boundary_fields: Optional[Tuple[str, str]], start: Optional[airbyte_cdk.sources.streams.concurrent.cursor_types.CursorValueType], end_provider: Callable[[], airbyte_cdk.sources.streams.concurrent.cursor_types.CursorValueType], lookback_window: Optional[airbyte_cdk.sources.streams.concurrent.cursor_types.GapType] = None, slice_range: Optional[airbyte_cdk.sources.streams.concurrent.cursor_types.GapType] = None, cursor_granularity: Optional[airbyte_cdk.sources.streams.concurrent.cursor_types.GapType] = None, clamping_strategy: airbyte_cdk.sources.streams.concurrent.clamping.ClampingStrategy = <airbyte_cdk.sources.streams.concurrent.clamping.NoClamping object>)
132    def __init__(
133        self,
134        stream_name: str,
135        stream_namespace: Optional[str],
136        stream_state: Any,
137        message_repository: MessageRepository,
138        connector_state_manager: ConnectorStateManager,
139        connector_state_converter: AbstractStreamStateConverter,
140        cursor_field: CursorField,
141        slice_boundary_fields: Optional[Tuple[str, str]],
142        start: Optional[CursorValueType],
143        end_provider: Callable[[], CursorValueType],
144        lookback_window: Optional[GapType] = None,
145        slice_range: Optional[GapType] = None,
146        cursor_granularity: Optional[GapType] = None,
147        clamping_strategy: ClampingStrategy = NoClamping(),
148    ) -> None:
149        self._stream_name = stream_name
150        self._stream_namespace = stream_namespace
151        self._message_repository = message_repository
152        self._connector_state_converter = connector_state_converter
153        self._connector_state_manager = connector_state_manager
154        self._cursor_field = cursor_field
155        # To see some example where the slice boundaries might not be defined, check https://github.com/airbytehq/airbyte/blob/1ce84d6396e446e1ac2377362446e3fb94509461/airbyte-integrations/connectors/source-stripe/source_stripe/streams.py#L363-L379
156        self._slice_boundary_fields = slice_boundary_fields
157        self._start = start
158        self._end_provider = end_provider
159        self.start, self._concurrent_state = self._get_concurrent_state(stream_state)
160        self._lookback_window = lookback_window
161        self._slice_range = slice_range
162        self._most_recent_cursor_value_per_partition: MutableMapping[
163            Union[StreamSlice, Mapping[str, Any], None], Any
164        ] = {}
165        self._has_closed_at_least_one_slice = False
166        self._cursor_granularity = cursor_granularity
167        # Flag to track if the logger has been triggered (per stream)
168        self._should_be_synced_logger_triggered = False
169        self._clamping_strategy = clamping_strategy
state: MutableMapping[str, Any]
171    @property
172    def state(self) -> MutableMapping[str, Any]:
173        return self._connector_state_converter.convert_to_state_message(
174            self.cursor_field, self._concurrent_state
175        )
cursor_field: CursorField
177    @property
178    def cursor_field(self) -> CursorField:
179        return self._cursor_field
def observe(self, record: Record) -> None:
204    def observe(self, record: Record) -> None:
205        most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get(
206            record.associated_slice
207        )
208        try:
209            cursor_value = self._extract_cursor_value(record)
210
211            if most_recent_cursor_value is None or most_recent_cursor_value < cursor_value:
212                self._most_recent_cursor_value_per_partition[record.associated_slice] = cursor_value
213        except ValueError:
214            self._log_for_record_without_cursor_value()

Indicate to the cursor that the record has been emitted

def close_partition( self, partition: airbyte_cdk.sources.streams.concurrent.partitions.partition.Partition) -> None:
219    def close_partition(self, partition: Partition) -> None:
220        slice_count_before = len(self._concurrent_state.get("slices", []))
221        self._add_slice_to_state(partition)
222        if slice_count_before < len(
223            self._concurrent_state["slices"]
224        ):  # only emit if at least one slice has been processed
225            self._merge_partitions()
226            self._emit_state_message()
227        self._has_closed_at_least_one_slice = True

Indicate to the cursor that the partition has been successfully processed

def ensure_at_least_one_state_emitted(self) -> None:
302    def ensure_at_least_one_state_emitted(self) -> None:
303        """
304        The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
305        called.
306        """
307        self._emit_state_message()

The platform expects to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be called.

def stream_slices(self) -> Iterable[StreamSlice]:
309    def stream_slices(self) -> Iterable[StreamSlice]:
310        """
311        Generating slices based on a few parameters:
312        * lookback_window: Buffer to remove from END_KEY of the highest slice
313        * slice_range: Max difference between two slices. If the difference between two slices is greater, multiple slices will be created
314        * start: `_split_per_slice_range` will clip any value to `self._start which means that:
315          * if upper is less than self._start, no slices will be generated
316          * if lower is less than self._start, self._start will be used as the lower boundary (lookback_window will not be considered in that case)
317
318        Note that the slices will overlap at their boundaries. We therefore expect to have at least the lower or the upper boundary to be
319        inclusive in the API that is queried.
320        """
321        self._merge_partitions()
322
323        if self._start is not None and self._is_start_before_first_slice():
324            yield from self._split_per_slice_range(
325                self._start,
326                self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY],
327                False,
328            )
329
330        if len(self._concurrent_state["slices"]) == 1:
331            yield from self._split_per_slice_range(
332                self._calculate_lower_boundary_of_last_slice(
333                    self._concurrent_state["slices"][0][self._connector_state_converter.END_KEY]
334                ),
335                self._end_provider(),
336                True,
337            )
338        elif len(self._concurrent_state["slices"]) > 1:
339            for i in range(len(self._concurrent_state["slices"]) - 1):
340                if self._cursor_granularity:
341                    yield from self._split_per_slice_range(
342                        self._concurrent_state["slices"][i][self._connector_state_converter.END_KEY]
343                        + self._cursor_granularity,
344                        self._concurrent_state["slices"][i + 1][
345                            self._connector_state_converter.START_KEY
346                        ],
347                        False,
348                    )
349                else:
350                    yield from self._split_per_slice_range(
351                        self._concurrent_state["slices"][i][
352                            self._connector_state_converter.END_KEY
353                        ],
354                        self._concurrent_state["slices"][i + 1][
355                            self._connector_state_converter.START_KEY
356                        ],
357                        False,
358                    )
359            yield from self._split_per_slice_range(
360                self._calculate_lower_boundary_of_last_slice(
361                    self._concurrent_state["slices"][-1][self._connector_state_converter.END_KEY]
362                ),
363                self._end_provider(),
364                True,
365            )
366        else:
367            raise ValueError("Expected at least one slice")

Generating slices based on a few parameters:

  • lookback_window: Buffer to remove from END_KEY of the highest slice
  • slice_range: Max difference between two slices. If the difference between two slices is greater, multiple slices will be created
  • start: _split_per_slice_range will clip any value to `self._start`, which means that:
    • if upper is less than self._start, no slices will be generated
    • if lower is less than self._start, self._start will be used as the lower boundary (lookback_window will not be considered in that case)

Note that the slices will overlap at their boundaries. We therefore expect to have at least the lower or the upper boundary to be inclusive in the API that is queried.
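
For intuition, here is a small standalone sketch (not the CDK implementation) of the splitting idea: a range is cut into chunks no wider than slice_range, and a lookback_window widens the re-read window at the lower boundary. All names and values below are illustrative.

    from datetime import datetime, timedelta, timezone

    def split_range(lower, upper, slice_range):
        """Yield (start, end) pairs no wider than slice_range, covering [lower, upper]."""
        current = lower
        while current < upper:
            chunk_end = min(current + slice_range, upper)
            yield current, chunk_end
            current = chunk_end

    highest_slice_end = datetime(2024, 1, 31, tzinfo=timezone.utc)
    lookback_window = timedelta(days=1)
    end = datetime(2024, 2, 10, tzinfo=timezone.utc)

    # Re-read a small buffer before the highest END_KEY, then split the remainder.
    for start, stop in split_range(highest_slice_end - lookback_window, end, timedelta(days=3)):
        print(start.isoformat(), "->", stop.isoformat())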

def should_be_synced(self, record: Record) -> bool:
463    def should_be_synced(self, record: Record) -> bool:
464        """
465        Determines if a record should be synced based on its cursor value.
466        :param record: The record to evaluate
467
468        :return: True if the record's cursor value falls within the sync boundaries
469        """
470        try:
471            record_cursor_value: CursorValueType = self._extract_cursor_value(record)
472        except ValueError:
473            self._log_for_record_without_cursor_value()
474            return True
475        return self.start <= record_cursor_value <= self._end_provider()

Determines if a record should be synced based on its cursor value.

Parameters
  • record: The record to evaluate
Returns

True if the record's cursor value falls within the sync boundaries
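
As a simplified stand-in for the check above (not the CDK class), the decision reduces to a boundary comparison, with records lacking a cursor value synced regardless:

    from datetime import datetime, timezone

    def should_be_synced(cursor_value, start, end):
        if cursor_value is None:
            return True  # no cursor value: sync it and let the cursor log the anomaly
        return start <= cursor_value <= end

    print(should_be_synced(
        datetime(2024, 3, 1, tzinfo=timezone.utc),
        start=datetime(2024, 1, 1, tzinfo=timezone.utc),
        end=datetime(2024, 6, 1, tzinfo=timezone.utc),
    ))  # True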

class ConcurrentSource:
 30class ConcurrentSource:
 31    """
 32    A Source that reads data from multiple AbstractStreams concurrently.
 33    It does so by submitting partition generation, and partition read tasks to a thread pool.
 34    The tasks asynchronously add their output to a shared queue.
 35    The read is done when all partitions for all streams were generated and read.
 36    """
 37
 38    DEFAULT_TIMEOUT_SECONDS = 900
 39
 40    @staticmethod
 41    def create(
 42        num_workers: int,
 43        initial_number_of_partitions_to_generate: int,
 44        logger: logging.Logger,
 45        slice_logger: SliceLogger,
 46        message_repository: MessageRepository,
 47        timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
 48    ) -> "ConcurrentSource":
 49        is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1
 50        too_many_generator = (
 51            not is_single_threaded and initial_number_of_partitions_to_generate >= num_workers
 52        )
 53        assert (
 54            not too_many_generator
 55        ), "It is required to have more workers than threads generating partitions"
 56        threadpool = ThreadPoolManager(
 57            concurrent.futures.ThreadPoolExecutor(
 58                max_workers=num_workers, thread_name_prefix="workerpool"
 59            ),
 60            logger,
 61        )
 62        return ConcurrentSource(
 63            threadpool,
 64            logger,
 65            slice_logger,
 66            message_repository,
 67            initial_number_of_partitions_to_generate,
 68            timeout_seconds,
 69        )
 70
 71    def __init__(
 72        self,
 73        threadpool: ThreadPoolManager,
 74        logger: logging.Logger,
 75        slice_logger: SliceLogger = DebugSliceLogger(),
 76        message_repository: MessageRepository = InMemoryMessageRepository(),
 77        initial_number_partitions_to_generate: int = 1,
 78        timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
 79    ) -> None:
 80        """
 81        :param threadpool: The threadpool to submit tasks to
 82        :param logger: The logger to log to
 83        :param slice_logger: The slice logger used to create messages on new slices
 84        :param message_repository: The repository to emit messages to
 85        :param initial_number_partitions_to_generate: The initial number of concurrent partition generation tasks. Limiting this number limits the latency of the first records emitted. While latency is not critical, emitting records early allows the platform and the destination to process them as early as possible.
 86        :param timeout_seconds: The maximum number of seconds to wait for a record to be read from the queue. If no record is read within this time, the source will stop reading and return.
 87        """
 88        self._threadpool = threadpool
 89        self._logger = logger
 90        self._slice_logger = slice_logger
 91        self._message_repository = message_repository
 92        self._initial_number_partitions_to_generate = initial_number_partitions_to_generate
 93        self._timeout_seconds = timeout_seconds
 94
 95    def read(
 96        self,
 97        streams: List[AbstractStream],
 98    ) -> Iterator[AirbyteMessage]:
 99        self._logger.info("Starting syncing")
100
101        # We set a maxsize so that the main thread can process record items when the queue size grows. This assumes that there are
102        # fewer threads generating partitions than the max number of workers. If that weren't the case, we could have threads only
103        # generating partitions, which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be
104        # changed given more information and might even need to be configurable depending on the source.
105        queue: Queue[QueueItem] = Queue(maxsize=10_000)
106        concurrent_stream_processor = ConcurrentReadProcessor(
107            streams,
108            PartitionEnqueuer(queue, self._threadpool),
109            self._threadpool,
110            self._logger,
111            self._slice_logger,
112            self._message_repository,
113            PartitionReader(queue),
114        )
115
116        # Enqueue initial partition generation tasks
117        yield from self._submit_initial_partition_generators(concurrent_stream_processor)
118
119        # Read from the queue until all partitions were generated and read
120        yield from self._consume_from_queue(
121            queue,
122            concurrent_stream_processor,
123        )
124        self._threadpool.check_for_errors_and_shutdown()
125        self._logger.info("Finished syncing")
126
127    def _submit_initial_partition_generators(
128        self, concurrent_stream_processor: ConcurrentReadProcessor
129    ) -> Iterable[AirbyteMessage]:
130        for _ in range(self._initial_number_partitions_to_generate):
131            status_message = concurrent_stream_processor.start_next_partition_generator()
132            if status_message:
133                yield status_message
134
135    def _consume_from_queue(
136        self,
137        queue: Queue[QueueItem],
138        concurrent_stream_processor: ConcurrentReadProcessor,
139    ) -> Iterable[AirbyteMessage]:
140        while airbyte_message_or_record_or_exception := queue.get():
141            yield from self._handle_item(
142                airbyte_message_or_record_or_exception,
143                concurrent_stream_processor,
144            )
145            if concurrent_stream_processor.is_done() and queue.empty():
146                # all partitions were generated and processed. we're done here
147                break
148
149    def _handle_item(
150        self,
151        queue_item: QueueItem,
152        concurrent_stream_processor: ConcurrentReadProcessor,
153    ) -> Iterable[AirbyteMessage]:
154        # handle queue item and call the appropriate handler depending on the type of the queue item
155        if isinstance(queue_item, StreamThreadException):
156            yield from concurrent_stream_processor.on_exception(queue_item)
157        elif isinstance(queue_item, PartitionGenerationCompletedSentinel):
158            yield from concurrent_stream_processor.on_partition_generation_completed(queue_item)
159        elif isinstance(queue_item, Partition):
160            concurrent_stream_processor.on_partition(queue_item)
161        elif isinstance(queue_item, PartitionCompleteSentinel):
162            yield from concurrent_stream_processor.on_partition_complete_sentinel(queue_item)
163        elif isinstance(queue_item, Record):
164            yield from concurrent_stream_processor.on_record(queue_item)
165        else:
166            raise ValueError(f"Unknown queue item type: {type(queue_item)}")

A Source that reads data from multiple AbstractStreams concurrently. It does so by submitting partition generation and partition read tasks to a thread pool. The tasks asynchronously add their output to a shared queue. The read is done when all partitions for all streams were generated and read.

ConcurrentSource( threadpool: airbyte_cdk.sources.concurrent_source.thread_pool_manager.ThreadPoolManager, logger: logging.Logger, slice_logger: airbyte_cdk.sources.utils.slice_logger.SliceLogger = <airbyte_cdk.sources.utils.slice_logger.DebugSliceLogger object>, message_repository: MessageRepository = <InMemoryMessageRepository object>, initial_number_partitions_to_generate: int = 1, timeout_seconds: int = 900)
71    def __init__(
72        self,
73        threadpool: ThreadPoolManager,
74        logger: logging.Logger,
75        slice_logger: SliceLogger = DebugSliceLogger(),
76        message_repository: MessageRepository = InMemoryMessageRepository(),
77        initial_number_partitions_to_generate: int = 1,
78        timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
79    ) -> None:
80        """
81        :param threadpool: The threadpool to submit tasks to
82        :param logger: The logger to log to
83        :param slice_logger: The slice logger used to create messages on new slices
84        :param message_repository: The repository to emit messages to
 85        :param initial_number_partitions_to_generate: The initial number of concurrent partition generation tasks. Limiting this number limits the latency of the first records emitted. While latency is not critical, emitting records early allows the platform and the destination to process them as early as possible.
86        :param timeout_seconds: The maximum number of seconds to wait for a record to be read from the queue. If no record is read within this time, the source will stop reading and return.
87        """
88        self._threadpool = threadpool
89        self._logger = logger
90        self._slice_logger = slice_logger
91        self._message_repository = message_repository
92        self._initial_number_partitions_to_generate = initial_number_partitions_to_generate
93        self._timeout_seconds = timeout_seconds
Parameters
  • threadpool: The threadpool to submit tasks to
  • logger: The logger to log to
  • slice_logger: The slice logger used to create messages on new slices
  • message_repository: The repository to emit messages to
  • initial_number_partitions_to_generate: The initial number of concurrent partition generation tasks. Limiting this number limits the latency of the first records emitted. While latency is not critical, emitting records early allows the platform and the destination to process them as early as possible.
  • timeout_seconds: The maximum number of seconds to wait for a record to be read from the queue. If no record is read within this time, the source will stop reading and return.
DEFAULT_TIMEOUT_SECONDS = 900
@staticmethod
def create( num_workers: int, initial_number_of_partitions_to_generate: int, logger: logging.Logger, slice_logger: airbyte_cdk.sources.utils.slice_logger.SliceLogger, message_repository: MessageRepository, timeout_seconds: int = 900) -> ConcurrentSource:
40    @staticmethod
41    def create(
42        num_workers: int,
43        initial_number_of_partitions_to_generate: int,
44        logger: logging.Logger,
45        slice_logger: SliceLogger,
46        message_repository: MessageRepository,
47        timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
48    ) -> "ConcurrentSource":
49        is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1
50        too_many_generator = (
51            not is_single_threaded and initial_number_of_partitions_to_generate >= num_workers
52        )
53        assert (
54            not too_many_generator
55        ), "It is required to have more workers than threads generating partitions"
56        threadpool = ThreadPoolManager(
57            concurrent.futures.ThreadPoolExecutor(
58                max_workers=num_workers, thread_name_prefix="workerpool"
59            ),
60            logger,
61        )
62        return ConcurrentSource(
63            threadpool,
64            logger,
65            slice_logger,
66            message_repository,
67            initial_number_of_partitions_to_generate,
68            timeout_seconds,
69        )
def read( self, streams: List[airbyte_cdk.sources.streams.concurrent.abstract_stream.AbstractStream]) -> Iterator[AirbyteMessage]:
 95    def read(
 96        self,
 97        streams: List[AbstractStream],
 98    ) -> Iterator[AirbyteMessage]:
 99        self._logger.info("Starting syncing")
100
101        # We set a maxsize so that the main thread can process record items when the queue size grows. This assumes that there are
102        # fewer threads generating partitions than the max number of workers. If that weren't the case, we could have threads only
103        # generating partitions, which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be
104        # changed given more information and might even need to be configurable depending on the source.
105        queue: Queue[QueueItem] = Queue(maxsize=10_000)
106        concurrent_stream_processor = ConcurrentReadProcessor(
107            streams,
108            PartitionEnqueuer(queue, self._threadpool),
109            self._threadpool,
110            self._logger,
111            self._slice_logger,
112            self._message_repository,
113            PartitionReader(queue),
114        )
115
116        # Enqueue initial partition generation tasks
117        yield from self._submit_initial_partition_generators(concurrent_stream_processor)
118
119        # Read from the queue until all partitions were generated and read
120        yield from self._consume_from_queue(
121            queue,
122            concurrent_stream_processor,
123        )
124        self._threadpool.check_for_errors_and_shutdown()
125        self._logger.info("Finished syncing")
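
Putting the pieces above together, here is a hedged usage sketch of the create() factory. The worker counts, logger setup, and the streams list are illustrative assumptions rather than recommendations.

    import logging

    from airbyte_cdk import ConcurrentSource
    from airbyte_cdk.sources.message import InMemoryMessageRepository
    from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger

    logger = logging.getLogger("airbyte")

    # num_workers must be greater than the number of partition generators (see the assert in create()).
    source = ConcurrentSource.create(
        num_workers=4,
        initial_number_of_partitions_to_generate=1,
        logger=logger,
        slice_logger=DebugSliceLogger(),
        message_repository=InMemoryMessageRepository(),
    )

    my_streams = []  # replace with concrete AbstractStream implementations before reading
    for message in source.read(my_streams):
        print(message)
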
 34class ConcurrentSourceAdapter(AbstractSource, ABC):
 35    def __init__(self, concurrent_source: ConcurrentSource, **kwargs: Any) -> None:
 36        """
 37        ConcurrentSourceAdapter is a Source that wraps a concurrent source and exposes it as a regular source.
 38
 39        The source's streams are still defined through the streams() method.
 40        Streams wrapped in a StreamFacade will be processed concurrently.
 41        Other streams will be processed sequentially as a later step.
 42        """
 43        self._concurrent_source = concurrent_source
 44        super().__init__(**kwargs)
 45
 46    def read(
 47        self,
 48        logger: logging.Logger,
 49        config: Mapping[str, Any],
 50        catalog: ConfiguredAirbyteCatalog,
 51        state: Optional[List[AirbyteStateMessage]] = None,
 52    ) -> Iterator[AirbyteMessage]:
 53        abstract_streams = self._select_abstract_streams(config, catalog)
 54        concurrent_stream_names = {stream.name for stream in abstract_streams}
 55        configured_catalog_for_regular_streams = ConfiguredAirbyteCatalog(
 56            streams=[
 57                stream
 58                for stream in catalog.streams
 59                if stream.stream.name not in concurrent_stream_names
 60            ]
 61        )
 62        if abstract_streams:
 63            yield from self._concurrent_source.read(abstract_streams)
 64        if configured_catalog_for_regular_streams.streams:
 65            yield from super().read(logger, config, configured_catalog_for_regular_streams, state)
 66
 67    def _select_abstract_streams(
 68        self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog
 69    ) -> List[AbstractStream]:
 70        """
 71        Selects streams that can be processed concurrently and returns their abstract representations.
 72        """
 73        all_streams = self.streams(config)
 74        stream_name_to_instance: Mapping[str, Stream] = {s.name: s for s in all_streams}
 75        abstract_streams: List[AbstractStream] = []
 76        for configured_stream in configured_catalog.streams:
 77            stream_instance = stream_name_to_instance.get(configured_stream.stream.name)
 78            if not stream_instance:
 79                continue
 80
 81            if isinstance(stream_instance, AbstractStreamFacade):
 82                abstract_streams.append(stream_instance.get_underlying_stream())
 83        return abstract_streams
 84
 85    def convert_to_concurrent_stream(
 86        self,
 87        logger: logging.Logger,
 88        stream: Stream,
 89        state_manager: ConnectorStateManager,
 90        cursor: Optional[Cursor] = None,
 91    ) -> Stream:
 92        """
 93        Prepares a stream for concurrent processing by initializing or assigning a cursor,
 94        managing the stream's state, and returning an updated Stream instance.
 95        """
 96        state: MutableMapping[str, Any] = {}
 97
 98        if cursor:
 99            state = state_manager.get_stream_state(stream.name, stream.namespace)
100
101            stream.cursor = cursor  # type: ignore[assignment]  # cursor is of type ConcurrentCursor, which inherits from Cursor
102            if hasattr(stream, "parent"):
103                stream.parent.cursor = cursor
104        else:
105            cursor = FinalStateCursor(
106                stream_name=stream.name,
107                stream_namespace=stream.namespace,
108                message_repository=self.message_repository,  # type: ignore[arg-type]  # _default_message_repository will be returned in the worst case
109            )
110        return StreamFacade.create_from_stream(stream, self, logger, state, cursor)
111
112    def initialize_cursor(
113        self,
114        stream: Stream,
115        state_manager: ConnectorStateManager,
116        converter: AbstractStreamStateConverter,
117        slice_boundary_fields: Optional[Tuple[str, str]],
118        start: Optional[CursorValueType],
119        end_provider: Callable[[], CursorValueType],
120        lookback_window: Optional[GapType] = None,
121        slice_range: Optional[GapType] = None,
122    ) -> Optional[ConcurrentCursor]:
123        lookback_window = lookback_window or timedelta(seconds=DEFAULT_LOOKBACK_SECONDS)
124
125        cursor_field_name = stream.cursor_field
126
127        if cursor_field_name:
128            if not isinstance(cursor_field_name, str):
129                raise ValueError(
130                    f"Cursor field type must be a string, but received {type(cursor_field_name).__name__}."
131                )
132
133            return ConcurrentCursor(
134                stream.name,
135                stream.namespace,
136                state_manager.get_stream_state(stream.name, stream.namespace),
137                self.message_repository,  # type: ignore[arg-type]  # _default_message_repository will be returned in the worst case
138                state_manager,
139                converter,
140                CursorField(cursor_field_name),
141                slice_boundary_fields,
142                start,
143                end_provider,
144                lookback_window,
145                slice_range,
146            )
147
148        return None

Abstract base class for an Airbyte Source. Consumers should implement any abstract methods in this class to create an Airbyte Specification compliant Source.

ConcurrentSourceAdapter( concurrent_source: ConcurrentSource, **kwargs: Any)
35    def __init__(self, concurrent_source: ConcurrentSource, **kwargs: Any) -> None:
36        """
37        ConcurrentSourceAdapter is a Source that wraps a concurrent source and exposes it as a regular source.
38
39        The source's streams are still defined through the streams() method.
40        Streams wrapped in a StreamFacade will be processed concurrently.
41        Other streams will be processed sequentially as a later step.
42        """
43        self._concurrent_source = concurrent_source
44        super().__init__(**kwargs)

ConcurrentSourceAdapter is a Source that wraps a concurrent source and exposes it as a regular source.

The source's streams are still defined through the streams() method. Streams wrapped in a StreamFacade will be processed concurrently. Other streams will be processed sequentially as a later step.
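
As a minimal, hedged sketch of a subclass (the class name, config/state handling, and the empty stream list are placeholders rather than CDK requirements):

    import logging
    from typing import Any, List, Mapping, Optional, Tuple

    from airbyte_cdk import ConcurrentSource, ConcurrentSourceAdapter, ConnectorStateManager, Stream

    class MyAdapterSource(ConcurrentSourceAdapter):
        def __init__(self, concurrent_source: ConcurrentSource, state: Optional[List[Any]] = None) -> None:
            super().__init__(concurrent_source)
            self._state = state

        def check_connection(
            self, logger: logging.Logger, config: Mapping[str, Any]
        ) -> Tuple[bool, Optional[Any]]:
            return True, None  # a real connector would probe the API here

        def streams(self, config: Mapping[str, Any]) -> List[Stream]:
            state_manager = ConnectorStateManager(state=self._state)
            legacy_streams: List[Stream] = []  # replace with concrete Stream instances
            # Wrapping with convert_to_concurrent_stream assigns a FinalStateCursor when no
            # incremental cursor is provided, so these streams are read concurrently.
            return [
                self.convert_to_concurrent_stream(logging.getLogger("airbyte"), s, state_manager)
                for s in legacy_streams
            ]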

def read( self, logger: logging.Logger, config: Mapping[str, Any], catalog: airbyte_protocol_dataclasses.models.airbyte_protocol.ConfiguredAirbyteCatalog, state: Optional[List[airbyte_cdk.models.airbyte_protocol.AirbyteStateMessage]] = None) -> Iterator[AirbyteMessage]:
46    def read(
47        self,
48        logger: logging.Logger,
49        config: Mapping[str, Any],
50        catalog: ConfiguredAirbyteCatalog,
51        state: Optional[List[AirbyteStateMessage]] = None,
52    ) -> Iterator[AirbyteMessage]:
53        abstract_streams = self._select_abstract_streams(config, catalog)
54        concurrent_stream_names = {stream.name for stream in abstract_streams}
55        configured_catalog_for_regular_streams = ConfiguredAirbyteCatalog(
56            streams=[
57                stream
58                for stream in catalog.streams
59                if stream.stream.name not in concurrent_stream_names
60            ]
61        )
62        if abstract_streams:
63            yield from self._concurrent_source.read(abstract_streams)
64        if configured_catalog_for_regular_streams.streams:
65            yield from super().read(logger, config, configured_catalog_for_regular_streams, state)

Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/.

def convert_to_concurrent_stream( self, logger: logging.Logger, stream: Stream, state_manager: ConnectorStateManager, cursor: Optional[Cursor] = None) -> Stream:
 85    def convert_to_concurrent_stream(
 86        self,
 87        logger: logging.Logger,
 88        stream: Stream,
 89        state_manager: ConnectorStateManager,
 90        cursor: Optional[Cursor] = None,
 91    ) -> Stream:
 92        """
 93        Prepares a stream for concurrent processing by initializing or assigning a cursor,
 94        managing the stream's state, and returning an updated Stream instance.
 95        """
 96        state: MutableMapping[str, Any] = {}
 97
 98        if cursor:
 99            state = state_manager.get_stream_state(stream.name, stream.namespace)
100
101            stream.cursor = cursor  # type: ignore[assignment]  # cursor is of type ConcurrentCursor, which inherits from Cursor
102            if hasattr(stream, "parent"):
103                stream.parent.cursor = cursor
104        else:
105            cursor = FinalStateCursor(
106                stream_name=stream.name,
107                stream_namespace=stream.namespace,
108                message_repository=self.message_repository,  # type: ignore[arg-type]  # _default_message_repository will be returned in the worst case
109            )
110        return StreamFacade.create_from_stream(stream, self, logger, state, cursor)

Prepares a stream for concurrent processing by initializing or assigning a cursor, managing the stream's state, and returning an updated Stream instance.

112    def initialize_cursor(
113        self,
114        stream: Stream,
115        state_manager: ConnectorStateManager,
116        converter: AbstractStreamStateConverter,
117        slice_boundary_fields: Optional[Tuple[str, str]],
118        start: Optional[CursorValueType],
119        end_provider: Callable[[], CursorValueType],
120        lookback_window: Optional[GapType] = None,
121        slice_range: Optional[GapType] = None,
122    ) -> Optional[ConcurrentCursor]:
123        lookback_window = lookback_window or timedelta(seconds=DEFAULT_LOOKBACK_SECONDS)
124
125        cursor_field_name = stream.cursor_field
126
127        if cursor_field_name:
128            if not isinstance(cursor_field_name, str):
129                raise ValueError(
130                    f"Cursor field type must be a string, but received {type(cursor_field_name).__name__}."
131                )
132
133            return ConcurrentCursor(
134                stream.name,
135                stream.namespace,
136                state_manager.get_stream_state(stream.name, stream.namespace),
137                self.message_repository,  # type: ignore[arg-type]  # _default_message_repository will be returned in the worst case
138                state_manager,
139                converter,
140                CursorField(cursor_field_name),
141                slice_boundary_fields,
142                start,
143                end_provider,
144                lookback_window,
145                slice_range,
146            )
147
148        return None
51class Cursor(StreamSlicer, ABC):
52    @property
53    @abstractmethod
54    def state(self) -> MutableMapping[str, Any]: ...
55
56    @abstractmethod
57    def observe(self, record: Record) -> None:
58        """
59        Indicate to the cursor that the record has been emitted
60        """
61        raise NotImplementedError()
62
63    @abstractmethod
64    def close_partition(self, partition: Partition) -> None:
65        """
66        Indicate to the cursor that the partition has been successfully processed
67        """
68        raise NotImplementedError()
69
70    @abstractmethod
71    def ensure_at_least_one_state_emitted(self) -> None:
72        """
73        State messages are emitted when a partition is closed. However, the platform expects at least one state to be emitted per sync per
74        stream. Hence, if no partitions are generated, this method needs to be called.
75        """
76        raise NotImplementedError()
77
78    def stream_slices(self) -> Iterable[StreamSlice]:
79        """
80        Default placeholder implementation of generate_slices.
81        Subclasses can override this method to provide actual behavior.
82        """
83        yield StreamSlice(partition={}, cursor_slice={})

Slices the stream into chunks that can be fetched independently. Slices enable state checkpointing and data retrieval parallelization.
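
To make the contract concrete, here is a minimal do-nothing implementation sketch (a toy for illustration, not a cursor you would ship; import paths follow the modules referenced on this page):

    from typing import Any, MutableMapping

    from airbyte_cdk import Cursor
    from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition

    class NoopCursor(Cursor):
        """Toy cursor: satisfies the abstract interface but never tracks any value."""

        @property
        def state(self) -> MutableMapping[str, Any]:
            return {}

        def observe(self, record: Any) -> None:
            # A real cursor would inspect the record's cursor field here.
            pass

        def close_partition(self, partition: Partition) -> None:
            # A real cursor would merge the partition into state and emit a state message.
            pass

        def ensure_at_least_one_state_emitted(self) -> None:
            # A real cursor emits at least one state message per successful sync.
            pass

    # The inherited default stream_slices() yields a single empty slice.
    print(list(NoopCursor().stream_slices()))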

state: MutableMapping[str, Any]
52    @property
53    @abstractmethod
54    def state(self) -> MutableMapping[str, Any]: ...
@abstractmethod
def observe(self, record: Record) -> None:
56    @abstractmethod
57    def observe(self, record: Record) -> None:
58        """
59        Indicate to the cursor that the record has been emitted
60        """
61        raise NotImplementedError()

Indicate to the cursor that the record has been emitted

@abstractmethod
def close_partition( self, partition: airbyte_cdk.sources.streams.concurrent.partitions.partition.Partition) -> None:
63    @abstractmethod
64    def close_partition(self, partition: Partition) -> None:
65        """
66        Indicate to the cursor that the partition has been successfully processed
67        """
68        raise NotImplementedError()

Indicate to the cursor that the partition has been successfully processed

@abstractmethod
def ensure_at_least_one_state_emitted(self) -> None:
70    @abstractmethod
71    def ensure_at_least_one_state_emitted(self) -> None:
72        """
73        State messages are emitted when a partition is closed. However, the platform expects at least one state to be emitted per sync per
74        stream. Hence, if no partitions are generated, this method needs to be called.
75        """
76        raise NotImplementedError()

State messages are emitted when a partition is closed. However, the platform expects at least one state to be emitted per sync per stream. Hence, if no partitions are generated, this method needs to be called.

def stream_slices(self) -> Iterable[StreamSlice]:
78    def stream_slices(self) -> Iterable[StreamSlice]:
79        """
80        Default placeholder implementation of generate_slices.
81        Subclasses can override this method to provide actual behavior.
82        """
83        yield StreamSlice(partition={}, cursor_slice={})

Default placeholder implementation of generate_slices. Subclasses can override this method to provide actual behavior.

class CursorField:
40class CursorField:
41    def __init__(self, cursor_field_key: str) -> None:
42        self.cursor_field_key = cursor_field_key
43
44    def extract_value(self, record: Record) -> CursorValueType:
45        cursor_value = record.data.get(self.cursor_field_key)
46        if cursor_value is None:
47            raise ValueError(f"Could not find cursor field {self.cursor_field_key} in record")
48        return cursor_value  # type: ignore  # we assume that the value the path points at is a comparable
CursorField(cursor_field_key: str)
41    def __init__(self, cursor_field_key: str) -> None:
42        self.cursor_field_key = cursor_field_key
cursor_field_key
def extract_value( self, record: Record) -> airbyte_cdk.sources.streams.concurrent.cursor_types.CursorValueType:
44    def extract_value(self, record: Record) -> CursorValueType:
45        cursor_value = record.data.get(self.cursor_field_key)
46        if cursor_value is None:
47            raise ValueError(f"Could not find cursor field {self.cursor_field_key} in record")
48        return cursor_value  # type: ignore  # we assume that the value the path points at is a comparable
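
In practice, extract_value simply reads the configured key from record.data and raises if it is missing. The sketch below uses a hypothetical FakeRecord stand-in (exposing only the .data attribute) instead of the CDK's Record class to stay self-contained:

    from dataclasses import dataclass, field
    from typing import Any, Mapping

    from airbyte_cdk import CursorField

    @dataclass
    class FakeRecord:
        # Stand-in for the CDK Record type: only `.data` is needed by extract_value.
        data: Mapping[str, Any] = field(default_factory=dict)

    cursor_field = CursorField("updated_at")
    print(cursor_field.extract_value(FakeRecord({"updated_at": "2024-03-01T00:00:00Z"})))

    try:
        cursor_field.extract_value(FakeRecord({}))
    except ValueError as error:
        print(error)  # Could not find cursor field updated_at in record
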
DEFAULT_CONCURRENCY
115class EpochValueConcurrentStreamStateConverter(DateTimeStreamStateConverter):
116    """
117    e.g.
118    { "created": 1617030403 }
119    =>
120    {
121        "state_type": "date-range",
122        "metadata": { … },
123        "slices": [
124            {starts: 0, end: 1617030403, finished_processing: true}
125        ]
126    }
127    """
128
129    _zero_value = 0
130
131    def increment(self, timestamp: datetime) -> datetime:
132        return timestamp + timedelta(seconds=1)
133
134    def output_format(self, timestamp: datetime) -> int:
135        return int(timestamp.timestamp())
136
137    def parse_timestamp(self, timestamp: int) -> datetime:
138        dt_object = AirbyteDateTime.fromtimestamp(timestamp, timezone.utc)
139        if not isinstance(dt_object, AirbyteDateTime):
140            raise ValueError(
141                f"AirbyteDateTime object was expected but got {type(dt_object)} from AirbyteDateTime.fromtimestamp({timestamp})"
142            )
143        return dt_object

e.g. { "created": 1617030403 } => { "state_type": "date-range", "metadata": { … }, "slices": [ {starts: 0, end: 1617030403, finished_processing: true} ] }

def increment(self, timestamp: datetime.datetime) -> datetime.datetime:
131    def increment(self, timestamp: datetime) -> datetime:
132        return timestamp + timedelta(seconds=1)

Increment a timestamp by a single unit.

def output_format(self, timestamp: datetime.datetime) -> int:
134    def output_format(self, timestamp: datetime) -> int:
135        return int(timestamp.timestamp())

Convert the cursor value type to a JSON valid type.

def parse_timestamp(self, timestamp: int) -> datetime.datetime:
137    def parse_timestamp(self, timestamp: int) -> datetime:
138        dt_object = AirbyteDateTime.fromtimestamp(timestamp, timezone.utc)
139        if not isinstance(dt_object, AirbyteDateTime):
140            raise ValueError(
141                f"AirbyteDateTime object was expected but got {type(dt_object)} from AirbyteDateTime.fromtimestamp({timestamp})"
142            )
143        return dt_object
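
A quick hedged round trip with this converter (constructing it with no arguments assumes the base converter's defaults):

    from airbyte_cdk import EpochValueConcurrentStreamStateConverter

    converter = EpochValueConcurrentStreamStateConverter()
    dt = converter.parse_timestamp(1617030403)                # timezone-aware datetime
    print(converter.output_format(dt))                        # 1617030403
    print(converter.output_format(converter.increment(dt)))   # 1617030404
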
class FinalStateCursor(airbyte_cdk.Cursor):
 86class FinalStateCursor(Cursor):
 87    """Cursor that is used to guarantee at least one state message is emitted for a concurrent stream."""
 88
 89    def __init__(
 90        self,
 91        stream_name: str,
 92        stream_namespace: Optional[str],
 93        message_repository: MessageRepository,
 94    ) -> None:
 95        self._stream_name = stream_name
 96        self._stream_namespace = stream_namespace
 97        self._message_repository = message_repository
 98        # Normally the connector state manager operates at the source-level. However, we only need it to write the sentinel
 99        # state message rather than manage overall source state. This is also only temporary as we move to the resumable
100        # full refresh world where every stream uses a FileBasedConcurrentCursor with incremental state.
101        self._connector_state_manager = ConnectorStateManager()
102        self._has_closed_at_least_one_slice = False
103
104    @property
105    def state(self) -> MutableMapping[str, Any]:
106        return {NO_CURSOR_STATE_KEY: True}
107
108    def observe(self, record: Record) -> None:
109        pass
110
111    def close_partition(self, partition: Partition) -> None:
112        pass
113
114    def ensure_at_least_one_state_emitted(self) -> None:
115        """
116        Used primarily for full refresh syncs that do not have a valid cursor value to emit at the end of a sync
117        """
118
119        self._connector_state_manager.update_state_for_stream(
120            self._stream_name, self._stream_namespace, self.state
121        )
122        state_message = self._connector_state_manager.create_state_message(
123            self._stream_name, self._stream_namespace
124        )
125        self._message_repository.emit_message(state_message)

Cursor that is used to guarantee at least one state message is emitted for a concurrent stream.

FinalStateCursor( stream_name: str, stream_namespace: Optional[str], message_repository: MessageRepository)
 89    def __init__(
 90        self,
 91        stream_name: str,
 92        stream_namespace: Optional[str],
 93        message_repository: MessageRepository,
 94    ) -> None:
 95        self._stream_name = stream_name
 96        self._stream_namespace = stream_namespace
 97        self._message_repository = message_repository
 98        # Normally the connector state manager operates at the source-level. However, we only need it to write the sentinel
 99        # state message rather than manage overall source state. This is also only temporary as we move to the resumable
100        # full refresh world where every stream uses a FileBasedConcurrentCursor with incremental state.
101        self._connector_state_manager = ConnectorStateManager()
102        self._has_closed_at_least_one_slice = False
state: MutableMapping[str, Any]
104    @property
105    def state(self) -> MutableMapping[str, Any]:
106        return {NO_CURSOR_STATE_KEY: True}
def observe(self, record: Record) -> None:
108    def observe(self, record: Record) -> None:
109        pass

Indicate to the cursor that the record has been emitted

def close_partition( self, partition: airbyte_cdk.sources.streams.concurrent.partitions.partition.Partition) -> None:
111    def close_partition(self, partition: Partition) -> None:
112        pass

Indicate to the cursor that the partition has been successfully processed

def ensure_at_least_one_state_emitted(self) -> None:
114    def ensure_at_least_one_state_emitted(self) -> None:
115        """
116        Used primarily for full refresh syncs that do not have a valid cursor value to emit at the end of a sync
117        """
118
119        self._connector_state_manager.update_state_for_stream(
120            self._stream_name, self._stream_namespace, self.state
121        )
122        state_message = self._connector_state_manager.create_state_message(
123            self._stream_name, self._stream_namespace
124        )
125        self._message_repository.emit_message(state_message)

Used primarily for full refresh syncs that do not have a valid cursor value to emit at the end of a sync

Inherited Members
Cursor
stream_slices
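
A small usage sketch of the class documented above: emit the sentinel state through an in-memory repository and read it back. The consume_queue call reflects my understanding of the CDK's message package; treat it as an assumption.

    from airbyte_cdk import FinalStateCursor
    from airbyte_cdk.sources.message import InMemoryMessageRepository

    repository = InMemoryMessageRepository()
    cursor = FinalStateCursor(stream_name="users", stream_namespace=None, message_repository=repository)

    # No partitions were processed, so force the sentinel state message.
    cursor.ensure_at_least_one_state_emitted()

    for message in repository.consume_queue():
        print(message.type)  # Type.STATE
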
146class IsoMillisConcurrentStreamStateConverter(DateTimeStreamStateConverter):
147    """
148    e.g.
149    { "created": "2021-01-18T21:18:20.000Z" }
150    =>
151    {
152        "state_type": "date-range",
153        "metadata": { … },
154        "slices": [
155            {starts: "2020-01-18T21:18:20.000Z", end: "2021-01-18T21:18:20.000Z", finished_processing: true}
156        ]
157    }
158    """
159
160    _zero_value = "0001-01-01T00:00:00.000Z"
161
162    def __init__(
163        self, is_sequential_state: bool = True, cursor_granularity: Optional[timedelta] = None
164    ):
165        super().__init__(is_sequential_state=is_sequential_state)
166        self._cursor_granularity = cursor_granularity or timedelta(milliseconds=1)
167
168    def increment(self, timestamp: datetime) -> datetime:
169        return timestamp + self._cursor_granularity
170
171    def output_format(self, timestamp: datetime) -> str:
172        """Format datetime with milliseconds always included.
173
174        Args:
175            timestamp: The datetime to format.
176
177        Returns:
178            str: ISO8601/RFC3339 formatted string with milliseconds.
179        """
180        dt = AirbyteDateTime.from_datetime(timestamp)
181        # Always include milliseconds, even if zero
182        millis = dt.microsecond // 1000 if dt.microsecond else 0
183        return f"{dt.year:04d}-{dt.month:02d}-{dt.day:02d}T{dt.hour:02d}:{dt.minute:02d}:{dt.second:02d}.{millis:03d}Z"
184
185    def parse_timestamp(self, timestamp: str) -> datetime:
186        dt_object = ab_datetime_parse(timestamp)
187        if not isinstance(dt_object, AirbyteDateTime):
188            raise ValueError(
189                f"AirbyteDateTime object was expected but got {type(dt_object)} from parse({timestamp})"
190            )
191        return dt_object

e.g. { "created": "2021-01-18T21:18:20.000Z" } => { "state_type": "date-range", "metadata": { … }, "slices": [ {starts: "2020-01-18T21:18:20.000Z", end: "2021-01-18T21:18:20.000Z", finished_processing: true} ] }

IsoMillisConcurrentStreamStateConverter( is_sequential_state: bool = True, cursor_granularity: Optional[datetime.timedelta] = None)
162    def __init__(
163        self, is_sequential_state: bool = True, cursor_granularity: Optional[timedelta] = None
164    ):
165        super().__init__(is_sequential_state=is_sequential_state)
166        self._cursor_granularity = cursor_granularity or timedelta(milliseconds=1)
def increment(self, timestamp: datetime.datetime) -> datetime.datetime:
168    def increment(self, timestamp: datetime) -> datetime:
169        return timestamp + self._cursor_granularity

Increment a timestamp by a single unit.

def output_format(self, timestamp: datetime.datetime) -> str:
171    def output_format(self, timestamp: datetime) -> str:
172        """Format datetime with milliseconds always included.
173
174        Args:
175            timestamp: The datetime to format.
176
177        Returns:
178            str: ISO8601/RFC3339 formatted string with milliseconds.
179        """
180        dt = AirbyteDateTime.from_datetime(timestamp)
181        # Always include milliseconds, even if zero
182        millis = dt.microsecond // 1000 if dt.microsecond else 0
183        return f"{dt.year:04d}-{dt.month:02d}-{dt.day:02d}T{dt.hour:02d}:{dt.minute:02d}:{dt.second:02d}.{millis:03d}Z"

Format datetime with milliseconds always included.

Arguments:
  • timestamp: The datetime to format.
Returns:

str: ISO8601/RFC3339 formatted string with milliseconds.

def parse_timestamp(self, timestamp: str) -> datetime.datetime:
185    def parse_timestamp(self, timestamp: str) -> datetime:
186        dt_object = ab_datetime_parse(timestamp)
187        if not isinstance(dt_object, AirbyteDateTime):
188            raise ValueError(
189                f"AirbyteDateTime object was expected but got {type(dt_object)} from parse({timestamp})"
190            )
191        return dt_object
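
For example (a hedged sketch using the default one-millisecond granularity):

    from airbyte_cdk import IsoMillisConcurrentStreamStateConverter

    converter = IsoMillisConcurrentStreamStateConverter()
    dt = converter.parse_timestamp("2021-01-18T21:18:20Z")
    print(converter.output_format(dt))                        # 2021-01-18T21:18:20.000Z
    print(converter.output_format(converter.increment(dt)))   # 2021-01-18T21:18:20.001Z
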
@deprecated('This class is experimental. Use at your own risk.', category=ExperimentalClassWarning)
class StreamFacade(airbyte_cdk.sources.streams.concurrent.abstract_stream_facade.AbstractStreamFacade[airbyte_cdk.sources.streams.concurrent.default_stream.DefaultStream], airbyte_cdk.Stream):
 54@deprecated(
 55    "This class is experimental. Use at your own risk.",
 56    category=ExperimentalClassWarning,
 57)
 58class StreamFacade(AbstractStreamFacade[DefaultStream], Stream):
 59    """
 60    The StreamFacade is a Stream that wraps an AbstractStream and exposes it as a Stream.
 61
 62    All methods either delegate to the wrapped AbstractStream or provide a default implementation.
 63    The default implementations define restrictions imposed on Streams migrated to the new interface. For instance, only source-defined cursors are supported.
 64    """
 65
 66    @classmethod
 67    def create_from_stream(
 68        cls,
 69        stream: Stream,
 70        source: AbstractSource,
 71        logger: logging.Logger,
 72        state: Optional[MutableMapping[str, Any]],
 73        cursor: Cursor,
 74    ) -> Stream:
 75        """
 76        Create a ConcurrentStream from a Stream object.
 77        :param source: The source
 78        :param stream: The stream
 79        :param max_workers: The maximum number of worker thread to use
 80        :return:
 81        """
 82        pk = get_primary_key_from_stream(stream.primary_key)
 83        cursor_field = get_cursor_field_from_stream(stream)
 84
 85        if not source.message_repository:
 86            raise ValueError(
 87                "A message repository is required to emit non-record messages. Please set the message repository on the source."
 88            )
 89
 90        message_repository = source.message_repository
 91        return StreamFacade(
 92            DefaultStream(
 93                partition_generator=StreamPartitionGenerator(
 94                    stream,
 95                    message_repository,
 96                    SyncMode.full_refresh
 97                    if isinstance(cursor, FinalStateCursor)
 98                    else SyncMode.incremental,
 99                    [cursor_field] if cursor_field is not None else None,
100                    state,
101                ),
102                name=stream.name,
103                namespace=stream.namespace,
104                json_schema=stream.get_json_schema(),
105                availability_strategy=AlwaysAvailableAvailabilityStrategy(),
106                primary_key=pk,
107                cursor_field=cursor_field,
108                logger=logger,
109                cursor=cursor,
110            ),
111            stream,
112            cursor,
113            slice_logger=source._slice_logger,
114            logger=logger,
115        )
116
117    @property
118    def state(self) -> MutableMapping[str, Any]:
119        raise NotImplementedError(
120            "This should not be called as part of the Concurrent CDK code. Please report the problem to Airbyte"
121        )
122
123    @state.setter
124    def state(self, value: Mapping[str, Any]) -> None:
125        if "state" in dir(self._legacy_stream):
126            self._legacy_stream.state = value  # type: ignore  # validating `state` is attribute of stream using `if` above
127
128    def __init__(
129        self,
130        stream: DefaultStream,
131        legacy_stream: Stream,
132        cursor: Cursor,
133        slice_logger: SliceLogger,
134        logger: logging.Logger,
135    ):
136        """
137        :param stream: The underlying AbstractStream
138        """
139        self._abstract_stream = stream
140        self._legacy_stream = legacy_stream
141        self._cursor = cursor
142        self._slice_logger = slice_logger
143        self._logger = logger
144
145    def read(
146        self,
147        configured_stream: ConfiguredAirbyteStream,
148        logger: logging.Logger,
149        slice_logger: SliceLogger,
150        stream_state: MutableMapping[str, Any],
151        state_manager: ConnectorStateManager,
152        internal_config: InternalConfig,
153    ) -> Iterable[StreamData]:
154        yield from self._read_records()
155
156    def read_records(
157        self,
158        sync_mode: SyncMode,
159        cursor_field: Optional[List[str]] = None,
160        stream_slice: Optional[Mapping[str, Any]] = None,
161        stream_state: Optional[Mapping[str, Any]] = None,
162    ) -> Iterable[StreamData]:
163        try:
164            yield from self._read_records()
165        except Exception as exc:
166            if hasattr(self._cursor, "state"):
167                state = str(self._cursor.state)
168            else:
169                # This shouldn't happen if the ConcurrentCursor was used
170                state = "unknown; no state attribute was available on the cursor"
171            yield AirbyteMessage(
172                type=Type.LOG,
173                log=AirbyteLogMessage(
174                    level=Level.ERROR, message=f"Cursor State at time of exception: {state}"
175                ),
176            )
177            raise exc
178
179    def _read_records(self) -> Iterable[StreamData]:
180        for partition in self._abstract_stream.generate_partitions():
181            if self._slice_logger.should_log_slice_message(self._logger):
182                yield self._slice_logger.create_slice_log_message(partition.to_slice())
183            for record in partition.read():
184                yield record.data
185
186    @property
187    def name(self) -> str:
188        return self._abstract_stream.name
189
190    @property
191    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
192        # This method is not expected to be called directly. It is only implemented for backward compatibility with the old interface
193        return self.as_airbyte_stream().source_defined_primary_key  # type: ignore # source_defined_primary_key is known to be an Optional[List[List[str]]]
194
195    @property
196    def cursor_field(self) -> Union[str, List[str]]:
197        if self._abstract_stream.cursor_field is None:
198            return []
199        else:
200            return self._abstract_stream.cursor_field
201
202    @property
203    def cursor(self) -> Optional[Cursor]:  # type: ignore[override] # StreamFaced expects to use only airbyte_cdk.sources.streams.concurrent.cursor.Cursor
204        return self._cursor
205
206    @lru_cache(maxsize=None)
207    def get_json_schema(self) -> Mapping[str, Any]:
208        return self._abstract_stream.get_json_schema()
209
210    @property
211    def supports_incremental(self) -> bool:
212        return self._legacy_stream.supports_incremental
213
214    def check_availability(
215        self, logger: logging.Logger, source: Optional["Source"] = None
216    ) -> Tuple[bool, Optional[str]]:
217        """
218        Verifies the stream is available. Delegates to the underlying AbstractStream and ignores the parameters
219        :param logger: (ignored)
220        :param source:  (ignored)
221        :return:
222        """
223        availability = self._abstract_stream.check_availability()
224        return availability.is_available(), availability.message()
225
226    def as_airbyte_stream(self) -> AirbyteStream:
227        return self._abstract_stream.as_airbyte_stream()
228
229    def log_stream_sync_configuration(self) -> None:
230        self._abstract_stream.log_stream_sync_configuration()
231
232    def get_underlying_stream(self) -> DefaultStream:
233        return self._abstract_stream

The StreamFacade is a Stream that wraps an AbstractStream and exposes it as a Stream.

All methods either delegate to the wrapped AbstractStream or provide a default implementation. The default implementations define restrictions imposed on Streams migrated to the new interface. For instance, only source-defined cursors are supported.
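
A hedged wiring sketch: wrap an existing legacy stream for a full-refresh concurrent read, assuming `source` is an AbstractSource with a message repository configured and `legacy_stream` is one of its Stream instances (both placeholders):

    import logging

    from airbyte_cdk import FinalStateCursor, StreamFacade

    cursor = FinalStateCursor(
        stream_name=legacy_stream.name,
        stream_namespace=legacy_stream.namespace,
        message_repository=source.message_repository,
    )
    facade = StreamFacade.create_from_stream(
        stream=legacy_stream,
        source=source,
        logger=logging.getLogger("airbyte"),
        state={},
        cursor=cursor,
    )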

StreamFacade( stream: airbyte_cdk.sources.streams.concurrent.default_stream.DefaultStream, legacy_stream: Stream, cursor: Cursor, slice_logger: airbyte_cdk.sources.utils.slice_logger.SliceLogger, logger: logging.Logger)
128    def __init__(
129        self,
130        stream: DefaultStream,
131        legacy_stream: Stream,
132        cursor: Cursor,
133        slice_logger: SliceLogger,
134        logger: logging.Logger,
135    ):
136        """
137        :param stream: The underlying AbstractStream
138        """
139        self._abstract_stream = stream
140        self._legacy_stream = legacy_stream
141        self._cursor = cursor
142        self._slice_logger = slice_logger
143        self._logger = logger
Parameters
  • stream: The underlying AbstractStream
@classmethod
def create_from_stream( cls, stream: Stream, source: AbstractSource, logger: logging.Logger, state: Optional[MutableMapping[str, Any]], cursor: Cursor) -> Stream:
 66    @classmethod
 67    def create_from_stream(
 68        cls,
 69        stream: Stream,
 70        source: AbstractSource,
 71        logger: logging.Logger,
 72        state: Optional[MutableMapping[str, Any]],
 73        cursor: Cursor,
 74    ) -> Stream:
 75        """
 76        Create a ConcurrentStream from a Stream object.
 77        :param source: The source
 78        :param stream: The stream
 79        :param max_workers: The maximum number of worker thread to use
 80        :return:
 81        """
 82        pk = get_primary_key_from_stream(stream.primary_key)
 83        cursor_field = get_cursor_field_from_stream(stream)
 84
 85        if not source.message_repository:
 86            raise ValueError(
 87                "A message repository is required to emit non-record messages. Please set the message repository on the source."
 88            )
 89
 90        message_repository = source.message_repository
 91        return StreamFacade(
 92            DefaultStream(
 93                partition_generator=StreamPartitionGenerator(
 94                    stream,
 95                    message_repository,
 96                    SyncMode.full_refresh
 97                    if isinstance(cursor, FinalStateCursor)
 98                    else SyncMode.incremental,
 99                    [cursor_field] if cursor_field is not None else None,
100                    state,
101                ),
102                name=stream.name,
103                namespace=stream.namespace,
104                json_schema=stream.get_json_schema(),
105                availability_strategy=AlwaysAvailableAvailabilityStrategy(),
106                primary_key=pk,
107                cursor_field=cursor_field,
108                logger=logger,
109                cursor=cursor,
110            ),
111            stream,
112            cursor,
113            slice_logger=source._slice_logger,
114            logger=logger,
115        )

Create a ConcurrentStream from a Stream object.

Parameters
  • source: The source
  • stream: The stream
  • max_workers: The maximum number of worker threads to use
Returns
state: MutableMapping[str, Any]
117    @property
118    def state(self) -> MutableMapping[str, Any]:
119        raise NotImplementedError(
120            "This should not be called as part of the Concurrent CDK code. Please report the problem to Airbyte"
121        )
def check_availability( self, logger: logging.Logger, source: Optional[Source] = None) -> Tuple[bool, Optional[str]]:
214    def check_availability(
215        self, logger: logging.Logger, source: Optional["Source"] = None
216    ) -> Tuple[bool, Optional[str]]:
217        """
218        Verifies the stream is available. Delegates to the underlying AbstractStream and ignores the parameters
219        :param logger: (ignored)
220        :param source:  (ignored)
221        :return:
222        """
223        availability = self._abstract_stream.check_availability()
224        return availability.is_available(), availability.message()

Verifies the stream is available. Delegates to the underlying AbstractStream and ignores the parameters.

Parameters
  • logger: (ignored)
  • source: (ignored)
Returns
def get_underlying_stream( self) -> airbyte_cdk.sources.streams.concurrent.default_stream.DefaultStream:
232    def get_underlying_stream(self) -> DefaultStream:
233        return self._abstract_stream

Return the underlying stream facade object.

def create_connector_config_control_message( config: MutableMapping[str, Any]) -> AirbyteMessage:
 99def create_connector_config_control_message(config: MutableMapping[str, Any]) -> AirbyteMessage:
100    control_message = AirbyteControlMessage(
101        type=OrchestratorType.CONNECTOR_CONFIG,
102        emitted_at=time.time() * 1000,
103        connectorConfig=AirbyteControlConnectorConfigMessage(config=config),
104    )
105    return AirbyteMessage(type=Type.CONTROL, control=control_message)
def emit_configuration_as_airbyte_control_message(config: MutableMapping[str, Any]) -> None:
90def emit_configuration_as_airbyte_control_message(config: MutableMapping[str, Any]) -> None:
91    """
92    WARNING: deprecated - emit_configuration_as_airbyte_control_message is being deprecated in favor of the MessageRepository mechanism.
93    See the airbyte_cdk.sources.message package
94    """
95    airbyte_message = create_connector_config_control_message(config)
96    print(orjson.dumps(AirbyteMessageSerializer.dump(airbyte_message)).decode())

WARNING: deprecated - emit_configuration_as_airbyte_control_message is being deprecated in favor of the MessageRepository mechanism. See the airbyte_cdk.sources.message package
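
A hedged sketch of the suggested direction: build the control message and hand it to a MessageRepository instead of printing it directly (the in-memory repository and its consume_queue call are assumptions about the message package):

    from airbyte_cdk import create_connector_config_control_message
    from airbyte_cdk.sources.message import InMemoryMessageRepository

    repository = InMemoryMessageRepository()
    repository.emit_message(create_connector_config_control_message({"api_key": "***"}))

    for message in repository.consume_queue():
        print(message.type)  # Type.CONTROL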

 53class AbstractSource(Source, ABC):
 54    """
 55    Abstract base class for an Airbyte Source. Consumers should implement any abstract methods
 56    in this class to create an Airbyte Specification compliant Source.
 57    """
 58
 59    @abstractmethod
 60    def check_connection(
 61        self, logger: logging.Logger, config: Mapping[str, Any]
 62    ) -> Tuple[bool, Optional[Any]]:
 63        """
 64        :param logger: source logger
 65        :param config: The user-provided configuration as specified by the source's spec.
 66          This usually contains information required to check connection e.g. tokens, secrets and keys etc.
 67        :return: A tuple of (boolean, error). If boolean is true, then the connection check is successful
 68          and we can connect to the underlying data source using the provided configuration.
 69          Otherwise, the input config cannot be used to connect to the underlying data source,
 70          and the "error" object should describe what went wrong.
 71          The error object will be cast to string to display the problem to the user.
 72        """
 73
 74    @abstractmethod
 75    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
 76        """
 77        :param config: The user-provided configuration as specified by the source's spec.
 78        Any stream construction related operation should happen here.
 79        :return: A list of the streams in this source connector.
 80        """
 81
 82    # Stream name to instance map for applying output object transformation
 83    _stream_to_instance_map: Dict[str, Stream] = {}
 84    _slice_logger: SliceLogger = DebugSliceLogger()
 85
 86    def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog:
 87        """Implements the Discover operation from the Airbyte Specification.
 88        See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#discover.
 89        """
 90        streams = [stream.as_airbyte_stream() for stream in self.streams(config=config)]
 91        return AirbyteCatalog(streams=streams)
 92
 93    def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus:
 94        """Implements the Check Connection operation from the Airbyte Specification.
 95        See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#check.
 96        """
 97        check_succeeded, error = self.check_connection(logger, config)
 98        if not check_succeeded:
 99            return AirbyteConnectionStatus(status=Status.FAILED, message=repr(error))
100        return AirbyteConnectionStatus(status=Status.SUCCEEDED)
101
102    def read(
103        self,
104        logger: logging.Logger,
105        config: Mapping[str, Any],
106        catalog: ConfiguredAirbyteCatalog,
107        state: Optional[List[AirbyteStateMessage]] = None,
108    ) -> Iterator[AirbyteMessage]:
109        """Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/."""
110        logger.info(f"Starting syncing {self.name}")
111        config, internal_config = split_config(config)
112        # TODO assert all streams exist in the connector
113        # get the streams once in case the connector needs to make any queries to generate them
114        stream_instances = {s.name: s for s in self.streams(config)}
115        state_manager = ConnectorStateManager(state=state)
116        self._stream_to_instance_map = stream_instances
117
118        stream_name_to_exception: MutableMapping[str, AirbyteTracedException] = {}
119
120        with create_timer(self.name) as timer:
121            for configured_stream in catalog.streams:
122                stream_instance = stream_instances.get(configured_stream.stream.name)
123                is_stream_exist = bool(stream_instance)
124                try:
125                    # Used direct reference to `stream_instance` instead of `is_stream_exist` to avoid mypy type checking errors
126                    if not stream_instance:
127                        if not self.raise_exception_on_missing_stream:
128                            yield stream_status_as_airbyte_message(
129                                configured_stream.stream, AirbyteStreamStatus.INCOMPLETE
130                            )
131                            continue
132
133                        error_message = (
134                            f"The stream '{configured_stream.stream.name}' in your connection configuration was not found in the source. "
135                            f"Refresh the schema in your replication settings and remove this stream from future sync attempts."
136                        )
137
138                        # Use configured_stream as stream_instance to support references in error handling.
139                        stream_instance = configured_stream.stream
140
141                        raise AirbyteTracedException(
142                            message="A stream listed in your configuration was not found in the source. Please check the logs for more "
143                            "details.",
144                            internal_message=error_message,
145                            failure_type=FailureType.config_error,
146                        )
147
148                    timer.start_event(f"Syncing stream {configured_stream.stream.name}")
149                    logger.info(f"Marking stream {configured_stream.stream.name} as STARTED")
150                    yield stream_status_as_airbyte_message(
151                        configured_stream.stream, AirbyteStreamStatus.STARTED
152                    )
153                    yield from self._read_stream(
154                        logger=logger,
155                        stream_instance=stream_instance,
156                        configured_stream=configured_stream,
157                        state_manager=state_manager,
158                        internal_config=internal_config,
159                    )
160                    logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
161                    yield stream_status_as_airbyte_message(
162                        configured_stream.stream, AirbyteStreamStatus.COMPLETE
163                    )
164
165                except Exception as e:
166                    yield from self._emit_queued_messages()
167                    logger.exception(
168                        f"Encountered an exception while reading stream {configured_stream.stream.name}"
169                    )
170                    logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
171                    yield stream_status_as_airbyte_message(
172                        configured_stream.stream, AirbyteStreamStatus.INCOMPLETE
173                    )
174
175                    stream_descriptor = StreamDescriptor(name=configured_stream.stream.name)
176
177                    if isinstance(e, AirbyteTracedException):
178                        traced_exception = e
179                        info_message = f"Stopping sync on error from stream {configured_stream.stream.name} because {self.name} does not support continuing syncs on error."
180                    else:
181                        traced_exception = self._serialize_exception(
182                            stream_descriptor, e, stream_instance=stream_instance
183                        )
184                        info_message = f"{self.name} does not support continuing syncs on error from stream {configured_stream.stream.name}"
185
186                    yield traced_exception.as_sanitized_airbyte_message(
187                        stream_descriptor=stream_descriptor
188                    )
189                    stream_name_to_exception[stream_instance.name] = traced_exception  # type: ignore # use configured_stream if stream_instance is None
190                    if self.stop_sync_on_stream_failure:
191                        logger.info(info_message)
192                        break
193                finally:
194                    # Finish read event only if the stream instance exists;
195                    # otherwise, there's no need as it never started
196                    if is_stream_exist:
197                        timer.finish_event()
198                        logger.info(f"Finished syncing {configured_stream.stream.name}")
199                        logger.info(timer.report())
200
201        if len(stream_name_to_exception) > 0:
202            error_message = generate_failed_streams_error_message(
203                {key: [value] for key, value in stream_name_to_exception.items()}
204            )
205            logger.info(error_message)
206            # We still raise at least one exception when a stream raises an exception because the platform currently relies
207            # on a non-zero exit code to determine if a sync attempt has failed. We also raise the exception as a config_error
208            # type because this combined error isn't actionable, but rather the previously emitted individual errors.
209            raise AirbyteTracedException(
210                message=error_message, failure_type=FailureType.config_error
211            )
212        logger.info(f"Finished syncing {self.name}")
213
214    @staticmethod
215    def _serialize_exception(
216        stream_descriptor: StreamDescriptor, e: Exception, stream_instance: Optional[Stream] = None
217    ) -> AirbyteTracedException:
218        display_message = stream_instance.get_error_display_message(e) if stream_instance else None
219        if display_message:
220            return AirbyteTracedException.from_exception(
221                e, message=display_message, stream_descriptor=stream_descriptor
222            )
223        return AirbyteTracedException.from_exception(e, stream_descriptor=stream_descriptor)
224
225    @property
226    def raise_exception_on_missing_stream(self) -> bool:
227        return False
228
229    def _read_stream(
230        self,
231        logger: logging.Logger,
232        stream_instance: Stream,
233        configured_stream: ConfiguredAirbyteStream,
234        state_manager: ConnectorStateManager,
235        internal_config: InternalConfig,
236    ) -> Iterator[AirbyteMessage]:
237        if internal_config.page_size and isinstance(stream_instance, HttpStream):
238            logger.info(
239                f"Setting page size for {stream_instance.name} to {internal_config.page_size}"
240            )
241            stream_instance.page_size = internal_config.page_size
242        logger.debug(
243            f"Syncing configured stream: {configured_stream.stream.name}",
244            extra={
245                "sync_mode": configured_stream.sync_mode,
246                "primary_key": configured_stream.primary_key,
247                "cursor_field": configured_stream.cursor_field,
248            },
249        )
250        stream_instance.log_stream_sync_configuration()
251
252        stream_name = configured_stream.stream.name
253        stream_state = state_manager.get_stream_state(stream_name, stream_instance.namespace)
254
255        # This is a hack. Existing full refresh streams that are converted into resumable full refresh need to discard
256        # the state because the terminal state for a full refresh sync is not compatible with substream resumable full
257        # refresh state. This is only required when running live traffic regression testing since the platform normally
258        # handles whether to pass state
259        if stream_state == {"__ab_no_cursor_state_message": True}:
260            stream_state = {}
261
262        if "state" in dir(stream_instance):
263            stream_instance.state = stream_state  # type: ignore # we check that state in the dir(stream_instance)
264            logger.info(f"Setting state of {self.name} stream to {stream_state}")
265
266        record_iterator = stream_instance.read(
267            configured_stream,
268            logger,
269            self._slice_logger,
270            stream_state,
271            state_manager,
272            internal_config,
273        )
274
275        record_counter = 0
276        logger.info(f"Syncing stream: {stream_name} ")
277        for record_data_or_message in record_iterator:
278            record = self._get_message(record_data_or_message, stream_instance)
279            if record.type == MessageType.RECORD:
280                record_counter += 1
281                if record_counter == 1:
282                    logger.info(f"Marking stream {stream_name} as RUNNING")
283                    # If we just read the first record of the stream, emit the transition to the RUNNING state
284                    yield stream_status_as_airbyte_message(
285                        configured_stream.stream, AirbyteStreamStatus.RUNNING
286                    )
287            yield from self._emit_queued_messages()
288            yield record
289
290        logger.info(f"Read {record_counter} records from {stream_name} stream")
291
292    def _emit_queued_messages(self) -> Iterable[AirbyteMessage]:
293        if self.message_repository:
294            yield from self.message_repository.consume_queue()
295        return
296
297    def _get_message(
298        self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream
299    ) -> AirbyteMessage:
300        """
301        Converts the input to an AirbyteMessage if it is a StreamData. Returns the input as is if it is already an AirbyteMessage
302        """
303        match record_data_or_message:
304            case AirbyteMessage():
305                return record_data_or_message
306            case _:
307                return stream_data_to_airbyte_message(
308                    stream.name,
309                    record_data_or_message,
310                    stream.transformer,
311                    stream.get_json_schema(),
312                )
313
314    @property
315    def message_repository(self) -> Union[None, MessageRepository]:
316        return _default_message_repository
317
318    @property
319    def stop_sync_on_stream_failure(self) -> bool:
320        """
321        WARNING: This function is in-development which means it is subject to change. Use at your own risk.
322
323        By default, when a source encounters an exception while syncing a stream, it will emit an error trace message and then
324        continue syncing the next stream. This can be overwritten on a per-source basis so that the source will stop the sync
325        on the first error seen and emit a single error trace message for that stream.
326        """
327        return False

Abstract base class for an Airbyte Source. Consumers should implement any abstract methods in this class to create an Airbyte Specification compliant Source.

@abstractmethod
def check_connection( self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
59    @abstractmethod
60    def check_connection(
61        self, logger: logging.Logger, config: Mapping[str, Any]
62    ) -> Tuple[bool, Optional[Any]]:
63        """
64        :param logger: source logger
65        :param config: The user-provided configuration as specified by the source's spec.
66          This usually contains information required to check connection e.g. tokens, secrets and keys etc.
67        :return: A tuple of (boolean, error). If boolean is true, then the connection check is successful
68          and we can connect to the underlying data source using the provided configuration.
69          Otherwise, the input config cannot be used to connect to the underlying data source,
70          and the "error" object should describe what went wrong.
71          The error object will be cast to string to display the problem to the user.
72        """
Parameters
  • logger: source logger
  • config: The user-provided configuration as specified by the source's spec. This usually contains the information required to check the connection, e.g., tokens, secrets, and keys.
Returns

A tuple of (boolean, error). If boolean is true, then the connection check is successful and we can connect to the underlying data source using the provided configuration. Otherwise, the input config cannot be used to connect to the underlying data source, and the "error" object should describe what went wrong. The error object will be cast to string to display the problem to the user.

@abstractmethod
def streams( self, config: Mapping[str, Any]) -> List[Stream]:
74    @abstractmethod
75    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
76        """
77        :param config: The user-provided configuration as specified by the source's spec.
78        Any stream construction related operation should happen here.
79        :return: A list of the streams in this source connector.
80        """
Parameters
  • config: The user-provided configuration as specified by the source's spec. Any stream construction related operation should happen here.
Returns

A list of the streams in this source connector.
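
Taken together, check_connection and streams are usually the only two methods a simple source has to implement. The following is a minimal sketch; ExampleSource, CustomersStream, and the api_key config field are illustrative names rather than CDK-provided ones.

    # Minimal sketch of an AbstractSource subclass (illustrative names only).
    import logging
    from typing import Any, Iterable, List, Mapping, Optional, Tuple

    from airbyte_cdk import AbstractSource
    from airbyte_cdk.models import SyncMode
    from airbyte_cdk.sources.streams import Stream


    class CustomersStream(Stream):
        primary_key = "id"

        def get_json_schema(self) -> Mapping[str, Any]:
            # A static schema; real streams often load this from a bundled JSON file instead.
            return {"type": "object", "properties": {"id": {"type": "string"}}}

        def read_records(
            self,
            sync_mode: SyncMode,
            cursor_field: Optional[List[str]] = None,
            stream_slice: Optional[Mapping[str, Any]] = None,
            stream_state: Optional[Mapping[str, Any]] = None,
        ) -> Iterable[Mapping[str, Any]]:
            # Fetch records from the upstream API here; one hardcoded record keeps the sketch runnable.
            yield {"id": "1"}


    class ExampleSource(AbstractSource):
        def check_connection(
            self, logger: logging.Logger, config: Mapping[str, Any]
        ) -> Tuple[bool, Optional[Any]]:
            # Cheap validation of the user-provided config; a real source would usually make a test API call.
            if not config.get("api_key"):
                return False, "The 'api_key' field is required."
            return True, None

        def streams(self, config: Mapping[str, Any]) -> List[Stream]:
            # Construct every stream instance exposed by this source.
            return [CustomersStream()]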

def discover( self, logger: logging.Logger, config: Mapping[str, Any]) -> airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteCatalog:
86    def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog:
87        """Implements the Discover operation from the Airbyte Specification.
88        See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#discover.
89        """
90        streams = [stream.as_airbyte_stream() for stream in self.streams(config=config)]
91        return AirbyteCatalog(streams=streams)

Implements the Discover operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#discover.

def check( self, logger: logging.Logger, config: Mapping[str, Any]) -> airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteConnectionStatus:
 93    def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus:
 94        """Implements the Check Connection operation from the Airbyte Specification.
 95        See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#check.
 96        """
 97        check_succeeded, error = self.check_connection(logger, config)
 98        if not check_succeeded:
 99            return AirbyteConnectionStatus(status=Status.FAILED, message=repr(error))
100        return AirbyteConnectionStatus(status=Status.SUCCEEDED)

Implements the Check Connection operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#check.

def read( self, logger: logging.Logger, config: Mapping[str, Any], catalog: airbyte_protocol_dataclasses.models.airbyte_protocol.ConfiguredAirbyteCatalog, state: Optional[List[airbyte_cdk.models.airbyte_protocol.AirbyteStateMessage]] = None) -> Iterator[AirbyteMessage]:
102    def read(
103        self,
104        logger: logging.Logger,
105        config: Mapping[str, Any],
106        catalog: ConfiguredAirbyteCatalog,
107        state: Optional[List[AirbyteStateMessage]] = None,
108    ) -> Iterator[AirbyteMessage]:
109        """Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/."""
110        logger.info(f"Starting syncing {self.name}")
111        config, internal_config = split_config(config)
112        # TODO assert all streams exist in the connector
113        # get the streams once in case the connector needs to make any queries to generate them
114        stream_instances = {s.name: s for s in self.streams(config)}
115        state_manager = ConnectorStateManager(state=state)
116        self._stream_to_instance_map = stream_instances
117
118        stream_name_to_exception: MutableMapping[str, AirbyteTracedException] = {}
119
120        with create_timer(self.name) as timer:
121            for configured_stream in catalog.streams:
122                stream_instance = stream_instances.get(configured_stream.stream.name)
123                is_stream_exist = bool(stream_instance)
124                try:
125                    # Used direct reference to `stream_instance` instead of `is_stream_exist` to avoid mypy type checking errors
126                    if not stream_instance:
127                        if not self.raise_exception_on_missing_stream:
128                            yield stream_status_as_airbyte_message(
129                                configured_stream.stream, AirbyteStreamStatus.INCOMPLETE
130                            )
131                            continue
132
133                        error_message = (
134                            f"The stream '{configured_stream.stream.name}' in your connection configuration was not found in the source. "
135                            f"Refresh the schema in your replication settings and remove this stream from future sync attempts."
136                        )
137
138                        # Use configured_stream as stream_instance to support references in error handling.
139                        stream_instance = configured_stream.stream
140
141                        raise AirbyteTracedException(
142                            message="A stream listed in your configuration was not found in the source. Please check the logs for more "
143                            "details.",
144                            internal_message=error_message,
145                            failure_type=FailureType.config_error,
146                        )
147
148                    timer.start_event(f"Syncing stream {configured_stream.stream.name}")
149                    logger.info(f"Marking stream {configured_stream.stream.name} as STARTED")
150                    yield stream_status_as_airbyte_message(
151                        configured_stream.stream, AirbyteStreamStatus.STARTED
152                    )
153                    yield from self._read_stream(
154                        logger=logger,
155                        stream_instance=stream_instance,
156                        configured_stream=configured_stream,
157                        state_manager=state_manager,
158                        internal_config=internal_config,
159                    )
160                    logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
161                    yield stream_status_as_airbyte_message(
162                        configured_stream.stream, AirbyteStreamStatus.COMPLETE
163                    )
164
165                except Exception as e:
166                    yield from self._emit_queued_messages()
167                    logger.exception(
168                        f"Encountered an exception while reading stream {configured_stream.stream.name}"
169                    )
170                    logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
171                    yield stream_status_as_airbyte_message(
172                        configured_stream.stream, AirbyteStreamStatus.INCOMPLETE
173                    )
174
175                    stream_descriptor = StreamDescriptor(name=configured_stream.stream.name)
176
177                    if isinstance(e, AirbyteTracedException):
178                        traced_exception = e
179                        info_message = f"Stopping sync on error from stream {configured_stream.stream.name} because {self.name} does not support continuing syncs on error."
180                    else:
181                        traced_exception = self._serialize_exception(
182                            stream_descriptor, e, stream_instance=stream_instance
183                        )
184                        info_message = f"{self.name} does not support continuing syncs on error from stream {configured_stream.stream.name}"
185
186                    yield traced_exception.as_sanitized_airbyte_message(
187                        stream_descriptor=stream_descriptor
188                    )
189                    stream_name_to_exception[stream_instance.name] = traced_exception  # type: ignore # use configured_stream if stream_instance is None
190                    if self.stop_sync_on_stream_failure:
191                        logger.info(info_message)
192                        break
193                finally:
194                    # Finish read event only if the stream instance exists;
195                    # otherwise, there's no need as it never started
196                    if is_stream_exist:
197                        timer.finish_event()
198                        logger.info(f"Finished syncing {configured_stream.stream.name}")
199                        logger.info(timer.report())
200
201        if len(stream_name_to_exception) > 0:
202            error_message = generate_failed_streams_error_message(
203                {key: [value] for key, value in stream_name_to_exception.items()}
204            )
205            logger.info(error_message)
206            # We still raise at least one exception when a stream raises an exception because the platform currently relies
207            # on a non-zero exit code to determine if a sync attempt has failed. We also raise the exception as a config_error
208            # type because this combined error isn't actionable, but rather the previously emitted individual errors.
209            raise AirbyteTracedException(
210                message=error_message, failure_type=FailureType.config_error
211            )
212        logger.info(f"Finished syncing {self.name}")

Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/.
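
Connectors rarely call read() directly; the CDK entrypoint drives spec, check, discover, and read from the command line. A minimal sketch, assuming the illustrative ExampleSource defined earlier:

    # Hedged sketch of a connector entrypoint; ExampleSource is the illustrative class from the
    # check_connection()/streams() example above.
    import sys

    from airbyte_cdk.entrypoint import launch


    def run() -> None:
        source = ExampleSource()
        # launch() parses the Airbyte CLI arguments (spec/check/discover/read) and invokes read() for us.
        launch(source, sys.argv[1:])


    if __name__ == "__main__":
        run()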

raise_exception_on_missing_stream: bool
225    @property
226    def raise_exception_on_missing_stream(self) -> bool:
227        return False
message_repository: Optional[MessageRepository]
314    @property
315    def message_repository(self) -> Union[None, MessageRepository]:
316        return _default_message_repository
stop_sync_on_stream_failure: bool
318    @property
319    def stop_sync_on_stream_failure(self) -> bool:
320        """
321        WARNING: This function is in-development which means it is subject to change. Use at your own risk.
322
323        By default, when a source encounters an exception while syncing a stream, it will emit an error trace message and then
324        continue syncing the next stream. This can be overwritten on a per-source basis so that the source will stop the sync
325        on the first error seen and emit a single error trace message for that stream.
326        """
327        return False

WARNING: This function is in-development which means it is subject to change. Use at your own risk.

By default, when a source encounters an exception while syncing a stream, it will emit an error trace message and then continue syncing the next stream. This can be overwritten on a per-source basis so that the source will stop the sync on the first error seen and emit a single error trace message for that stream.
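
A source can opt into fail-fast behaviour by overriding the property, as in this small sketch (reusing the illustrative ExampleSource from above):

    # Hedged sketch: stop the whole sync on the first stream error instead of continuing.
    class FailFastExampleSource(ExampleSource):
        @property
        def stop_sync_on_stream_failure(self) -> bool:
            return True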

class BaseConfig(pydantic.v1.main.BaseModel):
13class BaseConfig(BaseModel):
14    """Base class for connector spec, adds the following behaviour:
15
16    - resolve $ref and replace it with definition
17    - replace all occurrences of anyOf with oneOf
18    - drop description
19    """
20
21    @classmethod
22    def schema(cls, *args: Any, **kwargs: Any) -> Dict[str, Any]:
23        """We're overriding the schema classmethod to enable some post-processing"""
24        schema = super().schema(*args, **kwargs)
25        rename_key(schema, old_key="anyOf", new_key="oneOf")  # UI supports only oneOf
26        expand_refs(schema)
27        schema.pop("description", None)  # description added from the docstring
28        return schema

Base class for connector spec, adds the following behaviour:

  • resolve $ref and replace it with definition
  • replace all occurrences of anyOf with oneOf
  • drop description
@classmethod
def schema(cls, *args: Any, **kwargs: Any) -> Dict[str, Any]:
21    @classmethod
22    def schema(cls, *args: Any, **kwargs: Any) -> Dict[str, Any]:
23        """We're overriding the schema classmethod to enable some post-processing"""
24        schema = super().schema(*args, **kwargs)
25        rename_key(schema, old_key="anyOf", new_key="oneOf")  # UI supports only oneOf
26        expand_refs(schema)
27        schema.pop("description", None)  # description added from the docstring
28        return schema

We're overriding the schema classmethod to enable some post-processing
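
In practice, a connector's spec is often written as a BaseConfig subclass so that schema() can generate the connection specification. The following is a minimal sketch; the class name, fields, and the airbyte_secret extra are illustrative assumptions, not CDK requirements.

    # Hedged sketch of a connector spec built on BaseConfig (field names are illustrative).
    from pydantic.v1 import Field

    from airbyte_cdk.sources.config import BaseConfig


    class ExampleSourceSpec(BaseConfig):
        """Example Source Spec"""

        api_key: str = Field(..., title="API Key", airbyte_secret=True)
        start_date: str = Field("2020-01-01", title="Start Date")


    # schema() applies the post-processing described above: anyOf -> oneOf, $ref expansion,
    # and removal of the top-level description that pydantic copies from the docstring.
    connection_specification = ExampleSourceSpec.schema()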

class BaseConnector(abc.ABC, typing.Generic[~TConfig]):
 34class BaseConnector(ABC, Generic[TConfig]):
 35    # configure whether the `check_config_against_spec_or_exit()` needs to be called
 36    check_config_against_spec: bool = True
 37
 38    @abstractmethod
 39    def configure(self, config: Mapping[str, Any], temp_dir: str) -> TConfig:
 40        """
 41        Persist config in temporary directory to run the Source job
 42        """
 43
 44    @staticmethod
 45    def read_config(config_path: str) -> Mapping[str, Any]:
 46        config = BaseConnector._read_json_file(config_path)
 47        if isinstance(config, Mapping):
 48            return config
 49        else:
 50            raise ValueError(
 51                f"The content of {config_path} is not an object and therefore is not a valid config. Please ensure the file represent a config."
 52            )
 53
 54    @staticmethod
 55    def _read_json_file(file_path: str) -> Any:
 56        with open(file_path, "r") as file:
 57            contents = file.read()
 58
 59        try:
 60            return json.loads(contents)
 61        except json.JSONDecodeError as error:
 62            raise ValueError(
 63                f"Could not read json file {file_path}: {error}. Please ensure that it is a valid JSON."
 64            )
 65
 66    @staticmethod
 67    def write_config(config: TConfig, config_path: str) -> None:
 68        with open(config_path, "w") as fh:
 69            fh.write(json.dumps(config))
 70
 71    def spec(self, logger: logging.Logger) -> ConnectorSpecification:
 72        """
 73        Returns the spec for this integration. The spec is a JSON-Schema object describing the required configurations (e.g: username and password)
 74        required to run this integration. By default, this will be loaded from a "spec.yaml" or a "spec.json" in the package root.
 75        """
 76
 77        package = self.__class__.__module__.split(".")[0]
 78
 79        yaml_spec = load_optional_package_file(package, "spec.yaml")
 80        json_spec = load_optional_package_file(package, "spec.json")
 81
 82        if yaml_spec and json_spec:
 83            raise RuntimeError(
 84                "Found multiple spec files in the package. Only one of spec.yaml or spec.json should be provided."
 85            )
 86
 87        if yaml_spec:
 88            spec_obj = yaml.load(yaml_spec, Loader=yaml.SafeLoader)
 89        elif json_spec:
 90            try:
 91                spec_obj = json.loads(json_spec)
 92            except json.JSONDecodeError as error:
 93                raise ValueError(
 94                    f"Could not read json spec file: {error}. Please ensure that it is a valid JSON."
 95                )
 96        else:
 97            raise FileNotFoundError("Unable to find spec.yaml or spec.json in the package.")
 98
 99        return ConnectorSpecificationSerializer.load(spec_obj)
100
101    @abstractmethod
102    def check(self, logger: logging.Logger, config: TConfig) -> AirbyteConnectionStatus:
103        """
104        Tests if the input configuration can be used to successfully connect to the integration e.g: if a provided Stripe API token can be used to connect
105        to the Stripe API.
106        """

Abstract base class for all Airbyte connectors. It handles reading and persisting connector configuration (read_config, write_config, configure), loads the connector specification from a bundled spec.yaml or spec.json, and declares the abstract check operation that concrete connectors must implement.

check_config_against_spec: bool = True
@abstractmethod
def configure(self, config: Mapping[str, Any], temp_dir: str) -> ~TConfig:
38    @abstractmethod
39    def configure(self, config: Mapping[str, Any], temp_dir: str) -> TConfig:
40        """
41        Persist config in temporary directory to run the Source job
42        """

Persist config in temporary directory to run the Source job

@staticmethod
def read_config(config_path: str) -> Mapping[str, Any]:
44    @staticmethod
45    def read_config(config_path: str) -> Mapping[str, Any]:
46        config = BaseConnector._read_json_file(config_path)
47        if isinstance(config, Mapping):
48            return config
49        else:
50            raise ValueError(
51                f"The content of {config_path} is not an object and therefore is not a valid config. Please ensure the file represent a config."
52            )
@staticmethod
def write_config(config: ~TConfig, config_path: str) -> None:
66    @staticmethod
67    def write_config(config: TConfig, config_path: str) -> None:
68        with open(config_path, "w") as fh:
69            fh.write(json.dumps(config))
def spec( self, logger: logging.Logger) -> airbyte_protocol_dataclasses.models.airbyte_protocol.ConnectorSpecification:
71    def spec(self, logger: logging.Logger) -> ConnectorSpecification:
72        """
73        Returns the spec for this integration. The spec is a JSON-Schema object describing the required configurations (e.g: username and password)
74        required to run this integration. By default, this will be loaded from a "spec.yaml" or a "spec.json" in the package root.
75        """
76
77        package = self.__class__.__module__.split(".")[0]
78
79        yaml_spec = load_optional_package_file(package, "spec.yaml")
80        json_spec = load_optional_package_file(package, "spec.json")
81
82        if yaml_spec and json_spec:
83            raise RuntimeError(
84                "Found multiple spec files in the package. Only one of spec.yaml or spec.json should be provided."
85            )
86
87        if yaml_spec:
88            spec_obj = yaml.load(yaml_spec, Loader=yaml.SafeLoader)
89        elif json_spec:
90            try:
91                spec_obj = json.loads(json_spec)
92            except json.JSONDecodeError as error:
93                raise ValueError(
94                    f"Could not read json spec file: {error}. Please ensure that it is a valid JSON."
95                )
96        else:
97            raise FileNotFoundError("Unable to find spec.yaml or spec.json in the package.")
98
99        return ConnectorSpecificationSerializer.load(spec_obj)

Returns the spec for this integration. The spec is a JSON-Schema object describing the configurations (e.g., username and password) required to run this integration. By default, it is loaded from a "spec.yaml" or "spec.json" file in the package root.

@abstractmethod
def check( self, logger: logging.Logger, config: ~TConfig) -> airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteConnectionStatus:
101    @abstractmethod
102    def check(self, logger: logging.Logger, config: TConfig) -> AirbyteConnectionStatus:
103        """
104        Tests if the input configuration can be used to successfully connect to the integration e.g: if a provided Stripe API token can be used to connect
105        to the Stripe API.
106        """

Tests whether the input configuration can be used to successfully connect to the integration, e.g., whether a provided Stripe API token can be used to connect to the Stripe API.

124class Connector(DefaultConnectorMixin, BaseConnector[Mapping[str, Any]], ABC): ...

Convenience base class for connectors whose configuration is a plain Mapping[str, Any]; it combines DefaultConnectorMixin with BaseConnector[Mapping[str, Any]].

 30class Destination(Connector, ABC):
 31    VALID_CMDS = {"spec", "check", "write"}
 32
 33    @abstractmethod
 34    def write(
 35        self,
 36        config: Mapping[str, Any],
 37        configured_catalog: ConfiguredAirbyteCatalog,
 38        input_messages: Iterable[AirbyteMessage],
 39    ) -> Iterable[AirbyteMessage]:
 40        """Implement to define how the connector writes data to the destination"""
 41
 42    def _run_check(self, config: Mapping[str, Any]) -> AirbyteMessage:
 43        check_result = self.check(logger, config)
 44        return AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=check_result)
 45
 46    def _parse_input_stream(self, input_stream: io.TextIOWrapper) -> Iterable[AirbyteMessage]:
 47        """Reads from stdin, converting to Airbyte messages"""
 48        for line in input_stream:
 49            try:
 50                yield AirbyteMessageSerializer.load(orjson.loads(line))
 51            except orjson.JSONDecodeError:
 52                logger.info(
 53                    f"ignoring input which can't be deserialized as Airbyte Message: {line}"
 54                )
 55
 56    def _run_write(
 57        self,
 58        config: Mapping[str, Any],
 59        configured_catalog_path: str,
 60        input_stream: io.TextIOWrapper,
 61    ) -> Iterable[AirbyteMessage]:
 62        catalog = ConfiguredAirbyteCatalogSerializer.load(
 63            orjson.loads(open(configured_catalog_path).read())
 64        )
 65        input_messages = self._parse_input_stream(input_stream)
 66        logger.info("Begin writing to the destination...")
 67        yield from self.write(
 68            config=config, configured_catalog=catalog, input_messages=input_messages
 69        )
 70        logger.info("Writing complete.")
 71
 72    def parse_args(self, args: List[str]) -> argparse.Namespace:
 73        """
 74        :param args: commandline arguments
 75        :return:
 76        """
 77
 78        parent_parser = argparse.ArgumentParser(add_help=False)
 79        main_parser = argparse.ArgumentParser()
 80        subparsers = main_parser.add_subparsers(title="commands", dest="command")
 81
 82        # spec
 83        subparsers.add_parser(
 84            "spec", help="outputs the json configuration specification", parents=[parent_parser]
 85        )
 86
 87        # check
 88        check_parser = subparsers.add_parser(
 89            "check", help="checks the config can be used to connect", parents=[parent_parser]
 90        )
 91        required_check_parser = check_parser.add_argument_group("required named arguments")
 92        required_check_parser.add_argument(
 93            "--config", type=str, required=True, help="path to the json configuration file"
 94        )
 95
 96        # write
 97        write_parser = subparsers.add_parser(
 98            "write", help="Writes data to the destination", parents=[parent_parser]
 99        )
100        write_required = write_parser.add_argument_group("required named arguments")
101        write_required.add_argument(
102            "--config", type=str, required=True, help="path to the JSON configuration file"
103        )
104        write_required.add_argument(
105            "--catalog", type=str, required=True, help="path to the configured catalog JSON file"
106        )
107
108        parsed_args = main_parser.parse_args(args)
109        cmd = parsed_args.command
110        if not cmd:
111            raise Exception("No command entered. ")
112        elif cmd not in ["spec", "check", "write"]:
113            # This is technically dead code since parse_args() would fail if this was the case
114            # But it's non-obvious enough to warrant placing it here anyways
115            raise Exception(f"Unknown command entered: {cmd}")
116
117        return parsed_args
118
119    def run_cmd(self, parsed_args: argparse.Namespace) -> Iterable[AirbyteMessage]:
120        cmd = parsed_args.command
121        if cmd not in self.VALID_CMDS:
122            raise Exception(f"Unrecognized command: {cmd}")
123
124        spec = self.spec(logger)
125        if cmd == "spec":
126            yield AirbyteMessage(type=Type.SPEC, spec=spec)
127            return
128        config = self.read_config(config_path=parsed_args.config)
129        if self.check_config_against_spec or cmd == "check":
130            try:
131                check_config_against_spec_or_exit(config, spec)
132            except AirbyteTracedException as traced_exc:
133                connection_status = traced_exc.as_connection_status_message()
134                if connection_status and cmd == "check":
135                    yield connection_status
136                    return
137                raise traced_exc
138
139        if cmd == "check":
140            yield self._run_check(config=config)
141        elif cmd == "write":
142            # Wrap in UTF-8 to override any other input encodings
143            wrapped_stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8")
144            yield from self._run_write(
145                config=config,
146                configured_catalog_path=parsed_args.catalog,
147                input_stream=wrapped_stdin,
148            )
149
150    def run(self, args: List[str]) -> None:
151        init_uncaught_exception_handler(logger)
152        parsed_args = self.parse_args(args)
153        output_messages = self.run_cmd(parsed_args)
154        for message in output_messages:
155            print(orjson.dumps(AirbyteMessageSerializer.dump(message)).decode())

Abstract base class for an Airbyte Destination. Subclasses implement write() to persist incoming records; spec, check, command-line argument parsing, and the run() entrypoint are provided by this class.

VALID_CMDS = {'write', 'check', 'spec'}
@abstractmethod
def write( self, config: Mapping[str, Any], configured_catalog: airbyte_protocol_dataclasses.models.airbyte_protocol.ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]) -> Iterable[AirbyteMessage]:
33    @abstractmethod
34    def write(
35        self,
36        config: Mapping[str, Any],
37        configured_catalog: ConfiguredAirbyteCatalog,
38        input_messages: Iterable[AirbyteMessage],
39    ) -> Iterable[AirbyteMessage]:
40        """Implement to define how the connector writes data to the destination"""

Implement to define how the connector writes data to the destination
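
A typical write() implementation buffers incoming records and re-emits each state message only after the records preceding it have been durably written. The following is a minimal, hedged sketch; ExampleDestination and its no-op persistence are illustrative.

    # Hedged sketch of a Destination subclass (illustrative; no real persistence layer).
    import logging
    from typing import Any, Iterable, Mapping

    from airbyte_cdk.destinations import Destination
    from airbyte_cdk.models import (
        AirbyteConnectionStatus,
        AirbyteMessage,
        ConfiguredAirbyteCatalog,
        Status,
        Type,
    )


    class ExampleDestination(Destination):
        def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus:
            # Verify the destination is reachable and writable with the given config.
            return AirbyteConnectionStatus(status=Status.SUCCEEDED)

        def write(
            self,
            config: Mapping[str, Any],
            configured_catalog: ConfiguredAirbyteCatalog,
            input_messages: Iterable[AirbyteMessage],
        ) -> Iterable[AirbyteMessage]:
            for message in input_messages:
                if message.type == Type.RECORD:
                    # Persist message.record.data here (batching/flushing as appropriate).
                    pass
                elif message.type == Type.STATE:
                    # Emit the state message only after the records before it have been written,
                    # so the platform can safely checkpoint.
                    yield message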

def parse_args(self, args: List[str]) -> argparse.Namespace:
 72    def parse_args(self, args: List[str]) -> argparse.Namespace:
 73        """
 74        :param args: commandline arguments
 75        :return:
 76        """
 77
 78        parent_parser = argparse.ArgumentParser(add_help=False)
 79        main_parser = argparse.ArgumentParser()
 80        subparsers = main_parser.add_subparsers(title="commands", dest="command")
 81
 82        # spec
 83        subparsers.add_parser(
 84            "spec", help="outputs the json configuration specification", parents=[parent_parser]
 85        )
 86
 87        # check
 88        check_parser = subparsers.add_parser(
 89            "check", help="checks the config can be used to connect", parents=[parent_parser]
 90        )
 91        required_check_parser = check_parser.add_argument_group("required named arguments")
 92        required_check_parser.add_argument(
 93            "--config", type=str, required=True, help="path to the json configuration file"
 94        )
 95
 96        # write
 97        write_parser = subparsers.add_parser(
 98            "write", help="Writes data to the destination", parents=[parent_parser]
 99        )
100        write_required = write_parser.add_argument_group("required named arguments")
101        write_required.add_argument(
102            "--config", type=str, required=True, help="path to the JSON configuration file"
103        )
104        write_required.add_argument(
105            "--catalog", type=str, required=True, help="path to the configured catalog JSON file"
106        )
107
108        parsed_args = main_parser.parse_args(args)
109        cmd = parsed_args.command
110        if not cmd:
111            raise Exception("No command entered. ")
112        elif cmd not in ["spec", "check", "write"]:
113            # This is technically dead code since parse_args() would fail if this was the case
114            # But it's non-obvious enough to warrant placing it here anyways
115            raise Exception(f"Unknown command entered: {cmd}")
116
117        return parsed_args
Parameters
  • args: command-line arguments
Returns

The parsed arguments as an argparse.Namespace.
def run_cmd( self, parsed_args: argparse.Namespace) -> Iterable[AirbyteMessage]:
119    def run_cmd(self, parsed_args: argparse.Namespace) -> Iterable[AirbyteMessage]:
120        cmd = parsed_args.command
121        if cmd not in self.VALID_CMDS:
122            raise Exception(f"Unrecognized command: {cmd}")
123
124        spec = self.spec(logger)
125        if cmd == "spec":
126            yield AirbyteMessage(type=Type.SPEC, spec=spec)
127            return
128        config = self.read_config(config_path=parsed_args.config)
129        if self.check_config_against_spec or cmd == "check":
130            try:
131                check_config_against_spec_or_exit(config, spec)
132            except AirbyteTracedException as traced_exc:
133                connection_status = traced_exc.as_connection_status_message()
134                if connection_status and cmd == "check":
135                    yield connection_status
136                    return
137                raise traced_exc
138
139        if cmd == "check":
140            yield self._run_check(config=config)
141        elif cmd == "write":
142            # Wrap in UTF-8 to override any other input encodings
143            wrapped_stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8")
144            yield from self._run_write(
145                config=config,
146                configured_catalog_path=parsed_args.catalog,
147                input_stream=wrapped_stdin,
148            )
def run(self, args: List[str]) -> None:
150    def run(self, args: List[str]) -> None:
151        init_uncaught_exception_handler(logger)
152        parsed_args = self.parse_args(args)
153        output_messages = self.run_cmd(parsed_args)
154        for message in output_messages:
155            print(orjson.dumps(AirbyteMessageSerializer.dump(message)).decode())
56class Source(
57    DefaultConnectorMixin,
58    BaseSource[Mapping[str, Any], List[AirbyteStateMessage], ConfiguredAirbyteCatalog],
59    ABC,
60):
61    # can be overridden to change an input state.
62    @classmethod
63    def read_state(cls, state_path: str) -> List[AirbyteStateMessage]:
64        """
65        Retrieves the input state of a sync by reading from the specified JSON file. Incoming state can be deserialized into either
66        a JSON object for legacy state input or as a list of AirbyteStateMessages for the per-stream state format. Regardless of the
67        incoming input type, it will always be transformed and output as a list of AirbyteStateMessage(s).
68        :param state_path: The filepath to where the stream states are located
69        :return: The complete stream state based on the connector's previous sync
70        """
71        parsed_state_messages = []
72        if state_path:
73            state_obj = BaseConnector._read_json_file(state_path)
74            if state_obj:
75                for state in state_obj:  # type: ignore  # `isinstance(state_obj, List)` ensures that this is a list
76                    parsed_message = AirbyteStateMessageSerializer.load(state)
77                    if (
78                        not parsed_message.stream
79                        and not parsed_message.data
80                        and not parsed_message.global_
81                    ):
82                        raise ValueError(
83                            "AirbyteStateMessage should contain either a stream, global, or state field"
84                        )
85                    parsed_state_messages.append(parsed_message)
86        return parsed_state_messages
87
88    # can be overridden to change an input catalog
89    @classmethod
90    def read_catalog(cls, catalog_path: str) -> ConfiguredAirbyteCatalog:
91        return ConfiguredAirbyteCatalogSerializer.load(cls._read_json_file(catalog_path))
92
93    @property
94    def name(self) -> str:
95        """Source name"""
96        return self.__class__.__name__

Base class for an Airbyte Source. It adds reading of the input state (read_state) and configured catalog (read_catalog) from JSON files; both classmethods can be overridden to customize how those inputs are parsed.

@classmethod
def read_state( cls, state_path: str) -> List[airbyte_cdk.models.airbyte_protocol.AirbyteStateMessage]:
62    @classmethod
63    def read_state(cls, state_path: str) -> List[AirbyteStateMessage]:
64        """
65        Retrieves the input state of a sync by reading from the specified JSON file. Incoming state can be deserialized into either
66        a JSON object for legacy state input or as a list of AirbyteStateMessages for the per-stream state format. Regardless of the
67        incoming input type, it will always be transformed and output as a list of AirbyteStateMessage(s).
68        :param state_path: The filepath to where the stream states are located
69        :return: The complete stream state based on the connector's previous sync
70        """
71        parsed_state_messages = []
72        if state_path:
73            state_obj = BaseConnector._read_json_file(state_path)
74            if state_obj:
75                for state in state_obj:  # type: ignore  # `isinstance(state_obj, List)` ensures that this is a list
76                    parsed_message = AirbyteStateMessageSerializer.load(state)
77                    if (
78                        not parsed_message.stream
79                        and not parsed_message.data
80                        and not parsed_message.global_
81                    ):
82                        raise ValueError(
83                            "AirbyteStateMessage should contain either a stream, global, or state field"
84                        )
85                    parsed_state_messages.append(parsed_message)
86        return parsed_state_messages

Retrieves the input state of a sync by reading from the specified JSON file. Incoming state can be deserialized either as a JSON object for legacy state input or as a list of AirbyteStateMessages for the per-stream state format. Regardless of the incoming input type, it will always be transformed and output as a list of AirbyteStateMessage(s).

Parameters
  • state_path: The filepath to where the stream states are located
Returns

The complete stream state based on the connector's previous sync

@classmethod
def read_catalog( cls, catalog_path: str) -> airbyte_protocol_dataclasses.models.airbyte_protocol.ConfiguredAirbyteCatalog:
89    @classmethod
90    def read_catalog(cls, catalog_path: str) -> ConfiguredAirbyteCatalog:
91        return ConfiguredAirbyteCatalogSerializer.load(cls._read_json_file(catalog_path))
name: str
93    @property
94    def name(self) -> str:
95        """Source name"""
96        return self.__class__.__name__

Source name

@dataclass
class AddFields(airbyte_cdk.RecordTransformation):
 37@dataclass
 38class AddFields(RecordTransformation):
 39    """
 40    Transformation which adds field to an output record. The path of the added field can be nested. Adding nested fields will create all
 41    necessary parent objects (like mkdir -p). Adding fields to an array will extend the array to that index (filling intermediate
 42    indices with null values). So if you add a field at index 5 to the array ["value"], it will become ["value", null, null, null, null,
 43    "new_value"].
 44
 45
 46    This transformation has access to the following contextual values:
 47        record: the record about to be output by the connector
 48        config: the input configuration provided to a connector
 49        stream_state: the current state of the stream
 50        stream_slice: the current stream slice being read
 51
 52
 53
 54    Examples of instantiating this transformation via YAML:
 55    - type: AddFields
 56      fields:
 57        # hardcoded constant
 58        - path: ["path"]
 59          value: "static_value"
 60
 61        # nested path
 62        - path: ["path", "to", "field"]
 63          value: "static"
 64
 65        # from config
 66        - path: ["shop_id"]
 67          value: "{{ config.shop_id }}"
 68
 69        # from stream_interval
 70        - path: ["date"]
 71          value: "{{ stream_interval.start_date }}"
 72
 73        # from record
 74        - path: ["unnested_value"]
 75          value: {{ record.nested.field }}
 76
 77        # from stream_slice
 78        - path: ["start_date"]
 79          value: {{ stream_slice.start_date }}
 80
 81        # by supplying any valid Jinja template directive or expression https://jinja.palletsprojects.com/en/3.1.x/templates/#
 82        - path: ["two_times_two"]
 83          value: {{ 2 * 2 }}
 84
 85    Attributes:
 86        fields (List[AddedFieldDefinition]): A list of transformations (path and corresponding value) that will be added to the record
 87    """
 88
 89    fields: List[AddedFieldDefinition]
 90    parameters: InitVar[Mapping[str, Any]]
 91    condition: str = ""
 92    _parsed_fields: List[ParsedAddFieldDefinition] = field(
 93        init=False, repr=False, default_factory=list
 94    )
 95
 96    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 97        self._filter_interpolator = InterpolatedBoolean(
 98            condition=self.condition, parameters=parameters
 99        )
100
101        for add_field in self.fields:
102            if len(add_field.path) < 1:
103                raise ValueError(
104                    f"Expected a non-zero-length path for the AddFields transformation {add_field}"
105                )
106
107            if not isinstance(add_field.value, InterpolatedString):
108                if not isinstance(add_field.value, str):
109                    raise f"Expected a string value for the AddFields transformation: {add_field}"
110                else:
111                    self._parsed_fields.append(
112                        ParsedAddFieldDefinition(
113                            add_field.path,
114                            InterpolatedString.create(add_field.value, parameters=parameters),
115                            value_type=add_field.value_type,
116                            parameters=parameters,
117                        )
118                    )
119            else:
120                self._parsed_fields.append(
121                    ParsedAddFieldDefinition(
122                        add_field.path,
123                        add_field.value,
124                        value_type=add_field.value_type,
125                        parameters={},
126                    )
127                )
128
129    def transform(
130        self,
131        record: Dict[str, Any],
132        config: Optional[Config] = None,
133        stream_state: Optional[StreamState] = None,
134        stream_slice: Optional[StreamSlice] = None,
135    ) -> None:
136        if config is None:
137            config = {}
138        kwargs = {"record": record, "stream_slice": stream_slice}
139        for parsed_field in self._parsed_fields:
140            valid_types = (parsed_field.value_type,) if parsed_field.value_type else None
141            value = parsed_field.value.eval(config, valid_types=valid_types, **kwargs)
142            is_empty_condition = not self.condition
143            if is_empty_condition or self._filter_interpolator.eval(config, value=value, **kwargs):
144                dpath.new(record, parsed_field.path, value)
145
146    def __eq__(self, other: Any) -> bool:
147        return bool(self.__dict__ == other.__dict__)

Transformation which adds field to an output record. The path of the added field can be nested. Adding nested fields will create all necessary parent objects (like mkdir -p). Adding fields to an array will extend the array to that index (filling intermediate indices with null values). So if you add a field at index 5 to the array ["value"], it will become ["value", null, null, null, null, "new_value"].

This transformation has access to the following contextual values:

  • record: the record about to be output by the connector
  • config: the input configuration provided to a connector
  • stream_state: the current state of the stream
  • stream_slice: the current stream slice being read

Examples of instantiating this transformation via YAML:

- type: AddFields
  fields:
    # hardcoded constant
    - path: ["path"]
      value: "static_value"

    # nested path
    - path: ["path", "to", "field"]
      value: "static"

    # from config
    - path: ["shop_id"]
      value: "{{ config.shop_id }}"

    # from stream_interval
    - path: ["date"]
      value: "{{ stream_interval.start_date }}"

    # from record
    - path: ["unnested_value"]
      value: {{ record.nested.field }}

    # from stream_slice
    - path: ["start_date"]
      value: {{ stream_slice.start_date }}

    # by supplying any valid Jinja template directive or expression https://jinja.palletsprojects.com/en/3.1.x/templates/#
    - path: ["two_times_two"]
      value: {{ 2 * 2 }}
Attributes:
  • fields (List[AddedFieldDefinition]): A list of transformations (path and corresponding value) that will be added to the record
AddFields( fields: List[AddedFieldDefinition], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], condition: str = '')
fields: List[AddedFieldDefinition]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
condition: str = ''
def transform( self, record: Dict[str, Any], config: Optional[Mapping[str, Any]] = None, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None) -> None:
129    def transform(
130        self,
131        record: Dict[str, Any],
132        config: Optional[Config] = None,
133        stream_state: Optional[StreamState] = None,
134        stream_slice: Optional[StreamSlice] = None,
135    ) -> None:
136        if config is None:
137            config = {}
138        kwargs = {"record": record, "stream_slice": stream_slice}
139        for parsed_field in self._parsed_fields:
140            valid_types = (parsed_field.value_type,) if parsed_field.value_type else None
141            value = parsed_field.value.eval(config, valid_types=valid_types, **kwargs)
142            is_empty_condition = not self.condition
143            if is_empty_condition or self._filter_interpolator.eval(config, value=value, **kwargs):
144                dpath.new(record, parsed_field.path, value)

Transform a record by adding, deleting, or mutating fields directly on the record reference passed as an argument.

Parameters
  • record: The input record to be transformed
  • config: The user-provided configuration as specified by the source's spec
  • stream_state: The stream state
  • stream_slice: The stream slice
Returns

None; the record is mutated in place.
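
The same transformation can also be exercised directly from Python, outside of a YAML manifest. The snippet below is a hedged sketch; the import path and the chosen field values are assumptions based on the signatures shown above.

    # Hedged sketch: adding an interpolated field to a record in place with AddFields.
    from airbyte_cdk.sources.declarative.transformations.add_fields import (  # assumed import path
        AddedFieldDefinition,
        AddFields,
    )

    transformation = AddFields(
        fields=[
            AddedFieldDefinition(
                path=["customer_id"],
                value="{{ record['id'] }}",  # interpolated from the record itself
                value_type=None,
                parameters={},
            )
        ],
        parameters={},
    )

    record = {"id": "42"}
    transformation.transform(record, config={})
    # record is mutated in place: {"id": "42", "customer_id": "42"}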

@dataclass(frozen=True)
class AddedFieldDefinition:
17@dataclass(frozen=True)
18class AddedFieldDefinition:
19    """Defines the field to add on a record"""
20
21    path: FieldPointer
22    value: Union[InterpolatedString, str]
23    value_type: Optional[Type[Any]]
24    parameters: InitVar[Mapping[str, Any]]

Defines the field to add on a record

AddedFieldDefinition( path: List[str], value: Union[InterpolatedString, str], value_type: Optional[Type[Any]], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]])
path: List[str]
value: Union[InterpolatedString, str]
value_type: Optional[Type[Any]]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
@dataclass
class ApiKeyAuthenticator(airbyte_cdk.DeclarativeAuthenticator):
24@dataclass
25class ApiKeyAuthenticator(DeclarativeAuthenticator):
26    """
27    ApiKeyAuth sets a request header on the HTTP requests sent.
28
29    The header is of the form:
30    `"<header>": "<token>"`
31
32    For example,
33    `ApiKeyAuthenticator("Authorization", "Bearer hello")`
34    will result in the following header set on the HTTP request
35    `"Authorization": "Bearer hello"`
36
37    Attributes:
38        request_option (RequestOption): request option how to inject the token into the request
39        token_provider (TokenProvider): Provider of the token
40        config (Config): The user-provided configuration as specified by the source's spec
41        parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
42    """
43
44    request_option: RequestOption
45    token_provider: TokenProvider
46    config: Config
47    parameters: InitVar[Mapping[str, Any]]
48
49    @property
50    def auth_header(self) -> str:
51        options = self._get_request_options(RequestOptionType.header)
52        return next(iter(options.keys()), "")
53
54    @property
55    def token(self) -> str:
56        return self.token_provider.get_token()
57
58    def _get_request_options(self, option_type: RequestOptionType) -> Mapping[str, Any]:
59        options: MutableMapping[str, Any] = {}
60        if self.request_option.inject_into == option_type:
61            self.request_option.inject_into_request(options, self.token, self.config)
62        return options
63
64    def get_request_params(self) -> Mapping[str, Any]:
65        return self._get_request_options(RequestOptionType.request_parameter)
66
67    def get_request_body_data(self) -> Union[Mapping[str, Any], str]:
68        return self._get_request_options(RequestOptionType.body_data)
69
70    def get_request_body_json(self) -> Mapping[str, Any]:
71        return self._get_request_options(RequestOptionType.body_json)

ApiKeyAuth sets a request header on the HTTP requests sent.

The header is of the form: "<header>": "<token>"

For example, ApiKeyAuthenticator("Authorization", "Bearer hello") will result in the following header set on the HTTP request "Authorization": "Bearer hello"

Attributes:
  • request_option (RequestOption): request option describing how to inject the token into the request
  • token_provider (TokenProvider): Provider of the token
  • config (Config): The user-provided configuration as specified by the source's spec
  • parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
ApiKeyAuthenticator( request_option: RequestOption, token_provider: airbyte_cdk.sources.declarative.auth.token_provider.TokenProvider, config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]])
request_option: RequestOption
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
auth_header: str
49    @property
50    def auth_header(self) -> str:
51        options = self._get_request_options(RequestOptionType.header)
52        return next(iter(options.keys()), "")

HTTP header to set on the requests

token: str
54    @property
55    def token(self) -> str:
56        return self.token_provider.get_token()

The header value to set on outgoing HTTP requests

def get_request_params(self) -> Mapping[str, Any]:
64    def get_request_params(self) -> Mapping[str, Any]:
65        return self._get_request_options(RequestOptionType.request_parameter)

HTTP request parameter to add to the requests

def get_request_body_data(self) -> Union[Mapping[str, Any], str]:
67    def get_request_body_data(self) -> Union[Mapping[str, Any], str]:
68        return self._get_request_options(RequestOptionType.body_data)

Form-encoded body data to set on the requests

def get_request_body_json(self) -> Mapping[str, Any]:
70    def get_request_body_json(self) -> Mapping[str, Any]:
71        return self._get_request_options(RequestOptionType.body_json)

JSON-encoded body data to set on the requests
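
As a rough usage sketch, the snippet below builds an ApiKeyAuthenticator that injects the token into a request header. The import paths and the RequestOption/InterpolatedStringTokenProvider constructor arguments are assumptions based on this reference and may differ between CDK versions.

from airbyte_cdk import ApiKeyAuthenticator, RequestOption
from airbyte_cdk.sources.declarative.auth.token_provider import InterpolatedStringTokenProvider
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType

config = {"api_key": "hello"}  # hypothetical connector config

authenticator = ApiKeyAuthenticator(
    request_option=RequestOption(
        field_name="Authorization",
        inject_into=RequestOptionType.header,
        parameters={},
    ),
    token_provider=InterpolatedStringTokenProvider(
        api_token="Bearer {{ config.api_key }}",  # assumed argument name
        config=config,
        parameters={},
    ),
    config=config,
    parameters={},
)

# With inject_into=header, the token is sent as `Authorization: Bearer hello`;
# the request-parameter and body options stay empty.
assert authenticator.auth_header == "Authorization"
assert authenticator.token == "Bearer hello"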

class BackoffStrategy(abc.ABC):
12class BackoffStrategy(ABC):
13    @abstractmethod
14    def backoff_time(
15        self,
16        response_or_exception: Optional[Union[requests.Response, requests.RequestException]],
17        attempt_count: int,
18    ) -> Optional[float]:
19        """
20        Override this method to dynamically determine backoff time e.g: by reading the X-Retry-After header.
21
22        This method is called only if should_backoff() returns True for the input request.
23
24        :param response_or_exception: The response or exception that caused the backoff.
25        :param attempt_count: The number of attempts already performed for this request.
26        :return how long to backoff in seconds. The return value may be a floating point number for subsecond precision. Returning None defers backoff
27        to the default backoff behavior (e.g using an exponential algorithm).
28        """
29        pass

Abstract base class for backoff strategies. Subclasses implement backoff_time() to decide how long to wait before retrying a request.

@abstractmethod
def backoff_time( self, response_or_exception: Union[requests.models.Response, requests.exceptions.RequestException, NoneType], attempt_count: int) -> Optional[float]:
13    @abstractmethod
14    def backoff_time(
15        self,
16        response_or_exception: Optional[Union[requests.Response, requests.RequestException]],
17        attempt_count: int,
18    ) -> Optional[float]:
19        """
20        Override this method to dynamically determine backoff time e.g: by reading the X-Retry-After header.
21
22        This method is called only if should_backoff() returns True for the input request.
23
24        :param response_or_exception: The response or exception that caused the backoff.
25        :param attempt_count: The number of attempts already performed for this request.
26        :return how long to backoff in seconds. The return value may be a floating point number for subsecond precision. Returning None defers backoff
27        to the default backoff behavior (e.g using an exponential algorithm).
28        """
29        pass

Override this method to dynamically determine the backoff time, e.g. by reading the X-Retry-After header.

This method is called only if should_backoff() returns True for the input request.

Parameters
  • response_or_exception: The response or exception that caused the backoff.
  • attempt_count: The number of attempts already performed for this request.
Returns

How long to back off, in seconds. The return value may be a floating point number for subsecond precision. Returning None defers backoff to the default backoff behavior (e.g. using an exponential algorithm).
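
To make this concrete, here is a sketch of a custom subclass that honors the standard Retry-After header when present and otherwise defers to the default backoff behavior. It is an illustrative example, not a strategy shipped with the CDK; the top-level import path is assumed from this reference.

from typing import Optional, Union

import requests

from airbyte_cdk import BackoffStrategy


class RetryAfterBackoffStrategy(BackoffStrategy):
    """Hypothetical strategy: wait as long as the server's Retry-After header asks."""

    def backoff_time(
        self,
        response_or_exception: Optional[Union[requests.Response, requests.RequestException]],
        attempt_count: int,
    ) -> Optional[float]:
        if isinstance(response_or_exception, requests.Response):
            retry_after = response_or_exception.headers.get("Retry-After")
            if retry_after is not None:
                try:
                    return float(retry_after)
                except ValueError:
                    pass  # e.g. an HTTP-date value, which this sketch does not parse
        # Returning None defers to the default (e.g. exponential) backoff behavior.
        return None
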
@dataclass
class BasicHttpAuthenticator(airbyte_cdk.DeclarativeAuthenticator):
101@dataclass
102class BasicHttpAuthenticator(DeclarativeAuthenticator):
103    """
104    Builds auth based off the basic authentication scheme as defined by RFC 7617, which transmits credentials as USER ID/password pairs, encoded using base64
105    https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication#basic_authentication_scheme
106
107    The header is of the form
108    `"Authorization": "Basic <encoded_credentials>"`
109
110    Attributes:
111        username (Union[InterpolatedString, str]): The username
112        config (Config): The user-provided configuration as specified by the source's spec
113        password (Union[InterpolatedString, str]): The password
114        parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
115    """
116
117    username: Union[InterpolatedString, str]
118    config: Config
119    parameters: InitVar[Mapping[str, Any]]
120    password: Union[InterpolatedString, str] = ""
121
122    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
123        self._username = InterpolatedString.create(self.username, parameters=parameters)
124        self._password = InterpolatedString.create(self.password, parameters=parameters)
125
126    @property
127    def auth_header(self) -> str:
128        return "Authorization"
129
130    @property
131    def token(self) -> str:
132        auth_string = (
133            f"{self._username.eval(self.config)}:{self._password.eval(self.config)}".encode("utf8")
134        )
135        b64_encoded = base64.b64encode(auth_string).decode("utf8")
136        return f"Basic {b64_encoded}"

Builds auth based off the basic authentication scheme as defined by RFC 7617, which transmits credentials as USER ID/password pairs, encoded using base64 https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication#basic_authentication_scheme

The header is of the form "Authorization": "Basic <encoded_credentials>"

Attributes:
  • username (Union[InterpolatedString, str]): The username
  • config (Config): The user-provided configuration as specified by the source's spec
  • password (Union[InterpolatedString, str]): The password
  • parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
BasicHttpAuthenticator( username: Union[InterpolatedString, str], config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], password: Union[InterpolatedString, str] = '')
username: Union[InterpolatedString, str]
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
password: Union[InterpolatedString, str] = ''
auth_header: str
126    @property
127    def auth_header(self) -> str:
128        return "Authorization"

HTTP header to set on the requests

token: str
130    @property
131    def token(self) -> str:
132        auth_string = (
133            f"{self._username.eval(self.config)}:{self._password.eval(self.config)}".encode("utf8")
134        )
135        b64_encoded = base64.b64encode(auth_string).decode("utf8")
136        return f"Basic {b64_encoded}"

The header value to set on outgoing HTTP requests
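
A minimal sketch of what the computed header value looks like, assuming the connector's config stores the credentials under hypothetical username/password keys:

from airbyte_cdk import BasicHttpAuthenticator

config = {"username": "user", "password": "secret"}  # hypothetical config keys

authenticator = BasicHttpAuthenticator(
    username="{{ config.username }}",
    password="{{ config.password }}",
    config=config,
    parameters={},
)

assert authenticator.auth_header == "Authorization"
# base64("user:secret") == "dXNlcjpzZWNyZXQ="
assert authenticator.token == "Basic dXNlcjpzZWNyZXQ="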

@dataclass
class BearerAuthenticator(airbyte_cdk.DeclarativeAuthenticator):
74@dataclass
75class BearerAuthenticator(DeclarativeAuthenticator):
76    """
77    Authenticator that sets the Authorization header on the HTTP requests sent.
78
79    The header is of the form:
80    `"Authorization": "Bearer <token>"`
81
82    Attributes:
83        token_provider (TokenProvider): Provider of the token
84        config (Config): The user-provided configuration as specified by the source's spec
85        parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
86    """
87
88    token_provider: TokenProvider
89    config: Config
90    parameters: InitVar[Mapping[str, Any]]
91
92    @property
93    def auth_header(self) -> str:
94        return "Authorization"
95
96    @property
97    def token(self) -> str:
98        return f"Bearer {self.token_provider.get_token()}"

Authenticator that sets the Authorization header on the HTTP requests sent.

The header is of the form: "Authorization": "Bearer <token>"

Attributes:
  • token_provider (TokenProvider): Provider of the token
  • config (Config): The user-provided configuration as specified by the source's spec
  • parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
BearerAuthenticator( token_provider: airbyte_cdk.sources.declarative.auth.token_provider.TokenProvider, config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]])
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
auth_header: str
92    @property
93    def auth_header(self) -> str:
94        return "Authorization"

HTTP header to set on the requests

token: str
96    @property
97    def token(self) -> str:
98        return f"Bearer {self.token_provider.get_token()}"

The header value to set on outgoing HTTP requests

@dataclass
class CartesianProductStreamSlicer(airbyte_cdk.sources.declarative.partition_routers.partition_router.PartitionRouter):
 40@dataclass
 41class CartesianProductStreamSlicer(PartitionRouter):
 42    """
 43    Stream slicers that iterates over the cartesian product of input stream slicers
 44    Given 2 stream slicers with the following slices:
 45    A: [{"i": 0}, {"i": 1}, {"i": 2}]
 46    B: [{"s": "hello"}, {"s": "world"}]
 47    the resulting stream slices are
 48    [
 49        {"i": 0, "s": "hello"},
 50        {"i": 0, "s": "world"},
 51        {"i": 1, "s": "hello"},
 52        {"i": 1, "s": "world"},
 53        {"i": 2, "s": "hello"},
 54        {"i": 2, "s": "world"},
 55    ]
 56
 57    Attributes:
 58        stream_slicers (List[PartitionRouter]): Underlying stream slicers. The RequestOptions (e.g: Request headers, parameters, etc..) returned by this slicer are the combination of the RequestOptions of its input slicers. If there are conflicts e.g: two slicers define the same header or request param, the conflict is resolved by taking the value from the first slicer, where ordering is determined by the order in which slicers were input to this composite slicer.
 59    """
 60
 61    stream_slicers: List[PartitionRouter]
 62    parameters: InitVar[Mapping[str, Any]]
 63
 64    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 65        check_for_substream_in_slicers(self.stream_slicers, self.logger.warning)
 66
 67    def get_request_params(
 68        self,
 69        *,
 70        stream_state: Optional[StreamState] = None,
 71        stream_slice: Optional[StreamSlice] = None,
 72        next_page_token: Optional[Mapping[str, Any]] = None,
 73    ) -> Mapping[str, Any]:
 74        return dict(
 75            ChainMap(
 76                *[  # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons
 77                    s.get_request_params(
 78                        stream_state=stream_state,
 79                        stream_slice=stream_slice,
 80                        next_page_token=next_page_token,
 81                    )
 82                    for s in self.stream_slicers
 83                ]
 84            )
 85        )
 86
 87    def get_request_headers(
 88        self,
 89        *,
 90        stream_state: Optional[StreamState] = None,
 91        stream_slice: Optional[StreamSlice] = None,
 92        next_page_token: Optional[Mapping[str, Any]] = None,
 93    ) -> Mapping[str, Any]:
 94        return dict(
 95            ChainMap(
 96                *[  # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons
 97                    s.get_request_headers(
 98                        stream_state=stream_state,
 99                        stream_slice=stream_slice,
100                        next_page_token=next_page_token,
101                    )
102                    for s in self.stream_slicers
103                ]
104            )
105        )
106
107    def get_request_body_data(
108        self,
109        *,
110        stream_state: Optional[StreamState] = None,
111        stream_slice: Optional[StreamSlice] = None,
112        next_page_token: Optional[Mapping[str, Any]] = None,
113    ) -> Mapping[str, Any]:
114        return dict(
115            ChainMap(
116                *[  # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons
117                    s.get_request_body_data(
118                        stream_state=stream_state,
119                        stream_slice=stream_slice,
120                        next_page_token=next_page_token,
121                    )
122                    for s in self.stream_slicers
123                ]
124            )
125        )
126
127    def get_request_body_json(
128        self,
129        *,
130        stream_state: Optional[StreamState] = None,
131        stream_slice: Optional[StreamSlice] = None,
132        next_page_token: Optional[Mapping[str, Any]] = None,
133    ) -> Mapping[str, Any]:
134        return dict(
135            ChainMap(
136                *[  # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons
137                    s.get_request_body_json(
138                        stream_state=stream_state,
139                        stream_slice=stream_slice,
140                        next_page_token=next_page_token,
141                    )
142                    for s in self.stream_slicers
143                ]
144            )
145        )
146
147    def stream_slices(self) -> Iterable[StreamSlice]:
148        sub_slices = (s.stream_slices() for s in self.stream_slicers)
149        product = itertools.product(*sub_slices)
150        for stream_slice_tuple in product:
151            partition = dict(ChainMap(*[s.partition for s in stream_slice_tuple]))  # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons
152            cursor_slices = [s.cursor_slice for s in stream_slice_tuple if s.cursor_slice]
153            if len(cursor_slices) > 1:
154                raise ValueError(
155                    f"There should only be a single cursor slice. Found {cursor_slices}"
156                )
157            if cursor_slices:
158                cursor_slice = cursor_slices[0]
159            else:
160                cursor_slice = {}
161            yield StreamSlice(partition=partition, cursor_slice=cursor_slice)
162
163    def set_initial_state(self, stream_state: StreamState) -> None:
164        """
165        Parent stream states are not supported for cartesian product stream slicer
166        """
167        pass
168
169    def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
170        """
171        Parent stream states are not supported for cartesian product stream slicer
172        """
173        pass
174
175    @property
176    def logger(self) -> logging.Logger:
177        return logging.getLogger("airbyte.CartesianProductStreamSlicer")

Stream slicer that iterates over the cartesian product of its input stream slicers.

Given 2 stream slicers with the following slices:

A: [{"i": 0}, {"i": 1}, {"i": 2}]
B: [{"s": "hello"}, {"s": "world"}]

the resulting stream slices are:

[
    {"i": 0, "s": "hello"},
    {"i": 0, "s": "world"},
    {"i": 1, "s": "hello"},
    {"i": 1, "s": "world"},
    {"i": 2, "s": "hello"},
    {"i": 2, "s": "world"},
]

Attributes:
  • stream_slicers (List[PartitionRouter]): Underlying stream slicers. The RequestOptions (e.g. request headers, parameters, etc.) returned by this slicer are the combination of the RequestOptions of its input slicers. If there are conflicts, e.g. two slicers defining the same header or request param, the conflict is resolved by taking the value from the first slicer, where ordering is determined by the order in which slicers were input to this composite slicer.
CartesianProductStreamSlicer( stream_slicers: List[airbyte_cdk.sources.declarative.partition_routers.PartitionRouter], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]])
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
def get_request_params( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
67    def get_request_params(
68        self,
69        *,
70        stream_state: Optional[StreamState] = None,
71        stream_slice: Optional[StreamSlice] = None,
72        next_page_token: Optional[Mapping[str, Any]] = None,
73    ) -> Mapping[str, Any]:
74        return dict(
75            ChainMap(
76                *[  # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons
77                    s.get_request_params(
78                        stream_state=stream_state,
79                        stream_slice=stream_slice,
80                        next_page_token=next_page_token,
81                    )
82                    for s in self.stream_slicers
83                ]
84            )
85        )

Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.

E.g. you might want to define query parameters for paging if next_page_token is not None.

def get_request_headers( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
 87    def get_request_headers(
 88        self,
 89        *,
 90        stream_state: Optional[StreamState] = None,
 91        stream_slice: Optional[StreamSlice] = None,
 92        next_page_token: Optional[Mapping[str, Any]] = None,
 93    ) -> Mapping[str, Any]:
 94        return dict(
 95            ChainMap(
 96                *[  # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons
 97                    s.get_request_headers(
 98                        stream_state=stream_state,
 99                        stream_slice=stream_slice,
100                        next_page_token=next_page_token,
101                    )
102                    for s in self.stream_slicers
103                ]
104            )
105        )

Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.

def get_request_body_data( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
107    def get_request_body_data(
108        self,
109        *,
110        stream_state: Optional[StreamState] = None,
111        stream_slice: Optional[StreamSlice] = None,
112        next_page_token: Optional[Mapping[str, Any]] = None,
113    ) -> Mapping[str, Any]:
114        return dict(
115            ChainMap(
116                *[  # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons
117                    s.get_request_body_data(
118                        stream_state=stream_state,
119                        stream_slice=stream_slice,
120                        next_page_token=next_page_token,
121                    )
122                    for s in self.stream_slicers
123                ]
124            )
125        )

Specifies how to populate the body of the request with a non-JSON payload.

If it returns a string, the string will be sent as-is. If it returns a dict, it will be converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".

Only one of 'request_body_data' and 'request_body_json' may be overridden at a time.

def get_request_body_json( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
127    def get_request_body_json(
128        self,
129        *,
130        stream_state: Optional[StreamState] = None,
131        stream_slice: Optional[StreamSlice] = None,
132        next_page_token: Optional[Mapping[str, Any]] = None,
133    ) -> Mapping[str, Any]:
134        return dict(
135            ChainMap(
136                *[  # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons
137                    s.get_request_body_json(
138                        stream_state=stream_state,
139                        stream_slice=stream_slice,
140                        next_page_token=next_page_token,
141                    )
142                    for s in self.stream_slicers
143                ]
144            )
145        )

Specifies how to populate the body of the request with a JSON payload.

Only one of 'request_body_data' and 'request_body_json' may be overridden at a time.

def stream_slices(self) -> Iterable[StreamSlice]:
147    def stream_slices(self) -> Iterable[StreamSlice]:
148        sub_slices = (s.stream_slices() for s in self.stream_slicers)
149        product = itertools.product(*sub_slices)
150        for stream_slice_tuple in product:
151            partition = dict(ChainMap(*[s.partition for s in stream_slice_tuple]))  # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons
152            cursor_slices = [s.cursor_slice for s in stream_slice_tuple if s.cursor_slice]
153            if len(cursor_slices) > 1:
154                raise ValueError(
155                    f"There should only be a single cursor slice. Found {cursor_slices}"
156                )
157            if cursor_slices:
158                cursor_slice = cursor_slices[0]
159            else:
160                cursor_slice = {}
161            yield StreamSlice(partition=partition, cursor_slice=cursor_slice)

Defines stream slices

Returns

An iterable of stream slices
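
To make the combination behavior concrete, here is a standalone sketch in plain Python (no CDK classes) that mirrors the logic of stream_slices above: take the cartesian product of the sub-slicers' slices and merge each resulting tuple, with earlier slicers winning on conflicting keys.

import itertools
from collections import ChainMap

# Hypothetical slices produced by two underlying slicers.
slicer_a_slices = [{"i": 0}, {"i": 1}, {"i": 2}]
slicer_b_slices = [{"s": "hello"}, {"s": "world"}]

combined = [
    # ChainMap resolves duplicate keys in favor of the first mapping,
    # matching the "first slicer wins" conflict rule described above.
    dict(ChainMap(*slice_tuple))
    for slice_tuple in itertools.product(slicer_a_slices, slicer_b_slices)
]

# combined == [{"i": 0, "s": "hello"}, {"i": 0, "s": "world"},
#              {"i": 1, "s": "hello"}, {"i": 1, "s": "world"},
#              {"i": 2, "s": "hello"}, {"i": 2, "s": "world"}]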

def set_initial_state(self, stream_state: Mapping[str, Any]) -> None:
163    def set_initial_state(self, stream_state: StreamState) -> None:
164        """
165        Parent stream states are not supported for cartesian product stream slicer
166        """
167        pass

Parent stream states are not supported for cartesian product stream slicer

def get_stream_state(self) -> Optional[Mapping[str, Mapping[str, Any]]]:
169    def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
170        """
171        Parent stream states are not supported for cartesian product stream slicer
172        """
173        pass

Parent stream states are not supported for cartesian product stream slicer

logger: logging.Logger
175    @property
176    def logger(self) -> logging.Logger:
177        return logging.getLogger("airbyte.CartesianProductStreamSlicer")
@dataclass
class CursorPaginationStrategy(airbyte_cdk.PaginationStrategy):
24@dataclass
25class CursorPaginationStrategy(PaginationStrategy):
26    """
27    Pagination strategy that evaluates an interpolated string to define the next page token
28
29    Attributes:
30        page_size (Optional[int]): the number of records to request
31        cursor_value (Union[InterpolatedString, str]): template string evaluating to the cursor value
32        config (Config): connection config
33        stop_condition (Optional[InterpolatedBoolean]): template string evaluating when to stop paginating
34        decoder (Decoder): decoder to decode the response
35    """
36
37    cursor_value: Union[InterpolatedString, str]
38    config: Config
39    parameters: InitVar[Mapping[str, Any]]
40    page_size: Optional[int] = None
41    stop_condition: Optional[Union[InterpolatedBoolean, str]] = None
42    decoder: Decoder = field(
43        default_factory=lambda: PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
44    )
45
46    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
47        if isinstance(self.cursor_value, str):
48            self._cursor_value = InterpolatedString.create(self.cursor_value, parameters=parameters)
49        else:
50            self._cursor_value = self.cursor_value
51        if isinstance(self.stop_condition, str):
52            self._stop_condition: Optional[InterpolatedBoolean] = InterpolatedBoolean(
53                condition=self.stop_condition, parameters=parameters
54            )
55        else:
56            self._stop_condition = self.stop_condition
57
58    @property
59    def initial_token(self) -> Optional[Any]:
60        """
61        CursorPaginationStrategy does not have an initial value because the next cursor is typically included
62        in the response of the first request. For Resumable Full Refresh streams that checkpoint the page
63        cursor, the next cursor should be read from the state or stream slice object.
64        """
65        return None
66
67    def next_page_token(
68        self,
69        response: requests.Response,
70        last_page_size: int,
71        last_record: Optional[Record],
72        last_page_token_value: Optional[Any] = None,
73    ) -> Optional[Any]:
74        decoded_response = next(self.decoder.decode(response))
75        # The default way that link is presented in requests.Response is a string of various links (last, next, etc). This
76        # is not indexable or useful for parsing the cursor, so we replace it with the link dictionary from response.links
77        headers: Dict[str, Any] = dict(response.headers)
78        headers["link"] = response.links
79        if self._stop_condition:
80            should_stop = self._stop_condition.eval(
81                self.config,
82                response=decoded_response,
83                headers=headers,
84                last_record=last_record,
85                last_page_size=last_page_size,
86            )
87            if should_stop:
88                return None
89        token = self._cursor_value.eval(
90            config=self.config,
91            response=decoded_response,
92            headers=headers,
93            last_record=last_record,
94            last_page_size=last_page_size,
95        )
96        return token if token else None
97
98    def get_page_size(self) -> Optional[int]:
99        return self.page_size

Pagination strategy that evaluates an interpolated string to define the next page token

Attributes:
  • page_size (Optional[int]): the number of records to request
  • cursor_value (Union[InterpolatedString, str]): template string evaluating to the cursor value
  • config (Config): connection config
  • stop_condition (Optional[InterpolatedBoolean]): template string evaluating when to stop paginating
  • decoder (Decoder): decoder to decode the response
CursorPaginationStrategy( cursor_value: Union[InterpolatedString, str], config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], page_size: Optional[int] = None, stop_condition: Union[InterpolatedBoolean, str, NoneType] = None, decoder: Decoder = <factory>)
cursor_value: Union[InterpolatedString, str]
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
page_size: Optional[int] = None
stop_condition: Union[InterpolatedBoolean, str, NoneType] = None
decoder: Decoder
initial_token: Optional[Any]
58    @property
59    def initial_token(self) -> Optional[Any]:
60        """
61        CursorPaginationStrategy does not have an initial value because the next cursor is typically included
62        in the response of the first request. For Resumable Full Refresh streams that checkpoint the page
63        cursor, the next cursor should be read from the state or stream slice object.
64        """
65        return None

CursorPaginationStrategy does not have an initial value because the next cursor is typically included in the response of the first request. For Resumable Full Refresh streams that checkpoint the page cursor, the next cursor should be read from the state or stream slice object.

def next_page_token( self, response: requests.models.Response, last_page_size: int, last_record: Optional[Record], last_page_token_value: Optional[Any] = None) -> Optional[Any]:
67    def next_page_token(
68        self,
69        response: requests.Response,
70        last_page_size: int,
71        last_record: Optional[Record],
72        last_page_token_value: Optional[Any] = None,
73    ) -> Optional[Any]:
74        decoded_response = next(self.decoder.decode(response))
75        # The default way that link is presented in requests.Response is a string of various links (last, next, etc). This
76        # is not indexable or useful for parsing the cursor, so we replace it with the link dictionary from response.links
77        headers: Dict[str, Any] = dict(response.headers)
78        headers["link"] = response.links
79        if self._stop_condition:
80            should_stop = self._stop_condition.eval(
81                self.config,
82                response=decoded_response,
83                headers=headers,
84                last_record=last_record,
85                last_page_size=last_page_size,
86            )
87            if should_stop:
88                return None
89        token = self._cursor_value.eval(
90            config=self.config,
91            response=decoded_response,
92            headers=headers,
93            last_record=last_record,
94            last_page_size=last_page_size,
95        )
96        return token if token else None
Parameters
  • response: response to process
  • last_page_size: the number of records read from the response
  • last_record: the last record extracted from the response
  • last_page_token_value: The current value of the page token made on the last request
Returns

next page token. Returns None if there are no more pages to fetch
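
As a rough sketch of how the templates are evaluated: the decoded response body is exposed to cursor_value and stop_condition as `response`. The `next` field name, the stop-condition expression, and the hand-built requests.Response below are illustrative assumptions, not part of the CDK API.

import json

import requests

from airbyte_cdk import CursorPaginationStrategy

# Build a fake HTTP response purely for illustration.
response = requests.Response()
response.status_code = 200
response._content = json.dumps({"next": "cursor-abc", "records": []}).encode("utf-8")

strategy = CursorPaginationStrategy(
    cursor_value="{{ response.next }}",
    stop_condition="{{ response.next is not defined }}",
    config={},
    parameters={},
)

token = strategy.next_page_token(response, last_page_size=0, last_record=None)
# token == "cursor-abc"; once the API stops returning `next`, the stop
# condition evaluates to True and next_page_token returns None.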

def get_page_size(self) -> Optional[int]:
98    def get_page_size(self) -> Optional[int]:
99        return self.page_size
Returns

page size: The number of records to fetch in a page. Returns None if unspecified

@dataclass
class DatetimeBasedCursor(DeclarativeCursor):
 28@dataclass
 29class DatetimeBasedCursor(DeclarativeCursor):
 30    """
 31    Slices the stream over a datetime range and create a state with format {<cursor_field>: <datetime> }
 32
 33    Given a start time, end time, a step function, and an optional lookback window,
 34    the stream slicer will partition the date range from start time - lookback window to end time.
 35
 36    The step function is defined as a string of the form ISO8601 duration
 37
 38    The timestamp format accepts the same format codes as datetime.strfptime, which are
 39    all the format codes required by the 1989 C standard.
 40    Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html
 41
 42    Attributes:
 43        start_datetime (Union[MinMaxDatetime, str]): the datetime that determines the earliest record that should be synced
 44        end_datetime (Optional[Union[MinMaxDatetime, str]]): the datetime that determines the last record that should be synced
 45        cursor_field (Union[InterpolatedString, str]): record's cursor field
 46        datetime_format (str): format of the datetime
 47        step (Optional[str]): size of the timewindow (ISO8601 duration)
 48        cursor_granularity (Optional[str]): smallest increment the datetime_format has (ISO 8601 duration) that will be used to ensure that the start of a slice does not overlap with the end of the previous one
 49        config (Config): connection config
 50        start_time_option (Optional[RequestOption]): request option for start time
 51        end_time_option (Optional[RequestOption]): request option for end time
 52        partition_field_start (Optional[str]): partition start time field
 53        partition_field_end (Optional[str]): stream slice end time field
 54        lookback_window (Optional[InterpolatedString]): how many days before start_datetime to read data for (ISO8601 duration)
 55    """
 56
 57    start_datetime: Union[MinMaxDatetime, str]
 58    cursor_field: Union[InterpolatedString, str]
 59    datetime_format: str
 60    config: Config
 61    parameters: InitVar[Mapping[str, Any]]
 62    _highest_observed_cursor_field_value: Optional[str] = field(
 63        repr=False, default=None
 64    )  # tracks the latest observed datetime, which may not be safe to emit in the case of out-of-order records
 65    _cursor: Optional[str] = field(
 66        repr=False, default=None
 67    )  # tracks the latest observed datetime that is appropriate to emit as stream state
 68    end_datetime: Optional[Union[MinMaxDatetime, str]] = None
 69    step: Optional[Union[InterpolatedString, str]] = None
 70    cursor_granularity: Optional[str] = None
 71    start_time_option: Optional[RequestOption] = None
 72    end_time_option: Optional[RequestOption] = None
 73    partition_field_start: Optional[str] = None
 74    partition_field_end: Optional[str] = None
 75    lookback_window: Optional[Union[InterpolatedString, str]] = None
 76    message_repository: Optional[MessageRepository] = None
 77    is_compare_strictly: Optional[bool] = False
 78    cursor_datetime_formats: List[str] = field(default_factory=lambda: [])
 79
 80    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 81        if (self.step and not self.cursor_granularity) or (
 82            not self.step and self.cursor_granularity
 83        ):
 84            raise ValueError(
 85                f"If step is defined, cursor_granularity should be as well and vice-versa. "
 86                f"Right now, step is `{self.step}` and cursor_granularity is `{self.cursor_granularity}`"
 87            )
 88        self._start_datetime = MinMaxDatetime.create(self.start_datetime, parameters)
 89        self._end_datetime = (
 90            None if not self.end_datetime else MinMaxDatetime.create(self.end_datetime, parameters)
 91        )
 92
 93        self._timezone = datetime.timezone.utc
 94        self._interpolation = JinjaInterpolation()
 95
 96        self._step = (
 97            self._parse_timedelta(
 98                InterpolatedString.create(self.step, parameters=parameters).eval(self.config)
 99            )
100            if self.step
101            else datetime.timedelta.max
102        )
103        self._cursor_granularity = self._parse_timedelta(self.cursor_granularity)
104        self.cursor_field = InterpolatedString.create(self.cursor_field, parameters=parameters)
105        self._lookback_window = (
106            InterpolatedString.create(self.lookback_window, parameters=parameters)
107            if self.lookback_window
108            else None
109        )
110        self._partition_field_start = InterpolatedString.create(
111            self.partition_field_start or "start_time", parameters=parameters
112        )
113        self._partition_field_end = InterpolatedString.create(
114            self.partition_field_end or "end_time", parameters=parameters
115        )
116        self._parser = DatetimeParser()
117
118        # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
119        if not self._start_datetime.datetime_format:
120            self._start_datetime.datetime_format = self.datetime_format
121        if self._end_datetime and not self._end_datetime.datetime_format:
122            self._end_datetime.datetime_format = self.datetime_format
123
124        if not self.cursor_datetime_formats:
125            self.cursor_datetime_formats = [self.datetime_format]
126
127        _validate_component_request_option_paths(
128            self.config, self.start_time_option, self.end_time_option
129        )
130
131    def get_stream_state(self) -> StreamState:
132        return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {}  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__
133
134    def set_initial_state(self, stream_state: StreamState) -> None:
135        """
136        Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called
137        before calling anything else
138
139        :param stream_state: The state of the stream as returned by get_stream_state
140        """
141        self._cursor = (
142            stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None  # type: ignore [union-attr]
143        )
144
145    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
146        """
147        Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.
148
149        :param stream_slice: The current slice, which may or may not contain the most recently observed record
150        :param record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the
151          stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
152        """
153        record_cursor_value = record.get(self.cursor_field.eval(self.config))  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__
154        # if the current record has no cursor value, we cannot meaningfully update the state based on it, so there is nothing more to do
155        if not record_cursor_value:
156            return
157
158        start_field = self._partition_field_start.eval(self.config)
159        end_field = self._partition_field_end.eval(self.config)
160        is_highest_observed_cursor_value = (
161            not self._highest_observed_cursor_field_value
162            or self.parse_date(record_cursor_value)
163            > self.parse_date(self._highest_observed_cursor_field_value)
164        )
165        if (
166            self._is_within_daterange_boundaries(
167                record,
168                stream_slice.get(start_field),  # type: ignore [arg-type]
169                stream_slice.get(end_field),  # type: ignore [arg-type]
170            )
171            and is_highest_observed_cursor_value
172        ):
173            self._highest_observed_cursor_field_value = record_cursor_value
174
175    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
176        if stream_slice.partition:
177            raise ValueError(
178                f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}."
179            )
180        cursor_value_str_by_cursor_value_datetime = dict(
181            map(
182                # we need to ensure the cursor value is preserved as is in the state else the CATs might complain of something like
183                # 2023-01-04T17:30:19.000Z' <= '2023-01-04T17:30:19.000000Z'
184                lambda datetime_str: (self.parse_date(datetime_str), datetime_str),  # type: ignore # because of the filter on the next line, this will only be called with a str
185                filter(
186                    lambda item: item, [self._cursor, self._highest_observed_cursor_field_value]
187                ),
188            )
189        )
190        self._cursor = (
191            cursor_value_str_by_cursor_value_datetime[
192                max(cursor_value_str_by_cursor_value_datetime.keys())
193            ]
194            if cursor_value_str_by_cursor_value_datetime
195            else None
196        )
197
198    def stream_slices(self) -> Iterable[StreamSlice]:
199        """
200        Partition the daterange into slices of size = step.
201
202        The start of the window is the minimum datetime between start_datetime - lookback_window and the stream_state's datetime
203        The end of the window is the minimum datetime between the start of the window and end_datetime.
204
205        :return:
206        """
207        end_datetime = self.select_best_end_datetime()
208        start_datetime = self._calculate_earliest_possible_value(self.select_best_end_datetime())
209        return self._partition_daterange(start_datetime, end_datetime, self._step)
210
211    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
212        # Datetime based cursors operate over slices made up of datetime ranges. Stream state is based on the progress
213        # through each slice and does not belong to a specific slice. We just return stream state as it is.
214        return self.get_stream_state()
215
216    def _calculate_earliest_possible_value(
217        self, end_datetime: datetime.datetime
218    ) -> datetime.datetime:
219        lookback_delta = self._parse_timedelta(
220            self._lookback_window.eval(self.config) if self._lookback_window else "P0D"
221        )
222        earliest_possible_start_datetime = min(
223            self._start_datetime.get_datetime(self.config), end_datetime
224        )
225        try:
226            cursor_datetime = (
227                self._calculate_cursor_datetime_from_state(self.get_stream_state()) - lookback_delta
228            )
229        except OverflowError:
230            # cursor_datetime defers to the minimum date if it does not exist in the state. Trying to subtract
231            # a timedelta from the minimum datetime results in an OverflowError
232            cursor_datetime = self._calculate_cursor_datetime_from_state(self.get_stream_state())
233        return max(earliest_possible_start_datetime, cursor_datetime)
234
235    def select_best_end_datetime(self) -> datetime.datetime:
236        """
237        Returns the optimal end datetime.
238        This method compares the current datetime with a pre-configured end datetime
239        and returns the earlier of the two. If no pre-configured end datetime is set,
240        the current datetime is returned.
241
242        :return datetime.datetime: The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier.
243        """
244        now = datetime.datetime.now(tz=self._timezone)
245        if not self._end_datetime:
246            return now
247        return min(self._end_datetime.get_datetime(self.config), now)
248
249    def _calculate_cursor_datetime_from_state(
250        self, stream_state: Mapping[str, Any]
251    ) -> datetime.datetime:
252        if self.cursor_field.eval(self.config, stream_state=stream_state) in stream_state:  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__
253            return self.parse_date(stream_state[self.cursor_field.eval(self.config)])  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__
254        return datetime.datetime.min.replace(tzinfo=datetime.timezone.utc)
255
256    def _format_datetime(self, dt: datetime.datetime) -> str:
257        return self._parser.format(dt, self.datetime_format)
258
259    def _partition_daterange(
260        self,
261        start: datetime.datetime,
262        end: datetime.datetime,
263        step: Union[datetime.timedelta, Duration],
264    ) -> List[StreamSlice]:
265        start_field = self._partition_field_start.eval(self.config)
266        end_field = self._partition_field_end.eval(self.config)
267        dates = []
268
269        while self._is_within_date_range(start, end):
270            next_start = self._evaluate_next_start_date_safely(start, step)
271            end_date = self._get_date(next_start - self._cursor_granularity, end, min)
272            dates.append(
273                StreamSlice(
274                    partition={},
275                    cursor_slice={
276                        start_field: self._format_datetime(start),
277                        end_field: self._format_datetime(end_date),
278                    },
279                )
280            )
281            start = next_start
282        return dates
283
284    def _is_within_date_range(self, start: datetime.datetime, end: datetime.datetime) -> bool:
285        if self.is_compare_strictly:
286            return start < end
287        return start <= end
288
289    def _evaluate_next_start_date_safely(
290        self, start: datetime.datetime, step: datetime.timedelta
291    ) -> datetime.datetime:
292        """
293        Given that we set the default step at datetime.timedelta.max, we will generate an OverflowError when evaluating the next start_date
294        This method assumes that users would never enter a step that would generate an overflow. Given that would be the case, the code
295        would have broken anyway.
296        """
297        try:
298            return start + step
299        except OverflowError:
300            return datetime.datetime.max.replace(tzinfo=datetime.timezone.utc)
301
302    def _get_date(
303        self,
304        cursor_value: datetime.datetime,
305        default_date: datetime.datetime,
306        comparator: Callable[[datetime.datetime, datetime.datetime], datetime.datetime],
307    ) -> datetime.datetime:
308        cursor_date = cursor_value or default_date
309        return comparator(cursor_date, default_date)
310
311    def parse_date(self, date: str) -> datetime.datetime:
312        for datetime_format in self.cursor_datetime_formats + [self.datetime_format]:
313            try:
314                return self._parser.parse(date, datetime_format)
315            except ValueError:
316                pass
317        raise ValueError(f"No format in {self.cursor_datetime_formats} matching {date}")
318
319    @classmethod
320    def _parse_timedelta(cls, time_str: Optional[str]) -> Union[datetime.timedelta, Duration]:
321        """
322        :return Parses an ISO 8601 durations into datetime.timedelta or Duration objects.
323        """
324        if not time_str:
325            return datetime.timedelta(0)
326        return parse_duration(time_str)
327
328    def get_request_params(
329        self,
330        *,
331        stream_state: Optional[StreamState] = None,
332        stream_slice: Optional[StreamSlice] = None,
333        next_page_token: Optional[Mapping[str, Any]] = None,
334    ) -> Mapping[str, Any]:
335        return self._get_request_options(RequestOptionType.request_parameter, stream_slice)
336
337    def get_request_headers(
338        self,
339        *,
340        stream_state: Optional[StreamState] = None,
341        stream_slice: Optional[StreamSlice] = None,
342        next_page_token: Optional[Mapping[str, Any]] = None,
343    ) -> Mapping[str, Any]:
344        return self._get_request_options(RequestOptionType.header, stream_slice)
345
346    def get_request_body_data(
347        self,
348        *,
349        stream_state: Optional[StreamState] = None,
350        stream_slice: Optional[StreamSlice] = None,
351        next_page_token: Optional[Mapping[str, Any]] = None,
352    ) -> Mapping[str, Any]:
353        return self._get_request_options(RequestOptionType.body_data, stream_slice)
354
355    def get_request_body_json(
356        self,
357        *,
358        stream_state: Optional[StreamState] = None,
359        stream_slice: Optional[StreamSlice] = None,
360        next_page_token: Optional[Mapping[str, Any]] = None,
361    ) -> Mapping[str, Any]:
362        return self._get_request_options(RequestOptionType.body_json, stream_slice)
363
364    def request_kwargs(self) -> Mapping[str, Any]:
365        # Never update kwargs
366        return {}
367
368    def _get_request_options(
369        self, option_type: RequestOptionType, stream_slice: Optional[StreamSlice]
370    ) -> Mapping[str, Any]:
371        options: MutableMapping[str, Any] = {}
372        if not stream_slice:
373            return options
374
375        if self.start_time_option and self.start_time_option.inject_into == option_type:
376            start_time_value = stream_slice.get(self._partition_field_start.eval(self.config))
377            self.start_time_option.inject_into_request(options, start_time_value, self.config)
378
379        if self.end_time_option and self.end_time_option.inject_into == option_type:
380            end_time_value = stream_slice.get(self._partition_field_end.eval(self.config))
381            self.end_time_option.inject_into_request(options, end_time_value, self.config)
382
383        return options
384
385    def should_be_synced(self, record: Record) -> bool:
386        cursor_field = self.cursor_field.eval(self.config)  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__
387        record_cursor_value = record.get(cursor_field)
388        if not record_cursor_value:
389            self._send_log(
390                Level.WARN,
391                f"Could not find cursor field `{cursor_field}` in record. The incremental sync will assume it needs to be synced",
392            )
393            return True
394        latest_possible_cursor_value = self.select_best_end_datetime()
395        earliest_possible_cursor_value = self._calculate_earliest_possible_value(
396            latest_possible_cursor_value
397        )
398        return self._is_within_daterange_boundaries(
399            record, earliest_possible_cursor_value, latest_possible_cursor_value
400        )
401
402    def _is_within_daterange_boundaries(
403        self,
404        record: Record,
405        start_datetime_boundary: Union[datetime.datetime, str],
406        end_datetime_boundary: Union[datetime.datetime, str],
407    ) -> bool:
408        cursor_field = self.cursor_field.eval(self.config)  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__
409        record_cursor_value = record.get(cursor_field)
410        if not record_cursor_value:
411            self._send_log(
412                Level.WARN,
413                f"Could not find cursor field `{cursor_field}` in record. The record will not be considered when emitting sync state",
414            )
415            return False
416        if isinstance(start_datetime_boundary, str):
417            start_datetime_boundary = self.parse_date(start_datetime_boundary)
418        if isinstance(end_datetime_boundary, str):
419            end_datetime_boundary = self.parse_date(end_datetime_boundary)
420        return (
421            start_datetime_boundary <= self.parse_date(record_cursor_value) <= end_datetime_boundary
422        )
423
424    def _send_log(self, level: Level, message: str) -> None:
425        if self.message_repository:
426            self.message_repository.emit_message(
427                AirbyteMessage(
428                    type=Type.LOG,
429                    log=AirbyteLogMessage(level=level, message=message),
430                )
431            )
432
433    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
434        cursor_field = self.cursor_field.eval(self.config)  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__
435        first_cursor_value = first.get(cursor_field)
436        second_cursor_value = second.get(cursor_field)
437        if first_cursor_value and second_cursor_value:
438            return self.parse_date(first_cursor_value) >= self.parse_date(second_cursor_value)
439        elif first_cursor_value:
440            return True
441        else:
442            return False
443
444    def set_runtime_lookback_window(self, lookback_window_in_seconds: int) -> None:
445        """
446        Updates the lookback window based on a given number of seconds if the new duration
447        is greater than the currently configured lookback window.
448
449        :param lookback_window_in_seconds: The lookback duration in seconds to potentially update to.
450        """
451        runtime_lookback_window = duration_isoformat(timedelta(seconds=lookback_window_in_seconds))
452        config_lookback = parse_duration(
453            self._lookback_window.eval(self.config) if self._lookback_window else "P0D"
454        )
455
456        # Check if the new runtime lookback window is greater than the current config lookback
457        if parse_duration(runtime_lookback_window) > config_lookback:
458            self._lookback_window = InterpolatedString.create(
459                runtime_lookback_window, parameters={}
460            )

Slices the stream over a datetime range and creates a state with format {<cursor_field>: <datetime>}.

Given a start time, end time, a step function, and an optional lookback window, the stream slicer will partition the date range from start time - lookback window to end time.

The step function is defined as a string of the form ISO8601 duration

The timestamp format accepts the same format codes as datetime.strptime, which are all the format codes required by the 1989 C standard. Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html

Attributes:
  • start_datetime (Union[MinMaxDatetime, str]): the datetime that determines the earliest record that should be synced
  • end_datetime (Optional[Union[MinMaxDatetime, str]]): the datetime that determines the last record that should be synced
  • cursor_field (Union[InterpolatedString, str]): record's cursor field
  • datetime_format (str): format of the datetime
  • step (Optional[str]): size of the timewindow (ISO8601 duration)
  • cursor_granularity (Optional[str]): smallest increment the datetime_format has (ISO 8601 duration) that will be used to ensure that the start of a slice does not overlap with the end of the previous one
  • config (Config): connection config
  • start_time_option (Optional[RequestOption]): request option for start time
  • end_time_option (Optional[RequestOption]): request option for end time
  • partition_field_start (Optional[str]): partition start time field
  • partition_field_end (Optional[str]): stream slice end time field
  • lookback_window (Optional[InterpolatedString]): how many days before start_datetime to read data for (ISO8601 duration)
DatetimeBasedCursor( start_datetime: Union[MinMaxDatetime, str], cursor_field: Union[InterpolatedString, str], datetime_format: str, config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], _highest_observed_cursor_field_value: Optional[str] = None, _cursor: Optional[str] = None, end_datetime: Union[MinMaxDatetime, str, NoneType] = None, step: Union[InterpolatedString, str, NoneType] = None, cursor_granularity: Optional[str] = None, start_time_option: Optional[RequestOption] = None, end_time_option: Optional[RequestOption] = None, partition_field_start: Optional[str] = None, partition_field_end: Optional[str] = None, lookback_window: Union[InterpolatedString, str, NoneType] = None, message_repository: Optional[MessageRepository] = None, is_compare_strictly: Optional[bool] = False, cursor_datetime_formats: List[str] = <factory>)
start_datetime: Union[MinMaxDatetime, str]
cursor_field: Union[InterpolatedString, str]
datetime_format: str
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
end_datetime: Union[MinMaxDatetime, str, NoneType] = None
step: Union[InterpolatedString, str, NoneType] = None
cursor_granularity: Optional[str] = None
start_time_option: Optional[RequestOption] = None
end_time_option: Optional[RequestOption] = None
partition_field_start: Optional[str] = None
partition_field_end: Optional[str] = None
lookback_window: Union[InterpolatedString, str, NoneType] = None
message_repository: Optional[MessageRepository] = None
is_compare_strictly: Optional[bool] = False
cursor_datetime_formats: List[str]
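
The snippet below is a minimal, illustrative sketch of building the cursor described above directly in Python (connectors normally declare it in the YAML manifest). The import path and the default partition field names ("start_time"/"end_time") are assumptions based on recent CDK versions; later sketches on this page reuse this cursor object.

from airbyte_cdk.sources.declarative.incremental import DatetimeBasedCursor

config = {"start_date": "2024-01-01T00:00:00Z"}

cursor = DatetimeBasedCursor(
    start_datetime="{{ config['start_date'] }}",
    end_datetime="2024-01-04T00:00:00Z",
    cursor_field="updated_at",
    datetime_format="%Y-%m-%dT%H:%M:%SZ",
    step="P1D",
    cursor_granularity="PT1S",
    config=config,
    parameters={},
)

# Expected to yield roughly one slice per day, keyed by the default partition fields,
# e.g. {"start_time": "2024-01-01T00:00:00Z", "end_time": "2024-01-01T23:59:59Z"}.
for stream_slice in cursor.stream_slices():
    print(stream_slice)
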
def get_stream_state(self) -> Mapping[str, Any]:
131    def get_stream_state(self) -> StreamState:
132        return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {}  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__

Returns the current stream state. We would like to restrict its usage since it exposes the internals of the state. As of 2023-06-14, it is used for three things:

  • Interpolation of the requests
  • Transformation of records
  • Saving the state

For the first case, we are probably stuck with exposing the stream state. For the others, we can probably expose a method that allows for emitting the state to the platform.

def set_initial_state(self, stream_state: Mapping[str, Any]) -> None:
134    def set_initial_state(self, stream_state: StreamState) -> None:
135        """
136        Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called
137        before calling anything else
138
139        :param stream_state: The state of the stream as returned by get_stream_state
140        """
141        self._cursor = (
142            stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None  # type: ignore [union-attr]
143        )

Cursors are not initialized with their state. Because state is needed for the cursor to function properly, this method should be called before anything else.

Parameters
  • stream_state: The state of the stream as returned by get_stream_state
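
A sketch of the expected lifecycle, reusing the cursor built above: restore state before slicing so the read resumes from the stored cursor value.

previous_state = {"updated_at": "2024-01-02T00:00:00Z"}
cursor.set_initial_state(previous_state)

# get_stream_state() now reflects the restored value, and stream_slices() is
# expected to start the date range from it rather than from start_datetime.
print(cursor.get_stream_state())  # {"updated_at": "2024-01-02T00:00:00Z"}
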
def observe( self, stream_slice: StreamSlice, record: Record) -> None:
145    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
146        """
147        Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.
148
149        :param stream_slice: The current slice, which may or may not contain the most recently observed record
150        :param record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the
151          stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
152        """
153        record_cursor_value = record.get(self.cursor_field.eval(self.config))  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__
154        # if the current record has no cursor value, we cannot meaningfully update the state based on it, so there is nothing more to do
155        if not record_cursor_value:
156            return
157
158        start_field = self._partition_field_start.eval(self.config)
159        end_field = self._partition_field_end.eval(self.config)
160        is_highest_observed_cursor_value = (
161            not self._highest_observed_cursor_field_value
162            or self.parse_date(record_cursor_value)
163            > self.parse_date(self._highest_observed_cursor_field_value)
164        )
165        if (
166            self._is_within_daterange_boundaries(
167                record,
168                stream_slice.get(start_field),  # type: ignore [arg-type]
169                stream_slice.get(end_field),  # type: ignore [arg-type]
170            )
171            and is_highest_observed_cursor_value
172        ):
173            self._highest_observed_cursor_field_value = record_cursor_value

Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.

Parameters
  • stream_slice: The current slice, which may or may not contain the most recently observed record
  • record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
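
Continuing the sketch above; the Record type and its (data, stream_name, associated_slice) constructor are assumed from airbyte_cdk.sources.types in recent CDK versions.

from airbyte_cdk.sources.types import Record

current_slice = next(iter(cursor.stream_slices()))
record = Record(
    data={"id": 42, "updated_at": "2024-01-02T10:00:00Z"},
    stream_name="items",
    associated_slice=current_slice,
)
cursor.observe(current_slice, record)
# The externally visible state is not updated until close_slice() is called.
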
def close_slice( self, stream_slice: StreamSlice, *args: Any) -> None:
175    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
176        if stream_slice.partition:
177            raise ValueError(
178                f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}."
179            )
180        cursor_value_str_by_cursor_value_datetime = dict(
181            map(
182                # we need to ensure the cursor value is preserved as is in the state else the CATs might complain of something like
183                # 2023-01-04T17:30:19.000Z' <= '2023-01-04T17:30:19.000000Z'
184                lambda datetime_str: (self.parse_date(datetime_str), datetime_str),  # type: ignore # because of the filter on the next line, this will only be called with a str
185                filter(
186                    lambda item: item, [self._cursor, self._highest_observed_cursor_field_value]
187                ),
188            )
189        )
190        self._cursor = (
191            cursor_value_str_by_cursor_value_datetime[
192                max(cursor_value_str_by_cursor_value_datetime.keys())
193            ]
194            if cursor_value_str_by_cursor_value_datetime
195            else None
196        )

Update state based on the stream slice. Note that stream_slice.cursor_slice and most_recent_record.associated_slice are expected to be the same but we make it explicit here that stream_slice should be leveraged to update the state. We do not pass in the latest record, since cursor instances should maintain the relevant internal state on their own.

Parameters
  • stream_slice: slice to close
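
Continuing the same sketch: closing the slice promotes the highest cursor value observed within the slice's boundaries into the cursor state.

cursor.close_slice(current_slice)
print(cursor.get_stream_state())  # e.g. {"updated_at": "2024-01-02T10:00:00Z"}
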
def stream_slices(self) -> Iterable[StreamSlice]:
198    def stream_slices(self) -> Iterable[StreamSlice]:
199        """
200        Partition the daterange into slices of size = step.
201
202        The start of the window is the minimum datetime between start_datetime - lookback_window and the stream_state's datetime
203        The end of the window is the minimum datetime between the start of the window and end_datetime.
204
205        :return:
206        """
207        end_datetime = self.select_best_end_datetime()
208        start_datetime = self._calculate_earliest_possible_value(self.select_best_end_datetime())
209        return self._partition_daterange(start_datetime, end_datetime, self._step)

Partition the daterange into slices of size = step.

The start of the window is the minimum datetime between start_datetime - lookback_window and the stream_state's datetime. The end of the window is the minimum datetime between the start of the window and end_datetime.

Returns

An iterable of stream slices partitioning the date range.
def select_state( self, stream_slice: Optional[StreamSlice] = None) -> Optional[Mapping[str, Any]]:
211    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
212        # Datetime based cursors operate over slices made up of datetime ranges. Stream state is based on the progress
213        # through each slice and does not belong to a specific slice. We just return stream state as it is.
214        return self.get_stream_state()

Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.

def select_best_end_datetime(self) -> datetime.datetime:
235    def select_best_end_datetime(self) -> datetime.datetime:
236        """
237        Returns the optimal end datetime.
238        This method compares the current datetime with a pre-configured end datetime
239        and returns the earlier of the two. If no pre-configured end datetime is set,
240        the current datetime is returned.
241
242        :return datetime.datetime: The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier.
243        """
244        now = datetime.datetime.now(tz=self._timezone)
245        if not self._end_datetime:
246            return now
247        return min(self._end_datetime.get_datetime(self.config), now)

Returns the optimal end datetime. This method compares the current datetime with a pre-configured end datetime and returns the earlier of the two. If no pre-configured end datetime is set, the current datetime is returned.

Returns

The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier.

def parse_date(self, date: str) -> datetime.datetime:
311    def parse_date(self, date: str) -> datetime.datetime:
312        for datetime_format in self.cursor_datetime_formats + [self.datetime_format]:
313            try:
314                return self._parser.parse(date, datetime_format)
315            except ValueError:
316                pass
317        raise ValueError(f"No format in {self.cursor_datetime_formats} matching {date}")
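
An illustrative sketch of the format fallback (reusing the DatetimeBasedCursor import from the construction sketch above): cursor_datetime_formats are tried in order before datetime_format, so a source that returns mixed formats can still be parsed.

multi_format_cursor = DatetimeBasedCursor(
    start_datetime="2024-01-01T00:00:00Z",
    cursor_field="updated_at",
    datetime_format="%Y-%m-%dT%H:%M:%SZ",
    cursor_datetime_formats=["%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"],
    config={},
    parameters={},
)
print(multi_format_cursor.parse_date("2024-01-05"))            # matched by %Y-%m-%d
print(multi_format_cursor.parse_date("2024-01-05T06:00:00Z"))  # matched by the full format
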
def get_request_params( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
328    def get_request_params(
329        self,
330        *,
331        stream_state: Optional[StreamState] = None,
332        stream_slice: Optional[StreamSlice] = None,
333        next_page_token: Optional[Mapping[str, Any]] = None,
334    ) -> Mapping[str, Any]:
335        return self._get_request_options(RequestOptionType.request_parameter, stream_slice)

Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.

E.g., you might want to define query parameters for paging if next_page_token is not None.
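
A sketch of injecting slice boundaries as query parameters via start_time_option/end_time_option; RequestOption and RequestOptionType are assumed to live in airbyte_cdk.sources.declarative.requesters.request_option, and the printed values are only indicative.

from airbyte_cdk.sources.declarative.requesters.request_option import (
    RequestOption,
    RequestOptionType,
)

cursor_with_options = DatetimeBasedCursor(
    start_datetime="2024-01-01T00:00:00Z",
    end_datetime="2024-01-03T00:00:00Z",
    cursor_field="updated_at",
    datetime_format="%Y-%m-%dT%H:%M:%SZ",
    step="P1D",
    cursor_granularity="PT1S",
    start_time_option=RequestOption(
        field_name="updated_after", inject_into=RequestOptionType.request_parameter, parameters={}
    ),
    end_time_option=RequestOption(
        field_name="updated_before", inject_into=RequestOptionType.request_parameter, parameters={}
    ),
    config={},
    parameters={},
)

first_slice = next(iter(cursor_with_options.stream_slices()))
print(cursor_with_options.get_request_params(stream_slice=first_slice))
# e.g. {"updated_after": "2024-01-01T00:00:00Z", "updated_before": "2024-01-01T23:59:59Z"}
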

def get_request_headers( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
337    def get_request_headers(
338        self,
339        *,
340        stream_state: Optional[StreamState] = None,
341        stream_slice: Optional[StreamSlice] = None,
342        next_page_token: Optional[Mapping[str, Any]] = None,
343    ) -> Mapping[str, Any]:
344        return self._get_request_options(RequestOptionType.header, stream_slice)

Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.

def get_request_body_data( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
346    def get_request_body_data(
347        self,
348        *,
349        stream_state: Optional[StreamState] = None,
350        stream_slice: Optional[StreamSlice] = None,
351        next_page_token: Optional[Mapping[str, Any]] = None,
352    ) -> Mapping[str, Any]:
353        return self._get_request_options(RequestOptionType.body_data, stream_slice)

Specifies how to populate the body of the request with a non-JSON payload.

If it returns plain text, the text will be sent as-is. If it returns a dict, it will be converted to a URL-encoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".

Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at a time.

def get_request_body_json( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
355    def get_request_body_json(
356        self,
357        *,
358        stream_state: Optional[StreamState] = None,
359        stream_slice: Optional[StreamSlice] = None,
360        next_page_token: Optional[Mapping[str, Any]] = None,
361    ) -> Mapping[str, Any]:
362        return self._get_request_options(RequestOptionType.body_json, stream_slice)

Specifies how to populate the body of the request with a JSON payload.

Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at a time.

def request_kwargs(self) -> Mapping[str, Any]:
364    def request_kwargs(self) -> Mapping[str, Any]:
365        # Never update kwargs
366        return {}
def should_be_synced(self, record: Record) -> bool:
385    def should_be_synced(self, record: Record) -> bool:
386        cursor_field = self.cursor_field.eval(self.config)  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__
387        record_cursor_value = record.get(cursor_field)
388        if not record_cursor_value:
389            self._send_log(
390                Level.WARN,
391                f"Could not find cursor field `{cursor_field}` in record. The incremental sync will assume it needs to be synced",
392            )
393            return True
394        latest_possible_cursor_value = self.select_best_end_datetime()
395        earliest_possible_cursor_value = self._calculate_earliest_possible_value(
396            latest_possible_cursor_value
397        )
398        return self._is_within_daterange_boundaries(
399            record, earliest_possible_cursor_value, latest_possible_cursor_value
400        )

Evaluating whether a record should be synced allows for record filtering and for stop conditions on pagination.
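
A sketch using the cursor and the Record import from the earlier examples: records outside the [earliest possible value, best end datetime] window are filtered out, while records missing the cursor field are kept (with a warning).

in_window = Record(data={"id": 1, "updated_at": "2024-01-03T00:00:00Z"}, stream_name="items")
too_old = Record(data={"id": 2, "updated_at": "2019-01-01T00:00:00Z"}, stream_name="items")
print(cursor.should_be_synced(in_window))  # expected: True
print(cursor.should_be_synced(too_old))    # expected: False
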

def is_greater_than_or_equal( self, first: Record, second: Record) -> bool:
433    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
434        cursor_field = self.cursor_field.eval(self.config)  # type: ignore  # cursor_field is converted to an InterpolatedString in __post_init__
435        first_cursor_value = first.get(cursor_field)
436        second_cursor_value = second.get(cursor_field)
437        if first_cursor_value and second_cursor_value:
438            return self.parse_date(first_cursor_value) >= self.parse_date(second_cursor_value)
439        elif first_cursor_value:
440            return True
441        else:
442            return False

Evaluates which record is greater in terms of its cursor value. This is used to avoid having to capture all the records in order to close a slice.
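
A small sketch comparing two records by their cursor value, again reusing the Record import from the observe() sketch:

newer = Record(data={"updated_at": "2024-01-03T00:00:00Z"}, stream_name="items")
older = Record(data={"updated_at": "2024-01-02T00:00:00Z"}, stream_name="items")
print(cursor.is_greater_than_or_equal(newer, older))  # expected: True
print(cursor.is_greater_than_or_equal(older, newer))  # expected: False
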

def set_runtime_lookback_window(self, lookback_window_in_seconds: int) -> None:
444    def set_runtime_lookback_window(self, lookback_window_in_seconds: int) -> None:
445        """
446        Updates the lookback window based on a given number of seconds if the new duration
447        is greater than the currently configured lookback window.
448
449        :param lookback_window_in_seconds: The lookback duration in seconds to potentially update to.
450        """
451        runtime_lookback_window = duration_isoformat(timedelta(seconds=lookback_window_in_seconds))
452        config_lookback = parse_duration(
453            self._lookback_window.eval(self.config) if self._lookback_window else "P0D"
454        )
455
456        # Check if the new runtime lookback window is greater than the current config lookback
457        if parse_duration(runtime_lookback_window) > config_lookback:
458            self._lookback_window = InterpolatedString.create(
459                runtime_lookback_window, parameters={}
460            )

Updates the lookback window based on a given number of seconds if the new duration is greater than the currently configured lookback window.

Parameters
  • lookback_window_in_seconds: The lookback duration in seconds to potentially update to.
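
A sketch of widening the lookback window at runtime; the call is expected to be a no-op if the configured lookback is already larger.

cursor.set_runtime_lookback_window(3 * 24 * 3600)  # three days
# Subsequent stream_slices() calls are expected to start the window up to three days earlier.
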
@dataclass
class DeclarativeAuthenticator(airbyte_cdk.AbstractHeaderAuthenticator):
14@dataclass
15class DeclarativeAuthenticator(AbstractHeaderAuthenticator):
16    """
17    Interface used to associate which authenticators can be used as part of the declarative framework
18    """
19
20    def get_request_params(self) -> Mapping[str, Any]:
21        """HTTP request parameter to add to the requests"""
22        return {}
23
24    def get_request_body_data(self) -> Union[Mapping[str, Any], str]:
25        """Form-encoded body data to set on the requests"""
26        return {}
27
28    def get_request_body_json(self) -> Mapping[str, Any]:
29        """JSON-encoded body data to set on the requests"""
30        return {}

Interface used to associate which authenticators can be used as part of the declarative framework

def get_request_params(self) -> Mapping[str, Any]:
20    def get_request_params(self) -> Mapping[str, Any]:
21        """HTTP request parameter to add to the requests"""
22        return {}

HTTP request parameter to add to the requests

def get_request_body_data(self) -> Union[Mapping[str, Any], str]:
24    def get_request_body_data(self) -> Union[Mapping[str, Any], str]:
25        """Form-encoded body data to set on the requests"""
26        return {}

Form-encoded body data to set on the requests

def get_request_body_json(self) -> Mapping[str, Any]:
28    def get_request_body_json(self) -> Mapping[str, Any]:
29        """JSON-encoded body data to set on the requests"""
30        return {}

JSON-encoded body data to set on the requests
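
A minimal sketch of a custom declarative authenticator. StaticTokenAuthenticator is a hypothetical class; it relies on AbstractHeaderAuthenticator requiring only the auth_header and token properties, which is an assumption based on recent CDK versions.

from dataclasses import dataclass

from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator


@dataclass
class StaticTokenAuthenticator(DeclarativeAuthenticator):
    """Hypothetical authenticator that always sends a fixed bearer token."""

    api_token: str

    @property
    def auth_header(self) -> str:
        return "Authorization"

    @property
    def token(self) -> str:
        return f"Bearer {self.api_token}"


auth = StaticTokenAuthenticator(api_token="my-secret-token")
print(auth.get_auth_header())  # {"Authorization": "Bearer my-secret-token"}
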

 24@dataclass
 25class DeclarativeOauth2Authenticator(AbstractOauth2Authenticator, DeclarativeAuthenticator):
 26    """
 27    Generates OAuth2.0 access tokens from an OAuth2.0 refresh token and client credentials based on
 28    a declarative connector configuration file. Credentials can be defined explicitly or via interpolation
 29    at runtime. The generated access token is attached to each request via the Authorization header.
 30
 31    Attributes:
 32        token_refresh_endpoint (Union[InterpolatedString, str]): The endpoint to refresh the access token
 33        client_id (Union[InterpolatedString, str]): The client id
 34        client_secret (Union[InterpolatedString, str]): Client secret
 35        refresh_token (Union[InterpolatedString, str]): The token used to refresh the access token
 36        access_token_name (Union[InterpolatedString, str]): THe field to extract access token from in the response
 37        expires_in_name (Union[InterpolatedString, str]): The field to extract expires_in from in the response
 38        config (Mapping[str, Any]): The user-provided configuration as specified by the source's spec
 39        scopes (Optional[List[str]]): The scopes to request
 40        token_expiry_date (Optional[Union[InterpolatedString, str]]): The access token expiration date
 41        token_expiry_date_format str: format of the datetime; provide it if expires_in is returned in datetime instead of seconds
 42        token_expiry_is_time_of_expiration bool: set True it if expires_in is returned as time of expiration instead of the number seconds until expiration
 43        refresh_request_body (Optional[Mapping[str, Any]]): The request body to send in the refresh request
 44        refresh_request_headers (Optional[Mapping[str, Any]]): The request headers to send in the refresh request
 45        grant_type: The grant_type to request for access_token. If set to refresh_token, the refresh_token parameter has to be provided
 46        message_repository (MessageRepository): the message repository used to emit logs on HTTP requests
 47    """
 48
 49    config: Mapping[str, Any]
 50    parameters: InitVar[Mapping[str, Any]]
 51    client_id: Optional[Union[InterpolatedString, str]] = None
 52    client_secret: Optional[Union[InterpolatedString, str]] = None
 53    token_refresh_endpoint: Optional[Union[InterpolatedString, str]] = None
 54    refresh_token: Optional[Union[InterpolatedString, str]] = None
 55    scopes: Optional[List[str]] = None
 56    token_expiry_date: Optional[Union[InterpolatedString, str]] = None
 57    _token_expiry_date: Optional[AirbyteDateTime] = field(init=False, repr=False, default=None)
 58    token_expiry_date_format: Optional[str] = None
 59    token_expiry_is_time_of_expiration: bool = False
 60    access_token_name: Union[InterpolatedString, str] = "access_token"
 61    access_token_value: Optional[Union[InterpolatedString, str]] = None
 62    client_id_name: Union[InterpolatedString, str] = "client_id"
 63    client_secret_name: Union[InterpolatedString, str] = "client_secret"
 64    expires_in_name: Union[InterpolatedString, str] = "expires_in"
 65    refresh_token_name: Union[InterpolatedString, str] = "refresh_token"
 66    refresh_request_body: Optional[Mapping[str, Any]] = None
 67    refresh_request_headers: Optional[Mapping[str, Any]] = None
 68    grant_type_name: Union[InterpolatedString, str] = "grant_type"
 69    grant_type: Union[InterpolatedString, str] = "refresh_token"
 70    message_repository: MessageRepository = NoopMessageRepository()
 71    profile_assertion: Optional[DeclarativeAuthenticator] = None
 72    use_profile_assertion: Optional[Union[InterpolatedBoolean, str, bool]] = False
 73
 74    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 75        super().__init__()
 76        if self.token_refresh_endpoint is not None:
 77            self._token_refresh_endpoint: Optional[InterpolatedString] = InterpolatedString.create(
 78                self.token_refresh_endpoint, parameters=parameters
 79            )
 80        else:
 81            self._token_refresh_endpoint = None
 82        self._client_id_name = InterpolatedString.create(self.client_id_name, parameters=parameters)
 83        self._client_id = (
 84            InterpolatedString.create(self.client_id, parameters=parameters)
 85            if self.client_id
 86            else self.client_id
 87        )
 88        self._client_secret_name = InterpolatedString.create(
 89            self.client_secret_name, parameters=parameters
 90        )
 91        self._client_secret = (
 92            InterpolatedString.create(self.client_secret, parameters=parameters)
 93            if self.client_secret
 94            else self.client_secret
 95        )
 96        self._refresh_token_name = InterpolatedString.create(
 97            self.refresh_token_name, parameters=parameters
 98        )
 99        if self.refresh_token is not None:
100            self._refresh_token: Optional[InterpolatedString] = InterpolatedString.create(
101                self.refresh_token, parameters=parameters
102            )
103        else:
104            self._refresh_token = None
105        self.access_token_name = InterpolatedString.create(
106            self.access_token_name, parameters=parameters
107        )
108        self.expires_in_name = InterpolatedString.create(
109            self.expires_in_name, parameters=parameters
110        )
111        self.grant_type_name = InterpolatedString.create(
112            self.grant_type_name, parameters=parameters
113        )
114        self.grant_type = InterpolatedString.create(
115            "urn:ietf:params:oauth:grant-type:jwt-bearer"
116            if self.use_profile_assertion
117            else self.grant_type,
118            parameters=parameters,
119        )
120        self._refresh_request_body = InterpolatedMapping(
121            self.refresh_request_body or {}, parameters=parameters
122        )
123        self._refresh_request_headers = InterpolatedMapping(
124            self.refresh_request_headers or {}, parameters=parameters
125        )
126        try:
127            if (
128                isinstance(self.token_expiry_date, (int, str))
129                and str(self.token_expiry_date).isdigit()
130            ):
131                self._token_expiry_date = ab_datetime_parse(self.token_expiry_date)
132            else:
133                self._token_expiry_date = (
134                    ab_datetime_parse(
135                        InterpolatedString.create(
136                            self.token_expiry_date, parameters=parameters
137                        ).eval(self.config)
138                    )
139                    if self.token_expiry_date
140                    else ab_datetime_now() - timedelta(days=1)
141                )
142        except ValueError as e:
143            raise ValueError(f"Invalid token expiry date format: {e}")
144        self.use_profile_assertion = (
145            InterpolatedBoolean(self.use_profile_assertion, parameters=parameters)
146            if isinstance(self.use_profile_assertion, str)
147            else self.use_profile_assertion
148        )
149        self.assertion_name = "assertion"
150
151        if self.access_token_value is not None:
152            self._access_token_value = InterpolatedString.create(
153                self.access_token_value, parameters=parameters
154            ).eval(self.config)
155        else:
156            self._access_token_value = None
157
158        self._access_token: Optional[str] = (
159            self._access_token_value if self.access_token_value else None
160        )
161
162        if not self.use_profile_assertion and any(
163            client_creds is None for client_creds in [self.client_id, self.client_secret]
164        ):
165            raise ValueError(
166                "OAuthAuthenticator configuration error: Both 'client_id' and 'client_secret' are required for the "
167                "basic OAuth flow."
168            )
169        if self.profile_assertion is None and self.use_profile_assertion:
170            raise ValueError(
171                "OAuthAuthenticator configuration error: 'profile_assertion' is required when using the profile assertion flow."
172            )
173        if self.get_grant_type() == "refresh_token" and self._refresh_token is None:
174            raise ValueError(
175                "OAuthAuthenticator configuration error: A 'refresh_token' is required when the 'grant_type' is set to 'refresh_token'."
176            )
177
178    def get_token_refresh_endpoint(self) -> Optional[str]:
179        if self._token_refresh_endpoint is not None:
180            refresh_token_endpoint: str = self._token_refresh_endpoint.eval(self.config)
181            if not refresh_token_endpoint:
182                raise ValueError(
183                    "OAuthAuthenticator was unable to evaluate token_refresh_endpoint parameter"
184                )
185            return refresh_token_endpoint
186        return None
187
188    def get_client_id_name(self) -> str:
189        return self._client_id_name.eval(self.config)  # type: ignore # eval returns a string in this context
190
191    def get_client_id(self) -> str:
192        client_id = self._client_id.eval(self.config) if self._client_id else self._client_id
193        if not client_id:
194            raise ValueError("OAuthAuthenticator was unable to evaluate client_id parameter")
195        return client_id  # type: ignore # value will be returned as a string, or an error will be raised
196
197    def get_client_secret_name(self) -> str:
198        return self._client_secret_name.eval(self.config)  # type: ignore # eval returns a string in this context
199
200    def get_client_secret(self) -> str:
201        client_secret = (
202            self._client_secret.eval(self.config) if self._client_secret else self._client_secret
203        )
204        if not client_secret:
205            raise ValueError("OAuthAuthenticator was unable to evaluate client_secret parameter")
206        return client_secret  # type: ignore # value will be returned as a string, or an error will be raised
207
208    def get_refresh_token_name(self) -> str:
209        return self._refresh_token_name.eval(self.config)  # type: ignore # eval returns a string in this context
210
211    def get_refresh_token(self) -> Optional[str]:
212        return None if self._refresh_token is None else str(self._refresh_token.eval(self.config))
213
214    def get_scopes(self) -> List[str]:
215        return self.scopes or []
216
217    def get_access_token_name(self) -> str:
218        return self.access_token_name.eval(self.config)  # type: ignore # eval returns a string in this context
219
220    def get_expires_in_name(self) -> str:
221        return self.expires_in_name.eval(self.config)  # type: ignore # eval returns a string in this context
222
223    def get_grant_type_name(self) -> str:
224        return self.grant_type_name.eval(self.config)  # type: ignore # eval returns a string in this context
225
226    def get_grant_type(self) -> str:
227        return self.grant_type.eval(self.config)  # type: ignore # eval returns a string in this context
228
229    def get_refresh_request_body(self) -> Mapping[str, Any]:
230        return self._refresh_request_body.eval(self.config)
231
232    def get_refresh_request_headers(self) -> Mapping[str, Any]:
233        return self._refresh_request_headers.eval(self.config)
234
235    def get_token_expiry_date(self) -> AirbyteDateTime:
236        if not self._has_access_token_been_initialized():
237            return AirbyteDateTime.from_datetime(datetime.min)
238        return self._token_expiry_date  # type: ignore # _token_expiry_date is an AirbyteDateTime. It is never None despite what mypy thinks
239
240    def _has_access_token_been_initialized(self) -> bool:
241        return self._access_token is not None
242
243    def set_token_expiry_date(self, value: Union[str, int]) -> None:
244        self._token_expiry_date = self._parse_token_expiration_date(value)
245
246    def get_assertion_name(self) -> str:
247        return self.assertion_name
248
249    def get_assertion(self) -> str:
250        if self.profile_assertion is None:
251            raise ValueError("profile_assertion is not set")
252        return self.profile_assertion.token
253
254    def build_refresh_request_body(self) -> Mapping[str, Any]:
255        """
256        Returns the request body to set on the refresh request
257
258        Override to define additional parameters
259        """
260        if self.use_profile_assertion:
261            return {
262                self.get_grant_type_name(): self.get_grant_type(),
263                self.get_assertion_name(): self.get_assertion(),
264            }
265        return super().build_refresh_request_body()
266
267    @property
268    def access_token(self) -> str:
269        if self._access_token is None:
270            raise ValueError("access_token is not set")
271        return self._access_token
272
273    @access_token.setter
274    def access_token(self, value: str) -> None:
275        self._access_token = value
276
277    @property
278    def _message_repository(self) -> MessageRepository:
279        """
280        Overriding AbstractOauth2Authenticator._message_repository to allow for HTTP request logs
281        """
282        return self.message_repository

Generates OAuth2.0 access tokens from an OAuth2.0 refresh token and client credentials based on a declarative connector configuration file. Credentials can be defined explicitly or via interpolation at runtime. The generated access token is attached to each request via the Authorization header.

Attributes:
  • token_refresh_endpoint (Union[InterpolatedString, str]): The endpoint to refresh the access token
  • client_id (Union[InterpolatedString, str]): The client id
  • client_secret (Union[InterpolatedString, str]): Client secret
  • refresh_token (Union[InterpolatedString, str]): The token used to refresh the access token
  • access_token_name (Union[InterpolatedString, str]): The field to extract the access token from in the response
  • expires_in_name (Union[InterpolatedString, str]): The field to extract expires_in from in the response
  • config (Mapping[str, Any]): The user-provided configuration as specified by the source's spec
  • scopes (Optional[List[str]]): The scopes to request
  • token_expiry_date (Optional[Union[InterpolatedString, str]]): The access token expiration date
  • token_expiry_date_format str: format of the datetime; provide it if expires_in is returned as a datetime instead of seconds
  • token_expiry_is_time_of_expiration bool: set to True if expires_in is returned as the time of expiration instead of the number of seconds until expiration
  • refresh_request_body (Optional[Mapping[str, Any]]): The request body to send in the refresh request
  • refresh_request_headers (Optional[Mapping[str, Any]]): The request headers to send in the refresh request
  • grant_type: The grant_type to request for access_token. If set to refresh_token, the refresh_token parameter has to be provided
  • message_repository (MessageRepository): the message repository used to emit logs on HTTP requests
DeclarativeOauth2Authenticator( config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], client_id: Union[InterpolatedString, str, NoneType] = None, client_secret: Union[InterpolatedString, str, NoneType] = None, token_refresh_endpoint: Union[InterpolatedString, str, NoneType] = None, refresh_token: Union[InterpolatedString, str, NoneType] = None, scopes: Optional[List[str]] = None, token_expiry_date: Union[InterpolatedString, str, NoneType] = None, token_expiry_date_format: Optional[str] = None, token_expiry_is_time_of_expiration: bool = False, access_token_name: Union[InterpolatedString, str] = 'access_token', access_token_value: Union[InterpolatedString, str, NoneType] = None, client_id_name: Union[InterpolatedString, str] = 'client_id', client_secret_name: Union[InterpolatedString, str] = 'client_secret', expires_in_name: Union[InterpolatedString, str] = 'expires_in', refresh_token_name: Union[InterpolatedString, str] = 'refresh_token', refresh_request_body: Optional[Mapping[str, Any]] = None, refresh_request_headers: Optional[Mapping[str, Any]] = None, grant_type_name: Union[InterpolatedString, str] = 'grant_type', grant_type: Union[InterpolatedString, str] = 'refresh_token', message_repository: MessageRepository = <airbyte_cdk.sources.message.NoopMessageRepository object>, profile_assertion: Optional[DeclarativeAuthenticator] = None, use_profile_assertion: Union[InterpolatedBoolean, str, bool, NoneType] = False)

If all of refresh_token_error_status_codes, refresh_token_error_key, and refresh_token_error_values are set, then HTTP errors matching those parameters will be wrapped in AirbyteTracedException.

config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
client_id: Union[InterpolatedString, str, NoneType] = None
client_secret: Union[InterpolatedString, str, NoneType] = None
token_refresh_endpoint: Union[InterpolatedString, str, NoneType] = None
refresh_token: Union[InterpolatedString, str, NoneType] = None
scopes: Optional[List[str]] = None
token_expiry_date: Union[InterpolatedString, str, NoneType] = None
token_expiry_date_format: Optional[str] = None

Format of the datetime; provide it if expires_in is returned as the expiration datetime instead of seconds until it expires

token_expiry_is_time_of_expiration: bool = False

Indicates that the Token Expiry returns the date until which the token will be valid, not the amount of time it will be valid.

access_token_name: Union[InterpolatedString, str] = 'access_token'
access_token_value: Union[InterpolatedString, str, NoneType] = None
client_id_name: Union[InterpolatedString, str] = 'client_id'
client_secret_name: Union[InterpolatedString, str] = 'client_secret'
expires_in_name: Union[InterpolatedString, str] = 'expires_in'
refresh_token_name: Union[InterpolatedString, str] = 'refresh_token'
refresh_request_body: Optional[Mapping[str, Any]] = None
refresh_request_headers: Optional[Mapping[str, Any]] = None
grant_type_name: Union[InterpolatedString, str] = 'grant_type'
grant_type: Union[InterpolatedString, str] = 'refresh_token'
profile_assertion: Optional[DeclarativeAuthenticator] = None
use_profile_assertion: Union[InterpolatedBoolean, str, bool, NoneType] = False
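
A sketch of constructing the authenticator in Python with a placeholder endpoint and credentials; the credentials are interpolated from the connector config at runtime.

from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator

config = {
    "credentials": {
        "client_id": "my-client-id",
        "client_secret": "my-client-secret",
        "refresh_token": "my-refresh-token",
    }
}

authenticator = DeclarativeOauth2Authenticator(
    token_refresh_endpoint="https://example.com/oauth/token",  # placeholder URL
    client_id="{{ config['credentials']['client_id'] }}",
    client_secret="{{ config['credentials']['client_secret'] }}",
    refresh_token="{{ config['credentials']['refresh_token'] }}",
    config=config,
    parameters={},
)

# When attached to an HTTP requester, the authenticator refreshes the access token
# as needed and adds it to each request as an Authorization header.
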
def get_token_refresh_endpoint(self) -> Optional[str]:
178    def get_token_refresh_endpoint(self) -> Optional[str]:
179        if self._token_refresh_endpoint is not None:
180            refresh_token_endpoint: str = self._token_refresh_endpoint.eval(self.config)
181            if not refresh_token_endpoint:
182                raise ValueError(
183                    "OAuthAuthenticator was unable to evaluate token_refresh_endpoint parameter"
184                )
185            return refresh_token_endpoint
186        return None

Returns the endpoint to refresh the access token

def get_client_id_name(self) -> str:
188    def get_client_id_name(self) -> str:
189        return self._client_id_name.eval(self.config)  # type: ignore # eval returns a string in this context

The client id name to authenticate

def get_client_id(self) -> str:
191    def get_client_id(self) -> str:
192        client_id = self._client_id.eval(self.config) if self._client_id else self._client_id
193        if not client_id:
194            raise ValueError("OAuthAuthenticator was unable to evaluate client_id parameter")
195        return client_id  # type: ignore # value will be returned as a string, or an error will be raised

The client id to authenticate

def get_client_secret_name(self) -> str:
197    def get_client_secret_name(self) -> str:
198        return self._client_secret_name.eval(self.config)  # type: ignore # eval returns a string in this context

The client secret name to authenticate

def get_client_secret(self) -> str:
200    def get_client_secret(self) -> str:
201        client_secret = (
202            self._client_secret.eval(self.config) if self._client_secret else self._client_secret
203        )
204        if not client_secret:
205            raise ValueError("OAuthAuthenticator was unable to evaluate client_secret parameter")
206        return client_secret  # type: ignore # value will be returned as a string, or an error will be raised

The client secret to authenticate

def get_refresh_token_name(self) -> str:
208    def get_refresh_token_name(self) -> str:
209        return self._refresh_token_name.eval(self.config)  # type: ignore # eval returns a string in this context

The refresh token name to authenticate

def get_refresh_token(self) -> Optional[str]:
211    def get_refresh_token(self) -> Optional[str]:
212        return None if self._refresh_token is None else str(self._refresh_token.eval(self.config))

The token used to refresh the access token when it expires

def get_scopes(self) -> List[str]:
214    def get_scopes(self) -> List[str]:
215        return self.scopes or []

List of requested scopes

def get_access_token_name(self) -> str:
217    def get_access_token_name(self) -> str:
218        return self.access_token_name.eval(self.config)  # type: ignore # eval returns a string in this context

Field to extract access token from in the response

def get_expires_in_name(self) -> str:
220    def get_expires_in_name(self) -> str:
221        return self.expires_in_name.eval(self.config)  # type: ignore # eval returns a string in this context

Returns the expires_in field name

def get_grant_type_name(self) -> str:
223    def get_grant_type_name(self) -> str:
224        return self.grant_type_name.eval(self.config)  # type: ignore # eval returns a string in this context

Returns the name of the grant_type field used when requesting the access_token

def get_grant_type(self) -> str:
226    def get_grant_type(self) -> str:
227        return self.grant_type.eval(self.config)  # type: ignore # eval returns a string in this context

Returns the grant_type used when requesting the access_token

def get_refresh_request_body(self) -> Mapping[str, Any]:
229    def get_refresh_request_body(self) -> Mapping[str, Any]:
230        return self._refresh_request_body.eval(self.config)

Returns the request body to set on the refresh request

def get_refresh_request_headers(self) -> Mapping[str, Any]:
232    def get_refresh_request_headers(self) -> Mapping[str, Any]:
233        return self._refresh_request_headers.eval(self.config)

Returns the request headers to set on the refresh request

def get_token_expiry_date(self) -> airbyte_cdk.utils.datetime_helpers.AirbyteDateTime:
235    def get_token_expiry_date(self) -> AirbyteDateTime:
236        if not self._has_access_token_been_initialized():
237            return AirbyteDateTime.from_datetime(datetime.min)
238        return self._token_expiry_date  # type: ignore # _token_expiry_date is an AirbyteDateTime. It is never None despite what mypy thinks

Expiration date of the access token

def set_token_expiry_date(self, value: Union[str, int]) -> None:
243    def set_token_expiry_date(self, value: Union[str, int]) -> None:
244        self._token_expiry_date = self._parse_token_expiration_date(value)

Setter for access token expiration date

def get_assertion_name(self) -> str:
246    def get_assertion_name(self) -> str:
247        return self.assertion_name
def get_assertion(self) -> str:
249    def get_assertion(self) -> str:
250        if self.profile_assertion is None:
251            raise ValueError("profile_assertion is not set")
252        return self.profile_assertion.token
def build_refresh_request_body(self) -> Mapping[str, Any]:
254    def build_refresh_request_body(self) -> Mapping[str, Any]:
255        """
256        Returns the request body to set on the refresh request
257
258        Override to define additional parameters
259        """
260        if self.use_profile_assertion:
261            return {
262                self.get_grant_type_name(): self.get_grant_type(),
263                self.get_assertion_name(): self.get_assertion(),
264            }
265        return super().build_refresh_request_body()

Returns the request body to set on the refresh request

Override to define additional parameters

access_token: str
267    @property
268    def access_token(self) -> str:
269        if self._access_token is None:
270            raise ValueError("access_token is not set")
271        return self._access_token

Returns the access token

@dataclass
class DeclarativeSingleUseRefreshTokenOauth2Authenticator(airbyte_cdk.SingleUseRefreshTokenOauth2Authenticator, airbyte_cdk.DeclarativeAuthenticator):
285@dataclass
286class DeclarativeSingleUseRefreshTokenOauth2Authenticator(
287    SingleUseRefreshTokenOauth2Authenticator, DeclarativeAuthenticator
288):
289    """
290    Declarative version of SingleUseRefreshTokenOauth2Authenticator which can be used in declarative connectors.
291    """
292
293    def __init__(self, *args: Any, **kwargs: Any) -> None:
294        super().__init__(*args, **kwargs)

Declarative version of SingleUseRefreshTokenOauth2Authenticator which can be used in declarative connectors.

DeclarativeSingleUseRefreshTokenOauth2Authenticator(*args: Any, **kwargs: Any)
293    def __init__(self, *args: Any, **kwargs: Any) -> None:
294        super().__init__(*args, **kwargs)
Arguments:
  • connector_config (Mapping[str, Any]): The full connector configuration
  • token_refresh_endpoint (str): Full URL to the token refresh endpoint
  • scopes (List[str], optional): List of OAuth scopes to pass in the refresh token request body. Defaults to None.
  • access_token_name (str, optional): Name of the access token field, used to parse the refresh token response. Defaults to "access_token".
  • expires_in_name (str, optional): Name of the field that characterizes when the current access token will expire, used to parse the refresh token response. Defaults to "expires_in".
  • refresh_token_name (str, optional): Name of the refresh token field, used to parse the refresh token response. Defaults to "refresh_token".
  • refresh_request_body (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request body. Defaults to None.
  • refresh_request_headers (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request headers. Defaults to None.
  • grant_type (str, optional): OAuth grant type. Defaults to "refresh_token".
  • client_id (Optional[str]): The client id to authenticate. If not specified, defaults to credentials.client_id in the config object.
  • client_secret (Optional[str]): The client secret to authenticate. If not specified, defaults to credentials.client_secret in the config object.
  • access_token_config_path (Sequence[str]): Dpath to the access_token field in the connector configuration. Defaults to ("credentials", "access_token").
  • refresh_token_config_path (Sequence[str]): Dpath to the refresh_token field in the connector configuration. Defaults to ("credentials", "refresh_token").
  • token_expiry_date_config_path (Sequence[str]): Dpath to the token_expiry_date field in the connector configuration. Defaults to ("credentials", "token_expiry_date").
  • token_expiry_date_format (Optional[str]): Date format of the token expiry date field (set by expires_in_name). If not specified the token expiry date is interpreted as number of seconds until expiration.
  • token_expiry_is_time_of_expiration bool: set to True if expires_in is returned as the time of expiration instead of the number of seconds until expiration
  • message_repository (MessageRepository): the message repository used to emit logs on HTTP requests and control message on config update
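
A sketch with placeholder values; per the defaults above, the client id, client secret, and refresh token are read from (and, for the refresh token, written back to) the credentials section of the connector configuration.

from airbyte_cdk.sources.declarative.auth.oauth import (
    DeclarativeSingleUseRefreshTokenOauth2Authenticator,
)

connector_config = {
    "credentials": {
        "client_id": "my-client-id",
        "client_secret": "my-client-secret",
        "refresh_token": "my-single-use-refresh-token",
    }
}

authenticator = DeclarativeSingleUseRefreshTokenOauth2Authenticator(
    connector_config,
    token_refresh_endpoint="https://example.com/oauth/token",  # placeholder URL
)
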
@dataclass
class DeclarativeStream(airbyte_cdk.Stream):
 32@dataclass
 33class DeclarativeStream(Stream):
 34    """
 35    DeclarativeStream is a Stream that delegates most of its logic to its schema_load and retriever
 36
 37    Attributes:
 38        name (str): stream name
 39        primary_key (Optional[Union[str, List[str], List[List[str]]]]): the primary key of the stream
 40        schema_loader (SchemaLoader): The schema loader
 41        retriever (Retriever): The retriever
 42        config (Config): The user-provided configuration as specified by the source's spec
 43        stream_cursor_field (Optional[Union[InterpolatedString, str]]): The cursor field
 44        stream. Transformations are applied in the order in which they are defined.
 45    """
 46
 47    retriever: Retriever
 48    config: Config
 49    parameters: InitVar[Mapping[str, Any]]
 50    name: str
 51    primary_key: Optional[Union[str, List[str], List[List[str]]]]
 52    state_migrations: List[StateMigration] = field(repr=True, default_factory=list)
 53    schema_loader: Optional[SchemaLoader] = None
 54    _name: str = field(init=False, repr=False, default="")
 55    _primary_key: str = field(init=False, repr=False, default="")
 56    stream_cursor_field: Optional[Union[InterpolatedString, str]] = None
 57
 58    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 59        self._stream_cursor_field = (
 60            InterpolatedString.create(self.stream_cursor_field, parameters=parameters)
 61            if isinstance(self.stream_cursor_field, str)
 62            else self.stream_cursor_field
 63        )
 64        self._schema_loader = (
 65            self.schema_loader
 66            if self.schema_loader
 67            else DefaultSchemaLoader(config=self.config, parameters=parameters)
 68        )
 69
 70    @property  # type: ignore
 71    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
 72        return self._primary_key
 73
 74    @primary_key.setter
 75    def primary_key(self, value: str) -> None:
 76        if not isinstance(value, property):
 77            self._primary_key = value
 78
 79    @property
 80    def exit_on_rate_limit(self) -> bool:
 81        if isinstance(self.retriever, AsyncRetriever):
 82            return self.retriever.exit_on_rate_limit
 83
 84        return self.retriever.requester.exit_on_rate_limit  # type: ignore # abstract Retriever class has not requester attribute
 85
 86    @exit_on_rate_limit.setter
 87    def exit_on_rate_limit(self, value: bool) -> None:
 88        if isinstance(self.retriever, AsyncRetriever):
 89            self.retriever.exit_on_rate_limit = value
 90        else:
 91            self.retriever.requester.exit_on_rate_limit = value  # type: ignore[attr-defined]
 92
 93    @property  # type: ignore
 94    def name(self) -> str:
 95        """
 96        :return: Stream name. By default this is the implementing class name, but it can be overridden as needed.
 97        """
 98        return self._name
 99
100    @name.setter
101    def name(self, value: str) -> None:
102        if not isinstance(value, property):
103            self._name = value
104
105    @property
106    def state(self) -> MutableMapping[str, Any]:
107        return self.retriever.state  # type: ignore
108
109    @state.setter
110    def state(self, value: MutableMapping[str, Any]) -> None:
111        """State setter, accept state serialized by state getter."""
112        state: Mapping[str, Any] = value
113        if self.state_migrations:
114            for migration in self.state_migrations:
115                if migration.should_migrate(state):
116                    state = migration.migrate(state)
117        self.retriever.state = state
118
119    def get_updated_state(
120        self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
121    ) -> MutableMapping[str, Any]:
122        return self.state
123
124    @property
125    def cursor_field(self) -> Union[str, List[str]]:
126        """
127        Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field.
128        :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
129        """
130        cursor = self._stream_cursor_field.eval(self.config)  # type: ignore # _stream_cursor_field is always cast to interpolated string
131        return cursor if cursor else []
132
133    @property
134    def is_resumable(self) -> bool:
135        # Declarative sources always implement state getter/setter, but whether it supports checkpointing is based on
136        # if the retriever has a cursor defined.
137        return self.retriever.cursor is not None if hasattr(self.retriever, "cursor") else False
138
139    def read_records(
140        self,
141        sync_mode: SyncMode,
142        cursor_field: Optional[List[str]] = None,
143        stream_slice: Optional[Mapping[str, Any]] = None,
144        stream_state: Optional[Mapping[str, Any]] = None,
145    ) -> Iterable[Mapping[str, Any]]:
146        """
147        :param: stream_state We knowingly avoid using stream_state as we want cursors to manage their own state.
148        """
149        if stream_slice is None or (
150            not isinstance(stream_slice, StreamSlice) and stream_slice == {}
151        ):
152            # As the parameter is Optional, many would just call `read_records(sync_mode)` during testing without specifying the field
153            # As part of the declarative model without custom components, this should never happen as the CDK would wire up a
154            # SinglePartitionRouter that would create this StreamSlice properly
155            # As part of the declarative model with custom components, a user that would return a `None` slice would now have the default
156            # empty slice which seems to make sense.
157            stream_slice = StreamSlice(partition={}, cursor_slice={})
158        if not isinstance(stream_slice, StreamSlice):
159            raise ValueError(
160                f"DeclarativeStream does not support stream_slices that are not StreamSlice. Got {stream_slice}"
161            )
162        yield from self.retriever.read_records(self.get_json_schema(), stream_slice)  # type: ignore # records are of the correct type
163
164    def get_json_schema(self) -> Mapping[str, Any]:  # type: ignore
165        """
166        :return: A dict of the JSON schema representing this stream.
167
168        The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property.
169        Override as needed.
170        """
171        return self._schema_loader.get_json_schema()
172
173    def stream_slices(
174        self,
175        *,
176        sync_mode: SyncMode,
177        cursor_field: Optional[List[str]] = None,
178        stream_state: Optional[Mapping[str, Any]] = None,
179    ) -> Iterable[Optional[StreamSlice]]:
180        """
181        Override to define the slices for this stream. See the stream slicing section of the docs for more information.
182
183        :param sync_mode:
184        :param cursor_field:
185        :param stream_state: we knowingly avoid using stream_state as we want cursors to manage their own state
186        :return:
187        """
188        return self.retriever.stream_slices()
189
190    @property
191    def state_checkpoint_interval(self) -> Optional[int]:
192        """
193        We explicitly disable checkpointing here. There are a couple reasons for that and not all are documented here but:
194        * In the case where records are not ordered, the granularity of what is ordered is the slice. Therefore, we will only update the
195            cursor value once at the end of every slice.
196        * Updating the state once every record would generate issues for data feed stop conditions or semi-incremental syncs where the
197            important state is the one at the beginning of the slice
198        """
199        return None
200
201    def get_cursor(self) -> Optional[Cursor]:
202        if self.retriever and isinstance(self.retriever, SimpleRetriever):
203            return self.retriever.cursor
204        return None
205
206    def _get_checkpoint_reader(
207        self,
208        logger: logging.Logger,
209        cursor_field: Optional[List[str]],
210        sync_mode: SyncMode,
211        stream_state: MutableMapping[str, Any],
212    ) -> CheckpointReader:
213        """
214        This method is overridden to prevent issues with stream slice classification for incremental streams that have parent streams.
215
216        The classification logic, when used with `itertools.tee`, creates a copy of the stream slices. When `stream_slices` is called
217        the second time, the parent records generated during the classification phase are lost. This occurs because `itertools.tee`
218        only buffers the results, meaning the logic in `simple_retriever` that observes and updates the cursor isn't executed again.
219
220        By overriding this method, we ensure that the stream slices are processed correctly and parent records are not lost,
221        allowing the cursor to function as expected.
222        """
223        mappings_or_slices = self.stream_slices(
224            cursor_field=cursor_field,
225            sync_mode=sync_mode,  # todo: change this interface to no longer rely on sync_mode for behavior
226            stream_state=stream_state,
227        )
228
229        cursor = self.get_cursor()
230        checkpoint_mode = self._checkpoint_mode
231
232        if isinstance(
233            cursor, (GlobalSubstreamCursor, PerPartitionCursor, PerPartitionWithGlobalCursor)
234        ):
235            self.has_multiple_slices = True
236            return CursorBasedCheckpointReader(
237                stream_slices=mappings_or_slices,
238                cursor=cursor,
239                read_state_from_cursor=checkpoint_mode == CheckpointMode.RESUMABLE_FULL_REFRESH,
240            )
241
242        return super()._get_checkpoint_reader(logger, cursor_field, sync_mode, stream_state)

DeclarativeStream is a Stream that delegates most of its logic to its schema_loader and retriever.

Attributes:
  • name (str): stream name
  • primary_key (Optional[Union[str, List[str], List[List[str]]]]): the primary key of the stream
  • schema_loader (SchemaLoader): The schema loader
  • retriever (Retriever): The retriever
  • config (Config): The user-provided configuration as specified by the source's spec
  • stream_cursor_field (Optional[Union[InterpolatedString, str]]): The cursor field of the stream. Transformations are applied in the order in which they are defined.
DeclarativeStream( retriever: airbyte_cdk.sources.declarative.retrievers.Retriever, config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], name: str = <property object>, primary_key: Union[str, List[str], List[List[str]], NoneType] = <property object>, state_migrations: List[airbyte_cdk.sources.declarative.migrations.state_migration.StateMigration] = <factory>, schema_loader: Optional[airbyte_cdk.sources.declarative.schema.SchemaLoader] = None, stream_cursor_field: Union[InterpolatedString, str, NoneType] = None)
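
For orientation, here is a minimal sketch (not taken from the library) of how an already-assembled DeclarativeStream is typically consumed. In practice the declarative framework builds the stream from a manifest; the helper below only assumes the documented stream_slices/read_records interface.

  from typing import Any, Iterable, Mapping

  from airbyte_cdk import DeclarativeStream
  from airbyte_cdk.models import SyncMode

  def dump_stream(stream: DeclarativeStream) -> Iterable[Mapping[str, Any]]:
      """Read every record of an already-configured DeclarativeStream, slice by slice."""
      for stream_slice in stream.stream_slices(sync_mode=SyncMode.full_refresh):
          # read_records delegates to the retriever; the slice carries partition/cursor info
          yield from stream.read_records(
              sync_mode=SyncMode.full_refresh,
              stream_slice=stream_slice,
          )
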
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
name: str
93    @property  # type: ignore
94    def name(self) -> str:
95        """
96        :return: Stream name. By default this is the implementing class name, but it can be overridden as needed.
97        """
98        return self._name
Returns

Stream name. By default this is the implementing class name, but it can be overridden as needed.

primary_key: Union[str, List[str], List[List[str]], NoneType]
70    @property  # type: ignore
71    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
72        return self._primary_key
Returns

string if single primary key, list of strings if composite primary key, list of list of strings if composite primary key consisting of nested fields. If the stream has no primary keys, return None.

schema_loader: Optional[airbyte_cdk.sources.declarative.schema.SchemaLoader] = None
stream_cursor_field: Union[InterpolatedString, str, NoneType] = None
exit_on_rate_limit: bool
79    @property
80    def exit_on_rate_limit(self) -> bool:
81        if isinstance(self.retriever, AsyncRetriever):
82            return self.retriever.exit_on_rate_limit
83
84        return self.retriever.requester.exit_on_rate_limit  # type: ignore # abstract Retriever class has not requester attribute

Exit-on-rate-limit getter; returns a bool. False means the stream will retry endlessly when rate limited.

state: MutableMapping[str, Any]
105    @property
106    def state(self) -> MutableMapping[str, Any]:
107        return self.retriever.state  # type: ignore

State setter; accepts state serialized by the state getter.

def get_updated_state( self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]) -> MutableMapping[str, Any]:
119    def get_updated_state(
120        self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
121    ) -> MutableMapping[str, Any]:
122        return self.state

DEPRECATED. Please use explicit state property instead, see IncrementalMixin docs.

Override to extract state from the latest record. Needed to implement incremental sync.

Inspects the latest record extracted from the data source and the current state object, and returns an updated state object.

For example: if the state object is based on created_at timestamp, and the current state is {'created_at': 10}, and the latest_record is {'name': 'octavia', 'created_at': 20 } then this method would return {'created_at': 20} to indicate state should be updated to this object.

Parameters
  • current_stream_state: The stream's current state object
  • latest_record: The latest record extracted from the stream
Returns

An updated state object

cursor_field: Union[str, List[str]]
124    @property
125    def cursor_field(self) -> Union[str, List[str]]:
126        """
127        Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field.
128        :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
129        """
130        cursor = self._stream_cursor_field.eval(self.config)  # type: ignore # _stream_cursor_field is always cast to interpolated string
131        return cursor if cursor else []

Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field.

Returns

The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.

is_resumable: bool
133    @property
134    def is_resumable(self) -> bool:
135        # Declarative sources always implement state getter/setter, but whether it supports checkpointing is based on
136        # if the retriever has a cursor defined.
137        return self.retriever.cursor is not None if hasattr(self.retriever, "cursor") else False
Returns

True if this stream allows the checkpointing of sync progress and can resume from it on subsequent attempts. This differs from supports_incremental because certain kinds of streams like those supporting resumable full refresh can checkpoint progress in between attempts for improved fault tolerance. However, they will start from the beginning on the next sync job.

def read_records( self, sync_mode: airbyte_protocol_dataclasses.models.airbyte_protocol.SyncMode, cursor_field: Optional[List[str]] = None, stream_slice: Optional[Mapping[str, Any]] = None, stream_state: Optional[Mapping[str, Any]] = None) -> Iterable[Mapping[str, Any]]:
139    def read_records(
140        self,
141        sync_mode: SyncMode,
142        cursor_field: Optional[List[str]] = None,
143        stream_slice: Optional[Mapping[str, Any]] = None,
144        stream_state: Optional[Mapping[str, Any]] = None,
145    ) -> Iterable[Mapping[str, Any]]:
146        """
147        :param: stream_state We knowingly avoid using stream_state as we want cursors to manage their own state.
148        """
149        if stream_slice is None or (
150            not isinstance(stream_slice, StreamSlice) and stream_slice == {}
151        ):
152            # As the parameter is Optional, many would just call `read_records(sync_mode)` during testing without specifying the field
153            # As part of the declarative model without custom components, this should never happen as the CDK would wire up a
154            # SinglePartitionRouter that would create this StreamSlice properly
155            # As part of the declarative model with custom components, a user that would return a `None` slice would now have the default
156            # empty slice which seems to make sense.
157            stream_slice = StreamSlice(partition={}, cursor_slice={})
158        if not isinstance(stream_slice, StreamSlice):
159            raise ValueError(
160                f"DeclarativeStream does not support stream_slices that are not StreamSlice. Got {stream_slice}"
161            )
162        yield from self.retriever.read_records(self.get_json_schema(), stream_slice)  # type: ignore # records are of the correct type
Parameters
  • stream_state: We knowingly avoid using stream_state, as we want cursors to manage their own state.
def get_json_schema(self) -> Mapping[str, Any]:
164    def get_json_schema(self) -> Mapping[str, Any]:  # type: ignore
165        """
166        :return: A dict of the JSON schema representing this stream.
167
168        The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property.
169        Override as needed.
170        """
171        return self._schema_loader.get_json_schema()
Returns

A dict of the JSON schema representing this stream.

The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property. Override as needed.

def stream_slices( self, *, sync_mode: airbyte_protocol_dataclasses.models.airbyte_protocol.SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None) -> Iterable[Optional[StreamSlice]]:
173    def stream_slices(
174        self,
175        *,
176        sync_mode: SyncMode,
177        cursor_field: Optional[List[str]] = None,
178        stream_state: Optional[Mapping[str, Any]] = None,
179    ) -> Iterable[Optional[StreamSlice]]:
180        """
181        Override to define the slices for this stream. See the stream slicing section of the docs for more information.
182
183        :param sync_mode:
184        :param cursor_field:
185        :param stream_state: we knowingly avoid using stream_state as we want cursors to manage their own state
186        :return:
187        """
188        return self.retriever.stream_slices()

Override to define the slices for this stream. See the stream slicing section of the docs for more information.

Parameters
  • sync_mode:
  • cursor_field:
  • stream_state: We knowingly avoid using stream_state, as we want cursors to manage their own state.
state_checkpoint_interval: Optional[int]
190    @property
191    def state_checkpoint_interval(self) -> Optional[int]:
192        """
193        We explicitly disable checkpointing here. There are a couple reasons for that and not all are documented here but:
194        * In the case where records are not ordered, the granularity of what is ordered is the slice. Therefore, we will only update the
195            cursor value once at the end of every slice.
196        * Updating the state once every record would generate issues for data feed stop conditions or semi-incremental syncs where the
197            important state is the one at the beginning of the slice
198        """
199        return None

We explicitly disable checkpointing here. There are a couple of reasons for this, not all of which are documented here:

  • In the case where records are not ordered, the granularity of what is ordered is the slice. Therefore, we will only update the cursor value once at the end of every slice.
  • Updating the state after every record would cause issues for data feed stop conditions or semi-incremental syncs, where the important state is the one at the beginning of the slice.
def get_cursor(self) -> Optional[airbyte_cdk.sources.streams.checkpoint.Cursor]:
201    def get_cursor(self) -> Optional[Cursor]:
202        if self.retriever and isinstance(self.retriever, SimpleRetriever):
203            return self.retriever.cursor
204        return None

A Cursor is an interface that a stream can implement to manage how its internal state is read and updated while reading records. Historically, Python connectors had no concept of a cursor to manage state. Python streams need to define a cursor implementation and override this method to manage state through a Cursor.

@dataclass
class Decoder:
15@dataclass
16class Decoder:
17    """
18    Decoder strategy to transform a requests.Response into a Mapping[str, Any]
19    """
20
21    @abstractmethod
22    def is_stream_response(self) -> bool:
23        """
24        Set to True if you'd like to use stream=True option in http requester
25        """
26
27    @abstractmethod
28    def decode(self, response: requests.Response) -> DECODER_OUTPUT_TYPE:
29        """
30        Decodes a requests.Response into a Mapping[str, Any] or an array
31        :param response: the response to decode
32        :return: Generator of Mapping describing the response
33        """

Decoder strategy to transform a requests.Response into a Mapping[str, Any]

@abstractmethod
def is_stream_response(self) -> bool:
21    @abstractmethod
22    def is_stream_response(self) -> bool:
23        """
24        Set to True if you'd like to use stream=True option in http requester
25        """

Return True if you'd like to use the stream=True option in the HTTP requester.

@abstractmethod
def decode( self, response: requests.models.Response) -> Generator[MutableMapping[str, Any], NoneType, NoneType]:
27    @abstractmethod
28    def decode(self, response: requests.Response) -> DECODER_OUTPUT_TYPE:
29        """
30        Decodes a requests.Response into a Mapping[str, Any] or an array
31        :param response: the response to decode
32        :return: Generator of Mapping describing the response
33        """

Decodes a requests.Response into a Mapping[str, Any] or an array

Parameters
  • response: the response to decode
Returns

Generator of Mapping describing the response
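
As a concrete illustration (not part of the library), a custom decoder only needs to implement the two abstract methods shown above. The PlainTextDecoder below is a hypothetical sketch that wraps a text body in a single mapping; the import path assumes Decoder is exposed from the top-level airbyte_cdk package as documented here.

  from typing import Any, Generator, MutableMapping

  import requests

  from airbyte_cdk import Decoder

  class PlainTextDecoder(Decoder):
      """Hypothetical decoder that wraps a text/plain body into one mapping."""

      def is_stream_response(self) -> bool:
          # Do not ask the HTTP requester for stream=True
          return False

      def decode(
          self, response: requests.Response
      ) -> Generator[MutableMapping[str, Any], None, None]:
          yield {"text": response.text}
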

@dataclass
class DefaultPaginator(airbyte_cdk.sources.declarative.requesters.paginators.paginator.Paginator):
 33@dataclass
 34class DefaultPaginator(Paginator):
 35    """
 36    Default paginator to request pages of results with a fixed size until the pagination strategy no longer returns a next_page_token
 37
 38    Examples:
 39        1.
 40        * fetches up to 10 records at a time by setting the "limit" request param to 10
 41        * updates the request path with  "{{ response._metadata.next }}"
 42        ```
 43          paginator:
 44            type: "DefaultPaginator"
 45            page_size_option:
 46              type: RequestOption
 47              inject_into: request_parameter
 48              field_name: limit
 49            page_token_option:
 50              type: RequestPath
 51              path: "location"
 52            pagination_strategy:
 53              type: "CursorPagination"
 54              cursor_value: "{{ response._metadata.next }}"
 55              page_size: 10
 56        ```
 57
 58        2.
 59        * fetches up to 5 records at a time by setting the "page_size" header to 5
 60        * increments a record counter and set the request parameter "offset" to the value of the counter
 61        ```
 62          paginator:
 63            type: "DefaultPaginator"
 64            page_size_option:
 65              type: RequestOption
 66              inject_into: header
 67              field_name: page_size
 68            pagination_strategy:
 69              type: "OffsetIncrement"
 70              page_size: 5
 71            page_token_option:
 72              option_type: "request_parameter"
 73              field_name: "offset"
 74        ```
 75
 76        3.
 77        * fetches up to 5 records at a time by setting the "page_size" request param to 5
 78        * increments a page counter and set the request parameter "page" to the value of the counter
 79        ```
 80          paginator:
 81            type: "DefaultPaginator"
 82            page_size_option:
 83              type: RequestOption
 84              inject_into: request_parameter
 85              field_name: page_size
 86            pagination_strategy:
 87              type: "PageIncrement"
 88              page_size: 5
 89            page_token_option:
 90              type: RequestOption
 91              option_type: "request_parameter"
 92              field_name: "page"
 93        ```
 94    Attributes:
 95        page_size_option (Optional[RequestOption]): the request option to set the page size. Cannot be injected in the path.
 96        page_token_option (Optional[RequestPath, RequestOption]): the request option to set the page token
 97        pagination_strategy (PaginationStrategy): Strategy defining how to get the next page token
 98        config (Config): connection config
 99        url_base (Union[InterpolatedString, str]): endpoint's base url
100        decoder (Decoder): decoder to decode the response
101    """
102
103    pagination_strategy: PaginationStrategy
104    config: Config
105    url_base: Union[InterpolatedString, str]
106    parameters: InitVar[Mapping[str, Any]]
107    decoder: Decoder = field(
108        default_factory=lambda: PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
109    )
110    page_size_option: Optional[RequestOption] = None
111    page_token_option: Optional[Union[RequestPath, RequestOption]] = None
112
113    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
114        if self.page_size_option and not self.pagination_strategy.get_page_size():
115            raise ValueError(
116                "page_size_option cannot be set if the pagination strategy does not have a page_size"
117            )
118        if isinstance(self.url_base, str):
119            self.url_base = InterpolatedString(string=self.url_base, parameters=parameters)
120
121        if self.page_token_option and not isinstance(self.page_token_option, RequestPath):
122            _validate_component_request_option_paths(
123                self.config,
124                self.page_size_option,
125                self.page_token_option,
126            )
127
128    def get_initial_token(self) -> Optional[Any]:
129        """
130        Return the page token that should be used for the first request of a stream
131
132        WARNING: get_initial_token() should not be used by streams that use RFR that perform checkpointing
133        of state using page numbers. Because paginators are stateless
134        """
135        return self.pagination_strategy.initial_token
136
137    def next_page_token(
138        self,
139        response: requests.Response,
140        last_page_size: int,
141        last_record: Optional[Record],
142        last_page_token_value: Optional[Any] = None,
143    ) -> Optional[Mapping[str, Any]]:
144        next_page_token = self.pagination_strategy.next_page_token(
145            response=response,
146            last_page_size=last_page_size,
147            last_record=last_record,
148            last_page_token_value=last_page_token_value,
149        )
150        if next_page_token:
151            return {"next_page_token": next_page_token}
152        else:
153            return None
154
155    def path(
156        self,
157        next_page_token: Optional[Mapping[str, Any]],
158        stream_state: Optional[Mapping[str, Any]] = None,
159        stream_slice: Optional[StreamSlice] = None,
160    ) -> Optional[str]:
161        token = next_page_token.get("next_page_token") if next_page_token else None
162        if token and self.page_token_option and isinstance(self.page_token_option, RequestPath):
163            # make additional interpolation context
164            interpolation_context = get_interpolation_context(
165                stream_state=stream_state,
166                stream_slice=stream_slice,
167                next_page_token=next_page_token,
168            )
169            # Replace url base to only return the path
170            return str(token).replace(self.url_base.eval(self.config, **interpolation_context), "")  # type: ignore # url_base is casted to a InterpolatedString in __post_init__
171        else:
172            return None
173
174    def get_request_params(
175        self,
176        *,
177        stream_state: Optional[StreamState] = None,
178        stream_slice: Optional[StreamSlice] = None,
179        next_page_token: Optional[Mapping[str, Any]] = None,
180    ) -> MutableMapping[str, Any]:
181        return self._get_request_options(RequestOptionType.request_parameter, next_page_token)
182
183    def get_request_headers(
184        self,
185        *,
186        stream_state: Optional[StreamState] = None,
187        stream_slice: Optional[StreamSlice] = None,
188        next_page_token: Optional[Mapping[str, Any]] = None,
189    ) -> Mapping[str, str]:
190        return self._get_request_options(RequestOptionType.header, next_page_token)
191
192    def get_request_body_data(
193        self,
194        *,
195        stream_state: Optional[StreamState] = None,
196        stream_slice: Optional[StreamSlice] = None,
197        next_page_token: Optional[Mapping[str, Any]] = None,
198    ) -> Mapping[str, Any]:
199        return self._get_request_options(RequestOptionType.body_data, next_page_token)
200
201    def get_request_body_json(
202        self,
203        *,
204        stream_state: Optional[StreamState] = None,
205        stream_slice: Optional[StreamSlice] = None,
206        next_page_token: Optional[Mapping[str, Any]] = None,
207    ) -> Mapping[str, Any]:
208        return self._get_request_options(RequestOptionType.body_json, next_page_token)
209
210    def _get_request_options(
211        self, option_type: RequestOptionType, next_page_token: Optional[Mapping[str, Any]]
212    ) -> MutableMapping[str, Any]:
213        options: MutableMapping[str, Any] = {}
214
215        token = next_page_token.get("next_page_token") if next_page_token else None
216        if (
217            self.page_token_option
218            and token is not None
219            and isinstance(self.page_token_option, RequestOption)
220            and self.page_token_option.inject_into == option_type
221        ):
222            self.page_token_option.inject_into_request(options, token, self.config)
223
224        if (
225            self.page_size_option
226            and self.pagination_strategy.get_page_size()
227            and self.page_size_option.inject_into == option_type
228        ):
229            page_size = self.pagination_strategy.get_page_size()
230            self.page_size_option.inject_into_request(options, page_size, self.config)
231
232        return options

Default paginator to request pages of results with a fixed size until the pagination strategy no longer returns a next_page_token

Examples:

1.

  • fetches up to 10 records at a time by setting the "limit" request param to 10
  • updates the request path with "{{ response._metadata.next }}"
  paginator:
    type: "DefaultPaginator"
    page_size_option:
      type: RequestOption
      inject_into: request_parameter
      field_name: limit
    page_token_option:
      type: RequestPath
      path: "location"
    pagination_strategy:
      type: "CursorPagination"
      cursor_value: "{{ response._metadata.next }}"
      page_size: 10

2.

  • fetches up to 5 records at a time by setting the "page_size" header to 5
  • increments a record counter and sets the request parameter "offset" to the value of the counter
  paginator:
    type: "DefaultPaginator"
    page_size_option:
      type: RequestOption
      inject_into: header
      field_name: page_size
    pagination_strategy:
      type: "OffsetIncrement"
      page_size: 5
    page_token_option:
      option_type: "request_parameter"
      field_name: "offset"

3.

  • fetches up to 5 records at a time by setting the "page_size" request param to 5
  • increments a page counter and sets the request parameter "page" to the value of the counter
  paginator:
    type: "DefaultPaginator"
    page_size_option:
      type: RequestOption
      inject_into: request_parameter
      field_name: page_size
    pagination_strategy:
      type: "PageIncrement"
      page_size: 5
    page_token_option:
      type: RequestOption
      option_type: "request_parameter"
      field_name: "page"
Attributes:
  • page_size_option (Optional[RequestOption]): the request option to set the page size. Cannot be injected in the path.
  • page_token_option (Optional[Union[RequestPath, RequestOption]]): the request option to set the page token
  • pagination_strategy (PaginationStrategy): Strategy defining how to get the next page token
  • config (Config): connection config
  • url_base (Union[InterpolatedString, str]): endpoint's base url
  • decoder (Decoder): decoder to decode the response
DefaultPaginator( pagination_strategy: PaginationStrategy, config: Mapping[str, Any], url_base: Union[InterpolatedString, str], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], decoder: Decoder = <factory>, page_size_option: Optional[RequestOption] = None, page_token_option: Union[airbyte_cdk.sources.declarative.requesters.request_path.RequestPath, RequestOption, NoneType] = None)
pagination_strategy: PaginationStrategy
config: Mapping[str, Any]
url_base: Union[InterpolatedString, str]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
decoder: Decoder
page_size_option: Optional[RequestOption] = None
def get_initial_token(self) -> Optional[Any]:
128    def get_initial_token(self) -> Optional[Any]:
129        """
130        Return the page token that should be used for the first request of a stream
131
132        WARNING: get_initial_token() should not be used by streams that use RFR that perform checkpointing
133        of state using page numbers. Because paginators are stateless
134        """
135        return self.pagination_strategy.initial_token

Return the page token that should be used for the first request of a stream

WARNING: get_initial_token() should not be used by streams that use resumable full refresh (RFR) and checkpoint state using page numbers, because paginators are stateless.

def next_page_token( self, response: requests.models.Response, last_page_size: int, last_record: Optional[Record], last_page_token_value: Optional[Any] = None) -> Optional[Mapping[str, Any]]:
137    def next_page_token(
138        self,
139        response: requests.Response,
140        last_page_size: int,
141        last_record: Optional[Record],
142        last_page_token_value: Optional[Any] = None,
143    ) -> Optional[Mapping[str, Any]]:
144        next_page_token = self.pagination_strategy.next_page_token(
145            response=response,
146            last_page_size=last_page_size,
147            last_record=last_record,
148            last_page_token_value=last_page_token_value,
149        )
150        if next_page_token:
151            return {"next_page_token": next_page_token}
152        else:
153            return None

Returns the next_page_token to use to fetch the next page of records.

Parameters
  • response: the response to process
  • last_page_size: the number of records read from the response
  • last_record: the last record extracted from the response
  • last_page_token_value: The current value of the page token made on the last request
Returns

A mapping {"next_page_token": } for the next page from the input response object. Returning None means there are no more pages to read in this response.

def path( self, next_page_token: Optional[Mapping[str, Any]], stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None) -> Optional[str]:
155    def path(
156        self,
157        next_page_token: Optional[Mapping[str, Any]],
158        stream_state: Optional[Mapping[str, Any]] = None,
159        stream_slice: Optional[StreamSlice] = None,
160    ) -> Optional[str]:
161        token = next_page_token.get("next_page_token") if next_page_token else None
162        if token and self.page_token_option and isinstance(self.page_token_option, RequestPath):
163            # make additional interpolation context
164            interpolation_context = get_interpolation_context(
165                stream_state=stream_state,
166                stream_slice=stream_slice,
167                next_page_token=next_page_token,
168            )
169            # Replace url base to only return the path
170            return str(token).replace(self.url_base.eval(self.config, **interpolation_context), "")  # type: ignore # url_base is casted to a InterpolatedString in __post_init__
171        else:
172            return None

Returns the URL path to hit to fetch the next page of records

e.g: if you wanted to hit https://myapi.com/v1/some_entity then this will return "some_entity"

Returns

path to hit to fetch the next request. Returning None means the path is not defined by the next_page_token

def get_request_params( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> MutableMapping[str, Any]:
174    def get_request_params(
175        self,
176        *,
177        stream_state: Optional[StreamState] = None,
178        stream_slice: Optional[StreamSlice] = None,
179        next_page_token: Optional[Mapping[str, Any]] = None,
180    ) -> MutableMapping[str, Any]:
181        return self._get_request_options(RequestOptionType.request_parameter, next_page_token)

Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.

E.g: you might want to define query parameters for paging if next_page_token is not None.

def get_request_headers( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, str]:
183    def get_request_headers(
184        self,
185        *,
186        stream_state: Optional[StreamState] = None,
187        stream_slice: Optional[StreamSlice] = None,
188        next_page_token: Optional[Mapping[str, Any]] = None,
189    ) -> Mapping[str, str]:
190        return self._get_request_options(RequestOptionType.header, next_page_token)

Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.

def get_request_body_data( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
192    def get_request_body_data(
193        self,
194        *,
195        stream_state: Optional[StreamState] = None,
196        stream_slice: Optional[StreamSlice] = None,
197        next_page_token: Optional[Mapping[str, Any]] = None,
198    ) -> Mapping[str, Any]:
199        return self._get_request_options(RequestOptionType.body_data, next_page_token)

Specifies how to populate the body of the request with a non-JSON payload.

If it returns text, the text is sent as-is. If it returns a dict, the dict is converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".

Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at a time.

def get_request_body_json( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
201    def get_request_body_json(
202        self,
203        *,
204        stream_state: Optional[StreamState] = None,
205        stream_slice: Optional[StreamSlice] = None,
206        next_page_token: Optional[Mapping[str, Any]] = None,
207    ) -> Mapping[str, Any]:
208        return self._get_request_options(RequestOptionType.body_json, next_page_token)

Specifies how to populate the body of the request with a JSON payload.

Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at a time.

@dataclass
class DefaultRequestOptionsProvider(RequestOptionsProvider):
 15@dataclass
16class DefaultRequestOptionsProvider(RequestOptionsProvider):
17    """
18    Request options provider that extracts fields from the stream_slice and injects them into the respective location in the
19    outbound request being made
20    """
21
22    parameters: InitVar[Mapping[str, Any]]
23
24    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
25        pass
26
27    def get_request_params(
28        self,
29        *,
30        stream_state: Optional[StreamState] = None,
31        stream_slice: Optional[StreamSlice] = None,
32        next_page_token: Optional[Mapping[str, Any]] = None,
33    ) -> Mapping[str, Any]:
34        return {}
35
36    def get_request_headers(
37        self,
38        *,
39        stream_state: Optional[StreamState] = None,
40        stream_slice: Optional[StreamSlice] = None,
41        next_page_token: Optional[Mapping[str, Any]] = None,
42    ) -> Mapping[str, Any]:
43        return {}
44
45    def get_request_body_data(
46        self,
47        *,
48        stream_state: Optional[StreamState] = None,
49        stream_slice: Optional[StreamSlice] = None,
50        next_page_token: Optional[Mapping[str, Any]] = None,
51    ) -> Union[Mapping[str, Any], str]:
52        return {}
53
54    def get_request_body_json(
55        self,
56        *,
57        stream_state: Optional[StreamState] = None,
58        stream_slice: Optional[StreamSlice] = None,
59        next_page_token: Optional[Mapping[str, Any]] = None,
60    ) -> Mapping[str, Any]:
61        return {}

Request options provider that extracts fields from the stream_slice and injects them into the respective location in the outbound request being made

DefaultRequestOptionsProvider(parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]])
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
def get_request_params( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
27    def get_request_params(
28        self,
29        *,
30        stream_state: Optional[StreamState] = None,
31        stream_slice: Optional[StreamSlice] = None,
32        next_page_token: Optional[Mapping[str, Any]] = None,
33    ) -> Mapping[str, Any]:
34        return {}

Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.

E.g: you might want to define query parameters for paging if next_page_token is not None.

def get_request_headers( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
36    def get_request_headers(
37        self,
38        *,
39        stream_state: Optional[StreamState] = None,
40        stream_slice: Optional[StreamSlice] = None,
41        next_page_token: Optional[Mapping[str, Any]] = None,
42    ) -> Mapping[str, Any]:
43        return {}

Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.

def get_request_body_data( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Union[Mapping[str, Any], str]:
45    def get_request_body_data(
46        self,
47        *,
48        stream_state: Optional[StreamState] = None,
49        stream_slice: Optional[StreamSlice] = None,
50        next_page_token: Optional[Mapping[str, Any]] = None,
51    ) -> Union[Mapping[str, Any], str]:
52        return {}

Specifies how to populate the body of the request with a non-JSON payload.

If it returns text, the text is sent as-is. If it returns a dict, the dict is converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".

Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at a time.

def get_request_body_json( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
54    def get_request_body_json(
55        self,
56        *,
57        stream_state: Optional[StreamState] = None,
58        stream_slice: Optional[StreamSlice] = None,
59        next_page_token: Optional[Mapping[str, Any]] = None,
60    ) -> Mapping[str, Any]:
61        return {}

Specifies how to populate the body of the request with a JSON payload.

Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at a time.
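
Note that, as the listing above shows, every getter of DefaultRequestOptionsProvider simply returns an empty mapping; it acts as a no-op placeholder when no request options are configured. A quick sanity check (a sketch, assuming the class is importable from the top-level package as documented here):

  from airbyte_cdk import DefaultRequestOptionsProvider

  provider = DefaultRequestOptionsProvider(parameters={})
  assert provider.get_request_params() == {}
  assert provider.get_request_headers() == {}
  assert provider.get_request_body_data() == {}
  assert provider.get_request_body_json() == {}
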

@dataclass
class DpathExtractor(airbyte_cdk.RecordExtractor):
18@dataclass
19class DpathExtractor(RecordExtractor):
20    """
21    Record extractor that searches a decoded response over a path defined as an array of fields.
22
23    If the field path points to an array, that array is returned.
24    If the field path points to an object, that object is returned wrapped as an array.
25    If the field path points to an empty object, an empty array is returned.
26    If the field path points to a non-existing path, an empty array is returned.
27
28    Examples of instantiating this transform:
29    ```
30      extractor:
31        type: DpathExtractor
32        field_path:
33          - "root"
34          - "data"
35    ```
36
37    ```
38      extractor:
39        type: DpathExtractor
40        field_path:
41          - "root"
42          - "{{ parameters['field'] }}"
43    ```
44
45    ```
46      extractor:
47        type: DpathExtractor
48        field_path: []
49    ```
50
51    Attributes:
52        field_path (Union[InterpolatedString, str]): Path to the field that should be extracted
53        config (Config): The user-provided configuration as specified by the source's spec
54        decoder (Decoder): The decoder responsible to transfom the response in a Mapping
55    """
56
57    field_path: List[Union[InterpolatedString, str]]
58    config: Config
59    parameters: InitVar[Mapping[str, Any]]
60    decoder: Decoder = field(default_factory=lambda: JsonDecoder(parameters={}))
61
62    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
63        self._field_path = [
64            InterpolatedString.create(path, parameters=parameters) for path in self.field_path
65        ]
66        for path_index in range(len(self.field_path)):
67            if isinstance(self.field_path[path_index], str):
68                self._field_path[path_index] = InterpolatedString.create(
69                    self.field_path[path_index], parameters=parameters
70                )
71
72    def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]:
73        for body in self.decoder.decode(response):
74            if len(self._field_path) == 0:
75                extracted = body
76            else:
77                path = [path.eval(self.config) for path in self._field_path]
78                if "*" in path:
79                    extracted = dpath.values(body, path)
80                else:
81                    extracted = dpath.get(body, path, default=[])  # type: ignore # extracted will be a MutableMapping, given input data structure
82            if isinstance(extracted, list):
83                yield from extracted
84            elif extracted:
85                yield extracted
86            else:
87                yield from []

Record extractor that searches a decoded response over a path defined as an array of fields.

If the field path points to an array, that array is returned. If the field path points to an object, that object is returned wrapped as an array. If the field path points to an empty object, an empty array is returned. If the field path points to a non-existing path, an empty array is returned.

Examples of instantiating this transform:

  extractor:
    type: DpathExtractor
    field_path:
      - "root"
      - "data"

  extractor:
    type: DpathExtractor
    field_path:
      - "root"
      - "{{ parameters['field'] }}"

  extractor:
    type: DpathExtractor
    field_path: []
Attributes:
  • field_path (Union[InterpolatedString, str]): Path to the field that should be extracted
  • config (Config): The user-provided configuration as specified by the source's spec
  • decoder (Decoder): The decoder responsible for transforming the response into a Mapping
DpathExtractor( field_path: List[Union[InterpolatedString, str]], config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], decoder: Decoder = <factory>)
field_path: List[Union[InterpolatedString, str]]
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
decoder: Decoder
def extract_records( self, response: requests.models.Response) -> Iterable[MutableMapping[Any, Any]]:
72    def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]:
73        for body in self.decoder.decode(response):
74            if len(self._field_path) == 0:
75                extracted = body
76            else:
77                path = [path.eval(self.config) for path in self._field_path]
78                if "*" in path:
79                    extracted = dpath.values(body, path)
80                else:
81                    extracted = dpath.get(body, path, default=[])  # type: ignore # extracted will be a MutableMapping, given input data structure
82            if isinstance(extracted, list):
83                yield from extracted
84            elif extracted:
85                yield extracted
86            else:
87                yield from []

Selects records from the response

Parameters
  • response: The response to extract the records from
Returns

List of Records extracted from the response
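
To make the extraction behaviour concrete, here is a small, self-contained sketch (hypothetical payload, faking the response the way unit tests usually do) that pulls records out of a "data" field with the default JsonDecoder:

  import json

  import requests

  from airbyte_cdk import DpathExtractor

  # Fake a JSON response by setting the private _content attribute, as unit tests commonly do.
  response = requests.Response()
  response.status_code = 200
  response._content = json.dumps({"data": [{"id": 1}, {"id": 2}]}).encode("utf-8")

  extractor = DpathExtractor(field_path=["data"], config={}, parameters={})
  print(list(extractor.extract_records(response)))  # [{'id': 1}, {'id': 2}]
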

FieldPointer = typing.List[str]
class HttpMethod(enum.Enum):
19class HttpMethod(Enum):
20    """
21    Http Method to use when submitting an outgoing HTTP request
22    """
23
24    DELETE = "DELETE"
25    GET = "GET"
26    PATCH = "PATCH"
27    POST = "POST"

Http Method to use when submitting an outgoing HTTP request

DELETE = <HttpMethod.DELETE: 'DELETE'>
GET = <HttpMethod.GET: 'GET'>
PATCH = <HttpMethod.PATCH: 'PATCH'>
POST = <HttpMethod.POST: 'POST'>
@dataclass
class HttpRequester(airbyte_cdk.Requester):
 38@dataclass
 39class HttpRequester(Requester):
 40    """
 41    Default implementation of a Requester
 42
 43    Attributes:
 44        name (str): Name of the stream. Only used for request/response caching
 45        url_base (Union[InterpolatedString, str]): Base url to send requests to
 46        path (Union[InterpolatedString, str]): Path to send requests to
 47        http_method (Union[str, HttpMethod]): HTTP method to use when sending requests
 48        request_options_provider (Optional[InterpolatedRequestOptionsProvider]): request option provider defining the options to set on outgoing requests
 49        authenticator (DeclarativeAuthenticator): Authenticator defining how to authenticate to the source
 50        error_handler (Optional[ErrorHandler]): Error handler defining how to detect and handle errors
 51        backoff_strategies (Optional[List[BackoffStrategy]]): List of backoff strategies to use when retrying requests
 52        config (Config): The user-provided configuration as specified by the source's spec
 53        use_cache (bool): Indicates that data should be cached for this stream
 54    """
 55
 56    name: str
 57    url_base: Union[InterpolatedString, str]
 58    config: Config
 59    parameters: InitVar[Mapping[str, Any]]
 60
 61    path: Optional[Union[InterpolatedString, str]] = None
 62    authenticator: Optional[DeclarativeAuthenticator] = None
 63    http_method: Union[str, HttpMethod] = HttpMethod.GET
 64    request_options_provider: Optional[InterpolatedRequestOptionsProvider] = None
 65    error_handler: Optional[ErrorHandler] = None
 66    api_budget: Optional[APIBudget] = None
 67    disable_retries: bool = False
 68    message_repository: MessageRepository = NoopMessageRepository()
 69    use_cache: bool = False
 70    _exit_on_rate_limit: bool = False
 71    stream_response: bool = False
 72    decoder: Decoder = field(default_factory=lambda: JsonDecoder(parameters={}))
 73
 74    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 75        self._url_base = InterpolatedString.create(self.url_base, parameters=parameters)
 76        self._path = InterpolatedString.create(
 77            self.path if self.path else EmptyString, parameters=parameters
 78        )
 79        if self.request_options_provider is None:
 80            self._request_options_provider = InterpolatedRequestOptionsProvider(
 81                config=self.config, parameters=parameters
 82            )
 83        elif isinstance(self.request_options_provider, dict):
 84            self._request_options_provider = InterpolatedRequestOptionsProvider(
 85                config=self.config, **self.request_options_provider
 86            )
 87        else:
 88            self._request_options_provider = self.request_options_provider
 89        self._authenticator = self.authenticator or NoAuth(parameters=parameters)
 90        self._http_method = (
 91            HttpMethod[self.http_method] if isinstance(self.http_method, str) else self.http_method
 92        )
 93        self.error_handler = self.error_handler
 94        self._parameters = parameters
 95
 96        if self.error_handler is not None and hasattr(self.error_handler, "backoff_strategies"):
 97            backoff_strategies = self.error_handler.backoff_strategies  # type: ignore
 98        else:
 99            backoff_strategies = None
100
101        self._http_client = HttpClient(
102            name=self.name,
103            logger=self.logger,
104            error_handler=self.error_handler,
105            api_budget=self.api_budget,
106            authenticator=self._authenticator,
107            use_cache=self.use_cache,
108            backoff_strategy=backoff_strategies,
109            disable_retries=self.disable_retries,
110            message_repository=self.message_repository,
111        )
112
113    @property
114    def exit_on_rate_limit(self) -> bool:
115        return self._exit_on_rate_limit
116
117    @exit_on_rate_limit.setter
118    def exit_on_rate_limit(self, value: bool) -> None:
119        self._exit_on_rate_limit = value
120
121    def get_authenticator(self) -> DeclarativeAuthenticator:
122        return self._authenticator
123
124    def get_url_base(
125        self,
126        *,
127        stream_state: Optional[StreamState] = None,
128        stream_slice: Optional[StreamSlice] = None,
129        next_page_token: Optional[Mapping[str, Any]] = None,
130    ) -> str:
131        interpolation_context = get_interpolation_context(
132            stream_state=stream_state,
133            stream_slice=stream_slice,
134            next_page_token=next_page_token,
135        )
136        return str(self._url_base.eval(self.config, **interpolation_context))
137
138    def get_path(
139        self,
140        *,
141        stream_state: Optional[StreamState] = None,
142        stream_slice: Optional[StreamSlice] = None,
143        next_page_token: Optional[Mapping[str, Any]] = None,
144    ) -> str:
145        interpolation_context = get_interpolation_context(
146            stream_state=stream_state,
147            stream_slice=stream_slice,
148            next_page_token=next_page_token,
149        )
150        path = str(self._path.eval(self.config, **interpolation_context))
151        return path.lstrip("/")
152
153    def get_method(self) -> HttpMethod:
154        return self._http_method
155
156    def get_request_params(
157        self,
158        *,
159        stream_state: Optional[StreamState] = None,
160        stream_slice: Optional[StreamSlice] = None,
161        next_page_token: Optional[Mapping[str, Any]] = None,
162    ) -> MutableMapping[str, Any]:
163        return self._request_options_provider.get_request_params(
164            stream_state=stream_state,
165            stream_slice=stream_slice,
166            next_page_token=next_page_token,
167        )
168
169    def get_request_headers(
170        self,
171        *,
172        stream_state: Optional[StreamState] = None,
173        stream_slice: Optional[StreamSlice] = None,
174        next_page_token: Optional[Mapping[str, Any]] = None,
175    ) -> Mapping[str, Any]:
176        return self._request_options_provider.get_request_headers(
177            stream_state=stream_state,
178            stream_slice=stream_slice,
179            next_page_token=next_page_token,
180        )
181
182    # fixing request options provider types has a lot of dependencies
183    def get_request_body_data(  # type: ignore
184        self,
185        *,
186        stream_state: Optional[StreamState] = None,
187        stream_slice: Optional[StreamSlice] = None,
188        next_page_token: Optional[Mapping[str, Any]] = None,
189    ) -> Union[Mapping[str, Any], str]:
190        return (
191            self._request_options_provider.get_request_body_data(
192                stream_state=stream_state,
193                stream_slice=stream_slice,
194                next_page_token=next_page_token,
195            )
196            or {}
197        )
198
199    # fixing request options provider types has a lot of dependencies
200    def get_request_body_json(  # type: ignore
201        self,
202        *,
203        stream_state: Optional[StreamState] = None,
204        stream_slice: Optional[StreamSlice] = None,
205        next_page_token: Optional[Mapping[str, Any]] = None,
206    ) -> Optional[Mapping[str, Any]]:
207        return self._request_options_provider.get_request_body_json(
208            stream_state=stream_state,
209            stream_slice=stream_slice,
210            next_page_token=next_page_token,
211        )
212
213    @property
214    def logger(self) -> logging.Logger:
215        return logging.getLogger(f"airbyte.HttpRequester.{self.name}")
216
217    def _get_request_options(
218        self,
219        stream_state: Optional[StreamState],
220        stream_slice: Optional[StreamSlice],
221        next_page_token: Optional[Mapping[str, Any]],
222        requester_method: Callable[..., Optional[Union[Mapping[str, Any], str]]],
223        auth_options_method: Callable[..., Optional[Union[Mapping[str, Any], str]]],
224        extra_options: Optional[Union[Mapping[str, Any], str]] = None,
225    ) -> Union[Mapping[str, Any], str]:
226        """
227        Get the request_option from the requester, the authenticator and extra_options passed in.
228        Raise a ValueError if there's a key collision
229        Returned merged mapping otherwise
230        """
231
232        is_body_json = requester_method.__name__ == "get_request_body_json"
233
234        return combine_mappings(
235            [
236                requester_method(
237                    stream_state=stream_state,
238                    stream_slice=stream_slice,
239                    next_page_token=next_page_token,
240                ),
241                auth_options_method(),
242                extra_options,
243            ],
244            allow_same_value_merge=is_body_json,
245        )
246
247    def _request_headers(
248        self,
249        stream_state: Optional[StreamState] = None,
250        stream_slice: Optional[StreamSlice] = None,
251        next_page_token: Optional[Mapping[str, Any]] = None,
252        extra_headers: Optional[Mapping[str, Any]] = None,
253    ) -> Mapping[str, Any]:
254        """
255        Specifies request headers.
256        Authentication headers will overwrite any overlapping headers returned from this method.
257        """
258        headers = self._get_request_options(
259            stream_state,
260            stream_slice,
261            next_page_token,
262            self.get_request_headers,
263            self.get_authenticator().get_auth_header,
264            extra_headers,
265        )
266        if isinstance(headers, str):
267            raise ValueError("Request headers cannot be a string")
268        return {str(k): str(v) for k, v in headers.items()}
269
270    def _request_params(
271        self,
272        stream_state: Optional[StreamState],
273        stream_slice: Optional[StreamSlice],
274        next_page_token: Optional[Mapping[str, Any]],
275        extra_params: Optional[Mapping[str, Any]] = None,
276    ) -> Mapping[str, Any]:
277        """
278        Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
279
280        E.g: you might want to define query parameters for paging if next_page_token is not None.
281        """
282        options = self._get_request_options(
283            stream_state,
284            stream_slice,
285            next_page_token,
286            self.get_request_params,
287            self.get_authenticator().get_request_params,
288            extra_params,
289        )
290        if isinstance(options, str):
291            raise ValueError("Request params cannot be a string")
292
293        for k, v in options.items():
294            if isinstance(v, (dict,)):
295                raise ValueError(
296                    f"Invalid value for `{k}` parameter. The values of request params cannot be an object."
297                )
298
299        return options
300
301    def _request_body_data(
302        self,
303        stream_state: Optional[StreamState],
304        stream_slice: Optional[StreamSlice],
305        next_page_token: Optional[Mapping[str, Any]],
306        extra_body_data: Optional[Union[Mapping[str, Any], str]] = None,
307    ) -> Optional[Union[Mapping[str, Any], str]]:
308        """
309        Specifies how to populate the body of the request with a non-JSON payload.
310
311        If returns a ready text that it will be sent as is.
312        If returns a dict that it will be converted to a urlencoded form.
313        E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
314
315        At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
316        """
317        # Warning: use self.state instead of the stream_state passed as argument!
318        return self._get_request_options(
319            stream_state,
320            stream_slice,
321            next_page_token,
322            self.get_request_body_data,
323            self.get_authenticator().get_request_body_data,
324            extra_body_data,
325        )
326
327    def _request_body_json(
328        self,
329        stream_state: Optional[StreamState],
330        stream_slice: Optional[StreamSlice],
331        next_page_token: Optional[Mapping[str, Any]],
332        extra_body_json: Optional[Mapping[str, Any]] = None,
333    ) -> Optional[Mapping[str, Any]]:
334        """
335        Specifies how to populate the body of the request with a JSON payload.
336
337        At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
338        """
339        # Warning: use self.state instead of the stream_state passed as argument!
340        options = self._get_request_options(
341            stream_state,
342            stream_slice,
343            next_page_token,
344            self.get_request_body_json,
345            self.get_authenticator().get_request_body_json,
346            extra_body_json,
347        )
348        if isinstance(options, str):
349            raise ValueError("Request body json cannot be a string")
350        return options
351
352    @classmethod
353    def _join_url(cls, url_base: str, path: str) -> str:
354        """
355        Joins a base URL with a given path and returns the resulting URL with any trailing slash removed.
356
357        This method ensures that there are no duplicate slashes when concatenating the base URL and the path,
358        which is useful when the full URL is provided from an interpolation context.
359
360        Args:
361            url_base (str): The base URL to which the path will be appended.
362            path (str): The path to join with the base URL.
363
364        Returns:
365            str: The resulting joined URL.
366
367        Note:
368            Related issue: https://github.com/airbytehq/airbyte-internal-issues/issues/11869
369            - If the path is an empty string or None, the method returns the base URL with any trailing slash removed.
370
371        Example:
372            1) _join_url("https://example.com/api/", "endpoint") >> 'https://example.com/api/endpoint'
373            2) _join_url("https://example.com/api", "/endpoint") >> 'https://example.com/api/endpoint'
374            3) _join_url("https://example.com/api/", "") >> 'https://example.com/api/'
375            4) _join_url("https://example.com/api", None) >> 'https://example.com/api'
376        """
377
378        # return a full-url if provided directly from interpolation context
379        if path == EmptyString or path is None:
380            return url_base
381        else:
382            # since we didn't provide a full-url, the url_base might not have a trailing slash
383            # so we join the url_base and path correctly
384            if not url_base.endswith("/"):
385                url_base += "/"
386
387        return urljoin(url_base, path)
388
389    def send_request(
390        self,
391        stream_state: Optional[StreamState] = None,
392        stream_slice: Optional[StreamSlice] = None,
393        next_page_token: Optional[Mapping[str, Any]] = None,
394        path: Optional[str] = None,
395        request_headers: Optional[Mapping[str, Any]] = None,
396        request_params: Optional[Mapping[str, Any]] = None,
397        request_body_data: Optional[Union[Mapping[str, Any], str]] = None,
398        request_body_json: Optional[Mapping[str, Any]] = None,
399        log_formatter: Optional[Callable[[requests.Response], Any]] = None,
400    ) -> Optional[requests.Response]:
401        request, response = self._http_client.send_request(
402            http_method=self.get_method().value,
403            url=self._join_url(
404                self.get_url_base(
405                    stream_state=stream_state,
406                    stream_slice=stream_slice,
407                    next_page_token=next_page_token,
408                ),
409                path
410                or self.get_path(
411                    stream_state=stream_state,
412                    stream_slice=stream_slice,
413                    next_page_token=next_page_token,
414                ),
415            ),
416            request_kwargs={"stream": self.stream_response},
417            headers=self._request_headers(
418                stream_state, stream_slice, next_page_token, request_headers
419            ),
420            params=self._request_params(
421                stream_state, stream_slice, next_page_token, request_params
422            ),
423            json=self._request_body_json(
424                stream_state, stream_slice, next_page_token, request_body_json
425            ),
426            data=self._request_body_data(
427                stream_state, stream_slice, next_page_token, request_body_data
428            ),
429            dedupe_query_params=True,
430            log_formatter=log_formatter,
431            exit_on_rate_limit=self._exit_on_rate_limit,
432        )
433
434        return response

Default implementation of a Requester

Attributes:
  • name (str): Name of the stream. Only used for request/response caching
  • url_base (Union[InterpolatedString, str]): Base url to send requests to
  • path (Union[InterpolatedString, str]): Path to send requests to
  • http_method (Union[str, HttpMethod]): HTTP method to use when sending requests
  • request_options_provider (Optional[InterpolatedRequestOptionsProvider]): Request options provider defining the options to set on outgoing requests
  • authenticator (DeclarativeAuthenticator): Authenticator defining how to authenticate to the source
  • error_handler (Optional[ErrorHandler]): Error handler defining how to detect and handle errors
  • backoff_strategies (Optional[List[BackoffStrategy]]): List of backoff strategies to use when retrying requests
  • config (Config): The user-provided configuration as specified by the source's spec
  • use_cache (bool): Indicates that data should be cached for this stream
HttpRequester( name: str, url_base: Union[InterpolatedString, str], config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], path: Union[InterpolatedString, str, NoneType] = None, authenticator: Optional[DeclarativeAuthenticator] = None, http_method: Union[str, HttpMethod] = <HttpMethod.GET: 'GET'>, request_options_provider: Optional[airbyte_cdk.sources.declarative.requesters.request_options.InterpolatedRequestOptionsProvider] = None, error_handler: Optional[airbyte_cdk.sources.streams.http.error_handlers.ErrorHandler] = None, api_budget: Optional[airbyte_cdk.sources.streams.call_rate.APIBudget] = None, disable_retries: bool = False, message_repository: MessageRepository = <airbyte_cdk.sources.message.NoopMessageRepository object>, use_cache: bool = False, _exit_on_rate_limit: bool = False, stream_response: bool = False, decoder: Decoder = <factory>)
name: str
url_base: Union[InterpolatedString, str]
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
path: Union[InterpolatedString, str, NoneType] = None
authenticator: Optional[DeclarativeAuthenticator] = None
http_method: Union[str, HttpMethod] = <HttpMethod.GET: 'GET'>
disable_retries: bool = False
use_cache: bool = False
stream_response: bool = False
decoder: Decoder
exit_on_rate_limit: bool
113    @property
114    def exit_on_rate_limit(self) -> bool:
115        return self._exit_on_rate_limit
def get_authenticator( self) -> DeclarativeAuthenticator:
121    def get_authenticator(self) -> DeclarativeAuthenticator:
122        return self._authenticator

Specifies the authenticator to use when submitting requests

def get_url_base( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> str:
124    def get_url_base(
125        self,
126        *,
127        stream_state: Optional[StreamState] = None,
128        stream_slice: Optional[StreamSlice] = None,
129        next_page_token: Optional[Mapping[str, Any]] = None,
130    ) -> str:
131        interpolation_context = get_interpolation_context(
132            stream_state=stream_state,
133            stream_slice=stream_slice,
134            next_page_token=next_page_token,
135        )
136        return str(self._url_base.eval(self.config, **interpolation_context))
Returns

URL base for the API endpoint, e.g. if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/"

def get_path( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> str:
138    def get_path(
139        self,
140        *,
141        stream_state: Optional[StreamState] = None,
142        stream_slice: Optional[StreamSlice] = None,
143        next_page_token: Optional[Mapping[str, Any]] = None,
144    ) -> str:
145        interpolation_context = get_interpolation_context(
146            stream_state=stream_state,
147            stream_slice=stream_slice,
148            next_page_token=next_page_token,
149        )
150        path = str(self._path.eval(self.config, **interpolation_context))
151        return path.lstrip("/")

Returns the URL path for the API endpoint, e.g. if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity"

def get_method(self) -> HttpMethod:
153    def get_method(self) -> HttpMethod:
154        return self._http_method

Specifies the HTTP method to use

def get_request_params( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> MutableMapping[str, Any]:
156    def get_request_params(
157        self,
158        *,
159        stream_state: Optional[StreamState] = None,
160        stream_slice: Optional[StreamSlice] = None,
161        next_page_token: Optional[Mapping[str, Any]] = None,
162    ) -> MutableMapping[str, Any]:
163        return self._request_options_provider.get_request_params(
164            stream_state=stream_state,
165            stream_slice=stream_slice,
166            next_page_token=next_page_token,
167        )

Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.

For example, you might want to define query parameters for paging if next_page_token is not None.
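
As a sketch, a subclass could merge paging information into the query parameters. The snippet below is illustrative only: PagedRequester, the "page" parameter, and the "next_page" token key are assumptions, not part of the CDK.

    from typing import Any, Mapping, MutableMapping, Optional

    from airbyte_cdk.sources.declarative.requesters import HttpRequester


    class PagedRequester(HttpRequester):
        """Hypothetical subclass that forwards the pagination token as a query parameter."""

        def get_request_params(
            self,
            *,
            stream_state: Optional[Mapping[str, Any]] = None,
            stream_slice: Optional[Mapping[str, Any]] = None,
            next_page_token: Optional[Mapping[str, Any]] = None,
        ) -> MutableMapping[str, Any]:
            # Start from whatever the configured request options provider returns.
            params = dict(
                super().get_request_params(
                    stream_state=stream_state,
                    stream_slice=stream_slice,
                    next_page_token=next_page_token,
                )
            )
            if next_page_token:
                # "next_page" is an assumed key produced by the paginator.
                params["page"] = next_page_token["next_page"]
            return params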

def get_request_headers( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
169    def get_request_headers(
170        self,
171        *,
172        stream_state: Optional[StreamState] = None,
173        stream_slice: Optional[StreamSlice] = None,
174        next_page_token: Optional[Mapping[str, Any]] = None,
175    ) -> Mapping[str, Any]:
176        return self._request_options_provider.get_request_headers(
177            stream_state=stream_state,
178            stream_slice=stream_slice,
179            next_page_token=next_page_token,
180        )

Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.

def get_request_body_data( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Union[Mapping[str, Any], str]:
183    def get_request_body_data(  # type: ignore
184        self,
185        *,
186        stream_state: Optional[StreamState] = None,
187        stream_slice: Optional[StreamSlice] = None,
188        next_page_token: Optional[Mapping[str, Any]] = None,
189    ) -> Union[Mapping[str, Any], str]:
190        return (
191            self._request_options_provider.get_request_body_data(
192                stream_state=stream_state,
193                stream_slice=stream_slice,
194                next_page_token=next_page_token,
195            )
196            or {}
197        )

Specifies how to populate the body of the request with a non-JSON payload.

If this returns a string, it is sent as-is. If it returns a dict, it is converted to a URL-encoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"

Only one of the 'request_body_data' and 'request_body_json' functions may be overridden at a time.
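
For illustration, the dict-to-form conversion described above matches what urllib.parse.urlencode produces; the mapping below is simply the example from the docstring.

    from urllib.parse import urlencode

    body = {"key1": "value1", "key2": "value2"}
    print(urlencode(body))  # key1=value1&key2=value2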

def get_request_body_json( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Optional[Mapping[str, Any]]:
200    def get_request_body_json(  # type: ignore
201        self,
202        *,
203        stream_state: Optional[StreamState] = None,
204        stream_slice: Optional[StreamSlice] = None,
205        next_page_token: Optional[Mapping[str, Any]] = None,
206    ) -> Optional[Mapping[str, Any]]:
207        return self._request_options_provider.get_request_body_json(
208            stream_state=stream_state,
209            stream_slice=stream_slice,
210            next_page_token=next_page_token,
211        )

Specifies how to populate the body of the request with a JSON payload.

Only one of the 'request_body_data' and 'request_body_json' functions may be overridden at a time.

logger: logging.Logger
213    @property
214    def logger(self) -> logging.Logger:
215        return logging.getLogger(f"airbyte.HttpRequester.{self.name}")
def send_request( self, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None, path: Optional[str] = None, request_headers: Optional[Mapping[str, Any]] = None, request_params: Optional[Mapping[str, Any]] = None, request_body_data: Union[str, Mapping[str, Any], NoneType] = None, request_body_json: Optional[Mapping[str, Any]] = None, log_formatter: Optional[Callable[[requests.models.Response], Any]] = None) -> Optional[requests.models.Response]:
389    def send_request(
390        self,
391        stream_state: Optional[StreamState] = None,
392        stream_slice: Optional[StreamSlice] = None,
393        next_page_token: Optional[Mapping[str, Any]] = None,
394        path: Optional[str] = None,
395        request_headers: Optional[Mapping[str, Any]] = None,
396        request_params: Optional[Mapping[str, Any]] = None,
397        request_body_data: Optional[Union[Mapping[str, Any], str]] = None,
398        request_body_json: Optional[Mapping[str, Any]] = None,
399        log_formatter: Optional[Callable[[requests.Response], Any]] = None,
400    ) -> Optional[requests.Response]:
401        request, response = self._http_client.send_request(
402            http_method=self.get_method().value,
403            url=self._join_url(
404                self.get_url_base(
405                    stream_state=stream_state,
406                    stream_slice=stream_slice,
407                    next_page_token=next_page_token,
408                ),
409                path
410                or self.get_path(
411                    stream_state=stream_state,
412                    stream_slice=stream_slice,
413                    next_page_token=next_page_token,
414                ),
415            ),
416            request_kwargs={"stream": self.stream_response},
417            headers=self._request_headers(
418                stream_state, stream_slice, next_page_token, request_headers
419            ),
420            params=self._request_params(
421                stream_state, stream_slice, next_page_token, request_params
422            ),
423            json=self._request_body_json(
424                stream_state, stream_slice, next_page_token, request_body_json
425            ),
426            data=self._request_body_data(
427                stream_state, stream_slice, next_page_token, request_body_data
428            ),
429            dedupe_query_params=True,
430            log_formatter=log_formatter,
431            exit_on_rate_limit=self._exit_on_rate_limit,
432        )
433
434        return response

Sends a request and returns the response. It may return no response if the error handler chooses to ignore the response, or it may raise an exception in case of an error. If path is set, the path configured on the requester itself is ignored. If headers, params, or a body are provided, they are merged with the ones configured on the requester itself.

If a log formatter is provided, it's used to log the performed request and response. If it's not provided, no logging is performed.
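
A minimal usage sketch under assumptions: the requester construction below is hypothetical (in a real connector it is built from the manifest), and the path, parameter, and header values only illustrate the override/merge behavior described above.

    from airbyte_cdk.sources.declarative.requesters import HttpRequester

    # Hypothetical requester for an imaginary https://example.com/api endpoint.
    requester = HttpRequester(
        name="users",
        url_base="https://example.com/api",
        path="users",
        config={},
        parameters={},
    )

    response = requester.send_request(
        path="users/active",                  # overrides the path configured on the requester
        request_params={"per_page": 50},      # merged with the requester's own params
        request_headers={"X-Debug": "true"},  # merged with the requester's own headers
    )
    if response is not None:
        records = response.json()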

@dataclass
class InterpolatedBoolean:
29@dataclass
30class InterpolatedBoolean:
31    f"""
32    Wrapper around a string to be evaluated to a boolean value.
33    The string will be evaluated as False if it interpolates to a value in {FALSE_VALUES}
34
35    Attributes:
36        condition (str): The string representing the condition to evaluate to a boolean
37    """
38    condition: str
39    parameters: InitVar[Mapping[str, Any]]
40
41    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
42        self._default = "False"
43        self._interpolation = JinjaInterpolation()
44        self._parameters = parameters
45
46    def eval(self, config: Config, **additional_parameters: Any) -> bool:
47        """
48        Interpolates the predicate condition string using the config and other optional arguments passed as parameter.
49
50        :param config: The user-provided configuration as specified by the source's spec
51        :param additional_parameters: Optional parameters used for interpolation
52        :return: The interpolated string
53        """
54        if isinstance(self.condition, bool):
55            return self.condition
56        else:
57            evaluated = self._interpolation.eval(
58                self.condition,
59                config,
60                self._default,
61                parameters=self._parameters,
62                **additional_parameters,
63            )
64            if evaluated in FALSE_VALUES:
65                return False
66            # The presence of a value is generally regarded as truthy, so we treat it as such
67            return True
InterpolatedBoolean( condition: str, parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]])
condition: str
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
def eval(self, config: Mapping[str, Any], **additional_parameters: Any) -> bool:
46    def eval(self, config: Config, **additional_parameters: Any) -> bool:
47        """
48        Interpolates the predicate condition string using the config and other optional arguments passed as parameter.
49
50        :param config: The user-provided configuration as specified by the source's spec
51        :param additional_parameters: Optional parameters used for interpolation
52        :return: The interpolated string
53        """
54        if isinstance(self.condition, bool):
55            return self.condition
56        else:
57            evaluated = self._interpolation.eval(
58                self.condition,
59                config,
60                self._default,
61                parameters=self._parameters,
62                **additional_parameters,
63            )
64            if evaluated in FALSE_VALUES:
65                return False
66            # The presence of a value is generally regarded as truthy, so we treat it as such
67            return True

Interpolates the predicate condition string using the config and other optional arguments passed as parameter.

Parameters
  • config: The user-provided configuration as specified by the source's spec
  • additional_parameters: Optional parameters used for interpolation
Returns

The evaluated boolean value
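
A short sketch of evaluating a condition against a config; the import path and the use_sandbox config key are assumptions for illustration.

    from airbyte_cdk.sources.declarative.interpolation import InterpolatedBoolean

    flag = InterpolatedBoolean(condition="{{ config['use_sandbox'] }}", parameters={})
    print(flag.eval(config={"use_sandbox": True}))   # True
    print(flag.eval(config={"use_sandbox": False}))  # False, since it interpolates to a value in FALSE_VALUES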

@dataclass
class InterpolatedRequestInputProvider:
14@dataclass
15class InterpolatedRequestInputProvider:
16    """
17    Helper class that generically performs string interpolation on the provided dictionary or string input
18    """
19
20    parameters: InitVar[Mapping[str, Any]]
21    request_inputs: Optional[Union[str, Mapping[str, str]]] = field(default=None)
22    config: Config = field(default_factory=dict)
23    _interpolator: Optional[Union[InterpolatedString, InterpolatedMapping]] = field(
24        init=False, repr=False, default=None
25    )
26    _request_inputs: Optional[Union[str, Mapping[str, str]]] = field(
27        init=False, repr=False, default=None
28    )
29
30    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
31        self._request_inputs = self.request_inputs or {}
32        if isinstance(self._request_inputs, str):
33            self._interpolator = InterpolatedString(
34                self._request_inputs, default="", parameters=parameters
35            )
36        else:
37            self._interpolator = InterpolatedMapping(self._request_inputs, parameters=parameters)
38
39    def eval_request_inputs(
40        self,
41        stream_slice: Optional[StreamSlice] = None,
42        next_page_token: Optional[Mapping[str, Any]] = None,
43        valid_key_types: Optional[Tuple[Type[Any]]] = None,
44        valid_value_types: Optional[Tuple[Type[Any], ...]] = None,
45    ) -> Mapping[str, Any]:
46        """
47        Returns the request inputs to set on an outgoing HTTP request
48
49        :param stream_slice: The stream slice
50        :param next_page_token: The pagination token
51        :param valid_key_types: A tuple of types that the interpolator should allow
52        :param valid_value_types: A tuple of types that the interpolator should allow
53        :return: The request inputs to set on an outgoing HTTP request
54        """
55        kwargs = {
56            "stream_slice": stream_slice,
57            "next_page_token": next_page_token,
58        }
59        interpolated_value = self._interpolator.eval(  # type: ignore # self._interpolator is always initialized with a value and will not be None
60            self.config,
61            valid_key_types=valid_key_types,
62            valid_value_types=valid_value_types,
63            **kwargs,
64        )
65
66        if isinstance(interpolated_value, dict):
67            non_null_tokens = {k: v for k, v in interpolated_value.items() if v is not None}
68            return non_null_tokens
69        return interpolated_value  # type: ignore[no-any-return]

Helper class that generically performs string interpolation on the provided dictionary or string input

InterpolatedRequestInputProvider( parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], request_inputs: Union[str, Mapping[str, str], NoneType] = None, config: Mapping[str, Any] = <factory>)
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
request_inputs: Union[str, Mapping[str, str], NoneType] = None
config: Mapping[str, Any]
def eval_request_inputs( self, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None, valid_key_types: Optional[Tuple[Type[Any]]] = None, valid_value_types: Optional[Tuple[Type[Any], ...]] = None) -> Mapping[str, Any]:
39    def eval_request_inputs(
40        self,
41        stream_slice: Optional[StreamSlice] = None,
42        next_page_token: Optional[Mapping[str, Any]] = None,
43        valid_key_types: Optional[Tuple[Type[Any]]] = None,
44        valid_value_types: Optional[Tuple[Type[Any], ...]] = None,
45    ) -> Mapping[str, Any]:
46        """
47        Returns the request inputs to set on an outgoing HTTP request
48
49        :param stream_slice: The stream slice
50        :param next_page_token: The pagination token
51        :param valid_key_types: A tuple of types that the interpolator should allow
52        :param valid_value_types: A tuple of types that the interpolator should allow
53        :return: The request inputs to set on an outgoing HTTP request
54        """
55        kwargs = {
56            "stream_slice": stream_slice,
57            "next_page_token": next_page_token,
58        }
59        interpolated_value = self._interpolator.eval(  # type: ignore # self._interpolator is always initialized with a value and will not be None
60            self.config,
61            valid_key_types=valid_key_types,
62            valid_value_types=valid_value_types,
63            **kwargs,
64        )
65
66        if isinstance(interpolated_value, dict):
67            non_null_tokens = {k: v for k, v in interpolated_value.items() if v is not None}
68            return non_null_tokens
69        return interpolated_value  # type: ignore[no-any-return]

Returns the request inputs to set on an outgoing HTTP request

Parameters
  • stream_slice: The stream slice
  • next_page_token: The pagination token
  • valid_key_types: A tuple of types that the interpolator should allow
  • valid_value_types: A tuple of types that the interpolator should allow
Returns

The request inputs to set on an outgoing HTTP request
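
A brief sketch, assuming the module path below and hypothetical page_size / cursor inputs; keys whose interpolated value is None are dropped, as shown in the source above.

    from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_input_provider import (
        InterpolatedRequestInputProvider,
    )

    provider = InterpolatedRequestInputProvider(
        parameters={},
        request_inputs={
            "page_size": "{{ config['page_size'] }}",
            "cursor": "{{ next_page_token['cursor'] }}",
        },
        config={"page_size": 100},
    )
    # Interpolates each value against the config and the pagination token.
    print(provider.eval_request_inputs(next_page_token={"cursor": "abc"}))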

@dataclass
class InterpolatedString:
13@dataclass
14class InterpolatedString:
15    """
16    Wrapper around a raw string to be interpolated with the Jinja2 templating engine
17
18    Attributes:
 19        string (str): The string to evaluate
20        default (Optional[str]): The default value to return if the evaluation returns an empty string
21        parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
22    """
23
24    string: str
25    parameters: InitVar[Mapping[str, Any]]
26    default: Optional[str] = None
27
28    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
29        self.default = self.default or self.string
30        self._interpolation = JinjaInterpolation()
31        self._parameters = parameters
32        # indicates whether passed string is just a plain string, not Jinja template
33        # This allows for optimization, but we do not know it yet at this stage
34        self._is_plain_string = None
35
36    def eval(self, config: Config, **kwargs: Any) -> Any:
37        """
38        Interpolates the input string using the config and other optional arguments passed as parameter.
39
40        :param config: The user-provided configuration as specified by the source's spec
41        :param kwargs: Optional parameters used for interpolation
42        :return: The interpolated string
43        """
44        if self._is_plain_string:
45            return self.string
46        if self._is_plain_string is None:
47            # Let's check whether output from evaluation is the same as input.
48            # This indicates occurrence of a plain string, not a template and we can skip Jinja in subsequent runs.
49            evaluated = self._interpolation.eval(
50                self.string, config, self.default, parameters=self._parameters, **kwargs
51            )
52            self._is_plain_string = self.string == evaluated
53            return evaluated
54        return self._interpolation.eval(
55            self.string, config, self.default, parameters=self._parameters, **kwargs
56        )
57
58    def __eq__(self, other: Any) -> bool:
59        if not isinstance(other, InterpolatedString):
60            return False
61        return self.string == other.string and self.default == other.default
62
63    @classmethod
64    def create(
65        cls,
66        string_or_interpolated: Union["InterpolatedString", str],
67        *,
68        parameters: Mapping[str, Any],
69    ) -> "InterpolatedString":
70        """
71        Helper function to obtain an InterpolatedString from either a raw string or an InterpolatedString.
72
73        :param string_or_interpolated: either a raw string or an InterpolatedString.
74        :param parameters: parameters propagated from parent component
75        :return: InterpolatedString representing the input string.
76        """
77        if isinstance(string_or_interpolated, str):
78            return InterpolatedString(string=string_or_interpolated, parameters=parameters)
79        else:
80            return string_or_interpolated

Wrapper around a raw string to be interpolated with the Jinja2 templating engine

Attributes:
  • string (str): The string to evaluate
  • default (Optional[str]): The default value to return if the evaluation returns an empty string
  • parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
InterpolatedString( string: str, parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], default: Optional[str] = None)
string: str
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
default: Optional[str] = None
def eval(self, config: Mapping[str, Any], **kwargs: Any) -> Any:
36    def eval(self, config: Config, **kwargs: Any) -> Any:
37        """
38        Interpolates the input string using the config and other optional arguments passed as parameter.
39
40        :param config: The user-provided configuration as specified by the source's spec
41        :param kwargs: Optional parameters used for interpolation
42        :return: The interpolated string
43        """
44        if self._is_plain_string:
45            return self.string
46        if self._is_plain_string is None:
47            # Let's check whether output from evaluation is the same as input.
48            # This indicates occurrence of a plain string, not a template and we can skip Jinja in subsequent runs.
49            evaluated = self._interpolation.eval(
50                self.string, config, self.default, parameters=self._parameters, **kwargs
51            )
52            self._is_plain_string = self.string == evaluated
53            return evaluated
54        return self._interpolation.eval(
55            self.string, config, self.default, parameters=self._parameters, **kwargs
56        )

Interpolates the input string using the config and other optional arguments passed as parameter.

Parameters
  • config: The user-provided configuration as specified by the source's spec
  • kwargs: Optional parameters used for interpolation
Returns

The interpolated string

@classmethod
def create( cls, string_or_interpolated: Union[InterpolatedString, str], *, parameters: Mapping[str, Any]) -> InterpolatedString:
63    @classmethod
64    def create(
65        cls,
66        string_or_interpolated: Union["InterpolatedString", str],
67        *,
68        parameters: Mapping[str, Any],
69    ) -> "InterpolatedString":
70        """
71        Helper function to obtain an InterpolatedString from either a raw string or an InterpolatedString.
72
73        :param string_or_interpolated: either a raw string or an InterpolatedString.
74        :param parameters: parameters propagated from parent component
75        :return: InterpolatedString representing the input string.
76        """
77        if isinstance(string_or_interpolated, str):
78            return InterpolatedString(string=string_or_interpolated, parameters=parameters)
79        else:
80            return string_or_interpolated

Helper function to obtain an InterpolatedString from either a raw string or an InterpolatedString.

Parameters
  • string_or_interpolated: either a raw string or an InterpolatedString.
  • parameters: parameters propagated from parent component
Returns

InterpolatedString representing the input string.
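
A minimal sketch of creating and evaluating an InterpolatedString; the import path and the api_root config key are assumptions for illustration.

    from airbyte_cdk.sources.declarative.interpolation import InterpolatedString

    url_base = InterpolatedString.create("{{ config['api_root'] }}/v1/", parameters={})
    print(url_base.eval(config={"api_root": "https://example.com"}))  # https://example.com/v1/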

class JsonDecoder(airbyte_cdk.Decoder):
20class JsonDecoder(Decoder):
21    """
22    Decoder strategy that returns the json-encoded content of a response, if any.
23
 24    Usually, we would instantiate the equivalent `CompositeRawDecoder(parser=JsonParser(), stream_response=False)`, but the JsonDecoder has specific historical behaviors, such as the fallback to {} in case of errors, that we are not sure can be removed.
25    """
26
27    def __init__(self, parameters: Mapping[str, Any]):
28        self._decoder = CompositeRawDecoder(parser=JsonParser(), stream_response=False)
29
30    def is_stream_response(self) -> bool:
31        return self._decoder.is_stream_response()
32
33    def decode(
34        self, response: requests.Response
35    ) -> Generator[MutableMapping[str, Any], None, None]:
36        """
 37        Given the response is an empty string or an empty list, the function will return a generator with an empty mapping.
38        """
39        has_yielded = False
40        try:
41            for element in self._decoder.decode(response):
42                yield element
43                has_yielded = True
44        except Exception:
45            yield {}
46
47        if not has_yielded:
48            yield {}

Decoder strategy that returns the json-encoded content of a response, if any.

Usually, we would instantiate the equivalent CompositeRawDecoder(parser=JsonParser(), stream_response=False), but the JsonDecoder has specific historical behaviors, such as the fallback to {} in case of errors, that we are not sure can be removed.

JsonDecoder(parameters: Mapping[str, Any])
27    def __init__(self, parameters: Mapping[str, Any]):
28        self._decoder = CompositeRawDecoder(parser=JsonParser(), stream_response=False)
def is_stream_response(self) -> bool:
30    def is_stream_response(self) -> bool:
31        return self._decoder.is_stream_response()

Set to True if you'd like to use the stream=True option in the HTTP requester

def decode( self, response: requests.models.Response) -> Generator[MutableMapping[str, Any], NoneType, NoneType]:
33    def decode(
34        self, response: requests.Response
35    ) -> Generator[MutableMapping[str, Any], None, None]:
36        """
 37        Given the response is an empty string or an empty list, the function will return a generator with an empty mapping.
38        """
39        has_yielded = False
40        try:
41            for element in self._decoder.decode(response):
42                yield element
43                has_yielded = True
44        except Exception:
45            yield {}
46
47        if not has_yielded:
48            yield {}

If the response is an empty string or an empty list, the function returns a generator that yields an empty mapping.
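
A small sketch of decoding a response; the Response object is built by hand purely for illustration, and the import path is assumed to be airbyte_cdk.sources.declarative.decoders.

    import requests

    from airbyte_cdk.sources.declarative.decoders import JsonDecoder

    response = requests.Response()
    response.status_code = 200
    response._content = b'{"id": 1, "name": "example"}'  # normally populated by the HTTP client

    decoder = JsonDecoder(parameters={})
    for record in decoder.decode(response):
        print(record)  # typically yields {'id': 1, 'name': 'example'}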

33@dataclass
34class JsonFileSchemaLoader(ResourceSchemaLoader, SchemaLoader):
35    """
36    Loads the schema from a json file
37
38    Attributes:
39        file_path (Union[InterpolatedString, str]): The path to the json file describing the schema
40        name (str): The stream's name
41        config (Config): The user-provided configuration as specified by the source's spec
42        parameters (Mapping[str, Any]): Additional arguments to pass to the string interpolation if needed
43    """
44
45    config: Config
46    parameters: InitVar[Mapping[str, Any]]
47    file_path: Union[InterpolatedString, str] = field(default="")
48
49    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
50        if not self.file_path:
51            self.file_path = _default_file_path()
52        self.file_path = InterpolatedString.create(self.file_path, parameters=parameters)
53
54    def get_json_schema(self) -> Mapping[str, Any]:
55        # todo: It is worth revisiting if we can replace file_path with just file_name if every schema is in the /schemas directory
56        # this would require that we find a creative solution to store or retrieve source_name in here since the files are mounted there
57        json_schema_path = self._get_json_filepath()
58        resource, schema_path = self.extract_resource_and_schema_path(json_schema_path)
59        raw_json_file = pkgutil.get_data(resource, schema_path)
60
61        if not raw_json_file:
62            raise IOError(f"Cannot find file {json_schema_path}")
63        try:
64            raw_schema = json.loads(raw_json_file)
65        except ValueError as err:
66            raise RuntimeError(f"Invalid JSON file format for file {json_schema_path}") from err
67        self.package_name = resource
68        return self._resolve_schema_references(raw_schema)
69
70    def _get_json_filepath(self) -> Any:
71        return self.file_path.eval(self.config)  # type: ignore # file_path is always cast to an interpolated string
72
73    @staticmethod
74    def extract_resource_and_schema_path(json_schema_path: str) -> Tuple[str, str]:
75        """
76        When the connector is running on a docker container, package_data is accessible from the resource (source_<name>), so we extract
77        the resource from the first part of the schema path and the remaining path is used to find the schema file. This is a slight
78        hack to identify the source name while we are in the airbyte_cdk module.
79        :param json_schema_path: The path to the schema JSON file
80        :return: Tuple of the resource name and the path to the schema file
81        """
82        split_path = json_schema_path.split("/")
83
84        if split_path[0] == "" or split_path[0] == ".":
85            split_path = split_path[1:]
86
87        if len(split_path) == 0:
88            return "", ""
89
90        if len(split_path) == 1:
91            return "", split_path[0]
92
93        return split_path[0], "/".join(split_path[1:])

Loads the schema from a json file

Attributes:
  • file_path (Union[InterpolatedString, str]): The path to the json file describing the schema
  • name (str): The stream's name
  • config (Config): The user-provided configuration as specified by the source's spec
  • parameters (Mapping[str, Any]): Additional arguments to pass to the string interpolation if needed
JsonFileSchemaLoader( config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], file_path: Union[InterpolatedString, str] = '')
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
file_path: Union[InterpolatedString, str] = ''
def get_json_schema(self) -> Mapping[str, Any]:
54    def get_json_schema(self) -> Mapping[str, Any]:
55        # todo: It is worth revisiting if we can replace file_path with just file_name if every schema is in the /schemas directory
56        # this would require that we find a creative solution to store or retrieve source_name in here since the files are mounted there
57        json_schema_path = self._get_json_filepath()
58        resource, schema_path = self.extract_resource_and_schema_path(json_schema_path)
59        raw_json_file = pkgutil.get_data(resource, schema_path)
60
61        if not raw_json_file:
62            raise IOError(f"Cannot find file {json_schema_path}")
63        try:
64            raw_schema = json.loads(raw_json_file)
65        except ValueError as err:
66            raise RuntimeError(f"Invalid JSON file format for file {json_schema_path}") from err
67        self.package_name = resource
68        return self._resolve_schema_references(raw_schema)

Returns a mapping describing the stream's schema

@staticmethod
def extract_resource_and_schema_path(json_schema_path: str) -> Tuple[str, str]:
73    @staticmethod
74    def extract_resource_and_schema_path(json_schema_path: str) -> Tuple[str, str]:
75        """
76        When the connector is running on a docker container, package_data is accessible from the resource (source_<name>), so we extract
77        the resource from the first part of the schema path and the remaining path is used to find the schema file. This is a slight
78        hack to identify the source name while we are in the airbyte_cdk module.
79        :param json_schema_path: The path to the schema JSON file
80        :return: Tuple of the resource name and the path to the schema file
81        """
82        split_path = json_schema_path.split("/")
83
84        if split_path[0] == "" or split_path[0] == ".":
85            split_path = split_path[1:]
86
87        if len(split_path) == 0:
88            return "", ""
89
90        if len(split_path) == 1:
91            return "", split_path[0]
92
93        return split_path[0], "/".join(split_path[1:])

When the connector is running in a Docker container, package_data is accessible from the resource (source_<name>), so we extract the resource from the first part of the schema path and the remaining path is used to find the schema file. This is a slight hack to identify the source name while we are in the airbyte_cdk module.

Parameters
  • json_schema_path: The path to the schema JSON file
Returns

Tuple of the resource name and the path to the schema file
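
A quick sketch of the path splitting performed by this static method; source_example is a hypothetical connector package name.

    from airbyte_cdk.sources.declarative.schema import JsonFileSchemaLoader

    resource, schema_path = JsonFileSchemaLoader.extract_resource_and_schema_path(
        "./source_example/schemas/users.json"
    )
    print(resource, schema_path)  # source_example schemas/users.json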

class LegacyToPerPartitionStateMigration(airbyte_cdk.sources.declarative.migrations.state_migration.StateMigration):
20class LegacyToPerPartitionStateMigration(StateMigration):
21    """
22    Transforms the input state for per-partitioned streams from the legacy format to the low-code format.
23    The cursor field and partition ID fields are automatically extracted from the stream's DatetimebasedCursor and SubstreamPartitionRouter.
24
25    Example input state:
26    {
27    "13506132": {
28      "last_changed": "2022-12-27T08:34:39+00:00"
29    }
30    Example output state:
31    {
32      "partition": {"id": "13506132"},
33      "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}
34    }
35    """
36
37    def __init__(
38        self,
39        partition_router: SubstreamPartitionRouter,
40        cursor: CustomIncrementalSync | DatetimeBasedCursor,
41        config: Mapping[str, Any],
42        parameters: Mapping[str, Any],
43    ):
44        self._partition_router = partition_router
45        self._cursor = cursor
46        self._config = config
47        self._parameters = parameters
48        self._partition_key_field = InterpolatedString.create(
49            self._get_partition_field(self._partition_router), parameters=self._parameters
50        ).eval(self._config)
51        self._cursor_field = InterpolatedString.create(
52            self._cursor.cursor_field, parameters=self._parameters
53        ).eval(self._config)
54
55    def _get_partition_field(self, partition_router: SubstreamPartitionRouter) -> str:
56        parent_stream_config = partition_router.parent_stream_configs[0]
57
58        # Retrieve the partition field with a condition, as properties are returned as a dictionary for custom components.
59        partition_field = (
60            parent_stream_config.partition_field
61            if isinstance(parent_stream_config, ParentStreamConfig)
62            else parent_stream_config.get("partition_field")  # type: ignore # See above comment on why parent_stream_config might be a dict
63        )
64
65        return partition_field
66
67    def should_migrate(self, stream_state: Mapping[str, Any]) -> bool:
68        if _is_already_migrated(stream_state):
69            return False
70
71        # There is exactly one parent stream
72        number_of_parent_streams = len(self._partition_router.parent_stream_configs)  # type: ignore # custom partition will introduce this attribute if needed
73        if number_of_parent_streams != 1:
74            # There should be exactly one parent stream
75            return False
76        """
77        The expected state format is
78        "<parent_key_id>" : {
79          "<cursor_field>" : "<cursor_value>"
80        }
81        """
82        if stream_state:
83            for key, value in stream_state.items():
84                if isinstance(value, dict):
85                    keys = list(value.keys())
86                    if len(keys) != 1:
87                        # The input partitioned state should only have one key
88                        return False
89                    if keys[0] != self._cursor_field:
90                        # Unexpected key. Found {keys[0]}. Expected {self._cursor.cursor_field}
91                        return False
92        return True
93
94    def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
95        states = [
96            {"partition": {self._partition_key_field: key}, "cursor": value}
97            for key, value in stream_state.items()
98        ]
99        return {"states": states}

Transforms the input state for per-partitioned streams from the legacy format to the low-code format. The cursor field and partition ID fields are automatically extracted from the stream's DatetimeBasedCursor and SubstreamPartitionRouter.

Example input state:

{
  "13506132": {
    "last_changed": "2022-12-27T08:34:39+00:00"
  }
}

Example output state:

{
  "partition": {"id": "13506132"},
  "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}
}

37    def __init__(
38        self,
39        partition_router: SubstreamPartitionRouter,
40        cursor: CustomIncrementalSync | DatetimeBasedCursor,
41        config: Mapping[str, Any],
42        parameters: Mapping[str, Any],
43    ):
44        self._partition_router = partition_router
45        self._cursor = cursor
46        self._config = config
47        self._parameters = parameters
48        self._partition_key_field = InterpolatedString.create(
49            self._get_partition_field(self._partition_router), parameters=self._parameters
50        ).eval(self._config)
51        self._cursor_field = InterpolatedString.create(
52            self._cursor.cursor_field, parameters=self._parameters
53        ).eval(self._config)
def should_migrate(self, stream_state: Mapping[str, Any]) -> bool:
67    def should_migrate(self, stream_state: Mapping[str, Any]) -> bool:
68        if _is_already_migrated(stream_state):
69            return False
70
71        # There is exactly one parent stream
72        number_of_parent_streams = len(self._partition_router.parent_stream_configs)  # type: ignore # custom partition will introduce this attribute if needed
73        if number_of_parent_streams != 1:
74            # There should be exactly one parent stream
75            return False
76        """
77        The expected state format is
78        "<parent_key_id>" : {
79          "<cursor_field>" : "<cursor_value>"
80        }
81        """
82        if stream_state:
83            for key, value in stream_state.items():
84                if isinstance(value, dict):
85                    keys = list(value.keys())
86                    if len(keys) != 1:
87                        # The input partitioned state should only have one key
88                        return False
89                    if keys[0] != self._cursor_field:
90                        # Unexpected key. Found {keys[0]}. Expected {self._cursor.cursor_field}
91                        return False
92        return True

Check if the stream_state should be migrated

Parameters
  • stream_state: The stream_state to potentially migrate
Returns

True if the state is of the expected format and should be migrated, False otherwise.

def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
94    def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
95        states = [
96            {"partition": {self._partition_key_field: key}, "cursor": value}
97            for key, value in stream_state.items()
98        ]
99        return {"states": states}

Migrate the stream_state. Assumes should_migrate(stream_state) returned True.

Parameters
  • stream_state: The stream_state to migrate
Returns

The migrated stream_state

 61class ManifestDeclarativeSource(DeclarativeSource):
 62    """Declarative source defined by a manifest of low-code components that define source connector behavior"""
 63
 64    def __init__(
 65        self,
 66        source_config: ConnectionDefinition,
 67        *,
 68        config: Mapping[str, Any] | None = None,
 69        debug: bool = False,
 70        emit_connector_builder_messages: bool = False,
 71        component_factory: Optional[ModelToComponentFactory] = None,
 72    ):
 73        """
 74        Args:
 75            config: The provided config dict.
 76            source_config: The manifest of low-code components that describe the source connector.
 77            debug: True if debug mode is enabled.
 78            emit_connector_builder_messages: True if messages should be emitted to the connector builder.
 79            component_factory: optional factory if ModelToComponentFactory's default behavior needs to be tweaked.
 80        """
 81        self.logger = logging.getLogger(f"airbyte.{self.name}")
 82        # For ease of use we don't require the type to be specified at the top level manifest, but it should be included during processing
 83        manifest = dict(source_config)
 84        if "type" not in manifest:
 85            manifest["type"] = "DeclarativeSource"
 86
 87        # If custom components are needed, locate and/or register them.
 88        self.components_module: ModuleType | None = get_registered_components_module(config=config)
 89
 90        resolved_source_config = ManifestReferenceResolver().preprocess_manifest(manifest)
 91        propagated_source_config = ManifestComponentTransformer().propagate_types_and_parameters(
 92            "", resolved_source_config, {}
 93        )
 94        self._source_config = propagated_source_config
 95        self._debug = debug
 96        self._emit_connector_builder_messages = emit_connector_builder_messages
 97        self._constructor = (
 98            component_factory
 99            if component_factory
100            else ModelToComponentFactory(
101                emit_connector_builder_messages,
102                max_concurrent_async_job_count=source_config.get("max_concurrent_async_job_count"),
103            )
104        )
105        self._message_repository = self._constructor.get_message_repository()
106        self._slice_logger: SliceLogger = (
107            AlwaysLogSliceLogger() if emit_connector_builder_messages else DebugSliceLogger()
108        )
109
110        self._validate_source()
111
112    @property
113    def resolved_manifest(self) -> Mapping[str, Any]:
114        return self._source_config
115
116    @property
117    def message_repository(self) -> MessageRepository:
118        return self._message_repository
119
120    @property
121    def connection_checker(self) -> ConnectionChecker:
122        check = self._source_config["check"]
123        if "type" not in check:
124            check["type"] = "CheckStream"
125        check_stream = self._constructor.create_component(
126            COMPONENTS_CHECKER_TYPE_MAPPING[check["type"]],
127            check,
128            dict(),
129            emit_connector_builder_messages=self._emit_connector_builder_messages,
130        )
131        if isinstance(check_stream, ConnectionChecker):
132            return check_stream
133        else:
134            raise ValueError(
135                f"Expected to generate a ConnectionChecker component, but received {check_stream.__class__}"
136            )
137
138    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
139        self._emit_manifest_debug_message(
140            extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)}
141        )
142
143        stream_configs = self._stream_configs(self._source_config) + self._dynamic_stream_configs(
144            self._source_config, config
145        )
146
147        api_budget_model = self._source_config.get("api_budget")
148        if api_budget_model:
149            self._constructor.set_api_budget(api_budget_model, config)
150
151        source_streams = [
152            self._constructor.create_component(
153                StateDelegatingStreamModel
154                if stream_config.get("type") == StateDelegatingStreamModel.__name__
155                else DeclarativeStreamModel,
156                stream_config,
157                config,
158                emit_connector_builder_messages=self._emit_connector_builder_messages,
159            )
160            for stream_config in self._initialize_cache_for_parent_streams(deepcopy(stream_configs))
161        ]
162
163        return source_streams
164
165    @staticmethod
166    def _initialize_cache_for_parent_streams(
167        stream_configs: List[Dict[str, Any]],
168    ) -> List[Dict[str, Any]]:
169        parent_streams = set()
170
171        def update_with_cache_parent_configs(parent_configs: list[dict[str, Any]]) -> None:
172            for parent_config in parent_configs:
173                parent_streams.add(parent_config["stream"]["name"])
174                if parent_config["stream"]["type"] == "StateDelegatingStream":
175                    parent_config["stream"]["full_refresh_stream"]["retriever"]["requester"][
176                        "use_cache"
177                    ] = True
178                    parent_config["stream"]["incremental_stream"]["retriever"]["requester"][
179                        "use_cache"
180                    ] = True
181                else:
182                    parent_config["stream"]["retriever"]["requester"]["use_cache"] = True
183
184        for stream_config in stream_configs:
185            if stream_config.get("incremental_sync", {}).get("parent_stream"):
186                parent_streams.add(stream_config["incremental_sync"]["parent_stream"]["name"])
187                stream_config["incremental_sync"]["parent_stream"]["retriever"]["requester"][
188                    "use_cache"
189                ] = True
190
191            elif stream_config.get("retriever", {}).get("partition_router", {}):
192                partition_router = stream_config["retriever"]["partition_router"]
193
194                if isinstance(partition_router, dict) and partition_router.get(
195                    "parent_stream_configs"
196                ):
197                    update_with_cache_parent_configs(partition_router["parent_stream_configs"])
198                elif isinstance(partition_router, list):
199                    for router in partition_router:
200                        if router.get("parent_stream_configs"):
201                            update_with_cache_parent_configs(router["parent_stream_configs"])
202
203        for stream_config in stream_configs:
204            if stream_config["name"] in parent_streams:
205                if stream_config["type"] == "StateDelegatingStream":
206                    stream_config["full_refresh_stream"]["retriever"]["requester"]["use_cache"] = (
207                        True
208                    )
209                    stream_config["incremental_stream"]["retriever"]["requester"]["use_cache"] = (
210                        True
211                    )
212                else:
213                    stream_config["retriever"]["requester"]["use_cache"] = True
214
215        return stream_configs
216
217    def spec(self, logger: logging.Logger) -> ConnectorSpecification:
218        """
219        Returns the connector specification (spec) as defined in the Airbyte Protocol. The spec is an object describing the possible
220        configurations (e.g: username and password) which can be configured when running this connector. For low-code connectors, this
221        will first attempt to load the spec from the manifest's spec block, otherwise it will load it from "spec.yaml" or "spec.json"
222        in the project root.
223        """
224        self._configure_logger_level(logger)
225        self._emit_manifest_debug_message(
226            extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)}
227        )
228
229        spec = self._source_config.get("spec")
230        if spec:
231            if "type" not in spec:
232                spec["type"] = "Spec"
233            spec_component = self._constructor.create_component(SpecModel, spec, dict())
234            return spec_component.generate_spec()
235        else:
236            return super().spec(logger)
237
238    def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus:
239        self._configure_logger_level(logger)
240        return super().check(logger, config)
241
242    def read(
243        self,
244        logger: logging.Logger,
245        config: Mapping[str, Any],
246        catalog: ConfiguredAirbyteCatalog,
247        state: Optional[List[AirbyteStateMessage]] = None,
248    ) -> Iterator[AirbyteMessage]:
249        self._configure_logger_level(logger)
250        yield from super().read(logger, config, catalog, state)
251
252    def _configure_logger_level(self, logger: logging.Logger) -> None:
253        """
254        Set the log level to logging.DEBUG if debug mode is enabled
255        """
256        if self._debug:
257            logger.setLevel(logging.DEBUG)
258
259    def _validate_source(self) -> None:
260        """
261        Validates the connector manifest against the declarative component schema
262        """
263        try:
264            raw_component_schema = pkgutil.get_data(
265                "airbyte_cdk", "sources/declarative/declarative_component_schema.yaml"
266            )
267            if raw_component_schema is not None:
268                declarative_component_schema = yaml.load(
269                    raw_component_schema, Loader=yaml.SafeLoader
270                )
271            else:
272                raise RuntimeError(
273                    "Failed to read manifest component json schema required for validation"
274                )
275        except FileNotFoundError as e:
276            raise FileNotFoundError(
277                f"Failed to read manifest component json schema required for validation: {e}"
278            )
279
280        streams = self._source_config.get("streams")
281        dynamic_streams = self._source_config.get("dynamic_streams")
282        if not (streams or dynamic_streams):
283            raise ValidationError(
284                f"A valid manifest should have at least one stream defined. Got {streams}"
285            )
286
287        try:
288            validate(self._source_config, declarative_component_schema)
289        except ValidationError as e:
290            raise ValidationError(
291                "Validation against json schema defined in declarative_component_schema.yaml schema failed"
292            ) from e
293
294        cdk_version_str = metadata.version("airbyte_cdk")
295        cdk_version = self._parse_version(cdk_version_str, "airbyte-cdk")
296        manifest_version_str = self._source_config.get("version")
297        if manifest_version_str is None:
298            raise RuntimeError(
299                "Manifest version is not defined in the manifest. This is unexpected since it should be a required field. Please contact support."
300            )
301        manifest_version = self._parse_version(manifest_version_str, "manifest")
302
303        if (cdk_version.major, cdk_version.minor, cdk_version.micro) == (0, 0, 0):
304            # Skipping version compatibility check on unreleased dev branch
305            pass
306        elif (cdk_version.major, cdk_version.minor) < (
307            manifest_version.major,
308            manifest_version.minor,
309        ):
310            raise ValidationError(
311                f"The manifest version {manifest_version!s} is greater than the airbyte-cdk package version ({cdk_version!s}). Your "
312                f"manifest may contain features that are not in the current CDK version."
313            )
314        elif (manifest_version.major, manifest_version.minor) < (0, 29):
315            raise ValidationError(
316                f"The low-code framework was promoted to Beta in airbyte-cdk version 0.29.0 and contains many breaking changes to the "
317                f"language. The manifest version {manifest_version!s} is incompatible with the airbyte-cdk package version "
318                f"{cdk_version!s} which contains these breaking changes."
319            )
320
321    @staticmethod
322    def _parse_version(
323        version: str,
324        version_type: str,
325    ) -> Version:
326        """Takes a semantic version represented as a string and splits it into a tuple.
327
328        The fourth part (prerelease) is not returned in the tuple.
329
330        Returns:
331            Version: the parsed version object
332        """
333        try:
334            parsed_version = Version(version)
335        except InvalidVersion as ex:
336            raise ValidationError(
337                f"The {version_type} version '{version}' is not a valid version format."
338            ) from ex
339        else:
340            # No exception
341            return parsed_version
342
343    def _stream_configs(self, manifest: Mapping[str, Any]) -> List[Dict[str, Any]]:
344        # Static analysis flags that this could be a static method; after we finish part 4 we'll replace manifest with self._source_config
345        stream_configs: List[Dict[str, Any]] = manifest.get("streams", [])
346        for s in stream_configs:
347            if "type" not in s:
348                s["type"] = "DeclarativeStream"
349        return stream_configs
350
351    def _dynamic_stream_configs(
352        self, manifest: Mapping[str, Any], config: Mapping[str, Any]
353    ) -> List[Dict[str, Any]]:
354        dynamic_stream_definitions: List[Dict[str, Any]] = manifest.get("dynamic_streams", [])
355        dynamic_stream_configs: List[Dict[str, Any]] = []
356        seen_dynamic_streams: Set[str] = set()
357
358        for dynamic_definition in dynamic_stream_definitions:
359            components_resolver_config = dynamic_definition["components_resolver"]
360
361            if not components_resolver_config:
362                raise ValueError(
363                    f"Missing 'components_resolver' in dynamic definition: {dynamic_definition}"
364                )
365
366            resolver_type = components_resolver_config.get("type")
367            if not resolver_type:
368                raise ValueError(
369                    f"Missing 'type' in components resolver configuration: {components_resolver_config}"
370                )
371
372            if resolver_type not in COMPONENTS_RESOLVER_TYPE_MAPPING:
373                raise ValueError(
374                    f"Invalid components resolver type '{resolver_type}'. "
375                    f"Expected one of {list(COMPONENTS_RESOLVER_TYPE_MAPPING.keys())}."
376                )
377
378            if "retriever" in components_resolver_config:
379                components_resolver_config["retriever"]["requester"]["use_cache"] = True
380
381            # Create a resolver for dynamic components based on type
382            components_resolver = self._constructor.create_component(
383                COMPONENTS_RESOLVER_TYPE_MAPPING[resolver_type], components_resolver_config, config
384            )
385
386            stream_template_config = dynamic_definition["stream_template"]
387
388            for dynamic_stream in components_resolver.resolve_components(
389                stream_template_config=stream_template_config
390            ):
391                if "type" not in dynamic_stream:
392                    dynamic_stream["type"] = "DeclarativeStream"
393
394                # Ensure that each stream is created with a unique name
395                name = dynamic_stream.get("name")
396
397                if not isinstance(name, str):
398                    raise ValueError(
399                        f"Expected stream name {name} to be a string, got {type(name)}."
400                    )
401
402                if name in seen_dynamic_streams:
403                    error_message = f"Dynamic streams list contains a duplicate name: {name}. Please contact Airbyte Support."
404                    failure_type = FailureType.system_error
405
406                    if resolver_type == "ConfigComponentsResolver":
407                        error_message = f"Dynamic streams list contains a duplicate name: {name}. Please check your configuration."
408                        failure_type = FailureType.config_error
409
410                    raise AirbyteTracedException(
411                        message=error_message,
412                        internal_message=error_message,
413                        failure_type=failure_type,
414                    )
415
416                seen_dynamic_streams.add(name)
417                dynamic_stream_configs.append(dynamic_stream)
418
419        return dynamic_stream_configs
420
421    def _emit_manifest_debug_message(self, extra_args: dict[str, Any]) -> None:
422        self.logger.debug("declarative source created from manifest", extra=extra_args)

Declarative source defined by a manifest of low-code components that define source connector behavior

ManifestDeclarativeSource( source_config: Mapping[str, Any], *, config: Optional[Mapping[str, Any]] = None, debug: bool = False, emit_connector_builder_messages: bool = False, component_factory: Optional[airbyte_cdk.sources.declarative.parsers.model_to_component_factory.ModelToComponentFactory] = None)
 64    def __init__(
 65        self,
 66        source_config: ConnectionDefinition,
 67        *,
 68        config: Mapping[str, Any] | None = None,
 69        debug: bool = False,
 70        emit_connector_builder_messages: bool = False,
 71        component_factory: Optional[ModelToComponentFactory] = None,
 72    ):
 73        """
 74        Args:
 75            config: The provided config dict.
 76            source_config: The manifest of low-code components that describe the source connector.
 77            debug: True if debug mode is enabled.
 78            emit_connector_builder_messages: True if messages should be emitted to the connector builder.
 79            component_factory: optional factory if ModelToComponentFactory's default behavior needs to be tweaked.
 80        """
 81        self.logger = logging.getLogger(f"airbyte.{self.name}")
 82        # For ease of use we don't require the type to be specified at the top level manifest, but it should be included during processing
 83        manifest = dict(source_config)
 84        if "type" not in manifest:
 85            manifest["type"] = "DeclarativeSource"
 86
 87        # If custom components are needed, locate and/or register them.
 88        self.components_module: ModuleType | None = get_registered_components_module(config=config)
 89
 90        resolved_source_config = ManifestReferenceResolver().preprocess_manifest(manifest)
 91        propagated_source_config = ManifestComponentTransformer().propagate_types_and_parameters(
 92            "", resolved_source_config, {}
 93        )
 94        self._source_config = propagated_source_config
 95        self._debug = debug
 96        self._emit_connector_builder_messages = emit_connector_builder_messages
 97        self._constructor = (
 98            component_factory
 99            if component_factory
100            else ModelToComponentFactory(
101                emit_connector_builder_messages,
102                max_concurrent_async_job_count=source_config.get("max_concurrent_async_job_count"),
103            )
104        )
105        self._message_repository = self._constructor.get_message_repository()
106        self._slice_logger: SliceLogger = (
107            AlwaysLogSliceLogger() if emit_connector_builder_messages else DebugSliceLogger()
108        )
109
110        self._validate_source()
Arguments:
  • config: The provided config dict.
  • source_config: The manifest of low-code components that describe the source connector.
  • debug: True if debug mode is enabled.
  • emit_connector_builder_messages: True if messages should be emitted to the connector builder.
  • component_factory: optional factory if ModelToComponentFactory's default behavior needs to be tweaked.
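
The example below is a minimal sketch of building a source from a manifest: it assumes a local manifest.yaml file and hypothetical config values, and is illustrative rather than a complete, validated manifest.

    import yaml

    from airbyte_cdk.sources.declarative.manifest_declarative_source import (
        ManifestDeclarativeSource,
    )

    # Hypothetical path; any mapping that satisfies declarative_component_schema.yaml works.
    with open("manifest.yaml") as manifest_file:
        manifest = yaml.safe_load(manifest_file)

    source = ManifestDeclarativeSource(source_config=manifest, debug=False)
    print(source.resolved_manifest["version"])  # version pinned by the manifest
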
logger
components_module: module | None
resolved_manifest: Mapping[str, Any]
112    @property
113    def resolved_manifest(self) -> Mapping[str, Any]:
114        return self._source_config
message_repository: MessageRepository
116    @property
117    def message_repository(self) -> MessageRepository:
118        return self._message_repository
120    @property
121    def connection_checker(self) -> ConnectionChecker:
122        check = self._source_config["check"]
123        if "type" not in check:
124            check["type"] = "CheckStream"
125        check_stream = self._constructor.create_component(
126            COMPONENTS_CHECKER_TYPE_MAPPING[check["type"]],
127            check,
128            dict(),
129            emit_connector_builder_messages=self._emit_connector_builder_messages,
130        )
131        if isinstance(check_stream, ConnectionChecker):
132            return check_stream
133        else:
134            raise ValueError(
135                f"Expected to generate a ConnectionChecker component, but received {check_stream.__class__}"
136            )

Returns the ConnectionChecker to use for the check operation

def streams( self, config: Mapping[str, Any]) -> List[Stream]:
138    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
139        self._emit_manifest_debug_message(
140            extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)}
141        )
142
143        stream_configs = self._stream_configs(self._source_config) + self._dynamic_stream_configs(
144            self._source_config, config
145        )
146
147        api_budget_model = self._source_config.get("api_budget")
148        if api_budget_model:
149            self._constructor.set_api_budget(api_budget_model, config)
150
151        source_streams = [
152            self._constructor.create_component(
153                StateDelegatingStreamModel
154                if stream_config.get("type") == StateDelegatingStreamModel.__name__
155                else DeclarativeStreamModel,
156                stream_config,
157                config,
158                emit_connector_builder_messages=self._emit_connector_builder_messages,
159            )
160            for stream_config in self._initialize_cache_for_parent_streams(deepcopy(stream_configs))
161        ]
162
163        return source_streams
Parameters
  • config: The user-provided configuration as specified by the source's spec. Any stream construction related operation should happen here.
Returns

A list of the streams in this source connector.
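
Continuing the sketch above, the user-provided config is passed straight through to component construction; the config keys shown are hypothetical.

    config = {"api_key": "..."}  # hypothetical keys defined by the connector's spec
    streams = source.streams(config=config)
    print([stream.name for stream in streams])  # stream names declared in the manifest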

def spec( self, logger: logging.Logger) -> airbyte_protocol_dataclasses.models.airbyte_protocol.ConnectorSpecification:
217    def spec(self, logger: logging.Logger) -> ConnectorSpecification:
218        """
219        Returns the connector specification (spec) as defined in the Airbyte Protocol. The spec is an object describing the possible
220        configurations (e.g: username and password) which can be configured when running this connector. For low-code connectors, this
221        will first attempt to load the spec from the manifest's spec block, otherwise it will load it from "spec.yaml" or "spec.json"
222        in the project root.
223        """
224        self._configure_logger_level(logger)
225        self._emit_manifest_debug_message(
226            extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)}
227        )
228
229        spec = self._source_config.get("spec")
230        if spec:
231            if "type" not in spec:
232                spec["type"] = "Spec"
233            spec_component = self._constructor.create_component(SpecModel, spec, dict())
234            return spec_component.generate_spec()
235        else:
236            return super().spec(logger)

Returns the connector specification (spec) as defined in the Airbyte Protocol. The spec is an object describing the possible configurations (e.g: username and password) which can be configured when running this connector. For low-code connectors, this will first attempt to load the spec from the manifest's spec block, otherwise it will load it from "spec.yaml" or "spec.json" in the project root.
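
A small usage sketch, continuing the example above; the returned object exposes the JSON schema describing the connector configuration.

    import logging

    specification = source.spec(logging.getLogger("airbyte"))
    print(specification.connectionSpecification)  # JSON schema for the config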

def check( self, logger: logging.Logger, config: Mapping[str, Any]) -> airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteConnectionStatus:
238    def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus:
239        self._configure_logger_level(logger)
240        return super().check(logger, config)

Implements the Check Connection operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#check.
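
Illustrative call, reusing the logger and the hypothetical config from the previous sketches:

    status = source.check(logging.getLogger("airbyte"), config)
    print(status.status)  # Status.SUCCEEDED or Status.FAILED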

def read( self, logger: logging.Logger, config: Mapping[str, Any], catalog: airbyte_protocol_dataclasses.models.airbyte_protocol.ConfiguredAirbyteCatalog, state: Optional[List[airbyte_cdk.models.airbyte_protocol.AirbyteStateMessage]] = None) -> Iterator[AirbyteMessage]:
242    def read(
243        self,
244        logger: logging.Logger,
245        config: Mapping[str, Any],
246        catalog: ConfiguredAirbyteCatalog,
247        state: Optional[List[AirbyteStateMessage]] = None,
248    ) -> Iterator[AirbyteMessage]:
249        self._configure_logger_level(logger)
250        yield from super().read(logger, config, catalog, state)

Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/.
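
The sketch below continues the example above and reads a single hypothetical stream in full-refresh mode; the stream name and empty JSON schema are placeholders, and the catalog would normally come from the platform after a discover call.

    from airbyte_cdk.models import (
        AirbyteStream,
        ConfiguredAirbyteCatalog,
        ConfiguredAirbyteStream,
        DestinationSyncMode,
        SyncMode,
        Type,
    )

    catalog = ConfiguredAirbyteCatalog(
        streams=[
            ConfiguredAirbyteStream(
                stream=AirbyteStream(
                    name="customers",  # hypothetical stream defined in the manifest
                    json_schema={},
                    supported_sync_modes=[SyncMode.full_refresh],
                ),
                sync_mode=SyncMode.full_refresh,
                destination_sync_mode=DestinationSyncMode.overwrite,
            )
        ]
    )

    for message in source.read(logging.getLogger("airbyte"), config, catalog, state=None):
        if message.type == Type.RECORD:
            print(message.record.data)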

@dataclass
class MinMaxDatetime:
 14@dataclass
 15class MinMaxDatetime:
 16    """
 17    Compares the provided date against optional minimum or maximum times. If date is earlier than
 18    min_date, then min_date is returned. If date is greater than max_date, then max_date is returned.
 19    If neither, the input date is returned.
 20
 21    The timestamp format accepts the same format codes as datetime.strftime, which are
 22    all the format codes required by the 1989 C standard.
 23    Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html
 24
 25    Attributes:
 26        datetime (Union[InterpolatedString, str]): InterpolatedString or string representing the datetime in the format specified by `datetime_format`
 27        datetime_format (str): Format of the datetime passed as argument
 28        min_datetime (Union[InterpolatedString, str]): Represents the minimum allowed datetime value.
 29        max_datetime (Union[InterpolatedString, str]): Represents the maximum allowed datetime value.
 30    """
 31
 32    datetime: Union[InterpolatedString, str]
 33    parameters: InitVar[Mapping[str, Any]]
 34    # datetime_format is a unique case where we inherit it from the parent if it is not specified before using the default value
 35    # which is why we need dedicated getter/setter methods and private dataclass field
 36    datetime_format: str
 37    _datetime_format: str = field(init=False, repr=False, default="")
 38    min_datetime: Union[InterpolatedString, str] = ""
 39    max_datetime: Union[InterpolatedString, str] = ""
 40
 41    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 42        self.datetime = InterpolatedString.create(self.datetime, parameters=parameters or {})
 43        self._parser = DatetimeParser()
 44        self.min_datetime = (
 45            InterpolatedString.create(self.min_datetime, parameters=parameters)  # type: ignore [assignment]  #  expression has type "InterpolatedString | None", variable has type "InterpolatedString | str"
 46            if self.min_datetime
 47            else None
 48        )  # type: ignore
 49        self.max_datetime = (
 50            InterpolatedString.create(self.max_datetime, parameters=parameters)  # type: ignore [assignment]  #  expression has type "InterpolatedString | None", variable has type "InterpolatedString | str"
 51            if self.max_datetime
 52            else None
 53        )  # type: ignore
 54
 55    def get_datetime(
 56        self, config: Mapping[str, Any], **additional_parameters: Mapping[str, Any]
 57    ) -> dt.datetime:
 58        """
 59        Evaluates and returns the datetime
 60        :param config: The user-provided configuration as specified by the source's spec
 61        :param additional_parameters: Additional arguments to be passed to the strings for interpolation
 62        :return: The evaluated datetime
 63        """
 64        # We apply a default datetime format here instead of at instantiation, so it can be set by the parent first
 65        datetime_format = self._datetime_format
 66        if not datetime_format:
 67            datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z"
 68
 69        time = self._parser.parse(
 70            str(
 71                self.datetime.eval(  # type: ignore[union-attr] # str has no attribute "eval"
 72                    config,
 73                    **additional_parameters,
 74                )
 75            ),
 76            datetime_format,
 77        )  # type: ignore # datetime is always cast to an interpolated string
 78
 79        if self.min_datetime:
 80            min_time = str(self.min_datetime.eval(config, **additional_parameters))  # type: ignore # min_datetime is always cast to an interpolated string
 81            if min_time:
 82                min_datetime = self._parser.parse(min_time, datetime_format)  # type: ignore # min_datetime is always cast to an interpolated string
 83                time = max(time, min_datetime)
 84        if self.max_datetime:
 85            max_time = str(self.max_datetime.eval(config, **additional_parameters))  # type: ignore # max_datetime is always cast to an interpolated string
 86            if max_time:
 87                max_datetime = self._parser.parse(max_time, datetime_format)
 88                time = min(time, max_datetime)
 89        return time
 90
 91    @property  # type: ignore # properties don't play well with dataclasses...
 92    def datetime_format(self) -> str:
 93        """The format of the string representing the datetime"""
 94        return self._datetime_format
 95
 96    @datetime_format.setter
 97    def datetime_format(self, value: str) -> None:
 98        """Setter for the datetime format"""
 99        # Covers the case where datetime_format is not provided in the constructor, which causes the property object
100        # to be set which we need to avoid doing
101        if not isinstance(value, property):
102            self._datetime_format = value
103
104    @classmethod
105    def create(
106        cls,
107        interpolated_string_or_min_max_datetime: Union[InterpolatedString, str, "MinMaxDatetime"],
108        parameters: Optional[Mapping[str, Any]] = None,
109    ) -> "MinMaxDatetime":
110        if parameters is None:
111            parameters = {}
112        if isinstance(interpolated_string_or_min_max_datetime, InterpolatedString) or isinstance(
113            interpolated_string_or_min_max_datetime, str
114        ):
115            return MinMaxDatetime(  # type: ignore [call-arg]
116                datetime=interpolated_string_or_min_max_datetime, parameters=parameters
117            )
118        else:
119            return interpolated_string_or_min_max_datetime

Compares the provided date against optional minimum or maximum times. If date is earlier than min_date, then min_date is returned. If date is greater than max_date, then max_date is returned. If neither, the input date is returned.

The timestamp format accepts the same format codes as datetime.strftime, which are all the format codes required by the 1989 C standard. Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html

Attributes:
  • datetime (Union[InterpolatedString, str]): InterpolatedString or string representing the datetime in the format specified by datetime_format
  • datetime_format (str): Format of the datetime passed as argument
  • min_datetime (Union[InterpolatedString, str]): Represents the minimum allowed datetime value.
  • max_datetime (Union[InterpolatedString, str]): Represents the maximum allowed datetime value.
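
For illustration, the sketch below clamps a config-provided start date to a fixed floor; the field values are hypothetical, and the default format %Y-%m-%dT%H:%M:%S.%f%z applies because no datetime_format is set.

    from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime

    start = MinMaxDatetime(
        datetime="{{ config['start_date'] }}",
        min_datetime="2021-01-01T00:00:00.000000+0000",
        parameters={},
    )
    # The config value predates min_datetime, so the floor wins.
    print(start.get_datetime(config={"start_date": "2020-06-15T00:00:00.000000+0000"}))
    # 2021-01-01 00:00:00+00:00
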
MinMaxDatetime( datetime: Union[InterpolatedString, str], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], datetime_format: str = <property object>, min_datetime: Union[InterpolatedString, str] = '', max_datetime: Union[InterpolatedString, str] = '')
datetime: Union[InterpolatedString, str]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
datetime_format: str
91    @property  # type: ignore # properties don't play well with dataclasses...
92    def datetime_format(self) -> str:
93        """The format of the string representing the datetime"""
94        return self._datetime_format

The format of the string representing the datetime

min_datetime: Union[InterpolatedString, str] = ''
max_datetime: Union[InterpolatedString, str] = ''
def get_datetime( self, config: Mapping[str, Any], **additional_parameters: Mapping[str, Any]) -> datetime.datetime:
55    def get_datetime(
56        self, config: Mapping[str, Any], **additional_parameters: Mapping[str, Any]
57    ) -> dt.datetime:
58        """
59        Evaluates and returns the datetime
60        :param config: The user-provided configuration as specified by the source's spec
61        :param additional_parameters: Additional arguments to be passed to the strings for interpolation
62        :return: The evaluated datetime
63        """
64        # We apply a default datetime format here instead of at instantiation, so it can be set by the parent first
65        datetime_format = self._datetime_format
66        if not datetime_format:
67            datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z"
68
69        time = self._parser.parse(
70            str(
71                self.datetime.eval(  # type: ignore[union-attr] # str has no attribute "eval"
72                    config,
73                    **additional_parameters,
74                )
75            ),
76            datetime_format,
77        )  # type: ignore # datetime is always cast to an interpolated string
78
79        if self.min_datetime:
80            min_time = str(self.min_datetime.eval(config, **additional_parameters))  # type: ignore # min_datetime is always cast to an interpolated string
81            if min_time:
82                min_datetime = self._parser.parse(min_time, datetime_format)  # type: ignore # min_datetime is always cast to an interpolated string
83                time = max(time, min_datetime)
84        if self.max_datetime:
85            max_time = str(self.max_datetime.eval(config, **additional_parameters))  # type: ignore # max_datetime is always cast to an interpolated string
86            if max_time:
87                max_datetime = self._parser.parse(max_time, datetime_format)
88                time = min(time, max_datetime)
89        return time

Evaluates and returns the datetime

Parameters
  • config: The user-provided configuration as specified by the source's spec
  • additional_parameters: Additional arguments to be passed to the strings for interpolation
Returns

The evaluated datetime

@classmethod
def create( cls, interpolated_string_or_min_max_datetime: Union[InterpolatedString, str, MinMaxDatetime], parameters: Optional[Mapping[str, Any]] = None) -> MinMaxDatetime:
104    @classmethod
105    def create(
106        cls,
107        interpolated_string_or_min_max_datetime: Union[InterpolatedString, str, "MinMaxDatetime"],
108        parameters: Optional[Mapping[str, Any]] = None,
109    ) -> "MinMaxDatetime":
110        if parameters is None:
111            parameters = {}
112        if isinstance(interpolated_string_or_min_max_datetime, InterpolatedString) or isinstance(
113            interpolated_string_or_min_max_datetime, str
114        ):
115            return MinMaxDatetime(  # type: ignore [call-arg]
116                datetime=interpolated_string_or_min_max_datetime, parameters=parameters
117            )
118        else:
119            return interpolated_string_or_min_max_datetime
@dataclass
class NoAuth(airbyte_cdk.DeclarativeAuthenticator):
33@dataclass
34class NoAuth(DeclarativeAuthenticator):
35    parameters: InitVar[Mapping[str, Any]]
36
37    @property
38    def auth_header(self) -> str:
39        return ""
40
41    @property
42    def token(self) -> str:
43        return ""
NoAuth(parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]])
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
auth_header: str
37    @property
38    def auth_header(self) -> str:
39        return ""

HTTP header to set on the requests

token: str
41    @property
42    def token(self) -> str:
43        return ""

The header value to set on outgoing HTTP requests

@dataclass
class OffsetIncrement(airbyte_cdk.PaginationStrategy):
 23@dataclass
 24class OffsetIncrement(PaginationStrategy):
 25    """
 26    Pagination strategy that returns the number of records read so far as the next page token
 27    Examples:
 28        # page_size to be a constant integer value
 29        pagination_strategy:
 30          type: OffsetIncrement
 31          page_size: 2
 32
 33        # page_size to be a constant string value
 34        pagination_strategy:
 35          type: OffsetIncrement
 36          page_size: "2"
 37
 38        # page_size to be an interpolated string value
 39        pagination_strategy:
 40          type: OffsetIncrement
 41          page_size: "{{ parameters['items_per_page'] }}"
 42
 43    Attributes:
 44        page_size (InterpolatedString): the number of records to request
 45    """
 46
 47    config: Config
 48    page_size: Optional[Union[str, int]]
 49    parameters: InitVar[Mapping[str, Any]]
 50    decoder: Decoder = field(
 51        default_factory=lambda: PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
 52    )
 53    inject_on_first_request: bool = False
 54
 55    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 56        page_size = str(self.page_size) if isinstance(self.page_size, int) else self.page_size
 57        if page_size:
 58            self._page_size: Optional[InterpolatedString] = InterpolatedString(
 59                page_size, parameters=parameters
 60            )
 61        else:
 62            self._page_size = None
 63
 64    @property
 65    def initial_token(self) -> Optional[Any]:
 66        if self.inject_on_first_request:
 67            return 0
 68        return None
 69
 70    def next_page_token(
 71        self,
 72        response: requests.Response,
 73        last_page_size: int,
 74        last_record: Optional[Record],
 75        last_page_token_value: Optional[Any] = None,
 76    ) -> Optional[Any]:
 77        decoded_response = next(self.decoder.decode(response))
 78
 79        # Stop paginating when there are fewer records than the page size or the current page has no records
 80        if (
 81            self._page_size
 82            and last_page_size < self._page_size.eval(self.config, response=decoded_response)
 83        ) or last_page_size == 0:
 84            return None
 85        elif last_page_token_value is None:
 86            # If the OffsetIncrement strategy does not inject on the first request, the incoming last_page_token_value
 87            # will be None. For this case, we assume that None was the first page and progress to the next offset
 88            return 0 + last_page_size
 89        elif not isinstance(last_page_token_value, int):
 90            raise ValueError(
 91                f"Last page token value {last_page_token_value} for OffsetIncrement pagination strategy was not an integer"
 92            )
 93        else:
 94            return last_page_token_value + last_page_size
 95
 96    def get_page_size(self) -> Optional[int]:
 97        if self._page_size:
 98            page_size = self._page_size.eval(self.config)
 99            if not isinstance(page_size, int):
100                raise Exception(f"{page_size} is of type {type(page_size)}. Expected {int}")
101            return page_size
102        else:
103            return None

Pagination strategy that returns the number of records read so far as the next page token

Examples:

    # page_size to be a constant integer value
    pagination_strategy:
      type: OffsetIncrement
      page_size: 2

    # page_size to be a constant string value
    pagination_strategy:
      type: OffsetIncrement
      page_size: "2"

    # page_size to be an interpolated string value
    pagination_strategy:
      type: OffsetIncrement
      page_size: "{{ parameters['items_per_page'] }}"

Attributes:
  • page_size (InterpolatedString): the number of records to request
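
A small illustrative sketch of the offset arithmetic; the values are hypothetical, and next_page_token (which also consults the decoded response) is described in comments rather than called.

    from airbyte_cdk.sources.declarative.requesters.paginators.strategies.offset_increment import (
        OffsetIncrement,
    )

    strategy = OffsetIncrement(config={}, page_size=2, parameters={}, inject_on_first_request=True)
    print(strategy.initial_token)    # 0 -- offset injected into the first request
    print(strategy.get_page_size())  # 2
    # After a full page of 2 records at offset 0, next_page_token returns 0 + 2 = 2;
    # once a page comes back with fewer than 2 records, it returns None and pagination stops.
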
OffsetIncrement( config: Mapping[str, Any], page_size: Union[str, int, NoneType], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], decoder: Decoder = <factory>, inject_on_first_request: bool = False)
config: Mapping[str, Any]
page_size: Union[str, int, NoneType]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
decoder: Decoder
inject_on_first_request: bool = False
initial_token: Optional[Any]
64    @property
65    def initial_token(self) -> Optional[Any]:
66        if self.inject_on_first_request:
67            return 0
68        return None

Return the initial value of the token

def next_page_token( self, response: requests.models.Response, last_page_size: int, last_record: Optional[Record], last_page_token_value: Optional[Any] = None) -> Optional[Any]:
70    def next_page_token(
71        self,
72        response: requests.Response,
73        last_page_size: int,
74        last_record: Optional[Record],
75        last_page_token_value: Optional[Any] = None,
76    ) -> Optional[Any]:
77        decoded_response = next(self.decoder.decode(response))
78
79        # Stop paginating when there are fewer records than the page size or the current page has no records
80        if (
81            self._page_size
82            and last_page_size < self._page_size.eval(self.config, response=decoded_response)
83        ) or last_page_size == 0:
84            return None
85        elif last_page_token_value is None:
86            # If the OffsetIncrement strategy does not inject on the first request, the incoming last_page_token_value
87            # will be None. For this case, we assume that None was the first page and progress to the next offset
88            return 0 + last_page_size
89        elif not isinstance(last_page_token_value, int):
90            raise ValueError(
91                f"Last page token value {last_page_token_value} for OffsetIncrement pagination strategy was not an integer"
92            )
93        else:
94            return last_page_token_value + last_page_size
Parameters
  • response: response to process
  • last_page_size: the number of records read from the response
  • last_record: the last record extracted from the response
  • last_page_token_value: The current value of the page token made on the last request
Returns

next page token. Returns None if there are no more pages to fetch

def get_page_size(self) -> Optional[int]:
 96    def get_page_size(self) -> Optional[int]:
 97        if self._page_size:
 98            page_size = self._page_size.eval(self.config)
 99            if not isinstance(page_size, int):
100                raise Exception(f"{page_size} is of type {type(page_size)}. Expected {int}")
101            return page_size
102        else:
103            return None
Returns

page size: The number of records to fetch in a page. Returns None if unspecified

@dataclass
class PageIncrement(airbyte_cdk.PaginationStrategy):
18@dataclass
19class PageIncrement(PaginationStrategy):
20    """
21    Pagination strategy that returns the number of pages read so far as the next page token
22
23    Attributes:
24        page_size (int): the number of records to request
25        start_from_page (int): number of the initial page
26    """
27
28    config: Config
29    page_size: Optional[Union[str, int]]
30    parameters: InitVar[Mapping[str, Any]]
31    start_from_page: int = 0
32    inject_on_first_request: bool = False
33
34    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
35        if isinstance(self.page_size, int) or (self.page_size is None):
36            self._page_size = self.page_size
37        else:
38            page_size = InterpolatedString(self.page_size, parameters=parameters).eval(self.config)
39            if not isinstance(page_size, int):
40                raise Exception(f"{page_size} is of type {type(page_size)}. Expected {int}")
41            self._page_size = page_size
42
43    @property
44    def initial_token(self) -> Optional[Any]:
45        if self.inject_on_first_request:
46            return self.start_from_page
47        return None
48
49    def next_page_token(
50        self,
51        response: requests.Response,
52        last_page_size: int,
53        last_record: Optional[Record],
54        last_page_token_value: Optional[Any],
55    ) -> Optional[Any]:
56        # Stop paginating when there are fewer records than the page size or the current page has no records
57        if (self._page_size and last_page_size < self._page_size) or last_page_size == 0:
58            return None
59        elif last_page_token_value is None:
60            # If the PageIncrement strategy does not inject on the first request, the incoming last_page_token_value
61            # may be None. When this is the case, we assume we've already requested the first page specified by
62            # start_from_page and must now get the next page
63            return self.start_from_page + 1
64        elif not isinstance(last_page_token_value, int):
65            raise ValueError(
66                f"Last page token value {last_page_token_value} for PageIncrement pagination strategy was not an integer"
67            )
68        else:
69            return last_page_token_value + 1
70
71    def get_page_size(self) -> Optional[int]:
72        return self._page_size

Pagination strategy that returns the number of pages read so far as the next page token

Attributes:
  • page_size (int): the number of records to request
  • start_from_page (int): number of the initial page
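
A minimal sketch of the page-number progression; PageIncrement never inspects the response body, so a bare requests.Response suffices here, and the page size and starting page are hypothetical.

    import requests

    from airbyte_cdk.sources.declarative.requesters.paginators.strategies.page_increment import (
        PageIncrement,
    )

    strategy = PageIncrement(
        config={}, page_size=50, parameters={}, start_from_page=1, inject_on_first_request=True
    )
    response = requests.Response()  # body is never read by this strategy

    print(strategy.initial_token)                           # 1 -- first page requested
    print(strategy.next_page_token(response, 50, None, 1))  # 2 -- full page, keep paginating
    print(strategy.next_page_token(response, 10, None, 2))  # None -- short page, stop
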
PageIncrement( config: Mapping[str, Any], page_size: Union[str, int, NoneType], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], start_from_page: int = 0, inject_on_first_request: bool = False)
config: Mapping[str, Any]
page_size: Union[str, int, NoneType]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
start_from_page: int = 0
inject_on_first_request: bool = False
initial_token: Optional[Any]
43    @property
44    def initial_token(self) -> Optional[Any]:
45        if self.inject_on_first_request:
46            return self.start_from_page
47        return None

Return the initial value of the token

def next_page_token( self, response: requests.models.Response, last_page_size: int, last_record: Optional[Record], last_page_token_value: Optional[Any]) -> Optional[Any]:
49    def next_page_token(
50        self,
51        response: requests.Response,
52        last_page_size: int,
53        last_record: Optional[Record],
54        last_page_token_value: Optional[Any],
55    ) -> Optional[Any]:
56        # Stop paginating when there are fewer records than the page size or the current page has no records
57        if (self._page_size and last_page_size < self._page_size) or last_page_size == 0:
58            return None
59        elif last_page_token_value is None:
60            # If the PageIncrement strategy does not inject on the first request, the incoming last_page_token_value
61            # may be None. When this is the case, we assume we've already requested the first page specified by
62            # start_from_page and must now get the next page
63            return self.start_from_page + 1
64        elif not isinstance(last_page_token_value, int):
65            raise ValueError(
66                f"Last page token value {last_page_token_value} for PageIncrement pagination strategy was not an integer"
67            )
68        else:
69            return last_page_token_value + 1
Parameters
  • response: response to process
  • last_page_size: the number of records read from the response
  • last_record: the last record extracted from the response
  • last_page_token_value: The current value of the page token made on the last request
Returns

next page token. Returns None if there are no more pages to fetch

def get_page_size(self) -> Optional[int]:
71    def get_page_size(self) -> Optional[int]:
72        return self._page_size
Returns

page size: The number of records to fetch in a page. Returns None if unspecified

@dataclass
class PaginationStrategy:
15@dataclass
16class PaginationStrategy:
17    """
18    Defines how to get the next page token
19    """
20
21    @property
22    @abstractmethod
23    def initial_token(self) -> Optional[Any]:
24        """
25        Return the initial value of the token
26        """
27
28    @abstractmethod
29    def next_page_token(
30        self,
31        response: requests.Response,
32        last_page_size: int,
33        last_record: Optional[Record],
34        last_page_token_value: Optional[Any],
35    ) -> Optional[Any]:
36        """
37        :param response: response to process
38        :param last_page_size: the number of records read from the response
39        :param last_record: the last record extracted from the response
40        :param last_page_token_value: The current value of the page token made on the last request
41        :return: next page token. Returns None if there are no more pages to fetch
42        """
43        pass
44
45    @abstractmethod
46    def get_page_size(self) -> Optional[int]:
47        """
48        :return: page size: The number of records to fetch in a page. Returns None if unspecified
49        """

Defines how to get the next page token
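
As a hedged sketch, a custom strategy only needs to implement the three members below; the "next_cursor" response key is hypothetical.

    from dataclasses import dataclass
    from typing import Any, Optional

    import requests

    from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import (
        PaginationStrategy,
    )
    from airbyte_cdk.sources.types import Record


    @dataclass
    class NextCursorStrategy(PaginationStrategy):
        """Follows an opaque cursor returned under a hypothetical 'next_cursor' key."""

        @property
        def initial_token(self) -> Optional[Any]:
            return None  # no cursor is sent with the first request

        def next_page_token(
            self,
            response: requests.Response,
            last_page_size: int,
            last_record: Optional[Record],
            last_page_token_value: Optional[Any],
        ) -> Optional[Any]:
            return response.json().get("next_cursor")  # None ends pagination

        def get_page_size(self) -> Optional[int]:
            return None  # page size is dictated by the API, not configurable here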

initial_token: Optional[Any]
21    @property
22    @abstractmethod
23    def initial_token(self) -> Optional[Any]:
24        """
25        Return the initial value of the token
26        """

Return the initial value of the token

@abstractmethod
def next_page_token( self, response: requests.models.Response, last_page_size: int, last_record: Optional[Record], last_page_token_value: Optional[Any]) -> Optional[Any]:
28    @abstractmethod
29    def next_page_token(
30        self,
31        response: requests.Response,
32        last_page_size: int,
33        last_record: Optional[Record],
34        last_page_token_value: Optional[Any],
35    ) -> Optional[Any]:
36        """
37        :param response: response to process
38        :param last_page_size: the number of records read from the response
39        :param last_record: the last record extracted from the response
40        :param last_page_token_value: The current value of the page token made on the last request
41        :return: next page token. Returns None if there are no more pages to fetch
42        """
43        pass
Parameters
  • response: response to process
  • last_page_size: the number of records read from the response
  • last_record: the last record extracted from the response
  • last_page_token_value: The current value of the page token made on the last request
Returns

next page token. Returns None if there are no more pages to fetch

@abstractmethod
def get_page_size(self) -> Optional[int]:
45    @abstractmethod
46    def get_page_size(self) -> Optional[int]:
47        """
48        :return: page size: The number of records to fetch in a page. Returns None if unspecified
49        """
Returns

page size: The number of records to fetch in a page. Returns None if unspecified

@dataclass
class ParentStreamConfig:
31@dataclass
32class ParentStreamConfig:
33    """
34    Describes how to create a stream slice from a parent stream
35
36    stream: The stream to read records from
37    parent_key: The key of the parent stream's records that will be the stream slice key
38    partition_field: The partition key
39    extra_fields: Additional field paths to include in the stream slice
40    request_option: How to inject the slice value on an outgoing HTTP request
41    incremental_dependency (bool): Indicates if the parent stream should be read incrementally.
42    """
43
44    stream: "DeclarativeStream"  # Parent streams must be DeclarativeStream because we can't know which part of the stream slice is a partition for regular Stream
45    parent_key: Union[InterpolatedString, str]
46    partition_field: Union[InterpolatedString, str]
47    config: Config
48    parameters: InitVar[Mapping[str, Any]]
49    extra_fields: Optional[Union[List[List[str]], List[List[InterpolatedString]]]] = (
50        None  # List of field paths (arrays of strings)
51    )
52    request_option: Optional[RequestOption] = None
53    incremental_dependency: bool = False
54    lazy_read_pointer: Optional[List[Union[InterpolatedString, str]]] = None
55
56    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
57        self.parent_key = InterpolatedString.create(self.parent_key, parameters=parameters)
58        self.partition_field = InterpolatedString.create(
59            self.partition_field, parameters=parameters
60        )
61        if self.extra_fields:
62            # Create InterpolatedString for each field path in extra_keys
63            self.extra_fields = [
64                [InterpolatedString.create(path, parameters=parameters) for path in key_path]
65                for key_path in self.extra_fields
66            ]
67
68        self.lazy_read_pointer = (
69            [
70                InterpolatedString.create(path, parameters=parameters)
71                if isinstance(path, str)
72                else path
73                for path in self.lazy_read_pointer
74            ]
75            if self.lazy_read_pointer
76            else None
77        )

Describes how to create a stream slice from a parent stream

  • stream: The stream to read records from
  • parent_key: The key of the parent stream's records that will be the stream slice key
  • partition_field: The partition key
  • extra_fields: Additional field paths to include in the stream slice
  • request_option: How to inject the slice value on an outgoing HTTP request
  • incremental_dependency (bool): Indicates if the parent stream should be read incrementally.
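
In a manifest, a parent stream is usually wired up through a SubstreamPartitionRouter; the dict below is a hedged Python rendering of that shape with hypothetical names ("projects_stream", "project_id"), so consult declarative_component_schema.yaml for the authoritative fields.

    partition_router = {
        "type": "SubstreamPartitionRouter",
        "parent_stream_configs": [
            {
                "type": "ParentStreamConfig",
                "stream": {"$ref": "#/definitions/projects_stream"},  # hypothetical parent stream
                "parent_key": "id",               # value taken from each parent record
                "partition_field": "project_id",  # key under which it appears in the stream slice
                "incremental_dependency": False,
            }
        ],
    }
    # A parent record {"id": 42} then produces a slice partition {"project_id": 42}
    # that the child stream can interpolate into its requests.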

ParentStreamConfig( stream: DeclarativeStream, parent_key: Union[InterpolatedString, str], partition_field: Union[InterpolatedString, str], config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], extra_fields: Union[List[List[str]], List[List[InterpolatedString]], NoneType] = None, request_option: Optional[RequestOption] = None, incremental_dependency: bool = False, lazy_read_pointer: Optional[List[Union[InterpolatedString, str]]] = None)
parent_key: Union[InterpolatedString, str]
partition_field: Union[InterpolatedString, str]
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
extra_fields: Union[List[List[str]], List[List[InterpolatedString]], NoneType] = None
request_option: Optional[RequestOption] = None
incremental_dependency: bool = False
lazy_read_pointer: Optional[List[Union[InterpolatedString, str]]] = None
class ReadException(builtins.Exception):
 7class ReadException(Exception):
 8    """
 9    Raise when there is an error reading data from an API Source
10    """

Raise when there is an error reading data from an API Source

@dataclass
class RecordExtractor:
12@dataclass
13class RecordExtractor:
14    """
15    Responsible for translating an HTTP response into a list of records by extracting records from the response.
16    """
17
18    @abstractmethod
19    def extract_records(
20        self,
21        response: requests.Response,
22    ) -> Iterable[Mapping[str, Any]]:
23        """
24        Selects records from the response
25        :param response: The response to extract the records from
26        :return: List of Records extracted from the response
27        """
28        pass

Responsible for translating an HTTP response into a list of records by extracting records from the response.

@abstractmethod
def extract_records(self, response: requests.models.Response) -> Iterable[Mapping[str, Any]]:
18    @abstractmethod
19    def extract_records(
20        self,
21        response: requests.Response,
22    ) -> Iterable[Mapping[str, Any]]:
23        """
24        Selects records from the response
25        :param response: The response to extract the records from
26        :return: List of Records extracted from the response
27        """
28        pass

Selects records from the response

Parameters
  • response: The response to extract the records from
Returns

List of Records extracted from the response
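
A minimal custom extractor sketch, assuming a hypothetical top-level "items" array in the JSON response body (in practice the built-in DpathExtractor covers this case).

    from dataclasses import dataclass
    from typing import Any, Iterable, Mapping

    import requests

    from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor


    @dataclass
    class ItemsExtractor(RecordExtractor):
        def extract_records(self, response: requests.Response) -> Iterable[Mapping[str, Any]]:
            # Yield each element of the hypothetical "items" array as one record.
            yield from response.json().get("items", [])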

@dataclass
class RecordFilter:
17@dataclass
18class RecordFilter:
19    """
20    Filter applied on a list of Records
21
22    config (Config): The user-provided configuration as specified by the source's spec
23    condition (str): The string representing the predicate to filter a record. Records will be removed if evaluated to False
24    """
25
26    parameters: InitVar[Mapping[str, Any]]
27    config: Config
28    condition: str = ""
29
30    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
31        self._filter_interpolator = InterpolatedBoolean(
32            condition=self.condition, parameters=parameters
33        )
34
35    def filter_records(
36        self,
37        records: Iterable[Mapping[str, Any]],
38        stream_state: StreamState,
39        stream_slice: Optional[StreamSlice] = None,
40        next_page_token: Optional[Mapping[str, Any]] = None,
41    ) -> Iterable[Mapping[str, Any]]:
42        kwargs = {
43            "stream_state": stream_state,
44            "stream_slice": stream_slice,
45            "next_page_token": next_page_token,
46            "stream_slice.extra_fields": stream_slice.extra_fields if stream_slice else {},
47        }
48        for record in records:
49            if self._filter_interpolator.eval(self.config, record=record, **kwargs):
50                yield record

Filter applied on a list of Records

  • config (Config): The user-provided configuration as specified by the source's spec
  • condition (str): The string representing the predicate to filter a record. Records will be removed if evaluated to False
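
An illustrative sketch: keep only records updated on or after a config-provided start date (the field names "updated_at" and "start_date" are hypothetical).

    from airbyte_cdk.sources.declarative.extractors.record_filter import RecordFilter

    record_filter = RecordFilter(
        parameters={},
        config={"start_date": "2024-01-01"},
        condition="{{ record['updated_at'] >= config['start_date'] }}",
    )
    records = [
        {"id": 1, "updated_at": "2023-12-31"},
        {"id": 2, "updated_at": "2024-02-01"},
    ]
    print(list(record_filter.filter_records(records, stream_state={})))
    # [{'id': 2, 'updated_at': '2024-02-01'}]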

RecordFilter( parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], config: Mapping[str, Any], condition: str = '')
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
config: Mapping[str, Any]
condition: str = ''
def filter_records( self, records: Iterable[Mapping[str, Any]], stream_state: Mapping[str, Any], stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Iterable[Mapping[str, Any]]:
35    def filter_records(
36        self,
37        records: Iterable[Mapping[str, Any]],
38        stream_state: StreamState,
39        stream_slice: Optional[StreamSlice] = None,
40        next_page_token: Optional[Mapping[str, Any]] = None,
41    ) -> Iterable[Mapping[str, Any]]:
42        kwargs = {
43            "stream_state": stream_state,
44            "stream_slice": stream_slice,
45            "next_page_token": next_page_token,
46            "stream_slice.extra_fields": stream_slice.extra_fields if stream_slice else {},
47        }
48        for record in records:
49            if self._filter_interpolator.eval(self.config, record=record, **kwargs):
50                yield record
@dataclass
class RecordSelector(airbyte_cdk.sources.declarative.extractors.http_selector.HttpSelector):
 24@dataclass
 25class RecordSelector(HttpSelector):
 26    """
 27    Responsible for translating an HTTP response into a list of records by extracting records from the response and optionally filtering
 28    records based on a heuristic.
 29
 30    Attributes:
 31        extractor (RecordExtractor): The record extractor responsible for extracting records from a response
 32        schema_normalization (TypeTransformer): The record normalizer responsible for casting record values to stream schema types
 33        record_filter (RecordFilter): The record filter responsible for filtering extracted records
 34        transformations (List[RecordTransformation]): The transformations to be done on the records
 35    """
 36
 37    extractor: RecordExtractor
 38    config: Config
 39    parameters: InitVar[Mapping[str, Any]]
 40    schema_normalization: Union[TypeTransformer, DeclarativeTypeTransformer]
 41    name: str
 42    _name: Union[InterpolatedString, str] = field(init=False, repr=False, default="")
 43    record_filter: Optional[RecordFilter] = None
 44    transformations: List[RecordTransformation] = field(default_factory=lambda: [])
 45    transform_before_filtering: bool = False
 46
 47    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 48        self._parameters = parameters
 49        self._name = (
 50            InterpolatedString(self._name, parameters=parameters)
 51            if isinstance(self._name, str)
 52            else self._name
 53        )
 54
 55    @property  # type: ignore
 56    def name(self) -> str:
 57        """
 58        :return: Stream name
 59        """
 60        return (
 61            str(self._name.eval(self.config))
 62            if isinstance(self._name, InterpolatedString)
 63            else self._name
 64        )
 65
 66    @name.setter
 67    def name(self, value: str) -> None:
 68        if not isinstance(value, property):
 69            self._name = value
 70
 71    def select_records(
 72        self,
 73        response: requests.Response,
 74        stream_state: StreamState,
 75        records_schema: Mapping[str, Any],
 76        stream_slice: Optional[StreamSlice] = None,
 77        next_page_token: Optional[Mapping[str, Any]] = None,
 78    ) -> Iterable[Record]:
 79        """
 80        Selects records from the response
 81        :param response: The response to select the records from
 82        :param stream_state: The stream state
 83        :param records_schema: json schema of records to return
 84        :param stream_slice: The stream slice
 85        :param next_page_token: The paginator token
 86        :return: List of Records selected from the response
 87        """
 88        all_data: Iterable[Mapping[str, Any]] = self.extractor.extract_records(response)
 89        yield from self.filter_and_transform(
 90            all_data, stream_state, records_schema, stream_slice, next_page_token
 91        )
 92
 93    def filter_and_transform(
 94        self,
 95        all_data: Iterable[Mapping[str, Any]],
 96        stream_state: StreamState,
 97        records_schema: Mapping[str, Any],
 98        stream_slice: Optional[StreamSlice] = None,
 99        next_page_token: Optional[Mapping[str, Any]] = None,
100    ) -> Iterable[Record]:
101        """
102        There is an issue with the selector as of 2024-08-30: it does technology-agnostic processing like filtering, transformation and
103        normalization with an API that is technology-specific (as requests.Response is only for HTTP communication using the requests
104        library).
105
106        Until we decide to move this logic away from the selector, we made this method public so that users like AsyncJobRetriever could
107        share the logic of doing transformations on a set of records.
108        """
109        if self.transform_before_filtering:
110            transformed_data = self._transform(all_data, stream_state, stream_slice)
111            transformed_filtered_data = self._filter(
112                transformed_data, stream_state, stream_slice, next_page_token
113            )
114        else:
115            filtered_data = self._filter(all_data, stream_state, stream_slice, next_page_token)
116            transformed_filtered_data = self._transform(filtered_data, stream_state, stream_slice)
117        normalized_data = self._normalize_by_schema(
118            transformed_filtered_data, schema=records_schema
119        )
120        for data in normalized_data:
121            yield Record(data=data, stream_name=self.name, associated_slice=stream_slice)
122
123    def _normalize_by_schema(
124        self, records: Iterable[Mapping[str, Any]], schema: Optional[Mapping[str, Any]]
125    ) -> Iterable[Mapping[str, Any]]:
126        if schema:
127            # record has type Mapping[str, Any], but dict[str, Any] expected
128            for record in records:
129                normalized_record = dict(record)
130                self.schema_normalization.transform(normalized_record, schema)
131                yield normalized_record
132        else:
133            yield from records
134
135    def _filter(
136        self,
137        records: Iterable[Mapping[str, Any]],
138        stream_state: StreamState,
139        stream_slice: Optional[StreamSlice],
140        next_page_token: Optional[Mapping[str, Any]],
141    ) -> Iterable[Mapping[str, Any]]:
142        if self.record_filter:
143            yield from self.record_filter.filter_records(
144                records,
145                stream_state=stream_state,
146                stream_slice=stream_slice,
147                next_page_token=next_page_token,
148            )
149        else:
150            yield from records
151
152    def _transform(
153        self,
154        records: Iterable[Mapping[str, Any]],
155        stream_state: StreamState,
156        stream_slice: Optional[StreamSlice] = None,
157    ) -> Iterable[Mapping[str, Any]]:
158        for record in records:
159            for transformation in self.transformations:
160                transformation.transform(
161                    record,  # type: ignore  # record has type Mapping[str, Any], but Dict[str, Any] expected
162                    config=self.config,
163                    stream_state=stream_state,
164                    stream_slice=stream_slice,
165                )
166            yield record

Responsible for translating an HTTP response into a list of records by extracting records from the response and optionally filtering records based on a heuristic.

Attributes:
  • extractor (RecordExtractor): The record extractor responsible for extracting records from a response
  • schema_normalization (TypeTransformer): The record normalizer responsible for casting record values to stream schema types
  • record_filter (RecordFilter): The record filter responsible for filtering extracted records
  • transformations (List[RecordTransformation]): The transformations to be done on the records
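
A hedged construction sketch that exercises filter_and_transform directly on in-memory records, bypassing the HTTP response; the extractor field path and the empty schema are placeholders.

    from airbyte_cdk.sources.declarative.extractors.dpath_extractor import DpathExtractor
    from airbyte_cdk.sources.declarative.extractors.record_selector import RecordSelector
    from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer

    selector = RecordSelector(
        extractor=DpathExtractor(field_path=["items"], config={}, parameters={}),
        config={},
        parameters={},
        schema_normalization=TypeTransformer(TransformConfig.NoTransform),
        record_filter=None,
        transformations=[],
    )
    records = selector.filter_and_transform(
        all_data=[{"id": 1}, {"id": 2}],
        stream_state={},
        records_schema={},  # empty schema skips normalization
    )
    print([record.data for record in records])  # [{'id': 1}, {'id': 2}]
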
RecordSelector( extractor: RecordExtractor, config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], schema_normalization: Union[TypeTransformer, airbyte_cdk.sources.declarative.extractors.TypeTransformer], name: str = <property object>, record_filter: Optional[RecordFilter] = None, transformations: List[RecordTransformation] = <factory>, transform_before_filtering: bool = False)
extractor: RecordExtractor
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
name: str
55    @property  # type: ignore
56    def name(self) -> str:
57        """
58        :return: Stream name
59        """
60        return (
61            str(self._name.eval(self.config))
62            if isinstance(self._name, InterpolatedString)
63            else self._name
64        )
Returns

Stream name

record_filter: Optional[RecordFilter] = None
transformations: List[RecordTransformation]
transform_before_filtering: bool = False
def select_records( self, response: requests.models.Response, stream_state: Mapping[str, Any], records_schema: Mapping[str, Any], stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Iterable[Record]:
71    def select_records(
72        self,
73        response: requests.Response,
74        stream_state: StreamState,
75        records_schema: Mapping[str, Any],
76        stream_slice: Optional[StreamSlice] = None,
77        next_page_token: Optional[Mapping[str, Any]] = None,
78    ) -> Iterable[Record]:
79        """
80        Selects records from the response
81        :param response: The response to select the records from
82        :param stream_state: The stream state
83        :param records_schema: json schema of records to return
84        :param stream_slice: The stream slice
85        :param next_page_token: The paginator token
86        :return: List of Records selected from the response
87        """
88        all_data: Iterable[Mapping[str, Any]] = self.extractor.extract_records(response)
89        yield from self.filter_and_transform(
90            all_data, stream_state, records_schema, stream_slice, next_page_token
91        )

Selects records from the response

Parameters
  • response: The response to select the records from
  • stream_state: The stream state
  • records_schema: json schema of records to return
  • stream_slice: The stream slice
  • next_page_token: The paginator token
Returns

List of Records selected from the response

def filter_and_transform( self, all_data: Iterable[Mapping[str, Any]], stream_state: Mapping[str, Any], records_schema: Mapping[str, Any], stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Iterable[Record]:
 93    def filter_and_transform(
 94        self,
 95        all_data: Iterable[Mapping[str, Any]],
 96        stream_state: StreamState,
 97        records_schema: Mapping[str, Any],
 98        stream_slice: Optional[StreamSlice] = None,
 99        next_page_token: Optional[Mapping[str, Any]] = None,
100    ) -> Iterable[Record]:
101        """
102        There is an issue with the selector as of 2024-08-30: it does technology-agnostic processing like filtering, transformation and
103        normalization with an API that is technology-specific (as requests.Response is only for HTTP communication using the requests
104        library).
105
106        Until we decide to move this logic away from the selector, we made this method public so that users like AsyncJobRetriever could
107        share the logic of doing transformations on a set of records.
108        """
109        if self.transform_before_filtering:
110            transformed_data = self._transform(all_data, stream_state, stream_slice)
111            transformed_filtered_data = self._filter(
112                transformed_data, stream_state, stream_slice, next_page_token
113            )
114        else:
115            filtered_data = self._filter(all_data, stream_state, stream_slice, next_page_token)
116            transformed_filtered_data = self._transform(filtered_data, stream_state, stream_slice)
117        normalized_data = self._normalize_by_schema(
118            transformed_filtered_data, schema=records_schema
119        )
120        for data in normalized_data:
121            yield Record(data=data, stream_name=self.name, associated_slice=stream_slice)

There is an issue with the selector as of 2024-08-30: it does technology-agnostic processing like filtering, transformation and normalization with an API that is technology-specific (as requests.Response is only for HTTP communication using the requests library).

Until we decide to move this logic away from the selector, we made this method public so that users like AsyncJobRetriever could share the logic of doing transformations on a set of records.
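
To make the transform_before_filtering flag concrete, here is a minimal, dependency-free sketch of the two orderings described above. The _filter, _transform, and _normalize helpers are hypothetical stand-ins for the selector's record filter, transformations, and schema normalization:

    from typing import Any, Dict, Iterable, List

    def apply_pipeline(
        all_data: Iterable[Dict[str, Any]],
        transform_before_filtering: bool,
    ) -> List[Dict[str, Any]]:
        def _filter(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
            # Stand-in for RecordFilter: keep only "active" records.
            return [r for r in records if r.get("active")]

        def _transform(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
            # Stand-in for RecordTransformation: mutate records in place.
            for r in records:
                r["name"] = r.get("name", "").strip()
            return records

        def _normalize(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
            # Stand-in for schema normalization: cast "id" to the schema type.
            return [{**r, "id": int(r["id"])} for r in records]

        records = list(all_data)
        if transform_before_filtering:
            selected = _filter(_transform(records))
        else:
            selected = _transform(_filter(records))
        return _normalize(selected)

    print(apply_pipeline([{"id": "1", "name": " a ", "active": True}], False))
    # [{'id': 1, 'name': 'a', 'active': True}]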

@dataclass
class RecordTransformation:
13@dataclass
14class RecordTransformation:
15    """
16    Implementations of this class define transformations that can be applied to records of a stream.
17    """
18
19    @abstractmethod
20    def transform(
21        self,
22        record: Dict[str, Any],
23        config: Optional[Config] = None,
24        stream_state: Optional[StreamState] = None,
25        stream_slice: Optional[StreamSlice] = None,
26    ) -> None:
27        """
28        Transform a record by adding, deleting, or mutating fields directly from the record reference passed in argument.
29
30        :param record: The input record to be transformed
31        :param config: The user-provided configuration as specified by the source's spec
32        :param stream_state: The stream state
33        :param stream_slice: The stream slice
34        :return: The transformed record
35        """
36
37    def __eq__(self, other: object) -> bool:
38        return other.__dict__ == self.__dict__

Implementations of this class define transformations that can be applied to records of a stream.

@abstractmethod
def transform( self, record: Dict[str, Any], config: Optional[Mapping[str, Any]] = None, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None) -> None:
19    @abstractmethod
20    def transform(
21        self,
22        record: Dict[str, Any],
23        config: Optional[Config] = None,
24        stream_state: Optional[StreamState] = None,
25        stream_slice: Optional[StreamSlice] = None,
26    ) -> None:
27        """
28        Transform a record by adding, deleting, or mutating fields directly from the record reference passed in argument.
29
30        :param record: The input record to be transformed
31        :param config: The user-provided configuration as specified by the source's spec
32        :param stream_state: The stream state
33        :param stream_slice: The stream slice
34        :return: The transformed record
35        """

Transform a record by adding, deleting, or mutating fields directly from the record reference passed in argument.

Parameters
  • record: The input record to be transformed
  • config: The user-provided configuration as specified by the source's spec
  • stream_state: The stream state
  • stream_slice: The stream slice
Returns

The transformed record
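
For illustration, a minimal custom transformation might stamp every record with a constant field. This is only a sketch: the class name AddSourceTag and the field it adds are invented here, and it assumes RecordTransformation can be imported from airbyte_cdk.sources.declarative.transformations (adjust the import to your CDK version if needed).

    from dataclasses import dataclass
    from typing import Any, Dict, Mapping, Optional

    # Assumed import path for RecordTransformation.
    from airbyte_cdk.sources.declarative.transformations import RecordTransformation


    @dataclass
    class AddSourceTag(RecordTransformation):
        """Hypothetical transformation: add a constant field to every record."""

        tag: str = "my-source"

        def transform(
            self,
            record: Dict[str, Any],
            config: Optional[Mapping[str, Any]] = None,
            stream_state: Optional[Mapping[str, Any]] = None,
            stream_slice: Optional[Mapping[str, Any]] = None,
        ) -> None:
            # Mutate the record in place, as the interface requires.
            record["source_tag"] = self.tag

Because transform mutates the record in place and returns None, callers keep iterating over the same record objects after the transformation runs.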

@dataclass
class RequestOption:
 25@dataclass
 26class RequestOption:
 27    """
 28    Describes an option to set on a request
 29
 30    Attributes:
 31        field_name (str): Describes the name of the parameter to inject. Mutually exclusive with field_path.
 32        field_path (list(str)): Describes the path to a nested field as a list of field names.
 33          Only valid for body_json injection type, and mutually exclusive with field_name.
 34        inject_into (RequestOptionType): Describes where in the HTTP request to inject the parameter
 35    """
 36
 37    inject_into: RequestOptionType
 38    parameters: InitVar[Mapping[str, Any]]
 39    field_name: Optional[Union[InterpolatedString, str]] = None
 40    field_path: Optional[List[Union[InterpolatedString, str]]] = None
 41
 42    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 43        # Validate inputs. We should expect either field_name or field_path, but not both
 44        if self.field_name is None and self.field_path is None:
 45            raise ValueError("RequestOption requires either a field_name or field_path")
 46
 47        if self.field_name is not None and self.field_path is not None:
 48            raise ValueError(
 49                "Only one of field_name or field_path can be provided to RequestOption"
 50            )
 51
 52        # Nested field injection is only supported for body JSON injection
 53        if self.field_path is not None and self.inject_into != RequestOptionType.body_json:
 54            raise ValueError(
 55                "Nested field injection is only supported for body JSON injection. Please use a top-level field_name for other injection types."
 56            )
 57
 58        # Convert field_name and field_path into InterpolatedString objects if they are strings
 59        if self.field_name is not None:
 60            self.field_name = InterpolatedString.create(self.field_name, parameters=parameters)
 61        elif self.field_path is not None:
 62            self.field_path = [
 63                InterpolatedString.create(segment, parameters=parameters)
 64                for segment in self.field_path
 65            ]
 66
 67    @property
 68    def _is_field_path(self) -> bool:
 69        """Returns whether this option is a field path (ie, a nested field)"""
 70        return self.field_path is not None
 71
 72    def inject_into_request(
 73        self,
 74        target: MutableMapping[str, Any],
 75        value: Any,
 76        config: Config,
 77    ) -> None:
 78        """
 79        Inject a request option value into a target request structure using either field_name or field_path.
 80        For non-body-json injection, only top-level field names are supported.
 81        For body-json injection, both field names and nested field paths are supported.
 82
 83        Args:
 84            target: The request structure to inject the value into
 85            value: The value to inject
 86            config: The config object to use for interpolation
 87        """
 88        if self._is_field_path:
 89            if self.inject_into != RequestOptionType.body_json:
 90                raise ValueError(
 91                    "Nested field injection is only supported for body JSON injection. Please use a top-level field_name for other injection types."
 92                )
 93
 94            assert self.field_path is not None  # for type checker
 95            current = target
 96            # Convert path segments into strings, evaluating any interpolated segments
 97            # Example: ["data", "{{ config[user_type] }}", "id"] -> ["data", "admin", "id"]
 98            *path_parts, final_key = [
 99                str(
100                    segment.eval(config=config)
101                    if isinstance(segment, InterpolatedString)
102                    else segment
103                )
104                for segment in self.field_path
105            ]
106
107            # Build a nested dictionary structure and set the final value at the deepest level
108            for part in path_parts:
109                current = current.setdefault(part, {})
110            current[final_key] = value
111        else:
112            # For non-nested fields, evaluate the field name if it's an interpolated string
113            key = (
114                self.field_name.eval(config=config)
115                if isinstance(self.field_name, InterpolatedString)
116                else self.field_name
117            )
118            target[str(key)] = value

Describes an option to set on a request

Attributes:
  • field_name (str): Describes the name of the parameter to inject. Mutually exclusive with field_path.
  • field_path (list(str)): Describes the path to a nested field as a list of field names. Only valid for body_json injection type, and mutually exclusive with field_name.
  • inject_into (RequestOptionType): Describes where in the HTTP request to inject the parameter
RequestOption( inject_into: RequestOptionType, parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], field_name: Union[InterpolatedString, str, NoneType] = None, field_path: Optional[List[Union[InterpolatedString, str]]] = None)
inject_into: RequestOptionType
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
field_name: Union[InterpolatedString, str, NoneType] = None
field_path: Optional[List[Union[InterpolatedString, str]]] = None
def inject_into_request( self, target: MutableMapping[str, Any], value: Any, config: Mapping[str, Any]) -> None:
 72    def inject_into_request(
 73        self,
 74        target: MutableMapping[str, Any],
 75        value: Any,
 76        config: Config,
 77    ) -> None:
 78        """
 79        Inject a request option value into a target request structure using either field_name or field_path.
 80        For non-body-json injection, only top-level field names are supported.
 81        For body-json injection, both field names and nested field paths are supported.
 82
 83        Args:
 84            target: The request structure to inject the value into
 85            value: The value to inject
 86            config: The config object to use for interpolation
 87        """
 88        if self._is_field_path:
 89            if self.inject_into != RequestOptionType.body_json:
 90                raise ValueError(
 91                    "Nested field injection is only supported for body JSON injection. Please use a top-level field_name for other injection types."
 92                )
 93
 94            assert self.field_path is not None  # for type checker
 95            current = target
 96            # Convert path segments into strings, evaluating any interpolated segments
 97            # Example: ["data", "{{ config[user_type] }}", "id"] -> ["data", "admin", "id"]
 98            *path_parts, final_key = [
 99                str(
100                    segment.eval(config=config)
101                    if isinstance(segment, InterpolatedString)
102                    else segment
103                )
104                for segment in self.field_path
105            ]
106
107            # Build a nested dictionary structure and set the final value at the deepest level
108            for part in path_parts:
109                current = current.setdefault(part, {})
110            current[final_key] = value
111        else:
112            # For non-nested fields, evaluate the field name if it's an interpolated string
113            key = (
114                self.field_name.eval(config=config)
115                if isinstance(self.field_name, InterpolatedString)
116                else self.field_name
117            )
118            target[str(key)] = value

Inject a request option value into a target request structure using either field_name or field_path. For non-body-json injection, only top-level field names are supported. For body-json injection, both field names and nested field paths are supported.

Arguments:
  • target: The request structure to inject the value into
  • value: The value to inject
  • config: The config object to use for interpolation
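
The nested field_path behavior reduces to walking (and creating) intermediate dictionaries with setdefault and assigning the value at the final key. A dependency-free sketch of that walk, with arbitrary example field names:

    from typing import Any, List, MutableMapping

    def inject_nested(target: MutableMapping[str, Any], path: List[str], value: Any) -> None:
        """Set value at a nested path, creating intermediate dicts as needed."""
        *parents, final_key = path
        current = target
        for part in parents:
            current = current.setdefault(part, {})
        current[final_key] = value

    body: dict = {}
    inject_nested(body, ["data", "admin", "id"], 123)
    print(body)  # {'data': {'admin': {'id': 123}}}
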
class RequestOptionType(enum.Enum):
14class RequestOptionType(Enum):
15    """
16    Describes where to set a value on a request
17    """
18
19    request_parameter = "request_parameter"
20    header = "header"
21    body_data = "body_data"
22    body_json = "body_json"

Describes where to set a value on a request

request_parameter = <RequestOptionType.request_parameter: 'request_parameter'>
header = <RequestOptionType.header: 'header'>
body_data = <RequestOptionType.body_data: 'body_data'>
body_json = <RequestOptionType.body_json: 'body_json'>
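
In practice, the enum value determines which part of the outgoing HTTP request a value is placed in. The sketch below is illustrative only: the params/headers/data/json buckets mirror how the requests library names these pieces, not the CDK's internal representation.

    from enum import Enum
    from typing import Any, Dict


    class InjectionType(Enum):
        # Mirrors the RequestOptionType values shown above.
        request_parameter = "request_parameter"
        header = "header"
        body_data = "body_data"
        body_json = "body_json"


    _BUCKETS = {
        InjectionType.request_parameter: "params",
        InjectionType.header: "headers",
        InjectionType.body_data: "data",
        InjectionType.body_json: "json",
    }


    def place_option(request: Dict[str, Dict[str, Any]], inject_into: InjectionType, name: str, value: Any) -> None:
        request.setdefault(_BUCKETS[inject_into], {})[name] = value


    req: Dict[str, Dict[str, Any]] = {}
    place_option(req, InjectionType.header, "X-Api-Key", "secret")
    place_option(req, InjectionType.request_parameter, "page", 2)
    print(req)  # {'headers': {'X-Api-Key': 'secret'}, 'params': {'page': 2}}
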
 30class Requester(RequestOptionsProvider):
 31    @abstractmethod
 32    def get_authenticator(self) -> DeclarativeAuthenticator:
 33        """
 34        Specifies the authenticator to use when submitting requests
 35        """
 36        pass
 37
 38    @abstractmethod
 39    def get_url_base(
 40        self,
 41        *,
 42        stream_state: Optional[StreamState],
 43        stream_slice: Optional[StreamSlice],
 44        next_page_token: Optional[Mapping[str, Any]],
 45    ) -> str:
 46        """
 47        :return: URL base for the  API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/"
 48        """
 49
 50    @abstractmethod
 51    def get_path(
 52        self,
 53        *,
 54        stream_state: Optional[StreamState],
 55        stream_slice: Optional[StreamSlice],
 56        next_page_token: Optional[Mapping[str, Any]],
 57    ) -> str:
 58        """
 59        Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity"
 60        """
 61
 62    @abstractmethod
 63    def get_method(self) -> HttpMethod:
 64        """
 65        Specifies the HTTP method to use
 66        """
 67
 68    @abstractmethod
 69    def get_request_params(
 70        self,
 71        *,
 72        stream_state: Optional[StreamState] = None,
 73        stream_slice: Optional[StreamSlice] = None,
 74        next_page_token: Optional[Mapping[str, Any]] = None,
 75    ) -> MutableMapping[str, Any]:
 76        """
 77        Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
 78
 79        E.g: you might want to define query parameters for paging if next_page_token is not None.
 80        """
 81
 82    @abstractmethod
 83    def get_request_headers(
 84        self,
 85        *,
 86        stream_state: Optional[StreamState] = None,
 87        stream_slice: Optional[StreamSlice] = None,
 88        next_page_token: Optional[Mapping[str, Any]] = None,
 89    ) -> Mapping[str, Any]:
 90        """
 91        Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
 92        """
 93
 94    @abstractmethod
 95    def get_request_body_data(
 96        self,
 97        *,
 98        stream_state: Optional[StreamState] = None,
 99        stream_slice: Optional[StreamSlice] = None,
100        next_page_token: Optional[Mapping[str, Any]] = None,
101    ) -> Union[Mapping[str, Any], str]:
102        """
103        Specifies how to populate the body of the request with a non-JSON payload.
104
105        If returns a ready text that it will be sent as is.
106        If returns a dict that it will be converted to a urlencoded form.
107        E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
108
109        At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
110        """
111
112    @abstractmethod
113    def get_request_body_json(
114        self,
115        *,
116        stream_state: Optional[StreamState] = None,
117        stream_slice: Optional[StreamSlice] = None,
118        next_page_token: Optional[Mapping[str, Any]] = None,
119    ) -> Mapping[str, Any]:
120        """
121        Specifies how to populate the body of the request with a JSON payload.
122
123        At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
124        """
125
126    @abstractmethod
127    def send_request(
128        self,
129        stream_state: Optional[StreamState] = None,
130        stream_slice: Optional[StreamSlice] = None,
131        next_page_token: Optional[Mapping[str, Any]] = None,
132        path: Optional[str] = None,
133        request_headers: Optional[Mapping[str, Any]] = None,
134        request_params: Optional[Mapping[str, Any]] = None,
135        request_body_data: Optional[Union[Mapping[str, Any], str]] = None,
136        request_body_json: Optional[Mapping[str, Any]] = None,
137        log_formatter: Optional[Callable[[requests.Response], Any]] = None,
138    ) -> Optional[requests.Response]:
139        """
140        Sends a request and returns the response. Might return no response if the error handler chooses to ignore the response or throw an exception in case of an error.
141        If path is set, the path configured on the requester itself is ignored.
142        If header, params and body are set, they are merged with the ones configured on the requester itself.
143
144        If a log formatter is provided, it's used to log the performed request and response. If it's not provided, no logging is performed.
145        """

Defines the request options to set on an outgoing HTTP request

Options can be passed by

  • request parameter
  • request headers
  • body data
  • json content
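
Taken together, the abstract methods of this interface describe everything needed to assemble one HTTP call. The sketch below is not the CDK's own request machinery; it just shows how an object satisfying the interface could be combined with the requests library (the assumption that HttpMethod is a string-valued enum is noted in a comment):

    import requests

    def build_and_send(requester, stream_slice=None, next_page_token=None):
        # Join the base URL and path reported by the requester.
        base = requester.get_url_base(
            stream_state=None, stream_slice=stream_slice, next_page_token=next_page_token
        )
        path = requester.get_path(
            stream_state=None, stream_slice=stream_slice, next_page_token=next_page_token
        )
        url = base.rstrip("/") + "/" + path.lstrip("/")

        return requests.request(
            method=requester.get_method().value,  # assumes HttpMethod is a str-valued Enum
            url=url,
            params=requester.get_request_params(
                stream_slice=stream_slice, next_page_token=next_page_token
            ),
            headers=requester.get_request_headers(
                stream_slice=stream_slice, next_page_token=next_page_token
            ),
            json=requester.get_request_body_json(
                stream_slice=stream_slice, next_page_token=next_page_token
            )
            or None,
        )
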
@abstractmethod
def get_authenticator( self) -> DeclarativeAuthenticator:
31    @abstractmethod
32    def get_authenticator(self) -> DeclarativeAuthenticator:
33        """
34        Specifies the authenticator to use when submitting requests
35        """
36        pass

Specifies the authenticator to use when submitting requests

@abstractmethod
def get_url_base( self, *, stream_state: Optional[Mapping[str, Any]], stream_slice: Optional[StreamSlice], next_page_token: Optional[Mapping[str, Any]]) -> str:
38    @abstractmethod
39    def get_url_base(
40        self,
41        *,
42        stream_state: Optional[StreamState],
43        stream_slice: Optional[StreamSlice],
44        next_page_token: Optional[Mapping[str, Any]],
45    ) -> str:
46        """
47        :return: URL base for the  API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/"
48        """
Returns

URL base for the API endpoint, e.g. if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/"

@abstractmethod
def get_path( self, *, stream_state: Optional[Mapping[str, Any]], stream_slice: Optional[StreamSlice], next_page_token: Optional[Mapping[str, Any]]) -> str:
50    @abstractmethod
51    def get_path(
52        self,
53        *,
54        stream_state: Optional[StreamState],
55        stream_slice: Optional[StreamSlice],
56        next_page_token: Optional[Mapping[str, Any]],
57    ) -> str:
58        """
59        Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity"
60        """

Returns the URL path for the API endpoint, e.g. if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity"

@abstractmethod
def get_method(self) -> HttpMethod:
62    @abstractmethod
63    def get_method(self) -> HttpMethod:
64        """
65        Specifies the HTTP method to use
66        """

Specifies the HTTP method to use

@abstractmethod
def get_request_params( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> MutableMapping[str, Any]:
68    @abstractmethod
69    def get_request_params(
70        self,
71        *,
72        stream_state: Optional[StreamState] = None,
73        stream_slice: Optional[StreamSlice] = None,
74        next_page_token: Optional[Mapping[str, Any]] = None,
75    ) -> MutableMapping[str, Any]:
76        """
77        Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
78
79        E.g: you might want to define query parameters for paging if next_page_token is not None.
80        """

Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.

E.g. you might want to define query parameters for paging if next_page_token is not None.
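
A hedged sketch of what such paging parameters could look like; the per_page/page names and the token shape {"page": <int>} are assumptions about a hypothetical API, not something the CDK prescribes:

    from typing import Any, Mapping, MutableMapping, Optional

    def paging_request_params(next_page_token: Optional[Mapping[str, Any]] = None) -> MutableMapping[str, Any]:
        params: MutableMapping[str, Any] = {"per_page": 100}
        if next_page_token:
            params["page"] = next_page_token["page"]
        return params

    print(paging_request_params())             # {'per_page': 100}
    print(paging_request_params({"page": 3}))  # {'per_page': 100, 'page': 3}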

@abstractmethod
def get_request_headers( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
82    @abstractmethod
83    def get_request_headers(
84        self,
85        *,
86        stream_state: Optional[StreamState] = None,
87        stream_slice: Optional[StreamSlice] = None,
88        next_page_token: Optional[Mapping[str, Any]] = None,
89    ) -> Mapping[str, Any]:
90        """
91        Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
92        """

Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.

@abstractmethod
def get_request_body_data( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Union[Mapping[str, Any], str]:
 94    @abstractmethod
 95    def get_request_body_data(
 96        self,
 97        *,
 98        stream_state: Optional[StreamState] = None,
 99        stream_slice: Optional[StreamSlice] = None,
100        next_page_token: Optional[Mapping[str, Any]] = None,
101    ) -> Union[Mapping[str, Any], str]:
102        """
103        Specifies how to populate the body of the request with a non-JSON payload.
104
105        If returns a ready text that it will be sent as is.
106        If returns a dict that it will be converted to a urlencoded form.
107        E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
108
109        At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
110        """

Specifies how to populate the body of the request with a non-JSON payload.

If this returns a string, it is sent as-is. If it returns a dict, it is converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".

Only one of the 'request_body_data' and 'request_body_json' methods may be overridden at a time.
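
The dict-to-form conversion described here is the standard URL encoding; a quick standard-library check:

    from urllib.parse import urlencode

    print(urlencode({"key1": "value1", "key2": "value2"}))  # key1=value1&key2=value2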

@abstractmethod
def get_request_body_json( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
112    @abstractmethod
113    def get_request_body_json(
114        self,
115        *,
116        stream_state: Optional[StreamState] = None,
117        stream_slice: Optional[StreamSlice] = None,
118        next_page_token: Optional[Mapping[str, Any]] = None,
119    ) -> Mapping[str, Any]:
120        """
121        Specifies how to populate the body of the request with a JSON payload.
122
123        At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
124        """

Specifies how to populate the body of the request with a JSON payload.

Only one of the 'request_body_data' and 'request_body_json' methods may be overridden at a time.

@abstractmethod
def send_request( self, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None, path: Optional[str] = None, request_headers: Optional[Mapping[str, Any]] = None, request_params: Optional[Mapping[str, Any]] = None, request_body_data: Union[str, Mapping[str, Any], NoneType] = None, request_body_json: Optional[Mapping[str, Any]] = None, log_formatter: Optional[Callable[[requests.models.Response], Any]] = None) -> Optional[requests.models.Response]:
126    @abstractmethod
127    def send_request(
128        self,
129        stream_state: Optional[StreamState] = None,
130        stream_slice: Optional[StreamSlice] = None,
131        next_page_token: Optional[Mapping[str, Any]] = None,
132        path: Optional[str] = None,
133        request_headers: Optional[Mapping[str, Any]] = None,
134        request_params: Optional[Mapping[str, Any]] = None,
135        request_body_data: Optional[Union[Mapping[str, Any], str]] = None,
136        request_body_json: Optional[Mapping[str, Any]] = None,
137        log_formatter: Optional[Callable[[requests.Response], Any]] = None,
138    ) -> Optional[requests.Response]:
139        """
140        Sends a request and returns the response. Might return no response if the error handler chooses to ignore the response or throw an exception in case of an error.
141        If path is set, the path configured on the requester itself is ignored.
142        If header, params and body are set, they are merged with the ones configured on the requester itself.
143
144        If a log formatter is provided, it's used to log the performed request and response. If it's not provided, no logging is performed.
145        """

Sends a request and returns the response. Might return no response if the error handler chooses to ignore the response, or raise an exception in case of an error. If path is set, it overrides the path configured on the requester itself. If headers, params, or a body are set, they are merged with the ones configured on the requester itself.

If a log formatter is provided, it's used to log the performed request and response. If it's not provided, no logging is performed.

ResponseStatus
@dataclass
class SimpleRetriever(airbyte_cdk.sources.declarative.retrievers.retriever.Retriever):
 51@dataclass
 52class SimpleRetriever(Retriever):
 53    """
 54    Retrieves records by synchronously sending requests to fetch records.
 55
 56    The retriever acts as an orchestrator between the requester, the record selector, the paginator, and the stream slicer.
 57
 58    For each stream slice, submit requests until there are no more pages of records to fetch.
 59
 60    This retriever currently inherits from HttpStream to reuse the request submission and pagination machinery.
 61    As a result, some of the parameters passed to some methods are unused.
 62    The two will be decoupled in a future release.
 63
 64    Attributes:
 65        stream_name (str): The stream's name
 66        stream_primary_key (Optional[Union[str, List[str], List[List[str]]]]): The stream's primary key
 67        requester (Requester): The HTTP requester
 68        record_selector (HttpSelector): The record selector
 69        paginator (Optional[Paginator]): The paginator
 70        stream_slicer (Optional[StreamSlicer]): The stream slicer
 71        cursor (Optional[cursor]): The cursor
 72        parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
 73    """
 74
 75    requester: Requester
 76    record_selector: HttpSelector
 77    config: Config
 78    parameters: InitVar[Mapping[str, Any]]
 79    name: str
 80    _name: Union[InterpolatedString, str] = field(init=False, repr=False, default="")
 81    primary_key: Optional[Union[str, List[str], List[List[str]]]]
 82    _primary_key: str = field(init=False, repr=False, default="")
 83    paginator: Optional[Paginator] = None
 84    stream_slicer: StreamSlicer = field(
 85        default_factory=lambda: SinglePartitionRouter(parameters={})
 86    )
 87    request_option_provider: RequestOptionsProvider = field(
 88        default_factory=lambda: DefaultRequestOptionsProvider(parameters={})
 89    )
 90    cursor: Optional[DeclarativeCursor] = None
 91    ignore_stream_slicer_parameters_on_paginated_requests: bool = False
 92
 93    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 94        self._paginator = self.paginator or NoPagination(parameters=parameters)
 95        self._parameters = parameters
 96        self._name = (
 97            InterpolatedString(self._name, parameters=parameters)
 98            if isinstance(self._name, str)
 99            else self._name
100        )
101
102    @property  # type: ignore
103    def name(self) -> str:
104        """
105        :return: Stream name
106        """
107        return (
108            str(self._name.eval(self.config))
109            if isinstance(self._name, InterpolatedString)
110            else self._name
111        )
112
113    @name.setter
114    def name(self, value: str) -> None:
115        if not isinstance(value, property):
116            self._name = value
117
118    def _get_mapping(
119        self, method: Callable[..., Optional[Union[Mapping[str, Any], str]]], **kwargs: Any
120    ) -> Tuple[Union[Mapping[str, Any], str], Set[str]]:
121        """
122        Get mapping from the provided method, and get the keys of the mapping.
123        If the method returns a string, it will return the string and an empty set.
124        If the method returns a dict, it will return the dict and its keys.
125        """
126        mapping = method(**kwargs) or {}
127        keys = set(mapping.keys()) if not isinstance(mapping, str) else set()
128        return mapping, keys
129
130    def _get_request_options(
131        self,
132        stream_state: Optional[StreamData],
133        stream_slice: Optional[StreamSlice],
134        next_page_token: Optional[Mapping[str, Any]],
135        paginator_method: Callable[..., Optional[Union[Mapping[str, Any], str]]],
136        stream_slicer_method: Callable[..., Optional[Union[Mapping[str, Any], str]]],
137    ) -> Union[Mapping[str, Any], str]:
138        """
139        Get the request_option from the paginator and the stream slicer.
140        Raise a ValueError if there's a key collision
141        Returned merged mapping otherwise
142        """
143        # FIXME we should eventually remove the usage of stream_state as part of the interpolation
144
145        is_body_json = paginator_method.__name__ == "get_request_body_json"
146
147        mappings = [
148            paginator_method(
149                stream_slice=stream_slice,
150                next_page_token=next_page_token,
151            ),
152        ]
153        if not next_page_token or not self.ignore_stream_slicer_parameters_on_paginated_requests:
154            mappings.append(
155                stream_slicer_method(
156                    stream_slice=stream_slice,
157                    next_page_token=next_page_token,
158                )
159            )
160        return combine_mappings(mappings, allow_same_value_merge=is_body_json)
161
162    def _request_headers(
163        self,
164        stream_state: Optional[StreamData] = None,
165        stream_slice: Optional[StreamSlice] = None,
166        next_page_token: Optional[Mapping[str, Any]] = None,
167    ) -> Mapping[str, Any]:
168        """
169        Specifies request headers.
170        Authentication headers will overwrite any overlapping headers returned from this method.
171        """
172        headers = self._get_request_options(
173            stream_state,
174            stream_slice,
175            next_page_token,
176            self._paginator.get_request_headers,
177            self.request_option_provider.get_request_headers,
178        )
179        if isinstance(headers, str):
180            raise ValueError("Request headers cannot be a string")
181        return {str(k): str(v) for k, v in headers.items()}
182
183    def _request_params(
184        self,
185        stream_state: Optional[StreamData] = None,
186        stream_slice: Optional[StreamSlice] = None,
187        next_page_token: Optional[Mapping[str, Any]] = None,
188    ) -> Mapping[str, Any]:
189        """
190        Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
191
192        E.g: you might want to define query parameters for paging if next_page_token is not None.
193        """
194        params = self._get_request_options(
195            stream_state,
196            stream_slice,
197            next_page_token,
198            self._paginator.get_request_params,
199            self.request_option_provider.get_request_params,
200        )
201        if isinstance(params, str):
202            raise ValueError("Request params cannot be a string")
203        return params
204
205    def _request_body_data(
206        self,
207        stream_state: Optional[StreamData] = None,
208        stream_slice: Optional[StreamSlice] = None,
209        next_page_token: Optional[Mapping[str, Any]] = None,
210    ) -> Union[Mapping[str, Any], str]:
211        """
212        Specifies how to populate the body of the request with a non-JSON payload.
213
214        If returns a ready text that it will be sent as is.
215        If returns a dict that it will be converted to a urlencoded form.
216        E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
217
218        At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
219        """
220        return self._get_request_options(
221            stream_state,
222            stream_slice,
223            next_page_token,
224            self._paginator.get_request_body_data,
225            self.request_option_provider.get_request_body_data,
226        )
227
228    def _request_body_json(
229        self,
230        stream_state: Optional[StreamData] = None,
231        stream_slice: Optional[StreamSlice] = None,
232        next_page_token: Optional[Mapping[str, Any]] = None,
233    ) -> Optional[Mapping[str, Any]]:
234        """
235        Specifies how to populate the body of the request with a JSON payload.
236
237        At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
238        """
239        body_json = self._get_request_options(
240            stream_state,
241            stream_slice,
242            next_page_token,
243            self._paginator.get_request_body_json,
244            self.request_option_provider.get_request_body_json,
245        )
246        if isinstance(body_json, str):
247            raise ValueError("Request body json cannot be a string")
248        return body_json
249
250    def _paginator_path(
251        self,
252        next_page_token: Optional[Mapping[str, Any]] = None,
253        stream_state: Optional[Mapping[str, Any]] = None,
254        stream_slice: Optional[StreamSlice] = None,
255    ) -> Optional[str]:
256        """
257        If the paginator points to a path, follow it, else return nothing so the requester is used.
258        :param next_page_token:
259        :return:
260        """
261        return self._paginator.path(
262            next_page_token=next_page_token,
263            stream_state=stream_state,
264            stream_slice=stream_slice,
265        )
266
267    def _parse_response(
268        self,
269        response: Optional[requests.Response],
270        stream_state: StreamState,
271        records_schema: Mapping[str, Any],
272        stream_slice: Optional[StreamSlice] = None,
273        next_page_token: Optional[Mapping[str, Any]] = None,
274    ) -> Iterable[Record]:
275        if not response:
276            yield from []
277        else:
278            yield from self.record_selector.select_records(
279                response=response,
280                stream_state=stream_state,
281                records_schema=records_schema,
282                stream_slice=stream_slice,
283                next_page_token=next_page_token,
284            )
285
286    @property  # type: ignore
287    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
288        """The stream's primary key"""
289        return self._primary_key
290
291    @primary_key.setter
292    def primary_key(self, value: str) -> None:
293        if not isinstance(value, property):
294            self._primary_key = value
295
296    def _next_page_token(
297        self,
298        response: requests.Response,
299        last_page_size: int,
300        last_record: Optional[Record],
301        last_page_token_value: Optional[Any],
302    ) -> Optional[Mapping[str, Any]]:
303        """
304        Specifies a pagination strategy.
305
306        The value returned from this method is passed to most other methods in this class. Use it to form a request e.g: set headers or query params.
307
308        :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response.
309        """
310        return self._paginator.next_page_token(
311            response=response,
312            last_page_size=last_page_size,
313            last_record=last_record,
314            last_page_token_value=last_page_token_value,
315        )
316
317    def _fetch_next_page(
318        self,
319        stream_state: Mapping[str, Any],
320        stream_slice: StreamSlice,
321        next_page_token: Optional[Mapping[str, Any]] = None,
322    ) -> Optional[requests.Response]:
323        return self.requester.send_request(
324            path=self._paginator_path(
325                next_page_token=next_page_token,
326                stream_state=stream_state,
327                stream_slice=stream_slice,
328            ),
329            stream_state=stream_state,
330            stream_slice=stream_slice,
331            next_page_token=next_page_token,
332            request_headers=self._request_headers(
333                stream_state=stream_state,
334                stream_slice=stream_slice,
335                next_page_token=next_page_token,
336            ),
337            request_params=self._request_params(
338                stream_state=stream_state,
339                stream_slice=stream_slice,
340                next_page_token=next_page_token,
341            ),
342            request_body_data=self._request_body_data(
343                stream_state=stream_state,
344                stream_slice=stream_slice,
345                next_page_token=next_page_token,
346            ),
347            request_body_json=self._request_body_json(
348                stream_state=stream_state,
349                stream_slice=stream_slice,
350                next_page_token=next_page_token,
351            ),
352        )
353
354    # This logic is similar to _read_pages in the HttpStream class. When making changes here, consider making changes there as well.
355    def _read_pages(
356        self,
357        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]],
358        stream_state: Mapping[str, Any],
359        stream_slice: StreamSlice,
360    ) -> Iterable[Record]:
361        pagination_complete = False
362        initial_token = self._paginator.get_initial_token()
363        next_page_token: Optional[Mapping[str, Any]] = (
364            {"next_page_token": initial_token} if initial_token else None
365        )
366        while not pagination_complete:
367            response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
368
369            last_page_size = 0
370            last_record: Optional[Record] = None
371            for record in records_generator_fn(response):
372                last_page_size += 1
373                last_record = record
374                yield record
375
376            if not response:
377                pagination_complete = True
378            else:
379                last_page_token_value = (
380                    next_page_token.get("next_page_token") if next_page_token else None
381                )
382                next_page_token = self._next_page_token(
383                    response=response,
384                    last_page_size=last_page_size,
385                    last_record=last_record,
386                    last_page_token_value=last_page_token_value,
387                )
388                if not next_page_token:
389                    pagination_complete = True
390
391        # Always return an empty generator just in case no records were ever yielded
392        yield from []
393
394    def _read_single_page(
395        self,
396        records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]],
397        stream_state: Mapping[str, Any],
398        stream_slice: StreamSlice,
399    ) -> Iterable[StreamData]:
400        initial_token = stream_state.get("next_page_token")
401        if initial_token is None:
402            initial_token = self._paginator.get_initial_token()
403        next_page_token: Optional[Mapping[str, Any]] = (
404            {"next_page_token": initial_token} if initial_token else None
405        )
406
407        response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
408
409        last_page_size = 0
410        last_record: Optional[Record] = None
411        for record in records_generator_fn(response):
412            last_page_size += 1
413            last_record = record
414            yield record
415
416        if not response:
417            next_page_token = {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
418        else:
419            last_page_token_value = (
420                next_page_token.get("next_page_token") if next_page_token else None
421            )
422            next_page_token = self._next_page_token(
423                response=response,
424                last_page_size=last_page_size,
425                last_record=last_record,
426                last_page_token_value=last_page_token_value,
427            ) or {FULL_REFRESH_SYNC_COMPLETE_KEY: True}
428
429        if self.cursor:
430            self.cursor.close_slice(
431                StreamSlice(cursor_slice=next_page_token, partition=stream_slice.partition)
432            )
433
434        # Always return an empty generator just in case no records were ever yielded
435        yield from []
436
437    def read_records(
438        self,
439        records_schema: Mapping[str, Any],
440        stream_slice: Optional[StreamSlice] = None,
441    ) -> Iterable[StreamData]:
442        """
443        Fetch a stream's records from an HTTP API source
444
445        :param records_schema: json schema to describe record
446        :param stream_slice: The stream slice to read data for
447        :return: The records read from the API source
448        """
449        _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
450
451        most_recent_record_from_slice = None
452        record_generator = partial(
453            self._parse_records,
454            stream_slice=stream_slice,
455            stream_state=self.state or {},
456            records_schema=records_schema,
457        )
458
459        if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
460            stream_state = self.state
461
462            # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
463            # fetch more records. The platform deletes stream state for full refresh streams before starting a
464            # new job, so we don't need to worry about this value existing for the initial attempt
465            if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
466                return
467
468            yield from self._read_single_page(record_generator, stream_state, _slice)
469        else:
470            for stream_data in self._read_pages(record_generator, self.state, _slice):
471                current_record = self._extract_record(stream_data, _slice)
472                if self.cursor and current_record:
473                    self.cursor.observe(_slice, current_record)
474
475                # Latest record read, not necessarily within slice boundaries.
476                # TODO Remove once all custom components implement `observe` method.
477                # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
478                most_recent_record_from_slice = self._get_most_recent_record(
479                    most_recent_record_from_slice, current_record, _slice
480                )
481                yield stream_data
482
483            if self.cursor:
484                self.cursor.close_slice(_slice, most_recent_record_from_slice)
485        return
486
487    def _get_most_recent_record(
488        self,
489        current_most_recent: Optional[Record],
490        current_record: Optional[Record],
491        stream_slice: StreamSlice,
492    ) -> Optional[Record]:
493        if self.cursor and current_record:
494            if not current_most_recent:
495                return current_record
496            else:
497                return (
498                    current_most_recent
499                    if self.cursor.is_greater_than_or_equal(current_most_recent, current_record)
500                    else current_record
501                )
502        else:
503            return None
504
505    def _extract_record(
506        self, stream_data: StreamData, stream_slice: StreamSlice
507    ) -> Optional[Record]:
508        """
509        As we allow the output of _read_pages to be StreamData, it can be multiple things. Therefore, we need to filter out and normalize
510        to data to streamline the rest of the process.
511        """
512        if isinstance(stream_data, Record):
513            # Record is not part of `StreamData` but is the most common implementation of `Mapping[str, Any]` which is part of `StreamData`
514            return stream_data
515        elif isinstance(stream_data, (dict, Mapping)):
516            return Record(
517                data=dict(stream_data), associated_slice=stream_slice, stream_name=self.name
518            )
519        elif isinstance(stream_data, AirbyteMessage) and stream_data.record:
520            return Record(
521                data=stream_data.record.data,  # type:ignore # AirbyteMessage always has record.data
522                associated_slice=stream_slice,
523                stream_name=self.name,
524            )
525        return None
526
527    # stream_slices is defined with arguments on http stream and fixing this has a long tail of dependencies. Will be resolved by the decoupling of http stream and simple retriever
528    def stream_slices(self) -> Iterable[Optional[StreamSlice]]:  # type: ignore
529        """
530        Specifies the slices for this stream. See the stream slicing section of the docs for more information.
531
532        :param sync_mode:
533        :param cursor_field:
534        :param stream_state:
535        :return:
536        """
537        return self.stream_slicer.stream_slices()
538
539    @property
540    def state(self) -> Mapping[str, Any]:
541        return self.cursor.get_stream_state() if self.cursor else {}
542
543    @state.setter
544    def state(self, value: StreamState) -> None:
545        """State setter, accept state serialized by state getter."""
546        if self.cursor:
547            self.cursor.set_initial_state(value)
548
549    def _parse_records(
550        self,
551        response: Optional[requests.Response],
552        stream_state: Mapping[str, Any],
553        records_schema: Mapping[str, Any],
554        stream_slice: Optional[StreamSlice],
555    ) -> Iterable[Record]:
556        yield from self._parse_response(
557            response,
558            stream_slice=stream_slice,
559            stream_state=stream_state,
560            records_schema=records_schema,
561        )
562
563    def must_deduplicate_query_params(self) -> bool:
564        return True
565
566    @staticmethod
567    def _to_partition_key(to_serialize: Any) -> str:
568        # separators have changed in Python 3.4. To avoid being impacted by further change, we explicitly specify our own value
569        return json.dumps(to_serialize, indent=None, separators=(",", ":"), sort_keys=True)

Retrieves records by synchronously sending requests to fetch records.

The retriever acts as an orchestrator between the requester, the record selector, the paginator, and the stream slicer.

For each stream slice, submit requests until there are no more pages of records to fetch.

This retriever currently inherits from HttpStream to reuse the request submission and pagination machinery. As a result, some of the parameters passed to some methods are unused. The two will be decoupled in a future release.

Attributes:
  • stream_name (str): The stream's name
  • stream_primary_key (Optional[Union[str, List[str], List[List[str]]]]): The stream's primary key
  • requester (Requester): The HTTP requester
  • record_selector (HttpSelector): The record selector
  • paginator (Optional[Paginator]): The paginator
  • stream_slicer (Optional[StreamSlicer]): The stream slicer
  • cursor (Optional[cursor]): The cursor
  • parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
SimpleRetriever( requester: Requester, record_selector: airbyte_cdk.sources.declarative.extractors.HttpSelector, config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]], name: str = <property object>, primary_key: Union[str, List[str], List[List[str]], NoneType] = <property object>, paginator: Optional[airbyte_cdk.sources.declarative.requesters.paginators.Paginator] = None, stream_slicer: airbyte_cdk.sources.declarative.stream_slicers.StreamSlicer = <factory>, request_option_provider: airbyte_cdk.sources.declarative.requesters.request_options.RequestOptionsProvider = <factory>, cursor: Optional[airbyte_cdk.sources.declarative.incremental.DeclarativeCursor] = None, ignore_stream_slicer_parameters_on_paginated_requests: bool = False)
requester: Requester
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
name: str
102    @property  # type: ignore
103    def name(self) -> str:
104        """
105        :return: Stream name
106        """
107        return (
108            str(self._name.eval(self.config))
109            if isinstance(self._name, InterpolatedString)
110            else self._name
111        )
Returns

Stream name

primary_key: Union[str, List[str], List[List[str]], NoneType]
286    @property  # type: ignore
287    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
288        """The stream's primary key"""
289        return self._primary_key

The stream's primary key

ignore_stream_slicer_parameters_on_paginated_requests: bool = False
def read_records( self, records_schema: Mapping[str, Any], stream_slice: Optional[StreamSlice] = None) -> Iterable[Union[Mapping[str, Any], AirbyteMessage]]:
437    def read_records(
438        self,
439        records_schema: Mapping[str, Any],
440        stream_slice: Optional[StreamSlice] = None,
441    ) -> Iterable[StreamData]:
442        """
443        Fetch a stream's records from an HTTP API source
444
445        :param records_schema: json schema to describe record
446        :param stream_slice: The stream slice to read data for
447        :return: The records read from the API source
448        """
449        _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
450
451        most_recent_record_from_slice = None
452        record_generator = partial(
453            self._parse_records,
454            stream_slice=stream_slice,
455            stream_state=self.state or {},
456            records_schema=records_schema,
457        )
458
459        if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
460            stream_state = self.state
461
462            # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
463            # fetch more records. The platform deletes stream state for full refresh streams before starting a
464            # new job, so we don't need to worry about this value existing for the initial attempt
465            if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
466                return
467
468            yield from self._read_single_page(record_generator, stream_state, _slice)
469        else:
470            for stream_data in self._read_pages(record_generator, self.state, _slice):
471                current_record = self._extract_record(stream_data, _slice)
472                if self.cursor and current_record:
473                    self.cursor.observe(_slice, current_record)
474
475                # Latest record read, not necessarily within slice boundaries.
476                # TODO Remove once all custom components implement `observe` method.
477                # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
478                most_recent_record_from_slice = self._get_most_recent_record(
479                    most_recent_record_from_slice, current_record, _slice
480                )
481                yield stream_data
482
483            if self.cursor:
484                self.cursor.close_slice(_slice, most_recent_record_from_slice)
485        return

Fetch a stream's records from an HTTP API source

Parameters
  • records_schema: json schema to describe record
  • stream_slice: The stream slice to read data for
Returns

The records read from the API source
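
The page loop driven by _read_pages and _next_page_token follows a simple pattern: fetch a page, yield its records, ask the paginator for the next token, and stop when no token comes back. Below is a dependency-free sketch of that loop; fetch_page and next_token are hypothetical callables standing in for the requester and paginator:

    from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional

    def read_all_pages(
        fetch_page: Callable[[Optional[Mapping[str, Any]]], List[Dict[str, Any]]],
        next_token: Callable[[List[Dict[str, Any]], Optional[Mapping[str, Any]]], Optional[Mapping[str, Any]]],
    ) -> Iterable[Dict[str, Any]]:
        token: Optional[Mapping[str, Any]] = None
        while True:
            records = fetch_page(token)
            yield from records
            token = next_token(records, token)
            if not token:
                break

    # Toy usage: three pre-baked pages, then stop.
    pages = [[{"id": 1}, {"id": 2}], [{"id": 3}, {"id": 4}], [{"id": 5}]]
    fetch = lambda tok: pages[(tok or {}).get("page", 0)]
    nxt = lambda recs, tok: (
        {"page": (tok or {}).get("page", 0) + 1}
        if (tok or {}).get("page", 0) + 1 < len(pages)
        else None
    )
    print(list(read_all_pages(fetch, nxt)))  # ids 1 through 5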

def stream_slices(self) -> Iterable[Optional[StreamSlice]]:
528    def stream_slices(self) -> Iterable[Optional[StreamSlice]]:  # type: ignore
529        """
530        Specifies the slices for this stream. See the stream slicing section of the docs for more information.
531
532        :param sync_mode:
533        :param cursor_field:
534        :param stream_state:
535        :return:
536        """
537        return self.stream_slicer.stream_slices()

Specifies the slices for this stream. See the stream slicing section of the docs for more information.

state: Mapping[str, Any]
539    @property
540    def state(self) -> Mapping[str, Any]:
541        return self.cursor.get_stream_state() if self.cursor else {}

State getter; should return state in a form that can be serialized to a string and sent to the output as a STATE AirbyteMessage.

A good example of a state is a cursor_value: { self.cursor_field: "cursor_value" }

State should be as small as possible, but descriptive enough to restore the syncing process from the point where it stopped.

def must_deduplicate_query_params(self) -> bool:
563    def must_deduplicate_query_params(self) -> bool:
564        return True
@dataclass
class SinglePartitionRouter(airbyte_cdk.sources.declarative.partition_routers.partition_router.PartitionRouter):
13@dataclass
14class SinglePartitionRouter(PartitionRouter):
15    """Partition router returning only a stream slice"""
16
17    parameters: InitVar[Mapping[str, Any]]
18
19    def get_request_params(
20        self,
21        stream_state: Optional[StreamState] = None,
22        stream_slice: Optional[StreamSlice] = None,
23        next_page_token: Optional[Mapping[str, Any]] = None,
24    ) -> Mapping[str, Any]:
25        return {}
26
27    def get_request_headers(
28        self,
29        stream_state: Optional[StreamState] = None,
30        stream_slice: Optional[StreamSlice] = None,
31        next_page_token: Optional[Mapping[str, Any]] = None,
32    ) -> Mapping[str, Any]:
33        return {}
34
35    def get_request_body_data(
36        self,
37        stream_state: Optional[StreamState] = None,
38        stream_slice: Optional[StreamSlice] = None,
39        next_page_token: Optional[Mapping[str, Any]] = None,
40    ) -> Mapping[str, Any]:
41        return {}
42
43    def get_request_body_json(
44        self,
45        stream_state: Optional[StreamState] = None,
46        stream_slice: Optional[StreamSlice] = None,
47        next_page_token: Optional[Mapping[str, Any]] = None,
48    ) -> Mapping[str, Any]:
49        return {}
50
51    def stream_slices(self) -> Iterable[StreamSlice]:
52        yield StreamSlice(partition={}, cursor_slice={})
53
54    def set_initial_state(self, stream_state: StreamState) -> None:
55        """
56        SinglePartitionRouter doesn't have parent streams
57        """
58        pass
59
60    def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
61        """
62        SinglePartitionRouter doesn't have parent streams
63        """
64        pass

Partition router returning only a stream slice

SinglePartitionRouter(parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]])
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
def get_request_params( self, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
19    def get_request_params(
20        self,
21        stream_state: Optional[StreamState] = None,
22        stream_slice: Optional[StreamSlice] = None,
23        next_page_token: Optional[Mapping[str, Any]] = None,
24    ) -> Mapping[str, Any]:
25        return {}

Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.

E.g.: you might want to define query parameters for paging if next_page_token is not None.

def get_request_headers( self, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
27    def get_request_headers(
28        self,
29        stream_state: Optional[StreamState] = None,
30        stream_slice: Optional[StreamSlice] = None,
31        next_page_token: Optional[Mapping[str, Any]] = None,
32    ) -> Mapping[str, Any]:
33        return {}

Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.

def get_request_body_data( self, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
35    def get_request_body_data(
36        self,
37        stream_state: Optional[StreamState] = None,
38        stream_slice: Optional[StreamSlice] = None,
39        next_page_token: Optional[Mapping[str, Any]] = None,
40    ) -> Mapping[str, Any]:
41        return {}

Specifies how to populate the body of the request with a non-JSON payload.

If it returns a string, it will be sent as is. If it returns a dict, it will be converted to a urlencoded form. E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"

Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at a time.

def get_request_body_json( self, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
43    def get_request_body_json(
44        self,
45        stream_state: Optional[StreamState] = None,
46        stream_slice: Optional[StreamSlice] = None,
47        next_page_token: Optional[Mapping[str, Any]] = None,
48    ) -> Mapping[str, Any]:
49        return {}

Specifies how to populate the body of the request with a JSON payload.

Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at a time.

def stream_slices(self) -> Iterable[StreamSlice]:
51    def stream_slices(self) -> Iterable[StreamSlice]:
52        yield StreamSlice(partition={}, cursor_slice={})

Defines stream slices

Returns

An iterable of stream slices
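
To make this concrete, here is a minimal usage sketch. It assumes SinglePartitionRouter can be imported from airbyte_cdk.sources.declarative.partition_routers and simply passes an empty parameters mapping:

    from airbyte_cdk.sources.declarative.partition_routers import SinglePartitionRouter

    # The dataclass only takes the `parameters` InitVar; an empty mapping is enough here.
    router = SinglePartitionRouter(parameters={})

    # Exactly one empty slice is produced, and all request-option hooks return empty mappings.
    slices = list(router.stream_slices())
    assert len(slices) == 1
    assert slices[0].partition == {} and slices[0].cursor_slice == {}
    assert router.get_request_params() == {}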

def set_initial_state(self, stream_state: Mapping[str, Any]) -> None:
54    def set_initial_state(self, stream_state: StreamState) -> None:
55        """
56        SinglePartitionRouter doesn't have parent streams
57        """
58        pass

SinglePartitionRouter doesn't have parent streams

def get_stream_state(self) -> Optional[Mapping[str, Mapping[str, Any]]]:
60    def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
61        """
62        SinglePartitionRouter doesn't have parent streams
63        """
64        pass

SinglePartitionRouter doesn't have parent streams

class StopConditionPaginationStrategyDecorator(airbyte_cdk.PaginationStrategy):
42class StopConditionPaginationStrategyDecorator(PaginationStrategy):
43    def __init__(self, _delegate: PaginationStrategy, stop_condition: PaginationStopCondition):
44        self._delegate = _delegate
45        self._stop_condition = stop_condition
46
47    def next_page_token(
48        self,
49        response: requests.Response,
50        last_page_size: int,
51        last_record: Optional[Record],
52        last_page_token_value: Optional[Any] = None,
53    ) -> Optional[Any]:
54        # We evaluate in reverse order because the assumption is that most of the APIs using data feed structure
55        # will return records in descending order. In terms of performance/memory, we return the records lazily
56        if last_record and self._stop_condition.is_met(last_record):
57            return None
58        return self._delegate.next_page_token(
59            response, last_page_size, last_record, last_page_token_value
60        )
61
62    def get_page_size(self) -> Optional[int]:
63        return self._delegate.get_page_size()
64
65    @property
66    def initial_token(self) -> Optional[Any]:
67        return self._delegate.initial_token

Defines how to get the next page token

StopConditionPaginationStrategyDecorator( _delegate: PaginationStrategy, stop_condition: airbyte_cdk.sources.declarative.requesters.paginators.strategies.stop_condition.PaginationStopCondition)
43    def __init__(self, _delegate: PaginationStrategy, stop_condition: PaginationStopCondition):
44        self._delegate = _delegate
45        self._stop_condition = stop_condition
def next_page_token( self, response: requests.models.Response, last_page_size: int, last_record: Optional[Record], last_page_token_value: Optional[Any] = None) -> Optional[Any]:
47    def next_page_token(
48        self,
49        response: requests.Response,
50        last_page_size: int,
51        last_record: Optional[Record],
52        last_page_token_value: Optional[Any] = None,
53    ) -> Optional[Any]:
54        # We evaluate in reverse order because the assumption is that most of the APIs using data feed structure
55        # will return records in descending order. In terms of performance/memory, we return the records lazily
56        if last_record and self._stop_condition.is_met(last_record):
57            return None
58        return self._delegate.next_page_token(
59            response, last_page_size, last_record, last_page_token_value
60        )
Parameters
  • response: response to process
  • last_page_size: the number of records read from the response
  • last_record: the last record extracted from the response
  • last_page_token_value: The current value of the page token made on the last request
Returns

next page token. Returns None if there are no more pages to fetch

def get_page_size(self) -> Optional[int]:
62    def get_page_size(self) -> Optional[int]:
63        return self._delegate.get_page_size()
Returns

page size: The number of records to fetch in a page. Returns None if unspecified

initial_token: Optional[Any]
65    @property
66    def initial_token(self) -> Optional[Any]:
67        return self._delegate.initial_token

Return the initial value of the token
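
As a rough sketch of how the decorator is wired: the delegate and stop condition below are hypothetical stand-ins that only implement the calls the decorator makes, and the import path is assumed; a real connector would pass concrete PaginationStrategy and PaginationStopCondition implementations.

    from airbyte_cdk.sources.declarative.requesters.paginators.strategies.stop_condition import (
        StopConditionPaginationStrategyDecorator,
    )

    class OffsetDelegate:
        """Hypothetical delegate strategy: advances an offset by the last page size."""

        initial_token = 0

        def next_page_token(self, response, last_page_size, last_record, last_page_token_value=None):
            return (last_page_token_value or 0) + last_page_size if last_page_size else None

        def get_page_size(self):
            return 100

    class UpdatedBefore:
        """Hypothetical stop condition: met once the last record is older than a threshold."""

        def __init__(self, threshold: str):
            self._threshold = threshold

        def is_met(self, record) -> bool:
            return record.data.get("updated_at", "") < self._threshold

    paginator = StopConditionPaginationStrategyDecorator(
        _delegate=OffsetDelegate(), stop_condition=UpdatedBefore("2023-01-01")
    )
    # next_page_token returns None as soon as the stop condition is met for the last
    # record; otherwise it defers to the delegate strategy.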

class StreamSlice(typing.Mapping[str, typing.Any]):
 67class StreamSlice(Mapping[str, Any]):
 68    def __init__(
 69        self,
 70        *,
 71        partition: Mapping[str, Any],
 72        cursor_slice: Mapping[str, Any],
 73        extra_fields: Optional[Mapping[str, Any]] = None,
 74    ) -> None:
 75        """
 76        :param partition: The partition keys representing a unique partition in the stream.
 77        :param cursor_slice: The incremental cursor slice keys, such as dates or pagination tokens.
 78        :param extra_fields: Additional fields that should not be part of the partition but passed along, such as metadata from the parent stream.
 79        """
 80        self._partition = partition
 81        self._cursor_slice = cursor_slice
 82        self._extra_fields = extra_fields or {}
 83
 84        # Ensure that partition keys do not overlap with cursor slice keys
 85        if partition.keys() & cursor_slice.keys():
 86            raise ValueError("Keys for partition and incremental sync cursor should not overlap")
 87
 88        self._stream_slice = dict(partition) | dict(cursor_slice)
 89
 90    @property
 91    def partition(self) -> Mapping[str, Any]:
 92        """Returns the partition portion of the stream slice."""
 93        p = self._partition
 94        while isinstance(p, StreamSlice):
 95            p = p.partition
 96        return p
 97
 98    @property
 99    def cursor_slice(self) -> Mapping[str, Any]:
100        """Returns the cursor slice portion of the stream slice."""
101        c = self._cursor_slice
102        while isinstance(c, StreamSlice):
103            c = c.cursor_slice
104        return c
105
106    @property
107    def extra_fields(self) -> Mapping[str, Any]:
108        """Returns the extra fields that are not part of the partition."""
109        return self._extra_fields
110
111    def __repr__(self) -> str:
112        return repr(self._stream_slice)
113
114    def __setitem__(self, key: str, value: Any) -> None:
115        raise ValueError("StreamSlice is immutable")
116
117    def __getitem__(self, key: str) -> Any:
118        return self._stream_slice[key]
119
120    def __len__(self) -> int:
121        return len(self._stream_slice)
122
123    def __iter__(self) -> Iterator[str]:
124        return iter(self._stream_slice)
125
126    def __contains__(self, item: Any) -> bool:
127        return item in self._stream_slice
128
129    def keys(self) -> KeysView[str]:
130        return self._stream_slice.keys()
131
132    def items(self) -> ItemsView[str, Any]:
133        return self._stream_slice.items()
134
135    def values(self) -> ValuesView[Any]:
136        return self._stream_slice.values()
137
138    def get(self, key: str, default: Any = None) -> Optional[Any]:
139        return self._stream_slice.get(key, default)
140
141    def __eq__(self, other: Any) -> bool:
142        if isinstance(other, dict):
143            return self._stream_slice == other
144        if isinstance(other, StreamSlice):
145            # noinspection PyProtectedMember
146            return self._partition == other._partition and self._cursor_slice == other._cursor_slice
147        return False
148
149    def __ne__(self, other: Any) -> bool:
150        return not self.__eq__(other)
151
152    def __json_serializable__(self) -> Any:
153        return self._stream_slice
154
155    def __hash__(self) -> int:
156        return SliceHasher.hash(
157            stream_slice=self._stream_slice
158        )  # no need to provide stream_name here as this is used for slicing the cursor
159
160    def __bool__(self) -> bool:
161        return bool(self._stream_slice) or bool(self._extra_fields)

An immutable Mapping over the union of a slice's partition keys and cursor slice keys, with optional extra fields that are carried alongside the slice but are not part of the mapping itself.

StreamSlice( *, partition: Mapping[str, Any], cursor_slice: Mapping[str, Any], extra_fields: Optional[Mapping[str, Any]] = None)
68    def __init__(
69        self,
70        *,
71        partition: Mapping[str, Any],
72        cursor_slice: Mapping[str, Any],
73        extra_fields: Optional[Mapping[str, Any]] = None,
74    ) -> None:
75        """
76        :param partition: The partition keys representing a unique partition in the stream.
77        :param cursor_slice: The incremental cursor slice keys, such as dates or pagination tokens.
78        :param extra_fields: Additional fields that should not be part of the partition but passed along, such as metadata from the parent stream.
79        """
80        self._partition = partition
81        self._cursor_slice = cursor_slice
82        self._extra_fields = extra_fields or {}
83
84        # Ensure that partition keys do not overlap with cursor slice keys
85        if partition.keys() & cursor_slice.keys():
86            raise ValueError("Keys for partition and incremental sync cursor should not overlap")
87
88        self._stream_slice = dict(partition) | dict(cursor_slice)
Parameters
  • partition: The partition keys representing a unique partition in the stream.
  • cursor_slice: The incremental cursor slice keys, such as dates or pagination tokens.
  • extra_fields: Additional fields that should not be part of the partition but passed along, such as metadata from the parent stream.
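
A small illustrative example (field names are arbitrary, and the import location is assumed to be airbyte_cdk.sources.types):

    from airbyte_cdk.sources.types import StreamSlice  # assumed import location

    s = StreamSlice(
        partition={"project_id": 42},
        cursor_slice={"start": "2024-01-01", "end": "2024-01-31"},
        extra_fields={"project_name": "apollo"},
    )

    s["project_id"]   # 42 -- the slice acts as a read-only mapping over partition | cursor_slice
    s.partition       # {"project_id": 42}
    s.cursor_slice    # {"start": "2024-01-01", "end": "2024-01-31"}
    s.extra_fields    # {"project_name": "apollo"} -- carried along, but not part of the mapping
    # Assigning to a key raises ValueError ("StreamSlice is immutable"), and overlapping keys
    # between partition and cursor_slice raise ValueError at construction time.
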
partition: Mapping[str, Any]
90    @property
91    def partition(self) -> Mapping[str, Any]:
92        """Returns the partition portion of the stream slice."""
93        p = self._partition
94        while isinstance(p, StreamSlice):
95            p = p.partition
96        return p

Returns the partition portion of the stream slice.

cursor_slice: Mapping[str, Any]
 98    @property
 99    def cursor_slice(self) -> Mapping[str, Any]:
100        """Returns the cursor slice portion of the stream slice."""
101        c = self._cursor_slice
102        while isinstance(c, StreamSlice):
103            c = c.cursor_slice
104        return c

Returns the cursor slice portion of the stream slice.

extra_fields: Mapping[str, Any]
106    @property
107    def extra_fields(self) -> Mapping[str, Any]:
108        """Returns the extra fields that are not part of the partition."""
109        return self._extra_fields

Returns the extra fields that are not part of the partition.

def keys(self) -> KeysView[str]:
129    def keys(self) -> KeysView[str]:
130        return self._stream_slice.keys()

D.keys() -> a set-like object providing a view on D's keys

def items(self) -> ItemsView[str, Any]:
132    def items(self) -> ItemsView[str, Any]:
133        return self._stream_slice.items()

D.items() -> a set-like object providing a view on D's items

def values(self) -> ValuesView[Any]:
135    def values(self) -> ValuesView[Any]:
136        return self._stream_slice.values()

D.values() -> an object providing a view on D's values

def get(self, key: str, default: Any = None) -> Optional[Any]:
138    def get(self, key: str, default: Any = None) -> Optional[Any]:
139        return self._stream_slice.get(key, default)

D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.

@dataclass
class SubstreamPartitionRouter(airbyte_cdk.sources.declarative.partition_routers.partition_router.PartitionRouter):
 80@dataclass
 81class SubstreamPartitionRouter(PartitionRouter):
 82    """
 83    Partition router that iterates over the parent's stream records and emits slices
 84    Will populate the state with `partition_field` and `parent_slice` so they can be accessed by other components
 85
 86    Attributes:
 87        parent_stream_configs (List[ParentStreamConfig]): parent streams to iterate over and their config
 88    """
 89
 90    parent_stream_configs: List[ParentStreamConfig]
 91    config: Config
 92    parameters: InitVar[Mapping[str, Any]]
 93
 94    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
 95        if not self.parent_stream_configs:
 96            raise ValueError("SubstreamPartitionRouter needs at least 1 parent stream")
 97        self._parameters = parameters
 98
 99    def get_request_params(
100        self,
101        stream_state: Optional[StreamState] = None,
102        stream_slice: Optional[StreamSlice] = None,
103        next_page_token: Optional[Mapping[str, Any]] = None,
104    ) -> Mapping[str, Any]:
105        # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response
106        return self._get_request_option(RequestOptionType.request_parameter, stream_slice)
107
108    def get_request_headers(
109        self,
110        stream_state: Optional[StreamState] = None,
111        stream_slice: Optional[StreamSlice] = None,
112        next_page_token: Optional[Mapping[str, Any]] = None,
113    ) -> Mapping[str, Any]:
114        # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response
115        return self._get_request_option(RequestOptionType.header, stream_slice)
116
117    def get_request_body_data(
118        self,
119        stream_state: Optional[StreamState] = None,
120        stream_slice: Optional[StreamSlice] = None,
121        next_page_token: Optional[Mapping[str, Any]] = None,
122    ) -> Mapping[str, Any]:
123        # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response
124        return self._get_request_option(RequestOptionType.body_data, stream_slice)
125
126    def get_request_body_json(
127        self,
128        stream_state: Optional[StreamState] = None,
129        stream_slice: Optional[StreamSlice] = None,
130        next_page_token: Optional[Mapping[str, Any]] = None,
131    ) -> Mapping[str, Any]:
132        # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response
133        return self._get_request_option(RequestOptionType.body_json, stream_slice)
134
135    def _get_request_option(
136        self, option_type: RequestOptionType, stream_slice: Optional[StreamSlice]
137    ) -> Mapping[str, Any]:
138        params: MutableMapping[str, Any] = {}
139        if stream_slice:
140            for parent_config in self.parent_stream_configs:
141                if (
142                    parent_config.request_option
143                    and parent_config.request_option.inject_into == option_type
144                ):
145                    key = parent_config.partition_field.eval(self.config)  # type: ignore # partition_field is always casted to an interpolated string
146                    value = stream_slice.get(key)
147                    if value:
148                        parent_config.request_option.inject_into_request(params, value, self.config)
149        return params
150
151    def stream_slices(self) -> Iterable[StreamSlice]:
152        """
153        Iterate over each parent stream's record and create a StreamSlice for each record.
154
155        For each stream, iterate over its stream_slices.
156        For each stream slice, iterate over each record.
157        yield a stream slice for each such records.
158
159        If a parent slice contains no record, emit a slice with parent_record=None.
160
161        The template string can interpolate the following values:
162        - parent_stream_slice: mapping representing the parent's stream slice
163        - parent_record: mapping representing the parent record
164        - parent_stream_name: string representing the parent stream name
165        """
166        if not self.parent_stream_configs:
167            yield from []
168        else:
169            for parent_stream_config in self.parent_stream_configs:
170                parent_stream = parent_stream_config.stream
171                parent_field = parent_stream_config.parent_key.eval(self.config)  # type: ignore # parent_key is always casted to an interpolated string
172                partition_field = parent_stream_config.partition_field.eval(self.config)  # type: ignore # partition_field is always casted to an interpolated string
173                extra_fields = None
174                if parent_stream_config.extra_fields:
175                    extra_fields = [
176                        [field_path_part.eval(self.config) for field_path_part in field_path]  # type: ignore [union-attr]
177                        for field_path in parent_stream_config.extra_fields
178                    ]
179
180                # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does
181                # not support either substreams or RFR, but something that needs to be considered once we do
182                for parent_record in parent_stream.read_only_records():
183                    parent_partition = None
184                    # Skip non-records (eg AirbyteLogMessage)
185                    if isinstance(parent_record, AirbyteMessage):
186                        self.logger.warning(
187                            f"Parent stream {parent_stream.name} returns records of type AirbyteMessage. This SubstreamPartitionRouter is not able to checkpoint incremental parent state."
188                        )
189                        if parent_record.type == MessageType.RECORD:
190                            parent_record = parent_record.record.data  # type: ignore[union-attr, assignment]  # record is always a Record
191                        else:
192                            continue
193                    elif isinstance(parent_record, Record):
194                        parent_partition = (
195                            parent_record.associated_slice.partition
196                            if parent_record.associated_slice
197                            else {}
198                        )
199                        parent_record = parent_record.data
200                    elif not isinstance(parent_record, Mapping):
201                        # The parent_record should only take the form of a Record, AirbyteMessage, or Mapping. Anything else is invalid
202                        raise AirbyteTracedException(
203                            message=f"Parent stream returned records as invalid type {type(parent_record)}"
204                        )
205                    try:
206                        partition_value = dpath.get(
207                            parent_record,  # type: ignore [arg-type]
208                            parent_field,
209                        )
210                    except KeyError:
211                        continue
212
213                    # Add extra fields
214                    extracted_extra_fields = self._extract_extra_fields(parent_record, extra_fields)
215
216                    if parent_stream_config.lazy_read_pointer:
217                        extracted_extra_fields = {
218                            "child_response": self._extract_child_response(
219                                parent_record,
220                                parent_stream_config.lazy_read_pointer,  # type: ignore[arg-type]  # lazy_read_pointer type handled in __post_init__ of parent_stream_config
221                            ),
222                            **extracted_extra_fields,
223                        }
224
225                    yield StreamSlice(
226                        partition={
227                            partition_field: partition_value,
228                            "parent_slice": parent_partition or {},
229                        },
230                        cursor_slice={},
231                        extra_fields=extracted_extra_fields,
232                    )
233
234    def _extract_child_response(
235        self, parent_record: Mapping[str, Any] | AirbyteMessage, pointer: List[InterpolatedString]
236    ) -> requests.Response:
237        """Extract child records from a parent record based on lazy pointers."""
238
239        def _create_response(data: MutableMapping[str, Any]) -> SafeResponse:
240            """Create a SafeResponse with the given data."""
241            response = SafeResponse()
242            response.content = json.dumps(data).encode("utf-8")
243            response.status_code = 200
244            return response
245
246        path = [path.eval(self.config) for path in pointer]
247        return _create_response(dpath.get(parent_record, path, default=[]))  # type: ignore # argument will be a MutableMapping, given input data structure
248
249    def _extract_extra_fields(
250        self,
251        parent_record: Mapping[str, Any] | AirbyteMessage,
252        extra_fields: Optional[List[List[str]]] = None,
253    ) -> Mapping[str, Any]:
254        """
255        Extracts additional fields specified by their paths from the parent record.
256
257        Args:
258            parent_record (Mapping[str, Any]): The record from the parent stream to extract fields from.
259            extra_fields (Optional[List[List[str]]]): A list of field paths (as lists of strings) to extract from the parent record.
260
261        Returns:
262            Mapping[str, Any]: A dictionary containing the extracted fields.
263                               The keys are the joined field paths, and the values are the corresponding extracted values.
264        """
265        extracted_extra_fields = {}
266        if extra_fields:
267            for extra_field_path in extra_fields:
268                try:
269                    extra_field_value = dpath.get(
270                        parent_record,  # type: ignore [arg-type]
271                        extra_field_path,
272                    )
273                    self.logger.debug(
274                        f"Extracted extra_field_path: {extra_field_path} with value: {extra_field_value}"
275                    )
276                except KeyError:
277                    self.logger.debug(f"Failed to extract extra_field_path: {extra_field_path}")
278                    extra_field_value = None
279                extracted_extra_fields[".".join(extra_field_path)] = extra_field_value
280        return extracted_extra_fields
281
282    def set_initial_state(self, stream_state: StreamState) -> None:
283        """
284        Set the state of the parent streams.
285
286        If the `parent_state` key is missing from `stream_state`, migrate the child stream state to the parent stream's state format.
287        This migration applies only to parent streams with incremental dependencies.
288
289        Args:
290            stream_state (StreamState): The state of the streams to be set.
291
292        Example of state format:
293        {
294            "parent_state": {
295                "parent_stream_name1": {
296                    "last_updated": "2023-05-27T00:00:00Z"
297                },
298                "parent_stream_name2": {
299                    "last_updated": "2023-05-27T00:00:00Z"
300                }
301            }
302        }
303
304        Example of migrating to parent state format:
305        - Initial state:
306        {
307            "updated_at": "2023-05-27T00:00:00Z"
308        }
309        - After migration:
310        {
311            "updated_at": "2023-05-27T00:00:00Z",
312            "parent_state": {
313                "parent_stream_name": {
314                    "parent_stream_cursor": "2023-05-27T00:00:00Z"
315                }
316            }
317        }
318        """
319        if not stream_state:
320            return
321
322        parent_state = stream_state.get("parent_state", {})
323
324        # Set state for each parent stream with an incremental dependency
325        for parent_config in self.parent_stream_configs:
326            if (
327                not parent_state.get(parent_config.stream.name, {})
328                and parent_config.incremental_dependency
329            ):
330                # Migrate child state to parent state format
331                parent_state = self._migrate_child_state_to_parent_state(stream_state)
332
333            if parent_config.incremental_dependency:
334                parent_config.stream.state = parent_state.get(parent_config.stream.name, {})
335
336    def _migrate_child_state_to_parent_state(self, stream_state: StreamState) -> StreamState:
337        """
338        Migrate the child or global stream state into the parent stream's state format.
339
340        This method converts the child stream state—or, if present, the global state—into a format that is
341        compatible with parent streams that use incremental synchronization. The migration occurs only for
342        parent streams with incremental dependencies. It filters out per-partition states and retains only the
343        global state in the form {cursor_field: cursor_value}.
344
345        The method supports multiple input formats:
346          - A simple global state, e.g.:
347                {"updated_at": "2023-05-27T00:00:00Z"}
348          - A state object that contains a "state" key (which is assumed to hold the global state), e.g.:
349                {"state": {"updated_at": "2023-05-27T00:00:00Z"}, ...}
350            In this case, the migration uses the first value from the "state" dictionary.
351          - Any per-partition state formats or other non-simple structures are ignored during migration.
352
353        Args:
354            stream_state (StreamState): The state to migrate. Expected formats include:
355                - {"updated_at": "2023-05-27T00:00:00Z"}
356                - {"state": {"updated_at": "2023-05-27T00:00:00Z"}, ...}
357                  (In this format, only the first global state value is used, and per-partition states are ignored.)
358
359        Returns:
360            StreamState: A migrated state for parent streams in the format:
361                {
362                    "parent_stream_name": {"parent_stream_cursor": "2023-05-27T00:00:00Z"}
363                }
364            where each parent stream with an incremental dependency is assigned its corresponding cursor value.
365
366        Example:
367            Input: {"updated_at": "2023-05-27T00:00:00Z"}
368            Output: {
369                "parent_stream_name": {"parent_stream_cursor": "2023-05-27T00:00:00Z"}
370            }
371        """
372        substream_state_values = list(stream_state.values())
373        substream_state = substream_state_values[0] if substream_state_values else {}
374
375        # Ignore per-partition states or invalid formats.
376        if isinstance(substream_state, (list, dict)) or len(substream_state_values) != 1:
377            # If a global state is present under the key "state", use its first value.
378            if "state" in stream_state and isinstance(stream_state["state"], dict):
379                substream_state = list(stream_state["state"].values())[0]
380            else:
381                return {}
382
383        # Build the parent state for all parent streams with incremental dependencies.
384        parent_state = {}
385        if substream_state:
386            for parent_config in self.parent_stream_configs:
387                if parent_config.incremental_dependency:
388                    parent_state[parent_config.stream.name] = {
389                        parent_config.stream.cursor_field: substream_state
390                    }
391
392        return parent_state
393
394    def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
395        """
396        Get the state of the parent streams.
397
398        Returns:
399            StreamState: The current state of the parent streams.
400
401        Example of state format:
402        {
403            "parent_stream_name1": {
404                "last_updated": "2023-05-27T00:00:00Z"
405            },
406            "parent_stream_name2": {
407                "last_updated": "2023-05-27T00:00:00Z"
408            }
409        }
410        """
411        parent_state = {}
412        for parent_config in self.parent_stream_configs:
413            if parent_config.incremental_dependency:
414                parent_state[parent_config.stream.name] = copy.deepcopy(parent_config.stream.state)
415        return parent_state
416
417    @property
418    def logger(self) -> logging.Logger:
419        return logging.getLogger("airbyte.SubstreamPartitionRouter")

Partition router that iterates over the parent stream's records and emits slices. It will populate the state with partition_field and parent_slice so they can be accessed by other components.

Attributes:
  • parent_stream_configs (List[ParentStreamConfig]): parent streams to iterate over and their config
SubstreamPartitionRouter( parent_stream_configs: List[ParentStreamConfig], config: Mapping[str, Any], parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]])
parent_stream_configs: List[ParentStreamConfig]
config: Mapping[str, Any]
parameters: dataclasses.InitVar[typing.Mapping[str, typing.Any]]
def get_request_params( self, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
 99    def get_request_params(
100        self,
101        stream_state: Optional[StreamState] = None,
102        stream_slice: Optional[StreamSlice] = None,
103        next_page_token: Optional[Mapping[str, Any]] = None,
104    ) -> Mapping[str, Any]:
105        # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response
106        return self._get_request_option(RequestOptionType.request_parameter, stream_slice)

Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.

E.g.: you might want to define query parameters for paging if next_page_token is not None.

def get_request_headers( self, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
108    def get_request_headers(
109        self,
110        stream_state: Optional[StreamState] = None,
111        stream_slice: Optional[StreamSlice] = None,
112        next_page_token: Optional[Mapping[str, Any]] = None,
113    ) -> Mapping[str, Any]:
114        # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response
115        return self._get_request_option(RequestOptionType.header, stream_slice)

Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.

def get_request_body_data( self, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
117    def get_request_body_data(
118        self,
119        stream_state: Optional[StreamState] = None,
120        stream_slice: Optional[StreamSlice] = None,
121        next_page_token: Optional[Mapping[str, Any]] = None,
122    ) -> Mapping[str, Any]:
123        # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response
124        return self._get_request_option(RequestOptionType.body_data, stream_slice)

Specifies how to populate the body of the request with a non-JSON payload.

If it returns a string, it will be sent as is. If it returns a dict, it will be converted to a urlencoded form. E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"

Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at a time.

def get_request_body_json( self, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
126    def get_request_body_json(
127        self,
128        stream_state: Optional[StreamState] = None,
129        stream_slice: Optional[StreamSlice] = None,
130        next_page_token: Optional[Mapping[str, Any]] = None,
131    ) -> Mapping[str, Any]:
132        # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response
133        return self._get_request_option(RequestOptionType.body_json, stream_slice)

Specifies how to populate the body of the request with a JSON payload.

Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at a time.

def stream_slices(self) -> Iterable[StreamSlice]:
151    def stream_slices(self) -> Iterable[StreamSlice]:
152        """
153        Iterate over each parent stream's record and create a StreamSlice for each record.
154
155        For each stream, iterate over its stream_slices.
156        For each stream slice, iterate over each record.
157        yield a stream slice for each such records.
158
159        If a parent slice contains no record, emit a slice with parent_record=None.
160
161        The template string can interpolate the following values:
162        - parent_stream_slice: mapping representing the parent's stream slice
163        - parent_record: mapping representing the parent record
164        - parent_stream_name: string representing the parent stream name
165        """
166        if not self.parent_stream_configs:
167            yield from []
168        else:
169            for parent_stream_config in self.parent_stream_configs:
170                parent_stream = parent_stream_config.stream
171                parent_field = parent_stream_config.parent_key.eval(self.config)  # type: ignore # parent_key is always casted to an interpolated string
172                partition_field = parent_stream_config.partition_field.eval(self.config)  # type: ignore # partition_field is always casted to an interpolated string
173                extra_fields = None
174                if parent_stream_config.extra_fields:
175                    extra_fields = [
176                        [field_path_part.eval(self.config) for field_path_part in field_path]  # type: ignore [union-attr]
177                        for field_path in parent_stream_config.extra_fields
178                    ]
179
180                # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does
181                # not support either substreams or RFR, but something that needs to be considered once we do
182                for parent_record in parent_stream.read_only_records():
183                    parent_partition = None
184                    # Skip non-records (eg AirbyteLogMessage)
185                    if isinstance(parent_record, AirbyteMessage):
186                        self.logger.warning(
187                            f"Parent stream {parent_stream.name} returns records of type AirbyteMessage. This SubstreamPartitionRouter is not able to checkpoint incremental parent state."
188                        )
189                        if parent_record.type == MessageType.RECORD:
190                            parent_record = parent_record.record.data  # type: ignore[union-attr, assignment]  # record is always a Record
191                        else:
192                            continue
193                    elif isinstance(parent_record, Record):
194                        parent_partition = (
195                            parent_record.associated_slice.partition
196                            if parent_record.associated_slice
197                            else {}
198                        )
199                        parent_record = parent_record.data
200                    elif not isinstance(parent_record, Mapping):
201                        # The parent_record should only take the form of a Record, AirbyteMessage, or Mapping. Anything else is invalid
202                        raise AirbyteTracedException(
203                            message=f"Parent stream returned records as invalid type {type(parent_record)}"
204                        )
205                    try:
206                        partition_value = dpath.get(
207                            parent_record,  # type: ignore [arg-type]
208                            parent_field,
209                        )
210                    except KeyError:
211                        continue
212
213                    # Add extra fields
214                    extracted_extra_fields = self._extract_extra_fields(parent_record, extra_fields)
215
216                    if parent_stream_config.lazy_read_pointer:
217                        extracted_extra_fields = {
218                            "child_response": self._extract_child_response(
219                                parent_record,
220                                parent_stream_config.lazy_read_pointer,  # type: ignore[arg-type]  # lazy_read_pointer type handled in __post_init__ of parent_stream_config
221                            ),
222                            **extracted_extra_fields,
223                        }
224
225                    yield StreamSlice(
226                        partition={
227                            partition_field: partition_value,
228                            "parent_slice": parent_partition or {},
229                        },
230                        cursor_slice={},
231                        extra_fields=extracted_extra_fields,
232                    )

Iterate over each parent stream's record and create a StreamSlice for each record.

For each stream, iterate over its stream_slices. For each stream slice, iterate over each record, and yield a stream slice for each such record.

If a parent slice contains no record, emit a slice with parent_record=None.

The template string can interpolate the following values:

  • parent_stream_slice: mapping representing the parent's stream slice
  • parent_record: mapping representing the parent record
  • parent_stream_name: string representing the parent stream name
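
For example, with a hypothetical parent stream "projects" whose records look like {"id": 42, "name": "apollo"}, a parent_key of "id", a partition_field of "project_id", and ["name"] listed under extra_fields, each yielded slice would look roughly like:

    StreamSlice(
        partition={"project_id": 42, "parent_slice": {}},  # "parent_slice" carries the parent's own partition
        cursor_slice={},
        extra_fields={"name": "apollo"},
    )
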
def set_initial_state(self, stream_state: Mapping[str, Any]) -> None:
282    def set_initial_state(self, stream_state: StreamState) -> None:
283        """
284        Set the state of the parent streams.
285
286        If the `parent_state` key is missing from `stream_state`, migrate the child stream state to the parent stream's state format.
287        This migration applies only to parent streams with incremental dependencies.
288
289        Args:
290            stream_state (StreamState): The state of the streams to be set.
291
292        Example of state format:
293        {
294            "parent_state": {
295                "parent_stream_name1": {
296                    "last_updated": "2023-05-27T00:00:00Z"
297                },
298                "parent_stream_name2": {
299                    "last_updated": "2023-05-27T00:00:00Z"
300                }
301            }
302        }
303
304        Example of migrating to parent state format:
305        - Initial state:
306        {
307            "updated_at": "2023-05-27T00:00:00Z"
308        }
309        - After migration:
310        {
311            "updated_at": "2023-05-27T00:00:00Z",
312            "parent_state": {
313                "parent_stream_name": {
314                    "parent_stream_cursor": "2023-05-27T00:00:00Z"
315                }
316            }
317        }
318        """
319        if not stream_state:
320            return
321
322        parent_state = stream_state.get("parent_state", {})
323
324        # Set state for each parent stream with an incremental dependency
325        for parent_config in self.parent_stream_configs:
326            if (
327                not parent_state.get(parent_config.stream.name, {})
328                and parent_config.incremental_dependency
329            ):
330                # Migrate child state to parent state format
331                parent_state = self._migrate_child_state_to_parent_state(stream_state)
332
333            if parent_config.incremental_dependency:
334                parent_config.stream.state = parent_state.get(parent_config.stream.name, {})

Set the state of the parent streams.

If the parent_state key is missing from stream_state, migrate the child stream state to the parent stream's state format. This migration applies only to parent streams with incremental dependencies.

Arguments:
  • stream_state (StreamState): The state of the streams to be set.

Example of state format: { "parent_state": { "parent_stream_name1": { "last_updated": "2023-05-27T00:00:00Z" }, "parent_stream_name2": { "last_updated": "2023-05-27T00:00:00Z" } } }

Example of migrating to parent state format:

  • Initial state: { "updated_at": "2023-05-27T00:00:00Z" }
  • After migration: { "updated_at": "2023-05-27T00:00:00Z", "parent_state": { "parent_stream_name": { "parent_stream_cursor": "2023-05-27T00:00:00Z" } } }
def get_stream_state(self) -> Optional[Mapping[str, Mapping[str, Any]]]:
394    def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
395        """
396        Get the state of the parent streams.
397
398        Returns:
399            StreamState: The current state of the parent streams.
400
401        Example of state format:
402        {
403            "parent_stream_name1": {
404                "last_updated": "2023-05-27T00:00:00Z"
405            },
406            "parent_stream_name2": {
407                "last_updated": "2023-05-27T00:00:00Z"
408            }
409        }
410        """
411        parent_state = {}
412        for parent_config in self.parent_stream_configs:
413            if parent_config.incremental_dependency:
414                parent_state[parent_config.stream.name] = copy.deepcopy(parent_config.stream.state)
415        return parent_state

Get the state of the parent streams.

Returns:

StreamState: The current state of the parent streams.

Example of state format: { "parent_stream_name1": { "last_updated": "2023-05-27T00:00:00Z" }, "parent_stream_name2": { "last_updated": "2023-05-27T00:00:00Z" } }

logger: logging.Logger
417    @property
418    def logger(self) -> logging.Logger:
419        return logging.getLogger("airbyte.SubstreamPartitionRouter")
18class YamlDeclarativeSource(ConcurrentDeclarativeSource[List[AirbyteStateMessage]]):
19    """Declarative source defined by a yaml file"""
20
21    def __init__(
22        self,
23        path_to_yaml: str,
24        debug: bool = False,
25        catalog: Optional[ConfiguredAirbyteCatalog] = None,
26        config: Optional[Mapping[str, Any]] = None,
27        state: Optional[List[AirbyteStateMessage]] = None,
28    ) -> None:
29        """
30        :param path_to_yaml: Path to the yaml file describing the source
31        """
32        self._path_to_yaml = path_to_yaml
33        source_config = self._read_and_parse_yaml_file(path_to_yaml)
34
35        super().__init__(
36            catalog=catalog or ConfiguredAirbyteCatalog(streams=[]),
37            config=config or {},
38            state=state or [],
39            source_config=source_config,
40        )
41
42    def _read_and_parse_yaml_file(self, path_to_yaml_file: str) -> ConnectionDefinition:
43        try:
44            # For testing purposes, we want to allow to just pass a file
45            with open(path_to_yaml_file, "r") as f:
46                return yaml.safe_load(f)  # type: ignore  # we assume the yaml represents a ConnectionDefinition
47        except FileNotFoundError:
48            # Running inside the container, the working directory during an operation is not structured the same as the static files
49            package = self.__class__.__module__.split(".")[0]
50
51            yaml_config = pkgutil.get_data(package, path_to_yaml_file)
52            if yaml_config:
53                decoded_yaml = yaml_config.decode()
54                return self._parse(decoded_yaml)
55            return {}
56
57    def _emit_manifest_debug_message(self, extra_args: dict[str, Any]) -> None:
58        extra_args["path_to_yaml"] = self._path_to_yaml
59
60    @staticmethod
61    def _parse(connection_definition_str: str) -> ConnectionDefinition:
62        """
63        Parses a yaml file into a manifest. Component references still exist in the manifest which will be
64        resolved during the creating of the DeclarativeSource.
65        :param connection_definition_str: yaml string to parse
66        :return: The ConnectionDefinition parsed from connection_definition_str
67        """
68        return yaml.safe_load(connection_definition_str)  # type: ignore # yaml.safe_load doesn't return a type but we know it is a Mapping

Declarative source defined by a yaml file

YamlDeclarativeSource( path_to_yaml: str, debug: bool = False, catalog: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.ConfiguredAirbyteCatalog] = None, config: Optional[Mapping[str, Any]] = None, state: Optional[List[airbyte_cdk.models.airbyte_protocol.AirbyteStateMessage]] = None)
21    def __init__(
22        self,
23        path_to_yaml: str,
24        debug: bool = False,
25        catalog: Optional[ConfiguredAirbyteCatalog] = None,
26        config: Optional[Mapping[str, Any]] = None,
27        state: Optional[List[AirbyteStateMessage]] = None,
28    ) -> None:
29        """
30        :param path_to_yaml: Path to the yaml file describing the source
31        """
32        self._path_to_yaml = path_to_yaml
33        source_config = self._read_and_parse_yaml_file(path_to_yaml)
34
35        super().__init__(
36            catalog=catalog or ConfiguredAirbyteCatalog(streams=[]),
37            config=config or {},
38            state=state or [],
39            source_config=source_config,
40        )
Parameters
  • path_to_yaml: Path to the yaml file describing the source
def launch(source: Source, args: List[str]) -> None:
336def launch(source: Source, args: List[str]) -> None:
337    source_entrypoint = AirbyteEntrypoint(source)
338    parsed_args = source_entrypoint.parse_args(args)
339    # temporarily removes the PrintBuffer because we're seeing weird print behavior for concurrent syncs
340    # Refer to: https://github.com/airbytehq/oncall/issues/6235
341    with PRINT_BUFFER:
342        for message in source_entrypoint.run(parsed_args):
343            # simply printing is creating issues for the concurrent CDK as Python uses two different instructions to print: one for the message and
344            # the other for the line break. Adding `\n` to the message ensures that both are printed at the same time
345            print(f"{message}\n", end="")
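
Putting the two together, a declarative connector's entry point typically looks roughly like the sketch below; the manifest filename and the YamlDeclarativeSource module path are assumptions and vary by connector.

    import sys

    from airbyte_cdk.entrypoint import launch
    from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource

    def run() -> None:
        # "manifest.yaml" is a placeholder path; the pkgutil fallback shown above lets the
        # source resolve the file relative to the connector package when it is not found on disk.
        source = YamlDeclarativeSource(path_to_yaml="manifest.yaml")
        launch(source, sys.argv[1:])

    if __name__ == "__main__":
        run()
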
class AirbyteEntrypoint:
 54class AirbyteEntrypoint(object):
 55    def __init__(self, source: Source):
 56        init_uncaught_exception_handler(logger)
 57
 58        # Deployment mode is read when instantiating the entrypoint because it is the common path shared by syncs and connector builder test requests
 59        if is_cloud_environment():
 60            _init_internal_request_filter()
 61
 62        self.source = source
 63        self.logger = logging.getLogger(f"airbyte.{getattr(source, 'name', '')}")
 64
 65    @staticmethod
 66    def parse_args(args: List[str]) -> argparse.Namespace:
 67        # set up parent parsers
 68        parent_parser = argparse.ArgumentParser(add_help=False)
 69        parent_parser.add_argument(
 70            "--debug", action="store_true", help="enables detailed debug logs related to the sync"
 71        )
 72        main_parser = argparse.ArgumentParser()
 73        subparsers = main_parser.add_subparsers(title="commands", dest="command")
 74
 75        # spec
 76        subparsers.add_parser(
 77            "spec", help="outputs the json configuration specification", parents=[parent_parser]
 78        )
 79
 80        # check
 81        check_parser = subparsers.add_parser(
 82            "check", help="checks the config can be used to connect", parents=[parent_parser]
 83        )
 84        required_check_parser = check_parser.add_argument_group("required named arguments")
 85        required_check_parser.add_argument(
 86            "--config", type=str, required=True, help="path to the json configuration file"
 87        )
 88
 89        # discover
 90        discover_parser = subparsers.add_parser(
 91            "discover",
 92            help="outputs a catalog describing the source's schema",
 93            parents=[parent_parser],
 94        )
 95        required_discover_parser = discover_parser.add_argument_group("required named arguments")
 96        required_discover_parser.add_argument(
 97            "--config", type=str, required=True, help="path to the json configuration file"
 98        )
 99
100        # read
101        read_parser = subparsers.add_parser(
102            "read", help="reads the source and outputs messages to STDOUT", parents=[parent_parser]
103        )
104
105        read_parser.add_argument(
106            "--state", type=str, required=False, help="path to the json-encoded state file"
107        )
108        required_read_parser = read_parser.add_argument_group("required named arguments")
109        required_read_parser.add_argument(
110            "--config", type=str, required=True, help="path to the json configuration file"
111        )
112        required_read_parser.add_argument(
113            "--catalog",
114            type=str,
115            required=True,
116            help="path to the catalog used to determine which data to read",
117        )
118
119        return main_parser.parse_args(args)
120
121    def run(self, parsed_args: argparse.Namespace) -> Iterable[str]:
122        cmd = parsed_args.command
123        if not cmd:
124            raise Exception("No command passed")
125
126        if hasattr(parsed_args, "debug") and parsed_args.debug:
127            self.logger.setLevel(logging.DEBUG)
128            logger.setLevel(logging.DEBUG)
129            self.logger.debug("Debug logs enabled")
130        else:
131            self.logger.setLevel(logging.INFO)
132
133        source_spec: ConnectorSpecification = self.source.spec(self.logger)
134        try:
135            with tempfile.TemporaryDirectory(
136                # Cleanup can fail on Windows due to file locks. Ignore if so,
137                # rather than failing the whole process.
138                ignore_cleanup_errors=True,
139            ) as temp_dir:
140                os.environ[ENV_REQUEST_CACHE_PATH] = (
141                    temp_dir  # set this as default directory for request_cache to store *.sqlite files
142                )
143                if cmd == "spec":
144                    message = AirbyteMessage(type=Type.SPEC, spec=source_spec)
145                    yield from [
146                        self.airbyte_message_to_string(queued_message)
147                        for queued_message in self._emit_queued_messages(self.source)
148                    ]
149                    yield self.airbyte_message_to_string(message)
150                else:
151                    raw_config = self.source.read_config(parsed_args.config)
152                    config = self.source.configure(raw_config, temp_dir)
153
154                    yield from [
155                        self.airbyte_message_to_string(queued_message)
156                        for queued_message in self._emit_queued_messages(self.source)
157                    ]
158                    if cmd == "check":
159                        yield from map(
160                            AirbyteEntrypoint.airbyte_message_to_string,
161                            self.check(source_spec, config),
162                        )
163                    elif cmd == "discover":
164                        yield from map(
165                            AirbyteEntrypoint.airbyte_message_to_string,
166                            self.discover(source_spec, config),
167                        )
168                    elif cmd == "read":
169                        config_catalog = self.source.read_catalog(parsed_args.catalog)
170                        state = self.source.read_state(parsed_args.state)
171
172                        yield from map(
173                            AirbyteEntrypoint.airbyte_message_to_string,
174                            self.read(source_spec, config, config_catalog, state),
175                        )
176                    else:
177                        raise Exception("Unexpected command " + cmd)
178        finally:
179            yield from [
180                self.airbyte_message_to_string(queued_message)
181                for queued_message in self._emit_queued_messages(self.source)
182            ]
183
184    def check(
185        self, source_spec: ConnectorSpecification, config: TConfig
186    ) -> Iterable[AirbyteMessage]:
187        self.set_up_secret_filter(config, source_spec.connectionSpecification)
188        try:
189            self.validate_connection(source_spec, config)
190        except AirbyteTracedException as traced_exc:
191            connection_status = traced_exc.as_connection_status_message()
192            # The platform uses the exit code to surface unexpected failures so we raise the exception if the failure type is not a config error
193            # If the failure is not exceptional, we'll emit a failed connection status message and return
194            if traced_exc.failure_type != FailureType.config_error:
195                raise traced_exc
196            if connection_status:
197                yield from self._emit_queued_messages(self.source)
198                yield connection_status
199                return
200
201        try:
202            check_result = self.source.check(self.logger, config)
203        except AirbyteTracedException as traced_exc:
204            yield traced_exc.as_airbyte_message()
205            # The platform uses the exit code to surface unexpected failures so we raise the exception if the failure type is not a config error
206            # If the failure is not exceptional, we'll emit a failed connection status message and return
207            if traced_exc.failure_type != FailureType.config_error:
208                raise traced_exc
209            else:
210                yield AirbyteMessage(
211                    type=Type.CONNECTION_STATUS,
212                    connectionStatus=AirbyteConnectionStatus(
213                        status=Status.FAILED, message=traced_exc.message
214                    ),
215                )
216                return
217        if check_result.status == Status.SUCCEEDED:
218            self.logger.info("Check succeeded")
219        else:
220            self.logger.error("Check failed")
221
222        yield from self._emit_queued_messages(self.source)
223        yield AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=check_result)
224
225    def discover(
226        self, source_spec: ConnectorSpecification, config: TConfig
227    ) -> Iterable[AirbyteMessage]:
228        self.set_up_secret_filter(config, source_spec.connectionSpecification)
229        if self.source.check_config_against_spec:
230            self.validate_connection(source_spec, config)
231        catalog = self.source.discover(self.logger, config)
232
233        yield from self._emit_queued_messages(self.source)
234        yield AirbyteMessage(type=Type.CATALOG, catalog=catalog)
235
236    def read(
237        self, source_spec: ConnectorSpecification, config: TConfig, catalog: Any, state: list[Any]
238    ) -> Iterable[AirbyteMessage]:
239        self.set_up_secret_filter(config, source_spec.connectionSpecification)
240        if self.source.check_config_against_spec:
241            self.validate_connection(source_spec, config)
242
243        # The Airbyte protocol dictates that counts be expressed as float/double to better protect against integer overflows
244        stream_message_counter: DefaultDict[HashableStreamDescriptor, float] = defaultdict(float)
245        for message in self.source.read(self.logger, config, catalog, state):
246            yield self.handle_record_counts(message, stream_message_counter)
247        for message in self._emit_queued_messages(self.source):
248            yield self.handle_record_counts(message, stream_message_counter)
249
250    @staticmethod
251    def handle_record_counts(
252        message: AirbyteMessage, stream_message_count: DefaultDict[HashableStreamDescriptor, float]
253    ) -> AirbyteMessage:
254        match message.type:
255            case Type.RECORD:
256                if message.record is None:
257                    raise ValueError("Record message must have a record attribute")
258
259                stream_message_count[
260                    HashableStreamDescriptor(
261                        name=message.record.stream,  # type: ignore[union-attr] # record has `stream`
262                        namespace=message.record.namespace,  # type: ignore[union-attr] # record has `namespace`
263                    )
264                ] += 1.0
265            case Type.STATE:
266                if message.state is None:
267                    raise ValueError("State message must have a state attribute")
268
269                stream_descriptor = message_utils.get_stream_descriptor(message)
270
271                # Set record count from the counter onto the state message
272                message.state.sourceStats = message.state.sourceStats or AirbyteStateStats()  # type: ignore[union-attr] # state has `sourceStats`
273                message.state.sourceStats.recordCount = stream_message_count.get(  # type: ignore[union-attr] # state has `sourceStats`
274                    stream_descriptor, 0.0
275                )
276
277                # Reset the counter
278                stream_message_count[stream_descriptor] = 0.0
279        return message
280
281    @staticmethod
282    def validate_connection(source_spec: ConnectorSpecification, config: TConfig) -> None:
283        # Remove internal flags from config before validating so
284        # jsonschema's additionalProperties flag won't fail the validation
285        connector_config, _ = split_config(config)
286        check_config_against_spec_or_exit(connector_config, source_spec)
287
288    @staticmethod
289    def set_up_secret_filter(config: TConfig, connection_specification: Mapping[str, Any]) -> None:
290        # Now that we have the config, we can use it to get a list of airbyte_secrets
291        # that we should filter in logging to avoid leaking secrets
292        config_secrets = get_secrets(connection_specification, config)
293        update_secrets(config_secrets)
294
295    @staticmethod
296    def airbyte_message_to_string(airbyte_message: AirbyteMessage) -> str:
297        global _HAS_LOGGED_FOR_SERIALIZATION_ERROR
298        serialized_message = AirbyteMessageSerializer.dump(airbyte_message)
299        try:
300            return orjson.dumps(serialized_message).decode()
301        except Exception as exception:
302            if not _HAS_LOGGED_FOR_SERIALIZATION_ERROR:
303                logger.warning(
304                    f"There was an error during the serialization of an AirbyteMessage: `{exception}`. This might impact the sync performances."
305                )
306                _HAS_LOGGED_FOR_SERIALIZATION_ERROR = True
307            return json.dumps(serialized_message)
308
309    @classmethod
310    def extract_state(cls, args: List[str]) -> Optional[Any]:
311        parsed_args = cls.parse_args(args)
312        if hasattr(parsed_args, "state"):
313            return parsed_args.state
314        return None
315
316    @classmethod
317    def extract_catalog(cls, args: List[str]) -> Optional[Any]:
318        parsed_args = cls.parse_args(args)
319        if hasattr(parsed_args, "catalog"):
320            return parsed_args.catalog
321        return None
322
323    @classmethod
324    def extract_config(cls, args: List[str]) -> Optional[Any]:
325        parsed_args = cls.parse_args(args)
326        if hasattr(parsed_args, "config"):
327            return parsed_args.config
328        return None
329
330    def _emit_queued_messages(self, source: Source) -> Iterable[AirbyteMessage]:
331        if hasattr(source, "message_repository") and source.message_repository:
332            yield from source.message_repository.consume_queue()
333        return
AirbyteEntrypoint(source: Source)
55    def __init__(self, source: Source):
56        init_uncaught_exception_handler(logger)
57
58        # Deployment mode is read when instantiating the entrypoint because it is the common path shared by syncs and connector builder test requests
59        if is_cloud_environment():
60            _init_internal_request_filter()
61
62        self.source = source
63        self.logger = logging.getLogger(f"airbyte.{getattr(source, 'name', '')}")
source
logger
@staticmethod
def parse_args(args: List[str]) -> argparse.Namespace:
 65    @staticmethod
 66    def parse_args(args: List[str]) -> argparse.Namespace:
 67        # set up parent parsers
 68        parent_parser = argparse.ArgumentParser(add_help=False)
 69        parent_parser.add_argument(
 70            "--debug", action="store_true", help="enables detailed debug logs related to the sync"
 71        )
 72        main_parser = argparse.ArgumentParser()
 73        subparsers = main_parser.add_subparsers(title="commands", dest="command")
 74
 75        # spec
 76        subparsers.add_parser(
 77            "spec", help="outputs the json configuration specification", parents=[parent_parser]
 78        )
 79
 80        # check
 81        check_parser = subparsers.add_parser(
 82            "check", help="checks the config can be used to connect", parents=[parent_parser]
 83        )
 84        required_check_parser = check_parser.add_argument_group("required named arguments")
 85        required_check_parser.add_argument(
 86            "--config", type=str, required=True, help="path to the json configuration file"
 87        )
 88
 89        # discover
 90        discover_parser = subparsers.add_parser(
 91            "discover",
 92            help="outputs a catalog describing the source's schema",
 93            parents=[parent_parser],
 94        )
 95        required_discover_parser = discover_parser.add_argument_group("required named arguments")
 96        required_discover_parser.add_argument(
 97            "--config", type=str, required=True, help="path to the json configuration file"
 98        )
 99
100        # read
101        read_parser = subparsers.add_parser(
102            "read", help="reads the source and outputs messages to STDOUT", parents=[parent_parser]
103        )
104
105        read_parser.add_argument(
106            "--state", type=str, required=False, help="path to the json-encoded state file"
107        )
108        required_read_parser = read_parser.add_argument_group("required named arguments")
109        required_read_parser.add_argument(
110            "--config", type=str, required=True, help="path to the json configuration file"
111        )
112        required_read_parser.add_argument(
113            "--catalog",
114            type=str,
115            required=True,
116            help="path to the catalog used to determine which data to read",
117        )
118
119        return main_parser.parse_args(args)
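
As a quick illustration, `parse_args` can be called directly with the same argument list a connector receives on the command line; the file paths below are placeholders, not files that ship with the CDK:

    from airbyte_cdk import AirbyteEntrypoint

    # Arguments as a connector would receive them for a `read` invocation.
    # "config.json" and "catalog.json" are placeholder paths for this sketch.
    namespace = AirbyteEntrypoint.parse_args(
        ["read", "--config", "config.json", "--catalog", "catalog.json"]
    )
    print(namespace.command)  # "read"
    print(namespace.config)   # "config.json"
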
def run(self, parsed_args: argparse.Namespace) -> Iterable[str]:
121    def run(self, parsed_args: argparse.Namespace) -> Iterable[str]:
122        cmd = parsed_args.command
123        if not cmd:
124            raise Exception("No command passed")
125
126        if hasattr(parsed_args, "debug") and parsed_args.debug:
127            self.logger.setLevel(logging.DEBUG)
128            logger.setLevel(logging.DEBUG)
129            self.logger.debug("Debug logs enabled")
130        else:
131            self.logger.setLevel(logging.INFO)
132
133        source_spec: ConnectorSpecification = self.source.spec(self.logger)
134        try:
135            with tempfile.TemporaryDirectory(
136                # Cleanup can fail on Windows due to file locks. Ignore if so,
137                # rather than failing the whole process.
138                ignore_cleanup_errors=True,
139            ) as temp_dir:
140                os.environ[ENV_REQUEST_CACHE_PATH] = (
141                    temp_dir  # set this as default directory for request_cache to store *.sqlite files
142                )
143                if cmd == "spec":
144                    message = AirbyteMessage(type=Type.SPEC, spec=source_spec)
145                    yield from [
146                        self.airbyte_message_to_string(queued_message)
147                        for queued_message in self._emit_queued_messages(self.source)
148                    ]
149                    yield self.airbyte_message_to_string(message)
150                else:
151                    raw_config = self.source.read_config(parsed_args.config)
152                    config = self.source.configure(raw_config, temp_dir)
153
154                    yield from [
155                        self.airbyte_message_to_string(queued_message)
156                        for queued_message in self._emit_queued_messages(self.source)
157                    ]
158                    if cmd == "check":
159                        yield from map(
160                            AirbyteEntrypoint.airbyte_message_to_string,
161                            self.check(source_spec, config),
162                        )
163                    elif cmd == "discover":
164                        yield from map(
165                            AirbyteEntrypoint.airbyte_message_to_string,
166                            self.discover(source_spec, config),
167                        )
168                    elif cmd == "read":
169                        config_catalog = self.source.read_catalog(parsed_args.catalog)
170                        state = self.source.read_state(parsed_args.state)
171
172                        yield from map(
173                            AirbyteEntrypoint.airbyte_message_to_string,
174                            self.read(source_spec, config, config_catalog, state),
175                        )
176                    else:
177                        raise Exception("Unexpected command " + cmd)
178        finally:
179            yield from [
180                self.airbyte_message_to_string(queued_message)
181                for queued_message in self._emit_queued_messages(self.source)
182            ]
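
A hedged usage sketch of the full entrypoint flow follows; `MySource` stands in for your own `Source` implementation and is not provided by the CDK:

    import sys

    from airbyte_cdk import AirbyteEntrypoint

    # `MySource` is a hypothetical Source implementation defined by your connector.
    from my_connector.source import MySource

    source = MySource()
    entrypoint = AirbyteEntrypoint(source)
    parsed_args = AirbyteEntrypoint.parse_args(sys.argv[1:])  # e.g. ["spec"]
    for line in entrypoint.run(parsed_args):
        print(line)  # each line is a serialized AirbyteMessage

In practice most connectors call the `launch` helper exported alongside `AirbyteEntrypoint`, which wraps essentially this parse/run/print loop.
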
def check( self, source_spec: airbyte_protocol_dataclasses.models.airbyte_protocol.ConnectorSpecification, config: ~TConfig) -> Iterable[AirbyteMessage]:
184    def check(
185        self, source_spec: ConnectorSpecification, config: TConfig
186    ) -> Iterable[AirbyteMessage]:
187        self.set_up_secret_filter(config, source_spec.connectionSpecification)
188        try:
189            self.validate_connection(source_spec, config)
190        except AirbyteTracedException as traced_exc:
191            connection_status = traced_exc.as_connection_status_message()
192            # The platform uses the exit code to surface unexpected failures so we raise the exception if the failure type is not a config error
193            # If the failure is not exceptional, we'll emit a failed connection status message and return
194            if traced_exc.failure_type != FailureType.config_error:
195                raise traced_exc
196            if connection_status:
197                yield from self._emit_queued_messages(self.source)
198                yield connection_status
199                return
200
201        try:
202            check_result = self.source.check(self.logger, config)
203        except AirbyteTracedException as traced_exc:
204            yield traced_exc.as_airbyte_message()
205            # The platform uses the exit code to surface unexpected failures so we raise the exception if the failure type is not a config error
206            # If the failure is not exceptional, we'll emit a failed connection status message and return
207            if traced_exc.failure_type != FailureType.config_error:
208                raise traced_exc
209            else:
210                yield AirbyteMessage(
211                    type=Type.CONNECTION_STATUS,
212                    connectionStatus=AirbyteConnectionStatus(
213                        status=Status.FAILED, message=traced_exc.message
214                    ),
215                )
216                return
217        if check_result.status == Status.SUCCEEDED:
218            self.logger.info("Check succeeded")
219        else:
220            self.logger.error("Check failed")
221
222        yield from self._emit_queued_messages(self.source)
223        yield AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=check_result)
def discover( self, source_spec: airbyte_protocol_dataclasses.models.airbyte_protocol.ConnectorSpecification, config: ~TConfig) -> Iterable[AirbyteMessage]:
225    def discover(
226        self, source_spec: ConnectorSpecification, config: TConfig
227    ) -> Iterable[AirbyteMessage]:
228        self.set_up_secret_filter(config, source_spec.connectionSpecification)
229        if self.source.check_config_against_spec:
230            self.validate_connection(source_spec, config)
231        catalog = self.source.discover(self.logger, config)
232
233        yield from self._emit_queued_messages(self.source)
234        yield AirbyteMessage(type=Type.CATALOG, catalog=catalog)
def read( self, source_spec: airbyte_protocol_dataclasses.models.airbyte_protocol.ConnectorSpecification, config: ~TConfig, catalog: Any, state: list[typing.Any]) -> Iterable[AirbyteMessage]:
236    def read(
237        self, source_spec: ConnectorSpecification, config: TConfig, catalog: Any, state: list[Any]
238    ) -> Iterable[AirbyteMessage]:
239        self.set_up_secret_filter(config, source_spec.connectionSpecification)
240        if self.source.check_config_against_spec:
241            self.validate_connection(source_spec, config)
242
243        # The Airbyte protocol dictates that counts be expressed as float/double to better protect against integer overflows
244        stream_message_counter: DefaultDict[HashableStreamDescriptor, float] = defaultdict(float)
245        for message in self.source.read(self.logger, config, catalog, state):
246            yield self.handle_record_counts(message, stream_message_counter)
247        for message in self._emit_queued_messages(self.source):
248            yield self.handle_record_counts(message, stream_message_counter)
@staticmethod
def handle_record_counts( message: AirbyteMessage, stream_message_count: DefaultDict[airbyte_cdk.sources.connector_state_manager.HashableStreamDescriptor, float]) -> AirbyteMessage:
250    @staticmethod
251    def handle_record_counts(
252        message: AirbyteMessage, stream_message_count: DefaultDict[HashableStreamDescriptor, float]
253    ) -> AirbyteMessage:
254        match message.type:
255            case Type.RECORD:
256                if message.record is None:
257                    raise ValueError("Record message must have a record attribute")
258
259                stream_message_count[
260                    HashableStreamDescriptor(
261                        name=message.record.stream,  # type: ignore[union-attr] # record has `stream`
262                        namespace=message.record.namespace,  # type: ignore[union-attr] # record has `namespace`
263                    )
264                ] += 1.0
265            case Type.STATE:
266                if message.state is None:
267                    raise ValueError("State message must have a state attribute")
268
269                stream_descriptor = message_utils.get_stream_descriptor(message)
270
271                # Set record count from the counter onto the state message
272                message.state.sourceStats = message.state.sourceStats or AirbyteStateStats()  # type: ignore[union-attr] # state has `sourceStats`
273                message.state.sourceStats.recordCount = stream_message_count.get(  # type: ignore[union-attr] # state has `sourceStats`
274                    stream_descriptor, 0.0
275                )
276
277                # Reset the counter
278                stream_message_count[stream_descriptor] = 0.0
279        return message
@staticmethod
def validate_connection( source_spec: airbyte_protocol_dataclasses.models.airbyte_protocol.ConnectorSpecification, config: ~TConfig) -> None:
281    @staticmethod
282    def validate_connection(source_spec: ConnectorSpecification, config: TConfig) -> None:
283        # Remove internal flags from config before validating so
284        # jsonschema's additionalProperties flag won't fail the validation
285        connector_config, _ = split_config(config)
286        check_config_against_spec_or_exit(connector_config, source_spec)
@staticmethod
def set_up_secret_filter(config: ~TConfig, connection_specification: Mapping[str, Any]) -> None:
288    @staticmethod
289    def set_up_secret_filter(config: TConfig, connection_specification: Mapping[str, Any]) -> None:
290        # Now that we have the config, we can use it to get a list of airbyte_secrets
291        # that we should filter in logging to avoid leaking secrets
292        config_secrets = get_secrets(connection_specification, config)
293        update_secrets(config_secrets)
@staticmethod
def airbyte_message_to_string( airbyte_message: AirbyteMessage) -> str:
295    @staticmethod
296    def airbyte_message_to_string(airbyte_message: AirbyteMessage) -> str:
297        global _HAS_LOGGED_FOR_SERIALIZATION_ERROR
298        serialized_message = AirbyteMessageSerializer.dump(airbyte_message)
299        try:
300            return orjson.dumps(serialized_message).decode()
301        except Exception as exception:
302            if not _HAS_LOGGED_FOR_SERIALIZATION_ERROR:
303                logger.warning(
304                    f"There was an error during the serialization of an AirbyteMessage: `{exception}`. This might impact the sync performances."
305                )
306                _HAS_LOGGED_FOR_SERIALIZATION_ERROR = True
307            return json.dumps(serialized_message)
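
As a small sketch, the serializer can be exercised on its own with the protocol models that ship with the CDK; the log text is illustrative:

    from airbyte_cdk import AirbyteEntrypoint
    from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type

    message = AirbyteMessage(
        type=Type.LOG,
        log=AirbyteLogMessage(level=Level.INFO, message="Hello from the connector"),
    )
    # Serializes with orjson and falls back to the standard-library json module on failure.
    print(AirbyteEntrypoint.airbyte_message_to_string(message))
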
@classmethod
def extract_state(cls, args: List[str]) -> Optional[Any]:
309    @classmethod
310    def extract_state(cls, args: List[str]) -> Optional[Any]:
311        parsed_args = cls.parse_args(args)
312        if hasattr(parsed_args, "state"):
313            return parsed_args.state
314        return None
@classmethod
def extract_catalog(cls, args: List[str]) -> Optional[Any]:
316    @classmethod
317    def extract_catalog(cls, args: List[str]) -> Optional[Any]:
318        parsed_args = cls.parse_args(args)
319        if hasattr(parsed_args, "catalog"):
320            return parsed_args.catalog
321        return None
@classmethod
def extract_config(cls, args: List[str]) -> Optional[Any]:
323    @classmethod
324    def extract_config(cls, args: List[str]) -> Optional[Any]:
325        parsed_args = cls.parse_args(args)
326        if hasattr(parsed_args, "config"):
327            return parsed_args.config
328        return None
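
The `extract_*` class methods are convenience helpers for peeking at the parsed CLI arguments without running a command; a short sketch, with placeholder paths:

    import sys

    from airbyte_cdk import AirbyteEntrypoint

    # e.g. sys.argv[1:] == ["read", "--config", "config.json", "--catalog", "catalog.json"]
    args = sys.argv[1:]
    config_path = AirbyteEntrypoint.extract_config(args)    # "config.json", or None if absent
    catalog_path = AirbyteEntrypoint.extract_catalog(args)  # "catalog.json", or None if absent
    state_path = AirbyteEntrypoint.extract_state(args)      # None unless --state was passed
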
class AbstractAPIBudget(abc.ABC):
479class AbstractAPIBudget(abc.ABC):
480    """Interface to some API where a client allowed to have N calls per T interval.
481
482    Important: APIBudget is not doing any API calls, the end user code is responsible to call this interface
483        to respect call rate limitation of the API.
484
485    It supports multiple policies applied to different group of requests. To distinct these groups we use RequestMatchers.
486    Individual policy represented by MovingWindowCallRatePolicy and currently supports only moving window strategy.
487    """
488
489    @abc.abstractmethod
490    def acquire_call(
491        self, request: Any, block: bool = True, timeout: Optional[float] = None
492    ) -> None:
493        """Try to get a call from budget, will block by default
494
495        :param request:
496        :param block: when true (default) will block the current thread until call credit is available
497        :param timeout: if set will limit maximum time in block, otherwise will wait until credit is available
498        :raises: CallRateLimitHit - when no credits left and if timeout was set the waiting time exceed the timeout
499        """
500
501    @abc.abstractmethod
502    def get_matching_policy(self, request: Any) -> Optional[AbstractCallRatePolicy]:
503        """Find matching call rate policy for specific request"""
504
505    @abc.abstractmethod
506    def update_from_response(self, request: Any, response: Any) -> None:
507        """Update budget information based on response from API
508
509        :param request: the initial request that triggered this response
510        :param response: response from the API
511        """

Interface to an API where a client is allowed N calls per T interval.

Important: APIBudget does not make any API calls itself; the calling code is responsible for using this interface so that the API's call rate limits are respected.

It supports multiple policies applied to different groups of requests. RequestMatchers are used to distinguish these groups. Each policy is represented by a MovingWindowCallRatePolicy, and currently only the moving-window strategy is supported.

@abc.abstractmethod
def acquire_call( self, request: Any, block: bool = True, timeout: Optional[float] = None) -> None:
489    @abc.abstractmethod
490    def acquire_call(
491        self, request: Any, block: bool = True, timeout: Optional[float] = None
492    ) -> None:
493        """Try to get a call from budget, will block by default
494
495        :param request:
496        :param block: when true (default) will block the current thread until call credit is available
497        :param timeout: if set will limit maximum time in block, otherwise will wait until credit is available
498        :raises: CallRateLimitHit - when no credits left and if timeout was set the waiting time exceed the timeout
499        """

Try to get a call from the budget; blocks by default.

Parameters
  • request:
  • block: when True (default), blocks the current thread until a call credit is available
  • timeout: if set, limits the maximum time spent blocking; otherwise waits until a credit is available
Raises
  • CallRateLimitHit - when no credits are left and, if a timeout was set, the waiting time exceeded it
@abc.abstractmethod
def get_matching_policy( self, request: Any) -> Optional[airbyte_cdk.sources.streams.call_rate.AbstractCallRatePolicy]:
501    @abc.abstractmethod
502    def get_matching_policy(self, request: Any) -> Optional[AbstractCallRatePolicy]:
503        """Find matching call rate policy for specific request"""

Find the matching call rate policy for a specific request

@abc.abstractmethod
def update_from_response(self, request: Any, response: Any) -> None:
505    @abc.abstractmethod
506    def update_from_response(self, request: Any, response: Any) -> None:
507        """Update budget information based on response from API
508
509        :param request: the initial request that triggered this response
510        :param response: response from the API
511        """

Update budget information based on the API response

Parameters
  • request: the initial request that triggered this response
  • response: response from the API
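
To make the contract concrete, here is a hedged sketch of how calling code is expected to drive a budget around its own HTTP calls; `budget` and `send_request` are assumptions of this sketch, not CDK symbols:

    # `budget` may be any AbstractAPIBudget implementation (e.g. an HttpAPIBudget) and
    # `send_request` is a stand-in for your own HTTP call.
    def call_with_budget(budget, request, send_request):
        # Block until the matching policy grants a call credit; CallRateLimitHit is raised
        # if the optional timeout is exceeded.
        budget.acquire_call(request, block=True, timeout=60)
        response = send_request(request)
        # Let the budget refresh its view of remaining calls and the next reset time.
        budget.update_from_response(request, response)
        return response
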
class AbstractHeaderAuthenticator(requests.auth.AuthBase):
13class AbstractHeaderAuthenticator(AuthBase):
14    """Abstract class for header-based authenticators that add a header to outgoing HTTP requests."""
15
16    def __call__(self, request: requests.PreparedRequest) -> Any:
17        """Attach the HTTP headers required to authenticate on the HTTP request"""
18        request.headers.update(self.get_auth_header())
19        return request
20
21    def get_auth_header(self) -> Mapping[str, Any]:
22        """The header to set on outgoing HTTP requests"""
23        if self.auth_header:
24            return {self.auth_header: self.token}
25        return {}
26
27    @property
28    @abstractmethod
29    def auth_header(self) -> str:
30        """HTTP header to set on the requests"""
31
32    @property
33    @abstractmethod
34    def token(self) -> str:
35        """The header value to set on outgoing HTTP requests"""

Abstract class for header-based authenticators that add a header to outgoing HTTP requests.

def get_auth_header(self) -> Mapping[str, Any]:
21    def get_auth_header(self) -> Mapping[str, Any]:
22        """The header to set on outgoing HTTP requests"""
23        if self.auth_header:
24            return {self.auth_header: self.token}
25        return {}

The header to set on outgoing HTTP requests

auth_header: str
27    @property
28    @abstractmethod
29    def auth_header(self) -> str:
30        """HTTP header to set on the requests"""

HTTP header to set on the requests

token: str
32    @property
33    @abstractmethod
34    def token(self) -> str:
35        """The header value to set on outgoing HTTP requests"""

The header value to set on outgoing HTTP requests
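
A minimal concrete subclass only needs to provide the two abstract properties; the header name and token format below are illustrative:

    from airbyte_cdk import AbstractHeaderAuthenticator

    class StaticTokenAuthenticator(AbstractHeaderAuthenticator):
        """Illustrative authenticator that sends a fixed bearer token."""

        def __init__(self, token: str):
            self._token = token

        @property
        def auth_header(self) -> str:
            return "Authorization"

        @property
        def token(self) -> str:
            return f"Bearer {self._token}"

Because it derives from `requests.auth.AuthBase`, an instance can be passed as the `auth=` argument of a `requests` call or as the `authenticator` argument of `HttpStream`.
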

class BaseBackoffException(requests.exceptions.HTTPError):
12class BaseBackoffException(requests.exceptions.HTTPError):
13    def __init__(
14        self,
15        request: requests.PreparedRequest,
16        response: Optional[Union[requests.Response, Exception]],
17        error_message: str = "",
18    ):
19        if isinstance(response, requests.Response):
20            error_message = (
21                error_message
22                or f"Request URL: {request.url}, Response Code: {response.status_code}, Response Text: {response.text}"
23            )
24            super().__init__(error_message, request=request, response=response)
25        else:
26            error_message = error_message or f"Request URL: {request.url}, Exception: {response}"
27            super().__init__(error_message, request=request, response=None)

An HTTP error occurred.

BaseBackoffException( request: requests.models.PreparedRequest, response: Union[requests.models.Response, Exception, NoneType], error_message: str = '')
13    def __init__(
14        self,
15        request: requests.PreparedRequest,
16        response: Optional[Union[requests.Response, Exception]],
17        error_message: str = "",
18    ):
19        if isinstance(response, requests.Response):
20            error_message = (
21                error_message
22                or f"Request URL: {request.url}, Response Code: {response.status_code}, Response Text: {response.text}"
23            )
24            super().__init__(error_message, request=request, response=response)
25        else:
26            error_message = error_message or f"Request URL: {request.url}, Exception: {response}"
27            super().__init__(error_message, request=request, response=None)

Initialize RequestException with request and response objects.

class CachedLimiterSession(requests_cache.session.CacheMixin, airbyte_cdk.sources.streams.call_rate.LimiterMixin, requests.sessions.Session):
704class CachedLimiterSession(requests_cache.CacheMixin, LimiterMixin, requests.Session):
705    """Session class with caching and rate-limiting behavior."""

Session class with caching and rate-limiting behavior.

class DefaultBackoffException(airbyte_cdk.BaseBackoffException):
57class DefaultBackoffException(BaseBackoffException):
58    pass

An HTTP error occurred.

def default_backoff_handler( max_tries: Optional[int], factor: float, max_time: Optional[int] = None, **kwargs: Any) -> Callable[[Callable[[requests.models.PreparedRequest, Mapping[str, Any]], requests.models.Response]], Callable[[requests.models.PreparedRequest, Mapping[str, Any]], requests.models.Response]]:
34def default_backoff_handler(
35    max_tries: Optional[int], factor: float, max_time: Optional[int] = None, **kwargs: Any
36) -> Callable[[SendRequestCallableType], SendRequestCallableType]:
37    def log_retry_attempt(details: Mapping[str, Any]) -> None:
38        _, exc, _ = sys.exc_info()
39        if isinstance(exc, RequestException) and exc.response:
40            logger.info(
41                f"Status code: {exc.response.status_code!r}, Response Content: {exc.response.content!r}"
42            )
43        logger.info(
44            f"Caught retryable error '{str(exc)}' after {details['tries']} tries. Waiting {details['wait']} seconds then retrying..."
45        )
46
47    def should_give_up(exc: Exception) -> bool:
48        # If a non-rate-limiting related 4XX error makes it this far, it means it was unexpected and probably consistent, so we shouldn't back off
49        if isinstance(exc, RequestException):
50            if exc.response is not None:
51                give_up: bool = (
52                    exc.response is not None
53                    and exc.response.status_code != codes.too_many_requests
54                    and 400 <= exc.response.status_code < 500
55                )
56                if give_up:
57                    logger.info(f"Giving up for returned HTTP status: {exc.response.status_code!r}")
58                return give_up
59        # Only RequestExceptions are retryable, so if we get here, it's not retryable
60        return False
61
62    return backoff.on_exception(  # type: ignore # Decorator function returns a function with a different signature than the input function, so mypy can't infer the type of the returned function
63        backoff.expo,
64        TRANSIENT_EXCEPTIONS,
65        jitter=None,
66        on_backoff=log_retry_attempt,
67        giveup=should_give_up,
68        max_tries=max_tries,
69        max_time=max_time,
70        factor=factor,
71        **kwargs,
72    )
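
A hedged sketch of applying the handler to a low-level send callable; `_send` is an assumption of this sketch (inside the CDK this decoration is normally handled for you):

    from typing import Any, Mapping

    import requests

    from airbyte_cdk import default_backoff_handler

    # Hypothetical low-level send callable matching the decorated signature.
    def _send(
        request: requests.PreparedRequest, request_kwargs: Mapping[str, Any]
    ) -> requests.Response:
        with requests.Session() as session:
            return session.send(request, **request_kwargs)

    # Retry transient errors with exponential backoff (factor 5), at most 5 attempts,
    # giving up early on non-retryable 4XX responses as implemented above.
    send_with_backoff = default_backoff_handler(max_tries=5, factor=5)(_send)
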
class HttpAPIBudget(airbyte_cdk.sources.streams.call_rate.APIBudget):
631class HttpAPIBudget(APIBudget):
632    """Implementation of AbstractAPIBudget for HTTP"""
633
634    def __init__(
635        self,
636        ratelimit_reset_header: str = "ratelimit-reset",
637        ratelimit_remaining_header: str = "ratelimit-remaining",
638        status_codes_for_ratelimit_hit: list[int] = [429],
639        **kwargs: Any,
640    ):
641        """Constructor
642
643        :param ratelimit_reset_header: name of the header that has a timestamp of the next reset of call budget
644        :param ratelimit_remaining_header: name of the header that has the number of calls left
645        :param status_codes_for_ratelimit_hit: list of HTTP status codes that signal about rate limit being hit
646        """
647        self._ratelimit_reset_header = ratelimit_reset_header
648        self._ratelimit_remaining_header = ratelimit_remaining_header
649        self._status_codes_for_ratelimit_hit = status_codes_for_ratelimit_hit
650        super().__init__(**kwargs)
651
652    def update_from_response(self, request: Any, response: Any) -> None:
653        policy = self.get_matching_policy(request)
654        if not policy:
655            return
656
657        if isinstance(response, requests.Response):
658            available_calls = self.get_calls_left_from_response(response)
659            reset_ts = self.get_reset_ts_from_response(response)
660            policy.update(available_calls=available_calls, call_reset_ts=reset_ts)
661
662    def get_reset_ts_from_response(
663        self, response: requests.Response
664    ) -> Optional[datetime.datetime]:
665        if response.headers.get(self._ratelimit_reset_header):
666            return datetime.datetime.fromtimestamp(
667                int(response.headers[self._ratelimit_reset_header])
668            )
669        return None
670
671    def get_calls_left_from_response(self, response: requests.Response) -> Optional[int]:
672        if response.headers.get(self._ratelimit_remaining_header):
673            return int(response.headers[self._ratelimit_remaining_header])
674
675        if response.status_code in self._status_codes_for_ratelimit_hit:
676            return 0
677
678        return None

Implementation of AbstractAPIBudget for HTTP

HttpAPIBudget( ratelimit_reset_header: str = 'ratelimit-reset', ratelimit_remaining_header: str = 'ratelimit-remaining', status_codes_for_ratelimit_hit: list[int] = [429], **kwargs: Any)
634    def __init__(
635        self,
636        ratelimit_reset_header: str = "ratelimit-reset",
637        ratelimit_remaining_header: str = "ratelimit-remaining",
638        status_codes_for_ratelimit_hit: list[int] = [429],
639        **kwargs: Any,
640    ):
641        """Constructor
642
643        :param ratelimit_reset_header: name of the header that has a timestamp of the next reset of call budget
644        :param ratelimit_remaining_header: name of the header that has the number of calls left
645        :param status_codes_for_ratelimit_hit: list of HTTP status codes that signal about rate limit being hit
646        """
647        self._ratelimit_reset_header = ratelimit_reset_header
648        self._ratelimit_remaining_header = ratelimit_remaining_header
649        self._status_codes_for_ratelimit_hit = status_codes_for_ratelimit_hit
650        super().__init__(**kwargs)

Constructor

Parameters
  • ratelimit_reset_header: name of the header that contains the timestamp of the next call-budget reset
  • ratelimit_remaining_header: name of the header that contains the number of calls remaining
  • status_codes_for_ratelimit_hit: list of HTTP status codes that signal that the rate limit has been hit
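
For example, a budget that reads GitHub-style rate limit headers might be constructed as below; the header names and status codes are illustrative, and `policies` is forwarded to the parent `APIBudget`:

    from airbyte_cdk import HttpAPIBudget

    # An empty policy list applies no explicit call rate policy; real connectors would
    # usually pass one or more policies with request matchers.
    api_budget = HttpAPIBudget(
        ratelimit_reset_header="X-RateLimit-Reset",
        ratelimit_remaining_header="X-RateLimit-Remaining",
        status_codes_for_ratelimit_hit=[429, 403],
        policies=[],
    )

Such a budget can then be passed to `HttpStream.__init__` through its `api_budget` parameter (see `HttpStream` below).
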
def update_from_response(self, request: Any, response: Any) -> None:
652    def update_from_response(self, request: Any, response: Any) -> None:
653        policy = self.get_matching_policy(request)
654        if not policy:
655            return
656
657        if isinstance(response, requests.Response):
658            available_calls = self.get_calls_left_from_response(response)
659            reset_ts = self.get_reset_ts_from_response(response)
660            policy.update(available_calls=available_calls, call_reset_ts=reset_ts)

Update budget information based on the API response.

Parameters
  • request: the initial request that triggered this response
  • response: response from the API
def get_reset_ts_from_response(self, response: requests.models.Response) -> Optional[datetime.datetime]:
662    def get_reset_ts_from_response(
663        self, response: requests.Response
664    ) -> Optional[datetime.datetime]:
665        if response.headers.get(self._ratelimit_reset_header):
666            return datetime.datetime.fromtimestamp(
667                int(response.headers[self._ratelimit_reset_header])
668            )
669        return None
def get_calls_left_from_response(self, response: requests.models.Response) -> Optional[int]:
671    def get_calls_left_from_response(self, response: requests.Response) -> Optional[int]:
672        if response.headers.get(self._ratelimit_remaining_header):
673            return int(response.headers[self._ratelimit_remaining_header])
674
675        if response.status_code in self._status_codes_for_ratelimit_hit:
676            return 0
677
678        return None
HttpAuthenticator
class HttpRequestMatcher(airbyte_cdk.sources.streams.call_rate.RequestMatcher):
103class HttpRequestMatcher(RequestMatcher):
104    """Simple implementation of RequestMatcher for HTTP requests using HttpRequestRegexMatcher under the hood."""
105
106    def __init__(
107        self,
108        method: Optional[str] = None,
109        url: Optional[str] = None,
110        params: Optional[Mapping[str, Any]] = None,
111        headers: Optional[Mapping[str, Any]] = None,
112    ):
113        """Constructor
114
115        :param method: HTTP method (e.g., "GET", "POST").
116        :param url: Full URL to match.
117        :param params: Dictionary of query parameters to match.
118        :param headers: Dictionary of headers to match.
119        """
120        # Parse the URL to extract the base and path
121        if url:
122            parsed_url = parse.urlsplit(url)
123            url_base = f"{parsed_url.scheme}://{parsed_url.netloc}"
124            url_path = parsed_url.path if parsed_url.path != "/" else None
125        else:
126            url_base = None
127            url_path = None
128
129        # Use HttpRequestRegexMatcher under the hood
130        self._regex_matcher = HttpRequestRegexMatcher(
131            method=method,
132            url_base=url_base,
133            url_path_pattern=re.escape(url_path) if url_path else None,
134            params=params,
135            headers=headers,
136        )
137
138    def __call__(self, request: Any) -> bool:
139        """
140        :param request: A requests.Request or requests.PreparedRequest instance.
141        :return: True if the request matches all provided criteria; False otherwise.
142        """
143        return self._regex_matcher(request)
144
145    def __str__(self) -> str:
146        return (
147            f"HttpRequestMatcher(method={self._regex_matcher._method}, "
148            f"url={self._regex_matcher._url_base}{self._regex_matcher._url_path_pattern.pattern if self._regex_matcher._url_path_pattern else ''}, "
149            f"params={self._regex_matcher._params}, headers={self._regex_matcher._headers})"
150        )

Simple implementation of RequestMatcher for HTTP requests using HttpRequestRegexMatcher under the hood.

HttpRequestMatcher( method: Optional[str] = None, url: Optional[str] = None, params: Optional[Mapping[str, Any]] = None, headers: Optional[Mapping[str, Any]] = None)
106    def __init__(
107        self,
108        method: Optional[str] = None,
109        url: Optional[str] = None,
110        params: Optional[Mapping[str, Any]] = None,
111        headers: Optional[Mapping[str, Any]] = None,
112    ):
113        """Constructor
114
115        :param method: HTTP method (e.g., "GET", "POST").
116        :param url: Full URL to match.
117        :param params: Dictionary of query parameters to match.
118        :param headers: Dictionary of headers to match.
119        """
120        # Parse the URL to extract the base and path
121        if url:
122            parsed_url = parse.urlsplit(url)
123            url_base = f"{parsed_url.scheme}://{parsed_url.netloc}"
124            url_path = parsed_url.path if parsed_url.path != "/" else None
125        else:
126            url_base = None
127            url_path = None
128
129        # Use HttpRequestRegexMatcher under the hood
130        self._regex_matcher = HttpRequestRegexMatcher(
131            method=method,
132            url_base=url_base,
133            url_path_pattern=re.escape(url_path) if url_path else None,
134            params=params,
135            headers=headers,
136        )

Constructor

Parameters
  • method: HTTP method (e.g., "GET", "POST").
  • url: Full URL to match.
  • params: Dictionary of query parameters to match.
  • headers: Dictionary of headers to match.
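
A short sketch of matching a prepared request; the URL and query parameter are placeholders:

    import requests

    from airbyte_cdk import HttpRequestMatcher

    matcher = HttpRequestMatcher(method="GET", url="https://api.example.com/v1/users")

    request = requests.Request(
        "GET", "https://api.example.com/v1/users", params={"page": "2"}
    ).prepare()
    # True: method, URL base and path all match; criteria not provided are not checked.
    assert matcher(request)
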
class HttpStream(airbyte_cdk.Stream, airbyte_cdk.sources.streams.core.CheckpointMixin, abc.ABC):
 45class HttpStream(Stream, CheckpointMixin, ABC):
 46    """
 47    Base abstract class for an Airbyte Stream using the HTTP protocol. Basic building block for users building an Airbyte source for an HTTP API.
 48    """
 49
 50    source_defined_cursor = True  # Most HTTP streams use a source defined cursor (i.e: the user can't configure it like on a SQL table)
 51    page_size: Optional[int] = (
 52        None  # Use this variable to define page size for API http requests with pagination support
 53    )
 54
 55    def __init__(
 56        self, authenticator: Optional[AuthBase] = None, api_budget: Optional[APIBudget] = None
 57    ):
 58        self._exit_on_rate_limit: bool = False
 59        self._http_client = HttpClient(
 60            name=self.name,
 61            logger=self.logger,
 62            error_handler=self.get_error_handler(),
 63            api_budget=api_budget or APIBudget(policies=[]),
 64            authenticator=authenticator,
 65            use_cache=self.use_cache,
 66            backoff_strategy=self.get_backoff_strategy(),
 67            message_repository=InMemoryMessageRepository(),
 68        )
 69
 70        # There are three conditions that dictate if RFR should automatically be applied to a stream
 71        # 1. Streams that explicitly initialize their own cursor should defer to it and not automatically apply RFR
 72        # 2. Streams with at least one cursor_field are incremental and thus offer a superior sync to RFR.
 73        # 3. Streams overriding read_records() do not guarantee that they will call the parent implementation which can perform
 74        #    per-page checkpointing, so RFR is only supported if a stream uses the default `HttpStream.read_records()` method
 75        if (
 76            not self.cursor
 77            and len(self.cursor_field) == 0
 78            and type(self).read_records is HttpStream.read_records
 79        ):
 80            self.cursor = ResumableFullRefreshCursor()
 81
 82    @property
 83    def exit_on_rate_limit(self) -> bool:
 84        """
 85        :return: False if the stream will retry endlessly when rate limited
 86        """
 87        return self._exit_on_rate_limit
 88
 89    @exit_on_rate_limit.setter
 90    def exit_on_rate_limit(self, value: bool) -> None:
 91        self._exit_on_rate_limit = value
 92
 93    @property
 94    def cache_filename(self) -> str:
 95        """
 96        Override if needed. Return the name of the cache file.
 97        Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only.
 98        """
 99        return f"{self.name}.sqlite"
100
101    @property
102    def use_cache(self) -> bool:
103        """
104        Override if needed. If True, all records will be cached.
105        Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only.
106        """
107        return False
108
109    @property
110    @abstractmethod
111    def url_base(self) -> str:
112        """
113        :return: URL base for the  API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/"
114        """
115
116    @property
117    def http_method(self) -> str:
118        """
119        Override if needed. See get_request_data/get_request_json if using POST/PUT/PATCH.
120        """
121        return "GET"
122
123    @property
124    @deprecated(
125        "Deprecated as of CDK version 3.0.0. "
126        "You should set error_handler explicitly in HttpStream.get_error_handler() instead."
127    )
128    def raise_on_http_errors(self) -> bool:
129        """
130        Override if needed. If set to False, allows opting-out of raising HTTP code exception.
131        """
132        return True
133
134    @property
135    @deprecated(
136        "Deprecated as of CDK version 3.0.0. "
137        "You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead."
138    )
139    def max_retries(self) -> Union[int, None]:
140        """
141        Override if needed. Specifies maximum amount of retries for backoff policy. Return None for no limit.
142        """
143        return 5
144
145    @property
146    @deprecated(
147        "Deprecated as of CDK version 3.0.0. "
148        "You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead."
149    )
150    def max_time(self) -> Union[int, None]:
151        """
152        Override if needed. Specifies maximum total waiting time (in seconds) for backoff policy. Return None for no limit.
153        """
154        return 60 * 10
155
156    @property
157    @deprecated(
158        "Deprecated as of CDK version 3.0.0. "
159        "You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead."
160    )
161    def retry_factor(self) -> float:
162        """
163        Override if needed. Specifies factor for backoff policy.
164        """
165        return 5
166
167    @abstractmethod
168    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
169        """
170        Override this method to define a pagination strategy.
171
172        The value returned from this method is passed to most other methods in this class. Use it to form a request e.g: set headers or query params.
173
174        :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response.
175        """
176
177    @abstractmethod
178    def path(
179        self,
180        *,
181        stream_state: Optional[Mapping[str, Any]] = None,
182        stream_slice: Optional[Mapping[str, Any]] = None,
183        next_page_token: Optional[Mapping[str, Any]] = None,
184    ) -> str:
185        """
186        Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity"
187        """
188
189    def request_params(
190        self,
191        stream_state: Optional[Mapping[str, Any]],
192        stream_slice: Optional[Mapping[str, Any]] = None,
193        next_page_token: Optional[Mapping[str, Any]] = None,
194    ) -> MutableMapping[str, Any]:
195        """
196        Override this method to define the query parameters that should be set on an outgoing HTTP request given the inputs.
197
198        E.g: you might want to define query parameters for paging if next_page_token is not None.
199        """
200        return {}
201
202    def request_headers(
203        self,
204        stream_state: Optional[Mapping[str, Any]],
205        stream_slice: Optional[Mapping[str, Any]] = None,
206        next_page_token: Optional[Mapping[str, Any]] = None,
207    ) -> Mapping[str, Any]:
208        """
209        Override to return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
210        """
211        return {}
212
213    def request_body_data(
214        self,
215        stream_state: Optional[Mapping[str, Any]],
216        stream_slice: Optional[Mapping[str, Any]] = None,
217        next_page_token: Optional[Mapping[str, Any]] = None,
218    ) -> Optional[Union[Mapping[str, Any], str]]:
219        """
220        Override when creating POST/PUT/PATCH requests to populate the body of the request with a non-JSON payload.
221
222        If this returns a string, it will be sent as-is.
223        If this returns a dict, it will be converted to a urlencoded form,
224        e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
225
226        Only one of the 'request_body_data' and 'request_body_json' methods may be overridden.
227        """
228        return None
229
230    def request_body_json(
231        self,
232        stream_state: Optional[Mapping[str, Any]],
233        stream_slice: Optional[Mapping[str, Any]] = None,
234        next_page_token: Optional[Mapping[str, Any]] = None,
235    ) -> Optional[Mapping[str, Any]]:
236        """
237        Override when creating POST/PUT/PATCH requests to populate the body of the request with a JSON payload.
238
239        Only one of the 'request_body_data' and 'request_body_json' methods may be overridden.
240        """
241        return None
242
243    def request_kwargs(
244        self,
245        stream_state: Optional[Mapping[str, Any]],
246        stream_slice: Optional[Mapping[str, Any]] = None,
247        next_page_token: Optional[Mapping[str, Any]] = None,
248    ) -> Mapping[str, Any]:
249        """
250        Override to return a mapping of keyword arguments to be used when creating the HTTP request.
251        Any option listed in https://docs.python-requests.org/en/latest/api/#requests.adapters.BaseAdapter.send can be returned from
252        this method. Note that these options do not conflict with request-level options such as headers, request params, etc.
253        """
254        return {}
255
256    @abstractmethod
257    def parse_response(
258        self,
259        response: requests.Response,
260        *,
261        stream_state: Mapping[str, Any],
262        stream_slice: Optional[Mapping[str, Any]] = None,
263        next_page_token: Optional[Mapping[str, Any]] = None,
264    ) -> Iterable[Mapping[str, Any]]:
265        """
266        Parses the raw response object into a list of records.
267        By default, this returns an iterable containing the input. Override to parse differently.
268        :param response:
269        :param stream_state:
270        :param stream_slice:
271        :param next_page_token:
272        :return: An iterable containing the parsed response
273        """
274
275    def get_backoff_strategy(self) -> Optional[Union[BackoffStrategy, List[BackoffStrategy]]]:
276        """
277        Used to initialize Adapter to avoid breaking changes.
278        If Stream has a `backoff_time` method implementation, we know this stream uses old (pre-HTTPClient) backoff handlers and thus an adapter is needed.
279
280        Override to provide custom BackoffStrategy
281        :return Optional[BackoffStrategy]:
282        """
283        if hasattr(self, "backoff_time"):
284            return HttpStreamAdapterBackoffStrategy(self)
285        else:
286            return None
287
288    def get_error_handler(self) -> Optional[ErrorHandler]:
289        """
290        Used to initialize Adapter to avoid breaking changes.
291        If Stream has a `should_retry` method implementation, we know this stream uses old (pre-HTTPClient) error handlers and thus an adapter is needed.
292
293        Override to provide custom ErrorHandler
294        :return Optional[ErrorHandler]:
295        """
296        if hasattr(self, "should_retry"):
297            error_handler = HttpStreamAdapterHttpStatusErrorHandler(
298                stream=self,
299                logger=logging.getLogger(),
300                max_retries=self.max_retries,
301                max_time=timedelta(seconds=self.max_time or 0),
302            )
303            return error_handler
304        else:
305            return None
306
307    @classmethod
308    def _join_url(cls, url_base: str, path: str) -> str:
309        return urljoin(url_base, path)
310
311    @classmethod
312    def parse_response_error_message(cls, response: requests.Response) -> Optional[str]:
313        """
314        Parses the raw response object from a failed request into a user-friendly error message.
315        By default, this method tries to grab the error message from JSON responses by following common API patterns. Override to parse differently.
316
317        :param response:
318        :return: A user-friendly message that indicates the cause of the error
319        """
320
321        # default logic to grab error from common fields
322        def _try_get_error(value: Optional[JsonType]) -> Optional[str]:
323            if isinstance(value, str):
324                return value
325            elif isinstance(value, list):
326                errors_in_value = [_try_get_error(v) for v in value]
327                return ", ".join(v for v in errors_in_value if v is not None)
328            elif isinstance(value, dict):
329                new_value = (
330                    value.get("message")
331                    or value.get("messages")
332                    or value.get("error")
333                    or value.get("errors")
334                    or value.get("failures")
335                    or value.get("failure")
336                    or value.get("detail")
337                )
338                return _try_get_error(new_value)
339            return None
340
341        try:
342            body = response.json()
343            return _try_get_error(body)
344        except requests.exceptions.JSONDecodeError:
345            return None
346
347    def get_error_display_message(self, exception: BaseException) -> Optional[str]:
348        """
349        Retrieves the user-friendly display message that corresponds to an exception.
350        This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage.
351
352        The default implementation of this method only handles HTTPErrors by passing the response to self.parse_response_error_message().
353        The method should be overridden as needed to handle any additional exception types.
354
355        :param exception: The exception that was raised
356        :return: A user-friendly message that indicates the cause of the error
357        """
358        if isinstance(exception, requests.HTTPError) and exception.response is not None:
359            return self.parse_response_error_message(exception.response)
360        return None
361
362    def read_records(
363        self,
364        sync_mode: SyncMode,
365        cursor_field: Optional[List[str]] = None,
366        stream_slice: Optional[Mapping[str, Any]] = None,
367        stream_state: Optional[Mapping[str, Any]] = None,
368    ) -> Iterable[StreamData]:
369        # A cursor_field indicates this is an incremental stream which offers better checkpointing than RFR enabled via the cursor
370        if self.cursor_field or not isinstance(self.get_cursor(), ResumableFullRefreshCursor):
371            yield from self._read_pages(
372                lambda req, res, state, _slice: self.parse_response(
373                    res, stream_slice=_slice, stream_state=state
374                ),
375                stream_slice,
376                stream_state,
377            )
378        else:
379            yield from self._read_single_page(
380                lambda req, res, state, _slice: self.parse_response(
381                    res, stream_slice=_slice, stream_state=state
382                ),
383                stream_slice,
384                stream_state,
385            )
386
387    @property
388    def state(self) -> MutableMapping[str, Any]:
389        cursor = self.get_cursor()
390        if cursor:
391            return cursor.get_stream_state()  # type: ignore
392        return self._state
393
394    @state.setter
395    def state(self, value: MutableMapping[str, Any]) -> None:
396        cursor = self.get_cursor()
397        if cursor:
398            cursor.set_initial_state(value)
399        self._state = value
400
401    def get_cursor(self) -> Optional[Cursor]:
402        # I don't love that this is semi-stateful but not sure what else to do. We don't know exactly what type of cursor to
403        # instantiate when creating the class. We can make a few assumptions like if there is a cursor_field which implies
404        # incremental, but we don't know until runtime if this is a substream. Ideally, a stream should explicitly define
405        # its cursor, but because we're trying to automatically apply RFR we're stuck with this logic where we replace the
406        # cursor at runtime once we detect this is a substream based on self.has_multiple_slices being reassigned
407        if self.has_multiple_slices and isinstance(self.cursor, ResumableFullRefreshCursor):
408            self.cursor = SubstreamResumableFullRefreshCursor()
409            return self.cursor
410        else:
411            return self.cursor
412
413    def _read_pages(
414        self,
415        records_generator_fn: Callable[
416            [
417                requests.PreparedRequest,
418                requests.Response,
419                Mapping[str, Any],
420                Optional[Mapping[str, Any]],
421            ],
422            Iterable[StreamData],
423        ],
424        stream_slice: Optional[Mapping[str, Any]] = None,
425        stream_state: Optional[Mapping[str, Any]] = None,
426    ) -> Iterable[StreamData]:
427        stream_state = stream_state or {}
428        pagination_complete = False
429        next_page_token = None
430        while not pagination_complete:
431            request, response = self._fetch_next_page(stream_slice, stream_state, next_page_token)
432            yield from records_generator_fn(request, response, stream_state, stream_slice)
433
434            next_page_token = self.next_page_token(response)
435            if not next_page_token:
436                pagination_complete = True
437
438        cursor = self.get_cursor()
439        if cursor and isinstance(cursor, SubstreamResumableFullRefreshCursor):
440            partition, _, _ = self._extract_slice_fields(stream_slice=stream_slice)
441            # Substreams checkpoint state by marking an entire parent partition as completed so that on the subsequent attempt
442            # after a failure, completed parents are skipped and the sync can make progress
443            cursor.close_slice(StreamSlice(cursor_slice={}, partition=partition))
444
445        # Always return an empty generator just in case no records were ever yielded
446        yield from []
447
448    def _read_single_page(
449        self,
450        records_generator_fn: Callable[
451            [
452                requests.PreparedRequest,
453                requests.Response,
454                Mapping[str, Any],
455                Optional[Mapping[str, Any]],
456            ],
457            Iterable[StreamData],
458        ],
459        stream_slice: Optional[Mapping[str, Any]] = None,
460        stream_state: Optional[Mapping[str, Any]] = None,
461    ) -> Iterable[StreamData]:
462        partition, cursor_slice, remaining_slice = self._extract_slice_fields(
463            stream_slice=stream_slice
464        )
465        stream_state = stream_state or {}
466        next_page_token = cursor_slice or None
467
468        request, response = self._fetch_next_page(remaining_slice, stream_state, next_page_token)
469        yield from records_generator_fn(request, response, stream_state, remaining_slice)
470
471        next_page_token = self.next_page_token(response) or {
472            "__ab_full_refresh_sync_complete": True
473        }
474
475        cursor = self.get_cursor()
476        if cursor:
477            cursor.close_slice(StreamSlice(cursor_slice=next_page_token, partition=partition))
478
479        # Always return an empty generator just in case no records were ever yielded
480        yield from []
481
482    @staticmethod
483    def _extract_slice_fields(
484        stream_slice: Optional[Mapping[str, Any]],
485    ) -> tuple[Mapping[str, Any], Mapping[str, Any], Mapping[str, Any]]:
486        if not stream_slice:
487            return {}, {}, {}
488
489        if isinstance(stream_slice, StreamSlice):
490            partition = stream_slice.partition
491            cursor_slice = stream_slice.cursor_slice
492            remaining = {k: v for k, v in stream_slice.items()}
493        else:
494            # RFR streams that implement stream_slices() to generate stream slices in the legacy mapping format are converted into a
495            # structured stream slice mapping by the LegacyCursorBasedCheckpointReader. The structured mapping object has separate
496            # fields for the partition and cursor_slice value
497            partition = stream_slice.get("partition", {})
498            cursor_slice = stream_slice.get("cursor_slice", {})
499            remaining = {
500                key: val
501                for key, val in stream_slice.items()
502                if key != "partition" and key != "cursor_slice"
503            }
504        return partition, cursor_slice, remaining
505
506    def _fetch_next_page(
507        self,
508        stream_slice: Optional[Mapping[str, Any]] = None,
509        stream_state: Optional[Mapping[str, Any]] = None,
510        next_page_token: Optional[Mapping[str, Any]] = None,
511    ) -> Tuple[requests.PreparedRequest, requests.Response]:
512        request, response = self._http_client.send_request(
513            http_method=self.http_method,
514            url=self._join_url(
515                self.url_base,
516                self.path(
517                    stream_state=stream_state,
518                    stream_slice=stream_slice,
519                    next_page_token=next_page_token,
520                ),
521            ),
522            request_kwargs=self.request_kwargs(
523                stream_state=stream_state,
524                stream_slice=stream_slice,
525                next_page_token=next_page_token,
526            ),
527            headers=self.request_headers(
528                stream_state=stream_state,
529                stream_slice=stream_slice,
530                next_page_token=next_page_token,
531            ),
532            params=self.request_params(
533                stream_state=stream_state,
534                stream_slice=stream_slice,
535                next_page_token=next_page_token,
536            ),
537            json=self.request_body_json(
538                stream_state=stream_state,
539                stream_slice=stream_slice,
540                next_page_token=next_page_token,
541            ),
542            data=self.request_body_data(
543                stream_state=stream_state,
544                stream_slice=stream_slice,
545                next_page_token=next_page_token,
546            ),
547            dedupe_query_params=True,
548            log_formatter=self.get_log_formatter(),
549            exit_on_rate_limit=self.exit_on_rate_limit,
550        )
551
552        return request, response
553
554    def get_log_formatter(self) -> Optional[Callable[[requests.Response], Any]]:
555        """
556
557        :return Optional[Callable[[requests.Response], Any]]: Function that will be used in logging inside HttpClient
558        """
559        return None

Base abstract class for an Airbyte Stream using the HTTP protocol. It is the basic building block for building an Airbyte source for an HTTP API.

source_defined_cursor = True

Return False if the cursor can be configured by the user.

page_size: Optional[int] = None
exit_on_rate_limit: bool
82    @property
83    def exit_on_rate_limit(self) -> bool:
84        """
85        :return: False if the stream will retry endlessly when rate limited
86        """
87        return self._exit_on_rate_limit
Returns

False if the stream will retry endlessly when rate limited

cache_filename: str
93    @property
94    def cache_filename(self) -> str:
95        """
96        Override if needed. Return the name of cache file
97        Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only.
98        """
99        return f"{self.name}.sqlite"

Override if needed. Returns the name of the cache file. Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only.

use_cache: bool
101    @property
102    def use_cache(self) -> bool:
103        """
104        Override if needed. If True, all records will be cached.
105        Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only.
106        """
107        return False

Override if needed. If True, all records will be cached. Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only.

url_base: str
109    @property
110    @abstractmethod
111    def url_base(self) -> str:
112        """
113        :return: URL base for the  API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/"
114        """
Returns

URL base for the API endpoint, e.g. if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/".

http_method: str
116    @property
117    def http_method(self) -> str:
118        """
119        Override if needed. See get_request_data/get_request_json if using POST/PUT/PATCH.
120        """
121        return "GET"

Override if needed. See get_request_data/get_request_json if using POST/PUT/PATCH.

raise_on_http_errors: bool
123    @property
124    @deprecated(
125        "Deprecated as of CDK version 3.0.0. "
126        "You should set error_handler explicitly in HttpStream.get_error_handler() instead."
127    )
128    def raise_on_http_errors(self) -> bool:
129        """
130        Override if needed. If set to False, allows opting-out of raising HTTP code exception.
131        """
132        return True

Override if needed. If set to False, allows opting out of raising exceptions on HTTP error codes.

max_retries: Optional[int]
134    @property
135    @deprecated(
136        "Deprecated as of CDK version 3.0.0. "
137        "You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead."
138    )
139    def max_retries(self) -> Union[int, None]:
140        """
141        Override if needed. Specifies maximum amount of retries for backoff policy. Return None for no limit.
142        """
143        return 5

Override if needed. Specifies maximum amount of retries for backoff policy. Return None for no limit.

max_time: Optional[int]
145    @property
146    @deprecated(
147        "Deprecated as of CDK version 3.0.0. "
148        "You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead."
149    )
150    def max_time(self) -> Union[int, None]:
151        """
152        Override if needed. Specifies maximum total waiting time (in seconds) for backoff policy. Return None for no limit.
153        """
154        return 60 * 10

Override if needed. Specifies maximum total waiting time (in seconds) for backoff policy. Return None for no limit.

retry_factor: float
156    @property
157    @deprecated(
158        "Deprecated as of CDK version 3.0.0. "
159        "You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead."
160    )
161    def retry_factor(self) -> float:
162        """
163        Override if needed. Specifies factor for backoff policy.
164        """
165        return 5

Override if needed. Specifies factor for backoff policy.

@abstractmethod
def next_page_token(self, response: requests.models.Response) -> Optional[Mapping[str, Any]]:
167    @abstractmethod
168    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
169        """
170        Override this method to define a pagination strategy.
171
172        The value returned from this method is passed to most other methods in this class. Use it to form a request e.g: set headers or query params.
173
174        :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response.
175        """

Override this method to define a pagination strategy.

The value returned from this method is passed to most other methods in this class. Use it to form a request e.g: set headers or query params.

Returns

The token for the next page from the input response object. Returning None means there are no more pages to read in this response.
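
As a hypothetical sketch (the endpoint shape and the "page"/"total_pages" field names are assumptions, not part of the CDK), a page-number based strategy could look like this:

from typing import Any, Mapping, Optional

import requests

from airbyte_cdk.sources.streams.http import HttpStream


class PagedStream(HttpStream):
    # url_base, primary_key, path() and parse_response() are omitted for brevity.

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        body = response.json()
        # Assumes the API reports the current page and the total page count.
        current_page = body.get("page", 0)
        if current_page < body.get("total_pages", 0):
            return {"page": current_page + 1}
        return None  # None signals that pagination is complete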

@abstractmethod
def path( self, *, stream_state: Optional[Mapping[str, Any]] = None, stream_slice: Optional[Mapping[str, Any]] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> str:
177    @abstractmethod
178    def path(
179        self,
180        *,
181        stream_state: Optional[Mapping[str, Any]] = None,
182        stream_slice: Optional[Mapping[str, Any]] = None,
183        next_page_token: Optional[Mapping[str, Any]] = None,
184    ) -> str:
185        """
186        Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity"
187        """

Returns the URL path for the API endpoint, e.g. if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity".

def request_params( self, stream_state: Optional[Mapping[str, Any]], stream_slice: Optional[Mapping[str, Any]] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> MutableMapping[str, Any]:
189    def request_params(
190        self,
191        stream_state: Optional[Mapping[str, Any]],
192        stream_slice: Optional[Mapping[str, Any]] = None,
193        next_page_token: Optional[Mapping[str, Any]] = None,
194    ) -> MutableMapping[str, Any]:
195        """
196        Override this method to define the query parameters that should be set on an outgoing HTTP request given the inputs.
197
198        E.g: you might want to define query parameters for paging if next_page_token is not None.
199        """
200        return {}

Override this method to define the query parameters that should be set on an outgoing HTTP request given the inputs.

E.g. you might want to define query parameters for paging if next_page_token is not None.
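
For illustration, here is a sketch of such an override for an HttpStream subclass, assuming hypothetical "per_page"/"page" query parameters and the token shape produced by next_page_token():

from typing import Any, Mapping, MutableMapping, Optional


def request_params(
    self,
    stream_state: Optional[Mapping[str, Any]],
    stream_slice: Optional[Mapping[str, Any]] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> MutableMapping[str, Any]:
    params: MutableMapping[str, Any] = {"per_page": 100}
    if next_page_token:
        # e.g. {"page": 3}, as returned by next_page_token()
        params.update(next_page_token)
    return params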

def request_headers( self, stream_state: Optional[Mapping[str, Any]], stream_slice: Optional[Mapping[str, Any]] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
202    def request_headers(
203        self,
204        stream_state: Optional[Mapping[str, Any]],
205        stream_slice: Optional[Mapping[str, Any]] = None,
206        next_page_token: Optional[Mapping[str, Any]] = None,
207    ) -> Mapping[str, Any]:
208        """
209        Override to return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
210        """
211        return {}

Override to return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.

def request_body_data( self, stream_state: Optional[Mapping[str, Any]], stream_slice: Optional[Mapping[str, Any]] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Union[str, Mapping[str, Any], NoneType]:
213    def request_body_data(
214        self,
215        stream_state: Optional[Mapping[str, Any]],
216        stream_slice: Optional[Mapping[str, Any]] = None,
217        next_page_token: Optional[Mapping[str, Any]] = None,
218    ) -> Optional[Union[Mapping[str, Any], str]]:
219        """
220        Override when creating POST/PUT/PATCH requests to populate the body of the request with a non-JSON payload.
221
222        If returns a ready text that it will be sent as is.
223        If returns a dict that it will be converted to a urlencoded form.
224        E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
225
226        At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
227        """
228        return None

Override when creating POST/PUT/PATCH requests to populate the body of the request with a non-JSON payload.

If it returns a string, the text is sent as-is. If it returns a dict, it is converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".

Note that only one of the 'request_body_data' and 'request_body_json' methods may be overridden.
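
For instance, a hypothetical search endpoint that expects a form-encoded POST body could override http_method together with this method (the field names are illustrative assumptions):

from typing import Any, Mapping, Optional, Union

from airbyte_cdk.sources.streams.http import HttpStream


class SearchStream(HttpStream):
    # url_base, primary_key, path(), next_page_token() and parse_response() are omitted for brevity.

    @property
    def http_method(self) -> str:
        return "POST"

    def request_body_data(
        self,
        stream_state: Optional[Mapping[str, Any]],
        stream_slice: Optional[Mapping[str, Any]] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Optional[Union[Mapping[str, Any], str]]:
        # Sent urlencoded as "query=active&limit=50".
        return {"query": "active", "limit": "50"}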

def request_body_json( self, stream_state: Optional[Mapping[str, Any]], stream_slice: Optional[Mapping[str, Any]] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Optional[Mapping[str, Any]]:
230    def request_body_json(
231        self,
232        stream_state: Optional[Mapping[str, Any]],
233        stream_slice: Optional[Mapping[str, Any]] = None,
234        next_page_token: Optional[Mapping[str, Any]] = None,
235    ) -> Optional[Mapping[str, Any]]:
236        """
237        Override when creating POST/PUT/PATCH requests to populate the body of the request with a JSON payload.
238
239        At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
240        """
241        return None

Override when creating POST/PUT/PATCH requests to populate the body of the request with a JSON payload.

Note that only one of the 'request_body_data' and 'request_body_json' methods may be overridden.
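
A minimal sketch of such an override for an HttpStream subclass whose API expects a JSON body (the payload shape is an assumption):

from typing import Any, Mapping, Optional


# Sketch of a method to define on an HttpStream subclass.
def request_body_json(
    self,
    stream_state: Optional[Mapping[str, Any]],
    stream_slice: Optional[Mapping[str, Any]] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Optional[Mapping[str, Any]]:
    # Serialized and sent as the JSON body {"filters": {"status": "active"}}.
    return {"filters": {"status": "active"}}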

def request_kwargs( self, stream_state: Optional[Mapping[str, Any]], stream_slice: Optional[Mapping[str, Any]] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Mapping[str, Any]:
243    def request_kwargs(
244        self,
245        stream_state: Optional[Mapping[str, Any]],
246        stream_slice: Optional[Mapping[str, Any]] = None,
247        next_page_token: Optional[Mapping[str, Any]] = None,
248    ) -> Mapping[str, Any]:
249        """
250        Override to return a mapping of keyword arguments to be used when creating the HTTP request.
251        Any option listed in https://docs.python-requests.org/en/latest/api/#requests.adapters.BaseAdapter.send for can be returned from
252        this method. Note that these options do not conflict with request-level options such as headers, request params, etc..
253        """
254        return {}

Override to return a mapping of keyword arguments to be used when creating the HTTP request. Any option listed in https://docs.python-requests.org/en/latest/api/#requests.adapters.BaseAdapter.send can be returned from this method. Note that these options do not conflict with request-level options such as headers, request params, etc.
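
For example, a sketch of an override that sets a request timeout, one of the options accepted by requests' send():

from typing import Any, Mapping, Optional


# Sketch of a method to define on an HttpStream subclass.
def request_kwargs(
    self,
    stream_state: Optional[Mapping[str, Any]],
    stream_slice: Optional[Mapping[str, Any]] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    # "timeout" (in seconds) is one of the options listed for requests' send().
    return {"timeout": 60}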

@abstractmethod
def parse_response( self, response: requests.models.Response, *, stream_state: Mapping[str, Any], stream_slice: Optional[Mapping[str, Any]] = None, next_page_token: Optional[Mapping[str, Any]] = None) -> Iterable[Mapping[str, Any]]:
256    @abstractmethod
257    def parse_response(
258        self,
259        response: requests.Response,
260        *,
261        stream_state: Mapping[str, Any],
262        stream_slice: Optional[Mapping[str, Any]] = None,
263        next_page_token: Optional[Mapping[str, Any]] = None,
264    ) -> Iterable[Mapping[str, Any]]:
265        """
266        Parses the raw response object into a list of records.
267        By default, this returns an iterable containing the input. Override to parse differently.
268        :param response:
269        :param stream_state:
270        :param stream_slice:
271        :param next_page_token:
272        :return: An iterable containing the parsed response
273        """

Parses the raw response object into a list of records. By default, this returns an iterable containing the input. Override to parse differently.

Parameters
  • response:
  • stream_state:
  • stream_slice:
  • next_page_token:
Returns

An iterable containing the parsed response
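
Putting the abstract members together, the following is a minimal, hypothetical single-page stream; the host, endpoint, and field names are illustrative assumptions, not part of the CDK:

from typing import Any, Iterable, Mapping, Optional

import requests

from airbyte_cdk.sources.streams.http import HttpStream


class Customers(HttpStream):
    """Hypothetical stream that reads a single page of customers."""

    url_base = "https://api.example.com/v1/"
    primary_key = "id"

    def path(self, **kwargs: Any) -> str:
        return "customers"

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        return None  # single-page endpoint: no pagination

    def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable[Mapping[str, Any]]:
        # Assumes a response body shaped like {"customers": [{...}, ...]}.
        yield from response.json().get("customers", [])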

def get_backoff_strategy( self) -> Union[BackoffStrategy, List[BackoffStrategy], NoneType]:
275    def get_backoff_strategy(self) -> Optional[Union[BackoffStrategy, List[BackoffStrategy]]]:
276        """
277        Used to initialize Adapter to avoid breaking changes.
278        If Stream has a `backoff_time` method implementation, we know this stream uses old (pre-HTTPClient) backoff handlers and thus an adapter is needed.
279
280        Override to provide custom BackoffStrategy
281        :return Optional[BackoffStrategy]:
282        """
283        if hasattr(self, "backoff_time"):
284            return HttpStreamAdapterBackoffStrategy(self)
285        else:
286            return None

Used to initialize the adapter to avoid breaking changes. If the stream implements a backoff_time method, we know it uses the old (pre-HttpClient) backoff handlers, and thus an adapter is needed.

Override to provide custom BackoffStrategy

Returns
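
As a sketch of a custom strategy, the class below waits a constant number of seconds between attempts. It assumes the BackoffStrategy interface exposes a single backoff_time(response_or_exception, attempt_count) hook, as in recent CDK versions; verify against the error_handlers module you are targeting.

from typing import Optional, Union

import requests

from airbyte_cdk.sources.streams.http.error_handlers import BackoffStrategy


class ConstantBackoffStrategy(BackoffStrategy):
    """Waits a fixed ten seconds between retry attempts (illustrative sketch)."""

    def backoff_time(
        self,
        response_or_exception: Optional[Union[requests.Response, requests.RequestException]],
        attempt_count: int,
    ) -> Optional[float]:
        return 10.0


# On the stream, the override would then simply return an instance:
#     def get_backoff_strategy(self) -> Optional[BackoffStrategy]:
#         return ConstantBackoffStrategy()
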
def get_error_handler( self) -> Optional[airbyte_cdk.sources.streams.http.error_handlers.ErrorHandler]:
288    def get_error_handler(self) -> Optional[ErrorHandler]:
289        """
290        Used to initialize Adapter to avoid breaking changes.
291        If Stream has a `should_retry` method implementation, we know this stream uses old (pre-HTTPClient) error handlers and thus an adapter is needed.
292
293        Override to provide custom ErrorHandler
294        :return Optional[ErrorHandler]:
295        """
296        if hasattr(self, "should_retry"):
297            error_handler = HttpStreamAdapterHttpStatusErrorHandler(
298                stream=self,
299                logger=logging.getLogger(),
300                max_retries=self.max_retries,
301                max_time=timedelta(seconds=self.max_time or 0),
302            )
303            return error_handler
304        else:
305            return None

Used to initialize the adapter to avoid breaking changes. If the stream implements a should_retry method, we know it uses the old (pre-HttpClient) error handlers, and thus an adapter is needed.

Override to provide custom ErrorHandler

Returns
@classmethod
def parse_response_error_message(cls, response: requests.models.Response) -> Optional[str]:
311    @classmethod
312    def parse_response_error_message(cls, response: requests.Response) -> Optional[str]:
313        """
314        Parses the raw response object from a failed request into a user-friendly error message.
315        By default, this method tries to grab the error message from JSON responses by following common API patterns. Override to parse differently.
316
317        :param response:
318        :return: A user-friendly message that indicates the cause of the error
319        """
320
321        # default logic to grab error from common fields
322        def _try_get_error(value: Optional[JsonType]) -> Optional[str]:
323            if isinstance(value, str):
324                return value
325            elif isinstance(value, list):
326                errors_in_value = [_try_get_error(v) for v in value]
327                return ", ".join(v for v in errors_in_value if v is not None)
328            elif isinstance(value, dict):
329                new_value = (
330                    value.get("message")
331                    or value.get("messages")
332                    or value.get("error")
333                    or value.get("errors")
334                    or value.get("failures")
335                    or value.get("failure")
336                    or value.get("detail")
337                )
338                return _try_get_error(new_value)
339            return None
340
341        try:
342            body = response.json()
343            return _try_get_error(body)
344        except requests.exceptions.JSONDecodeError:
345            return None

Parses the raw response object from a failed request into a user-friendly error message. By default, this method tries to grab the error message from JSON responses by following common API patterns. Override to parse differently.

Parameters
  • response:
Returns

A user-friendly message that indicates the cause of the error
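
For an API whose error payload does not use any of the common fields above, the method can be overridden; the "problem"/"description" envelope below is purely hypothetical:

from typing import Optional

import requests

from airbyte_cdk.sources.streams.http import HttpStream


class ProblemApiStream(HttpStream):
    # Other members are omitted for brevity.

    @classmethod
    def parse_response_error_message(cls, response: requests.Response) -> Optional[str]:
        try:
            body = response.json()
        except requests.exceptions.JSONDecodeError:
            return None
        # Assumes an error body shaped like {"problem": {"description": "..."}}.
        if isinstance(body, dict) and isinstance(body.get("problem"), dict):
            return body["problem"].get("description")
        return None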

def get_error_display_message(self, exception: BaseException) -> Optional[str]:
347    def get_error_display_message(self, exception: BaseException) -> Optional[str]:
348        """
349        Retrieves the user-friendly display message that corresponds to an exception.
350        This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage.
351
352        The default implementation of this method only handles HTTPErrors by passing the response to self.parse_response_error_message().
353        The method should be overriden as needed to handle any additional exception types.
354
355        :param exception: The exception that was raised
356        :return: A user-friendly message that indicates the cause of the error
357        """
358        if isinstance(exception, requests.HTTPError) and exception.response is not None:
359            return self.parse_response_error_message(exception.response)
360        return None

Retrieves the user-friendly display message that corresponds to an exception. This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage.

The default implementation of this method only handles HTTPErrors by passing the response to self.parse_response_error_message(). The method should be overridden as needed to handle any additional exception types.

Parameters
  • exception: The exception that was raised
Returns

A user-friendly message that indicates the cause of the error

def read_records( self, sync_mode: airbyte_protocol_dataclasses.models.airbyte_protocol.SyncMode, cursor_field: Optional[List[str]] = None, stream_slice: Optional[Mapping[str, Any]] = None, stream_state: Optional[Mapping[str, Any]] = None) -> Iterable[Union[Mapping[str, Any], AirbyteMessage]]:
362    def read_records(
363        self,
364        sync_mode: SyncMode,
365        cursor_field: Optional[List[str]] = None,
366        stream_slice: Optional[Mapping[str, Any]] = None,
367        stream_state: Optional[Mapping[str, Any]] = None,
368    ) -> Iterable[StreamData]:
369        # A cursor_field indicates this is an incremental stream which offers better checkpointing than RFR enabled via the cursor
370        if self.cursor_field or not isinstance(self.get_cursor(), ResumableFullRefreshCursor):
371            yield from self._read_pages(
372                lambda req, res, state, _slice: self.parse_response(
373                    res, stream_slice=_slice, stream_state=state
374                ),
375                stream_slice,
376                stream_state,
377            )
378        else:
379            yield from self._read_single_page(
380                lambda req, res, state, _slice: self.parse_response(
381                    res, stream_slice=_slice, stream_state=state
382                ),
383                stream_slice,
384                stream_state,
385            )

This method should be overridden by subclasses to read records based on the inputs.

state: MutableMapping[str, Any]
387    @property
388    def state(self) -> MutableMapping[str, Any]:
389        cursor = self.get_cursor()
390        if cursor:
391            return cursor.get_stream_state()  # type: ignore
392        return self._state

State getter; it should return state in a form that can be serialized to a string and sent to the output as a STATE AirbyteMessage.

A good example of a state is a cursor_value: { self.cursor_field: "cursor_value" }

State should be as small as possible while still being descriptive enough to restore the syncing process from the point where it stopped.

def get_cursor(self) -> Optional[airbyte_cdk.sources.streams.checkpoint.Cursor]:
401    def get_cursor(self) -> Optional[Cursor]:
402        # I don't love that this is semi-stateful but not sure what else to do. We don't know exactly what type of cursor to
403        # instantiate when creating the class. We can make a few assumptions like if there is a cursor_field which implies
404        # incremental, but we don't know until runtime if this is a substream. Ideally, a stream should explicitly define
405        # its cursor, but because we're trying to automatically apply RFR we're stuck with this logic where we replace the
406        # cursor at runtime once we detect this is a substream based on self.has_multiple_slices being reassigned
407        if self.has_multiple_slices and isinstance(self.cursor, ResumableFullRefreshCursor):
408            self.cursor = SubstreamResumableFullRefreshCursor()
409            return self.cursor
410        else:
411            return self.cursor

A Cursor is an interface that a stream can implement to manage how its internal state is read and updated while reading records. Historically, Python connectors had no concept of a cursor to manage state. Python streams need to define a cursor implementation and override this method to manage state through a Cursor.

def get_log_formatter(self) -> Optional[Callable[[requests.models.Response], Any]]:
554    def get_log_formatter(self) -> Optional[Callable[[requests.Response], Any]]:
555        """
556
557        :return Optional[Callable[[requests.Response], Any]]: Function that will be used in logging inside HttpClient
558        """
559        return None
Returns

Function that will be used in logging inside HttpClient

class HttpSubStream(airbyte_cdk.HttpStream, abc.ABC):
562class HttpSubStream(HttpStream, ABC):
563    def __init__(self, parent: HttpStream, **kwargs: Any):
564        """
565        :param parent: should be the instance of HttpStream class
566        """
567        super().__init__(**kwargs)
568        self.parent = parent
569        self.has_multiple_slices = (
570            True  # Substreams are based on parent records which implies there are multiple slices
571        )
572
573        # There are three conditions that dictate if RFR should automatically be applied to a stream
574        # 1. Streams that explicitly initialize their own cursor should defer to it and not automatically apply RFR
575        # 2. Streams with at least one cursor_field are incremental and thus a superior sync to RFR.
576        # 3. Streams overriding read_records() do not guarantee that they will call the parent implementation which can perform
577        #    per-page checkpointing so RFR is only supported if a stream use the default `HttpStream.read_records()` method
578        if (
579            not self.cursor
580            and len(self.cursor_field) == 0
581            and type(self).read_records is HttpStream.read_records
582        ):
583            self.cursor = SubstreamResumableFullRefreshCursor()
584
585    def stream_slices(
586        self,
587        sync_mode: SyncMode,
588        cursor_field: Optional[List[str]] = None,
589        stream_state: Optional[Mapping[str, Any]] = None,
590    ) -> Iterable[Optional[Mapping[str, Any]]]:
591        # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does
592        # not support either substreams or RFR, but something that needs to be considered once we do
593        for parent_record in self.parent.read_only_records(stream_state):
594            # Skip non-records (eg AirbyteLogMessage)
595            if isinstance(parent_record, AirbyteMessage):
596                if parent_record.type == MessageType.RECORD:
597                    parent_record = parent_record.record.data  # type: ignore [assignment, union-attr]  # Incorrect type for assignment
598                else:
599                    continue
600            elif isinstance(parent_record, Record):
601                parent_record = parent_record.data
602            yield {"parent": parent_record}

Base abstract class for an Airbyte Stream using the HTTP protocol. It is the basic building block for building an Airbyte source for an HTTP API.

HttpSubStream( parent: HttpStream, **kwargs: Any)
563    def __init__(self, parent: HttpStream, **kwargs: Any):
564        """
565        :param parent: should be the instance of HttpStream class
566        """
567        super().__init__(**kwargs)
568        self.parent = parent
569        self.has_multiple_slices = (
570            True  # Substreams are based on parent records which implies there are multiple slices
571        )
572
573        # There are three conditions that dictate if RFR should automatically be applied to a stream
574        # 1. Streams that explicitly initialize their own cursor should defer to it and not automatically apply RFR
575        # 2. Streams with at least one cursor_field are incremental and thus a superior sync to RFR.
576        # 3. Streams overriding read_records() do not guarantee that they will call the parent implementation which can perform
577        #    per-page checkpointing so RFR is only supported if a stream use the default `HttpStream.read_records()` method
578        if (
579            not self.cursor
580            and len(self.cursor_field) == 0
581            and type(self).read_records is HttpStream.read_records
582        ):
583            self.cursor = SubstreamResumableFullRefreshCursor()
Parameters
  • parent: should be an instance of the HttpStream class
parent
has_multiple_slices = False
def stream_slices( self, sync_mode: airbyte_protocol_dataclasses.models.airbyte_protocol.SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None) -> Iterable[Optional[Mapping[str, Any]]]:
585    def stream_slices(
586        self,
587        sync_mode: SyncMode,
588        cursor_field: Optional[List[str]] = None,
589        stream_state: Optional[Mapping[str, Any]] = None,
590    ) -> Iterable[Optional[Mapping[str, Any]]]:
591        # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does
592        # not support either substreams or RFR, but something that needs to be considered once we do
593        for parent_record in self.parent.read_only_records(stream_state):
594            # Skip non-records (eg AirbyteLogMessage)
595            if isinstance(parent_record, AirbyteMessage):
596                if parent_record.type == MessageType.RECORD:
597                    parent_record = parent_record.record.data  # type: ignore [assignment, union-attr]  # Incorrect type for assignment
598                else:
599                    continue
600            elif isinstance(parent_record, Record):
601                parent_record = parent_record.data
602            yield {"parent": parent_record}

Override to define the slices for this stream. See the stream slicing section of the docs for more information.

Parameters
  • sync_mode:
  • cursor_field:
  • stream_state:
Returns
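
As a hypothetical sketch (host, endpoints, and field names are assumptions), a child stream can use the parent record embedded in each slice to build its request path:

from typing import Any, Iterable, Mapping, Optional

import requests

from airbyte_cdk.sources.streams.http import HttpSubStream


class EmployeeNotes(HttpSubStream):
    """Hypothetical child stream that issues one request per parent employee record."""

    url_base = "https://api.example.com/v1/"
    primary_key = "id"

    def path(
        self,
        *,
        stream_state: Optional[Mapping[str, Any]] = None,
        stream_slice: Optional[Mapping[str, Any]] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> str:
        # stream_slices() wraps each parent record under the "parent" key.
        return f"employees/{stream_slice['parent']['id']}/notes"

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        return None

    def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable[Mapping[str, Any]]:
        yield from response.json().get("notes", [])

The parent stream instance is passed through the parent constructor argument, e.g. EmployeeNotes(parent=Employees()), where Employees is a hypothetical parent HttpStream.
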
class LimiterSession(airbyte_cdk.sources.streams.call_rate.LimiterMixin, requests.sessions.Session):
700class LimiterSession(LimiterMixin, requests.Session):
701    """Session that adds rate-limiting behavior to requests."""

Session that adds rate-limiting behavior to requests.

class MovingWindowCallRatePolicy(airbyte_cdk.sources.streams.call_rate.BaseCallRatePolicy):
396class MovingWindowCallRatePolicy(BaseCallRatePolicy):
397    """
398    Policy to control requests rate implemented on top of PyRateLimiter lib.
399    The main difference between this policy and FixedWindowCallRatePolicy is that the rate-limiting window
400    is moving along requests that we made, and there is no moment when we reset an available number of calls.
401    This strategy requires saving of timestamps of all requests within a window.
402    """
403
404    def __init__(self, rates: list[Rate], matchers: list[RequestMatcher]):
405        """Constructor
406
407        :param rates: list of rates, the order is important and must be ascending
408        :param matchers:
409        """
410        if not rates:
411            raise ValueError("The list of rates can not be empty")
412        pyrate_rates = [
413            PyRateRate(limit=rate.limit, interval=int(rate.interval.total_seconds() * 1000))
414            for rate in rates
415        ]
416        self._bucket = InMemoryBucket(pyrate_rates)
417        # Limiter will create the background task that clears old requests in the bucket
418        self._limiter = Limiter(self._bucket)
419        super().__init__(matchers=matchers)
420
421    def try_acquire(self, request: Any, weight: int) -> None:
422        if not self.matches(request):
423            raise ValueError("Request does not match the policy")
424
425        try:
426            self._limiter.try_acquire(request, weight=weight)
427        except BucketFullException as exc:
428            item = self._limiter.bucket_factory.wrap_item(request, weight)
429            assert isinstance(item, RateItem)
430
431            with self._limiter.lock:
432                time_to_wait = self._bucket.waiting(item)
433                assert isinstance(time_to_wait, int)
434
435                raise CallRateLimitHit(
436                    error=str(exc.meta_info["error"]),
437                    item=request,
438                    weight=int(exc.meta_info["weight"]),
439                    rate=str(exc.meta_info["rate"]),
440                    time_to_wait=timedelta(milliseconds=time_to_wait),
441                )
442
443    def update(
444        self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]
445    ) -> None:
446        """Adjust call bucket to reflect the state of the API server
447
448        :param available_calls:
449        :param call_reset_ts:
450        :return:
451        """
452        if (
453            available_calls is not None and call_reset_ts is None
454        ):  # we do our best to sync buckets with API
455            if available_calls == 0:
456                with self._limiter.lock:
457                    items_to_add = self._bucket.count() < self._bucket.rates[0].limit
458                    if items_to_add > 0:
459                        now: int = TimeClock().now()  # type: ignore[no-untyped-call]
460                        self._bucket.put(RateItem(name="dummy", timestamp=now, weight=items_to_add))
461        # TODO: add support if needed, it might be that it is not possible to make a good solution for this case
462        # if available_calls is not None and call_reset_ts is not None:
463        #     ts = call_reset_ts.timestamp()
464
465    def __str__(self) -> str:
466        """Return a human-friendly description of the moving window rate policy for logging purposes."""
467        rates_info = ", ".join(
468            f"{rate.limit} per {timedelta(milliseconds=rate.interval)}"
469            for rate in self._bucket.rates
470        )
471        current_bucket_count = self._bucket.count()
472        matcher_str = ", ".join(f"{matcher}" for matcher in self._matchers)
473        return (
474            f"MovingWindowCallRatePolicy(rates=[{rates_info}], current_bucket_count={current_bucket_count}, "
475            f"matchers=[{matcher_str}])"
476        )

Policy to control the request rate, implemented on top of the PyRateLimiter library. The main difference between this policy and FixedWindowCallRatePolicy is that the rate-limiting window moves along with the requests being made, so there is no moment at which the number of available calls is reset. This strategy requires saving the timestamps of all requests within a window.
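
For illustration, a policy allowing at most 100 calls per minute might be constructed as below; the HttpRequestMatcher arguments are an assumption about that matcher's interface, and any RequestMatcher implementation from the call_rate module can be substituted.

from datetime import timedelta

from airbyte_cdk.sources.streams.call_rate import (
    HttpRequestMatcher,
    MovingWindowCallRatePolicy,
    Rate,
)

# Allow at most 100 calls per minute to a hypothetical endpoint.
policy = MovingWindowCallRatePolicy(
    rates=[Rate(limit=100, interval=timedelta(minutes=1))],
    matchers=[HttpRequestMatcher(url="https://api.example.com/v1/employees")],
)

Such a policy is typically wrapped in an APIBudget from the same call_rate module and handed to the HTTP layer; check the budget classes there for the exact wiring in your CDK version.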

MovingWindowCallRatePolicy( rates: list[Rate], matchers: list[airbyte_cdk.sources.streams.call_rate.RequestMatcher])
404    def __init__(self, rates: list[Rate], matchers: list[RequestMatcher]):
405        """Constructor
406
407        :param rates: list of rates, the order is important and must be ascending
408        :param matchers:
409        """
410        if not rates:
411            raise ValueError("The list of rates can not be empty")
412        pyrate_rates = [
413            PyRateRate(limit=rate.limit, interval=int(rate.interval.total_seconds() * 1000))
414            for rate in rates
415        ]
416        self._bucket = InMemoryBucket(pyrate_rates)
417        # Limiter will create the background task that clears old requests in the bucket
418        self._limiter = Limiter(self._bucket)
419        super().__init__(matchers=matchers)

Constructor

Parameters
  • rates: list of rates, the order is important and must be ascending
  • matchers:
def try_acquire(self, request: Any, weight: int) -> None:
421    def try_acquire(self, request: Any, weight: int) -> None:
422        if not self.matches(request):
423            raise ValueError("Request does not match the policy")
424
425        try:
426            self._limiter.try_acquire(request, weight=weight)
427        except BucketFullException as exc:
428            item = self._limiter.bucket_factory.wrap_item(request, weight)
429            assert isinstance(item, RateItem)
430
431            with self._limiter.lock:
432                time_to_wait = self._bucket.waiting(item)
433                assert isinstance(time_to_wait, int)
434
435                raise CallRateLimitHit(
436                    error=str(exc.meta_info["error"]),
437                    item=request,
438                    weight=int(exc.meta_info["weight"]),
439                    rate=str(exc.meta_info["rate"]),
440                    time_to_wait=timedelta(milliseconds=time_to_wait),
441                )

Try to acquire request

Parameters
  • request: a request object representing a single call to API
  • weight: number of requests to deduct from credit
Returns
def update( self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]) -> None:
443    def update(
444        self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]
445    ) -> None:
446        """Adjust call bucket to reflect the state of the API server
447
448        :param available_calls:
449        :param call_reset_ts:
450        :return:
451        """
452        if (
453            available_calls is not None and call_reset_ts is None
454        ):  # we do our best to sync buckets with API
455            if available_calls == 0:
456                with self._limiter.lock:
457                    items_to_add = self._bucket.count() < self._bucket.rates[0].limit
458                    if items_to_add > 0:
459                        now: int = TimeClock().now()  # type: ignore[no-untyped-call]
460                        self._bucket.put(RateItem(name="dummy", timestamp=now, weight=items_to_add))
461        # TODO: add support if needed, it might be that it is not possible to make a good solution for this case
462        # if available_calls is not None and call_reset_ts is not None:
463        #     ts = call_reset_ts.timestamp()

Adjust call bucket to reflect the state of the API server

Parameters
  • available_calls:
  • call_reset_ts:
Returns
MultipleTokenAuthenticator

class Oauth2Authenticator(airbyte_cdk.sources.streams.http.requests_native_auth.abstract_oauth.AbstractOauth2Authenticator):
 26class Oauth2Authenticator(AbstractOauth2Authenticator):
 27    """
 28    Generates OAuth2.0 access tokens from an OAuth2.0 refresh token and client credentials.
 29    The generated access token is attached to each request via the Authorization header.
 30    If a connector_config is provided any mutation of it's value in the scope of this class will emit AirbyteControlConnectorConfigMessage.
 31    """
 32
 33    def __init__(
 34        self,
 35        token_refresh_endpoint: str,
 36        client_id: str,
 37        client_secret: str,
 38        refresh_token: str,
 39        client_id_name: str = "client_id",
 40        client_secret_name: str = "client_secret",
 41        refresh_token_name: str = "refresh_token",
 42        scopes: List[str] | None = None,
 43        token_expiry_date: AirbyteDateTime | None = None,
 44        token_expiry_date_format: str | None = None,
 45        access_token_name: str = "access_token",
 46        expires_in_name: str = "expires_in",
 47        refresh_request_body: Mapping[str, Any] | None = None,
 48        refresh_request_headers: Mapping[str, Any] | None = None,
 49        grant_type_name: str = "grant_type",
 50        grant_type: str = "refresh_token",
 51        token_expiry_is_time_of_expiration: bool = False,
 52        refresh_token_error_status_codes: Tuple[int, ...] = (),
 53        refresh_token_error_key: str = "",
 54        refresh_token_error_values: Tuple[str, ...] = (),
 55    ) -> None:
 56        self._token_refresh_endpoint = token_refresh_endpoint
 57        self._client_secret_name = client_secret_name
 58        self._client_secret = client_secret
 59        self._client_id_name = client_id_name
 60        self._client_id = client_id
 61        self._refresh_token_name = refresh_token_name
 62        self._refresh_token = refresh_token
 63        self._scopes = scopes
 64        self._access_token_name = access_token_name
 65        self._expires_in_name = expires_in_name
 66        self._refresh_request_body = refresh_request_body
 67        self._refresh_request_headers = refresh_request_headers
 68        self._grant_type_name = grant_type_name
 69        self._grant_type = grant_type
 70
 71        self._token_expiry_date = token_expiry_date or (ab_datetime_now() - timedelta(days=1))
 72        self._token_expiry_date_format = token_expiry_date_format
 73        self._token_expiry_is_time_of_expiration = token_expiry_is_time_of_expiration
 74        self._access_token = None
 75        super().__init__(
 76            refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values
 77        )
 78
 79    def get_token_refresh_endpoint(self) -> str:
 80        return self._token_refresh_endpoint
 81
 82    def get_client_id_name(self) -> str:
 83        return self._client_id_name
 84
 85    def get_client_id(self) -> str:
 86        return self._client_id
 87
 88    def get_client_secret_name(self) -> str:
 89        return self._client_secret_name
 90
 91    def get_client_secret(self) -> str:
 92        return self._client_secret
 93
 94    def get_refresh_token_name(self) -> str:
 95        return self._refresh_token_name
 96
 97    def get_refresh_token(self) -> str:
 98        return self._refresh_token
 99
100    def get_access_token_name(self) -> str:
101        return self._access_token_name
102
103    def get_scopes(self) -> list[str]:
104        return self._scopes  # type: ignore[return-value]
105
106    def get_expires_in_name(self) -> str:
107        return self._expires_in_name
108
109    def get_refresh_request_body(self) -> Mapping[str, Any]:
110        return self._refresh_request_body  # type: ignore[return-value]
111
112    def get_refresh_request_headers(self) -> Mapping[str, Any]:
113        return self._refresh_request_headers  # type: ignore[return-value]
114
115    def get_grant_type_name(self) -> str:
116        return self._grant_type_name
117
118    def get_grant_type(self) -> str:
119        return self._grant_type
120
121    def get_token_expiry_date(self) -> AirbyteDateTime:
122        return self._token_expiry_date
123
124    def set_token_expiry_date(self, value: Union[str, int]) -> None:
125        self._token_expiry_date = self._parse_token_expiration_date(value)
126
127    @property
128    def token_expiry_is_time_of_expiration(self) -> bool:
129        return self._token_expiry_is_time_of_expiration
130
131    @property
132    def token_expiry_date_format(self) -> Optional[str]:
133        return self._token_expiry_date_format
134
135    @property
136    def access_token(self) -> str:
137        return self._access_token  # type: ignore[return-value]
138
139    @access_token.setter
140    def access_token(self, value: str) -> None:
141        self._access_token = value  # type: ignore[assignment]  # Incorrect type for assignment

Generates OAuth2.0 access tokens from an OAuth2.0 refresh token and client credentials. The generated access token is attached to each request via the Authorization header. If a connector_config is provided, any mutation of its value within the scope of this class will emit an AirbyteControlConnectorConfigMessage.
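
A minimal sketch of constructing the authenticator; the endpoint and credential values are placeholders, and the commented-out stream is hypothetical:

from airbyte_cdk.sources.streams.http.requests_native_auth import Oauth2Authenticator

authenticator = Oauth2Authenticator(
    token_refresh_endpoint="https://api.example.com/oauth/token",
    client_id="<client_id>",
    client_secret="<client_secret>",
    refresh_token="<refresh_token>",
)

# The authenticator is then passed to an HttpStream, which attaches the refreshed
# access token to outgoing requests via the Authorization header:
# stream = CustomersStream(authenticator=authenticator)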

Oauth2Authenticator( token_refresh_endpoint: str, client_id: str, client_secret: str, refresh_token: str, client_id_name: str = 'client_id', client_secret_name: str = 'client_secret', refresh_token_name: str = 'refresh_token', scopes: Optional[List[str]] = None, token_expiry_date: airbyte_cdk.utils.datetime_helpers.AirbyteDateTime | None = None, token_expiry_date_format: str | None = None, access_token_name: str = 'access_token', expires_in_name: str = 'expires_in', refresh_request_body: Optional[Mapping[str, Any]] = None, refresh_request_headers: Optional[Mapping[str, Any]] = None, grant_type_name: str = 'grant_type', grant_type: str = 'refresh_token', token_expiry_is_time_of_expiration: bool = False, refresh_token_error_status_codes: Tuple[int, ...] = (), refresh_token_error_key: str = '', refresh_token_error_values: Tuple[str, ...] = ())
33    def __init__(
34        self,
35        token_refresh_endpoint: str,
36        client_id: str,
37        client_secret: str,
38        refresh_token: str,
39        client_id_name: str = "client_id",
40        client_secret_name: str = "client_secret",
41        refresh_token_name: str = "refresh_token",
42        scopes: List[str] | None = None,
43        token_expiry_date: AirbyteDateTime | None = None,
44        token_expiry_date_format: str | None = None,
45        access_token_name: str = "access_token",
46        expires_in_name: str = "expires_in",
47        refresh_request_body: Mapping[str, Any] | None = None,
48        refresh_request_headers: Mapping[str, Any] | None = None,
49        grant_type_name: str = "grant_type",
50        grant_type: str = "refresh_token",
51        token_expiry_is_time_of_expiration: bool = False,
52        refresh_token_error_status_codes: Tuple[int, ...] = (),
53        refresh_token_error_key: str = "",
54        refresh_token_error_values: Tuple[str, ...] = (),
55    ) -> None:
56        self._token_refresh_endpoint = token_refresh_endpoint
57        self._client_secret_name = client_secret_name
58        self._client_secret = client_secret
59        self._client_id_name = client_id_name
60        self._client_id = client_id
61        self._refresh_token_name = refresh_token_name
62        self._refresh_token = refresh_token
63        self._scopes = scopes
64        self._access_token_name = access_token_name
65        self._expires_in_name = expires_in_name
66        self._refresh_request_body = refresh_request_body
67        self._refresh_request_headers = refresh_request_headers
68        self._grant_type_name = grant_type_name
69        self._grant_type = grant_type
70
71        self._token_expiry_date = token_expiry_date or (ab_datetime_now() - timedelta(days=1))
72        self._token_expiry_date_format = token_expiry_date_format
73        self._token_expiry_is_time_of_expiration = token_expiry_is_time_of_expiration
74        self._access_token = None
75        super().__init__(
76            refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values
77        )

If all of refresh_token_error_status_codes, refresh_token_error_key, and refresh_token_error_values are set, then HTTP errors matching those parameters will be wrapped in an AirbyteTracedException.

def get_token_refresh_endpoint(self) -> str:
79    def get_token_refresh_endpoint(self) -> str:
80        return self._token_refresh_endpoint

Returns the endpoint to refresh the access token

def get_client_id_name(self) -> str:
82    def get_client_id_name(self) -> str:
83        return self._client_id_name

The client id name to authenticate

def get_client_id(self) -> str:
85    def get_client_id(self) -> str:
86        return self._client_id

The client id to authenticate

def get_client_secret_name(self) -> str:
88    def get_client_secret_name(self) -> str:
89        return self._client_secret_name

The client secret name to authenticate

def get_client_secret(self) -> str:
91    def get_client_secret(self) -> str:
92        return self._client_secret

The client secret to authenticate

def get_refresh_token_name(self) -> str:
94    def get_refresh_token_name(self) -> str:
95        return self._refresh_token_name

The refresh token name to authenticate

def get_refresh_token(self) -> str:
97    def get_refresh_token(self) -> str:
98        return self._refresh_token

The token used to refresh the access token when it expires

def get_access_token_name(self) -> str:
100    def get_access_token_name(self) -> str:
101        return self._access_token_name

Field to extract access token from in the response

def get_scopes(self) -> list[str]:
103    def get_scopes(self) -> list[str]:
104        return self._scopes  # type: ignore[return-value]

List of requested scopes

def get_expires_in_name(self) -> str:
106    def get_expires_in_name(self) -> str:
107        return self._expires_in_name

Returns the expires_in field name

def get_refresh_request_body(self) -> Mapping[str, Any]:
109    def get_refresh_request_body(self) -> Mapping[str, Any]:
110        return self._refresh_request_body  # type: ignore[return-value]

Returns the request body to set on the refresh request

def get_refresh_request_headers(self) -> Mapping[str, Any]:
112    def get_refresh_request_headers(self) -> Mapping[str, Any]:
113        return self._refresh_request_headers  # type: ignore[return-value]

Returns the request headers to set on the refresh request

def get_grant_type_name(self) -> str:
115    def get_grant_type_name(self) -> str:
116        return self._grant_type_name

Returns grant_type specified name for requesting access_token

def get_grant_type(self) -> str:
118    def get_grant_type(self) -> str:
119        return self._grant_type

Returns grant_type specified for requesting access_token

def get_token_expiry_date(self) -> airbyte_cdk.utils.datetime_helpers.AirbyteDateTime:
121    def get_token_expiry_date(self) -> AirbyteDateTime:
122        return self._token_expiry_date

Expiration date of the access token

def set_token_expiry_date(self, value: Union[str, int]) -> None:
124    def set_token_expiry_date(self, value: Union[str, int]) -> None:
125        self._token_expiry_date = self._parse_token_expiration_date(value)

Setter for access token expiration date

token_expiry_is_time_of_expiration: bool
127    @property
128    def token_expiry_is_time_of_expiration(self) -> bool:
129        return self._token_expiry_is_time_of_expiration

Indicates that the token expiry value is the date until which the token will be valid, not the amount of time it will be valid for.

token_expiry_date_format: Optional[str]
131    @property
132    def token_expiry_date_format(self) -> Optional[str]:
133        return self._token_expiry_date_format

Format of the datetime; provide it if expires_in is returned as the expiration datetime instead of the number of seconds until expiration.

access_token: str
135    @property
136    def access_token(self) -> str:
137        return self._access_token  # type: ignore[return-value]

Returns the access token

@dataclasses.dataclass
class Rate:
33@dataclasses.dataclass
34class Rate:
35    """Call rate limit"""
36
37    limit: int
38    interval: timedelta

Call rate limit

Rate(limit: int, interval: datetime.timedelta)
limit: int
interval: datetime.timedelta
class SingleUseRefreshTokenOauth2Authenticator(airbyte_cdk.Oauth2Authenticator):
144class SingleUseRefreshTokenOauth2Authenticator(Oauth2Authenticator):
145    """
146    Authenticator that should be used for API implementing single use refresh tokens:
147    when refreshing access token some API returns a new refresh token that needs to used in the next refresh flow.
148    This authenticator updates the configuration with new refresh token by emitting Airbyte control message from an observed mutation.
149    By default, this authenticator expects a connector config with a "credentials" field with the following nested fields: client_id,
150    client_secret, refresh_token. This behavior can be changed by defining custom config path (using dpath paths) in client_id_config_path,
151    client_secret_config_path, refresh_token_config_path constructor arguments.
152    """
153
154    def __init__(
155        self,
156        connector_config: Mapping[str, Any],
157        token_refresh_endpoint: str,
158        scopes: List[str] | None = None,
159        access_token_name: str = "access_token",
160        expires_in_name: str = "expires_in",
161        refresh_token_name: str = "refresh_token",
162        refresh_request_body: Mapping[str, Any] | None = None,
163        refresh_request_headers: Mapping[str, Any] | None = None,
164        grant_type_name: str = "grant_type",
165        grant_type: str = "refresh_token",
166        client_id_name: str = "client_id",
167        client_id: Optional[str] = None,
168        client_secret_name: str = "client_secret",
169        client_secret: Optional[str] = None,
170        access_token_config_path: Sequence[str] = ("credentials", "access_token"),
171        refresh_token_config_path: Sequence[str] = ("credentials", "refresh_token"),
172        token_expiry_date_config_path: Sequence[str] = ("credentials", "token_expiry_date"),
173        token_expiry_date_format: Optional[str] = None,
174        message_repository: MessageRepository = NoopMessageRepository(),
175        token_expiry_is_time_of_expiration: bool = False,
176        refresh_token_error_status_codes: Tuple[int, ...] = (),
177        refresh_token_error_key: str = "",
178        refresh_token_error_values: Tuple[str, ...] = (),
179    ) -> None:
180        """
181        Args:
182            connector_config (Mapping[str, Any]): The full connector configuration
183            token_refresh_endpoint (str): Full URL to the token refresh endpoint
184            scopes (List[str], optional): List of OAuth scopes to pass in the refresh token request body. Defaults to None.
185            access_token_name (str, optional): Name of the access token field, used to parse the refresh token response. Defaults to "access_token".
186            expires_in_name (str, optional): Name of the name of the field that characterizes when the current access token will expire, used to parse the refresh token response. Defaults to "expires_in".
187            refresh_token_name (str, optional): Name of the name of the refresh token field, used to parse the refresh token response. Defaults to "refresh_token".
188            refresh_request_body (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request body. Defaults to None.
189            refresh_request_headers (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request headers. Defaults to None.
190            grant_type (str, optional): OAuth grant type. Defaults to "refresh_token".
191            client_id (Optional[str]): The client id to authenticate. If not specified, defaults to credentials.client_id in the config object.
192            client_secret (Optional[str]): The client secret to authenticate. If not specified, defaults to credentials.client_secret in the config object.
193            access_token_config_path (Sequence[str]): Dpath to the access_token field in the connector configuration. Defaults to ("credentials", "access_token").
194            refresh_token_config_path (Sequence[str]): Dpath to the refresh_token field in the connector configuration. Defaults to ("credentials", "refresh_token").
195            token_expiry_date_config_path (Sequence[str]): Dpath to the token_expiry_date field in the connector configuration. Defaults to ("credentials", "token_expiry_date").
196            token_expiry_date_format (Optional[str]): Date format of the token expiry date field (set by expires_in_name). If not specified the token expiry date is interpreted as number of seconds until expiration.
197            token_expiry_is_time_of_expiration bool: set True it if expires_in is returned as time of expiration instead of the number seconds until expiration
198            message_repository (MessageRepository): the message repository used to emit logs on HTTP requests and control message on config update
199        """
200        self._connector_config = connector_config
201        self._client_id: str = self._get_config_value_by_path(
202            ("credentials", "client_id"), client_id
203        )
204        self._client_secret: str = self._get_config_value_by_path(
205            ("credentials", "client_secret"), client_secret
206        )
207        self._client_id_name = client_id_name
208        self._client_secret_name = client_secret_name
209        self._access_token_config_path = access_token_config_path
210        self._refresh_token_config_path = refresh_token_config_path
211        self._token_expiry_date_config_path = token_expiry_date_config_path
212        self._token_expiry_date_format = token_expiry_date_format
213        self._refresh_token_name = refresh_token_name
214        self._grant_type_name = grant_type_name
215        self._connector_config = connector_config
216        self.__message_repository = message_repository
217        super().__init__(
218            token_refresh_endpoint=token_refresh_endpoint,
219            client_id_name=self._client_id_name,
220            client_id=self._client_id,
221            client_secret_name=self._client_secret_name,
222            client_secret=self._client_secret,
223            refresh_token=self.get_refresh_token(),
224            refresh_token_name=self._refresh_token_name,
225            scopes=scopes,
226            token_expiry_date=self.get_token_expiry_date(),
227            access_token_name=access_token_name,
228            expires_in_name=expires_in_name,
229            refresh_request_body=refresh_request_body,
230            refresh_request_headers=refresh_request_headers,
231            grant_type_name=self._grant_type_name,
232            grant_type=grant_type,
233            token_expiry_date_format=token_expiry_date_format,
234            token_expiry_is_time_of_expiration=token_expiry_is_time_of_expiration,
235            refresh_token_error_status_codes=refresh_token_error_status_codes,
236            refresh_token_error_key=refresh_token_error_key,
237            refresh_token_error_values=refresh_token_error_values,
238        )
239
240    @property
241    def access_token(self) -> str:
242        """
243        Retrieve the access token from the configuration.
244
245        Returns:
246            str: The access token.
247        """
248        return self._get_config_value_by_path(self._access_token_config_path)  # type: ignore[return-value]
249
250    @access_token.setter
251    def access_token(self, new_access_token: str) -> None:
252        """
253        Sets a new access token.
254
255        Args:
256            new_access_token (str): The new access token to be set.
257        """
258        self._set_config_value_by_path(self._access_token_config_path, new_access_token)
259
260    def get_refresh_token(self) -> str:
261        """
262        Retrieve the refresh token from the configuration.
263
264        This method fetches the refresh token using the configuration path specified
265        by `_refresh_token_config_path`.
266
267        Returns:
268            str: The refresh token as a string.
269        """
270        return self._get_config_value_by_path(self._refresh_token_config_path)  # type: ignore[return-value]
271
272    def set_refresh_token(self, new_refresh_token: str) -> None:
273        """
274        Updates the refresh token in the configuration.
275
276        Args:
277            new_refresh_token (str): The new refresh token to be set.
278        """
279        self._set_config_value_by_path(self._refresh_token_config_path, new_refresh_token)
280
281    def get_token_expiry_date(self) -> AirbyteDateTime:
282        """
283        Retrieves the token expiry date from the configuration.
284
285        This method fetches the token expiry date from the configuration using the specified path.
286        If the expiry date is an empty string, it returns the current date and time minus one day.
287        Otherwise, it parses the expiry date string into an AirbyteDateTime object.
288
289        Returns:
290            AirbyteDateTime: The parsed or calculated token expiry date.
291
292        Raises:
293            TypeError: If the result is not an instance of AirbyteDateTime.
294        """
295        expiry_date = self._get_config_value_by_path(self._token_expiry_date_config_path)
296        result = (
297            ab_datetime_now() - timedelta(days=1)
298            if expiry_date == ""
299            else ab_datetime_parse(str(expiry_date))
300        )
301        if isinstance(result, AirbyteDateTime):
302            return result
303        raise TypeError("Invalid datetime conversion")
304
305    def set_token_expiry_date(self, new_token_expiry_date: AirbyteDateTime) -> None:  # type: ignore[override]
306        """
307        Sets the token expiry date in the configuration.
308
309        Args:
310            new_token_expiry_date (AirbyteDateTime): The new expiry date for the token.
311        """
312        self._set_config_value_by_path(
313            self._token_expiry_date_config_path, str(new_token_expiry_date)
314        )
315
316    def token_has_expired(self) -> bool:
317        """Returns True if the token is expired"""
318        return ab_datetime_now() > self.get_token_expiry_date()
319
320    @staticmethod
321    def get_new_token_expiry_date(
322        access_token_expires_in: str,
323        token_expiry_date_format: str | None = None,
324    ) -> AirbyteDateTime:
325        """
326        Calculate the new token expiry date based on the provided expiration duration or format.
327
328        Args:
329            access_token_expires_in (str): The duration (in seconds) until the access token expires, or the expiry date in a specific format.
330            token_expiry_date_format (str | None, optional): The format of the expiry date if provided. Defaults to None.
331
332        Returns:
333            AirbyteDateTime: The calculated expiry date of the access token.
334        """
335        if token_expiry_date_format:
336            return ab_datetime_parse(access_token_expires_in)
337        else:
338            return ab_datetime_now() + timedelta(seconds=int(access_token_expires_in))
339
340    def get_access_token(self) -> str:
341        """Retrieve new access and refresh token if the access token has expired.
342        The new refresh token is persisted with the set_refresh_token function
343        Returns:
344            str: The current access_token, updated if it was previously expired.
345        """
346        if self.token_has_expired():
347            new_access_token, access_token_expires_in, new_refresh_token = (
348                self.refresh_access_token()
349            )
350            new_token_expiry_date: AirbyteDateTime = self.get_new_token_expiry_date(
351                access_token_expires_in, self._token_expiry_date_format
352            )
353            self.access_token = new_access_token
354            self.set_refresh_token(new_refresh_token)
355            self.set_token_expiry_date(new_token_expiry_date)
356            self._emit_control_message()
357        return self.access_token
358
359    def refresh_access_token(self) -> Tuple[str, str, str]:  # type: ignore[override]
360        """
361        Refreshes the access token by making a handled request and extracting the necessary token information.
362
363        Returns:
364            Tuple[str, str, str]: A tuple containing the new access token, token expiry date, and refresh token.
365        """
366        response_json = self._make_handled_request()
367        return (
368            self._extract_access_token(response_json),
369            self._extract_token_expiry_date(response_json),
370            self._extract_refresh_token(response_json),
371        )
372
373    def _set_config_value_by_path(self, config_path: Union[str, Sequence[str]], value: Any) -> None:
374        """
375        Set a value in the connector configuration at the specified path.
376
377        Args:
378            config_path (Union[str, Sequence[str]]): The path within the configuration where the value should be set.
379                This can be a string representing a single key or a sequence of strings representing a nested path.
380            value (Any): The value to set at the specified path in the configuration.
381
382        Returns:
383            None
384        """
385        dpath.new(self._connector_config, config_path, value)  # type: ignore[arg-type]
386
387    def _get_config_value_by_path(
388        self, config_path: Union[str, Sequence[str]], default: Optional[str] = None
389    ) -> str | Any:
390        """
391        Retrieve a value from the connector configuration using a specified path.
392
393        Args:
394            config_path (Union[str, Sequence[str]]): The path to the desired configuration value. This can be a string or a sequence of strings.
395            default (Optional[str], optional): The default value to return if the specified path does not exist in the configuration. Defaults to None.
396
397        Returns:
398            Any: The value from the configuration at the specified path, or the default value if the path does not exist.
399        """
400        return dpath.get(
401            self._connector_config,  # type: ignore[arg-type]
402            config_path,
403            default=default if default is not None else "",
404        )
405
406    def _emit_control_message(self) -> None:
407        """
408        Emits a control message based on the connector configuration.
409
410        This method checks if the message repository is not a NoopMessageRepository.
411        If it is not, it emits a message using the message repository. Otherwise,
412        it falls back to emitting the configuration as an Airbyte control message
413        directly to the console for backward compatibility.
414
415        Note:
416            The function `emit_configuration_as_airbyte_control_message` has been deprecated
417            in favor of the package `airbyte_cdk.sources.message`.
418
419        Raises:
420            TypeError: If the argument types are incorrect.
421        """
422        # FIXME emit_configuration_as_airbyte_control_message as been deprecated in favor of package airbyte_cdk.sources.message
423        # Usually, a class shouldn't care about the implementation details but to keep backward compatibility where we print the
424        # message directly in the console, this is needed
425        if not isinstance(self._message_repository, NoopMessageRepository):
426            self._message_repository.emit_message(
427                create_connector_config_control_message(self._connector_config)  # type: ignore[arg-type]
428            )
429        else:
430            emit_configuration_as_airbyte_control_message(self._connector_config)  # type: ignore[arg-type]
431
432    @property
433    def _message_repository(self) -> MessageRepository:
434        """
435        Overriding AbstractOauth2Authenticator._message_repository to allow for HTTP request logs
436        """
437        return self.__message_repository

Authenticator that should be used for APIs implementing single-use refresh tokens: when refreshing the access token, some APIs return a new refresh token that needs to be used in the next refresh flow. This authenticator updates the configuration with the new refresh token by emitting an Airbyte control message from an observed mutation. By default, this authenticator expects a connector config with a "credentials" field with the following nested fields: client_id, client_secret, refresh_token. This behavior can be changed by defining custom config paths (using dpath paths) in the client_id_config_path, client_secret_config_path, refresh_token_config_path constructor arguments.

SingleUseRefreshTokenOauth2Authenticator( connector_config: Mapping[str, Any], token_refresh_endpoint: str, scopes: Optional[List[str]] = None, access_token_name: str = 'access_token', expires_in_name: str = 'expires_in', refresh_token_name: str = 'refresh_token', refresh_request_body: Optional[Mapping[str, Any]] = None, refresh_request_headers: Optional[Mapping[str, Any]] = None, grant_type_name: str = 'grant_type', grant_type: str = 'refresh_token', client_id_name: str = 'client_id', client_id: Optional[str] = None, client_secret_name: str = 'client_secret', client_secret: Optional[str] = None, access_token_config_path: Sequence[str] = ('credentials', 'access_token'), refresh_token_config_path: Sequence[str] = ('credentials', 'refresh_token'), token_expiry_date_config_path: Sequence[str] = ('credentials', 'token_expiry_date'), token_expiry_date_format: Optional[str] = None, message_repository: MessageRepository = <airbyte_cdk.sources.message.NoopMessageRepository object>, token_expiry_is_time_of_expiration: bool = False, refresh_token_error_status_codes: Tuple[int, ...] = (), refresh_token_error_key: str = '', refresh_token_error_values: Tuple[str, ...] = ())
154    def __init__(
155        self,
156        connector_config: Mapping[str, Any],
157        token_refresh_endpoint: str,
158        scopes: List[str] | None = None,
159        access_token_name: str = "access_token",
160        expires_in_name: str = "expires_in",
161        refresh_token_name: str = "refresh_token",
162        refresh_request_body: Mapping[str, Any] | None = None,
163        refresh_request_headers: Mapping[str, Any] | None = None,
164        grant_type_name: str = "grant_type",
165        grant_type: str = "refresh_token",
166        client_id_name: str = "client_id",
167        client_id: Optional[str] = None,
168        client_secret_name: str = "client_secret",
169        client_secret: Optional[str] = None,
170        access_token_config_path: Sequence[str] = ("credentials", "access_token"),
171        refresh_token_config_path: Sequence[str] = ("credentials", "refresh_token"),
172        token_expiry_date_config_path: Sequence[str] = ("credentials", "token_expiry_date"),
173        token_expiry_date_format: Optional[str] = None,
174        message_repository: MessageRepository = NoopMessageRepository(),
175        token_expiry_is_time_of_expiration: bool = False,
176        refresh_token_error_status_codes: Tuple[int, ...] = (),
177        refresh_token_error_key: str = "",
178        refresh_token_error_values: Tuple[str, ...] = (),
179    ) -> None:
180        """
181        Args:
182            connector_config (Mapping[str, Any]): The full connector configuration
183            token_refresh_endpoint (str): Full URL to the token refresh endpoint
184            scopes (List[str], optional): List of OAuth scopes to pass in the refresh token request body. Defaults to None.
185            access_token_name (str, optional): Name of the access token field, used to parse the refresh token response. Defaults to "access_token".
186            expires_in_name (str, optional): Name of the name of the field that characterizes when the current access token will expire, used to parse the refresh token response. Defaults to "expires_in".
187            refresh_token_name (str, optional): Name of the name of the refresh token field, used to parse the refresh token response. Defaults to "refresh_token".
188            refresh_request_body (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request body. Defaults to None.
189            refresh_request_headers (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request headers. Defaults to None.
190            grant_type (str, optional): OAuth grant type. Defaults to "refresh_token".
191            client_id (Optional[str]): The client id to authenticate. If not specified, defaults to credentials.client_id in the config object.
192            client_secret (Optional[str]): The client secret to authenticate. If not specified, defaults to credentials.client_secret in the config object.
193            access_token_config_path (Sequence[str]): Dpath to the access_token field in the connector configuration. Defaults to ("credentials", "access_token").
194            refresh_token_config_path (Sequence[str]): Dpath to the refresh_token field in the connector configuration. Defaults to ("credentials", "refresh_token").
195            token_expiry_date_config_path (Sequence[str]): Dpath to the token_expiry_date field in the connector configuration. Defaults to ("credentials", "token_expiry_date").
196            token_expiry_date_format (Optional[str]): Date format of the token expiry date field (set by expires_in_name). If not specified the token expiry date is interpreted as number of seconds until expiration.
197            token_expiry_is_time_of_expiration bool: set True it if expires_in is returned as time of expiration instead of the number seconds until expiration
198            message_repository (MessageRepository): the message repository used to emit logs on HTTP requests and control message on config update
199        """
200        self._connector_config = connector_config
201        self._client_id: str = self._get_config_value_by_path(
202            ("credentials", "client_id"), client_id
203        )
204        self._client_secret: str = self._get_config_value_by_path(
205            ("credentials", "client_secret"), client_secret
206        )
207        self._client_id_name = client_id_name
208        self._client_secret_name = client_secret_name
209        self._access_token_config_path = access_token_config_path
210        self._refresh_token_config_path = refresh_token_config_path
211        self._token_expiry_date_config_path = token_expiry_date_config_path
212        self._token_expiry_date_format = token_expiry_date_format
213        self._refresh_token_name = refresh_token_name
214        self._grant_type_name = grant_type_name
215        self._connector_config = connector_config
216        self.__message_repository = message_repository
217        super().__init__(
218            token_refresh_endpoint=token_refresh_endpoint,
219            client_id_name=self._client_id_name,
220            client_id=self._client_id,
221            client_secret_name=self._client_secret_name,
222            client_secret=self._client_secret,
223            refresh_token=self.get_refresh_token(),
224            refresh_token_name=self._refresh_token_name,
225            scopes=scopes,
226            token_expiry_date=self.get_token_expiry_date(),
227            access_token_name=access_token_name,
228            expires_in_name=expires_in_name,
229            refresh_request_body=refresh_request_body,
230            refresh_request_headers=refresh_request_headers,
231            grant_type_name=self._grant_type_name,
232            grant_type=grant_type,
233            token_expiry_date_format=token_expiry_date_format,
234            token_expiry_is_time_of_expiration=token_expiry_is_time_of_expiration,
235            refresh_token_error_status_codes=refresh_token_error_status_codes,
236            refresh_token_error_key=refresh_token_error_key,
237            refresh_token_error_values=refresh_token_error_values,
238        )
Arguments:
  • connector_config (Mapping[str, Any]): The full connector configuration
  • token_refresh_endpoint (str): Full URL to the token refresh endpoint
  • scopes (List[str], optional): List of OAuth scopes to pass in the refresh token request body. Defaults to None.
  • access_token_name (str, optional): Name of the access token field, used to parse the refresh token response. Defaults to "access_token".
  • expires_in_name (str, optional): Name of the field that indicates when the current access token will expire, used to parse the refresh token response. Defaults to "expires_in".
  • refresh_token_name (str, optional): Name of the refresh token field, used to parse the refresh token response. Defaults to "refresh_token".
  • refresh_request_body (Mapping[str, Any], optional): Custom key-value pairs added to the refresh token request body. Defaults to None.
  • refresh_request_headers (Mapping[str, Any], optional): Custom key-value pairs added to the refresh token request headers. Defaults to None.
  • grant_type (str, optional): OAuth grant type. Defaults to "refresh_token".
  • client_id (Optional[str]): The client id to authenticate. If not specified, defaults to credentials.client_id in the config object.
  • client_secret (Optional[str]): The client secret to authenticate. If not specified, defaults to credentials.client_secret in the config object.
  • access_token_config_path (Sequence[str]): Dpath to the access_token field in the connector configuration. Defaults to ("credentials", "access_token").
  • refresh_token_config_path (Sequence[str]): Dpath to the refresh_token field in the connector configuration. Defaults to ("credentials", "refresh_token").
  • token_expiry_date_config_path (Sequence[str]): Dpath to the token_expiry_date field in the connector configuration. Defaults to ("credentials", "token_expiry_date").
  • token_expiry_date_format (Optional[str]): Date format of the token expiry date field (set by expires_in_name). If not specified, the token expiry date is interpreted as the number of seconds until expiration.
  • token_expiry_is_time_of_expiration (bool): Set to True if expires_in is returned as the time of expiration instead of the number of seconds until expiration.
  • message_repository (MessageRepository): The message repository used to emit logs for HTTP requests and control messages on config updates.
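
A minimal usage sketch, assuming a connector config that follows the default "credentials" layout; the endpoint URL and credential values are placeholders, not real API details:

    from airbyte_cdk import SingleUseRefreshTokenOauth2Authenticator

    connector_config = {
        "credentials": {
            "client_id": "<client-id>",
            "client_secret": "<client-secret>",
            "refresh_token": "<initial-refresh-token>",
            "access_token": "",
            "token_expiry_date": "",  # empty string is treated as already expired
        }
    }

    authenticator = SingleUseRefreshTokenOauth2Authenticator(
        connector_config=connector_config,
        token_refresh_endpoint="https://api.example.com/oauth/token",  # placeholder endpoint
    )

    # On the first call the stored token is considered expired, so the authenticator
    # refreshes it, persists the rotated refresh token back into the config, and
    # emits an Airbyte control message reflecting the updated configuration.
    access_token = authenticator.get_access_token()
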
access_token: str
240    @property
241    def access_token(self) -> str:
242        """
243        Retrieve the access token from the configuration.
244
245        Returns:
246            str: The access token.
247        """
248        return self._get_config_value_by_path(self._access_token_config_path)  # type: ignore[return-value]

Retrieve the access token from the configuration.

Returns:

str: The access token.

def get_refresh_token(self) -> str:
260    def get_refresh_token(self) -> str:
261        """
262        Retrieve the refresh token from the configuration.
263
264        This method fetches the refresh token using the configuration path specified
265        by `_refresh_token_config_path`.
266
267        Returns:
268            str: The refresh token as a string.
269        """
270        return self._get_config_value_by_path(self._refresh_token_config_path)  # type: ignore[return-value]

Retrieve the refresh token from the configuration.

This method fetches the refresh token using the configuration path specified by _refresh_token_config_path.

Returns:

str: The refresh token as a string.

def set_refresh_token(self, new_refresh_token: str) -> None:
272    def set_refresh_token(self, new_refresh_token: str) -> None:
273        """
274        Updates the refresh token in the configuration.
275
276        Args:
277            new_refresh_token (str): The new refresh token to be set.
278        """
279        self._set_config_value_by_path(self._refresh_token_config_path, new_refresh_token)

Updates the refresh token in the configuration.

Arguments:
  • new_refresh_token (str): The new refresh token to be set.
def get_token_expiry_date(self) -> airbyte_cdk.utils.datetime_helpers.AirbyteDateTime:
281    def get_token_expiry_date(self) -> AirbyteDateTime:
282        """
283        Retrieves the token expiry date from the configuration.
284
285        This method fetches the token expiry date from the configuration using the specified path.
286        If the expiry date is an empty string, it returns the current date and time minus one day.
287        Otherwise, it parses the expiry date string into an AirbyteDateTime object.
288
289        Returns:
290            AirbyteDateTime: The parsed or calculated token expiry date.
291
292        Raises:
293            TypeError: If the result is not an instance of AirbyteDateTime.
294        """
295        expiry_date = self._get_config_value_by_path(self._token_expiry_date_config_path)
296        result = (
297            ab_datetime_now() - timedelta(days=1)
298            if expiry_date == ""
299            else ab_datetime_parse(str(expiry_date))
300        )
301        if isinstance(result, AirbyteDateTime):
302            return result
303        raise TypeError("Invalid datetime conversion")

Retrieves the token expiry date from the configuration.

This method fetches the token expiry date from the configuration using the specified path. If the expiry date is an empty string, it returns the current date and time minus one day. Otherwise, it parses the expiry date string into an AirbyteDateTime object.

Returns:

AirbyteDateTime: The parsed or calculated token expiry date.

Raises:
  • TypeError: If the result is not an instance of AirbyteDateTime.
def set_token_expiry_date( self, new_token_expiry_date: airbyte_cdk.utils.datetime_helpers.AirbyteDateTime) -> None:
305    def set_token_expiry_date(self, new_token_expiry_date: AirbyteDateTime) -> None:  # type: ignore[override]
306        """
307        Sets the token expiry date in the configuration.
308
309        Args:
310            new_token_expiry_date (AirbyteDateTime): The new expiry date for the token.
311        """
312        self._set_config_value_by_path(
313            self._token_expiry_date_config_path, str(new_token_expiry_date)
314        )

Sets the token expiry date in the configuration.

Arguments:
  • new_token_expiry_date (AirbyteDateTime): The new expiry date for the token.
def token_has_expired(self) -> bool:
316    def token_has_expired(self) -> bool:
317        """Returns True if the token is expired"""
318        return ab_datetime_now() > self.get_token_expiry_date()

Returns True if the token is expired

@staticmethod
def get_new_token_expiry_date( access_token_expires_in: str, token_expiry_date_format: str | None = None) -> airbyte_cdk.utils.datetime_helpers.AirbyteDateTime:
320    @staticmethod
321    def get_new_token_expiry_date(
322        access_token_expires_in: str,
323        token_expiry_date_format: str | None = None,
324    ) -> AirbyteDateTime:
325        """
326        Calculate the new token expiry date based on the provided expiration duration or format.
327
328        Args:
329            access_token_expires_in (str): The duration (in seconds) until the access token expires, or the expiry date in a specific format.
330            token_expiry_date_format (str | None, optional): The format of the expiry date if provided. Defaults to None.
331
332        Returns:
333            AirbyteDateTime: The calculated expiry date of the access token.
334        """
335        if token_expiry_date_format:
336            return ab_datetime_parse(access_token_expires_in)
337        else:
338            return ab_datetime_now() + timedelta(seconds=int(access_token_expires_in))

Calculate the new token expiry date based on the provided expiration duration or format.

Arguments:
  • access_token_expires_in (str): The duration (in seconds) until the access token expires, or the expiry date in a specific format.
  • token_expiry_date_format (str | None, optional): The format of the expiry date if provided. Defaults to None.
Returns:

AirbyteDateTime: The calculated expiry date of the access token.

def get_access_token(self) -> str:
340    def get_access_token(self) -> str:
341        """Retrieve new access and refresh token if the access token has expired.
342        The new refresh token is persisted with the set_refresh_token function
343        Returns:
344            str: The current access_token, updated if it was previously expired.
345        """
346        if self.token_has_expired():
347            new_access_token, access_token_expires_in, new_refresh_token = (
348                self.refresh_access_token()
349            )
350            new_token_expiry_date: AirbyteDateTime = self.get_new_token_expiry_date(
351                access_token_expires_in, self._token_expiry_date_format
352            )
353            self.access_token = new_access_token
354            self.set_refresh_token(new_refresh_token)
355            self.set_token_expiry_date(new_token_expiry_date)
356            self._emit_control_message()
357        return self.access_token

Retrieves new access and refresh tokens if the access token has expired. The new refresh token is persisted with the set_refresh_token function.

Returns:

str: The current access_token, updated if it was previously expired.

def refresh_access_token(self) -> Tuple[str, str, str]:
359    def refresh_access_token(self) -> Tuple[str, str, str]:  # type: ignore[override]
360        """
361        Refreshes the access token by making a handled request and extracting the necessary token information.
362
363        Returns:
364            Tuple[str, str, str]: A tuple containing the new access token, token expiry date, and refresh token.
365        """
366        response_json = self._make_handled_request()
367        return (
368            self._extract_access_token(response_json),
369            self._extract_token_expiry_date(response_json),
370            self._extract_refresh_token(response_json),
371        )

Refreshes the access token by making a handled request and extracting the necessary token information.

Returns:

Tuple[str, str, str]: A tuple containing the new access token, token expiry date, and refresh token.

class TokenAuthenticator(airbyte_cdk.AbstractHeaderAuthenticator):
39class TokenAuthenticator(AbstractHeaderAuthenticator):
40    """
41    Builds auth header, based on the token provided.
42    The token is attached to each request via the `auth_header` header.
43    """
44
45    @property
46    def auth_header(self) -> str:
47        return self._auth_header
48
49    @property
50    def token(self) -> str:
51        return f"{self._auth_method} {self._token}"
52
53    def __init__(self, token: str, auth_method: str = "Bearer", auth_header: str = "Authorization"):
54        self._auth_header = auth_header
55        self._auth_method = auth_method
56        self._token = token

Builds the auth header based on the token provided. The token is attached to each request via the auth_header header.

TokenAuthenticator( token: str, auth_method: str = 'Bearer', auth_header: str = 'Authorization')
53    def __init__(self, token: str, auth_method: str = "Bearer", auth_header: str = "Authorization"):
54        self._auth_header = auth_header
55        self._auth_method = auth_method
56        self._token = token
auth_header: str
45    @property
46    def auth_header(self) -> str:
47        return self._auth_header

HTTP header to set on the requests

token: str
49    @property
50    def token(self) -> str:
51        return f"{self._auth_method} {self._token}"

The header value to set on outgoing HTTP requests
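
For example, given a static API token, the authenticator exposes the header name and value shown below (a sketch with a placeholder token):

    from airbyte_cdk import TokenAuthenticator

    auth = TokenAuthenticator(token="my-secret-token")  # placeholder token

    assert auth.auth_header == "Authorization"
    assert auth.token == "Bearer my-secret-token"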

class UserDefinedBackoffException(airbyte_cdk.BaseBackoffException):
36class UserDefinedBackoffException(BaseBackoffException):
37    """
38    An exception that exposes how long it attempted to backoff
39    """
40
41    def __init__(
42        self,
43        backoff: Union[int, float],
44        request: requests.PreparedRequest,
45        response: Optional[Union[requests.Response, Exception]],
46        error_message: str = "",
47    ):
48        """
49        :param backoff: how long to backoff in seconds
50        :param request: the request that triggered this backoff exception
51        :param response: the response that triggered the backoff exception
52        """
53        self.backoff = backoff
54        super().__init__(request=request, response=response, error_message=error_message)

An exception that exposes how long it attempted to back off

UserDefinedBackoffException( backoff: Union[int, float], request: requests.models.PreparedRequest, response: Union[requests.models.Response, Exception, NoneType], error_message: str = '')
41    def __init__(
42        self,
43        backoff: Union[int, float],
44        request: requests.PreparedRequest,
45        response: Optional[Union[requests.Response, Exception]],
46        error_message: str = "",
47    ):
48        """
49        :param backoff: how long to backoff in seconds
50        :param request: the request that triggered this backoff exception
51        :param response: the response that triggered the backoff exception
52        """
53        self.backoff = backoff
54        super().__init__(request=request, response=response, error_message=error_message)
Parameters
  • backoff: how long to backoff in seconds
  • request: the request that triggered this backoff exception
  • response: the response that triggered the backoff exception
backoff
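
As an illustration, a custom error handler might raise this exception to propagate a server-provided Retry-After delay. The raise_rate_limit_backoff helper below is hypothetical, not part of the CDK:

    import requests

    from airbyte_cdk import UserDefinedBackoffException

    def raise_rate_limit_backoff(response: requests.Response) -> None:
        # Hypothetical helper: surface the Retry-After header as the backoff duration.
        backoff_seconds = float(response.headers.get("Retry-After", "60"))
        raise UserDefinedBackoffException(
            backoff=backoff_seconds,
            request=response.request,
            response=response,
            error_message=f"Rate limited; backing off for {backoff_seconds} seconds",
        )
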
class AirbyteLogFormatter(logging.Formatter):
60class AirbyteLogFormatter(logging.Formatter):
61    """Output log records using AirbyteMessage"""
62
63    # Transforming Python log levels to Airbyte protocol log levels
64    level_mapping = {
65        logging.FATAL: Level.FATAL,
66        logging.ERROR: Level.ERROR,
67        logging.WARNING: Level.WARN,
68        logging.INFO: Level.INFO,
69        logging.DEBUG: Level.DEBUG,
70    }
71
72    def format(self, record: logging.LogRecord) -> str:
73        """Return a JSON representation of the log message"""
74        airbyte_level = self.level_mapping.get(record.levelno, "INFO")
75        if airbyte_level == Level.DEBUG:
76            extras = self.extract_extra_args_from_record(record)
77            debug_dict = {"type": "DEBUG", "message": record.getMessage(), "data": extras}
78            return filter_secrets(json.dumps(debug_dict))
79        else:
80            message = super().format(record)
81            message = filter_secrets(message)
82            log_message = AirbyteMessage(
83                type=Type.LOG, log=AirbyteLogMessage(level=airbyte_level, message=message)
84            )
85            return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode()
86
87    @staticmethod
88    def extract_extra_args_from_record(record: logging.LogRecord) -> Mapping[str, Any]:
89        """
90        The python logger conflates default args with extra args. We use an empty log record and set operations
91        to isolate fields passed to the log record via extra by the developer.
92        """
93        default_attrs = logging.LogRecord("", 0, "", 0, None, None, None).__dict__.keys()
94        extra_keys = set(record.__dict__.keys()) - default_attrs
95        return {k: str(getattr(record, k)) for k in extra_keys if hasattr(record, k)}

Output log records using AirbyteMessage

level_mapping = {50: <Level.FATAL: 'FATAL'>, 40: <Level.ERROR: 'ERROR'>, 30: <Level.WARN: 'WARN'>, 20: <Level.INFO: 'INFO'>, 10: <Level.DEBUG: 'DEBUG'>}
def format(self, record: logging.LogRecord) -> str:
72    def format(self, record: logging.LogRecord) -> str:
73        """Return a JSON representation of the log message"""
74        airbyte_level = self.level_mapping.get(record.levelno, "INFO")
75        if airbyte_level == Level.DEBUG:
76            extras = self.extract_extra_args_from_record(record)
77            debug_dict = {"type": "DEBUG", "message": record.getMessage(), "data": extras}
78            return filter_secrets(json.dumps(debug_dict))
79        else:
80            message = super().format(record)
81            message = filter_secrets(message)
82            log_message = AirbyteMessage(
83                type=Type.LOG, log=AirbyteLogMessage(level=airbyte_level, message=message)
84            )
85            return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode()

Return a JSON representation of the log message

@staticmethod
def extract_extra_args_from_record(record: logging.LogRecord) -> Mapping[str, Any]:
87    @staticmethod
88    def extract_extra_args_from_record(record: logging.LogRecord) -> Mapping[str, Any]:
89        """
90        The python logger conflates default args with extra args. We use an empty log record and set operations
91        to isolate fields passed to the log record via extra by the developer.
92        """
93        default_attrs = logging.LogRecord("", 0, "", 0, None, None, None).__dict__.keys()
94        extra_keys = set(record.__dict__.keys()) - default_attrs
95        return {k: str(getattr(record, k)) for k in extra_keys if hasattr(record, k)}

The python logger conflates default args with extra args. We use an empty log record and set operations to isolate fields passed to the log record via extra by the developer.
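
A short sketch showing how the formatter can be attached to a standard library handler so that log records are emitted as Airbyte protocol messages; the logger name and message are placeholders:

    import logging

    from airbyte_cdk import AirbyteLogFormatter

    handler = logging.StreamHandler()
    handler.setFormatter(AirbyteLogFormatter())

    logger = logging.getLogger("airbyte")
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    # Rendered as a JSON AirbyteMessage of type LOG with level INFO.
    logger.info("Syncing stream 'users'")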

def init_logger(name: Optional[str] = None) -> logging.Logger:
44def init_logger(name: Optional[str] = None) -> logging.Logger:
45    """Initial set up of logger"""
46    logger = logging.getLogger(name)
47    logger.setLevel(logging.INFO)
48    logging.config.dictConfig(LOGGING_CONFIG)
49    return logger

Initial set up of logger

@dataclass
class AirbyteStream:
264@dataclass
265class AirbyteStream:
266    name: str
267    json_schema: Dict[str, Any]
268    supported_sync_modes: List[SyncMode]
269    source_defined_cursor: Optional[bool] = None
270    default_cursor_field: Optional[List[str]] = None
271    source_defined_primary_key: Optional[List[List[str]]] = None
272    namespace: Optional[str] = None
273    is_resumable: Optional[bool] = None
AirbyteStream( name: str, json_schema: Dict[str, Any], supported_sync_modes: List[airbyte_protocol_dataclasses.models.airbyte_protocol.SyncMode], source_defined_cursor: Optional[bool] = None, default_cursor_field: Optional[List[str]] = None, source_defined_primary_key: Optional[List[List[str]]] = None, namespace: Optional[str] = None, is_resumable: Optional[bool] = None)
name: str
json_schema: Dict[str, Any]
supported_sync_modes: List[airbyte_protocol_dataclasses.models.airbyte_protocol.SyncMode]
source_defined_cursor: Optional[bool] = None
default_cursor_field: Optional[List[str]] = None
source_defined_primary_key: Optional[List[List[str]]] = None
namespace: Optional[str] = None
is_resumable: Optional[bool] = None
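
For example, a source might describe a hypothetical users stream like this (field values are illustrative):

    from airbyte_cdk import AirbyteStream, SyncMode

    users_stream = AirbyteStream(
        name="users",
        json_schema={
            "type": "object",
            "properties": {"id": {"type": "integer"}, "email": {"type": "string"}},
        },
        supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
        source_defined_primary_key=[["id"]],
    )
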
@dataclass
class AirbyteConnectionStatus:
172@dataclass
173class AirbyteConnectionStatus:
174    status: Status
175    message: Optional[str] = None
AirbyteConnectionStatus( status: airbyte_protocol_dataclasses.models.airbyte_protocol.Status, message: Optional[str] = None)
status: airbyte_protocol_dataclasses.models.airbyte_protocol.Status
message: Optional[str] = None
@dataclass
class AirbyteMessage:
81@dataclass
82class AirbyteMessage:
83    type: Type  # type: ignore [name-defined]
84    log: Optional[AirbyteLogMessage] = None  # type: ignore [name-defined]
85    spec: Optional[ConnectorSpecification] = None  # type: ignore [name-defined]
86    connectionStatus: Optional[AirbyteConnectionStatus] = None  # type: ignore [name-defined]
87    catalog: Optional[AirbyteCatalog] = None  # type: ignore [name-defined]
88    record: Optional[Union[AirbyteFileTransferRecordMessage, AirbyteRecordMessage]] = None  # type: ignore [name-defined]
89    state: Optional[AirbyteStateMessage] = None
90    trace: Optional[AirbyteTraceMessage] = None  # type: ignore [name-defined]
91    control: Optional[AirbyteControlMessage] = None  # type: ignore [name-defined]
AirbyteMessage( type: airbyte_protocol_dataclasses.models.airbyte_protocol.Type, log: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteLogMessage] = None, spec: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.ConnectorSpecification] = None, connectionStatus: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteConnectionStatus] = None, catalog: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteCatalog] = None, record: Union[airbyte_cdk.models.file_transfer_record_message.AirbyteFileTransferRecordMessage, airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteRecordMessage, NoneType] = None, state: Optional[airbyte_cdk.models.airbyte_protocol.AirbyteStateMessage] = None, trace: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteTraceMessage] = None, control: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteControlMessage] = None)
type: airbyte_protocol_dataclasses.models.airbyte_protocol.Type
log: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteLogMessage] = None
spec: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.ConnectorSpecification] = None
connectionStatus: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteConnectionStatus] = None
catalog: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteCatalog] = None
record: Union[airbyte_cdk.models.file_transfer_record_message.AirbyteFileTransferRecordMessage, airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteRecordMessage, NoneType] = None
trace: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteTraceMessage] = None
control: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteControlMessage] = None
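
A minimal sketch of wrapping a log entry in the AirbyteMessage envelope (the message text is a placeholder):

    from airbyte_cdk import AirbyteLogMessage, AirbyteMessage, Level, Type

    message = AirbyteMessage(
        type=Type.LOG,
        log=AirbyteLogMessage(level=Level.INFO, message="Starting sync"),
    )
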
@dataclass
class ConfiguredAirbyteCatalog:
333@dataclass
334class ConfiguredAirbyteCatalog:
335    streams: List[ConfiguredAirbyteStream]
ConfiguredAirbyteCatalog( streams: List[airbyte_protocol_dataclasses.models.airbyte_protocol.ConfiguredAirbyteStream])
streams: List[airbyte_protocol_dataclasses.models.airbyte_protocol.ConfiguredAirbyteStream]
class Status(enum.Enum):
167class Status(Enum):
168    SUCCEEDED = 'SUCCEEDED'
169    FAILED = 'FAILED'

An enumeration.

SUCCEEDED = <Status.SUCCEEDED: 'SUCCEEDED'>
FAILED = <Status.FAILED: 'FAILED'>
class Type(enum.Enum):
12class Type(Enum):
13    RECORD = 'RECORD'
14    STATE = 'STATE'
15    LOG = 'LOG'
16    SPEC = 'SPEC'
17    CONNECTION_STATUS = 'CONNECTION_STATUS'
18    CATALOG = 'CATALOG'
19    TRACE = 'TRACE'
20    CONTROL = 'CONTROL'

An enumeration.

RECORD = <Type.RECORD: 'RECORD'>
STATE = <Type.STATE: 'STATE'>
LOG = <Type.LOG: 'LOG'>
SPEC = <Type.SPEC: 'SPEC'>
CONNECTION_STATUS = <Type.CONNECTION_STATUS: 'CONNECTION_STATUS'>
CATALOG = <Type.CATALOG: 'CATALOG'>
TRACE = <Type.TRACE: 'TRACE'>
CONTROL = <Type.CONTROL: 'CONTROL'>
class OrchestratorType(enum.Enum):
158class OrchestratorType(Enum):
159    CONNECTOR_CONFIG = 'CONNECTOR_CONFIG'

An enumeration.

CONNECTOR_CONFIG = <OrchestratorType.CONNECTOR_CONFIG: 'CONNECTOR_CONFIG'>
@dataclass
class ConfiguredAirbyteStream:
276@dataclass
277class ConfiguredAirbyteStream:
278    stream: AirbyteStream
279    sync_mode: SyncMode
280    destination_sync_mode: DestinationSyncMode
281    cursor_field: Optional[List[str]] = None
282    primary_key: Optional[List[List[str]]] = None
283    generation_id: Optional[int] = None
284    minimum_generation_id: Optional[int] = None
285    sync_id: Optional[int] = None
ConfiguredAirbyteStream( stream: airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteStream, sync_mode: airbyte_protocol_dataclasses.models.airbyte_protocol.SyncMode, destination_sync_mode: airbyte_protocol_dataclasses.models.airbyte_protocol.DestinationSyncMode, cursor_field: Optional[List[str]] = None, primary_key: Optional[List[List[str]]] = None, generation_id: Optional[int] = None, minimum_generation_id: Optional[int] = None, sync_id: Optional[int] = None)
stream: airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteStream
sync_mode: airbyte_protocol_dataclasses.models.airbyte_protocol.SyncMode
destination_sync_mode: airbyte_protocol_dataclasses.models.airbyte_protocol.DestinationSyncMode
cursor_field: Optional[List[str]] = None
primary_key: Optional[List[List[str]]] = None
generation_id: Optional[int] = None
minimum_generation_id: Optional[int] = None
sync_id: Optional[int] = None
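
Putting the pieces together, a catalog configuring a hypothetical users stream for incremental sync might look like this sketch (stream name, cursor field, and primary key are assumptions for illustration):

    from airbyte_cdk import (
        AirbyteStream,
        ConfiguredAirbyteCatalog,
        ConfiguredAirbyteStream,
        DestinationSyncMode,
        SyncMode,
    )

    stream = AirbyteStream(
        name="users",
        json_schema={"type": "object"},
        supported_sync_modes=[SyncMode.incremental],
    )

    configured_catalog = ConfiguredAirbyteCatalog(
        streams=[
            ConfiguredAirbyteStream(
                stream=stream,
                sync_mode=SyncMode.incremental,
                destination_sync_mode=DestinationSyncMode.append_dedup,
                cursor_field=["updated_at"],
                primary_key=[["id"]],
            )
        ]
    )
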
class DestinationSyncMode(enum.Enum):
183class DestinationSyncMode(Enum):
184    append = 'append'
185    overwrite = 'overwrite'
186    append_dedup = 'append_dedup'

An enumeration.

append = <DestinationSyncMode.append: 'append'>
overwrite = <DestinationSyncMode.overwrite: 'overwrite'>
append_dedup = <DestinationSyncMode.append_dedup: 'append_dedup'>
class SyncMode(enum.Enum):
178class SyncMode(Enum):
179    full_refresh = 'full_refresh'
180    incremental = 'incremental'

An enumeration.

full_refresh = <SyncMode.full_refresh: 'full_refresh'>
incremental = <SyncMode.incremental: 'incremental'>
class FailureType(enum.Enum):
94class FailureType(Enum):
95    system_error = 'system_error'
96    config_error = 'config_error'
97    transient_error = 'transient_error'

An enumeration.

system_error = <FailureType.system_error: 'system_error'>
config_error = <FailureType.config_error: 'config_error'>
transient_error = <FailureType.transient_error: 'transient_error'>
@dataclass
class AdvancedAuth:
288@dataclass
289class AdvancedAuth:
290    auth_flow_type: Optional[AuthFlowType] = None
291    predicate_key: Optional[List[str]] = None
292    predicate_value: Optional[str] = None
293    oauth_config_specification: Optional[OAuthConfigSpecification] = None
AdvancedAuth( auth_flow_type: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AuthFlowType] = None, predicate_key: Optional[List[str]] = None, predicate_value: Optional[str] = None, oauth_config_specification: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.OAuthConfigSpecification] = None)
auth_flow_type: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AuthFlowType] = None
predicate_key: Optional[List[str]] = None
predicate_value: Optional[str] = None
oauth_config_specification: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.OAuthConfigSpecification] = None
@dataclass
class AirbyteLogMessage:
80@dataclass
81class AirbyteLogMessage:
82    level: Level
83    message: str
84    stack_trace: Optional[str] = None
AirbyteLogMessage( level: airbyte_protocol_dataclasses.models.airbyte_protocol.Level, message: str, stack_trace: Optional[str] = None)
level: airbyte_protocol_dataclasses.models.airbyte_protocol.Level
message: str
stack_trace: Optional[str] = None
@dataclass
class OAuthConfigSpecification:
217@dataclass
218class OAuthConfigSpecification:
219    oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = (
220        None
221    )
222    oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = (
223        None
224    )
225    complete_oauth_output_specification: Optional[Dict[str, Any]] = None
226    complete_oauth_server_input_specification: Optional[Dict[str, Any]] = None
227    complete_oauth_server_output_specification: Optional[Dict[str, Any]] = None
OAuthConfigSpecification( oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = None, oauth_connector_input_specification: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.OauthConnectorInputSpecification] = None, complete_oauth_output_specification: Optional[Dict[str, Any]] = None, complete_oauth_server_input_specification: Optional[Dict[str, Any]] = None, complete_oauth_server_output_specification: Optional[Dict[str, Any]] = None)
oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = None
oauth_connector_input_specification: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.OauthConnectorInputSpecification] = None
complete_oauth_output_specification: Optional[Dict[str, Any]] = None
complete_oauth_server_input_specification: Optional[Dict[str, Any]] = None
complete_oauth_server_output_specification: Optional[Dict[str, Any]] = None
@dataclass
class ConnectorSpecification:
296@dataclass
297class ConnectorSpecification:
298    connectionSpecification: Dict[str, Any]
299    documentationUrl: Optional[str] = None
300    changelogUrl: Optional[str] = None
301    supportsIncremental: Optional[bool] = None
302    supportsNormalization: Optional[bool] = False
303    supportsDBT: Optional[bool] = False
304    supported_destination_sync_modes: Optional[List[DestinationSyncMode]] = None
305    advanced_auth: Optional[AdvancedAuth] = None
306    protocol_version: Optional[str] = None
ConnectorSpecification( connectionSpecification: Dict[str, Any], documentationUrl: Optional[str] = None, changelogUrl: Optional[str] = None, supportsIncremental: Optional[bool] = None, supportsNormalization: Optional[bool] = False, supportsDBT: Optional[bool] = False, supported_destination_sync_modes: Optional[List[airbyte_protocol_dataclasses.models.airbyte_protocol.DestinationSyncMode]] = None, advanced_auth: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AdvancedAuth] = None, protocol_version: Optional[str] = None)
connectionSpecification: Dict[str, Any]
documentationUrl: Optional[str] = None
changelogUrl: Optional[str] = None
supportsIncremental: Optional[bool] = None
supportsNormalization: Optional[bool] = False
supportsDBT: Optional[bool] = False
supported_destination_sync_modes: Optional[List[airbyte_protocol_dataclasses.models.airbyte_protocol.DestinationSyncMode]] = None
advanced_auth: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AdvancedAuth] = None
protocol_version: Optional[str] = None
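
A sketch of a minimal specification for a hypothetical API-key connector (the schema contents and documentation URL are placeholders):

    from airbyte_cdk import ConnectorSpecification

    spec = ConnectorSpecification(
        connectionSpecification={
            "$schema": "http://json-schema.org/draft-07/schema#",
            "title": "Example Source Spec",
            "type": "object",
            "required": ["api_key"],
            "properties": {
                "api_key": {"type": "string", "airbyte_secret": True},
            },
        },
        documentationUrl="https://docs.airbyte.com",
    )
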
class Level(enum.Enum):
71class Level(Enum):
72    FATAL = 'FATAL'
73    ERROR = 'ERROR'
74    WARN = 'WARN'
75    INFO = 'INFO'
76    DEBUG = 'DEBUG'
77    TRACE = 'TRACE'

An enumeration.

FATAL = <Level.FATAL: 'FATAL'>
ERROR = <Level.ERROR: 'ERROR'>
WARN = <Level.WARN: 'WARN'>
INFO = <Level.INFO: 'INFO'>
DEBUG = <Level.DEBUG: 'DEBUG'>
TRACE = <Level.TRACE: 'TRACE'>
@dataclass
class AirbyteRecordMessage:
309@dataclass
310class AirbyteRecordMessage:
311    stream: str
312    data: Dict[str, Any]
313    emitted_at: int
314    namespace: Optional[str] = None
315    meta: Optional[AirbyteRecordMessageMeta] = None
AirbyteRecordMessage( stream: str, data: Dict[str, Any], emitted_at: int, namespace: Optional[str] = None, meta: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteRecordMessageMeta] = None)
stream: str
data: Dict[str, Any]
emitted_at: int
namespace: Optional[str] = None
meta: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteRecordMessageMeta] = None
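
For example, a record read from a hypothetical users stream would typically be wrapped as follows, with emitted_at expressed in milliseconds since the epoch:

    import time

    from airbyte_cdk import AirbyteMessage, AirbyteRecordMessage, Type

    record_message = AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream="users",
            data={"id": 1, "email": "user@example.com"},
            emitted_at=int(time.time() * 1000),
        ),
    )
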
class InMemoryMessageRepository(airbyte_cdk.MessageRepository):
75class InMemoryMessageRepository(MessageRepository):
76    def __init__(self, log_level: Level = Level.INFO) -> None:
77        self._message_queue: Deque[AirbyteMessage] = deque()
78        self._log_level = log_level
79
80    def emit_message(self, message: AirbyteMessage) -> None:
81        self._message_queue.append(message)
82
83    def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None:
84        if _is_severe_enough(self._log_level, level):
85            self.emit_message(
86                AirbyteMessage(
87                    type=Type.LOG,
88                    log=AirbyteLogMessage(
89                        level=level, message=filter_secrets(json.dumps(message_provider()))
90                    ),
91                )
92            )
93
94    def consume_queue(self) -> Iterable[AirbyteMessage]:
95        while self._message_queue:
96            yield self._message_queue.popleft()

A MessageRepository implementation that buffers emitted messages in an in-memory queue and yields them through consume_queue.

InMemoryMessageRepository( log_level: airbyte_protocol_dataclasses.models.airbyte_protocol.Level = <Level.INFO: 'INFO'>)
76    def __init__(self, log_level: Level = Level.INFO) -> None:
77        self._message_queue: Deque[AirbyteMessage] = deque()
78        self._log_level = log_level
def emit_message( self, message: AirbyteMessage) -> None:
80    def emit_message(self, message: AirbyteMessage) -> None:
81        self._message_queue.append(message)
def log_message( self, level: airbyte_protocol_dataclasses.models.airbyte_protocol.Level, message_provider: Callable[[], dict[str, Union[dict[str, Union[dict[str, ForwardRef('JsonType')], list[ForwardRef('JsonType')], str, int, float, bool, NoneType]], list[Union[dict[str, ForwardRef('JsonType')], list[ForwardRef('JsonType')], str, int, float, bool, NoneType]], str, int, float, bool, NoneType]]]) -> None:
83    def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None:
84        if _is_severe_enough(self._log_level, level):
85            self.emit_message(
86                AirbyteMessage(
87                    type=Type.LOG,
88                    log=AirbyteLogMessage(
89                        level=level, message=filter_secrets(json.dumps(message_provider()))
90                    ),
91                )
92            )

Computing messages can be resource consuming. This method is specialized for logging because we want to allow for lazy evaluation if the log level is less severe than what is configured

def consume_queue(self) -> Iterable[AirbyteMessage]:
94    def consume_queue(self) -> Iterable[AirbyteMessage]:
95        while self._message_queue:
96            yield self._message_queue.popleft()
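
A short usage sketch: messages below the configured log level are dropped, everything else is queued until consume_queue() drains it (the message contents are placeholders):

    from airbyte_cdk import (
        AirbyteLogMessage,
        AirbyteMessage,
        InMemoryMessageRepository,
        Level,
        Type,
    )

    repository = InMemoryMessageRepository(log_level=Level.INFO)

    repository.emit_message(
        AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=Level.INFO, message="queued"))
    )
    repository.log_message(Level.DEBUG, lambda: {"detail": "dropped, below INFO"})

    for message in repository.consume_queue():
        print(message.type)  # Type.LOG
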
class MessageRepository(abc.ABC):
46class MessageRepository(ABC):
47    @abstractmethod
48    def emit_message(self, message: AirbyteMessage) -> None:
49        raise NotImplementedError()
50
51    @abstractmethod
52    def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None:
53        """
54        Computing messages can be resource consuming. This method is specialized for logging because we want to allow for lazy evaluation if
55        the log level is less severe than what is configured
56        """
57        raise NotImplementedError()
58
59    @abstractmethod
60    def consume_queue(self) -> Iterable[AirbyteMessage]:
61        raise NotImplementedError()

Abstract base class defining how a connector emits messages, logs lazily-computed messages, and later consumes the queued messages.

@abstractmethod
def emit_message( self, message: AirbyteMessage) -> None:
47    @abstractmethod
48    def emit_message(self, message: AirbyteMessage) -> None:
49        raise NotImplementedError()
@abstractmethod
def log_message( self, level: airbyte_protocol_dataclasses.models.airbyte_protocol.Level, message_provider: Callable[[], dict[str, Union[dict[str, Union[dict[str, ForwardRef('JsonType')], list[ForwardRef('JsonType')], str, int, float, bool, NoneType]], list[Union[dict[str, ForwardRef('JsonType')], list[ForwardRef('JsonType')], str, int, float, bool, NoneType]], str, int, float, bool, NoneType]]]) -> None:
51    @abstractmethod
52    def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None:
53        """
54        Computing messages can be resource consuming. This method is specialized for logging because we want to allow for lazy evaluation if
55        the log level is less severe than what is configured
56        """
57        raise NotImplementedError()

Computing messages can be resource-intensive. This method is specialized for logging so that message construction can be evaluated lazily and skipped entirely when the log level is less severe than the configured level.

@abstractmethod
def consume_queue(self) -> Iterable[AirbyteMessage]:
59    @abstractmethod
60    def consume_queue(self) -> Iterable[AirbyteMessage]:
61        raise NotImplementedError()
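
The three abstract methods above are all a custom repository needs to implement. The sketch below is hypothetical (the class and logger names are made up) and deliberately simplifies log handling by always forwarding at INFO:

import logging
from collections import deque
from typing import Any, Callable, Deque, Dict, Iterable

from airbyte_cdk import AirbyteMessage, Level, MessageRepository


class LoggingMessageRepository(MessageRepository):
    """Hypothetical repository that queues messages and forwards logs to a standard logger."""

    def __init__(self) -> None:
        self._queue: Deque[AirbyteMessage] = deque()
        self._logger = logging.getLogger("airbyte.custom_repository")

    def emit_message(self, message: AirbyteMessage) -> None:
        self._queue.append(message)

    def log_message(self, level: Level, message_provider: Callable[[], Dict[str, Any]]) -> None:
        # Simplified: always forward. A real implementation would compare `level` against a
        # configured threshold before calling message_provider(), to keep the lazy-evaluation
        # benefit described above.
        self._logger.info("%s: %s", level.value, message_provider())

    def consume_queue(self) -> Iterable[AirbyteMessage]:
        while self._queue:
            yield self._queue.popleft()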
class ConnectorStateManager:
 33class ConnectorStateManager:
 34    """
 35    ConnectorStateManager consolidates the various forms of a stream's incoming state message (STREAM / GLOBAL) under a common
 36    interface. It also provides methods to extract and update state
 37    """
 38
 39    def __init__(self, state: Optional[List[AirbyteStateMessage]] = None):
 40        shared_state, per_stream_states = self._extract_from_state_message(state)
 41
 42        # We explicitly throw an error if we receive a GLOBAL state message that contains a shared_state because API sources are
 43        # designed to checkpoint state independently of one another. API sources should never be emitting a state message where
 44        # shared_state is populated. Rather than define how to handle shared_state without a clear use case, we're opting to throw an
 45        # error instead and if/when we find one, we will then implement processing of the shared_state value.
 46        if shared_state:
 47            raise ValueError(
 48                "Received a GLOBAL AirbyteStateMessage that contains a shared_state. This library only ever generates per-STREAM "
 49                "STATE messages so this was not generated by this connector. This must be an orchestrator or platform error. GLOBAL "
 50                "state messages with shared_state will not be processed correctly. "
 51            )
 52        self.per_stream_states = per_stream_states
 53
 54    def get_stream_state(
 55        self, stream_name: str, namespace: Optional[str]
 56    ) -> MutableMapping[str, Any]:
 57        """
 58        Retrieves the state of a given stream based on its descriptor (name + namespace).
 59        :param stream_name: Name of the stream being fetched
 60        :param namespace: Namespace of the stream being fetched
 61        :return: The per-stream state for a stream
 62        """
 63        stream_state: AirbyteStateBlob | None = self.per_stream_states.get(
 64            HashableStreamDescriptor(name=stream_name, namespace=namespace)
 65        )
 66        if stream_state:
 67            return copy.deepcopy({k: v for k, v in stream_state.__dict__.items()})
 68        return {}
 69
 70    def update_state_for_stream(
 71        self, stream_name: str, namespace: Optional[str], value: Mapping[str, Any]
 72    ) -> None:
 73        """
 74        Overwrites the state blob of a specific stream based on the provided stream name and optional namespace
 75        :param stream_name: The name of the stream whose state is being updated
 76        :param namespace: The namespace of the stream if it exists
 77        :param value: A stream state mapping that is being updated for a stream
 78        """
 79        stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
 80        self.per_stream_states[stream_descriptor] = AirbyteStateBlob(value)
 81
 82    def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage:
 83        """
 84        Generates an AirbyteMessage using the current per-stream state of a specified stream
 85        :param stream_name: The name of the stream for the message that is being created
 86        :param namespace: The namespace of the stream for the message that is being created
 87        :return: The Airbyte state message to be emitted by the connector during a sync
 88        """
 89        hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
 90        stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
 91
 92        return AirbyteMessage(
 93            type=MessageType.STATE,
 94            state=AirbyteStateMessage(
 95                type=AirbyteStateType.STREAM,
 96                stream=AirbyteStreamState(
 97                    stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace),
 98                    stream_state=stream_state,
 99                ),
100            ),
101        )
102
103    @classmethod
104    def _extract_from_state_message(
105        cls,
106        state: Optional[List[AirbyteStateMessage]],
107    ) -> Tuple[
108        Optional[AirbyteStateBlob],
109        MutableMapping[HashableStreamDescriptor, Optional[AirbyteStateBlob]],
110    ]:
111        """
112        Takes an incoming list of state messages or a global state message and extracts state attributes according to
113        type which can then be assigned to the new state manager being instantiated
114        :param state: The incoming state input
115        :return: A tuple of shared state and per stream state assembled from the incoming state list
116        """
117        if state is None:
118            return None, {}
119
120        is_global = cls._is_global_state(state)
121
122        if is_global:
123            # We already validate that this is a global state message, not None:
124            global_state = cast(AirbyteGlobalState, state[0].global_)
125            # global_state has shared_state, also not None:
126            shared_state: AirbyteStateBlob = cast(
127                AirbyteStateBlob, copy.deepcopy(global_state.shared_state, {})
128            )
129            streams = {
130                HashableStreamDescriptor(
131                    name=per_stream_state.stream_descriptor.name,
132                    namespace=per_stream_state.stream_descriptor.namespace,
133                ): per_stream_state.stream_state
134                for per_stream_state in global_state.stream_states  # type: ignore[union-attr] # global_state has shared_state
135            }
136            return shared_state, streams
137        else:
138            streams = {
139                HashableStreamDescriptor(
140                    name=per_stream_state.stream.stream_descriptor.name,  # type: ignore[union-attr] # stream has stream_descriptor
141                    namespace=per_stream_state.stream.stream_descriptor.namespace,  # type: ignore[union-attr] # stream has stream_descriptor
142                ): per_stream_state.stream.stream_state  # type: ignore[union-attr] # stream has stream_state
143                for per_stream_state in state
144                if per_stream_state.type == AirbyteStateType.STREAM
145                and hasattr(per_stream_state, "stream")  # type: ignore # state is always a list of AirbyteStateMessage if is_per_stream is True
146            }
147            return None, streams
148
149    @staticmethod
150    def _is_global_state(state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]]) -> bool:
151        return (
152            isinstance(state, List)
153            and len(state) == 1
154            and isinstance(state[0], AirbyteStateMessage)
155            and state[0].type == AirbyteStateType.GLOBAL
156        )
157
158    @staticmethod
159    def _is_per_stream_state(
160        state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]],
161    ) -> bool:
162        return isinstance(state, List)

ConnectorStateManager consolidates the various forms of a stream's incoming state message (STREAM / GLOBAL) under a common interface. It also provides methods to extract and update state.

ConnectorStateManager( state: Optional[List[airbyte_cdk.models.airbyte_protocol.AirbyteStateMessage]] = None)
39    def __init__(self, state: Optional[List[AirbyteStateMessage]] = None):
40        shared_state, per_stream_states = self._extract_from_state_message(state)
41
42        # We explicitly throw an error if we receive a GLOBAL state message that contains a shared_state because API sources are
43        # designed to checkpoint state independently of one another. API sources should never be emitting a state message where
44        # shared_state is populated. Rather than define how to handle shared_state without a clear use case, we're opting to throw an
45        # error instead and if/when we find one, we will then implement processing of the shared_state value.
46        if shared_state:
47            raise ValueError(
48                "Received a GLOBAL AirbyteStateMessage that contains a shared_state. This library only ever generates per-STREAM "
49                "STATE messages so this was not generated by this connector. This must be an orchestrator or platform error. GLOBAL "
50                "state messages with shared_state will not be processed correctly. "
51            )
52        self.per_stream_states = per_stream_states
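
As a hedged sketch of handing incoming state to the constructor, the snippet below wraps the saved state of a hypothetical "users" stream in an AirbyteStateMessage; it assumes these model classes are importable from airbyte_cdk.models.

from airbyte_cdk import ConnectorStateManager
from airbyte_cdk.models import (
    AirbyteStateBlob,
    AirbyteStateMessage,
    AirbyteStateType,
    AirbyteStreamState,
    StreamDescriptor,
)

incoming_state = [
    AirbyteStateMessage(
        type=AirbyteStateType.STREAM,
        stream=AirbyteStreamState(
            stream_descriptor=StreamDescriptor(name="users"),
            stream_state=AirbyteStateBlob({"updated_at": "2024-01-01"}),
        ),
    )
]

state_manager = ConnectorStateManager(state=incoming_state)
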
per_stream_states
def get_stream_state( self, stream_name: str, namespace: Optional[str]) -> MutableMapping[str, Any]:
54    def get_stream_state(
55        self, stream_name: str, namespace: Optional[str]
56    ) -> MutableMapping[str, Any]:
57        """
58        Retrieves the state of a given stream based on its descriptor (name + namespace).
59        :param stream_name: Name of the stream being fetched
60        :param namespace: Namespace of the stream being fetched
61        :return: The per-stream state for a stream
62        """
63        stream_state: AirbyteStateBlob | None = self.per_stream_states.get(
64            HashableStreamDescriptor(name=stream_name, namespace=namespace)
65        )
66        if stream_state:
67            return copy.deepcopy({k: v for k, v in stream_state.__dict__.items()})
68        return {}

Retrieves the state of a given stream based on its descriptor (name + namespace).

Parameters
  • stream_name: Name of the stream being fetched
  • namespace: Namespace of the stream being fetched
Returns

The per-stream state for a stream

def update_state_for_stream( self, stream_name: str, namespace: Optional[str], value: Mapping[str, Any]) -> None:
70    def update_state_for_stream(
71        self, stream_name: str, namespace: Optional[str], value: Mapping[str, Any]
72    ) -> None:
73        """
74        Overwrites the state blob of a specific stream based on the provided stream name and optional namespace
75        :param stream_name: The name of the stream whose state is being updated
76        :param namespace: The namespace of the stream if it exists
77        :param value: A stream state mapping that is being updated for a stream
78        """
79        stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
80        self.per_stream_states[stream_descriptor] = AirbyteStateBlob(value)

Overwrites the state blob of a specific stream based on the provided stream name and optional namespace

Parameters
  • stream_name: The name of the stream whose state is being updated
  • namespace: The namespace of the stream if it exists
  • value: A stream state mapping that is being updated for a stream
def create_state_message( self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage:
 82    def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage:
 83        """
 84        Generates an AirbyteMessage using the current per-stream state of a specified stream
 85        :param stream_name: The name of the stream for the message that is being created
 86        :param namespace: The namespace of the stream for the message that is being created
 87        :return: The Airbyte state message to be emitted by the connector during a sync
 88        """
 89        hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
 90        stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
 91
 92        return AirbyteMessage(
 93            type=MessageType.STATE,
 94            state=AirbyteStateMessage(
 95                type=AirbyteStateType.STREAM,
 96                stream=AirbyteStreamState(
 97                    stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace),
 98                    stream_state=stream_state,
 99                ),
100            ),
101        )

Generates an AirbyteMessage using the current per-stream state of a specified stream

Parameters
  • stream_name: The name of the stream for the message that is being created
  • namespace: The namespace of the stream for the message that is being created
Returns

The Airbyte state message to be emitted by the connector during a sync
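
Putting these methods together, a minimal (and hypothetical) checkpointing flow for a "users" stream might look like this:

from airbyte_cdk import ConnectorStateManager

# No incoming state, as on a first sync.
state_manager = ConnectorStateManager(state=None)

# Record the newest cursor value observed while reading the hypothetical "users" stream.
state_manager.update_state_for_stream("users", None, {"updated_at": "2024-01-01"})

# Read the state back as a plain mutable mapping: {"updated_at": "2024-01-01"}.
current_state = state_manager.get_stream_state("users", None)

# Build the STATE message the connector would emit after checkpointing.
state_message = state_manager.create_state_message("users", None)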

@deprecated('Deprecated as of CDK version 0.87.0. Deprecated in favor of the `CheckpointMixin` which offers similar functionality.')
class IncrementalMixin(airbyte_cdk.sources.streams.core.CheckpointMixin, abc.ABC):
 95@deprecated(
 96    "Deprecated as of CDK version 0.87.0. "
 97    "Deprecated in favor of the `CheckpointMixin` which offers similar functionality."
 98)
 99class IncrementalMixin(CheckpointMixin, ABC):
100    """Mixin to make stream incremental.
101
102    class IncrementalStream(Stream, IncrementalMixin):
103        @property
104        def state(self):
105            return self._state
106
107        @state.setter
108        def state(self, value):
109            self._state[self.cursor_field] = value[self.cursor_field]
110    """

Mixin to make stream incremental.

class IncrementalStream(Stream, IncrementalMixin):
    @property
    def state(self):
        return self._state

    @state.setter
    def state(self, value):
        self._state[self.cursor_field] = value[self.cursor_field]
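
Because this mixin is deprecated in favor of CheckpointMixin, a rough sketch of the same state getter/setter pattern written against CheckpointMixin directly is shown below; the stream name, fields, and cursor value are hypothetical.

from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union

from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.core import CheckpointMixin


class EventsStream(Stream, CheckpointMixin):
    """Hypothetical incremental stream using the state getter/setter pattern."""

    primary_key = "id"

    def __init__(self) -> None:
        super().__init__()
        self._state: MutableMapping[str, Any] = {}

    @property
    def cursor_field(self) -> Union[str, List[str]]:
        return "created_at"

    @property
    def state(self) -> MutableMapping[str, Any]:
        return self._state

    @state.setter
    def state(self, value: MutableMapping[str, Any]) -> None:
        self._state = value

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: Optional[List[str]] = None,
        stream_slice: Optional[Mapping[str, Any]] = None,
        stream_state: Optional[Mapping[str, Any]] = None,
    ) -> Iterable[Mapping[str, Any]]:
        record = {"id": 1, "created_at": "2024-01-01"}
        self.state = {"created_at": record["created_at"]}
        yield record
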
class Stream(abc.ABC):
119class Stream(ABC):
120    """
121    Base abstract class for an Airbyte Stream. Makes no assumption of the Stream's underlying transport protocol.
122    """
123
124    _configured_json_schema: Optional[Dict[str, Any]] = None
125    _exit_on_rate_limit: bool = False
126
127    # Use self.logger in subclasses to log any messages
128    @property
129    def logger(self) -> logging.Logger:
130        return logging.getLogger(f"airbyte.streams.{self.name}")
131
132    # TypeTransformer object to perform output data transformation
133    transformer: TypeTransformer = TypeTransformer(TransformConfig.NoTransform)
134
135    cursor: Optional[Cursor] = None
136
137    has_multiple_slices = False
138
139    @cached_property
140    def name(self) -> str:
141        """
142        :return: Stream name. By default this is the implementing class name, but it can be overridden as needed.
143        """
144        return casing.camel_to_snake(self.__class__.__name__)
145
146    def get_error_display_message(self, exception: BaseException) -> Optional[str]:
147        """
148        Retrieves the user-friendly display message that corresponds to an exception.
149        This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage.
150
151        The default implementation of this method does not return user-friendly messages for any exception type, but it should be overriden as needed.
152
153        :param exception: The exception that was raised
154        :return: A user-friendly message that indicates the cause of the error
155        """
156        return None
157
158    def read(  # type: ignore  # ignoring typing for ConnectorStateManager because of circular dependencies
159        self,
160        configured_stream: ConfiguredAirbyteStream,
161        logger: logging.Logger,
162        slice_logger: SliceLogger,
163        stream_state: MutableMapping[str, Any],
164        state_manager,
165        internal_config: InternalConfig,
166    ) -> Iterable[StreamData]:
167        sync_mode = configured_stream.sync_mode
168        cursor_field = configured_stream.cursor_field
169        self.configured_json_schema = configured_stream.stream.json_schema
170
171        # WARNING: When performing a read() that uses incoming stream state, we MUST use the self.state that is defined as
172        # opposed to the incoming stream_state value. Because some connectors like ones using the file-based CDK modify
173        # state before setting the value on the Stream attribute, the most up-to-date state is derived from Stream.state
174        # instead of the stream_state parameter. This does not apply to legacy connectors using get_updated_state().
175        try:
176            stream_state = self.state  # type: ignore # we know the field might not exist...
177        except AttributeError:
178            pass
179
180        should_checkpoint = bool(state_manager)
181        checkpoint_reader = self._get_checkpoint_reader(
182            logger=logger, cursor_field=cursor_field, sync_mode=sync_mode, stream_state=stream_state
183        )
184
185        next_slice = checkpoint_reader.next()
186        record_counter = 0
187        stream_state_tracker = copy.deepcopy(stream_state)
188        while next_slice is not None:
189            if slice_logger.should_log_slice_message(logger):
190                yield slice_logger.create_slice_log_message(next_slice)
191            records = self.read_records(
192                sync_mode=sync_mode,  # todo: change this interface to no longer rely on sync_mode for behavior
193                stream_slice=next_slice,
194                stream_state=stream_state,
195                cursor_field=cursor_field or None,
196            )
197            for record_data_or_message in records:
198                yield record_data_or_message
199                if isinstance(record_data_or_message, Mapping) or (
200                    hasattr(record_data_or_message, "type")
201                    and record_data_or_message.type == MessageType.RECORD
202                ):
203                    record_data = (
204                        record_data_or_message
205                        if isinstance(record_data_or_message, Mapping)
206                        else record_data_or_message.record
207                    )
208
209                    # Thanks I hate it. RFR fundamentally doesn't fit with the concept of the legacy Stream.get_updated_state()
210                    # method because RFR streams rely on pagination as a cursor. Stream.get_updated_state() was designed to make
211                    # the CDK manage state using specifically the last seen record. don't @ brian.lai
212                    #
213                    # Also, because the legacy incremental state case decouples observing incoming records from emitting state, it
214                    # requires that we separate CheckpointReader.observe() and CheckpointReader.get_checkpoint() which could
215                    # otherwise be combined.
216                    if self.cursor_field:
217                        # Some connectors have streams that implement get_updated_state(), but do not define a cursor_field. This
218                        # should be fixed on the stream implementation, but we should also protect against this in the CDK as well
219                        stream_state_tracker = self.get_updated_state(
220                            stream_state_tracker,
221                            record_data,  # type: ignore [arg-type]
222                        )
223                        self._observe_state(checkpoint_reader, stream_state_tracker)
224                    record_counter += 1
225
226                    checkpoint_interval = self.state_checkpoint_interval
227                    if (
228                        should_checkpoint
229                        and checkpoint_interval
230                        and record_counter % checkpoint_interval == 0
231                    ):
232                        checkpoint = checkpoint_reader.get_checkpoint()
233                        if checkpoint:
234                            airbyte_state_message = self._checkpoint_state(
235                                checkpoint, state_manager=state_manager
236                            )
237                            yield airbyte_state_message
238
239                    if internal_config.is_limit_reached(record_counter):
240                        break
241            self._observe_state(checkpoint_reader)
242            checkpoint_state = checkpoint_reader.get_checkpoint()
243            if should_checkpoint and checkpoint_state is not None:
244                airbyte_state_message = self._checkpoint_state(
245                    checkpoint_state, state_manager=state_manager
246                )
247                yield airbyte_state_message
248
249            next_slice = checkpoint_reader.next()
250
251        checkpoint = checkpoint_reader.get_checkpoint()
252        if should_checkpoint and checkpoint is not None:
253            airbyte_state_message = self._checkpoint_state(checkpoint, state_manager=state_manager)
254            yield airbyte_state_message
255
256    def read_only_records(self, state: Optional[Mapping[str, Any]] = None) -> Iterable[StreamData]:
257        """
258        Helper method that performs a read on a stream with an optional state and emits records. If the parent stream supports
259        incremental, this operation does not update the stream's internal state (if it uses the modern state setter/getter)
260        or emit state messages.
261        """
262
263        configured_stream = ConfiguredAirbyteStream(
264            stream=AirbyteStream(
265                name=self.name,
266                json_schema={},
267                supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
268            ),
269            sync_mode=SyncMode.incremental if state else SyncMode.full_refresh,
270            destination_sync_mode=DestinationSyncMode.append,
271        )
272
273        yield from self.read(
274            configured_stream=configured_stream,
275            logger=self.logger,
276            slice_logger=DebugSliceLogger(),
277            stream_state=dict(state)
278            if state
279            else {},  # read() expects MutableMapping instead of Mapping which is used more often
280            state_manager=None,
281            internal_config=InternalConfig(),  # type: ignore [call-arg]
282        )
283
284    @abstractmethod
285    def read_records(
286        self,
287        sync_mode: SyncMode,
288        cursor_field: Optional[List[str]] = None,
289        stream_slice: Optional[Mapping[str, Any]] = None,
290        stream_state: Optional[Mapping[str, Any]] = None,
291    ) -> Iterable[StreamData]:
292        """
293        This method should be overridden by subclasses to read records based on the inputs
294        """
295
296    @lru_cache(maxsize=None)
297    def get_json_schema(self) -> Mapping[str, Any]:
298        """
299        :return: A dict of the JSON schema representing this stream.
300
301        The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property.
302        Override as needed.
303        """
304        # TODO show an example of using pydantic to define the JSON schema, or reading an OpenAPI spec
305        return ResourceSchemaLoader(package_name_from_class(self.__class__)).get_schema(self.name)
306
307    def as_airbyte_stream(self) -> AirbyteStream:
308        stream = AirbyteStream(
309            name=self.name,
310            json_schema=dict(self.get_json_schema()),
311            supported_sync_modes=[SyncMode.full_refresh],
312            is_resumable=self.is_resumable,
313        )
314
315        if self.namespace:
316            stream.namespace = self.namespace
317
318        # If we can offer incremental we always should. RFR is always less reliable than incremental which uses a real cursor value
319        if self.supports_incremental:
320            stream.source_defined_cursor = self.source_defined_cursor
321            stream.supported_sync_modes.append(SyncMode.incremental)
322            stream.default_cursor_field = self._wrapped_cursor_field()
323
324        keys = Stream._wrapped_primary_key(self.primary_key)
325        if keys and len(keys) > 0:
326            stream.source_defined_primary_key = keys
327
328        return stream
329
330    @property
331    def supports_incremental(self) -> bool:
332        """
333        :return: True if this stream supports incrementally reading data
334        """
335        return len(self._wrapped_cursor_field()) > 0
336
337    @property
338    def is_resumable(self) -> bool:
339        """
340        :return: True if this stream allows the checkpointing of sync progress and can resume from it on subsequent attempts.
341        This differs from supports_incremental because certain kinds of streams like those supporting resumable full refresh
342        can checkpoint progress in between attempts for improved fault tolerance. However, they will start from the beginning
343        on the next sync job.
344        """
345        if self.supports_incremental:
346            return True
347        if self.has_multiple_slices:
348            # We temporarily gate substream to not support RFR because puts a pretty high burden on connector developers
349            # to structure stream state in a very specific way. We also can't check for issubclass(HttpSubStream) because
350            # not all substreams implement the interface and it would be a circular dependency so we use parent as a surrogate
351            return False
352        elif hasattr(type(self), "state") and getattr(type(self), "state").fset is not None:
353            # Modern case where a stream manages state using getter/setter
354            return True
355        else:
356            # Legacy case where the CDK manages state via the get_updated_state() method. This is determined by checking if
357            # the stream's get_updated_state() differs from the Stream class and therefore has been overridden
358            return type(self).get_updated_state != Stream.get_updated_state
359
360    def _wrapped_cursor_field(self) -> List[str]:
361        return [self.cursor_field] if isinstance(self.cursor_field, str) else self.cursor_field
362
363    @property
364    def cursor_field(self) -> Union[str, List[str]]:
365        """
366        Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field.
367        :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
368        """
369        return []
370
371    @property
372    def namespace(self) -> Optional[str]:
373        """
374        Override to return the namespace of this stream, e.g. the Postgres schema which this stream will emit records for.
375        :return: A string containing the name of the namespace.
376        """
377        return None
378
379    @property
380    def source_defined_cursor(self) -> bool:
381        """
382        Return False if the cursor can be configured by the user.
383        """
384        return True
385
386    @property
387    def exit_on_rate_limit(self) -> bool:
388        """Exit on rate limit getter, should return bool value. False if the stream will retry endlessly when rate limited."""
389        return self._exit_on_rate_limit
390
391    @exit_on_rate_limit.setter
392    def exit_on_rate_limit(self, value: bool) -> None:
393        """Exit on rate limit setter, accept bool value."""
394        self._exit_on_rate_limit = value
395
396    @property
397    @abstractmethod
398    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
399        """
400        :return: string if single primary key, list of strings if composite primary key, list of list of strings if composite primary key consisting of nested fields.
401          If the stream has no primary keys, return None.
402        """
403
404    def stream_slices(
405        self,
406        *,
407        sync_mode: SyncMode,
408        cursor_field: Optional[List[str]] = None,
409        stream_state: Optional[Mapping[str, Any]] = None,
410    ) -> Iterable[Optional[Mapping[str, Any]]]:
411        """
412        Override to define the slices for this stream. See the stream slicing section of the docs for more information.
413
414        :param sync_mode:
415        :param cursor_field:
416        :param stream_state:
417        :return:
418        """
419        yield StreamSlice(partition={}, cursor_slice={})
420
421    @property
422    def state_checkpoint_interval(self) -> Optional[int]:
423        """
424        Decides how often to checkpoint state (i.e: emit a STATE message). E.g: if this returns a value of 100, then state is persisted after reading
425        100 records, then 200, 300, etc.. A good default value is 1000 although your mileage may vary depending on the underlying data source.
426
427        Checkpointing a stream avoids re-reading records in the case a sync is failed or cancelled.
428
429        return None if state should not be checkpointed e.g: because records returned from the underlying data source are not returned in
430        ascending order with respect to the cursor field. This can happen if the source does not support reading records in ascending order of
431        created_at date (or whatever the cursor is). In those cases, state must only be saved once the full stream has been read.
432        """
433        return None
434
435    # Commented-out to avoid any runtime penalty, since this is used in a hot per-record codepath.
436    # To be evaluated for re-introduction here: https://github.com/airbytehq/airbyte-python-cdk/issues/116
437    # @deprecated(
438    #     "Deprecated method `get_updated_state` as of CDK version 0.1.49. "
439    #     "Please use explicit state property instead, see `IncrementalMixin` docs."
440    # )
441    def get_updated_state(
442        self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
443    ) -> MutableMapping[str, Any]:
444        """DEPRECATED. Please use explicit state property instead, see `IncrementalMixin` docs.
445
446        Override to extract state from the latest record. Needed to implement incremental sync.
447
448        Inspects the latest record extracted from the data source and the current state object and return an updated state object.
449
450        For example: if the state object is based on created_at timestamp, and the current state is {'created_at': 10}, and the latest_record is
451        {'name': 'octavia', 'created_at': 20 } then this method would return {'created_at': 20} to indicate state should be updated to this object.
452
453        :param current_stream_state: The stream's current state object
454        :param latest_record: The latest record extracted from the stream
455        :return: An updated state object
456        """
457        return {}
458
459    def get_cursor(self) -> Optional[Cursor]:
460        """
461        A Cursor is an interface that a stream can implement to manage how its internal state is read and updated while
462        reading records. Historically, Python connectors had no concept of a cursor to manage state. Python streams need
463        to define a cursor implementation and override this method to manage state through a Cursor.
464        """
465        return self.cursor
466
467    def _get_checkpoint_reader(
468        self,
469        logger: logging.Logger,
470        cursor_field: Optional[List[str]],
471        sync_mode: SyncMode,
472        stream_state: MutableMapping[str, Any],
473    ) -> CheckpointReader:
474        mappings_or_slices = self.stream_slices(
475            cursor_field=cursor_field,
476            sync_mode=sync_mode,  # todo: change this interface to no longer rely on sync_mode for behavior
477            stream_state=stream_state,
478        )
479
480        # Because of poor foresight, we wrote the default Stream.stream_slices() method to return [None] which is confusing and
481        # has now normalized this behavior for connector developers. Now some connectors return [None]. This is objectively
482        # misleading and a more ideal interface is [{}] to indicate we still want to iterate over one slice, but with no
483        # specific slice values. None is bad, and now I feel bad that I have to write this hack.
484        if mappings_or_slices == [None]:
485            mappings_or_slices = [{}]
486
487        slices_iterable_copy, iterable_for_detecting_format = itertools.tee(mappings_or_slices, 2)
488        stream_classification = self._classify_stream(
489            mappings_or_slices=iterable_for_detecting_format
490        )
491
492        # Streams that override has_multiple_slices are explicitly indicating that they will iterate over
493        # multiple partitions. Inspecting slices to automatically apply the correct cursor is only needed as
494        # a backup. So if this value was already assigned to True by the stream, we don't need to reassign it
495        self.has_multiple_slices = (
496            self.has_multiple_slices or stream_classification.has_multiple_slices
497        )
498
499        cursor = self.get_cursor()
500        if cursor:
501            cursor.set_initial_state(stream_state=stream_state)
502
503        checkpoint_mode = self._checkpoint_mode
504
505        if cursor and stream_classification.is_legacy_format:
506            return LegacyCursorBasedCheckpointReader(
507                stream_slices=slices_iterable_copy, cursor=cursor, read_state_from_cursor=True
508            )
509        elif cursor:
510            return CursorBasedCheckpointReader(
511                stream_slices=slices_iterable_copy,
512                cursor=cursor,
513                read_state_from_cursor=checkpoint_mode == CheckpointMode.RESUMABLE_FULL_REFRESH,
514            )
515        elif checkpoint_mode == CheckpointMode.RESUMABLE_FULL_REFRESH:
516            # Resumable full refresh readers rely on the stream state dynamically being updated during pagination and does
517            # not iterate over a static set of slices.
518            return ResumableFullRefreshCheckpointReader(stream_state=stream_state)
519        elif checkpoint_mode == CheckpointMode.INCREMENTAL:
520            return IncrementalCheckpointReader(
521                stream_slices=slices_iterable_copy, stream_state=stream_state
522            )
523        else:
524            return FullRefreshCheckpointReader(stream_slices=slices_iterable_copy)
525
526    @property
527    def _checkpoint_mode(self) -> CheckpointMode:
528        if self.is_resumable and len(self._wrapped_cursor_field()) > 0:
529            return CheckpointMode.INCREMENTAL
530        elif self.is_resumable:
531            return CheckpointMode.RESUMABLE_FULL_REFRESH
532        else:
533            return CheckpointMode.FULL_REFRESH
534
535    @staticmethod
536    def _classify_stream(
537        mappings_or_slices: Iterator[Optional[Union[Mapping[str, Any], StreamSlice]]],
538    ) -> StreamClassification:
539        """
540        This is a bit of a crazy solution, but also the only way we can detect certain attributes about the stream since Python
541        streams do not follow consistent implementation patterns. We care about the following two attributes:
542        - is_substream: Helps to incrementally release changes since substreams w/ parents are much more complicated. Also
543          helps de-risk the release of changes that might impact all connectors
544        - uses_legacy_slice_format: Since the checkpoint reader must manage a complex state object, we opted to have it always
545          use the structured StreamSlice object. However, this requires backwards compatibility with Python sources that only
546          support the legacy mapping object
547
548        Both attributes can eventually be deprecated once stream's define this method deleted once substreams have been implemented and
549        legacy connectors all adhere to the StreamSlice object.
550        """
551        if not mappings_or_slices:
552            raise ValueError("A stream should always have at least one slice")
553        try:
554            next_slice = next(mappings_or_slices)
555            if isinstance(next_slice, StreamSlice) and next_slice == StreamSlice(
556                partition={}, cursor_slice={}
557            ):
558                is_legacy_format = False
559                slice_has_value = False
560            elif next_slice == {}:
561                is_legacy_format = True
562                slice_has_value = False
563            elif isinstance(next_slice, StreamSlice):
564                is_legacy_format = False
565                slice_has_value = True
566            else:
567                is_legacy_format = True
568                slice_has_value = True
569        except StopIteration:
570            # If the stream has no slices, the format ultimately does not matter since no data will get synced. This is technically
571            # a valid case because it is up to the stream to define its slicing behavior
572            return StreamClassification(is_legacy_format=False, has_multiple_slices=False)
573
574        if slice_has_value:
575            # If the first slice contained a partition value from the result of stream_slices(), this is a substream that might
576            # have multiple parent records to iterate over
577            return StreamClassification(
578                is_legacy_format=is_legacy_format, has_multiple_slices=slice_has_value
579            )
580
581        try:
582            # If stream_slices() returns multiple slices, this is also a substream that can potentially generate empty slices
583            next(mappings_or_slices)
584            return StreamClassification(is_legacy_format=is_legacy_format, has_multiple_slices=True)
585        except StopIteration:
586            # If the result of stream_slices() only returns a single empty stream slice, then we know this is a regular stream
587            return StreamClassification(
588                is_legacy_format=is_legacy_format, has_multiple_slices=False
589            )
590
591    def log_stream_sync_configuration(self) -> None:
592        """
593        Logs the configuration of this stream.
594        """
595        self.logger.debug(
596            f"Syncing stream instance: {self.name}",
597            extra={
598                "primary_key": self.primary_key,
599                "cursor_field": self.cursor_field,
600            },
601        )
602
603    @staticmethod
604    def _wrapped_primary_key(
605        keys: Optional[Union[str, List[str], List[List[str]]]],
606    ) -> Optional[List[List[str]]]:
607        """
608        :return: wrap the primary_key property in a list of list of strings required by the Airbyte Stream object.
609        """
610        if not keys:
611            return None
612
613        if isinstance(keys, str):
614            return [[keys]]
615        elif isinstance(keys, list):
616            wrapped_keys = []
617            for component in keys:
618                if isinstance(component, str):
619                    wrapped_keys.append([component])
620                elif isinstance(component, list):
621                    wrapped_keys.append(component)
622                else:
623                    raise ValueError(f"Element must be either list or str. Got: {type(component)}")
624            return wrapped_keys
625        else:
626            raise ValueError(f"Element must be either list or str. Got: {type(keys)}")
627
628    def _observe_state(
629        self, checkpoint_reader: CheckpointReader, stream_state: Optional[Mapping[str, Any]] = None
630    ) -> None:
631        """
632        Convenience method that attempts to read the Stream's state using the recommended way of connector's managing their
633        own state via state setter/getter. But if we get back an AttributeError, then the legacy Stream.get_updated_state()
634        method is used as a fallback method.
635        """
636
637        # This is an inversion of the original logic that used to try state getter/setters first. As part of the work to
638        # automatically apply resumable full refresh to all streams, all HttpStream classes implement default state
639        # getter/setter methods, we should default to only using the incoming stream_state parameter value is {} which
640        # indicates the stream does not override the default get_updated_state() implementation. When the default method
641        # is not overridden, then the stream defers to self.state getter
642        if stream_state:
643            checkpoint_reader.observe(stream_state)
644        elif type(self).get_updated_state == Stream.get_updated_state:
645            # We only default to the state getter/setter if the stream does not use the legacy get_updated_state() method
646            try:
647                new_state = self.state  # type: ignore # This will always exist on HttpStreams, but may not for Stream
648                if new_state:
649                    checkpoint_reader.observe(new_state)
650            except AttributeError:
651                pass
652
653    def _checkpoint_state(  # type: ignore  # ignoring typing for ConnectorStateManager because of circular dependencies
654        self,
655        stream_state: Mapping[str, Any],
656        state_manager,
657    ) -> AirbyteMessage:
658        # todo: This can be consolidated into one ConnectorStateManager.update_and_create_state_message() method, but I want
659        #  to reduce changes right now and this would span concurrent as well
660        state_manager.update_state_for_stream(self.name, self.namespace, stream_state)
661        return state_manager.create_state_message(self.name, self.namespace)  # type: ignore [no-any-return]
662
663    @property
664    def configured_json_schema(self) -> Optional[Dict[str, Any]]:
665        """
666        This property is set from the read method.
667
668        :return Optional[Dict]: JSON schema from configured catalog if provided, otherwise None.
669        """
670        return self._configured_json_schema
671
672    @configured_json_schema.setter
673    def configured_json_schema(self, json_schema: Dict[str, Any]) -> None:
674        self._configured_json_schema = self._filter_schema_invalid_properties(json_schema)
675
676    def _filter_schema_invalid_properties(
677        self, configured_catalog_json_schema: Dict[str, Any]
678    ) -> Dict[str, Any]:
679        """
680        Filters the properties in json_schema that are not present in the stream schema.
681        Configured Schemas can have very old fields, so we need to housekeeping ourselves.
682        """
683        configured_schema: Any = configured_catalog_json_schema.get("properties", {})
684        stream_schema_properties: Any = self.get_json_schema().get("properties", {})
685
686        configured_keys = configured_schema.keys()
687        stream_keys = stream_schema_properties.keys()
688        invalid_properties = configured_keys - stream_keys
689        if not invalid_properties:
690            return configured_catalog_json_schema
691
692        self.logger.warning(
693            f"Stream {self.name}: the following fields are deprecated and cannot be synced. {invalid_properties}. Refresh the connection's source schema to resolve this warning."
694        )
695
696        valid_configured_schema_properties_keys = stream_keys & configured_keys
697        valid_configured_schema_properties = {}
698
699        for configured_schema_property in valid_configured_schema_properties_keys:
700            valid_configured_schema_properties[configured_schema_property] = (
701                stream_schema_properties[configured_schema_property]
702            )
703
704        return {**configured_catalog_json_schema, "properties": valid_configured_schema_properties}

Base abstract class for an Airbyte Stream. Makes no assumptions about the Stream's underlying transport protocol.

logger: logging.Logger
128    @property
129    def logger(self) -> logging.Logger:
130        return logging.getLogger(f"airbyte.streams.{self.name}")
transformer: TypeTransformer = <TypeTransformer object>
has_multiple_slices = False
name: str
139    @cached_property
140    def name(self) -> str:
141        """
142        :return: Stream name. By default this is the implementing class name, but it can be overridden as needed.
143        """
144        return casing.camel_to_snake(self.__class__.__name__)
Returns

Stream name. By default this is the implementing class name, but it can be overridden as needed.

def get_error_display_message(self, exception: BaseException) -> Optional[str]:
146    def get_error_display_message(self, exception: BaseException) -> Optional[str]:
147        """
148        Retrieves the user-friendly display message that corresponds to an exception.
149        This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage.
150
151        The default implementation of this method does not return user-friendly messages for any exception type, but it should be overriden as needed.
152
153        :param exception: The exception that was raised
154        :return: A user-friendly message that indicates the cause of the error
155        """
156        return None

Retrieves the user-friendly display message that corresponds to an exception. This is called when an exception is encountered while reading records from the stream, and the result is used to build the AirbyteTraceMessage.

The default implementation of this method does not return user-friendly messages for any exception type, but it should be overridden as needed.

Parameters
  • exception: The exception that was raised
Returns

A user-friendly message that indicates the cause of the error
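
A hypothetical override might map a known exception type to an actionable message and fall back to the default for anything else. PermissionError below is only a stand-in for whatever errors a real source can raise.

from typing import Any, Iterable, List, Mapping, Optional

from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams import Stream


class ContactsStream(Stream):
    """Hypothetical stream used only to illustrate the override."""

    primary_key = "id"

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: Optional[List[str]] = None,
        stream_slice: Optional[Mapping[str, Any]] = None,
        stream_state: Optional[Mapping[str, Any]] = None,
    ) -> Iterable[Mapping[str, Any]]:
        yield {"id": 1}

    def get_error_display_message(self, exception: BaseException) -> Optional[str]:
        # Map a known failure mode to a message the user can act on.
        if isinstance(exception, PermissionError):
            return "The provided credentials do not have access to the contacts endpoint."
        return None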

def read( self, configured_stream: airbyte_protocol_dataclasses.models.airbyte_protocol.ConfiguredAirbyteStream, logger: logging.Logger, slice_logger: airbyte_cdk.sources.utils.slice_logger.SliceLogger, stream_state: MutableMapping[str, Any], state_manager, internal_config: InternalConfig) -> Iterable[Union[Mapping[str, Any], AirbyteMessage]]:
158    def read(  # type: ignore  # ignoring typing for ConnectorStateManager because of circular dependencies
159        self,
160        configured_stream: ConfiguredAirbyteStream,
161        logger: logging.Logger,
162        slice_logger: SliceLogger,
163        stream_state: MutableMapping[str, Any],
164        state_manager,
165        internal_config: InternalConfig,
166    ) -> Iterable[StreamData]:
167        sync_mode = configured_stream.sync_mode
168        cursor_field = configured_stream.cursor_field
169        self.configured_json_schema = configured_stream.stream.json_schema
170
171        # WARNING: When performing a read() that uses incoming stream state, we MUST use the self.state that is defined as
172        # opposed to the incoming stream_state value. Because some connectors like ones using the file-based CDK modify
173        # state before setting the value on the Stream attribute, the most up-to-date state is derived from Stream.state
174        # instead of the stream_state parameter. This does not apply to legacy connectors using get_updated_state().
175        try:
176            stream_state = self.state  # type: ignore # we know the field might not exist...
177        except AttributeError:
178            pass
179
180        should_checkpoint = bool(state_manager)
181        checkpoint_reader = self._get_checkpoint_reader(
182            logger=logger, cursor_field=cursor_field, sync_mode=sync_mode, stream_state=stream_state
183        )
184
185        next_slice = checkpoint_reader.next()
186        record_counter = 0
187        stream_state_tracker = copy.deepcopy(stream_state)
188        while next_slice is not None:
189            if slice_logger.should_log_slice_message(logger):
190                yield slice_logger.create_slice_log_message(next_slice)
191            records = self.read_records(
192                sync_mode=sync_mode,  # todo: change this interface to no longer rely on sync_mode for behavior
193                stream_slice=next_slice,
194                stream_state=stream_state,
195                cursor_field=cursor_field or None,
196            )
197            for record_data_or_message in records:
198                yield record_data_or_message
199                if isinstance(record_data_or_message, Mapping) or (
200                    hasattr(record_data_or_message, "type")
201                    and record_data_or_message.type == MessageType.RECORD
202                ):
203                    record_data = (
204                        record_data_or_message
205                        if isinstance(record_data_or_message, Mapping)
206                        else record_data_or_message.record
207                    )
208
209                    # Thanks I hate it. RFR fundamentally doesn't fit with the concept of the legacy Stream.get_updated_state()
210                    # method because RFR streams rely on pagination as a cursor. Stream.get_updated_state() was designed to make
211                    # the CDK manage state using specifically the last seen record. don't @ brian.lai
212                    #
213                    # Also, because the legacy incremental state case decouples observing incoming records from emitting state, it
214                    # requires that we separate CheckpointReader.observe() and CheckpointReader.get_checkpoint() which could
215                    # otherwise be combined.
216                    if self.cursor_field:
217                        # Some connectors have streams that implement get_updated_state(), but do not define a cursor_field. This
218                        # should be fixed on the stream implementation, but we should also protect against this in the CDK as well
219                        stream_state_tracker = self.get_updated_state(
220                            stream_state_tracker,
221                            record_data,  # type: ignore [arg-type]
222                        )
223                        self._observe_state(checkpoint_reader, stream_state_tracker)
224                    record_counter += 1
225
226                    checkpoint_interval = self.state_checkpoint_interval
227                    if (
228                        should_checkpoint
229                        and checkpoint_interval
230                        and record_counter % checkpoint_interval == 0
231                    ):
232                        checkpoint = checkpoint_reader.get_checkpoint()
233                        if checkpoint:
234                            airbyte_state_message = self._checkpoint_state(
235                                checkpoint, state_manager=state_manager
236                            )
237                            yield airbyte_state_message
238
239                    if internal_config.is_limit_reached(record_counter):
240                        break
241            self._observe_state(checkpoint_reader)
242            checkpoint_state = checkpoint_reader.get_checkpoint()
243            if should_checkpoint and checkpoint_state is not None:
244                airbyte_state_message = self._checkpoint_state(
245                    checkpoint_state, state_manager=state_manager
246                )
247                yield airbyte_state_message
248
249            next_slice = checkpoint_reader.next()
250
251        checkpoint = checkpoint_reader.get_checkpoint()
252        if should_checkpoint and checkpoint is not None:
253            airbyte_state_message = self._checkpoint_state(checkpoint, state_manager=state_manager)
254            yield airbyte_state_message
def read_only_records( self, state: Optional[Mapping[str, Any]] = None) -> Iterable[Union[Mapping[str, Any], AirbyteMessage]]:
256    def read_only_records(self, state: Optional[Mapping[str, Any]] = None) -> Iterable[StreamData]:
257        """
258        Helper method that performs a read on a stream with an optional state and emits records. If the parent stream supports
259        incremental, this operation does not update the stream's internal state (if it uses the modern state setter/getter)
260        or emit state messages.
261        """
262
263        configured_stream = ConfiguredAirbyteStream(
264            stream=AirbyteStream(
265                name=self.name,
266                json_schema={},
267                supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
268            ),
269            sync_mode=SyncMode.incremental if state else SyncMode.full_refresh,
270            destination_sync_mode=DestinationSyncMode.append,
271        )
272
273        yield from self.read(
274            configured_stream=configured_stream,
275            logger=self.logger,
276            slice_logger=DebugSliceLogger(),
277            stream_state=dict(state)
278            if state
279            else {},  # read() expects MutableMapping instead of Mapping which is used more often
280            state_manager=None,
281            internal_config=InternalConfig(),  # type: ignore [call-arg]
282        )

Helper method that performs a read on a stream with an optional state and emits records. If the parent stream supports incremental, this operation does not update the stream's internal state (if it uses the modern state setter/getter) or emit state messages.
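
A self-contained sketch follows; the stream and its records are made up, and get_json_schema is overridden inline so the default schemas/<name>.json lookup is not needed.

from typing import Any, Iterable, List, Mapping, Optional

from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams import Stream


class ColorsStream(Stream):
    """Hypothetical stream backed by a static list of records."""

    primary_key = None

    def get_json_schema(self) -> Mapping[str, Any]:
        return {"type": "object", "properties": {"color": {"type": "string"}}}

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: Optional[List[str]] = None,
        stream_slice: Optional[Mapping[str, Any]] = None,
        stream_state: Optional[Mapping[str, Any]] = None,
    ) -> Iterable[Mapping[str, Any]]:
        yield from [{"color": "teal"}, {"color": "mauve"}]


# Iterate records without emitting STATE messages or mutating the stream's state.
for record in ColorsStream().read_only_records():
    print(record)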

@abstractmethod
def read_records( self, sync_mode: airbyte_protocol_dataclasses.models.airbyte_protocol.SyncMode, cursor_field: Optional[List[str]] = None, stream_slice: Optional[Mapping[str, Any]] = None, stream_state: Optional[Mapping[str, Any]] = None) -> Iterable[Union[Mapping[str, Any], AirbyteMessage]]:
284    @abstractmethod
285    def read_records(
286        self,
287        sync_mode: SyncMode,
288        cursor_field: Optional[List[str]] = None,
289        stream_slice: Optional[Mapping[str, Any]] = None,
290        stream_state: Optional[Mapping[str, Any]] = None,
291    ) -> Iterable[StreamData]:
292        """
293        This method should be overridden by subclasses to read records based on the inputs
294        """

This method should be overridden by subclasses to read records based on the given inputs.
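
A hypothetical implementation that reads from an in-memory list and honors the stream_state input might look like this:

from typing import Any, Iterable, List, Mapping, Optional

from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams import Stream

# Hypothetical in-memory "API" used only for illustration.
FAKE_USERS = [
    {"id": 1, "updated_at": "2024-01-01"},
    {"id": 2, "updated_at": "2024-02-01"},
]


class UsersStream(Stream):
    primary_key = "id"

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: Optional[List[str]] = None,
        stream_slice: Optional[Mapping[str, Any]] = None,
        stream_state: Optional[Mapping[str, Any]] = None,
    ) -> Iterable[Mapping[str, Any]]:
        # Skip records at or before the cursor value carried by stream_state, if any.
        cursor_value = (stream_state or {}).get("updated_at", "")
        for user in FAKE_USERS:
            if user["updated_at"] > cursor_value:
                yield user


# Direct call for illustration; in a connector, read_records is driven by Stream.read().
print(list(UsersStream().read_records(sync_mode=SyncMode.full_refresh)))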

@lru_cache(maxsize=None)
def get_json_schema(self) -> Mapping[str, Any]:
296    @lru_cache(maxsize=None)
297    def get_json_schema(self) -> Mapping[str, Any]:
298        """
299        :return: A dict of the JSON schema representing this stream.
300
301        The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property.
302        Override as needed.
303        """
304        # TODO show an example of using pydantic to define the JSON schema, or reading an OpenAPI spec
305        return ResourceSchemaLoader(package_name_from_class(self.__class__)).get_schema(self.name)
Returns

A dict of the JSON schema representing this stream.

The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property. Override as needed.
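
As a sketch, a stream can return its schema inline instead of relying on the default file lookup; the class name, fields, and record below are hypothetical.

from typing import Any, Iterable, List, Mapping, Optional

from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams import Stream


class OrdersStream(Stream):
    primary_key = "id"

    def get_json_schema(self) -> Mapping[str, Any]:
        # Without this override, the default implementation would look for a JSON Schema
        # file named after the stream (e.g. schemas/orders_stream.json) in the connector package.
        return {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "id": {"type": "integer"},
                "total": {"type": "number"},
            },
        }

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: Optional[List[str]] = None,
        stream_slice: Optional[Mapping[str, Any]] = None,
        stream_state: Optional[Mapping[str, Any]] = None,
    ) -> Iterable[Mapping[str, Any]]:
        yield {"id": 1, "total": 9.99}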

def as_airbyte_stream( self) -> airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteStream:
307    def as_airbyte_stream(self) -> AirbyteStream:
308        stream = AirbyteStream(
309            name=self.name,
310            json_schema=dict(self.get_json_schema()),
311            supported_sync_modes=[SyncMode.full_refresh],
312            is_resumable=self.is_resumable,
313        )
314
315        if self.namespace:
316            stream.namespace = self.namespace
317
318        # If we can offer incremental we always should. RFR is always less reliable than incremental which uses a real cursor value
319        if self.supports_incremental:
320            stream.source_defined_cursor = self.source_defined_cursor
321            stream.supported_sync_modes.append(SyncMode.incremental)
322            stream.default_cursor_field = self._wrapped_cursor_field()
323
324        keys = Stream._wrapped_primary_key(self.primary_key)
325        if keys and len(keys) > 0:
326            stream.source_defined_primary_key = keys
327
328        return stream
supports_incremental: bool
330    @property
331    def supports_incremental(self) -> bool:
332        """
333        :return: True if this stream supports incrementally reading data
334        """
335        return len(self._wrapped_cursor_field()) > 0
Returns

True if this stream supports incrementally reading data

is_resumable: bool
337    @property
338    def is_resumable(self) -> bool:
339        """
340        :return: True if this stream allows the checkpointing of sync progress and can resume from it on subsequent attempts.
341        This differs from supports_incremental because certain kinds of streams like those supporting resumable full refresh
342        can checkpoint progress in between attempts for improved fault tolerance. However, they will start from the beginning
343        on the next sync job.
344        """
345        if self.supports_incremental:
346            return True
347        if self.has_multiple_slices:
348            # We temporarily gate substream to not support RFR because puts a pretty high burden on connector developers
349            # to structure stream state in a very specific way. We also can't check for issubclass(HttpSubStream) because
350            # not all substreams implement the interface and it would be a circular dependency so we use parent as a surrogate
351            return False
352        elif hasattr(type(self), "state") and getattr(type(self), "state").fset is not None:
353            # Modern case where a stream manages state using getter/setter
354            return True
355        else:
356            # Legacy case where the CDK manages state via the get_updated_state() method. This is determined by checking if
357            # the stream's get_updated_state() differs from the Stream class and therefore has been overridden
358            return type(self).get_updated_state != Stream.get_updated_state
Returns

True if this stream allows the checkpointing of sync progress and can resume from it on subsequent attempts. This differs from supports_incremental because certain kinds of streams like those supporting resumable full refresh can checkpoint progress in between attempts for improved fault tolerance. However, they will start from the beginning on the next sync job.

cursor_field: Union[str, List[str]]
363    @property
364    def cursor_field(self) -> Union[str, List[str]]:
365        """
366        Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field.
367        :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
368        """
369        return []

Override to return the default cursor field used by this stream, e.g. an API entity might always use created_at as the cursor field.

Returns

The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
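For example, inside a Stream subclass whose records carry a created_at timestamp:

    @property
    def cursor_field(self):
        return "created_at"  # or a path such as ["metadata", "created_at"] for a nested cursor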

namespace: Optional[str]
371    @property
372    def namespace(self) -> Optional[str]:
373        """
374        Override to return the namespace of this stream, e.g. the Postgres schema which this stream will emit records for.
375        :return: A string containing the name of the namespace.
376        """
377        return None

Override to return the namespace of this stream, e.g. the Postgres schema which this stream will emit records for.

Returns

A string containing the name of the namespace.

source_defined_cursor: bool
379    @property
380    def source_defined_cursor(self) -> bool:
381        """
382        Return False if the cursor can be configured by the user.
383        """
384        return True

Return False if the cursor can be configured by the user.

exit_on_rate_limit: bool
386    @property
387    def exit_on_rate_limit(self) -> bool:
388        """Exit on rate limit getter, should return bool value. False if the stream will retry endlessly when rate limited."""
389        return self._exit_on_rate_limit

Getter for the exit-on-rate-limit flag; returns a bool. False means the stream will retry endlessly when rate limited.

primary_key: Union[str, List[str], List[List[str]], NoneType]
396    @property
397    @abstractmethod
398    def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]:
399        """
400        :return: string if single primary key, list of strings if composite primary key, list of list of strings if composite primary key consisting of nested fields.
401          If the stream has no primary keys, return None.
402        """
Returns

A string if there is a single primary key, a list of strings for a composite primary key, or a list of lists of strings for a composite primary key consisting of nested fields. If the stream has no primary key, return None.
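Illustrative overrides for each supported shape (field names are hypothetical), defined inside your Stream subclass:

    @property
    def primary_key(self):
        return "id"                           # single-field primary key
        # return ["account_id", "id"]         # composite primary key
        # return [["billing", "account_id"]]  # composite primary key on a nested field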

def stream_slices( self, *, sync_mode: airbyte_protocol_dataclasses.models.airbyte_protocol.SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None) -> Iterable[Optional[Mapping[str, Any]]]:
404    def stream_slices(
405        self,
406        *,
407        sync_mode: SyncMode,
408        cursor_field: Optional[List[str]] = None,
409        stream_state: Optional[Mapping[str, Any]] = None,
410    ) -> Iterable[Optional[Mapping[str, Any]]]:
411        """
412        Override to define the slices for this stream. See the stream slicing section of the docs for more information.
413
414        :param sync_mode:
415        :param cursor_field:
416        :param stream_state:
417        :return:
418        """
419        yield StreamSlice(partition={}, cursor_slice={})

Override to define the slices for this stream. See the stream slicing section of the docs for more information.

Parameters
  • sync_mode:
  • cursor_field:
  • stream_state:
Returns
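A sketch of slicing a stream into roughly month-long date windows so each slice can be requested and checkpointed independently; the _start_date attribute and the slice keys are assumptions, not CDK APIs:

    from datetime import datetime, timedelta

    def stream_slices(self, *, sync_mode, cursor_field=None, stream_state=None):
        # Emit one slice per ~30-day window between the configured start date and now (illustrative).
        start = datetime.strptime(self._start_date, "%Y-%m-%d")
        while start < datetime.now():
            end = start + timedelta(days=30)
            yield {"start_date": start.strftime("%Y-%m-%d"), "end_date": end.strftime("%Y-%m-%d")}
            start = end
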
state_checkpoint_interval: Optional[int]
421    @property
422    def state_checkpoint_interval(self) -> Optional[int]:
423        """
424        Decides how often to checkpoint state (i.e: emit a STATE message). E.g: if this returns a value of 100, then state is persisted after reading
425        100 records, then 200, 300, etc.. A good default value is 1000 although your mileage may vary depending on the underlying data source.
426
427        Checkpointing a stream avoids re-reading records in the case a sync is failed or cancelled.
428
429        return None if state should not be checkpointed e.g: because records returned from the underlying data source are not returned in
430        ascending order with respect to the cursor field. This can happen if the source does not support reading records in ascending order of
431        created_at date (or whatever the cursor is). In those cases, state must only be saved once the full stream has been read.
432        """
433        return None

Decides how often to checkpoint state (i.e. emit a STATE message). For example, if this returns 100, then state is persisted after reading 100 records, then 200, 300, and so on. A good default value is 1000, although your mileage may vary depending on the underlying data source.

Checkpointing a stream avoids re-reading records if a sync fails or is cancelled.

Return None if state should not be checkpointed, e.g. because records returned from the underlying data source are not in ascending order with respect to the cursor field. This can happen if the source does not support reading records in ascending order of the created_at date (or whatever the cursor is). In those cases, state must only be saved once the full stream has been read.
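For example, to checkpoint after every 1000 records, a Stream subclass can simply override the property:

    @property
    def state_checkpoint_interval(self):
        return 1000  # emit a STATE message every 1000 records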

def get_updated_state( self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]) -> MutableMapping[str, Any]:
441    def get_updated_state(
442        self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
443    ) -> MutableMapping[str, Any]:
444        """DEPRECATED. Please use explicit state property instead, see `IncrementalMixin` docs.
445
446        Override to extract state from the latest record. Needed to implement incremental sync.
447
448        Inspects the latest record extracted from the data source and the current state object and return an updated state object.
449
450        For example: if the state object is based on created_at timestamp, and the current state is {'created_at': 10}, and the latest_record is
451        {'name': 'octavia', 'created_at': 20 } then this method would return {'created_at': 20} to indicate state should be updated to this object.
452
453        :param current_stream_state: The stream's current state object
454        :param latest_record: The latest record extracted from the stream
455        :return: An updated state object
456        """
457        return {}

DEPRECATED. Please use explicit state property instead, see IncrementalMixin docs.

Override to extract state from the latest record. Needed to implement incremental sync.

Inspects the latest record extracted from the data source and the current state object, and returns an updated state object.

For example: if the state object is based on created_at timestamp, and the current state is {'created_at': 10}, and the latest_record is {'name': 'octavia', 'created_at': 20 } then this method would return {'created_at': 20} to indicate state should be updated to this object.

Parameters
  • current_stream_state: The stream's current state object
  • latest_record: The latest record extracted from the stream
Returns

An updated state object
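A sketch of the created_at example from the docstring, for legacy streams that still rely on this deprecated hook:

    def get_updated_state(self, current_stream_state, latest_record):
        latest_cursor = latest_record.get("created_at", 0)
        current_cursor = current_stream_state.get("created_at", 0)
        return {"created_at": max(latest_cursor, current_cursor)}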

def get_cursor(self) -> Optional[airbyte_cdk.sources.streams.checkpoint.Cursor]:
459    def get_cursor(self) -> Optional[Cursor]:
460        """
461        A Cursor is an interface that a stream can implement to manage how its internal state is read and updated while
462        reading records. Historically, Python connectors had no concept of a cursor to manage state. Python streams need
463        to define a cursor implementation and override this method to manage state through a Cursor.
464        """
465        return self.cursor

A Cursor is an interface that a stream can implement to manage how its internal state is read and updated while reading records. Historically, Python connectors had no concept of a cursor to manage state. Python streams need to define a cursor implementation and override this method to manage state through a Cursor.

def log_stream_sync_configuration(self) -> None:
591    def log_stream_sync_configuration(self) -> None:
592        """
593        Logs the configuration of this stream.
594        """
595        self.logger.debug(
596            f"Syncing stream instance: {self.name}",
597            extra={
598                "primary_key": self.primary_key,
599                "cursor_field": self.cursor_field,
600            },
601        )

Logs the configuration of this stream.

configured_json_schema: Optional[Dict[str, Any]]
663    @property
664    def configured_json_schema(self) -> Optional[Dict[str, Any]]:
665        """
666        This property is set from the read method.
667
668        :return Optional[Dict]: JSON schema from configured catalog if provided, otherwise None.
669        """
670        return self._configured_json_schema

This property is set from the read method.

Returns

JSON schema from configured catalog if provided, otherwise None.

StreamData
def package_name_from_class(cls: object) -> str:
52def package_name_from_class(cls: object) -> str:
53    """Find the package name given a class name"""
54    module = inspect.getmodule(cls)
55    if module is not None:
56        return module.__name__.split(".")[0]
57    else:
58        raise ValueError(f"Could not find package name for class {cls}")

Find the package name given a class name

class AirbyteTracedException(builtins.Exception):
 26class AirbyteTracedException(Exception):
 27    """
 28    An exception that should be emitted as an AirbyteTraceMessage
 29    """
 30
 31    def __init__(
 32        self,
 33        internal_message: Optional[str] = None,
 34        message: Optional[str] = None,
 35        failure_type: FailureType = FailureType.system_error,
 36        exception: Optional[BaseException] = None,
 37        stream_descriptor: Optional[StreamDescriptor] = None,
 38    ):
 39        """
 40        :param internal_message: the internal error that caused the failure
 41        :param message: a user-friendly message that indicates the cause of the error
 42        :param failure_type: the type of error
 43        :param exception: the exception that caused the error, from which the stack trace should be retrieved
 44        :param stream_descriptor: describe the stream from which the exception comes from
 45        """
 46        self.internal_message = internal_message
 47        self.message = message
 48        self.failure_type = failure_type
 49        self._exception = exception
 50        self._stream_descriptor = stream_descriptor
 51        super().__init__(internal_message)
 52
 53    def as_airbyte_message(
 54        self, stream_descriptor: Optional[StreamDescriptor] = None
 55    ) -> AirbyteMessage:
 56        """
 57        Builds an AirbyteTraceMessage from the exception
 58
 59        :param stream_descriptor is deprecated, please use the stream_description in `__init__ or `from_exception`. If many
 60          stream_descriptors are defined, the one from `as_airbyte_message` will be discarded.
 61        """
 62        now_millis = time.time_ns() // 1_000_000
 63
 64        trace_exc = self._exception or self
 65        stack_trace_str = "".join(traceback.TracebackException.from_exception(trace_exc).format())
 66
 67        trace_message = AirbyteTraceMessage(
 68            type=TraceType.ERROR,
 69            emitted_at=now_millis,
 70            error=AirbyteErrorTraceMessage(
 71                message=self.message
 72                or "Something went wrong in the connector. See the logs for more details.",
 73                internal_message=self.internal_message,
 74                failure_type=self.failure_type,
 75                stack_trace=stack_trace_str,
 76                stream_descriptor=self._stream_descriptor
 77                if self._stream_descriptor is not None
 78                else stream_descriptor,
 79            ),
 80        )
 81
 82        return AirbyteMessage(type=MessageType.TRACE, trace=trace_message)
 83
 84    def as_connection_status_message(self) -> Optional[AirbyteMessage]:
 85        if self.failure_type == FailureType.config_error:
 86            return AirbyteMessage(
 87                type=MessageType.CONNECTION_STATUS,
 88                connectionStatus=AirbyteConnectionStatus(
 89                    status=Status.FAILED, message=self.message
 90                ),
 91            )
 92        return None
 93
 94    def emit_message(self) -> None:
 95        """
 96        Prints the exception as an AirbyteTraceMessage.
 97        Note that this will be called automatically on uncaught exceptions when using the airbyte_cdk entrypoint.
 98        """
 99        message = orjson.dumps(AirbyteMessageSerializer.dump(self.as_airbyte_message())).decode()
100        filtered_message = filter_secrets(message)
101        print(filtered_message)
102
103    @classmethod
104    def from_exception(
105        cls,
106        exc: BaseException,
107        stream_descriptor: Optional[StreamDescriptor] = None,
108        *args: Any,
109        **kwargs: Any,
110    ) -> "AirbyteTracedException":
111        """
112        Helper to create an AirbyteTracedException from an existing exception
113        :param exc: the exception that caused the error
114        :param stream_descriptor: describe the stream from which the exception comes from
115        """
116        return cls(
117            internal_message=str(exc),
118            exception=exc,
119            stream_descriptor=stream_descriptor,
120            *args,
121            **kwargs,
122        )  # type: ignore  # ignoring because of args and kwargs
123
124    def as_sanitized_airbyte_message(
125        self, stream_descriptor: Optional[StreamDescriptor] = None
126    ) -> AirbyteMessage:
127        """
128        Builds an AirbyteTraceMessage from the exception and sanitizes any secrets from the message body
129
130        :param stream_descriptor is deprecated, please use the stream_description in `__init__ or `from_exception`. If many
131          stream_descriptors are defined, the one from `as_sanitized_airbyte_message` will be discarded.
132        """
133        error_message = self.as_airbyte_message(stream_descriptor=stream_descriptor)
134        if error_message.trace.error.message:  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
135            error_message.trace.error.message = filter_secrets(  # type: ignore[union-attr]
136                error_message.trace.error.message,  # type: ignore[union-attr]
137            )
138        if error_message.trace.error.internal_message:  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
139            error_message.trace.error.internal_message = filter_secrets(  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
140                error_message.trace.error.internal_message  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
141            )
142        if error_message.trace.error.stack_trace:  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
143            error_message.trace.error.stack_trace = filter_secrets(  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
144                error_message.trace.error.stack_trace  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
145            )
146        return error_message

An exception that should be emitted as an AirbyteTraceMessage
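A minimal sketch of raising the exception for a user-facing configuration problem; the message text is illustrative, and FailureType is the protocol enum used throughout this reference:

    raise AirbyteTracedException(
        internal_message="401 Unauthorized returned by the API",
        message="The provided API key is invalid. Please check your credentials.",
        failure_type=FailureType.config_error,
    )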

AirbyteTracedException( internal_message: Optional[str] = None, message: Optional[str] = None, failure_type: airbyte_protocol_dataclasses.models.airbyte_protocol.FailureType = <FailureType.system_error: 'system_error'>, exception: Optional[BaseException] = None, stream_descriptor: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.StreamDescriptor] = None)
31    def __init__(
32        self,
33        internal_message: Optional[str] = None,
34        message: Optional[str] = None,
35        failure_type: FailureType = FailureType.system_error,
36        exception: Optional[BaseException] = None,
37        stream_descriptor: Optional[StreamDescriptor] = None,
38    ):
39        """
40        :param internal_message: the internal error that caused the failure
41        :param message: a user-friendly message that indicates the cause of the error
42        :param failure_type: the type of error
43        :param exception: the exception that caused the error, from which the stack trace should be retrieved
44        :param stream_descriptor: describe the stream from which the exception comes from
45        """
46        self.internal_message = internal_message
47        self.message = message
48        self.failure_type = failure_type
49        self._exception = exception
50        self._stream_descriptor = stream_descriptor
51        super().__init__(internal_message)
Parameters
  • internal_message: the internal error that caused the failure
  • message: a user-friendly message that indicates the cause of the error
  • failure_type: the type of error
  • exception: the exception that caused the error, from which the stack trace should be retrieved
  • stream_descriptor: describe the stream from which the exception comes from
internal_message
message
failure_type
def as_airbyte_message( self, stream_descriptor: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.StreamDescriptor] = None) -> AirbyteMessage:
53    def as_airbyte_message(
54        self, stream_descriptor: Optional[StreamDescriptor] = None
55    ) -> AirbyteMessage:
56        """
57        Builds an AirbyteTraceMessage from the exception
58
59        :param stream_descriptor is deprecated, please use the stream_description in `__init__ or `from_exception`. If many
60          stream_descriptors are defined, the one from `as_airbyte_message` will be discarded.
61        """
62        now_millis = time.time_ns() // 1_000_000
63
64        trace_exc = self._exception or self
65        stack_trace_str = "".join(traceback.TracebackException.from_exception(trace_exc).format())
66
67        trace_message = AirbyteTraceMessage(
68            type=TraceType.ERROR,
69            emitted_at=now_millis,
70            error=AirbyteErrorTraceMessage(
71                message=self.message
72                or "Something went wrong in the connector. See the logs for more details.",
73                internal_message=self.internal_message,
74                failure_type=self.failure_type,
75                stack_trace=stack_trace_str,
76                stream_descriptor=self._stream_descriptor
77                if self._stream_descriptor is not None
78                else stream_descriptor,
79            ),
80        )
81
82        return AirbyteMessage(type=MessageType.TRACE, trace=trace_message)

Builds an AirbyteTraceMessage from the exception

The stream_descriptor parameter is deprecated; pass the stream descriptor to __init__ or from_exception instead. If multiple stream descriptors are defined, the one given to as_airbyte_message is discarded.

def as_connection_status_message(self) -> Optional[AirbyteMessage]:
84    def as_connection_status_message(self) -> Optional[AirbyteMessage]:
85        if self.failure_type == FailureType.config_error:
86            return AirbyteMessage(
87                type=MessageType.CONNECTION_STATUS,
88                connectionStatus=AirbyteConnectionStatus(
89                    status=Status.FAILED, message=self.message
90                ),
91            )
92        return None
def emit_message(self) -> None:
 94    def emit_message(self) -> None:
 95        """
 96        Prints the exception as an AirbyteTraceMessage.
 97        Note that this will be called automatically on uncaught exceptions when using the airbyte_cdk entrypoint.
 98        """
 99        message = orjson.dumps(AirbyteMessageSerializer.dump(self.as_airbyte_message())).decode()
100        filtered_message = filter_secrets(message)
101        print(filtered_message)

Prints the exception as an AirbyteTraceMessage. Note that this will be called automatically on uncaught exceptions when using the airbyte_cdk entrypoint.

@classmethod
def from_exception( cls, exc: BaseException, stream_descriptor: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.StreamDescriptor] = None, *args: Any, **kwargs: Any) -> AirbyteTracedException:
103    @classmethod
104    def from_exception(
105        cls,
106        exc: BaseException,
107        stream_descriptor: Optional[StreamDescriptor] = None,
108        *args: Any,
109        **kwargs: Any,
110    ) -> "AirbyteTracedException":
111        """
112        Helper to create an AirbyteTracedException from an existing exception
113        :param exc: the exception that caused the error
114        :param stream_descriptor: describe the stream from which the exception comes from
115        """
116        return cls(
117            internal_message=str(exc),
118            exception=exc,
119            stream_descriptor=stream_descriptor,
120            *args,
121            **kwargs,
122        )  # type: ignore  # ignoring because of args and kwargs

Helper to create an AirbyteTracedException from an existing exception

Parameters
  • exc: the exception that caused the error
  • stream_descriptor: describe the stream from which the exception comes from
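A sketch of wrapping a lower-level error so it is reported as a trace message; the requests call and URL are illustrative:

    import requests

    try:
        response = requests.get("https://api.example.com/items")
        response.raise_for_status()
    except requests.HTTPError as exc:
        raise AirbyteTracedException.from_exception(exc, message="The API request failed.")
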
def as_sanitized_airbyte_message( self, stream_descriptor: Optional[airbyte_protocol_dataclasses.models.airbyte_protocol.StreamDescriptor] = None) -> AirbyteMessage:
124    def as_sanitized_airbyte_message(
125        self, stream_descriptor: Optional[StreamDescriptor] = None
126    ) -> AirbyteMessage:
127        """
128        Builds an AirbyteTraceMessage from the exception and sanitizes any secrets from the message body
129
130        :param stream_descriptor is deprecated, please use the stream_description in `__init__ or `from_exception`. If many
131          stream_descriptors are defined, the one from `as_sanitized_airbyte_message` will be discarded.
132        """
133        error_message = self.as_airbyte_message(stream_descriptor=stream_descriptor)
134        if error_message.trace.error.message:  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
135            error_message.trace.error.message = filter_secrets(  # type: ignore[union-attr]
136                error_message.trace.error.message,  # type: ignore[union-attr]
137            )
138        if error_message.trace.error.internal_message:  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
139            error_message.trace.error.internal_message = filter_secrets(  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
140                error_message.trace.error.internal_message  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
141            )
142        if error_message.trace.error.stack_trace:  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
143            error_message.trace.error.stack_trace = filter_secrets(  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
144                error_message.trace.error.stack_trace  # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage
145            )
146        return error_message

Builds an AirbyteTraceMessage from the exception and sanitizes any secrets from the message body

The stream_descriptor parameter is deprecated; pass the stream descriptor to __init__ or from_exception instead. If multiple stream descriptors are defined, the one given to as_sanitized_airbyte_message is discarded.

def is_cloud_environment() -> bool:
11def is_cloud_environment() -> bool:
12    """
13    Returns True if the connector is running in a cloud environment, False otherwise.
14
15    The function checks the value of the DEPLOYMENT_MODE environment variable which is set by the platform.
16    This function can be used to determine whether stricter security measures should be applied.
17    """
18    deployment_mode = os.environ.get("DEPLOYMENT_MODE", "")
19    return deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE

Returns True if the connector is running in a cloud environment, False otherwise.

The function checks the value of the DEPLOYMENT_MODE environment variable which is set by the platform. This function can be used to determine whether stricter security measures should be applied.
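For example, a connector might enforce HTTPS endpoints only when running in a cloud environment; the config key and error handling below are illustrative:

    if is_cloud_environment() and not config["endpoint"].startswith("https://"):
        raise AirbyteTracedException(
            message="Only HTTPS endpoints are allowed in the cloud environment.",
            failure_type=FailureType.config_error,
        )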

class InternalConfig(pydantic.v1.main.BaseModel):
190class InternalConfig(BaseModel):
191    KEYWORDS: ClassVar[set[str]] = {"_limit", "_page_size"}
192    limit: int = Field(None, alias="_limit")
193    page_size: int = Field(None, alias="_page_size")
194
195    def dict(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
196        kwargs["by_alias"] = True
197        kwargs["exclude_unset"] = True
198        return super().dict(*args, **kwargs)
199
200    def is_limit_reached(self, records_counter: int) -> bool:
201        """
202        Check if record count reached limit set by internal config.
203        :param records_counter - number of records already red
204        :return True if limit reached, False otherwise
205        """
206        if self.limit:
207            if records_counter >= self.limit:
208                return True
209        return False
KEYWORDS: ClassVar[set[str]] = {'_limit', '_page_size'}
limit: int
page_size: int
def dict(self, *args: Any, **kwargs: Any) -> dict[str, typing.Any]:
195    def dict(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
196        kwargs["by_alias"] = True
197        kwargs["exclude_unset"] = True
198        return super().dict(*args, **kwargs)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

def is_limit_reached(self, records_counter: int) -> bool:
200    def is_limit_reached(self, records_counter: int) -> bool:
201        """
202        Check if record count reached limit set by internal config.
203        :param records_counter - number of records already red
204        :return True if limit reached, False otherwise
205        """
206        if self.limit:
207            if records_counter >= self.limit:
208                return True
209        return False

Check whether the record count has reached the limit set by the internal config.

Parameters
  • records_counter: number of records already read
Returns

True if the limit is reached, False otherwise

class ResourceSchemaLoader:
116class ResourceSchemaLoader:
117    """JSONSchema loader from package resources"""
118
119    def __init__(self, package_name: str):
120        self.package_name = package_name
121
122    def get_schema(self, name: str) -> dict[str, Any]:
123        """
124        This method retrieves a JSON schema from the schemas/ folder.
125
126
127        The expected file structure is to have all top-level schemas (corresponding to streams) in the "schemas/" folder, with any shared $refs
128        living inside the "schemas/shared/" folder. For example:
129
130        schemas/shared/<shared_definition>.json
131        schemas/<name>.json # contains a $ref to shared_definition
132        schemas/<name2>.json # contains a $ref to shared_definition
133        """
134
135        schema_filename = f"schemas/{name}.json"
136        raw_file = pkgutil.get_data(self.package_name, schema_filename)
137        if not raw_file:
138            raise IOError(f"Cannot find file {schema_filename}")
139        try:
140            raw_schema = json.loads(raw_file)
141        except ValueError as err:
142            raise RuntimeError(f"Invalid JSON file format for file {schema_filename}") from err
143
144        return self._resolve_schema_references(raw_schema)
145
146    def _resolve_schema_references(self, raw_schema: dict[str, Any]) -> dict[str, Any]:
147        """
148        Resolve links to external references and move it to local "definitions" map.
149
150        :param raw_schema jsonschema to lookup for external links.
151        :return JSON serializable object with references without external dependencies.
152        """
153
154        package = importlib.import_module(self.package_name)
155        if package.__file__:
156            base = os.path.dirname(package.__file__) + "/"
157        else:
158            raise ValueError(f"Package {package} does not have a valid __file__ field")
159        resolved = jsonref.JsonRef.replace_refs(
160            raw_schema, loader=JsonFileLoader(base, "schemas/shared"), base_uri=base
161        )
162        resolved = resolve_ref_links(resolved)
163        if isinstance(resolved, dict):
164            return resolved
165        else:
166            raise ValueError(f"Expected resolved to be a dict. Got {resolved}")

JSONSchema loader from package resources

ResourceSchemaLoader(package_name: str)
119    def __init__(self, package_name: str):
120        self.package_name = package_name
package_name
def get_schema(self, name: str) -> dict[str, typing.Any]:
122    def get_schema(self, name: str) -> dict[str, Any]:
123        """
124        This method retrieves a JSON schema from the schemas/ folder.
125
126
127        The expected file structure is to have all top-level schemas (corresponding to streams) in the "schemas/" folder, with any shared $refs
128        living inside the "schemas/shared/" folder. For example:
129
130        schemas/shared/<shared_definition>.json
131        schemas/<name>.json # contains a $ref to shared_definition
132        schemas/<name2>.json # contains a $ref to shared_definition
133        """
134
135        schema_filename = f"schemas/{name}.json"
136        raw_file = pkgutil.get_data(self.package_name, schema_filename)
137        if not raw_file:
138            raise IOError(f"Cannot find file {schema_filename}")
139        try:
140            raw_schema = json.loads(raw_file)
141        except ValueError as err:
142            raise RuntimeError(f"Invalid JSON file format for file {schema_filename}") from err
143
144        return self._resolve_schema_references(raw_schema)

This method retrieves a JSON schema from the schemas/ folder.

The expected file structure is to have all top-level schemas (corresponding to streams) in the "schemas/" folder, with any shared $refs living inside the "schemas/shared/" folder. For example:

schemas/shared/<shared_definition>.json
schemas/<name>.json # contains a $ref to shared_definition
schemas/<name2>.json # contains a $ref to shared_definition
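For example, for a connector package named source_example that ships a schemas/users.json file (both names are hypothetical):

    loader = ResourceSchemaLoader("source_example")
    users_schema = loader.get_schema("users")  # loads and resolves source_example/schemas/users.json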

def check_config_against_spec_or_exit( config: Mapping[str, Any], spec: airbyte_protocol_dataclasses.models.airbyte_protocol.ConnectorSpecification) -> None:
169def check_config_against_spec_or_exit(
170    config: Mapping[str, Any], spec: ConnectorSpecification
171) -> None:
172    """
173    Check config object against spec. In case of spec is invalid, throws
174    an exception with validation error description.
175
176    :param config - config loaded from file specified over command line
177    :param spec - spec object generated by connector
178    """
179    spec_schema = spec.connectionSpecification
180    try:
181        validate(instance=config, schema=spec_schema)
182    except ValidationError as validation_error:
183        raise AirbyteTracedException(
184            message="Config validation error: " + validation_error.message,
185            internal_message=validation_error.message,
186            failure_type=FailureType.config_error,
187        ) from None  # required to prevent logging config secrets from the ValidationError's stacktrace

Check the config object against the spec. If the config does not conform to the spec, raises an exception describing the validation error.

Parameters
  • config: config loaded from the file specified on the command line
  • spec: spec object generated by the connector

def split_config( config: Mapping[str, Any]) -> Tuple[dict[str, Any], InternalConfig]:
212def split_config(config: Mapping[str, Any]) -> Tuple[dict[str, Any], InternalConfig]:
213    """
214    Break config map object into 2 instances: first is a dict with user defined
215    configuration and second is internal config that contains private keys for
216    acceptance test configuration.
217
218    :param
219     config - Dict object that has been loaded from config file.
220
221    :return tuple of user defined config dict with filtered out internal
222    parameters and connector acceptance test internal config object.
223    """
224    main_config = {}
225    internal_config = {}
226    for k, v in config.items():
227        if k in InternalConfig.KEYWORDS:
228            internal_config[k] = v
229        else:
230            main_config[k] = v
231    return main_config, InternalConfig.parse_obj(internal_config)

Break the config map into two objects: a dict with the user-defined configuration, and an internal config that contains private keys used for acceptance-test configuration.

Parameters
  • config: dict object that has been loaded from the config file
Returns

A tuple of the user-defined config dict (with internal parameters filtered out) and the connector acceptance-test internal config object.
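A small example of the split, assuming a config that mixes user settings with an acceptance-test keyword:

    config = {"api_key": "secret", "_limit": 100}
    main_config, internal_config = split_config(config)
    # main_config == {"api_key": "secret"}
    # internal_config.limit == 100
    internal_config.is_limit_reached(150)  # True, since 150 >= 100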

class TransformConfig(enum.Flag):
48class TransformConfig(Flag):
49    """
50    TypeTransformer class config. Configs can be combined using bitwise or operator e.g.
51        ```
52        TransformConfig.DefaultSchemaNormalization | TransformConfig.CustomSchemaNormalization
53        ```
54    """
55
56    # No action taken, default behavior. Cannot be combined with any other options.
57    NoTransform = auto()
58    # Applies default type casting with default_convert method which converts
59    # values by applying simple type casting to specified jsonschema type.
60    DefaultSchemaNormalization = auto()
61    # Allow registering custom type transformation callback. Can be combined
62    # with DefaultSchemaNormalization. In this case default type casting would
63    # be applied before custom one.
64    CustomSchemaNormalization = auto()

Configuration flags for the TypeTransformer class. Configs can be combined using the bitwise OR operator, e.g.

TransformConfig.DefaultSchemaNormalization | TransformConfig.CustomSchemaNormalization
NoTransform = <TransformConfig.NoTransform: 1>
DefaultSchemaNormalization = <TransformConfig.DefaultSchemaNormalization: 2>
CustomSchemaNormalization = <TransformConfig.CustomSchemaNormalization: 4>
class TypeTransformer:
 67class TypeTransformer:
 68    """
 69    Class for transforming object before output.
 70    """
 71
 72    _custom_normalizer: Optional[Callable[[Any, Dict[str, Any]], Any]] = None
 73
 74    def __init__(self, config: TransformConfig):
 75        """
 76        Initialize TypeTransformer instance.
 77        :param config Transform config that would be applied to object
 78        """
 79        if TransformConfig.NoTransform in config and config != TransformConfig.NoTransform:
 80            raise Exception("NoTransform option cannot be combined with other flags.")
 81        self._config = config
 82        all_validators = {
 83            key: self.__get_normalizer(key, orig_validator)
 84            for key, orig_validator in Draft7Validator.VALIDATORS.items()
 85            # Do not validate field we do not transform for maximum performance.
 86            if key in ["type", "array", "$ref", "properties", "items"]
 87        }
 88        self._normalizer = validators.create(
 89            meta_schema=Draft7Validator.META_SCHEMA, validators=all_validators
 90        )
 91
 92    def registerCustomTransform(
 93        self, normalization_callback: Callable[[Any, dict[str, Any]], Any]
 94    ) -> Callable[[Any, dict[str, Any]], Any]:
 95        """
 96        Register custom normalization callback.
 97        :param normalization_callback function to be used for value
 98        normalization. Takes original value and part type schema. Should return
 99        normalized value. See docs/connector-development/cdk-python/schemas.md
100        for details.
101        :return Same callback, this is useful for using registerCustomTransform function as decorator.
102        """
103        if TransformConfig.CustomSchemaNormalization not in self._config:
104            raise Exception(
105                "Please set TransformConfig.CustomSchemaNormalization config before registering custom normalizer"
106            )
107        self._custom_normalizer = normalization_callback
108        return normalization_callback
109
110    def __normalize(self, original_item: Any, subschema: Dict[str, Any]) -> Any:
111        """
112        Applies different transform function to object's field according to config.
113        :param original_item original value of field.
114        :param subschema part of the jsonschema containing field type/format data.
115        :return Final field value.
116        """
117        if TransformConfig.DefaultSchemaNormalization in self._config:
118            original_item = self.default_convert(original_item, subschema)
119
120        if self._custom_normalizer:
121            original_item = self._custom_normalizer(original_item, subschema)
122        return original_item
123
124    @staticmethod
125    def default_convert(original_item: Any, subschema: Dict[str, Any]) -> Any:
126        """
127        Default transform function that is used when TransformConfig.DefaultSchemaNormalization flag set.
128        :param original_item original value of field.
129        :param subschema part of the jsonschema containing field type/format data.
130        :return transformed field value.
131        """
132        target_type = subschema.get("type", [])
133        if original_item is None and "null" in target_type:
134            return None
135        if isinstance(target_type, list):
136            # jsonschema type could either be a single string or array of type
137            # strings. In case if there is some disambigous and more than one
138            # type (except null) do not do any conversion and return original
139            # value. If type array has one type and null i.e. {"type":
140            # ["integer", "null"]}, convert value to specified type.
141            target_type = [t for t in target_type if t != "null"]
142            if len(target_type) != 1:
143                return original_item
144            target_type = target_type[0]
145        try:
146            if target_type == "string":
147                return str(original_item)
148            elif target_type == "number":
149                return float(original_item)
150            elif target_type == "integer":
151                return int(original_item)
152            elif target_type == "boolean":
153                if isinstance(original_item, str):
154                    return _strtobool(original_item) == 1
155                return bool(original_item)
156            elif target_type == "array":
157                item_types = set(subschema.get("items", {}).get("type", set()))
158                if (
159                    item_types.issubset(json_to_python_simple)
160                    and type(original_item) in json_to_python_simple.values()
161                ):
162                    return [original_item]
163        except (ValueError, TypeError):
164            return original_item
165        return original_item
166
167    def __get_normalizer(
168        self,
169        schema_key: str,
170        original_validator: Callable,  # type: ignore[type-arg]
171    ) -> Callable[[Any, Any, Any, dict[str, Any]], Generator[Any, Any, None]]:
172        """
173        Traverse through object fields using native jsonschema validator and apply normalization function.
174        :param schema_key related json schema key that currently being validated/normalized.
175        :original_validator: native jsonschema validator callback.
176        """
177
178        def normalizator(
179            validator_instance: Validator,
180            property_value: Any,
181            instance: Any,
182            schema: Dict[str, Any],
183        ) -> Generator[Any, Any, None]:
184            """
185            Jsonschema validator callable it uses for validating instance. We
186            override default Draft7Validator to perform value transformation
187            before validation take place. We do not take any action except
188            logging warn if object does not conform to json schema, just using
189            jsonschema algorithm to traverse through object fields.
190            Look
191            https://python-jsonschema.readthedocs.io/en/stable/creating/?highlight=validators.create#jsonschema.validators.create
192            validators parameter for detailed description.
193            :
194            """
195
196            def resolve(subschema: dict[str, Any]) -> dict[str, Any]:
197                if "$ref" in subschema:
198                    _, resolved = cast(
199                        RefResolver,
200                        validator_instance.resolver,
201                    ).resolve(subschema["$ref"])
202                    return cast(dict[str, Any], resolved)
203                return subschema
204
205            # Transform object and array values before running json schema type checking for each element.
206            # Recursively normalize every value of the "instance" sub-object,
207            # if "instance" is an incorrect type - skip recursive normalization of "instance"
208            if schema_key == "properties" and isinstance(instance, dict):
209                for k, subschema in property_value.items():
210                    if k in instance:
211                        subschema = resolve(subschema)
212                        instance[k] = self.__normalize(instance[k], subschema)
213            # Recursively normalize every item of the "instance" sub-array,
214            # if "instance" is an incorrect type - skip recursive normalization of "instance"
215            elif schema_key == "items" and isinstance(instance, list):
216                subschema = resolve(property_value)
217                for index, item in enumerate(instance):
218                    instance[index] = self.__normalize(item, subschema)
219
220            # Running native jsonschema traverse algorithm after field normalization is done.
221            yield from original_validator(
222                validator_instance,
223                property_value,
224                instance,
225                schema,
226            )
227
228        return normalizator
229
230    def transform(
231        self,
232        record: Dict[str, Any],
233        schema: Mapping[str, Any],
234    ) -> None:
235        """
236        Normalize and validate according to config.
237        :param record: record instance for normalization/transformation. All modification are done by modifying existent object.
238        :param schema: object's jsonschema for normalization.
239        """
240        if TransformConfig.NoTransform in self._config:
241            return
242        normalizer = self._normalizer(schema)
243        for e in normalizer.iter_errors(record):
244            """
245            just calling normalizer.validate() would throw an exception on
246            first validation occurrences and stop processing rest of schema.
247            """
248            logger.warning(self.get_error_message(e))
249
250    def get_error_message(self, e: ValidationError) -> str:
251        """
252        Construct a sanitized error message from a ValidationError instance.
253        """
254        field_path = ".".join(map(str, e.path))
255        type_structure = self._get_type_structure(e.instance)
256
257        return f"Failed to transform value from type '{type_structure}' to type '{e.validator_value}' at path: '{field_path}'"
258
259    def _get_type_structure(self, input_data: Any, current_depth: int = 0) -> Any:
260        """
261        Get the structure of a given input data for use in error message construction.
262        """
263        # Handle null values
264        if input_data is None:
265            return "null"
266
267        # Avoid recursing too deep
268        if current_depth >= MAX_NESTING_DEPTH:
269            return "object" if isinstance(input_data, dict) else python_to_json[type(input_data)]
270
271        if isinstance(input_data, dict):
272            return {
273                key: self._get_type_structure(field_value, current_depth + 1)
274                for key, field_value in input_data.items()
275            }
276
277        else:
278            return python_to_json[type(input_data)]

Class for transforming object before output.

TypeTransformer(config: TransformConfig)
74    def __init__(self, config: TransformConfig):
75        """
76        Initialize TypeTransformer instance.
77        :param config Transform config that would be applied to object
78        """
79        if TransformConfig.NoTransform in config and config != TransformConfig.NoTransform:
80            raise Exception("NoTransform option cannot be combined with other flags.")
81        self._config = config
82        all_validators = {
83            key: self.__get_normalizer(key, orig_validator)
84            for key, orig_validator in Draft7Validator.VALIDATORS.items()
85            # Do not validate field we do not transform for maximum performance.
86            if key in ["type", "array", "$ref", "properties", "items"]
87        }
88        self._normalizer = validators.create(
89            meta_schema=Draft7Validator.META_SCHEMA, validators=all_validators
90        )

Initialize a TypeTransformer instance.

Parameters
  • config: transform config that will be applied to the object

def registerCustomTransform( self, normalization_callback: Callable[[Any, dict[str, Any]], Any]) -> Callable[[Any, dict[str, Any]], Any]:
 92    def registerCustomTransform(
 93        self, normalization_callback: Callable[[Any, dict[str, Any]], Any]
 94    ) -> Callable[[Any, dict[str, Any]], Any]:
 95        """
 96        Register custom normalization callback.
 97        :param normalization_callback function to be used for value
 98        normalization. Takes original value and part type schema. Should return
 99        normalized value. See docs/connector-development/cdk-python/schemas.md
100        for details.
101        :return Same callback, this is useful for using registerCustomTransform function as decorator.
102        """
103        if TransformConfig.CustomSchemaNormalization not in self._config:
104            raise Exception(
105                "Please set TransformConfig.CustomSchemaNormalization config before registering custom normalizer"
106            )
107        self._custom_normalizer = normalization_callback
108        return normalization_callback

Register a custom normalization callback.

Parameters
  • normalization_callback: function used for value normalization. It receives the original value and the field's type schema and should return the normalized value. See docs/connector-development/cdk-python/schemas.md for details.
Returns

The same callback, which makes registerCustomTransform usable as a decorator.
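A sketch of registering a custom transform as a decorator; the date-time handling is illustrative:

    transformer = TypeTransformer(
        TransformConfig.DefaultSchemaNormalization | TransformConfig.CustomSchemaNormalization
    )

    @transformer.registerCustomTransform
    def transform_value(original_value, field_schema):
        # Called per field after default normalization; field_schema is that field's sub-schema.
        if field_schema.get("format") == "date-time":
            return str(original_value).replace(" ", "T")
        return original_value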

@staticmethod
def default_convert(original_item: Any, subschema: Dict[str, Any]) -> Any:
124    @staticmethod
125    def default_convert(original_item: Any, subschema: Dict[str, Any]) -> Any:
126        """
127        Default transform function that is used when TransformConfig.DefaultSchemaNormalization flag set.
128        :param original_item original value of field.
129        :param subschema part of the jsonschema containing field type/format data.
130        :return transformed field value.
131        """
132        target_type = subschema.get("type", [])
133        if original_item is None and "null" in target_type:
134            return None
135        if isinstance(target_type, list):
136            # jsonschema type could either be a single string or array of type
137            # strings. In case if there is some disambigous and more than one
138            # type (except null) do not do any conversion and return original
139            # value. If type array has one type and null i.e. {"type":
140            # ["integer", "null"]}, convert value to specified type.
141            target_type = [t for t in target_type if t != "null"]
142            if len(target_type) != 1:
143                return original_item
144            target_type = target_type[0]
145        try:
146            if target_type == "string":
147                return str(original_item)
148            elif target_type == "number":
149                return float(original_item)
150            elif target_type == "integer":
151                return int(original_item)
152            elif target_type == "boolean":
153                if isinstance(original_item, str):
154                    return _strtobool(original_item) == 1
155                return bool(original_item)
156            elif target_type == "array":
157                item_types = set(subschema.get("items", {}).get("type", set()))
158                if (
159                    item_types.issubset(json_to_python_simple)
160                    and type(original_item) in json_to_python_simple.values()
161                ):
162                    return [original_item]
163        except (ValueError, TypeError):
164            return original_item
165        return original_item

Default transform function used when the TransformConfig.DefaultSchemaNormalization flag is set.

Parameters
  • original_item: original value of the field.
  • subschema: part of the jsonschema containing the field's type/format data.
Returns

The transformed field value.

def transform(self, record: Dict[str, Any], schema: Mapping[str, Any]) -> None:
230    def transform(
231        self,
232        record: Dict[str, Any],
233        schema: Mapping[str, Any],
234    ) -> None:
235        """
236        Normalize and validate according to config.
237        :param record: record instance for normalization/transformation. All modification are done by modifying existent object.
238        :param schema: object's jsonschema for normalization.
239        """
240        if TransformConfig.NoTransform in self._config:
241            return
242        normalizer = self._normalizer(schema)
243        for e in normalizer.iter_errors(record):
244            """
245            just calling normalizer.validate() would throw an exception on
246            first validation occurrences and stop processing rest of schema.
247            """
248            logger.warning(self.get_error_message(e))

Normalize and validate according to config.

Parameters
  • record: record instance for normalization/transformation. All modification are done by modifying existent object.
  • schema: object's jsonschema for normalization.
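For example, with default normalization enabled the record below is coerced in place to match the schema:

    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
    record = {"value": "123"}
    schema = {"type": "object", "properties": {"value": {"type": "integer"}}}
    transformer.transform(record, schema)
    assert record == {"value": 123}
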
def get_error_message(self, e: jsonschema.exceptions.ValidationError) -> str:
250    def get_error_message(self, e: ValidationError) -> str:
251        """
252        Construct a sanitized error message from a ValidationError instance.
253        """
254        field_path = ".".join(map(str, e.path))
255        type_structure = self._get_type_structure(e.instance)
256
257        return f"Failed to transform value from type '{type_structure}' to type '{e.validator_value}' at path: '{field_path}'"

Construct a sanitized error message from a ValidationError instance.

ENV_REQUEST_CACHE_PATH = 'REQUEST_CACHE_PATH'
@contextmanager
def create_timer( name: str) -> Generator[airbyte_cdk.utils.event_timing.EventTimer, Any, NoneType]:
80@contextmanager
81def create_timer(name: str) -> Generator[EventTimer, Any, None]:
82    """
83    Creates a new EventTimer as a context manager to improve code readability.
84    """
85    a_timer = EventTimer(name)
86    yield a_timer

Creates a new EventTimer as a context manager to improve code readability.

class OneOfOptionConfig:
 9class OneOfOptionConfig:
10    """
11    Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.
12
13    Inherit from this class in the nested Config class in a model and set title and description (these show up in the UI) and discriminator (this is making sure it's marked as required in the schema).
14
15    Usage:
16
17        ```python
18        class OptionModel(BaseModel):
19            mode: Literal["option_a"] = Field("option_a", const=True)
20            option_a_field: str = Field(...)
21
22            class Config(OneOfOptionConfig):
23                title = "Option A"
24                description = "Option A description"
25                discriminator = "mode"
26        ```
27    """
28
29    @staticmethod
30    def schema_extra(schema: Dict[str, Any], model: Any) -> None:
31        if hasattr(model.Config, "description"):
32            schema["description"] = model.Config.description
33        if hasattr(model.Config, "discriminator"):
34            schema.setdefault("required", []).append(model.Config.discriminator)

Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.

Inherit from this class in the nested Config class in a model and set title and description (these show up in the UI) and discriminator (this is making sure it's marked as required in the schema).

Usage:
class OptionModel(BaseModel):
    mode: Literal["option_a"] = Field("option_a", const=True)
    option_a_field: str = Field(...)

    class Config(OneOfOptionConfig):
        title = "Option A"
        description = "Option A description"
        discriminator = "mode"
@staticmethod
def schema_extra(schema: Dict[str, Any], model: Any) -> None:
29    @staticmethod
30    def schema_extra(schema: Dict[str, Any], model: Any) -> None:
31        if hasattr(model.Config, "description"):
32            schema["description"] = model.Config.description
33        if hasattr(model.Config, "discriminator"):
34            schema.setdefault("required", []).append(model.Config.discriminator)
def resolve_refs(schema: dict[str, typing.Any]) -> dict[str, typing.Any]:
13def resolve_refs(schema: dict[str, Any]) -> dict[str, Any]:
14    """
15    For spec schemas generated using Pydantic models, the resulting JSON schema can contain refs between object
16    relationships.
17    """
18    json_schema_ref_resolver = RefResolver.from_schema(schema)
19    str_schema = json.dumps(schema)
20    for ref_block in re.findall(r'{"\$ref": "#\/definitions\/.+?(?="})"}', str_schema):
21        ref = json.loads(ref_block)["$ref"]
22        str_schema = str_schema.replace(
23            ref_block, json.dumps(json_schema_ref_resolver.resolve(ref)[1])
24        )
25    pyschema: dict[str, Any] = json.loads(str_schema)
26    del pyschema["definitions"]
27    return pyschema

For spec schemas generated using Pydantic models, the resulting JSON schema can contain $ref links between object relationships. This helper inlines those references and removes the top-level "definitions" section, returning a self-contained schema.
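
A minimal sketch, assuming Pydantic v1-style schema generation (which places shared models under "#/definitions/..."); the models are invented for illustration and the import path for `resolve_refs` may differ across CDK versions.

```python
# Sketch: models are illustrative; import path may vary by CDK version.
from pydantic import BaseModel  # Pydantic v1-style .schema()

from airbyte_cdk.utils.spec_schema_transformations import resolve_refs  # assumed path


class Credentials(BaseModel):
    api_key: str


class ConnectorSpec(BaseModel):
    credentials: Credentials


raw_schema = ConnectorSpec.schema()      # contains {"$ref": "#/definitions/Credentials"}
flat_schema = resolve_refs(raw_schema)   # refs inlined, "definitions" removed

assert "definitions" not in flat_schema
# The nested Credentials schema is now inlined under properties -> credentials.
```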

def as_airbyte_message(stream: Union[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteStream, airbyte_protocol_dataclasses.models.airbyte_protocol.StreamDescriptor], current_status: airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteStreamStatus, reasons: Optional[List[airbyte_protocol_dataclasses.models.airbyte_protocol.AirbyteStreamStatusReason]] = None) -> AirbyteMessage:
23def as_airbyte_message(
24    stream: Union[AirbyteStream, StreamDescriptor],
25    current_status: AirbyteStreamStatus,
26    reasons: Optional[List[AirbyteStreamStatusReason]] = None,
27) -> AirbyteMessage:
28    """
29    Builds an AirbyteStreamStatusTraceMessage for the provided stream
30    """
31
32    now_millis = datetime.now().timestamp() * 1000.0
33
34    trace_message = AirbyteTraceMessage(
35        type=TraceType.STREAM_STATUS,
36        emitted_at=now_millis,
37        stream_status=AirbyteStreamStatusTraceMessage(
38            stream_descriptor=StreamDescriptor(name=stream.name, namespace=stream.namespace),
39            status=current_status,
40            reasons=reasons,
41        ),
42    )
43
44    return AirbyteMessage(type=MessageType.TRACE, trace=trace_message)

Builds an AirbyteStreamStatusTraceMessage for the provided stream, wrapped in an AirbyteMessage of type TRACE.
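
A hedged sketch of emitting a STARTED status for a stream; the stream definition is illustrative, and the import path for `as_airbyte_message` (commonly `airbyte_cdk.utils.stream_status_utils`) is an assumption.

```python
# Sketch: stream definition is illustrative; import paths may vary by CDK version.
from airbyte_cdk.models import AirbyteStream, AirbyteStreamStatus, SyncMode
from airbyte_cdk.utils.stream_status_utils import as_airbyte_message  # assumed path

stream = AirbyteStream(
    name="users",
    json_schema={"type": "object"},
    supported_sync_modes=[SyncMode.full_refresh],
)

message = as_airbyte_message(stream, AirbyteStreamStatus.STARTED)
print(message.type)                        # Type.TRACE
print(message.trace.stream_status.status)  # AirbyteStreamStatus.STARTED
```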

Config = typing.Mapping[str, typing.Any]
class Record(typing.Mapping[str, typing.Any]):
21class Record(Mapping[str, Any]):
22    def __init__(
23        self,
24        data: Mapping[str, Any],
25        stream_name: str,
26        associated_slice: Optional[StreamSlice] = None,
27        is_file_transfer_message: bool = False,
28    ):
29        self._data = data
30        self._associated_slice = associated_slice
31        self.stream_name = stream_name
32        self.is_file_transfer_message = is_file_transfer_message
33
34    @property
35    def data(self) -> Mapping[str, Any]:
36        return self._data
37
38    @property
39    def associated_slice(self) -> Optional[StreamSlice]:
40        return self._associated_slice
41
42    def __repr__(self) -> str:
43        return repr(self._data)
44
45    def __getitem__(self, key: str) -> Any:
46        return self._data[key]
47
48    def __len__(self) -> int:
49        return len(self._data)
50
51    def __iter__(self) -> Any:
52        return iter(self._data)
53
54    def __contains__(self, item: object) -> bool:
55        return item in self._data
56
57    def __eq__(self, other: object) -> bool:
58        if isinstance(other, Record):
59            # noinspection PyProtectedMember
60            return self._data == other._data
61        return False
62
63    def __ne__(self, other: object) -> bool:
64        return not self.__eq__(other)

Record is a read-only Mapping over a single record's data: key lookups, iteration, length, and membership checks delegate to the wrapped data mapping, while stream_name, associated_slice, and is_file_transfer_message carry metadata about where the record came from. A usage sketch follows the property listings below.

Record(data: Mapping[str, Any], stream_name: str, associated_slice: Optional[StreamSlice] = None, is_file_transfer_message: bool = False)
22    def __init__(
23        self,
24        data: Mapping[str, Any],
25        stream_name: str,
26        associated_slice: Optional[StreamSlice] = None,
27        is_file_transfer_message: bool = False,
28    ):
29        self._data = data
30        self._associated_slice = associated_slice
31        self.stream_name = stream_name
32        self.is_file_transfer_message = is_file_transfer_message
stream_name
is_file_transfer_message
data: Mapping[str, Any]
34    @property
35    def data(self) -> Mapping[str, Any]:
36        return self._data
associated_slice: Optional[StreamSlice]
38    @property
39    def associated_slice(self) -> Optional[StreamSlice]:
40        return self._associated_slice
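
A minimal sketch of Record's mapping behavior; the data and stream name are invented, and the import path (commonly `airbyte_cdk.sources.types`) is an assumption that may vary by CDK version.

```python
# Sketch: sample data is illustrative; import path may vary by CDK version.
from airbyte_cdk.sources.types import Record  # assumed path

record = Record(data={"id": 1, "name": "Ada"}, stream_name="users")

assert record["id"] == 1                          # __getitem__ delegates to data
assert "name" in record                           # __contains__ delegates to data
assert len(record) == 2                           # __len__ delegates to data
assert dict(record) == {"id": 1, "name": "Ada"}   # iteration yields the data keys

print(record.stream_name)               # "users"
print(record.associated_slice)          # None (no slice was attached)
print(record.is_file_transfer_message)  # False by default
```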