# airbyte_cdk

Welcome to the Airbyte Python CDK!

The Airbyte Python CDK is a Python library that provides a set of tools to help you build connectors for the Airbyte platform.
## Building Source Connectors

To build a source connector, you will want to refer to the following classes and modules (a minimal sketch follows the list):

- `airbyte_cdk.sources`
- `airbyte_cdk.sources.concurrent_source`
- `airbyte_cdk.sources.config`
- `airbyte_cdk.sources.file_based`
- `airbyte_cdk.sources.streams`
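For orientation, here is a minimal, hedged sketch of a source built on these modules. The endpoint, stream name, and single-page pagination are illustrative assumptions, not part of the CDK; a real connector also ships a spec and JSON schema files.

```python
# Illustrative only: "ExamplePostsStream" and its endpoint are hypothetical.
import logging
from typing import Any, Iterable, List, Mapping, Optional, Tuple

import requests

from airbyte_cdk import AbstractSource, HttpStream, Stream


class ExamplePostsStream(HttpStream):
    url_base = "https://api.example.com/"  # hypothetical API
    primary_key = "id"

    def path(self, **kwargs: Any) -> str:
        return "posts"

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        return None  # single page, for brevity

    def parse_response(self, response: requests.Response, **kwargs: Any) -> Iterable[Mapping[str, Any]]:
        yield from response.json()


class ExampleSource(AbstractSource):
    def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Any]:
        return True, None  # a real connector would probe the API here

    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
        return [ExamplePostsStream()]
```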
## Building Destination Connectors

To build a destination connector, you will want to refer to the following classes and modules (a minimal sketch follows the list):

- `airbyte_cdk.destinations`
- `airbyte_cdk.destinations.Destination`
- `airbyte_cdk.destinations.vector_db_based`
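Similarly, a minimal, hedged sketch of a destination. The in-memory buffer is purely illustrative; a real destination persists records to external storage and yields each state message only after everything before it has been flushed.

```python
import logging
from typing import Any, Iterable, Mapping

from airbyte_cdk import AirbyteConnectionStatus, AirbyteMessage, ConfiguredAirbyteCatalog, Status, Type
from airbyte_cdk.destinations import Destination


class ExampleDestination(Destination):
    def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus:
        return AirbyteConnectionStatus(status=Status.SUCCEEDED)

    def write(
        self,
        config: Mapping[str, Any],
        configured_catalog: ConfiguredAirbyteCatalog,
        input_messages: Iterable[AirbyteMessage],
    ) -> Iterable[AirbyteMessage]:
        buffer = []
        for message in input_messages:
            if message.type == Type.RECORD:
                buffer.append(message.record.data)  # a real destination would persist this
            elif message.type == Type.STATE:
                # Echoing the state message acknowledges that all records before it were written.
                yield message
```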
## Working with Airbyte Protocol Models

The Airbyte CDK provides a set of classes that help you work with the Airbyte protocol models (a short example follows the list):

- `airbyte_cdk.models.airbyte_protocol`
- `airbyte_cdk.models.airbyte_protocol_serializers`
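For example, a record message can be built directly from these models (the stream name and payload below are made up); `airbyte_cdk.models.airbyte_protocol_serializers` provides helpers for (de)serializing such messages.

```python
import time

from airbyte_cdk.models import AirbyteMessage, AirbyteRecordMessage, Type

# A RECORD message wrapping one row of data; emitted_at is epoch milliseconds.
message = AirbyteMessage(
    type=Type.RECORD,
    record=AirbyteRecordMessage(
        stream="users",                    # hypothetical stream name
        data={"id": 1, "name": "Alice"},   # hypothetical record payload
        emitted_at=int(time.time() * 1000),
    ),
)
print(message.record.stream, message.record.data)
```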
## API Reference
1# Copyright (c) 2021 Airbyte, Inc., all rights reserved. 2""" 3# Welcome to the Airbyte Python CDK! 4 5The Airbyte Python CDK is a Python library that provides a set of tools to help you build 6connectors for the Airbyte platform. 7 8## Building Source Connectors 9 10To build a source connector, you will want to refer to 11the following classes and modules: 12 13- `airbyte_cdk.sources` 14- `airbyte_cdk.sources.concurrent_source` 15- `airbyte_cdk.sources.config` 16- `airbyte_cdk.sources.file_based` 17- `airbyte_cdk.sources.streams` 18 19## Building Destination Connectors 20 21To build a destination connector, you will want to refer to 22the following classes and modules: 23 24- `airbyte_cdk.destinations` 25- `airbyte_cdk.destinations.Destination` 26- `airbyte_cdk.destinations.vector_db_based` 27 28## Working with Airbyte Protocol Models 29 30The Airbyte CDK provides a set of classes that help you work with the Airbyte protocol models: 31 32- `airbyte_cdk.models.airbyte_protocol` 33- `airbyte_cdk.models.airbyte_protocol_serializers` 34 35--- 36 37API Reference 38 39--- 40 41""" 42 43# Warning: The below imports are not stable and will cause circular 44# dependencies if auto-sorted with isort. Please keep them in the same order. 45# TODO: Submodules should import from lower-level modules, rather than importing from here. 46# Imports should also be placed in `if TYPE_CHECKING` blocks if they are only used as type 47# hints - again, to avoid circular dependencies. 48# Once those issues are resolved, the below can be sorted with isort. 49import dunamai as _dunamai 50 51from .config_observation import ( 52 create_connector_config_control_message, 53 emit_configuration_as_airbyte_control_message, 54) 55from .connector import BaseConnector, Connector 56from .destinations import Destination 57from .entrypoint import AirbyteEntrypoint, launch 58from .logger import AirbyteLogFormatter, init_logger 59from .models import ( 60 AdvancedAuth, 61 AirbyteConnectionStatus, 62 AirbyteLogMessage, 63 AirbyteMessage, 64 AirbyteRecordMessage, 65 AirbyteStream, 66 ConfiguredAirbyteCatalog, 67 ConfiguredAirbyteStream, 68 ConnectorSpecification, 69 DestinationSyncMode, 70 FailureType, 71 Level, 72 OAuthConfigSpecification, 73 OrchestratorType, 74 Status, 75 SyncMode, 76 Type, 77) 78from .sources import AbstractSource, Source 79from .sources.concurrent_source.concurrent_source import ConcurrentSource 80from .sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter 81from .sources.config import BaseConfig 82from .sources.connector_state_manager import ConnectorStateManager 83from .sources.declarative.auth import DeclarativeOauth2Authenticator 84from .sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator, NoAuth 85from .sources.declarative.auth.oauth import DeclarativeSingleUseRefreshTokenOauth2Authenticator 86from .sources.declarative.auth.token import ( 87 ApiKeyAuthenticator, 88 BasicHttpAuthenticator, 89 BearerAuthenticator, 90) 91from .sources.declarative.datetime.min_max_datetime import MinMaxDatetime 92from .sources.declarative.declarative_stream import DeclarativeStream 93from .sources.declarative.decoders import Decoder, JsonDecoder 94from .sources.declarative.exceptions import ReadException 95from .sources.declarative.extractors import DpathExtractor, RecordSelector 96from .sources.declarative.extractors.record_extractor import RecordExtractor 97from .sources.declarative.extractors.record_filter import RecordFilter 98from .sources.declarative.incremental 
import DatetimeBasedCursor 99from .sources.declarative.interpolation import InterpolatedBoolean, InterpolatedString 100from .sources.declarative.manifest_declarative_source import ManifestDeclarativeSource 101from .sources.declarative.migrations.legacy_to_per_partition_state_migration import ( 102 LegacyToPerPartitionStateMigration, 103) 104from .sources.declarative.partition_routers import ( 105 CartesianProductStreamSlicer, 106 SinglePartitionRouter, 107 SubstreamPartitionRouter, 108) 109from .sources.declarative.partition_routers.substream_partition_router import ParentStreamConfig 110from .sources.declarative.requesters import HttpRequester, Requester 111from .sources.declarative.requesters.error_handlers import BackoffStrategy 112from .sources.declarative.requesters.paginators import DefaultPaginator, PaginationStrategy 113from .sources.declarative.requesters.paginators.strategies import ( 114 CursorPaginationStrategy, 115 OffsetIncrement, 116 PageIncrement, 117 StopConditionPaginationStrategyDecorator, 118) 119from .sources.declarative.requesters.request_option import RequestOption, RequestOptionType 120from .sources.declarative.requesters.request_options.default_request_options_provider import ( 121 DefaultRequestOptionsProvider, 122) 123from .sources.declarative.requesters.request_options.interpolated_request_input_provider import ( 124 InterpolatedRequestInputProvider, 125) 126from .sources.declarative.requesters.requester import HttpMethod 127from .sources.declarative.retrievers import SimpleRetriever 128from .sources.declarative.schema import JsonFileSchemaLoader 129from .sources.declarative.transformations.add_fields import AddedFieldDefinition, AddFields 130from .sources.declarative.transformations.transformation import RecordTransformation 131from .sources.declarative.types import FieldPointer 132from .sources.declarative.yaml_declarative_source import YamlDeclarativeSource 133from .sources.message import InMemoryMessageRepository, MessageRepository 134from .sources.source import TState 135from .sources.streams.availability_strategy import AvailabilityStrategy 136from .sources.streams.call_rate import ( 137 AbstractAPIBudget, 138 CachedLimiterSession, 139 HttpAPIBudget, 140 HttpRequestMatcher, 141 LimiterSession, 142 MovingWindowCallRatePolicy, 143 Rate, 144) 145from .sources.streams.checkpoint import Cursor as LegacyCursor 146from .sources.streams.checkpoint import ResumableFullRefreshCursor 147from .sources.streams.concurrent.adapters import StreamFacade 148from .sources.streams.concurrent.cursor import ( 149 ConcurrentCursor, 150 Cursor, 151 CursorField, 152 FinalStateCursor, 153) 154from .sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( 155 EpochValueConcurrentStreamStateConverter, 156 IsoMillisConcurrentStreamStateConverter, 157) 158from .sources.streams.core import IncrementalMixin, Stream, package_name_from_class 159from .sources.streams.http import HttpStream, HttpSubStream 160from .sources.streams.http.availability_strategy import HttpAvailabilityStrategy 161from .sources.streams.http.exceptions import ( 162 BaseBackoffException, 163 DefaultBackoffException, 164 UserDefinedBackoffException, 165) 166from .sources.streams.http.rate_limiting import default_backoff_handler 167from .sources.streams.http.requests_native_auth import ( 168 Oauth2Authenticator, 169 SingleUseRefreshTokenOauth2Authenticator, 170 TokenAuthenticator, 171) 172from .sources.streams.http.requests_native_auth.abstract_token import AbstractHeaderAuthenticator 
173from .sources.types import Config, Record, StreamSlice 174from .sources.utils import casing 175from .sources.utils.schema_helpers import ( 176 InternalConfig, 177 ResourceSchemaLoader, 178 check_config_against_spec_or_exit, 179 expand_refs, 180 split_config, 181) 182from .sources.utils.transform import TransformConfig, TypeTransformer 183from .utils import AirbyteTracedException, is_cloud_environment 184from .utils.constants import ENV_REQUEST_CACHE_PATH 185from .utils.event_timing import create_timer 186from .utils.oneof_option_config import OneOfOptionConfig 187from .utils.spec_schema_transformations import resolve_refs 188from .utils.stream_status_utils import as_airbyte_message 189 190__all__ = [ 191 # Availability strategy 192 "AvailabilityStrategy", 193 "HttpAvailabilityStrategy", 194 # Checkpoint 195 "LegacyCursor", 196 "ResumableFullRefreshCursor", 197 # Concurrent 198 "ConcurrentCursor", 199 "ConcurrentSource", 200 "ConcurrentSourceAdapter", 201 "Cursor", 202 "CursorField", 203 "DEFAULT_CONCURRENCY", 204 "EpochValueConcurrentStreamStateConverter", 205 "FinalStateCursor", 206 "IsoMillisConcurrentStreamStateConverter", 207 "StreamFacade", 208 # Config observation 209 "create_connector_config_control_message", 210 "emit_configuration_as_airbyte_control_message", 211 # Connector 212 "AbstractSource", 213 "BaseConfig", 214 "BaseConnector", 215 "Connector", 216 "Destination", 217 "Source", 218 "TState", 219 # Declarative 220 "AddFields", 221 "AddedFieldDefinition", 222 "ApiKeyAuthenticator", 223 "BackoffStrategy", 224 "BasicHttpAuthenticator", 225 "BearerAuthenticator", 226 "CartesianProductStreamSlicer", 227 "CursorPaginationStrategy", 228 "DatetimeBasedCursor", 229 "DeclarativeAuthenticator", 230 "DeclarativeOauth2Authenticator", 231 "DeclarativeSingleUseRefreshTokenOauth2Authenticator", 232 "DeclarativeStream", 233 "Decoder", 234 "DefaultPaginator", 235 "DefaultRequestOptionsProvider", 236 "DpathExtractor", 237 "FieldPointer", 238 "HttpMethod", 239 "HttpRequester", 240 "InterpolatedBoolean", 241 "InterpolatedRequestInputProvider", 242 "InterpolatedString", 243 "JsonDecoder", 244 "JsonFileSchemaLoader", 245 "LegacyToPerPartitionStateMigration", 246 "ManifestDeclarativeSource", 247 "MinMaxDatetime", 248 "NoAuth", 249 "OffsetIncrement", 250 "PageIncrement", 251 "PaginationStrategy", 252 "ParentStreamConfig", 253 "ReadException", 254 "RecordExtractor", 255 "RecordFilter", 256 "RecordSelector", 257 "RecordTransformation", 258 "RequestOption", 259 "RequestOptionType", 260 "Requester", 261 "ResponseStatus", 262 "SimpleRetriever", 263 "SinglePartitionRouter", 264 "StopConditionPaginationStrategyDecorator", 265 "StreamSlice", 266 "SubstreamPartitionRouter", 267 "YamlDeclarativeSource", 268 # Entrypoint 269 "launch", 270 "AirbyteEntrypoint", 271 # HTTP 272 "AbstractAPIBudget", 273 "AbstractHeaderAuthenticator", 274 "BaseBackoffException", 275 "CachedLimiterSession", 276 "DefaultBackoffException", 277 "default_backoff_handler", 278 "HttpAPIBudget", 279 "HttpAuthenticator", 280 "HttpRequestMatcher", 281 "HttpStream", 282 "HttpSubStream", 283 "LimiterSession", 284 "MovingWindowCallRatePolicy", 285 "MultipleTokenAuthenticator", 286 "Oauth2Authenticator", 287 "Rate", 288 "SingleUseRefreshTokenOauth2Authenticator", 289 "TokenAuthenticator", 290 "UserDefinedBackoffException", 291 # Logger 292 "AirbyteLogFormatter", 293 "init_logger", 294 # Protocol classes 295 "AirbyteStream", 296 "AirbyteConnectionStatus", 297 "AirbyteMessage", 298 "ConfiguredAirbyteCatalog", 299 "Status", 300 "Type", 301 
"OrchestratorType", 302 "ConfiguredAirbyteStream", 303 "DestinationSyncMode", 304 "SyncMode", 305 "FailureType", 306 "AdvancedAuth", 307 "AirbyteLogMessage", 308 "OAuthConfigSpecification", 309 "ConnectorSpecification", 310 "Level", 311 "AirbyteRecordMessage", 312 # Repository 313 "InMemoryMessageRepository", 314 "MessageRepository", 315 # State management 316 "ConnectorStateManager", 317 # Stream 318 "IncrementalMixin", 319 "Stream", 320 "StreamData", 321 "package_name_from_class", 322 # Utils 323 "AirbyteTracedException", 324 "is_cloud_environment", 325 "casing", 326 "InternalConfig", 327 "ResourceSchemaLoader", 328 "check_config_against_spec_or_exit", 329 "split_config", 330 "TransformConfig", 331 "TypeTransformer", 332 "ENV_REQUEST_CACHE_PATH", 333 "create_timer", 334 "OneOfOptionConfig", 335 "resolve_refs", 336 "as_airbyte_message", 337 # Types 338 "Config", 339 "Record", 340 "Source", 341 "StreamSlice", 342] 343 344__version__: str 345"""Version generated by poetry dynamic versioning during publish. 346 347When running in development, dunamai will calculate a new prerelease version 348from existing git release tag info. 349""" 350 351try: 352 __version__ = _dunamai.get_version( 353 "airbyte-cdk", 354 third_choice=_dunamai.Version.from_any_vcs, 355 fallback=_dunamai.Version("0.0.0+dev"), 356 ).serialize() 357except: 358 __version__ = "0.0.0+dev"
18class AvailabilityStrategy(ABC): 19 """ 20 Abstract base class for checking stream availability. 21 """ 22 23 @abstractmethod 24 def check_availability( 25 self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None 26 ) -> Tuple[bool, Optional[str]]: 27 """ 28 Checks stream availability. 29 30 :param stream: stream 31 :param logger: source logger 32 :param source: (optional) source 33 :return: A tuple of (boolean, str). If boolean is true, then the stream 34 is available, and no str is required. Otherwise, the stream is unavailable 35 for some reason and the str should describe what went wrong and how to 36 resolve the unavailability, if possible. 37 """ 38 39 @staticmethod 40 def get_first_stream_slice(stream: Stream) -> Optional[Mapping[str, Any]]: 41 """ 42 Gets the first stream_slice from a given stream's stream_slices. 43 :param stream: stream 44 :raises StopIteration: if there is no first slice to return (the stream_slices generator is empty) 45 :return: first stream slice from 'stream_slices' generator (`None` is a valid stream slice) 46 """ 47 # We wrap the return output of stream_slices() because some implementations return types that are iterable, 48 # but not iterators such as lists or tuples 49 slices = iter( 50 stream.stream_slices( 51 cursor_field=stream.cursor_field, # type: ignore[arg-type] 52 sync_mode=SyncMode.full_refresh, 53 ) 54 ) 55 return next(slices) 56 57 @staticmethod 58 def get_first_record_for_slice( 59 stream: Stream, stream_slice: Optional[Mapping[str, Any]] 60 ) -> StreamData: 61 """ 62 Gets the first record for a stream_slice of a stream. 63 64 :param stream: stream instance from which to read records 65 :param stream_slice: stream_slice parameters for slicing the stream 66 :raises StopIteration: if there is no first record to return (the read_records generator is empty) 67 :return: StreamData containing the first record in the slice 68 """ 69 # Store the original value of exit_on_rate_limit 70 original_exit_on_rate_limit = stream.exit_on_rate_limit 71 72 try: 73 # Ensure exit_on_rate_limit is safely set to True if possible 74 stream.exit_on_rate_limit = True 75 76 # We wrap the return output of read_records() because some implementations return types that are iterable, 77 # but not iterators such as lists or tuples 78 records_for_slice = iter( 79 stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice) 80 ) 81 82 return next(records_for_slice) 83 finally: 84 # Restore the original exit_on_rate_limit value 85 stream.exit_on_rate_limit = original_exit_on_rate_limit
Abstract base class for checking stream availability.
23 @abstractmethod 24 def check_availability( 25 self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None 26 ) -> Tuple[bool, Optional[str]]: 27 """ 28 Checks stream availability. 29 30 :param stream: stream 31 :param logger: source logger 32 :param source: (optional) source 33 :return: A tuple of (boolean, str). If boolean is true, then the stream 34 is available, and no str is required. Otherwise, the stream is unavailable 35 for some reason and the str should describe what went wrong and how to 36 resolve the unavailability, if possible. 37 """
Checks stream availability.
Parameters
- stream: stream
- logger: source logger
- source: (optional) source
Returns
A tuple of (boolean, str). If boolean is true, then the stream is available, and no str is required. Otherwise, the stream is unavailable for some reason and the str should describe what went wrong and how to resolve the unavailability, if possible.
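As a hedged sketch, a custom strategy only needs to implement `check_availability`; the class name and error handling below are illustrative, not part of the CDK.

```python
import logging
from typing import Any, Optional, Tuple

from airbyte_cdk import AvailabilityStrategy, Source, Stream


class FirstRecordAvailabilityStrategy(AvailabilityStrategy):
    """Hypothetical strategy: a stream is available if we can attempt to read one record."""

    def check_availability(
        self, stream: Stream, logger: logging.Logger, source: Optional[Source] = None
    ) -> Tuple[bool, Optional[str]]:
        try:
            # Both helpers below raise StopIteration when their generator is empty.
            first_slice = self.get_first_stream_slice(stream)
            self.get_first_record_for_slice(stream, first_slice)
            return True, None
        except StopIteration:
            return True, None  # reachable but empty still counts as available
        except Exception as exc:  # a real strategy would catch narrower exceptions
            return False, f"Unable to read from stream {stream.name}: {exc}"
```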
39 @staticmethod 40 def get_first_stream_slice(stream: Stream) -> Optional[Mapping[str, Any]]: 41 """ 42 Gets the first stream_slice from a given stream's stream_slices. 43 :param stream: stream 44 :raises StopIteration: if there is no first slice to return (the stream_slices generator is empty) 45 :return: first stream slice from 'stream_slices' generator (`None` is a valid stream slice) 46 """ 47 # We wrap the return output of stream_slices() because some implementations return types that are iterable, 48 # but not iterators such as lists or tuples 49 slices = iter( 50 stream.stream_slices( 51 cursor_field=stream.cursor_field, # type: ignore[arg-type] 52 sync_mode=SyncMode.full_refresh, 53 ) 54 ) 55 return next(slices)
Gets the first stream_slice from a given stream's stream_slices.
Parameters
- stream: stream
Raises
- StopIteration: if there is no first slice to return (the stream_slices generator is empty)
Returns
first stream slice from 'stream_slices' generator (`None` is a valid stream slice)
57 @staticmethod 58 def get_first_record_for_slice( 59 stream: Stream, stream_slice: Optional[Mapping[str, Any]] 60 ) -> StreamData: 61 """ 62 Gets the first record for a stream_slice of a stream. 63 64 :param stream: stream instance from which to read records 65 :param stream_slice: stream_slice parameters for slicing the stream 66 :raises StopIteration: if there is no first record to return (the read_records generator is empty) 67 :return: StreamData containing the first record in the slice 68 """ 69 # Store the original value of exit_on_rate_limit 70 original_exit_on_rate_limit = stream.exit_on_rate_limit 71 72 try: 73 # Ensure exit_on_rate_limit is safely set to True if possible 74 stream.exit_on_rate_limit = True 75 76 # We wrap the return output of read_records() because some implementations return types that are iterable, 77 # but not iterators such as lists or tuples 78 records_for_slice = iter( 79 stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice) 80 ) 81 82 return next(records_for_slice) 83 finally: 84 # Restore the original exit_on_rate_limit value 85 stream.exit_on_rate_limit = original_exit_on_rate_limit
Gets the first record for a stream_slice of a stream.
Parameters
- stream: stream instance from which to read records
- stream_slice: stream_slice parameters for slicing the stream
Raises
- StopIteration: if there is no first record to return (the read_records generator is empty)
Returns
StreamData containing the first record in the slice
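Both helpers are static methods, so they can also be used directly when probing a stream by hand. `ExamplePostsStream` is the hypothetical stream from the earlier sketch; any `Stream` instance works the same way.

```python
from airbyte_cdk import AvailabilityStrategy

stream = ExamplePostsStream()  # hypothetical HttpStream from the earlier sketch

try:
    first_slice = AvailabilityStrategy.get_first_stream_slice(stream)
    first_record = AvailabilityStrategy.get_first_record_for_slice(stream, first_slice)
except StopIteration:
    first_record = None  # the stream is reachable but empty
```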
18class HttpAvailabilityStrategy(AvailabilityStrategy): 19 def check_availability( 20 self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None 21 ) -> Tuple[bool, Optional[str]]: 22 """ 23 Check stream availability by attempting to read the first record of the 24 stream. 25 26 :param stream: stream 27 :param logger: source logger 28 :param source: (optional) source 29 :return: A tuple of (boolean, str). If boolean is true, then the stream 30 is available, and no str is required. Otherwise, the stream is unavailable 31 for some reason and the str should describe what went wrong and how to 32 resolve the unavailability, if possible. 33 """ 34 reason: Optional[str] 35 try: 36 # Some streams need a stream slice to read records (e.g. if they have a SubstreamPartitionRouter) 37 # Streams that don't need a stream slice will return `None` as their first stream slice. 38 stream_slice = self.get_first_stream_slice(stream) 39 except StopIteration: 40 # If stream_slices has no `next()` item (Note - this is different from stream_slices returning [None]!) 41 # This can happen when a substream's `stream_slices` method does a `for record in parent_records: yield <something>` 42 # without accounting for the case in which the parent stream is empty. 43 reason = f"Cannot attempt to connect to stream {stream.name} - no stream slices were found, likely because the parent stream is empty." 44 return False, reason 45 except AirbyteTracedException as error: 46 return False, error.message 47 48 try: 49 self.get_first_record_for_slice(stream, stream_slice) 50 return True, None 51 except StopIteration: 52 logger.info(f"Successfully connected to stream {stream.name}, but got 0 records.") 53 return True, None 54 except AirbyteTracedException as error: 55 return False, error.message
Abstract base class for checking stream availability.
19 def check_availability( 20 self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None 21 ) -> Tuple[bool, Optional[str]]: 22 """ 23 Check stream availability by attempting to read the first record of the 24 stream. 25 26 :param stream: stream 27 :param logger: source logger 28 :param source: (optional) source 29 :return: A tuple of (boolean, str). If boolean is true, then the stream 30 is available, and no str is required. Otherwise, the stream is unavailable 31 for some reason and the str should describe what went wrong and how to 32 resolve the unavailability, if possible. 33 """ 34 reason: Optional[str] 35 try: 36 # Some streams need a stream slice to read records (e.g. if they have a SubstreamPartitionRouter) 37 # Streams that don't need a stream slice will return `None` as their first stream slice. 38 stream_slice = self.get_first_stream_slice(stream) 39 except StopIteration: 40 # If stream_slices has no `next()` item (Note - this is different from stream_slices returning [None]!) 41 # This can happen when a substream's `stream_slices` method does a `for record in parent_records: yield <something>` 42 # without accounting for the case in which the parent stream is empty. 43 reason = f"Cannot attempt to connect to stream {stream.name} - no stream slices were found, likely because the parent stream is empty." 44 return False, reason 45 except AirbyteTracedException as error: 46 return False, error.message 47 48 try: 49 self.get_first_record_for_slice(stream, stream_slice) 50 return True, None 51 except StopIteration: 52 logger.info(f"Successfully connected to stream {stream.name}, but got 0 records.") 53 return True, None 54 except AirbyteTracedException as error: 55 return False, error.message
Check stream availability by attempting to read the first record of the stream.
Parameters
- stream: stream
- logger: source logger
- source: (optional) source
Returns
A tuple of (boolean, str). If boolean is true, then the stream is available, and no str is required. Otherwise, the stream is unavailable for some reason and the str should describe what went wrong and how to resolve the unavailability, if possible.
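A hedged usage sketch: `ExamplePostsStream` refers to the hypothetical `HttpStream` sketched near the top of this page; any configured `HttpStream` instance can be checked the same way.

```python
import logging

from airbyte_cdk import HttpAvailabilityStrategy

logger = logging.getLogger("airbyte")
stream = ExamplePostsStream()  # hypothetical HttpStream from the earlier sketch

available, reason = HttpAvailabilityStrategy().check_availability(stream, logger)
if not available:
    logger.warning("Stream %s is unavailable: %s", stream.name, reason)
```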
11@dataclass 12class ResumableFullRefreshCursor(Cursor): 13 """ 14 Cursor that allows for the checkpointing of sync progress according to a synthetic cursor based on the pagination state 15 of the stream. Resumable full refresh syncs are only intended to retain state in between sync attempts of the same job 16 with the platform responsible for removing said state. 17 """ 18 19 def __init__(self) -> None: 20 self._cursor: StreamState = {} 21 22 def get_stream_state(self) -> StreamState: 23 return self._cursor 24 25 def set_initial_state(self, stream_state: StreamState) -> None: 26 self._cursor = stream_state 27 28 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 29 """ 30 Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records. 31 """ 32 pass 33 34 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 35 self._cursor = stream_slice.cursor_slice 36 37 def should_be_synced(self, record: Record) -> bool: 38 """ 39 Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages 40 that don't have filterable bounds. We should always return them. 41 """ 42 return True 43 44 def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: 45 """ 46 RFR record don't have ordering to be compared between one another. 47 """ 48 return False 49 50 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 51 # A top-level RFR cursor only manages the state of a single partition 52 return self._cursor
Cursor that allows for the checkpointing of sync progress according to a synthetic cursor based on the pagination state of the stream. Resumable full refresh syncs are only intended to retain state in between sync attempts of the same job with the platform responsible for removing said state.
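A hedged sketch of the checkpointing flow based on the methods shown above; the `"page"` key is an arbitrary example of the synthetic pagination state this cursor tracks.

```python
from airbyte_cdk import ResumableFullRefreshCursor, StreamSlice

cursor = ResumableFullRefreshCursor()

# State carried over from a previous attempt of the same job.
cursor.set_initial_state({"page": 3})
assert cursor.get_stream_state() == {"page": 3}

# After processing a slice, the cursor checkpoints that slice's cursor_slice.
cursor.close_slice(StreamSlice(partition={}, cursor_slice={"page": 4}))
assert cursor.get_stream_state() == {"page": 4}
```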
Returns the current stream state. We would like to restrict its usage since it exposes internals of the state. As of 2023-06-14, it is used for a few things:
- Interpolation of the requests
- Transformation of records
- Saving the state
For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.
Cursors are not initialized with their state. Since state is needed for them to function properly, this method should be called before anything else.
Parameters
- stream_state: The state of the stream as returned by get_stream_state
28 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 29 """ 30 Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records. 31 """ 32 pass
Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records.
34 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 35 self._cursor = stream_slice.cursor_slice
Update state based on the stream slice. Note that `stream_slice.cursor_slice` and `most_recent_record.associated_slice` are expected to be the same, but we make it explicit here that `stream_slice` should be leveraged to update the state. We do not pass in the latest record, since cursor instances should maintain the relevant internal state on their own.
Parameters
- stream_slice: slice to close
37 def should_be_synced(self, record: Record) -> bool: 38 """ 39 Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages 40 that don't have filterable bounds. We should always return them. 41 """ 42 return True
Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages that don't have filterable bounds. We should always return them.
44 def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: 45 """ 46 RFR record don't have ordering to be compared between one another. 47 """ 48 return False
RFR records don't have an ordering to be compared between one another.
50 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 51 # A top-level RFR cursor only manages the state of a single partition 52 return self._cursor
Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
128class ConcurrentCursor(Cursor): 129 _START_BOUNDARY = 0 130 _END_BOUNDARY = 1 131 132 def __init__( 133 self, 134 stream_name: str, 135 stream_namespace: Optional[str], 136 stream_state: Any, 137 message_repository: MessageRepository, 138 connector_state_manager: ConnectorStateManager, 139 connector_state_converter: AbstractStreamStateConverter, 140 cursor_field: CursorField, 141 slice_boundary_fields: Optional[Tuple[str, str]], 142 start: Optional[CursorValueType], 143 end_provider: Callable[[], CursorValueType], 144 lookback_window: Optional[GapType] = None, 145 slice_range: Optional[GapType] = None, 146 cursor_granularity: Optional[GapType] = None, 147 clamping_strategy: ClampingStrategy = NoClamping(), 148 ) -> None: 149 self._stream_name = stream_name 150 self._stream_namespace = stream_namespace 151 self._message_repository = message_repository 152 self._connector_state_converter = connector_state_converter 153 self._connector_state_manager = connector_state_manager 154 self._cursor_field = cursor_field 155 # To see some example where the slice boundaries might not be defined, check https://github.com/airbytehq/airbyte/blob/1ce84d6396e446e1ac2377362446e3fb94509461/airbyte-integrations/connectors/source-stripe/source_stripe/streams.py#L363-L379 156 self._slice_boundary_fields = slice_boundary_fields 157 self._start = start 158 self._end_provider = end_provider 159 self.start, self._concurrent_state = self._get_concurrent_state(stream_state) 160 self._lookback_window = lookback_window 161 self._slice_range = slice_range 162 self._most_recent_cursor_value_per_partition: MutableMapping[ 163 Union[StreamSlice, Mapping[str, Any], None], Any 164 ] = {} 165 self._has_closed_at_least_one_slice = False 166 self._cursor_granularity = cursor_granularity 167 # Flag to track if the logger has been triggered (per stream) 168 self._should_be_synced_logger_triggered = False 169 self._clamping_strategy = clamping_strategy 170 171 @property 172 def state(self) -> MutableMapping[str, Any]: 173 return self._connector_state_converter.convert_to_state_message( 174 self.cursor_field, self._concurrent_state 175 ) 176 177 @property 178 def cursor_field(self) -> CursorField: 179 return self._cursor_field 180 181 @property 182 def _slice_boundary_fields_wrapper(self) -> Tuple[str, str]: 183 return ( 184 self._slice_boundary_fields 185 if self._slice_boundary_fields 186 else ( 187 self._connector_state_converter.START_KEY, 188 self._connector_state_converter.END_KEY, 189 ) 190 ) 191 192 def _get_concurrent_state( 193 self, state: MutableMapping[str, Any] 194 ) -> Tuple[CursorValueType, MutableMapping[str, Any]]: 195 if self._connector_state_converter.is_state_message_compatible(state): 196 return ( 197 self._start or self._connector_state_converter.zero_value, 198 self._connector_state_converter.deserialize(state), 199 ) 200 return self._connector_state_converter.convert_from_sequential_state( 201 self._cursor_field, state, self._start 202 ) 203 204 def observe(self, record: Record) -> None: 205 most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get( 206 record.associated_slice 207 ) 208 try: 209 cursor_value = self._extract_cursor_value(record) 210 211 if most_recent_cursor_value is None or most_recent_cursor_value < cursor_value: 212 self._most_recent_cursor_value_per_partition[record.associated_slice] = cursor_value 213 except ValueError: 214 self._log_for_record_without_cursor_value() 215 216 def _extract_cursor_value(self, record: Record) -> Any: 217 return 
self._connector_state_converter.parse_value(self._cursor_field.extract_value(record)) 218 219 def close_partition(self, partition: Partition) -> None: 220 slice_count_before = len(self._concurrent_state.get("slices", [])) 221 self._add_slice_to_state(partition) 222 if slice_count_before < len( 223 self._concurrent_state["slices"] 224 ): # only emit if at least one slice has been processed 225 self._merge_partitions() 226 self._emit_state_message() 227 self._has_closed_at_least_one_slice = True 228 229 def _add_slice_to_state(self, partition: Partition) -> None: 230 most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get( 231 partition.to_slice() 232 ) 233 234 if self._slice_boundary_fields: 235 if "slices" not in self._concurrent_state: 236 raise RuntimeError( 237 f"The state for stream {self._stream_name} should have at least one slice to delineate the sync start time, but no slices are present. This is unexpected. Please contact Support." 238 ) 239 self._concurrent_state["slices"].append( 240 { 241 self._connector_state_converter.START_KEY: self._extract_from_slice( 242 partition, self._slice_boundary_fields[self._START_BOUNDARY] 243 ), 244 self._connector_state_converter.END_KEY: self._extract_from_slice( 245 partition, self._slice_boundary_fields[self._END_BOUNDARY] 246 ), 247 self._connector_state_converter.MOST_RECENT_RECORD_KEY: most_recent_cursor_value, 248 } 249 ) 250 elif most_recent_cursor_value: 251 if self._has_closed_at_least_one_slice: 252 # If we track state value using records cursor field, we can only do that if there is one partition. This is because we save 253 # the state every time we close a partition. We assume that if there are multiple slices, they need to be providing 254 # boundaries. There are cases where partitions could not have boundaries: 255 # * The cursor should be per-partition 256 # * The stream state is actually the parent stream state 257 # There might be other cases not listed above. Those are not supported today hence the stream should not use this cursor for 258 # state management. For the specific user that was affected with this issue, we need to: 259 # * Fix state tracking (which is currently broken) 260 # * Make the new version available 261 # * (Probably) ask the user to reset the stream to avoid data loss 262 raise ValueError( 263 "Given that slice_boundary_fields is not defined and that per-partition state is not supported, only one slice is " 264 "expected. Please contact the Airbyte team." 
265 ) 266 267 self._concurrent_state["slices"].append( 268 { 269 self._connector_state_converter.START_KEY: self.start, 270 self._connector_state_converter.END_KEY: most_recent_cursor_value, 271 self._connector_state_converter.MOST_RECENT_RECORD_KEY: most_recent_cursor_value, 272 } 273 ) 274 275 def _emit_state_message(self) -> None: 276 self._connector_state_manager.update_state_for_stream( 277 self._stream_name, 278 self._stream_namespace, 279 self.state, 280 ) 281 state_message = self._connector_state_manager.create_state_message( 282 self._stream_name, self._stream_namespace 283 ) 284 self._message_repository.emit_message(state_message) 285 286 def _merge_partitions(self) -> None: 287 self._concurrent_state["slices"] = self._connector_state_converter.merge_intervals( 288 self._concurrent_state["slices"] 289 ) 290 291 def _extract_from_slice(self, partition: Partition, key: str) -> CursorValueType: 292 try: 293 _slice = partition.to_slice() 294 if not _slice: 295 raise KeyError(f"Could not find key `{key}` in empty slice") 296 return self._connector_state_converter.parse_value(_slice[key]) # type: ignore # we expect the devs to specify a key that would return a CursorValueType 297 except KeyError as exception: 298 raise KeyError( 299 f"Partition is expected to have key `{key}` but could not be found" 300 ) from exception 301 302 def ensure_at_least_one_state_emitted(self) -> None: 303 """ 304 The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be 305 called. 306 """ 307 self._emit_state_message() 308 309 def stream_slices(self) -> Iterable[StreamSlice]: 310 """ 311 Generating slices based on a few parameters: 312 * lookback_window: Buffer to remove from END_KEY of the highest slice 313 * slice_range: Max difference between two slices. If the difference between two slices is greater, multiple slices will be created 314 * start: `_split_per_slice_range` will clip any value to `self._start which means that: 315 * if upper is less than self._start, no slices will be generated 316 * if lower is less than self._start, self._start will be used as the lower boundary (lookback_window will not be considered in that case) 317 318 Note that the slices will overlap at their boundaries. We therefore expect to have at least the lower or the upper boundary to be 319 inclusive in the API that is queried. 
320 """ 321 self._merge_partitions() 322 323 if self._start is not None and self._is_start_before_first_slice(): 324 yield from self._split_per_slice_range( 325 self._start, 326 self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY], 327 False, 328 ) 329 330 if len(self._concurrent_state["slices"]) == 1: 331 yield from self._split_per_slice_range( 332 self._calculate_lower_boundary_of_last_slice( 333 self._concurrent_state["slices"][0][self._connector_state_converter.END_KEY] 334 ), 335 self._end_provider(), 336 True, 337 ) 338 elif len(self._concurrent_state["slices"]) > 1: 339 for i in range(len(self._concurrent_state["slices"]) - 1): 340 if self._cursor_granularity: 341 yield from self._split_per_slice_range( 342 self._concurrent_state["slices"][i][self._connector_state_converter.END_KEY] 343 + self._cursor_granularity, 344 self._concurrent_state["slices"][i + 1][ 345 self._connector_state_converter.START_KEY 346 ], 347 False, 348 ) 349 else: 350 yield from self._split_per_slice_range( 351 self._concurrent_state["slices"][i][ 352 self._connector_state_converter.END_KEY 353 ], 354 self._concurrent_state["slices"][i + 1][ 355 self._connector_state_converter.START_KEY 356 ], 357 False, 358 ) 359 yield from self._split_per_slice_range( 360 self._calculate_lower_boundary_of_last_slice( 361 self._concurrent_state["slices"][-1][self._connector_state_converter.END_KEY] 362 ), 363 self._end_provider(), 364 True, 365 ) 366 else: 367 raise ValueError("Expected at least one slice") 368 369 def _is_start_before_first_slice(self) -> bool: 370 return ( 371 self._start is not None 372 and self._start 373 < self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY] 374 ) 375 376 def _calculate_lower_boundary_of_last_slice( 377 self, lower_boundary: CursorValueType 378 ) -> CursorValueType: 379 if self._lookback_window: 380 return lower_boundary - self._lookback_window 381 return lower_boundary 382 383 def _split_per_slice_range( 384 self, lower: CursorValueType, upper: CursorValueType, upper_is_end: bool 385 ) -> Iterable[StreamSlice]: 386 if lower >= upper: 387 return 388 389 if self._start and upper < self._start: 390 return 391 392 lower = max(lower, self._start) if self._start else lower 393 if not self._slice_range or self._evaluate_upper_safely(lower, self._slice_range) >= upper: 394 clamped_lower = self._clamping_strategy.clamp(lower) 395 clamped_upper = self._clamping_strategy.clamp(upper) 396 start_value, end_value = ( 397 (clamped_lower, clamped_upper - self._cursor_granularity) 398 if self._cursor_granularity and not upper_is_end 399 else (clamped_lower, clamped_upper) 400 ) 401 yield StreamSlice( 402 partition={}, 403 cursor_slice={ 404 self._slice_boundary_fields_wrapper[ 405 self._START_BOUNDARY 406 ]: self._connector_state_converter.output_format(start_value), 407 self._slice_boundary_fields_wrapper[ 408 self._END_BOUNDARY 409 ]: self._connector_state_converter.output_format(end_value), 410 }, 411 ) 412 else: 413 stop_processing = False 414 current_lower_boundary = lower 415 while not stop_processing: 416 current_upper_boundary = min( 417 self._evaluate_upper_safely(current_lower_boundary, self._slice_range), upper 418 ) 419 has_reached_upper_boundary = current_upper_boundary >= upper 420 421 clamped_upper = ( 422 self._clamping_strategy.clamp(current_upper_boundary) 423 if current_upper_boundary != upper 424 else current_upper_boundary 425 ) 426 clamped_lower = self._clamping_strategy.clamp(current_lower_boundary) 427 if clamped_lower >= 
clamped_upper: 428 # clamping collapsed both values which means that it is time to stop processing 429 # FIXME should this be replace by proper end_provider 430 break 431 start_value, end_value = ( 432 (clamped_lower, clamped_upper - self._cursor_granularity) 433 if self._cursor_granularity 434 and (not upper_is_end or not has_reached_upper_boundary) 435 else (clamped_lower, clamped_upper) 436 ) 437 yield StreamSlice( 438 partition={}, 439 cursor_slice={ 440 self._slice_boundary_fields_wrapper[ 441 self._START_BOUNDARY 442 ]: self._connector_state_converter.output_format(start_value), 443 self._slice_boundary_fields_wrapper[ 444 self._END_BOUNDARY 445 ]: self._connector_state_converter.output_format(end_value), 446 }, 447 ) 448 current_lower_boundary = clamped_upper 449 if current_upper_boundary >= upper: 450 stop_processing = True 451 452 def _evaluate_upper_safely(self, lower: CursorValueType, step: GapType) -> CursorValueType: 453 """ 454 Given that we set the default step at datetime.timedelta.max, we will generate an OverflowError when evaluating the next start_date 455 This method assumes that users would never enter a step that would generate an overflow. Given that would be the case, the code 456 would have broken anyway. 457 """ 458 try: 459 return lower + step 460 except OverflowError: 461 return self._end_provider() 462 463 def should_be_synced(self, record: Record) -> bool: 464 """ 465 Determines if a record should be synced based on its cursor value. 466 :param record: The record to evaluate 467 468 :return: True if the record's cursor value falls within the sync boundaries 469 """ 470 try: 471 record_cursor_value: CursorValueType = self._extract_cursor_value(record) 472 except ValueError: 473 self._log_for_record_without_cursor_value() 474 return True 475 return self.start <= record_cursor_value <= self._end_provider() 476 477 def _log_for_record_without_cursor_value(self) -> None: 478 if not self._should_be_synced_logger_triggered: 479 LOGGER.warning( 480 f"Could not find cursor field `{self.cursor_field.cursor_field_key}` in record for stream {self._stream_name}. The incremental sync will assume it needs to be synced" 481 ) 482 self._should_be_synced_logger_triggered = True
Slices the stream into chunks that can be fetched independently. Slices enable state checkpointing and data retrieval parallelization.
132 def __init__( 133 self, 134 stream_name: str, 135 stream_namespace: Optional[str], 136 stream_state: Any, 137 message_repository: MessageRepository, 138 connector_state_manager: ConnectorStateManager, 139 connector_state_converter: AbstractStreamStateConverter, 140 cursor_field: CursorField, 141 slice_boundary_fields: Optional[Tuple[str, str]], 142 start: Optional[CursorValueType], 143 end_provider: Callable[[], CursorValueType], 144 lookback_window: Optional[GapType] = None, 145 slice_range: Optional[GapType] = None, 146 cursor_granularity: Optional[GapType] = None, 147 clamping_strategy: ClampingStrategy = NoClamping(), 148 ) -> None: 149 self._stream_name = stream_name 150 self._stream_namespace = stream_namespace 151 self._message_repository = message_repository 152 self._connector_state_converter = connector_state_converter 153 self._connector_state_manager = connector_state_manager 154 self._cursor_field = cursor_field 155 # To see some example where the slice boundaries might not be defined, check https://github.com/airbytehq/airbyte/blob/1ce84d6396e446e1ac2377362446e3fb94509461/airbyte-integrations/connectors/source-stripe/source_stripe/streams.py#L363-L379 156 self._slice_boundary_fields = slice_boundary_fields 157 self._start = start 158 self._end_provider = end_provider 159 self.start, self._concurrent_state = self._get_concurrent_state(stream_state) 160 self._lookback_window = lookback_window 161 self._slice_range = slice_range 162 self._most_recent_cursor_value_per_partition: MutableMapping[ 163 Union[StreamSlice, Mapping[str, Any], None], Any 164 ] = {} 165 self._has_closed_at_least_one_slice = False 166 self._cursor_granularity = cursor_granularity 167 # Flag to track if the logger has been triggered (per stream) 168 self._should_be_synced_logger_triggered = False 169 self._clamping_strategy = clamping_strategy
204 def observe(self, record: Record) -> None: 205 most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get( 206 record.associated_slice 207 ) 208 try: 209 cursor_value = self._extract_cursor_value(record) 210 211 if most_recent_cursor_value is None or most_recent_cursor_value < cursor_value: 212 self._most_recent_cursor_value_per_partition[record.associated_slice] = cursor_value 213 except ValueError: 214 self._log_for_record_without_cursor_value()
Indicate to the cursor that the record has been emitted
219 def close_partition(self, partition: Partition) -> None: 220 slice_count_before = len(self._concurrent_state.get("slices", [])) 221 self._add_slice_to_state(partition) 222 if slice_count_before < len( 223 self._concurrent_state["slices"] 224 ): # only emit if at least one slice has been processed 225 self._merge_partitions() 226 self._emit_state_message() 227 self._has_closed_at_least_one_slice = True
Indicate to the cursor that the partition has been successfully processed
302 def ensure_at_least_one_state_emitted(self) -> None: 303 """ 304 The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be 305 called. 306 """ 307 self._emit_state_message()
The platform expects at least one state message on successful syncs. Hence, whatever happens, we expect this method to be called.
309 def stream_slices(self) -> Iterable[StreamSlice]: 310 """ 311 Generating slices based on a few parameters: 312 * lookback_window: Buffer to remove from END_KEY of the highest slice 313 * slice_range: Max difference between two slices. If the difference between two slices is greater, multiple slices will be created 314 * start: `_split_per_slice_range` will clip any value to `self._start which means that: 315 * if upper is less than self._start, no slices will be generated 316 * if lower is less than self._start, self._start will be used as the lower boundary (lookback_window will not be considered in that case) 317 318 Note that the slices will overlap at their boundaries. We therefore expect to have at least the lower or the upper boundary to be 319 inclusive in the API that is queried. 320 """ 321 self._merge_partitions() 322 323 if self._start is not None and self._is_start_before_first_slice(): 324 yield from self._split_per_slice_range( 325 self._start, 326 self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY], 327 False, 328 ) 329 330 if len(self._concurrent_state["slices"]) == 1: 331 yield from self._split_per_slice_range( 332 self._calculate_lower_boundary_of_last_slice( 333 self._concurrent_state["slices"][0][self._connector_state_converter.END_KEY] 334 ), 335 self._end_provider(), 336 True, 337 ) 338 elif len(self._concurrent_state["slices"]) > 1: 339 for i in range(len(self._concurrent_state["slices"]) - 1): 340 if self._cursor_granularity: 341 yield from self._split_per_slice_range( 342 self._concurrent_state["slices"][i][self._connector_state_converter.END_KEY] 343 + self._cursor_granularity, 344 self._concurrent_state["slices"][i + 1][ 345 self._connector_state_converter.START_KEY 346 ], 347 False, 348 ) 349 else: 350 yield from self._split_per_slice_range( 351 self._concurrent_state["slices"][i][ 352 self._connector_state_converter.END_KEY 353 ], 354 self._concurrent_state["slices"][i + 1][ 355 self._connector_state_converter.START_KEY 356 ], 357 False, 358 ) 359 yield from self._split_per_slice_range( 360 self._calculate_lower_boundary_of_last_slice( 361 self._concurrent_state["slices"][-1][self._connector_state_converter.END_KEY] 362 ), 363 self._end_provider(), 364 True, 365 ) 366 else: 367 raise ValueError("Expected at least one slice")
Generates slices based on a few parameters:

- lookback_window: Buffer to remove from END_KEY of the highest slice
- slice_range: Max difference between two slices. If the difference between two slices is greater, multiple slices will be created
- start: `_split_per_slice_range` will clip any value to `self._start`, which means that:
  - if upper is less than `self._start`, no slices will be generated
  - if lower is less than `self._start`, `self._start` will be used as the lower boundary (lookback_window will not be considered in that case)

Note that the slices will overlap at their boundaries. We therefore expect at least the lower or the upper boundary to be inclusive in the API that is queried. (A standalone sketch of this splitting rule follows below.)
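The sketch below is not the CDK implementation; it only illustrates the splitting and lookback behaviour described above, using datetimes as cursor values.

```python
from datetime import datetime, timedelta
from typing import Iterator, Tuple


def split_per_slice_range(
    lower: datetime, upper: datetime, slice_range: timedelta
) -> Iterator[Tuple[datetime, datetime]]:
    """Yield (start, end) windows of at most `slice_range` covering [lower, upper)."""
    current = lower
    while current < upper:
        yield current, min(current + slice_range, upper)
        current += slice_range


# Resume from the highest closed slice, minus a lookback buffer.
last_closed_end = datetime(2024, 1, 7)
lookback_window = timedelta(days=1)
now = datetime(2024, 1, 10)

for start, end in split_per_slice_range(last_closed_end - lookback_window, now, timedelta(days=2)):
    print(start, end)
# -> (2024-01-06, 2024-01-08), (2024-01-08, 2024-01-10)
```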
463 def should_be_synced(self, record: Record) -> bool: 464 """ 465 Determines if a record should be synced based on its cursor value. 466 :param record: The record to evaluate 467 468 :return: True if the record's cursor value falls within the sync boundaries 469 """ 470 try: 471 record_cursor_value: CursorValueType = self._extract_cursor_value(record) 472 except ValueError: 473 self._log_for_record_without_cursor_value() 474 return True 475 return self.start <= record_cursor_value <= self._end_provider()
Determines if a record should be synced based on its cursor value.
Parameters
- record: The record to evaluate
Returns
True if the record's cursor value falls within the sync boundaries
30class ConcurrentSource: 31 """ 32 A Source that reads data from multiple AbstractStreams concurrently. 33 It does so by submitting partition generation, and partition read tasks to a thread pool. 34 The tasks asynchronously add their output to a shared queue. 35 The read is done when all partitions for all streams w ere generated and read. 36 """ 37 38 DEFAULT_TIMEOUT_SECONDS = 900 39 40 @staticmethod 41 def create( 42 num_workers: int, 43 initial_number_of_partitions_to_generate: int, 44 logger: logging.Logger, 45 slice_logger: SliceLogger, 46 message_repository: MessageRepository, 47 timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, 48 ) -> "ConcurrentSource": 49 is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1 50 too_many_generator = ( 51 not is_single_threaded and initial_number_of_partitions_to_generate >= num_workers 52 ) 53 assert ( 54 not too_many_generator 55 ), "It is required to have more workers than threads generating partitions" 56 threadpool = ThreadPoolManager( 57 concurrent.futures.ThreadPoolExecutor( 58 max_workers=num_workers, thread_name_prefix="workerpool" 59 ), 60 logger, 61 ) 62 return ConcurrentSource( 63 threadpool, 64 logger, 65 slice_logger, 66 message_repository, 67 initial_number_of_partitions_to_generate, 68 timeout_seconds, 69 ) 70 71 def __init__( 72 self, 73 threadpool: ThreadPoolManager, 74 logger: logging.Logger, 75 slice_logger: SliceLogger = DebugSliceLogger(), 76 message_repository: MessageRepository = InMemoryMessageRepository(), 77 initial_number_partitions_to_generate: int = 1, 78 timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, 79 ) -> None: 80 """ 81 :param threadpool: The threadpool to submit tasks to 82 :param logger: The logger to log to 83 :param slice_logger: The slice logger used to create messages on new slices 84 :param message_repository: The repository to emit messages to 85 :param initial_number_partitions_to_generate: The initial number of concurrent partition generation tasks. Limiting this number ensures will limit the latency of the first records emitted. While the latency is not critical, emitting the records early allows the platform and the destination to process them as early as possible. 86 :param timeout_seconds: The maximum number of seconds to wait for a record to be read from the queue. If no record is read within this time, the source will stop reading and return. 87 """ 88 self._threadpool = threadpool 89 self._logger = logger 90 self._slice_logger = slice_logger 91 self._message_repository = message_repository 92 self._initial_number_partitions_to_generate = initial_number_partitions_to_generate 93 self._timeout_seconds = timeout_seconds 94 95 def read( 96 self, 97 streams: List[AbstractStream], 98 ) -> Iterator[AirbyteMessage]: 99 self._logger.info("Starting syncing") 100 101 # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less 102 # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating 103 # partitions which would fill the queue. 
This number is arbitrarily set to 10_000 but will probably need to be changed given more 104 # information and might even need to be configurable depending on the source 105 queue: Queue[QueueItem] = Queue(maxsize=10_000) 106 concurrent_stream_processor = ConcurrentReadProcessor( 107 streams, 108 PartitionEnqueuer(queue, self._threadpool), 109 self._threadpool, 110 self._logger, 111 self._slice_logger, 112 self._message_repository, 113 PartitionReader(queue), 114 ) 115 116 # Enqueue initial partition generation tasks 117 yield from self._submit_initial_partition_generators(concurrent_stream_processor) 118 119 # Read from the queue until all partitions were generated and read 120 yield from self._consume_from_queue( 121 queue, 122 concurrent_stream_processor, 123 ) 124 self._threadpool.check_for_errors_and_shutdown() 125 self._logger.info("Finished syncing") 126 127 def _submit_initial_partition_generators( 128 self, concurrent_stream_processor: ConcurrentReadProcessor 129 ) -> Iterable[AirbyteMessage]: 130 for _ in range(self._initial_number_partitions_to_generate): 131 status_message = concurrent_stream_processor.start_next_partition_generator() 132 if status_message: 133 yield status_message 134 135 def _consume_from_queue( 136 self, 137 queue: Queue[QueueItem], 138 concurrent_stream_processor: ConcurrentReadProcessor, 139 ) -> Iterable[AirbyteMessage]: 140 while airbyte_message_or_record_or_exception := queue.get(): 141 yield from self._handle_item( 142 airbyte_message_or_record_or_exception, 143 concurrent_stream_processor, 144 ) 145 if concurrent_stream_processor.is_done() and queue.empty(): 146 # all partitions were generated and processed. we're done here 147 break 148 149 def _handle_item( 150 self, 151 queue_item: QueueItem, 152 concurrent_stream_processor: ConcurrentReadProcessor, 153 ) -> Iterable[AirbyteMessage]: 154 # handle queue item and call the appropriate handler depending on the type of the queue item 155 if isinstance(queue_item, StreamThreadException): 156 yield from concurrent_stream_processor.on_exception(queue_item) 157 elif isinstance(queue_item, PartitionGenerationCompletedSentinel): 158 yield from concurrent_stream_processor.on_partition_generation_completed(queue_item) 159 elif isinstance(queue_item, Partition): 160 concurrent_stream_processor.on_partition(queue_item) 161 elif isinstance(queue_item, PartitionCompleteSentinel): 162 yield from concurrent_stream_processor.on_partition_complete_sentinel(queue_item) 163 elif isinstance(queue_item, Record): 164 yield from concurrent_stream_processor.on_record(queue_item) 165 else: 166 raise ValueError(f"Unknown queue item type: {type(queue_item)}")
A Source that reads data from multiple AbstractStreams concurrently. It does so by submitting partition generation and partition read tasks to a thread pool. The tasks asynchronously add their output to a shared queue. The read is done when all partitions for all streams were generated and read.
71 def __init__( 72 self, 73 threadpool: ThreadPoolManager, 74 logger: logging.Logger, 75 slice_logger: SliceLogger = DebugSliceLogger(), 76 message_repository: MessageRepository = InMemoryMessageRepository(), 77 initial_number_partitions_to_generate: int = 1, 78 timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, 79 ) -> None: 80 """ 81 :param threadpool: The threadpool to submit tasks to 82 :param logger: The logger to log to 83 :param slice_logger: The slice logger used to create messages on new slices 84 :param message_repository: The repository to emit messages to 85 :param initial_number_partitions_to_generate: The initial number of concurrent partition generation tasks. Limiting this number ensures will limit the latency of the first records emitted. While the latency is not critical, emitting the records early allows the platform and the destination to process them as early as possible. 86 :param timeout_seconds: The maximum number of seconds to wait for a record to be read from the queue. If no record is read within this time, the source will stop reading and return. 87 """ 88 self._threadpool = threadpool 89 self._logger = logger 90 self._slice_logger = slice_logger 91 self._message_repository = message_repository 92 self._initial_number_partitions_to_generate = initial_number_partitions_to_generate 93 self._timeout_seconds = timeout_seconds
Parameters
- threadpool: The threadpool to submit tasks to
- logger: The logger to log to
- slice_logger: The slice logger used to create messages on new slices
- message_repository: The repository to emit messages to
- initial_number_partitions_to_generate: The initial number of concurrent partition generation tasks. Limiting this number limits the latency of the first records emitted. While the latency is not critical, emitting the records early allows the platform and the destination to process them as early as possible.
- timeout_seconds: The maximum number of seconds to wait for a record to be read from the queue. If no record is read within this time, the source will stop reading and return.
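A minimal usage sketch, assuming the current package layout for the imports; `streams` is a hypothetical `List[AbstractStream]` built by the connector, and construction goes through the `create()` factory shown just below:

```python
import logging

from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
from airbyte_cdk.sources.message import InMemoryMessageRepository
from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger

logger = logging.getLogger("airbyte")

# The factory enforces that the number of initial partition generators stays
# below the number of workers (unless the source is single-threaded).
concurrent_source = ConcurrentSource.create(
    num_workers=4,
    initial_number_of_partitions_to_generate=1,
    logger=logger,
    slice_logger=DebugSliceLogger(),
    message_repository=InMemoryMessageRepository(),
)

# `streams` is a hypothetical List[AbstractStream]; reading yields AirbyteMessages
# until every partition has been generated and consumed.
# for message in concurrent_source.read(streams):
#     print(message)
```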
40 @staticmethod 41 def create( 42 num_workers: int, 43 initial_number_of_partitions_to_generate: int, 44 logger: logging.Logger, 45 slice_logger: SliceLogger, 46 message_repository: MessageRepository, 47 timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, 48 ) -> "ConcurrentSource": 49 is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1 50 too_many_generator = ( 51 not is_single_threaded and initial_number_of_partitions_to_generate >= num_workers 52 ) 53 assert ( 54 not too_many_generator 55 ), "It is required to have more workers than threads generating partitions" 56 threadpool = ThreadPoolManager( 57 concurrent.futures.ThreadPoolExecutor( 58 max_workers=num_workers, thread_name_prefix="workerpool" 59 ), 60 logger, 61 ) 62 return ConcurrentSource( 63 threadpool, 64 logger, 65 slice_logger, 66 message_repository, 67 initial_number_of_partitions_to_generate, 68 timeout_seconds, 69 )
95 def read( 96 self, 97 streams: List[AbstractStream], 98 ) -> Iterator[AirbyteMessage]: 99 self._logger.info("Starting syncing") 100 101 # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less 102 # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating 103 # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more 104 # information and might even need to be configurable depending on the source 105 queue: Queue[QueueItem] = Queue(maxsize=10_000) 106 concurrent_stream_processor = ConcurrentReadProcessor( 107 streams, 108 PartitionEnqueuer(queue, self._threadpool), 109 self._threadpool, 110 self._logger, 111 self._slice_logger, 112 self._message_repository, 113 PartitionReader(queue), 114 ) 115 116 # Enqueue initial partition generation tasks 117 yield from self._submit_initial_partition_generators(concurrent_stream_processor) 118 119 # Read from the queue until all partitions were generated and read 120 yield from self._consume_from_queue( 121 queue, 122 concurrent_stream_processor, 123 ) 124 self._threadpool.check_for_errors_and_shutdown() 125 self._logger.info("Finished syncing")
34class ConcurrentSourceAdapter(AbstractSource, ABC): 35 def __init__(self, concurrent_source: ConcurrentSource, **kwargs: Any) -> None: 36 """ 37 ConcurrentSourceAdapter is a Source that wraps a concurrent source and exposes it as a regular source. 38 39 The source's streams are still defined through the streams() method. 40 Streams wrapped in a StreamFacade will be processed concurrently. 41 Other streams will be processed sequentially as a later step. 42 """ 43 self._concurrent_source = concurrent_source 44 super().__init__(**kwargs) 45 46 def read( 47 self, 48 logger: logging.Logger, 49 config: Mapping[str, Any], 50 catalog: ConfiguredAirbyteCatalog, 51 state: Optional[List[AirbyteStateMessage]] = None, 52 ) -> Iterator[AirbyteMessage]: 53 abstract_streams = self._select_abstract_streams(config, catalog) 54 concurrent_stream_names = {stream.name for stream in abstract_streams} 55 configured_catalog_for_regular_streams = ConfiguredAirbyteCatalog( 56 streams=[ 57 stream 58 for stream in catalog.streams 59 if stream.stream.name not in concurrent_stream_names 60 ] 61 ) 62 if abstract_streams: 63 yield from self._concurrent_source.read(abstract_streams) 64 if configured_catalog_for_regular_streams.streams: 65 yield from super().read(logger, config, configured_catalog_for_regular_streams, state) 66 67 def _select_abstract_streams( 68 self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog 69 ) -> List[AbstractStream]: 70 """ 71 Selects streams that can be processed concurrently and returns their abstract representations. 72 """ 73 all_streams = self.streams(config) 74 stream_name_to_instance: Mapping[str, Stream] = {s.name: s for s in all_streams} 75 abstract_streams: List[AbstractStream] = [] 76 for configured_stream in configured_catalog.streams: 77 stream_instance = stream_name_to_instance.get(configured_stream.stream.name) 78 if not stream_instance: 79 continue 80 81 if isinstance(stream_instance, AbstractStreamFacade): 82 abstract_streams.append(stream_instance.get_underlying_stream()) 83 return abstract_streams 84 85 def convert_to_concurrent_stream( 86 self, 87 logger: logging.Logger, 88 stream: Stream, 89 state_manager: ConnectorStateManager, 90 cursor: Optional[Cursor] = None, 91 ) -> Stream: 92 """ 93 Prepares a stream for concurrent processing by initializing or assigning a cursor, 94 managing the stream's state, and returning an updated Stream instance. 
95 """ 96 state: MutableMapping[str, Any] = {} 97 98 if cursor: 99 state = state_manager.get_stream_state(stream.name, stream.namespace) 100 101 stream.cursor = cursor # type: ignore[assignment] # cursor is of type ConcurrentCursor, which inherits from Cursor 102 if hasattr(stream, "parent"): 103 stream.parent.cursor = cursor 104 else: 105 cursor = FinalStateCursor( 106 stream_name=stream.name, 107 stream_namespace=stream.namespace, 108 message_repository=self.message_repository, # type: ignore[arg-type] # _default_message_repository will be returned in the worst case 109 ) 110 return StreamFacade.create_from_stream(stream, self, logger, state, cursor) 111 112 def initialize_cursor( 113 self, 114 stream: Stream, 115 state_manager: ConnectorStateManager, 116 converter: AbstractStreamStateConverter, 117 slice_boundary_fields: Optional[Tuple[str, str]], 118 start: Optional[CursorValueType], 119 end_provider: Callable[[], CursorValueType], 120 lookback_window: Optional[GapType] = None, 121 slice_range: Optional[GapType] = None, 122 ) -> Optional[ConcurrentCursor]: 123 lookback_window = lookback_window or timedelta(seconds=DEFAULT_LOOKBACK_SECONDS) 124 125 cursor_field_name = stream.cursor_field 126 127 if cursor_field_name: 128 if not isinstance(cursor_field_name, str): 129 raise ValueError( 130 f"Cursor field type must be a string, but received {type(cursor_field_name).__name__}." 131 ) 132 133 return ConcurrentCursor( 134 stream.name, 135 stream.namespace, 136 state_manager.get_stream_state(stream.name, stream.namespace), 137 self.message_repository, # type: ignore[arg-type] # _default_message_repository will be returned in the worst case 138 state_manager, 139 converter, 140 CursorField(cursor_field_name), 141 slice_boundary_fields, 142 start, 143 end_provider, 144 lookback_window, 145 slice_range, 146 ) 147 148 return None
Abstract base class for an Airbyte Source. Consumers should implement any abstract methods in this class to create an Airbyte Specification compliant Source.
35 def __init__(self, concurrent_source: ConcurrentSource, **kwargs: Any) -> None: 36 """ 37 ConcurrentSourceAdapter is a Source that wraps a concurrent source and exposes it as a regular source. 38 39 The source's streams are still defined through the streams() method. 40 Streams wrapped in a StreamFacade will be processed concurrently. 41 Other streams will be processed sequentially as a later step. 42 """ 43 self._concurrent_source = concurrent_source 44 super().__init__(**kwargs)
ConcurrentSourceAdapter is a Source that wraps a concurrent source and exposes it as a regular source.
The source's streams are still defined through the streams() method. Streams wrapped in a StreamFacade will be processed concurrently. Other streams will be processed sequentially as a later step.
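A hedged sketch of a source built on ConcurrentSourceAdapter; `SourceExampleConcurrent` and its stream construction are placeholders, and only the adapter API shown here is taken from the CDK source:

```python
import logging
from typing import Any, List, Mapping, Optional, Tuple

from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
from airbyte_cdk.sources.streams import Stream


class SourceExampleConcurrent(ConcurrentSourceAdapter):
    def __init__(self, concurrent_source: ConcurrentSource, **kwargs: Any) -> None:
        super().__init__(concurrent_source, **kwargs)

    def check_connection(
        self, logger: logging.Logger, config: Mapping[str, Any]
    ) -> Tuple[bool, Optional[Any]]:
        return True, None

    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
        # Streams returned as StreamFacade instances (e.g. via convert_to_concurrent_stream)
        # are read concurrently; plain Stream instances fall back to the sequential path.
        return []  # source-specific stream construction goes here
```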
46 def read( 47 self, 48 logger: logging.Logger, 49 config: Mapping[str, Any], 50 catalog: ConfiguredAirbyteCatalog, 51 state: Optional[List[AirbyteStateMessage]] = None, 52 ) -> Iterator[AirbyteMessage]: 53 abstract_streams = self._select_abstract_streams(config, catalog) 54 concurrent_stream_names = {stream.name for stream in abstract_streams} 55 configured_catalog_for_regular_streams = ConfiguredAirbyteCatalog( 56 streams=[ 57 stream 58 for stream in catalog.streams 59 if stream.stream.name not in concurrent_stream_names 60 ] 61 ) 62 if abstract_streams: 63 yield from self._concurrent_source.read(abstract_streams) 64 if configured_catalog_for_regular_streams.streams: 65 yield from super().read(logger, config, configured_catalog_for_regular_streams, state)
Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/.
85 def convert_to_concurrent_stream( 86 self, 87 logger: logging.Logger, 88 stream: Stream, 89 state_manager: ConnectorStateManager, 90 cursor: Optional[Cursor] = None, 91 ) -> Stream: 92 """ 93 Prepares a stream for concurrent processing by initializing or assigning a cursor, 94 managing the stream's state, and returning an updated Stream instance. 95 """ 96 state: MutableMapping[str, Any] = {} 97 98 if cursor: 99 state = state_manager.get_stream_state(stream.name, stream.namespace) 100 101 stream.cursor = cursor # type: ignore[assignment] # cursor is of type ConcurrentCursor, which inherits from Cursor 102 if hasattr(stream, "parent"): 103 stream.parent.cursor = cursor 104 else: 105 cursor = FinalStateCursor( 106 stream_name=stream.name, 107 stream_namespace=stream.namespace, 108 message_repository=self.message_repository, # type: ignore[arg-type] # _default_message_repository will be returned in the worst case 109 ) 110 return StreamFacade.create_from_stream(stream, self, logger, state, cursor)
Prepares a stream for concurrent processing by initializing or assigning a cursor, managing the stream's state, and returning an updated Stream instance.
112 def initialize_cursor( 113 self, 114 stream: Stream, 115 state_manager: ConnectorStateManager, 116 converter: AbstractStreamStateConverter, 117 slice_boundary_fields: Optional[Tuple[str, str]], 118 start: Optional[CursorValueType], 119 end_provider: Callable[[], CursorValueType], 120 lookback_window: Optional[GapType] = None, 121 slice_range: Optional[GapType] = None, 122 ) -> Optional[ConcurrentCursor]: 123 lookback_window = lookback_window or timedelta(seconds=DEFAULT_LOOKBACK_SECONDS) 124 125 cursor_field_name = stream.cursor_field 126 127 if cursor_field_name: 128 if not isinstance(cursor_field_name, str): 129 raise ValueError( 130 f"Cursor field type must be a string, but received {type(cursor_field_name).__name__}." 131 ) 132 133 return ConcurrentCursor( 134 stream.name, 135 stream.namespace, 136 state_manager.get_stream_state(stream.name, stream.namespace), 137 self.message_repository, # type: ignore[arg-type] # _default_message_repository will be returned in the worst case 138 state_manager, 139 converter, 140 CursorField(cursor_field_name), 141 slice_boundary_fields, 142 start, 143 end_provider, 144 lookback_window, 145 slice_range, 146 ) 147 148 return None
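As a rough illustration (not taken from the CDK itself) of how `initialize_cursor()` and `convert_to_concurrent_stream()` might be wired together inside a source's `streams()` method. `UsersStream`, `self._state`, the slice boundary fields, and the start/end values are all placeholders, and the converter choice is only one possibility:

```python
import logging
from datetime import datetime, timezone
from typing import Any, List, Mapping

from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    EpochValueConcurrentStreamStateConverter,
)


def streams(self, config: Mapping[str, Any]) -> List[Stream]:
    # `self` is assumed to be a ConcurrentSourceAdapter subclass.
    state_manager = ConnectorStateManager(state=self._state)  # hypothetical state handle
    users = UsersStream(config)  # hypothetical regular Stream
    cursor = self.initialize_cursor(
        users,
        state_manager,
        EpochValueConcurrentStreamStateConverter(),
        slice_boundary_fields=("start", "end"),
        start=datetime(2024, 1, 1, tzinfo=timezone.utc),
        end_provider=lambda: datetime.now(timezone.utc),
    )
    # If the stream has no cursor field, cursor is None and a FinalStateCursor is used instead.
    return [self.convert_to_concurrent_stream(logging.getLogger("airbyte"), users, state_manager, cursor)]
```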
Inherited Members
51class Cursor(StreamSlicer, ABC): 52 @property 53 @abstractmethod 54 def state(self) -> MutableMapping[str, Any]: ... 55 56 @abstractmethod 57 def observe(self, record: Record) -> None: 58 """ 59 Indicate to the cursor that the record has been emitted 60 """ 61 raise NotImplementedError() 62 63 @abstractmethod 64 def close_partition(self, partition: Partition) -> None: 65 """ 66 Indicate to the cursor that the partition has been successfully processed 67 """ 68 raise NotImplementedError() 69 70 @abstractmethod 71 def ensure_at_least_one_state_emitted(self) -> None: 72 """ 73 State messages are emitted when a partition is closed. However, the platform expects at least one state to be emitted per sync per 74 stream. Hence, if no partitions are generated, this method needs to be called. 75 """ 76 raise NotImplementedError() 77 78 def stream_slices(self) -> Iterable[StreamSlice]: 79 """ 80 Default placeholder implementation of generate_slices. 81 Subclasses can override this method to provide actual behavior. 82 """ 83 yield StreamSlice(partition={}, cursor_slice={})
Slices the stream into chunks that can be fetched independently. Slices enable state checkpointing and data retrieval parallelization.
56 @abstractmethod 57 def observe(self, record: Record) -> None: 58 """ 59 Indicate to the cursor that the record has been emitted 60 """ 61 raise NotImplementedError()
Indicate to the cursor that the record has been emitted
63 @abstractmethod 64 def close_partition(self, partition: Partition) -> None: 65 """ 66 Indicate to the cursor that the partition has been successfully processed 67 """ 68 raise NotImplementedError()
Indicate to the cursor that the partition has been successfully processed
70 @abstractmethod 71 def ensure_at_least_one_state_emitted(self) -> None: 72 """ 73 State messages are emitted when a partition is closed. However, the platform expects at least one state to be emitted per sync per 74 stream. Hence, if no partitions are generated, this method needs to be called. 75 """ 76 raise NotImplementedError()
State messages are emitted when a partition is closed. However, the platform expects at least one state to be emitted per sync per stream. Hence, if no partitions are generated, this method needs to be called.
78 def stream_slices(self) -> Iterable[StreamSlice]: 79 """ 80 Default placeholder implementation of generate_slices. 81 Subclasses can override this method to provide actual behavior. 82 """ 83 yield StreamSlice(partition={}, cursor_slice={})
Default placeholder implementation of generate_slices. Subclasses can override this method to provide actual behavior.
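A minimal, hedged Cursor sketch that only satisfies the abstract interface above; real cursors such as ConcurrentCursor also persist state through a ConnectorStateManager and a MessageRepository. Import paths are assumptions and may differ slightly between CDK versions:

```python
from typing import Any, MutableMapping

from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
from airbyte_cdk.sources.types import Record  # Record has moved between modules across CDK versions


class NoopCursor(Cursor):
    def __init__(self) -> None:
        self._state: MutableMapping[str, Any] = {}

    @property
    def state(self) -> MutableMapping[str, Any]:
        return self._state

    def observe(self, record: Record) -> None:
        # A real cursor would compare the record's cursor value against the current state here.
        pass

    def close_partition(self, partition: Partition) -> None:
        # A real cursor would checkpoint state for the closed partition here.
        pass

    def ensure_at_least_one_state_emitted(self) -> None:
        # A real implementation would emit a state message through a MessageRepository.
        pass
```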
40class CursorField: 41 def __init__(self, cursor_field_key: str) -> None: 42 self.cursor_field_key = cursor_field_key 43 44 def extract_value(self, record: Record) -> CursorValueType: 45 cursor_value = record.data.get(self.cursor_field_key) 46 if cursor_value is None: 47 raise ValueError(f"Could not find cursor field {self.cursor_field_key} in record") 48 return cursor_value # type: ignore # we assume that the value the path points at is a comparable
44 def extract_value(self, record: Record) -> CursorValueType: 45 cursor_value = record.data.get(self.cursor_field_key) 46 if cursor_value is None: 47 raise ValueError(f"Could not find cursor field {self.cursor_field_key} in record") 48 return cursor_value # type: ignore # we assume that the value the path points at is a comparable
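A small usage sketch: `extract_value()` only reads the record's `.data` mapping, so a stand-in record object is enough to demonstrate it (the `_FakeRecord` class and field values below are illustrative, not CDK types):

```python
from dataclasses import dataclass
from typing import Any, Mapping

from airbyte_cdk.sources.streams.concurrent.cursor import CursorField


@dataclass
class _FakeRecord:
    # Minimal stand-in for the CDK's Record: extract_value() only reads `.data`.
    data: Mapping[str, Any]


cursor_field = CursorField("updated_at")
print(cursor_field.extract_value(_FakeRecord(data={"id": 1, "updated_at": 1617030403})))  # 1617030403
# A record missing the cursor field raises ValueError:
# cursor_field.extract_value(_FakeRecord(data={"id": 2}))
```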
115class EpochValueConcurrentStreamStateConverter(DateTimeStreamStateConverter): 116 """ 117 e.g. 118 { "created": 1617030403 } 119 => 120 { 121 "state_type": "date-range", 122 "metadata": { … }, 123 "slices": [ 124 {starts: 0, end: 1617030403, finished_processing: true} 125 ] 126 } 127 """ 128 129 _zero_value = 0 130 131 def increment(self, timestamp: datetime) -> datetime: 132 return timestamp + timedelta(seconds=1) 133 134 def output_format(self, timestamp: datetime) -> int: 135 return int(timestamp.timestamp()) 136 137 def parse_timestamp(self, timestamp: int) -> datetime: 138 dt_object = AirbyteDateTime.fromtimestamp(timestamp, timezone.utc) 139 if not isinstance(dt_object, AirbyteDateTime): 140 raise ValueError( 141 f"AirbyteDateTime object was expected but got {type(dt_object)} from AirbyteDateTime.fromtimestamp({timestamp})" 142 ) 143 return dt_object
e.g. { "created": 1617030403 } => { "state_type": "date-range", "metadata": { … }, "slices": [ {starts: 0, end: 1617030403, finished_processing: true} ] }
131 def increment(self, timestamp: datetime) -> datetime: 132 return timestamp + timedelta(seconds=1)
Increment a timestamp by a single unit.
Convert the cursor value type to a JSON valid type.
137 def parse_timestamp(self, timestamp: int) -> datetime: 138 dt_object = AirbyteDateTime.fromtimestamp(timestamp, timezone.utc) 139 if not isinstance(dt_object, AirbyteDateTime): 140 raise ValueError( 141 f"AirbyteDateTime object was expected but got {type(dt_object)} from AirbyteDateTime.fromtimestamp({timestamp})" 142 ) 143 return dt_object
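A quick round-trip sketch of the three helpers shown above; the import path is an assumption about the current module layout:

```python
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    EpochValueConcurrentStreamStateConverter,
)

converter = EpochValueConcurrentStreamStateConverter()
dt = converter.parse_timestamp(1617030403)                 # timezone-aware datetime (UTC)
print(converter.output_format(dt))                         # 1617030403
print(converter.output_format(converter.increment(dt)))    # 1617030404 (one second later)
```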
Inherited Members
86class FinalStateCursor(Cursor): 87 """Cursor that is used to guarantee at least one state message is emitted for a concurrent stream.""" 88 89 def __init__( 90 self, 91 stream_name: str, 92 stream_namespace: Optional[str], 93 message_repository: MessageRepository, 94 ) -> None: 95 self._stream_name = stream_name 96 self._stream_namespace = stream_namespace 97 self._message_repository = message_repository 98 # Normally the connector state manager operates at the source-level. However, we only need it to write the sentinel 99 # state message rather than manage overall source state. This is also only temporary as we move to the resumable 100 # full refresh world where every stream uses a FileBasedConcurrentCursor with incremental state. 101 self._connector_state_manager = ConnectorStateManager() 102 self._has_closed_at_least_one_slice = False 103 104 @property 105 def state(self) -> MutableMapping[str, Any]: 106 return {NO_CURSOR_STATE_KEY: True} 107 108 def observe(self, record: Record) -> None: 109 pass 110 111 def close_partition(self, partition: Partition) -> None: 112 pass 113 114 def ensure_at_least_one_state_emitted(self) -> None: 115 """ 116 Used primarily for full refresh syncs that do not have a valid cursor value to emit at the end of a sync 117 """ 118 119 self._connector_state_manager.update_state_for_stream( 120 self._stream_name, self._stream_namespace, self.state 121 ) 122 state_message = self._connector_state_manager.create_state_message( 123 self._stream_name, self._stream_namespace 124 ) 125 self._message_repository.emit_message(state_message)
Cursor that is used to guarantee at least one state message is emitted for a concurrent stream.
89 def __init__( 90 self, 91 stream_name: str, 92 stream_namespace: Optional[str], 93 message_repository: MessageRepository, 94 ) -> None: 95 self._stream_name = stream_name 96 self._stream_namespace = stream_namespace 97 self._message_repository = message_repository 98 # Normally the connector state manager operates at the source-level. However, we only need it to write the sentinel 99 # state message rather than manage overall source state. This is also only temporary as we move to the resumable 100 # full refresh world where every stream uses a FileBasedConcurrentCursor with incremental state. 101 self._connector_state_manager = ConnectorStateManager() 102 self._has_closed_at_least_one_slice = False
Indicate to the cursor that the partition has been successfully processed
114 def ensure_at_least_one_state_emitted(self) -> None: 115 """ 116 Used primarily for full refresh syncs that do not have a valid cursor value to emit at the end of a sync 117 """ 118 119 self._connector_state_manager.update_state_for_stream( 120 self._stream_name, self._stream_namespace, self.state 121 ) 122 state_message = self._connector_state_manager.create_state_message( 123 self._stream_name, self._stream_namespace 124 ) 125 self._message_repository.emit_message(state_message)
Used primarily for full refresh syncs that do not have a valid cursor value to emit at the end of a sync
Inherited Members
146class IsoMillisConcurrentStreamStateConverter(DateTimeStreamStateConverter): 147 """ 148 e.g. 149 { "created": "2021-01-18T21:18:20.000Z" } 150 => 151 { 152 "state_type": "date-range", 153 "metadata": { … }, 154 "slices": [ 155 {starts: "2020-01-18T21:18:20.000Z", end: "2021-01-18T21:18:20.000Z", finished_processing: true} 156 ] 157 } 158 """ 159 160 _zero_value = "0001-01-01T00:00:00.000Z" 161 162 def __init__( 163 self, is_sequential_state: bool = True, cursor_granularity: Optional[timedelta] = None 164 ): 165 super().__init__(is_sequential_state=is_sequential_state) 166 self._cursor_granularity = cursor_granularity or timedelta(milliseconds=1) 167 168 def increment(self, timestamp: datetime) -> datetime: 169 return timestamp + self._cursor_granularity 170 171 def output_format(self, timestamp: datetime) -> str: 172 """Format datetime with milliseconds always included. 173 174 Args: 175 timestamp: The datetime to format. 176 177 Returns: 178 str: ISO8601/RFC3339 formatted string with milliseconds. 179 """ 180 dt = AirbyteDateTime.from_datetime(timestamp) 181 # Always include milliseconds, even if zero 182 millis = dt.microsecond // 1000 if dt.microsecond else 0 183 return f"{dt.year:04d}-{dt.month:02d}-{dt.day:02d}T{dt.hour:02d}:{dt.minute:02d}:{dt.second:02d}.{millis:03d}Z" 184 185 def parse_timestamp(self, timestamp: str) -> datetime: 186 dt_object = ab_datetime_parse(timestamp) 187 if not isinstance(dt_object, AirbyteDateTime): 188 raise ValueError( 189 f"AirbyteDateTime object was expected but got {type(dt_object)} from parse({timestamp})" 190 ) 191 return dt_object
e.g. { "created": "2021-01-18T21:18:20.000Z" } => { "state_type": "date-range", "metadata": { … }, "slices": [ {starts: "2020-01-18T21:18:20.000Z", end: "2021-01-18T21:18:20.000Z", finished_processing: true} ] }
168 def increment(self, timestamp: datetime) -> datetime: 169 return timestamp + self._cursor_granularity
Increment a timestamp by a single unit.
171 def output_format(self, timestamp: datetime) -> str: 172 """Format datetime with milliseconds always included. 173 174 Args: 175 timestamp: The datetime to format. 176 177 Returns: 178 str: ISO8601/RFC3339 formatted string with milliseconds. 179 """ 180 dt = AirbyteDateTime.from_datetime(timestamp) 181 # Always include milliseconds, even if zero 182 millis = dt.microsecond // 1000 if dt.microsecond else 0 183 return f"{dt.year:04d}-{dt.month:02d}-{dt.day:02d}T{dt.hour:02d}:{dt.minute:02d}:{dt.second:02d}.{millis:03d}Z"
Format datetime with milliseconds always included.
Arguments:
- timestamp: The datetime to format.
Returns:
str: ISO8601/RFC3339 formatted string with milliseconds.
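A short sketch of the millisecond formatting described above; the import path is an assumption and the timestamps are arbitrary examples:

```python
from datetime import datetime, timezone

from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    IsoMillisConcurrentStreamStateConverter,
)

converter = IsoMillisConcurrentStreamStateConverter()
print(converter.output_format(datetime(2021, 1, 18, 21, 18, 20, 123000, tzinfo=timezone.utc)))
# "2021-01-18T21:18:20.123Z"
print(converter.output_format(datetime(2021, 1, 18, 21, 18, 20, tzinfo=timezone.utc)))
# "2021-01-18T21:18:20.000Z" - milliseconds are always included, even when zero
```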
Inherited Members
54@deprecated( 55 "This class is experimental. Use at your own risk.", 56 category=ExperimentalClassWarning, 57) 58class StreamFacade(AbstractStreamFacade[DefaultStream], Stream): 59 """ 60 The StreamFacade is a Stream that wraps an AbstractStream and exposes it as a Stream. 61 62 All methods either delegate to the wrapped AbstractStream or provide a default implementation. 63 The default implementations define restrictions imposed on Streams migrated to the new interface. For instance, only source-defined cursors are supported. 64 """ 65 66 @classmethod 67 def create_from_stream( 68 cls, 69 stream: Stream, 70 source: AbstractSource, 71 logger: logging.Logger, 72 state: Optional[MutableMapping[str, Any]], 73 cursor: Cursor, 74 ) -> Stream: 75 """ 76 Create a ConcurrentStream from a Stream object. 77 :param source: The source 78 :param stream: The stream 79 :param max_workers: The maximum number of worker thread to use 80 :return: 81 """ 82 pk = get_primary_key_from_stream(stream.primary_key) 83 cursor_field = get_cursor_field_from_stream(stream) 84 85 if not source.message_repository: 86 raise ValueError( 87 "A message repository is required to emit non-record messages. Please set the message repository on the source." 88 ) 89 90 message_repository = source.message_repository 91 return StreamFacade( 92 DefaultStream( 93 partition_generator=StreamPartitionGenerator( 94 stream, 95 message_repository, 96 SyncMode.full_refresh 97 if isinstance(cursor, FinalStateCursor) 98 else SyncMode.incremental, 99 [cursor_field] if cursor_field is not None else None, 100 state, 101 ), 102 name=stream.name, 103 namespace=stream.namespace, 104 json_schema=stream.get_json_schema(), 105 availability_strategy=AlwaysAvailableAvailabilityStrategy(), 106 primary_key=pk, 107 cursor_field=cursor_field, 108 logger=logger, 109 cursor=cursor, 110 ), 111 stream, 112 cursor, 113 slice_logger=source._slice_logger, 114 logger=logger, 115 ) 116 117 @property 118 def state(self) -> MutableMapping[str, Any]: 119 raise NotImplementedError( 120 "This should not be called as part of the Concurrent CDK code. 
Please report the problem to Airbyte" 121 ) 122 123 @state.setter 124 def state(self, value: Mapping[str, Any]) -> None: 125 if "state" in dir(self._legacy_stream): 126 self._legacy_stream.state = value # type: ignore # validating `state` is attribute of stream using `if` above 127 128 def __init__( 129 self, 130 stream: DefaultStream, 131 legacy_stream: Stream, 132 cursor: Cursor, 133 slice_logger: SliceLogger, 134 logger: logging.Logger, 135 ): 136 """ 137 :param stream: The underlying AbstractStream 138 """ 139 self._abstract_stream = stream 140 self._legacy_stream = legacy_stream 141 self._cursor = cursor 142 self._slice_logger = slice_logger 143 self._logger = logger 144 145 def read( 146 self, 147 configured_stream: ConfiguredAirbyteStream, 148 logger: logging.Logger, 149 slice_logger: SliceLogger, 150 stream_state: MutableMapping[str, Any], 151 state_manager: ConnectorStateManager, 152 internal_config: InternalConfig, 153 ) -> Iterable[StreamData]: 154 yield from self._read_records() 155 156 def read_records( 157 self, 158 sync_mode: SyncMode, 159 cursor_field: Optional[List[str]] = None, 160 stream_slice: Optional[Mapping[str, Any]] = None, 161 stream_state: Optional[Mapping[str, Any]] = None, 162 ) -> Iterable[StreamData]: 163 try: 164 yield from self._read_records() 165 except Exception as exc: 166 if hasattr(self._cursor, "state"): 167 state = str(self._cursor.state) 168 else: 169 # This shouldn't happen if the ConcurrentCursor was used 170 state = "unknown; no state attribute was available on the cursor" 171 yield AirbyteMessage( 172 type=Type.LOG, 173 log=AirbyteLogMessage( 174 level=Level.ERROR, message=f"Cursor State at time of exception: {state}" 175 ), 176 ) 177 raise exc 178 179 def _read_records(self) -> Iterable[StreamData]: 180 for partition in self._abstract_stream.generate_partitions(): 181 if self._slice_logger.should_log_slice_message(self._logger): 182 yield self._slice_logger.create_slice_log_message(partition.to_slice()) 183 for record in partition.read(): 184 yield record.data 185 186 @property 187 def name(self) -> str: 188 return self._abstract_stream.name 189 190 @property 191 def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: 192 # This method is not expected to be called directly. It is only implemented for backward compatibility with the old interface 193 return self.as_airbyte_stream().source_defined_primary_key # type: ignore # source_defined_primary_key is known to be an Optional[List[List[str]]] 194 195 @property 196 def cursor_field(self) -> Union[str, List[str]]: 197 if self._abstract_stream.cursor_field is None: 198 return [] 199 else: 200 return self._abstract_stream.cursor_field 201 202 @property 203 def cursor(self) -> Optional[Cursor]: # type: ignore[override] # StreamFaced expects to use only airbyte_cdk.sources.streams.concurrent.cursor.Cursor 204 return self._cursor 205 206 @lru_cache(maxsize=None) 207 def get_json_schema(self) -> Mapping[str, Any]: 208 return self._abstract_stream.get_json_schema() 209 210 @property 211 def supports_incremental(self) -> bool: 212 return self._legacy_stream.supports_incremental 213 214 def check_availability( 215 self, logger: logging.Logger, source: Optional["Source"] = None 216 ) -> Tuple[bool, Optional[str]]: 217 """ 218 Verifies the stream is available. 
Delegates to the underlying AbstractStream and ignores the parameters 219 :param logger: (ignored) 220 :param source: (ignored) 221 :return: 222 """ 223 availability = self._abstract_stream.check_availability() 224 return availability.is_available(), availability.message() 225 226 def as_airbyte_stream(self) -> AirbyteStream: 227 return self._abstract_stream.as_airbyte_stream() 228 229 def log_stream_sync_configuration(self) -> None: 230 self._abstract_stream.log_stream_sync_configuration() 231 232 def get_underlying_stream(self) -> DefaultStream: 233 return self._abstract_stream
The StreamFacade is a Stream that wraps an AbstractStream and exposes it as a Stream.
All methods either delegate to the wrapped AbstractStream or provide a default implementation. The default implementations define restrictions imposed on Streams migrated to the new interface. For instance, only source-defined cursors are supported.
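An illustrative helper (not part of the CDK) showing one way a regular Stream might be wrapped for concurrent processing via `create_from_stream()`; the full-refresh-style `FinalStateCursor` and empty state are assumptions for the sketch:

```python
import logging

from airbyte_cdk.sources import AbstractSource
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor


def wrap_for_concurrency(stream: Stream, source: AbstractSource) -> Stream:
    # Full-refresh style wrapping: FinalStateCursor only guarantees a final state message.
    cursor = FinalStateCursor(
        stream_name=stream.name,
        stream_namespace=stream.namespace,
        message_repository=source.message_repository,
    )
    return StreamFacade.create_from_stream(
        stream=stream,
        source=source,
        logger=logging.getLogger("airbyte"),
        state={},  # no prior state for a full refresh
        cursor=cursor,
    )
```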
128 def __init__( 129 self, 130 stream: DefaultStream, 131 legacy_stream: Stream, 132 cursor: Cursor, 133 slice_logger: SliceLogger, 134 logger: logging.Logger, 135 ): 136 """ 137 :param stream: The underlying AbstractStream 138 """ 139 self._abstract_stream = stream 140 self._legacy_stream = legacy_stream 141 self._cursor = cursor 142 self._slice_logger = slice_logger 143 self._logger = logger
Parameters
- stream: The underlying AbstractStream
66 @classmethod 67 def create_from_stream( 68 cls, 69 stream: Stream, 70 source: AbstractSource, 71 logger: logging.Logger, 72 state: Optional[MutableMapping[str, Any]], 73 cursor: Cursor, 74 ) -> Stream: 75 """ 76 Create a ConcurrentStream from a Stream object. 77 :param source: The source 78 :param stream: The stream 79 :param max_workers: The maximum number of worker thread to use 80 :return: 81 """ 82 pk = get_primary_key_from_stream(stream.primary_key) 83 cursor_field = get_cursor_field_from_stream(stream) 84 85 if not source.message_repository: 86 raise ValueError( 87 "A message repository is required to emit non-record messages. Please set the message repository on the source." 88 ) 89 90 message_repository = source.message_repository 91 return StreamFacade( 92 DefaultStream( 93 partition_generator=StreamPartitionGenerator( 94 stream, 95 message_repository, 96 SyncMode.full_refresh 97 if isinstance(cursor, FinalStateCursor) 98 else SyncMode.incremental, 99 [cursor_field] if cursor_field is not None else None, 100 state, 101 ), 102 name=stream.name, 103 namespace=stream.namespace, 104 json_schema=stream.get_json_schema(), 105 availability_strategy=AlwaysAvailableAvailabilityStrategy(), 106 primary_key=pk, 107 cursor_field=cursor_field, 108 logger=logger, 109 cursor=cursor, 110 ), 111 stream, 112 cursor, 113 slice_logger=source._slice_logger, 114 logger=logger, 115 )
Create a ConcurrentStream from a Stream object.
Parameters
- source: The source
- stream: The stream
- max_workers: The maximum number of worker threads to use
Returns
214 def check_availability( 215 self, logger: logging.Logger, source: Optional["Source"] = None 216 ) -> Tuple[bool, Optional[str]]: 217 """ 218 Verifies the stream is available. Delegates to the underlying AbstractStream and ignores the parameters 219 :param logger: (ignored) 220 :param source: (ignored) 221 :return: 222 """ 223 availability = self._abstract_stream.check_availability() 224 return availability.is_available(), availability.message()
Verifies the stream is available. Delegates to the underlying AbstractStream and ignores the parameters
Parameters
- logger: (ignored)
- source: (ignored)
Returns
Return the underlying stream facade object.
Inherited Members
- Stream
- logger
- transformer
- cursor
- has_multiple_slices
- name
- get_error_display_message
- read
- read_only_records
- read_records
- get_json_schema
- as_airbyte_stream
- supports_incremental
- is_resumable
- cursor_field
- namespace
- source_defined_cursor
- exit_on_rate_limit
- primary_key
- stream_slices
- state_checkpoint_interval
- get_updated_state
- get_cursor
- log_stream_sync_configuration
- configured_json_schema
99def create_connector_config_control_message(config: MutableMapping[str, Any]) -> AirbyteMessage: 100 control_message = AirbyteControlMessage( 101 type=OrchestratorType.CONNECTOR_CONFIG, 102 emitted_at=time.time() * 1000, 103 connectorConfig=AirbyteControlConnectorConfigMessage(config=config), 104 ) 105 return AirbyteMessage(type=Type.CONTROL, control=control_message)
90def emit_configuration_as_airbyte_control_message(config: MutableMapping[str, Any]) -> None: 91 """ 92 WARNING: deprecated - emit_configuration_as_airbyte_control_message is being deprecated in favor of the MessageRepository mechanism. 93 See the airbyte_cdk.sources.message package 94 """ 95 airbyte_message = create_connector_config_control_message(config) 96 print(orjson.dumps(AirbyteMessageSerializer.dump(airbyte_message)).decode())
WARNING: deprecated - emit_configuration_as_airbyte_control_message is being deprecated in favor of the MessageRepository mechanism. See the airbyte_cdk.sources.message package
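A hedged sketch of the non-deprecated path: build the control message with `create_connector_config_control_message` and emit it yourself (ideally through the source's MessageRepository). The config contents are illustrative, and import locations may vary slightly by CDK version:

```python
import orjson

from airbyte_cdk.config_observation import create_connector_config_control_message
from airbyte_cdk.models import AirbyteMessageSerializer

updated_config = {"credentials": {"access_token": "<rotated token>"}}  # illustrative
control_message = create_connector_config_control_message(updated_config)

# Equivalent to what the deprecated helper prints; prefer routing this through the
# source's MessageRepository so it is emitted alongside the regular message stream.
print(orjson.dumps(AirbyteMessageSerializer.dump(control_message)).decode())
```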
53class AbstractSource(Source, ABC): 54 """ 55 Abstract base class for an Airbyte Source. Consumers should implement any abstract methods 56 in this class to create an Airbyte Specification compliant Source. 57 """ 58 59 @abstractmethod 60 def check_connection( 61 self, logger: logging.Logger, config: Mapping[str, Any] 62 ) -> Tuple[bool, Optional[Any]]: 63 """ 64 :param logger: source logger 65 :param config: The user-provided configuration as specified by the source's spec. 66 This usually contains information required to check connection e.g. tokens, secrets and keys etc. 67 :return: A tuple of (boolean, error). If boolean is true, then the connection check is successful 68 and we can connect to the underlying data source using the provided configuration. 69 Otherwise, the input config cannot be used to connect to the underlying data source, 70 and the "error" object should describe what went wrong. 71 The error object will be cast to string to display the problem to the user. 72 """ 73 74 @abstractmethod 75 def streams(self, config: Mapping[str, Any]) -> List[Stream]: 76 """ 77 :param config: The user-provided configuration as specified by the source's spec. 78 Any stream construction related operation should happen here. 79 :return: A list of the streams in this source connector. 80 """ 81 82 # Stream name to instance map for applying output object transformation 83 _stream_to_instance_map: Dict[str, Stream] = {} 84 _slice_logger: SliceLogger = DebugSliceLogger() 85 86 def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog: 87 """Implements the Discover operation from the Airbyte Specification. 88 See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#discover. 89 """ 90 streams = [stream.as_airbyte_stream() for stream in self.streams(config=config)] 91 return AirbyteCatalog(streams=streams) 92 93 def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus: 94 """Implements the Check Connection operation from the Airbyte Specification. 95 See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#check. 96 """ 97 check_succeeded, error = self.check_connection(logger, config) 98 if not check_succeeded: 99 return AirbyteConnectionStatus(status=Status.FAILED, message=repr(error)) 100 return AirbyteConnectionStatus(status=Status.SUCCEEDED) 101 102 def read( 103 self, 104 logger: logging.Logger, 105 config: Mapping[str, Any], 106 catalog: ConfiguredAirbyteCatalog, 107 state: Optional[List[AirbyteStateMessage]] = None, 108 ) -> Iterator[AirbyteMessage]: 109 """Implements the Read operation from the Airbyte Specification. 
See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/.""" 110 logger.info(f"Starting syncing {self.name}") 111 config, internal_config = split_config(config) 112 # TODO assert all streams exist in the connector 113 # get the streams once in case the connector needs to make any queries to generate them 114 stream_instances = {s.name: s for s in self.streams(config)} 115 state_manager = ConnectorStateManager(state=state) 116 self._stream_to_instance_map = stream_instances 117 118 stream_name_to_exception: MutableMapping[str, AirbyteTracedException] = {} 119 120 with create_timer(self.name) as timer: 121 for configured_stream in catalog.streams: 122 stream_instance = stream_instances.get(configured_stream.stream.name) 123 is_stream_exist = bool(stream_instance) 124 try: 125 # Used direct reference to `stream_instance` instead of `is_stream_exist` to avoid mypy type checking errors 126 if not stream_instance: 127 if not self.raise_exception_on_missing_stream: 128 yield stream_status_as_airbyte_message( 129 configured_stream.stream, AirbyteStreamStatus.INCOMPLETE 130 ) 131 continue 132 133 error_message = ( 134 f"The stream '{configured_stream.stream.name}' in your connection configuration was not found in the source. " 135 f"Refresh the schema in your replication settings and remove this stream from future sync attempts." 136 ) 137 138 # Use configured_stream as stream_instance to support references in error handling. 139 stream_instance = configured_stream.stream 140 141 raise AirbyteTracedException( 142 message="A stream listed in your configuration was not found in the source. Please check the logs for more " 143 "details.", 144 internal_message=error_message, 145 failure_type=FailureType.config_error, 146 ) 147 148 timer.start_event(f"Syncing stream {configured_stream.stream.name}") 149 logger.info(f"Marking stream {configured_stream.stream.name} as STARTED") 150 yield stream_status_as_airbyte_message( 151 configured_stream.stream, AirbyteStreamStatus.STARTED 152 ) 153 yield from self._read_stream( 154 logger=logger, 155 stream_instance=stream_instance, 156 configured_stream=configured_stream, 157 state_manager=state_manager, 158 internal_config=internal_config, 159 ) 160 logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED") 161 yield stream_status_as_airbyte_message( 162 configured_stream.stream, AirbyteStreamStatus.COMPLETE 163 ) 164 165 except Exception as e: 166 yield from self._emit_queued_messages() 167 logger.exception( 168 f"Encountered an exception while reading stream {configured_stream.stream.name}" 169 ) 170 logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED") 171 yield stream_status_as_airbyte_message( 172 configured_stream.stream, AirbyteStreamStatus.INCOMPLETE 173 ) 174 175 stream_descriptor = StreamDescriptor(name=configured_stream.stream.name) 176 177 if isinstance(e, AirbyteTracedException): 178 traced_exception = e 179 info_message = f"Stopping sync on error from stream {configured_stream.stream.name} because {self.name} does not support continuing syncs on error." 
180 else: 181 traced_exception = self._serialize_exception( 182 stream_descriptor, e, stream_instance=stream_instance 183 ) 184 info_message = f"{self.name} does not support continuing syncs on error from stream {configured_stream.stream.name}" 185 186 yield traced_exception.as_sanitized_airbyte_message( 187 stream_descriptor=stream_descriptor 188 ) 189 stream_name_to_exception[stream_instance.name] = traced_exception # type: ignore # use configured_stream if stream_instance is None 190 if self.stop_sync_on_stream_failure: 191 logger.info(info_message) 192 break 193 finally: 194 # Finish read event only if the stream instance exists; 195 # otherwise, there's no need as it never started 196 if is_stream_exist: 197 timer.finish_event() 198 logger.info(f"Finished syncing {configured_stream.stream.name}") 199 logger.info(timer.report()) 200 201 if len(stream_name_to_exception) > 0: 202 error_message = generate_failed_streams_error_message( 203 {key: [value] for key, value in stream_name_to_exception.items()} 204 ) 205 logger.info(error_message) 206 # We still raise at least one exception when a stream raises an exception because the platform currently relies 207 # on a non-zero exit code to determine if a sync attempt has failed. We also raise the exception as a config_error 208 # type because this combined error isn't actionable, but rather the previously emitted individual errors. 209 raise AirbyteTracedException( 210 message=error_message, failure_type=FailureType.config_error 211 ) 212 logger.info(f"Finished syncing {self.name}") 213 214 @staticmethod 215 def _serialize_exception( 216 stream_descriptor: StreamDescriptor, e: Exception, stream_instance: Optional[Stream] = None 217 ) -> AirbyteTracedException: 218 display_message = stream_instance.get_error_display_message(e) if stream_instance else None 219 if display_message: 220 return AirbyteTracedException.from_exception( 221 e, message=display_message, stream_descriptor=stream_descriptor 222 ) 223 return AirbyteTracedException.from_exception(e, stream_descriptor=stream_descriptor) 224 225 @property 226 def raise_exception_on_missing_stream(self) -> bool: 227 return False 228 229 def _read_stream( 230 self, 231 logger: logging.Logger, 232 stream_instance: Stream, 233 configured_stream: ConfiguredAirbyteStream, 234 state_manager: ConnectorStateManager, 235 internal_config: InternalConfig, 236 ) -> Iterator[AirbyteMessage]: 237 if internal_config.page_size and isinstance(stream_instance, HttpStream): 238 logger.info( 239 f"Setting page size for {stream_instance.name} to {internal_config.page_size}" 240 ) 241 stream_instance.page_size = internal_config.page_size 242 logger.debug( 243 f"Syncing configured stream: {configured_stream.stream.name}", 244 extra={ 245 "sync_mode": configured_stream.sync_mode, 246 "primary_key": configured_stream.primary_key, 247 "cursor_field": configured_stream.cursor_field, 248 }, 249 ) 250 stream_instance.log_stream_sync_configuration() 251 252 stream_name = configured_stream.stream.name 253 stream_state = state_manager.get_stream_state(stream_name, stream_instance.namespace) 254 255 # This is a hack. Existing full refresh streams that are converted into resumable full refresh need to discard 256 # the state because the terminal state for a full refresh sync is not compatible with substream resumable full 257 # refresh state. 
This is only required when running live traffic regression testing since the platform normally 258 # handles whether to pass state 259 if stream_state == {"__ab_no_cursor_state_message": True}: 260 stream_state = {} 261 262 if "state" in dir(stream_instance): 263 stream_instance.state = stream_state # type: ignore # we check that state in the dir(stream_instance) 264 logger.info(f"Setting state of {self.name} stream to {stream_state}") 265 266 record_iterator = stream_instance.read( 267 configured_stream, 268 logger, 269 self._slice_logger, 270 stream_state, 271 state_manager, 272 internal_config, 273 ) 274 275 record_counter = 0 276 logger.info(f"Syncing stream: {stream_name} ") 277 for record_data_or_message in record_iterator: 278 record = self._get_message(record_data_or_message, stream_instance) 279 if record.type == MessageType.RECORD: 280 record_counter += 1 281 if record_counter == 1: 282 logger.info(f"Marking stream {stream_name} as RUNNING") 283 # If we just read the first record of the stream, emit the transition to the RUNNING state 284 yield stream_status_as_airbyte_message( 285 configured_stream.stream, AirbyteStreamStatus.RUNNING 286 ) 287 yield from self._emit_queued_messages() 288 yield record 289 290 logger.info(f"Read {record_counter} records from {stream_name} stream") 291 292 def _emit_queued_messages(self) -> Iterable[AirbyteMessage]: 293 if self.message_repository: 294 yield from self.message_repository.consume_queue() 295 return 296 297 def _get_message( 298 self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream 299 ) -> AirbyteMessage: 300 """ 301 Converts the input to an AirbyteMessage if it is a StreamData. Returns the input as is if it is already an AirbyteMessage 302 """ 303 match record_data_or_message: 304 case AirbyteMessage(): 305 return record_data_or_message 306 case _: 307 return stream_data_to_airbyte_message( 308 stream.name, 309 record_data_or_message, 310 stream.transformer, 311 stream.get_json_schema(), 312 ) 313 314 @property 315 def message_repository(self) -> Union[None, MessageRepository]: 316 return _default_message_repository 317 318 @property 319 def stop_sync_on_stream_failure(self) -> bool: 320 """ 321 WARNING: This function is in-development which means it is subject to change. Use at your own risk. 322 323 By default, when a source encounters an exception while syncing a stream, it will emit an error trace message and then 324 continue syncing the next stream. This can be overwritten on a per-source basis so that the source will stop the sync 325 on the first error seen and emit a single error trace message for that stream. 326 """ 327 return False
Abstract base class for an Airbyte Source. Consumers should implement any abstract methods in this class to create an Airbyte Specification compliant Source.
59 @abstractmethod 60 def check_connection( 61 self, logger: logging.Logger, config: Mapping[str, Any] 62 ) -> Tuple[bool, Optional[Any]]: 63 """ 64 :param logger: source logger 65 :param config: The user-provided configuration as specified by the source's spec. 66 This usually contains information required to check connection e.g. tokens, secrets and keys etc. 67 :return: A tuple of (boolean, error). If boolean is true, then the connection check is successful 68 and we can connect to the underlying data source using the provided configuration. 69 Otherwise, the input config cannot be used to connect to the underlying data source, 70 and the "error" object should describe what went wrong. 71 The error object will be cast to string to display the problem to the user. 72 """
Parameters
- logger: source logger
- config: The user-provided configuration as specified by the source's spec. This usually contains information required to check connection e.g. tokens, secrets and keys etc.
Returns
A tuple of (boolean, error). If boolean is true, then the connection check is successful and we can connect to the underlying data source using the provided configuration. Otherwise, the input config cannot be used to connect to the underlying data source, and the "error" object should describe what went wrong. The error object will be cast to string to display the problem to the user.
74 @abstractmethod 75 def streams(self, config: Mapping[str, Any]) -> List[Stream]: 76 """ 77 :param config: The user-provided configuration as specified by the source's spec. 78 Any stream construction related operation should happen here. 79 :return: A list of the streams in this source connector. 80 """
Parameters
- config: The user-provided configuration as specified by the source's spec. Any stream construction related operation should happen here.
Returns
A list of the streams in this source connector.
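A minimal, hedged AbstractSource sketch implementing the two abstract methods documented above; `ExampleStream` and the `api_key` config field are hypothetical:

```python
import logging
from typing import Any, List, Mapping, Optional, Tuple

from airbyte_cdk.sources import AbstractSource
from airbyte_cdk.sources.streams import Stream


class SourceExample(AbstractSource):
    def check_connection(
        self, logger: logging.Logger, config: Mapping[str, Any]
    ) -> Tuple[bool, Optional[Any]]:
        # Validate credentials cheaply, e.g. by issuing a single authenticated request.
        if not config.get("api_key"):
            return False, "api_key is required"
        return True, None

    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
        # Construct every stream the connector exposes; discover() and read() build on this.
        return [ExampleStream(api_key=config["api_key"])]  # hypothetical stream class
```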
86 def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog: 87 """Implements the Discover operation from the Airbyte Specification. 88 See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#discover. 89 """ 90 streams = [stream.as_airbyte_stream() for stream in self.streams(config=config)] 91 return AirbyteCatalog(streams=streams)
Implements the Discover operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#discover.
93 def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus: 94 """Implements the Check Connection operation from the Airbyte Specification. 95 See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#check. 96 """ 97 check_succeeded, error = self.check_connection(logger, config) 98 if not check_succeeded: 99 return AirbyteConnectionStatus(status=Status.FAILED, message=repr(error)) 100 return AirbyteConnectionStatus(status=Status.SUCCEEDED)
Implements the Check Connection operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#check.
102 def read( 103 self, 104 logger: logging.Logger, 105 config: Mapping[str, Any], 106 catalog: ConfiguredAirbyteCatalog, 107 state: Optional[List[AirbyteStateMessage]] = None, 108 ) -> Iterator[AirbyteMessage]: 109 """Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/.""" 110 logger.info(f"Starting syncing {self.name}") 111 config, internal_config = split_config(config) 112 # TODO assert all streams exist in the connector 113 # get the streams once in case the connector needs to make any queries to generate them 114 stream_instances = {s.name: s for s in self.streams(config)} 115 state_manager = ConnectorStateManager(state=state) 116 self._stream_to_instance_map = stream_instances 117 118 stream_name_to_exception: MutableMapping[str, AirbyteTracedException] = {} 119 120 with create_timer(self.name) as timer: 121 for configured_stream in catalog.streams: 122 stream_instance = stream_instances.get(configured_stream.stream.name) 123 is_stream_exist = bool(stream_instance) 124 try: 125 # Used direct reference to `stream_instance` instead of `is_stream_exist` to avoid mypy type checking errors 126 if not stream_instance: 127 if not self.raise_exception_on_missing_stream: 128 yield stream_status_as_airbyte_message( 129 configured_stream.stream, AirbyteStreamStatus.INCOMPLETE 130 ) 131 continue 132 133 error_message = ( 134 f"The stream '{configured_stream.stream.name}' in your connection configuration was not found in the source. " 135 f"Refresh the schema in your replication settings and remove this stream from future sync attempts." 136 ) 137 138 # Use configured_stream as stream_instance to support references in error handling. 139 stream_instance = configured_stream.stream 140 141 raise AirbyteTracedException( 142 message="A stream listed in your configuration was not found in the source. Please check the logs for more " 143 "details.", 144 internal_message=error_message, 145 failure_type=FailureType.config_error, 146 ) 147 148 timer.start_event(f"Syncing stream {configured_stream.stream.name}") 149 logger.info(f"Marking stream {configured_stream.stream.name} as STARTED") 150 yield stream_status_as_airbyte_message( 151 configured_stream.stream, AirbyteStreamStatus.STARTED 152 ) 153 yield from self._read_stream( 154 logger=logger, 155 stream_instance=stream_instance, 156 configured_stream=configured_stream, 157 state_manager=state_manager, 158 internal_config=internal_config, 159 ) 160 logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED") 161 yield stream_status_as_airbyte_message( 162 configured_stream.stream, AirbyteStreamStatus.COMPLETE 163 ) 164 165 except Exception as e: 166 yield from self._emit_queued_messages() 167 logger.exception( 168 f"Encountered an exception while reading stream {configured_stream.stream.name}" 169 ) 170 logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED") 171 yield stream_status_as_airbyte_message( 172 configured_stream.stream, AirbyteStreamStatus.INCOMPLETE 173 ) 174 175 stream_descriptor = StreamDescriptor(name=configured_stream.stream.name) 176 177 if isinstance(e, AirbyteTracedException): 178 traced_exception = e 179 info_message = f"Stopping sync on error from stream {configured_stream.stream.name} because {self.name} does not support continuing syncs on error." 
180 else: 181 traced_exception = self._serialize_exception( 182 stream_descriptor, e, stream_instance=stream_instance 183 ) 184 info_message = f"{self.name} does not support continuing syncs on error from stream {configured_stream.stream.name}" 185 186 yield traced_exception.as_sanitized_airbyte_message( 187 stream_descriptor=stream_descriptor 188 ) 189 stream_name_to_exception[stream_instance.name] = traced_exception # type: ignore # use configured_stream if stream_instance is None 190 if self.stop_sync_on_stream_failure: 191 logger.info(info_message) 192 break 193 finally: 194 # Finish read event only if the stream instance exists; 195 # otherwise, there's no need as it never started 196 if is_stream_exist: 197 timer.finish_event() 198 logger.info(f"Finished syncing {configured_stream.stream.name}") 199 logger.info(timer.report()) 200 201 if len(stream_name_to_exception) > 0: 202 error_message = generate_failed_streams_error_message( 203 {key: [value] for key, value in stream_name_to_exception.items()} 204 ) 205 logger.info(error_message) 206 # We still raise at least one exception when a stream raises an exception because the platform currently relies 207 # on a non-zero exit code to determine if a sync attempt has failed. We also raise the exception as a config_error 208 # type because this combined error isn't actionable, but rather the previously emitted individual errors. 209 raise AirbyteTracedException( 210 message=error_message, failure_type=FailureType.config_error 211 ) 212 logger.info(f"Finished syncing {self.name}")
Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/.
318 @property 319 def stop_sync_on_stream_failure(self) -> bool: 320 """ 321 WARNING: This function is in-development which means it is subject to change. Use at your own risk. 322 323 By default, when a source encounters an exception while syncing a stream, it will emit an error trace message and then 324 continue syncing the next stream. This can be overwritten on a per-source basis so that the source will stop the sync 325 on the first error seen and emit a single error trace message for that stream. 326 """ 327 return False
WARNING: This function is in-development which means it is subject to change. Use at your own risk.
By default, when a source encounters an exception while syncing a stream, it will emit an error trace message and then continue syncing the next stream. This can be overwritten on a per-source basis so that the source will stop the sync on the first error seen and emit a single error trace message for that stream.
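A source that prefers to fail fast can override the property, for example (building on the hypothetical `SourceExample` sketch above):

```python
class SourceExampleFailFast(SourceExample):
    @property
    def stop_sync_on_stream_failure(self) -> bool:
        # Stop the whole sync on the first stream error instead of continuing to the next stream.
        return True
```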
13class BaseConfig(BaseModel): 14 """Base class for connector spec, adds the following behaviour: 15 16 - resolve $ref and replace it with definition 17 - replace all occurrences of anyOf with oneOf 18 - drop description 19 """ 20 21 @classmethod 22 def schema(cls, *args: Any, **kwargs: Any) -> Dict[str, Any]: 23 """We're overriding the schema classmethod to enable some post-processing""" 24 schema = super().schema(*args, **kwargs) 25 rename_key(schema, old_key="anyOf", new_key="oneOf") # UI supports only oneOf 26 expand_refs(schema) 27 schema.pop("description", None) # description added from the docstring 28 return schema
Base class for a connector spec; it adds the following behaviour:
- resolve $ref and replace it with definition
- replace all occurrences of anyOf with oneOf
- drop description
21 @classmethod 22 def schema(cls, *args: Any, **kwargs: Any) -> Dict[str, Any]: 23 """We're overriding the schema classmethod to enable some post-processing""" 24 schema = super().schema(*args, **kwargs) 25 rename_key(schema, old_key="anyOf", new_key="oneOf") # UI supports only oneOf 26 expand_refs(schema) 27 schema.pop("description", None) # description added from the docstring 28 return schema
We're overriding the schema classmethod to enable some post-processing
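A hedged sketch of a spec class built on BaseConfig; the field names are illustrative, and real specs typically add pydantic `Field` metadata such as descriptions and `airbyte_secret`:

```python
from typing import Optional

from airbyte_cdk.sources.config import BaseConfig


class SourceExampleSpec(BaseConfig):
    """Example Source Spec"""  # pydantic turns the docstring into the schema description,
    # which BaseConfig.schema() then drops (see the pop above).

    api_key: str
    start_date: Optional[str] = None


# Returns plain JSON Schema with $refs expanded and anyOf renamed to oneOf.
print(SourceExampleSpec.schema())
```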
34class BaseConnector(ABC, Generic[TConfig]): 35 # configure whether the `check_config_against_spec_or_exit()` needs to be called 36 check_config_against_spec: bool = True 37 38 @abstractmethod 39 def configure(self, config: Mapping[str, Any], temp_dir: str) -> TConfig: 40 """ 41 Persist config in temporary directory to run the Source job 42 """ 43 44 @staticmethod 45 def read_config(config_path: str) -> Mapping[str, Any]: 46 config = BaseConnector._read_json_file(config_path) 47 if isinstance(config, Mapping): 48 return config 49 else: 50 raise ValueError( 51 f"The content of {config_path} is not an object and therefore is not a valid config. Please ensure the file represent a config." 52 ) 53 54 @staticmethod 55 def _read_json_file(file_path: str) -> Any: 56 with open(file_path, "r") as file: 57 contents = file.read() 58 59 try: 60 return json.loads(contents) 61 except json.JSONDecodeError as error: 62 raise ValueError( 63 f"Could not read json file {file_path}: {error}. Please ensure that it is a valid JSON." 64 ) 65 66 @staticmethod 67 def write_config(config: TConfig, config_path: str) -> None: 68 with open(config_path, "w") as fh: 69 fh.write(json.dumps(config)) 70 71 def spec(self, logger: logging.Logger) -> ConnectorSpecification: 72 """ 73 Returns the spec for this integration. The spec is a JSON-Schema object describing the required configurations (e.g: username and password) 74 required to run this integration. By default, this will be loaded from a "spec.yaml" or a "spec.json" in the package root. 75 """ 76 77 package = self.__class__.__module__.split(".")[0] 78 79 yaml_spec = load_optional_package_file(package, "spec.yaml") 80 json_spec = load_optional_package_file(package, "spec.json") 81 82 if yaml_spec and json_spec: 83 raise RuntimeError( 84 "Found multiple spec files in the package. Only one of spec.yaml or spec.json should be provided." 85 ) 86 87 if yaml_spec: 88 spec_obj = yaml.load(yaml_spec, Loader=yaml.SafeLoader) 89 elif json_spec: 90 try: 91 spec_obj = json.loads(json_spec) 92 except json.JSONDecodeError as error: 93 raise ValueError( 94 f"Could not read json spec file: {error}. Please ensure that it is a valid JSON." 95 ) 96 else: 97 raise FileNotFoundError("Unable to find spec.yaml or spec.json in the package.") 98 99 return ConnectorSpecificationSerializer.load(spec_obj) 100 101 @abstractmethod 102 def check(self, logger: logging.Logger, config: TConfig) -> AirbyteConnectionStatus: 103 """ 104 Tests if the input configuration can be used to successfully connect to the integration e.g: if a provided Stripe API token can be used to connect 105 to the Stripe API. 106 """
Helper class that provides a standard way to create an ABC using inheritance.
38 @abstractmethod 39 def configure(self, config: Mapping[str, Any], temp_dir: str) -> TConfig: 40 """ 41 Persist config in temporary directory to run the Source job 42 """
Persist config in temporary directory to run the Source job
44 @staticmethod 45 def read_config(config_path: str) -> Mapping[str, Any]: 46 config = BaseConnector._read_json_file(config_path) 47 if isinstance(config, Mapping): 48 return config 49 else: 50 raise ValueError( 51 f"The content of {config_path} is not an object and therefore is not a valid config. Please ensure the file represent a config." 52 )
71 def spec(self, logger: logging.Logger) -> ConnectorSpecification: 72 """ 73 Returns the spec for this integration. The spec is a JSON-Schema object describing the required configurations (e.g: username and password) 74 required to run this integration. By default, this will be loaded from a "spec.yaml" or a "spec.json" in the package root. 75 """ 76 77 package = self.__class__.__module__.split(".")[0] 78 79 yaml_spec = load_optional_package_file(package, "spec.yaml") 80 json_spec = load_optional_package_file(package, "spec.json") 81 82 if yaml_spec and json_spec: 83 raise RuntimeError( 84 "Found multiple spec files in the package. Only one of spec.yaml or spec.json should be provided." 85 ) 86 87 if yaml_spec: 88 spec_obj = yaml.load(yaml_spec, Loader=yaml.SafeLoader) 89 elif json_spec: 90 try: 91 spec_obj = json.loads(json_spec) 92 except json.JSONDecodeError as error: 93 raise ValueError( 94 f"Could not read json spec file: {error}. Please ensure that it is a valid JSON." 95 ) 96 else: 97 raise FileNotFoundError("Unable to find spec.yaml or spec.json in the package.") 98 99 return ConnectorSpecificationSerializer.load(spec_obj)
Returns the spec for this integration. The spec is a JSON-Schema object describing the required configurations (e.g: username and password) required to run this integration. By default, this will be loaded from a "spec.yaml" or a "spec.json" in the package root.
101 @abstractmethod 102 def check(self, logger: logging.Logger, config: TConfig) -> AirbyteConnectionStatus: 103 """ 104 Tests if the input configuration can be used to successfully connect to the integration e.g: if a provided Stripe API token can be used to connect 105 to the Stripe API. 106 """
Tests if the input configuration can be used to successfully connect to the integration e.g: if a provided Stripe API token can be used to connect to the Stripe API.
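A minimal sketch of a concrete connector, assuming a pass-through `configure` and an `api_key`-based `check`; the class name and config keys are hypothetical, and `spec()` would still expect a `spec.yaml` or `spec.json` in the package root as described above.

```python
import logging
import os
from typing import Any, Mapping

from airbyte_cdk import AirbyteConnectionStatus, BaseConnector, Status


class ExampleConnector(BaseConnector[Mapping[str, Any]]):
    def configure(self, config: Mapping[str, Any], temp_dir: str) -> Mapping[str, Any]:
        # Persist the config so the platform (or a child process) can read it back later.
        config_path = os.path.join(temp_dir, "config.json")
        self.write_config(config, config_path)
        return config

    def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus:
        # A real connector would attempt an authenticated API call here.
        if not config.get("api_key"):
            return AirbyteConnectionStatus(status=Status.FAILED, message="api_key is required")
        return AirbyteConnectionStatus(status=Status.SUCCEEDED)
```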
124class Connector(DefaultConnectorMixin, BaseConnector[Mapping[str, Any]], ABC): ...
Helper class that provides a standard way to create an ABC using inheritance.
30class Destination(Connector, ABC): 31 VALID_CMDS = {"spec", "check", "write"} 32 33 @abstractmethod 34 def write( 35 self, 36 config: Mapping[str, Any], 37 configured_catalog: ConfiguredAirbyteCatalog, 38 input_messages: Iterable[AirbyteMessage], 39 ) -> Iterable[AirbyteMessage]: 40 """Implement to define how the connector writes data to the destination""" 41 42 def _run_check(self, config: Mapping[str, Any]) -> AirbyteMessage: 43 check_result = self.check(logger, config) 44 return AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=check_result) 45 46 def _parse_input_stream(self, input_stream: io.TextIOWrapper) -> Iterable[AirbyteMessage]: 47 """Reads from stdin, converting to Airbyte messages""" 48 for line in input_stream: 49 try: 50 yield AirbyteMessageSerializer.load(orjson.loads(line)) 51 except orjson.JSONDecodeError: 52 logger.info( 53 f"ignoring input which can't be deserialized as Airbyte Message: {line}" 54 ) 55 56 def _run_write( 57 self, 58 config: Mapping[str, Any], 59 configured_catalog_path: str, 60 input_stream: io.TextIOWrapper, 61 ) -> Iterable[AirbyteMessage]: 62 catalog = ConfiguredAirbyteCatalogSerializer.load( 63 orjson.loads(open(configured_catalog_path).read()) 64 ) 65 input_messages = self._parse_input_stream(input_stream) 66 logger.info("Begin writing to the destination...") 67 yield from self.write( 68 config=config, configured_catalog=catalog, input_messages=input_messages 69 ) 70 logger.info("Writing complete.") 71 72 def parse_args(self, args: List[str]) -> argparse.Namespace: 73 """ 74 :param args: commandline arguments 75 :return: 76 """ 77 78 parent_parser = argparse.ArgumentParser(add_help=False) 79 main_parser = argparse.ArgumentParser() 80 subparsers = main_parser.add_subparsers(title="commands", dest="command") 81 82 # spec 83 subparsers.add_parser( 84 "spec", help="outputs the json configuration specification", parents=[parent_parser] 85 ) 86 87 # check 88 check_parser = subparsers.add_parser( 89 "check", help="checks the config can be used to connect", parents=[parent_parser] 90 ) 91 required_check_parser = check_parser.add_argument_group("required named arguments") 92 required_check_parser.add_argument( 93 "--config", type=str, required=True, help="path to the json configuration file" 94 ) 95 96 # write 97 write_parser = subparsers.add_parser( 98 "write", help="Writes data to the destination", parents=[parent_parser] 99 ) 100 write_required = write_parser.add_argument_group("required named arguments") 101 write_required.add_argument( 102 "--config", type=str, required=True, help="path to the JSON configuration file" 103 ) 104 write_required.add_argument( 105 "--catalog", type=str, required=True, help="path to the configured catalog JSON file" 106 ) 107 108 parsed_args = main_parser.parse_args(args) 109 cmd = parsed_args.command 110 if not cmd: 111 raise Exception("No command entered. 
") 112 elif cmd not in ["spec", "check", "write"]: 113 # This is technically dead code since parse_args() would fail if this was the case 114 # But it's non-obvious enough to warrant placing it here anyways 115 raise Exception(f"Unknown command entered: {cmd}") 116 117 return parsed_args 118 119 def run_cmd(self, parsed_args: argparse.Namespace) -> Iterable[AirbyteMessage]: 120 cmd = parsed_args.command 121 if cmd not in self.VALID_CMDS: 122 raise Exception(f"Unrecognized command: {cmd}") 123 124 spec = self.spec(logger) 125 if cmd == "spec": 126 yield AirbyteMessage(type=Type.SPEC, spec=spec) 127 return 128 config = self.read_config(config_path=parsed_args.config) 129 if self.check_config_against_spec or cmd == "check": 130 try: 131 check_config_against_spec_or_exit(config, spec) 132 except AirbyteTracedException as traced_exc: 133 connection_status = traced_exc.as_connection_status_message() 134 if connection_status and cmd == "check": 135 yield connection_status 136 return 137 raise traced_exc 138 139 if cmd == "check": 140 yield self._run_check(config=config) 141 elif cmd == "write": 142 # Wrap in UTF-8 to override any other input encodings 143 wrapped_stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8") 144 yield from self._run_write( 145 config=config, 146 configured_catalog_path=parsed_args.catalog, 147 input_stream=wrapped_stdin, 148 ) 149 150 def run(self, args: List[str]) -> None: 151 init_uncaught_exception_handler(logger) 152 parsed_args = self.parse_args(args) 153 output_messages = self.run_cmd(parsed_args) 154 for message in output_messages: 155 print(orjson.dumps(AirbyteMessageSerializer.dump(message)).decode())
Helper class that provides a standard way to create an ABC using inheritance.
33 @abstractmethod 34 def write( 35 self, 36 config: Mapping[str, Any], 37 configured_catalog: ConfiguredAirbyteCatalog, 38 input_messages: Iterable[AirbyteMessage], 39 ) -> Iterable[AirbyteMessage]: 40 """Implement to define how the connector writes data to the destination"""
Implement to define how the connector writes data to the destination
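A sketch of a `write` implementation under common assumptions: records are buffered and flushed on each state message, and the state message is only echoed back once the preceding records are durably written. Everything except the `write`/`check` signatures is illustrative.

```python
import logging
from typing import Any, Iterable, Mapping

from airbyte_cdk import (
    AirbyteConnectionStatus,
    AirbyteMessage,
    ConfiguredAirbyteCatalog,
    Status,
    Type,
)
from airbyte_cdk.destinations import Destination


class ExampleDestination(Destination):
    def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus:
        # Stubbed so the class is concrete; a real destination would verify credentials here.
        return AirbyteConnectionStatus(status=Status.SUCCEEDED)

    def write(
        self,
        config: Mapping[str, Any],
        configured_catalog: ConfiguredAirbyteCatalog,
        input_messages: Iterable[AirbyteMessage],
    ) -> Iterable[AirbyteMessage]:
        buffer = []
        for message in input_messages:
            if message.type == Type.RECORD:
                buffer.append(message.record.data)  # collect records until the next state message
            elif message.type == Type.STATE:
                # A real implementation would flush `buffer` to the destination here.
                buffer.clear()
                # Yielding the state message back signals that the preceding records are persisted.
                yield message
```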
72 def parse_args(self, args: List[str]) -> argparse.Namespace: 73 """ 74 :param args: commandline arguments 75 :return: 76 """ 77 78 parent_parser = argparse.ArgumentParser(add_help=False) 79 main_parser = argparse.ArgumentParser() 80 subparsers = main_parser.add_subparsers(title="commands", dest="command") 81 82 # spec 83 subparsers.add_parser( 84 "spec", help="outputs the json configuration specification", parents=[parent_parser] 85 ) 86 87 # check 88 check_parser = subparsers.add_parser( 89 "check", help="checks the config can be used to connect", parents=[parent_parser] 90 ) 91 required_check_parser = check_parser.add_argument_group("required named arguments") 92 required_check_parser.add_argument( 93 "--config", type=str, required=True, help="path to the json configuration file" 94 ) 95 96 # write 97 write_parser = subparsers.add_parser( 98 "write", help="Writes data to the destination", parents=[parent_parser] 99 ) 100 write_required = write_parser.add_argument_group("required named arguments") 101 write_required.add_argument( 102 "--config", type=str, required=True, help="path to the JSON configuration file" 103 ) 104 write_required.add_argument( 105 "--catalog", type=str, required=True, help="path to the configured catalog JSON file" 106 ) 107 108 parsed_args = main_parser.parse_args(args) 109 cmd = parsed_args.command 110 if not cmd: 111 raise Exception("No command entered. ") 112 elif cmd not in ["spec", "check", "write"]: 113 # This is technically dead code since parse_args() would fail if this was the case 114 # But it's non-obvious enough to warrant placing it here anyways 115 raise Exception(f"Unknown command entered: {cmd}") 116 117 return parsed_args
Parameters
- args: commandline arguments
Returns
The parsed command-line arguments as an argparse.Namespace.
119 def run_cmd(self, parsed_args: argparse.Namespace) -> Iterable[AirbyteMessage]: 120 cmd = parsed_args.command 121 if cmd not in self.VALID_CMDS: 122 raise Exception(f"Unrecognized command: {cmd}") 123 124 spec = self.spec(logger) 125 if cmd == "spec": 126 yield AirbyteMessage(type=Type.SPEC, spec=spec) 127 return 128 config = self.read_config(config_path=parsed_args.config) 129 if self.check_config_against_spec or cmd == "check": 130 try: 131 check_config_against_spec_or_exit(config, spec) 132 except AirbyteTracedException as traced_exc: 133 connection_status = traced_exc.as_connection_status_message() 134 if connection_status and cmd == "check": 135 yield connection_status 136 return 137 raise traced_exc 138 139 if cmd == "check": 140 yield self._run_check(config=config) 141 elif cmd == "write": 142 # Wrap in UTF-8 to override any other input encodings 143 wrapped_stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8") 144 yield from self._run_write( 145 config=config, 146 configured_catalog_path=parsed_args.catalog, 147 input_stream=wrapped_stdin, 148 )
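For completeness, a destination's entry point usually just forwards the command-line arguments to `run()`, which drives `parse_args` and `run_cmd` and prints each resulting Airbyte message to stdout; `ExampleDestination` and its module are the hypothetical names from the sketch above.

```python
import sys

from example_destination import ExampleDestination  # hypothetical module containing the sketch above

if __name__ == "__main__":
    # e.g. python main.py write --config secrets/config.json --catalog configured_catalog.json
    ExampleDestination().run(sys.argv[1:])
```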
56class Source( 57 DefaultConnectorMixin, 58 BaseSource[Mapping[str, Any], List[AirbyteStateMessage], ConfiguredAirbyteCatalog], 59 ABC, 60): 61 # can be overridden to change an input state. 62 @classmethod 63 def read_state(cls, state_path: str) -> List[AirbyteStateMessage]: 64 """ 65 Retrieves the input state of a sync by reading from the specified JSON file. Incoming state can be deserialized into either 66 a JSON object for legacy state input or as a list of AirbyteStateMessages for the per-stream state format. Regardless of the 67 incoming input type, it will always be transformed and output as a list of AirbyteStateMessage(s). 68 :param state_path: The filepath to where the stream states are located 69 :return: The complete stream state based on the connector's previous sync 70 """ 71 parsed_state_messages = [] 72 if state_path: 73 state_obj = BaseConnector._read_json_file(state_path) 74 if state_obj: 75 for state in state_obj: # type: ignore # `isinstance(state_obj, List)` ensures that this is a list 76 parsed_message = AirbyteStateMessageSerializer.load(state) 77 if ( 78 not parsed_message.stream 79 and not parsed_message.data 80 and not parsed_message.global_ 81 ): 82 raise ValueError( 83 "AirbyteStateMessage should contain either a stream, global, or state field" 84 ) 85 parsed_state_messages.append(parsed_message) 86 return parsed_state_messages 87 88 # can be overridden to change an input catalog 89 @classmethod 90 def read_catalog(cls, catalog_path: str) -> ConfiguredAirbyteCatalog: 91 return ConfiguredAirbyteCatalogSerializer.load(cls._read_json_file(catalog_path)) 92 93 @property 94 def name(self) -> str: 95 """Source name""" 96 return self.__class__.__name__
Helper class that provides a standard way to create an ABC using inheritance.
62 @classmethod 63 def read_state(cls, state_path: str) -> List[AirbyteStateMessage]: 64 """ 65 Retrieves the input state of a sync by reading from the specified JSON file. Incoming state can be deserialized into either 66 a JSON object for legacy state input or as a list of AirbyteStateMessages for the per-stream state format. Regardless of the 67 incoming input type, it will always be transformed and output as a list of AirbyteStateMessage(s). 68 :param state_path: The filepath to where the stream states are located 69 :return: The complete stream state based on the connector's previous sync 70 """ 71 parsed_state_messages = [] 72 if state_path: 73 state_obj = BaseConnector._read_json_file(state_path) 74 if state_obj: 75 for state in state_obj: # type: ignore # `isinstance(state_obj, List)` ensures that this is a list 76 parsed_message = AirbyteStateMessageSerializer.load(state) 77 if ( 78 not parsed_message.stream 79 and not parsed_message.data 80 and not parsed_message.global_ 81 ): 82 raise ValueError( 83 "AirbyteStateMessage should contain either a stream, global, or state field" 84 ) 85 parsed_state_messages.append(parsed_message) 86 return parsed_state_messages
Retrieves the input state of a sync by reading from the specified JSON file. Incoming state can be deserialized into either a JSON object for legacy state input or as a list of AirbyteStateMessages for the per-stream state format. Regardless of the incoming input type, it will always be transformed and output as a list of AirbyteStateMessage(s).
Parameters
- state_path: The filepath to where the stream states are located
Returns
The complete stream state based on the connector's previous sync
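For illustration, a per-stream state file is simply a JSON array of state messages; the sketch below writes one to a temporary file and parses it back with `read_state` (the stream name and cursor value are made up).

```python
import json
import tempfile

from airbyte_cdk import Source

# A per-stream state file is a JSON array of state messages.
state_blob = [
    {
        "type": "STREAM",
        "stream": {
            "stream_descriptor": {"name": "customers"},
            "stream_state": {"updated_at": "2024-01-01T00:00:00Z"},
        },
    }
]

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as state_file:
    json.dump(state_blob, state_file)
    state_path = state_file.name

# Always returns a list of AirbyteStateMessage objects, regardless of whether the
# input used the legacy or the per-stream format.
state_messages = Source.read_state(state_path)
```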
37@dataclass 38class AddFields(RecordTransformation): 39 """ 40 Transformation which adds field to an output record. The path of the added field can be nested. Adding nested fields will create all 41 necessary parent objects (like mkdir -p). Adding fields to an array will extend the array to that index (filling intermediate 42 indices with null values). So if you add a field at index 5 to the array ["value"], it will become ["value", null, null, null, null, 43 "new_value"]. 44 45 46 This transformation has access to the following contextual values: 47 record: the record about to be output by the connector 48 config: the input configuration provided to a connector 49 stream_state: the current state of the stream 50 stream_slice: the current stream slice being read 51 52 53 54 Examples of instantiating this transformation via YAML: 55 - type: AddFields 56 fields: 57 # hardcoded constant 58 - path: ["path"] 59 value: "static_value" 60 61 # nested path 62 - path: ["path", "to", "field"] 63 value: "static" 64 65 # from config 66 - path: ["shop_id"] 67 value: "{{ config.shop_id }}" 68 69 # from stream_interval 70 - path: ["date"] 71 value: "{{ stream_interval.start_date }}" 72 73 # from record 74 - path: ["unnested_value"] 75 value: {{ record.nested.field }} 76 77 # from stream_slice 78 - path: ["start_date"] 79 value: {{ stream_slice.start_date }} 80 81 # by supplying any valid Jinja template directive or expression https://jinja.palletsprojects.com/en/3.1.x/templates/# 82 - path: ["two_times_two"] 83 value: {{ 2 * 2 }} 84 85 Attributes: 86 fields (List[AddedFieldDefinition]): A list of transformations (path and corresponding value) that will be added to the record 87 """ 88 89 fields: List[AddedFieldDefinition] 90 parameters: InitVar[Mapping[str, Any]] 91 condition: str = "" 92 _parsed_fields: List[ParsedAddFieldDefinition] = field( 93 init=False, repr=False, default_factory=list 94 ) 95 96 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 97 self._filter_interpolator = InterpolatedBoolean( 98 condition=self.condition, parameters=parameters 99 ) 100 101 for add_field in self.fields: 102 if len(add_field.path) < 1: 103 raise ValueError( 104 f"Expected a non-zero-length path for the AddFields transformation {add_field}" 105 ) 106 107 if not isinstance(add_field.value, InterpolatedString): 108 if not isinstance(add_field.value, str): 109 raise f"Expected a string value for the AddFields transformation: {add_field}" 110 else: 111 self._parsed_fields.append( 112 ParsedAddFieldDefinition( 113 add_field.path, 114 InterpolatedString.create(add_field.value, parameters=parameters), 115 value_type=add_field.value_type, 116 parameters=parameters, 117 ) 118 ) 119 else: 120 self._parsed_fields.append( 121 ParsedAddFieldDefinition( 122 add_field.path, 123 add_field.value, 124 value_type=add_field.value_type, 125 parameters={}, 126 ) 127 ) 128 129 def transform( 130 self, 131 record: Dict[str, Any], 132 config: Optional[Config] = None, 133 stream_state: Optional[StreamState] = None, 134 stream_slice: Optional[StreamSlice] = None, 135 ) -> None: 136 if config is None: 137 config = {} 138 kwargs = {"record": record, "stream_slice": stream_slice} 139 for parsed_field in self._parsed_fields: 140 valid_types = (parsed_field.value_type,) if parsed_field.value_type else None 141 value = parsed_field.value.eval(config, valid_types=valid_types, **kwargs) 142 is_empty_condition = not self.condition 143 if is_empty_condition or self._filter_interpolator.eval(config, value=value, **kwargs): 144 
dpath.new(record, parsed_field.path, value) 145 146 def __eq__(self, other: Any) -> bool: 147 return bool(self.__dict__ == other.__dict__)
Transformation which adds fields to an output record. The path of the added field can be nested. Adding nested fields will create all necessary parent objects (like mkdir -p). Adding fields to an array will extend the array to that index (filling intermediate indices with null values). So if you add a field at index 5 to the array ["value"], it will become ["value", null, null, null, null, "new_value"].
This transformation has access to the following contextual values:
- record: the record about to be output by the connector
- config: the input configuration provided to a connector
- stream_state: the current state of the stream
- stream_slice: the current stream slice being read
Examples of instantiating this transformation via YAML:
- type: AddFields
fields:
# hardcoded constant
- path: ["path"] value: "static_value"
# nested path
- path: ["path", "to", "field"]
value: "static"
# from config
- path: ["shop_id"]
value: "{{ config.shop_id }}"
# from stream_interval
- path: ["date"]
value: "{{ stream_interval.start_date }}"
# from record
- path: ["unnested_value"]
value: {{ record.nested.field }}
# from stream_slice
- path: ["start_date"]
value: {{ stream_slice.start_date }}
# by supplying any valid Jinja template directive or expression https://jinja.palletsprojects.com/en/3.1.x/templates/#
- path: ["two_times_two"]
value: {{ 2 * 2 }}
Attributes:
- fields (List[AddedFieldDefinition]): A list of transformations (path and corresponding value) that will be added to the record
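To complement the YAML examples above, here is a hedged Python sketch of instantiating the transformation directly; the import path is assumed from the class names shown, and the record and config contents are made up.

```python
from airbyte_cdk.sources.declarative.transformations.add_fields import (  # path assumed
    AddedFieldDefinition,
    AddFields,
)

config = {"shop_id": "my-shop"}
transformation = AddFields(
    fields=[
        # A hardcoded constant and a value interpolated from the connector config.
        AddedFieldDefinition(path=["source"], value="example-api", value_type=None, parameters={}),
        AddedFieldDefinition(path=["shop_id"], value="{{ config.shop_id }}", value_type=None, parameters={}),
    ],
    parameters={},
)

record = {"id": 1}
transformation.transform(record, config=config)  # mutates the record in place
# record is now {"id": 1, "source": "example-api", "shop_id": "my-shop"}
```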
129 def transform( 130 self, 131 record: Dict[str, Any], 132 config: Optional[Config] = None, 133 stream_state: Optional[StreamState] = None, 134 stream_slice: Optional[StreamSlice] = None, 135 ) -> None: 136 if config is None: 137 config = {} 138 kwargs = {"record": record, "stream_slice": stream_slice} 139 for parsed_field in self._parsed_fields: 140 valid_types = (parsed_field.value_type,) if parsed_field.value_type else None 141 value = parsed_field.value.eval(config, valid_types=valid_types, **kwargs) 142 is_empty_condition = not self.condition 143 if is_empty_condition or self._filter_interpolator.eval(config, value=value, **kwargs): 144 dpath.new(record, parsed_field.path, value)
Transform a record by adding, deleting, or mutating fields directly on the record reference passed as an argument.
Parameters
- record: The input record to be transformed
- config: The user-provided configuration as specified by the source's spec
- stream_state: The stream state
- stream_slice: The stream slice
Returns
None. The record passed in is mutated in place rather than returned.
17@dataclass(frozen=True) 18class AddedFieldDefinition: 19 """Defines the field to add on a record""" 20 21 path: FieldPointer 22 value: Union[InterpolatedString, str] 23 value_type: Optional[Type[Any]] 24 parameters: InitVar[Mapping[str, Any]]
Defines the field to add on a record
24@dataclass 25class ApiKeyAuthenticator(DeclarativeAuthenticator): 26 """ 27 ApiKeyAuth sets a request header on the HTTP requests sent. 28 29 The header is of the form: 30 `"<header>": "<token>"` 31 32 For example, 33 `ApiKeyAuthenticator("Authorization", "Bearer hello")` 34 will result in the following header set on the HTTP request 35 `"Authorization": "Bearer hello"` 36 37 Attributes: 38 request_option (RequestOption): request option how to inject the token into the request 39 token_provider (TokenProvider): Provider of the token 40 config (Config): The user-provided configuration as specified by the source's spec 41 parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation 42 """ 43 44 request_option: RequestOption 45 token_provider: TokenProvider 46 config: Config 47 parameters: InitVar[Mapping[str, Any]] 48 49 @property 50 def auth_header(self) -> str: 51 options = self._get_request_options(RequestOptionType.header) 52 return next(iter(options.keys()), "") 53 54 @property 55 def token(self) -> str: 56 return self.token_provider.get_token() 57 58 def _get_request_options(self, option_type: RequestOptionType) -> Mapping[str, Any]: 59 options: MutableMapping[str, Any] = {} 60 if self.request_option.inject_into == option_type: 61 self.request_option.inject_into_request(options, self.token, self.config) 62 return options 63 64 def get_request_params(self) -> Mapping[str, Any]: 65 return self._get_request_options(RequestOptionType.request_parameter) 66 67 def get_request_body_data(self) -> Union[Mapping[str, Any], str]: 68 return self._get_request_options(RequestOptionType.body_data) 69 70 def get_request_body_json(self) -> Mapping[str, Any]: 71 return self._get_request_options(RequestOptionType.body_json)
ApiKeyAuth sets a request header on the HTTP requests sent.
The header is of the form:
"<header>": "<token>"
For example,
ApiKeyAuthenticator("Authorization", "Bearer hello")
will result in the following header set on the HTTP request
"Authorization": "Bearer hello"
Attributes:
- request_option (RequestOption): request option how to inject the token into the request
- token_provider (TokenProvider): Provider of the token
- config (Config): The user-provided configuration as specified by the source's spec
- parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
49 @property 50 def auth_header(self) -> str: 51 options = self._get_request_options(RequestOptionType.header) 52 return next(iter(options.keys()), "")
HTTP header to set on the requests
64 def get_request_params(self) -> Mapping[str, Any]: 65 return self._get_request_options(RequestOptionType.request_parameter)
HTTP request parameter to add to the requests
67 def get_request_body_data(self) -> Union[Mapping[str, Any], str]: 68 return self._get_request_options(RequestOptionType.body_data)
Form-encoded body data to set on the requests
70 def get_request_body_json(self) -> Mapping[str, Any]: 71 return self._get_request_options(RequestOptionType.body_json)
JSON-encoded body data to set on the requests
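A small usage sketch, assuming the `RequestOption` and `TokenProvider` import paths below; `StaticTokenProvider` is a hypothetical helper defined inline purely for illustration.

```python
from dataclasses import dataclass

from airbyte_cdk import ApiKeyAuthenticator
from airbyte_cdk.sources.declarative.auth.token_provider import TokenProvider  # path assumed
from airbyte_cdk.sources.declarative.requesters.request_option import (  # path assumed
    RequestOption,
    RequestOptionType,
)


@dataclass
class StaticTokenProvider(TokenProvider):
    """Hypothetical provider that always returns the same token."""

    token: str

    def get_token(self) -> str:
        return self.token


authenticator = ApiKeyAuthenticator(
    request_option=RequestOption(
        field_name="X-API-Key", inject_into=RequestOptionType.header, parameters={}
    ),
    token_provider=StaticTokenProvider("secret-token"),
    config={},
    parameters={},
)

# The header name comes from the request option, the value from the token provider.
assert authenticator.auth_header == "X-API-Key"
assert authenticator.token == "secret-token"
```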
12class BackoffStrategy(ABC): 13 @abstractmethod 14 def backoff_time( 15 self, 16 response_or_exception: Optional[Union[requests.Response, requests.RequestException]], 17 attempt_count: int, 18 ) -> Optional[float]: 19 """ 20 Override this method to dynamically determine backoff time e.g: by reading the X-Retry-After header. 21 22 This method is called only if should_backoff() returns True for the input request. 23 24 :param response_or_exception: The response or exception that caused the backoff. 25 :param attempt_count: The number of attempts already performed for this request. 26 :return how long to backoff in seconds. The return value may be a floating point number for subsecond precision. Returning None defers backoff 27 to the default backoff behavior (e.g using an exponential algorithm). 28 """ 29 pass
Helper class that provides a standard way to create an ABC using inheritance.
13 @abstractmethod 14 def backoff_time( 15 self, 16 response_or_exception: Optional[Union[requests.Response, requests.RequestException]], 17 attempt_count: int, 18 ) -> Optional[float]: 19 """ 20 Override this method to dynamically determine backoff time e.g: by reading the X-Retry-After header. 21 22 This method is called only if should_backoff() returns True for the input request. 23 24 :param response_or_exception: The response or exception that caused the backoff. 25 :param attempt_count: The number of attempts already performed for this request. 26 :return how long to backoff in seconds. The return value may be a floating point number for subsecond precision. Returning None defers backoff 27 to the default backoff behavior (e.g using an exponential algorithm). 28 """ 29 pass
Override this method to dynamically determine backoff time e.g: by reading the X-Retry-After header.
This method is called only if should_backoff() returns True for the input request.
Parameters
- response_or_exception: The response or exception that caused the backoff.
- attempt_count: The number of attempts already performed for this request.
Returns
How long to backoff in seconds. The return value may be a floating point number for subsecond precision. Returning None defers backoff to the default backoff behavior (e.g. using an exponential algorithm).
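A sketch of a concrete strategy that honors a numeric Retry-After header, as the docstring suggests; the import path for BackoffStrategy is assumed.

```python
from typing import Optional, Union

import requests

from airbyte_cdk.sources.streams.http.error_handlers import BackoffStrategy  # path assumed


class RetryAfterBackoffStrategy(BackoffStrategy):
    """Illustrative strategy: honor a numeric Retry-After header when present, otherwise defer."""

    def backoff_time(
        self,
        response_or_exception: Optional[Union[requests.Response, requests.RequestException]],
        attempt_count: int,
    ) -> Optional[float]:
        if isinstance(response_or_exception, requests.Response):
            retry_after = response_or_exception.headers.get("Retry-After")
            if retry_after is not None:
                # Simplification: assumes Retry-After carries seconds, not an HTTP date.
                return float(retry_after)
        # Returning None falls back to the default (e.g. exponential) backoff behavior.
        return None
```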
101@dataclass 102class BasicHttpAuthenticator(DeclarativeAuthenticator): 103 """ 104 Builds auth based off the basic authentication scheme as defined by RFC 7617, which transmits credentials as USER ID/password pairs, encoded using base64 105 https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication#basic_authentication_scheme 106 107 The header is of the form 108 `"Authorization": "Basic <encoded_credentials>"` 109 110 Attributes: 111 username (Union[InterpolatedString, str]): The username 112 config (Config): The user-provided configuration as specified by the source's spec 113 password (Union[InterpolatedString, str]): The password 114 parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation 115 """ 116 117 username: Union[InterpolatedString, str] 118 config: Config 119 parameters: InitVar[Mapping[str, Any]] 120 password: Union[InterpolatedString, str] = "" 121 122 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 123 self._username = InterpolatedString.create(self.username, parameters=parameters) 124 self._password = InterpolatedString.create(self.password, parameters=parameters) 125 126 @property 127 def auth_header(self) -> str: 128 return "Authorization" 129 130 @property 131 def token(self) -> str: 132 auth_string = ( 133 f"{self._username.eval(self.config)}:{self._password.eval(self.config)}".encode("utf8") 134 ) 135 b64_encoded = base64.b64encode(auth_string).decode("utf8") 136 return f"Basic {b64_encoded}"
Builds auth based off the basic authentication scheme as defined by RFC 7617, which transmits credentials as USER ID/password pairs, encoded using base64 https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication#basic_authentication_scheme
The header is of the form
"Authorization": "Basic <encoded_credentials>"
Attributes:
- username (Union[InterpolatedString, str]): The username
- config (Config): The user-provided configuration as specified by the source's spec
- password (Union[InterpolatedString, str]): The password
- parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
130 @property 131 def token(self) -> str: 132 auth_string = ( 133 f"{self._username.eval(self.config)}:{self._password.eval(self.config)}".encode("utf8") 134 ) 135 b64_encoded = base64.b64encode(auth_string).decode("utf8") 136 return f"Basic {b64_encoded}"
The header value to set on outgoing HTTP requests
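A small sketch showing how the username/password pair from the config turns into the Authorization header value; the credentials are made up.

```python
import base64

from airbyte_cdk import BasicHttpAuthenticator

config = {"username": "api_user", "password": "hunter2"}
authenticator = BasicHttpAuthenticator(
    username="{{ config.username }}",
    password="{{ config.password }}",
    config=config,
    parameters={},
)

# The token is the base64-encoded "username:password" pair prefixed with "Basic".
expected = "Basic " + base64.b64encode(b"api_user:hunter2").decode("utf8")
assert authenticator.auth_header == "Authorization"
assert authenticator.token == expected
```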
74@dataclass 75class BearerAuthenticator(DeclarativeAuthenticator): 76 """ 77 Authenticator that sets the Authorization header on the HTTP requests sent. 78 79 The header is of the form: 80 `"Authorization": "Bearer <token>"` 81 82 Attributes: 83 token_provider (TokenProvider): Provider of the token 84 config (Config): The user-provided configuration as specified by the source's spec 85 parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation 86 """ 87 88 token_provider: TokenProvider 89 config: Config 90 parameters: InitVar[Mapping[str, Any]] 91 92 @property 93 def auth_header(self) -> str: 94 return "Authorization" 95 96 @property 97 def token(self) -> str: 98 return f"Bearer {self.token_provider.get_token()}"
Authenticator that sets the Authorization header on the HTTP requests sent.
The header is of the form:
"Authorization": "Bearer <token>"
Attributes:
- token_provider (TokenProvider): Provider of the token
- config (Config): The user-provided configuration as specified by the source's spec
- parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
40@dataclass 41class CartesianProductStreamSlicer(PartitionRouter): 42 """ 43 Stream slicers that iterates over the cartesian product of input stream slicers 44 Given 2 stream slicers with the following slices: 45 A: [{"i": 0}, {"i": 1}, {"i": 2}] 46 B: [{"s": "hello"}, {"s": "world"}] 47 the resulting stream slices are 48 [ 49 {"i": 0, "s": "hello"}, 50 {"i": 0, "s": "world"}, 51 {"i": 1, "s": "hello"}, 52 {"i": 1, "s": "world"}, 53 {"i": 2, "s": "hello"}, 54 {"i": 2, "s": "world"}, 55 ] 56 57 Attributes: 58 stream_slicers (List[PartitionRouter]): Underlying stream slicers. The RequestOptions (e.g: Request headers, parameters, etc..) returned by this slicer are the combination of the RequestOptions of its input slicers. If there are conflicts e.g: two slicers define the same header or request param, the conflict is resolved by taking the value from the first slicer, where ordering is determined by the order in which slicers were input to this composite slicer. 59 """ 60 61 stream_slicers: List[PartitionRouter] 62 parameters: InitVar[Mapping[str, Any]] 63 64 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 65 check_for_substream_in_slicers(self.stream_slicers, self.logger.warning) 66 67 def get_request_params( 68 self, 69 *, 70 stream_state: Optional[StreamState] = None, 71 stream_slice: Optional[StreamSlice] = None, 72 next_page_token: Optional[Mapping[str, Any]] = None, 73 ) -> Mapping[str, Any]: 74 return dict( 75 ChainMap( 76 *[ # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons 77 s.get_request_params( 78 stream_state=stream_state, 79 stream_slice=stream_slice, 80 next_page_token=next_page_token, 81 ) 82 for s in self.stream_slicers 83 ] 84 ) 85 ) 86 87 def get_request_headers( 88 self, 89 *, 90 stream_state: Optional[StreamState] = None, 91 stream_slice: Optional[StreamSlice] = None, 92 next_page_token: Optional[Mapping[str, Any]] = None, 93 ) -> Mapping[str, Any]: 94 return dict( 95 ChainMap( 96 *[ # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons 97 s.get_request_headers( 98 stream_state=stream_state, 99 stream_slice=stream_slice, 100 next_page_token=next_page_token, 101 ) 102 for s in self.stream_slicers 103 ] 104 ) 105 ) 106 107 def get_request_body_data( 108 self, 109 *, 110 stream_state: Optional[StreamState] = None, 111 stream_slice: Optional[StreamSlice] = None, 112 next_page_token: Optional[Mapping[str, Any]] = None, 113 ) -> Mapping[str, Any]: 114 return dict( 115 ChainMap( 116 *[ # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons 117 s.get_request_body_data( 118 stream_state=stream_state, 119 stream_slice=stream_slice, 120 next_page_token=next_page_token, 121 ) 122 for s in self.stream_slicers 123 ] 124 ) 125 ) 126 127 def get_request_body_json( 128 self, 129 *, 130 stream_state: Optional[StreamState] = None, 131 stream_slice: Optional[StreamSlice] = None, 132 next_page_token: Optional[Mapping[str, Any]] = None, 133 ) -> Mapping[str, Any]: 134 return dict( 135 ChainMap( 136 *[ # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons 137 s.get_request_body_json( 138 stream_state=stream_state, 139 stream_slice=stream_slice, 140 next_page_token=next_page_token, 141 ) 142 for s in self.stream_slicers 143 ] 144 ) 145 ) 146 147 def stream_slices(self) -> Iterable[StreamSlice]: 148 sub_slices = (s.stream_slices() for s in self.stream_slicers) 149 product = itertools.product(*sub_slices) 150 for stream_slice_tuple in product: 151 partition = dict(ChainMap(*[s.partition 
for s in stream_slice_tuple])) # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons 152 cursor_slices = [s.cursor_slice for s in stream_slice_tuple if s.cursor_slice] 153 if len(cursor_slices) > 1: 154 raise ValueError( 155 f"There should only be a single cursor slice. Found {cursor_slices}" 156 ) 157 if cursor_slices: 158 cursor_slice = cursor_slices[0] 159 else: 160 cursor_slice = {} 161 yield StreamSlice(partition=partition, cursor_slice=cursor_slice) 162 163 def set_initial_state(self, stream_state: StreamState) -> None: 164 """ 165 Parent stream states are not supported for cartesian product stream slicer 166 """ 167 pass 168 169 def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: 170 """ 171 Parent stream states are not supported for cartesian product stream slicer 172 """ 173 pass 174 175 @property 176 def logger(self) -> logging.Logger: 177 return logging.getLogger("airbyte.CartesianProductStreamSlicer")
Stream slicer that iterates over the cartesian product of the input stream slicers. Given two stream slicers with the following slices:
A: [{"i": 0}, {"i": 1}, {"i": 2}]
B: [{"s": "hello"}, {"s": "world"}]
the resulting stream slices are
[{"i": 0, "s": "hello"}, {"i": 0, "s": "world"}, {"i": 1, "s": "hello"}, {"i": 1, "s": "world"}, {"i": 2, "s": "hello"}, {"i": 2, "s": "world"}]
Attributes:
- stream_slicers (List[PartitionRouter]): Underlying stream slicers. The RequestOptions (e.g: Request headers, parameters, etc..) returned by this slicer are the combination of the RequestOptions of its input slicers. If there are conflicts e.g: two slicers define the same header or request param, the conflict is resolved by taking the value from the first slicer, where ordering is determined by the order in which slicers were input to this composite slicer.
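The combination logic itself can be illustrated with plain Python (no CDK classes involved); the real slicers yield StreamSlice objects rather than bare dicts.

```python
import itertools
from collections import ChainMap

# Plain-Python illustration of the combination described above.
slicer_a = [{"i": 0}, {"i": 1}, {"i": 2}]
slicer_b = [{"s": "hello"}, {"s": "world"}]

combined = [dict(ChainMap(*slices)) for slices in itertools.product(slicer_a, slicer_b)]
# -> six slices such as {"i": 0, "s": "hello"}; on key conflicts the first slicer's value wins (ChainMap).
```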
67 def get_request_params( 68 self, 69 *, 70 stream_state: Optional[StreamState] = None, 71 stream_slice: Optional[StreamSlice] = None, 72 next_page_token: Optional[Mapping[str, Any]] = None, 73 ) -> Mapping[str, Any]: 74 return dict( 75 ChainMap( 76 *[ # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons 77 s.get_request_params( 78 stream_state=stream_state, 79 stream_slice=stream_slice, 80 next_page_token=next_page_token, 81 ) 82 for s in self.stream_slicers 83 ] 84 ) 85 )
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g: you might want to define query parameters for paging if next_page_token is not None.
87 def get_request_headers( 88 self, 89 *, 90 stream_state: Optional[StreamState] = None, 91 stream_slice: Optional[StreamSlice] = None, 92 next_page_token: Optional[Mapping[str, Any]] = None, 93 ) -> Mapping[str, Any]: 94 return dict( 95 ChainMap( 96 *[ # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons 97 s.get_request_headers( 98 stream_state=stream_state, 99 stream_slice=stream_slice, 100 next_page_token=next_page_token, 101 ) 102 for s in self.stream_slicers 103 ] 104 ) 105 )
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
107 def get_request_body_data( 108 self, 109 *, 110 stream_state: Optional[StreamState] = None, 111 stream_slice: Optional[StreamSlice] = None, 112 next_page_token: Optional[Mapping[str, Any]] = None, 113 ) -> Mapping[str, Any]: 114 return dict( 115 ChainMap( 116 *[ # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons 117 s.get_request_body_data( 118 stream_state=stream_state, 119 stream_slice=stream_slice, 120 next_page_token=next_page_token, 121 ) 122 for s in self.stream_slicers 123 ] 124 ) 125 )
Specifies how to populate the body of the request with a non-JSON payload.
If it returns plain text, the text is sent as is. If it returns a dict, the dict is converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at a time.
127 def get_request_body_json( 128 self, 129 *, 130 stream_state: Optional[StreamState] = None, 131 stream_slice: Optional[StreamSlice] = None, 132 next_page_token: Optional[Mapping[str, Any]] = None, 133 ) -> Mapping[str, Any]: 134 return dict( 135 ChainMap( 136 *[ # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons 137 s.get_request_body_json( 138 stream_state=stream_state, 139 stream_slice=stream_slice, 140 next_page_token=next_page_token, 141 ) 142 for s in self.stream_slicers 143 ] 144 ) 145 )
Specifies how to populate the body of the request with a JSON payload.
Only one of the 'request_body_data' and 'request_body_json' functions can be overridden at a time.
147 def stream_slices(self) -> Iterable[StreamSlice]: 148 sub_slices = (s.stream_slices() for s in self.stream_slicers) 149 product = itertools.product(*sub_slices) 150 for stream_slice_tuple in product: 151 partition = dict(ChainMap(*[s.partition for s in stream_slice_tuple])) # type: ignore # ChainMap expects a MutableMapping[Never, Never] for reasons 152 cursor_slices = [s.cursor_slice for s in stream_slice_tuple if s.cursor_slice] 153 if len(cursor_slices) > 1: 154 raise ValueError( 155 f"There should only be a single cursor slice. Found {cursor_slices}" 156 ) 157 if cursor_slices: 158 cursor_slice = cursor_slices[0] 159 else: 160 cursor_slice = {} 161 yield StreamSlice(partition=partition, cursor_slice=cursor_slice)
Defines stream slices
Returns
An iterable of stream slices
163 def set_initial_state(self, stream_state: StreamState) -> None: 164 """ 165 Parent stream states are not supported for cartesian product stream slicer 166 """ 167 pass
Parent stream states are not supported for cartesian product stream slicer
169 def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: 170 """ 171 Parent stream states are not supported for cartesian product stream slicer 172 """ 173 pass
Parent stream states are not supported for cartesian product stream slicer
24@dataclass 25class CursorPaginationStrategy(PaginationStrategy): 26 """ 27 Pagination strategy that evaluates an interpolated string to define the next page token 28 29 Attributes: 30 page_size (Optional[int]): the number of records to request 31 cursor_value (Union[InterpolatedString, str]): template string evaluating to the cursor value 32 config (Config): connection config 33 stop_condition (Optional[InterpolatedBoolean]): template string evaluating when to stop paginating 34 decoder (Decoder): decoder to decode the response 35 """ 36 37 cursor_value: Union[InterpolatedString, str] 38 config: Config 39 parameters: InitVar[Mapping[str, Any]] 40 page_size: Optional[int] = None 41 stop_condition: Optional[Union[InterpolatedBoolean, str]] = None 42 decoder: Decoder = field( 43 default_factory=lambda: PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 44 ) 45 46 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 47 if isinstance(self.cursor_value, str): 48 self._cursor_value = InterpolatedString.create(self.cursor_value, parameters=parameters) 49 else: 50 self._cursor_value = self.cursor_value 51 if isinstance(self.stop_condition, str): 52 self._stop_condition: Optional[InterpolatedBoolean] = InterpolatedBoolean( 53 condition=self.stop_condition, parameters=parameters 54 ) 55 else: 56 self._stop_condition = self.stop_condition 57 58 @property 59 def initial_token(self) -> Optional[Any]: 60 """ 61 CursorPaginationStrategy does not have an initial value because the next cursor is typically included 62 in the response of the first request. For Resumable Full Refresh streams that checkpoint the page 63 cursor, the next cursor should be read from the state or stream slice object. 64 """ 65 return None 66 67 def next_page_token( 68 self, 69 response: requests.Response, 70 last_page_size: int, 71 last_record: Optional[Record], 72 last_page_token_value: Optional[Any] = None, 73 ) -> Optional[Any]: 74 decoded_response = next(self.decoder.decode(response)) 75 # The default way that link is presented in requests.Response is a string of various links (last, next, etc). This 76 # is not indexable or useful for parsing the cursor, so we replace it with the link dictionary from response.links 77 headers: Dict[str, Any] = dict(response.headers) 78 headers["link"] = response.links 79 if self._stop_condition: 80 should_stop = self._stop_condition.eval( 81 self.config, 82 response=decoded_response, 83 headers=headers, 84 last_record=last_record, 85 last_page_size=last_page_size, 86 ) 87 if should_stop: 88 return None 89 token = self._cursor_value.eval( 90 config=self.config, 91 response=decoded_response, 92 headers=headers, 93 last_record=last_record, 94 last_page_size=last_page_size, 95 ) 96 return token if token else None 97 98 def get_page_size(self) -> Optional[int]: 99 return self.page_size
Pagination strategy that evaluates an interpolated string to define the next page token
Attributes:
- page_size (Optional[int]): the number of records to request
- cursor_value (Union[InterpolatedString, str]): template string evaluating to the cursor value
- config (Config): connection config
- stop_condition (Optional[InterpolatedBoolean]): template string evaluating when to stop paginating
- decoder (Decoder): decoder to decode the response
58 @property 59 def initial_token(self) -> Optional[Any]: 60 """ 61 CursorPaginationStrategy does not have an initial value because the next cursor is typically included 62 in the response of the first request. For Resumable Full Refresh streams that checkpoint the page 63 cursor, the next cursor should be read from the state or stream slice object. 64 """ 65 return None
CursorPaginationStrategy does not have an initial value because the next cursor is typically included in the response of the first request. For Resumable Full Refresh streams that checkpoint the page cursor, the next cursor should be read from the state or stream slice object.
67 def next_page_token( 68 self, 69 response: requests.Response, 70 last_page_size: int, 71 last_record: Optional[Record], 72 last_page_token_value: Optional[Any] = None, 73 ) -> Optional[Any]: 74 decoded_response = next(self.decoder.decode(response)) 75 # The default way that link is presented in requests.Response is a string of various links (last, next, etc). This 76 # is not indexable or useful for parsing the cursor, so we replace it with the link dictionary from response.links 77 headers: Dict[str, Any] = dict(response.headers) 78 headers["link"] = response.links 79 if self._stop_condition: 80 should_stop = self._stop_condition.eval( 81 self.config, 82 response=decoded_response, 83 headers=headers, 84 last_record=last_record, 85 last_page_size=last_page_size, 86 ) 87 if should_stop: 88 return None 89 token = self._cursor_value.eval( 90 config=self.config, 91 response=decoded_response, 92 headers=headers, 93 last_record=last_record, 94 last_page_size=last_page_size, 95 ) 96 return token if token else None
Parameters
- response: response to process
- last_page_size: the number of records read from the response
- last_record: the last record extracted from the response
- last_page_token_value: The current value of the page token made on the last request
Returns
The next page token, or None if there are no more pages to fetch.
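A hedged end-to-end sketch: a fake JSON response carries the cursor, the interpolated cursor_value extracts it, and the stop_condition halts pagination once the cursor disappears. The import path is assumed and the payload is made up.

```python
import json

import requests

from airbyte_cdk.sources.declarative.requesters.paginators.strategies import (  # path assumed
    CursorPaginationStrategy,
)

# Build a fake HTTP response carrying a cursor in its JSON body.
response = requests.Response()
response.status_code = 200
response._content = json.dumps({"data": [{"id": 1}], "next_cursor": "abc123"}).encode()
response.headers["Content-Type"] = "application/json"

strategy = CursorPaginationStrategy(
    cursor_value="{{ response.next_cursor }}",
    stop_condition="{{ not response.next_cursor }}",
    config={},
    parameters={},
)

token = strategy.next_page_token(response, last_page_size=1, last_record=None)
# token == "abc123"; once a response no longer contains next_cursor, the stop condition
# evaluates to True and next_page_token returns None.
```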
28@dataclass 29class DatetimeBasedCursor(DeclarativeCursor): 30 """ 31 Slices the stream over a datetime range and create a state with format {<cursor_field>: <datetime> } 32 33 Given a start time, end time, a step function, and an optional lookback window, 34 the stream slicer will partition the date range from start time - lookback window to end time. 35 36 The step function is defined as a string of the form ISO8601 duration 37 38 The timestamp format accepts the same format codes as datetime.strfptime, which are 39 all the format codes required by the 1989 C standard. 40 Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html 41 42 Attributes: 43 start_datetime (Union[MinMaxDatetime, str]): the datetime that determines the earliest record that should be synced 44 end_datetime (Optional[Union[MinMaxDatetime, str]]): the datetime that determines the last record that should be synced 45 cursor_field (Union[InterpolatedString, str]): record's cursor field 46 datetime_format (str): format of the datetime 47 step (Optional[str]): size of the timewindow (ISO8601 duration) 48 cursor_granularity (Optional[str]): smallest increment the datetime_format has (ISO 8601 duration) that will be used to ensure that the start of a slice does not overlap with the end of the previous one 49 config (Config): connection config 50 start_time_option (Optional[RequestOption]): request option for start time 51 end_time_option (Optional[RequestOption]): request option for end time 52 partition_field_start (Optional[str]): partition start time field 53 partition_field_end (Optional[str]): stream slice end time field 54 lookback_window (Optional[InterpolatedString]): how many days before start_datetime to read data for (ISO8601 duration) 55 """ 56 57 start_datetime: Union[MinMaxDatetime, str] 58 cursor_field: Union[InterpolatedString, str] 59 datetime_format: str 60 config: Config 61 parameters: InitVar[Mapping[str, Any]] 62 _highest_observed_cursor_field_value: Optional[str] = field( 63 repr=False, default=None 64 ) # tracks the latest observed datetime, which may not be safe to emit in the case of out-of-order records 65 _cursor: Optional[str] = field( 66 repr=False, default=None 67 ) # tracks the latest observed datetime that is appropriate to emit as stream state 68 end_datetime: Optional[Union[MinMaxDatetime, str]] = None 69 step: Optional[Union[InterpolatedString, str]] = None 70 cursor_granularity: Optional[str] = None 71 start_time_option: Optional[RequestOption] = None 72 end_time_option: Optional[RequestOption] = None 73 partition_field_start: Optional[str] = None 74 partition_field_end: Optional[str] = None 75 lookback_window: Optional[Union[InterpolatedString, str]] = None 76 message_repository: Optional[MessageRepository] = None 77 is_compare_strictly: Optional[bool] = False 78 cursor_datetime_formats: List[str] = field(default_factory=lambda: []) 79 80 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 81 if (self.step and not self.cursor_granularity) or ( 82 not self.step and self.cursor_granularity 83 ): 84 raise ValueError( 85 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 86 f"Right now, step is `{self.step}` and cursor_granularity is `{self.cursor_granularity}`" 87 ) 88 self._start_datetime = MinMaxDatetime.create(self.start_datetime, parameters) 89 self._end_datetime = ( 90 None if not self.end_datetime else MinMaxDatetime.create(self.end_datetime, parameters) 91 ) 92 93 self._timezone = datetime.timezone.utc 94 self._interpolation = JinjaInterpolation() 95 96 self._step = ( 97 self._parse_timedelta( 98 InterpolatedString.create(self.step, parameters=parameters).eval(self.config) 99 ) 100 if self.step 101 else datetime.timedelta.max 102 ) 103 self._cursor_granularity = self._parse_timedelta(self.cursor_granularity) 104 self.cursor_field = InterpolatedString.create(self.cursor_field, parameters=parameters) 105 self._lookback_window = ( 106 InterpolatedString.create(self.lookback_window, parameters=parameters) 107 if self.lookback_window 108 else None 109 ) 110 self._partition_field_start = InterpolatedString.create( 111 self.partition_field_start or "start_time", parameters=parameters 112 ) 113 self._partition_field_end = InterpolatedString.create( 114 self.partition_field_end or "end_time", parameters=parameters 115 ) 116 self._parser = DatetimeParser() 117 118 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 119 if not self._start_datetime.datetime_format: 120 self._start_datetime.datetime_format = self.datetime_format 121 if self._end_datetime and not self._end_datetime.datetime_format: 122 self._end_datetime.datetime_format = self.datetime_format 123 124 if not self.cursor_datetime_formats: 125 self.cursor_datetime_formats = [self.datetime_format] 126 127 _validate_component_request_option_paths( 128 self.config, self.start_time_option, self.end_time_option 129 ) 130 131 def get_stream_state(self) -> StreamState: 132 return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {} # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 133 134 def set_initial_state(self, stream_state: StreamState) -> None: 135 """ 136 Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called 137 before calling anything else 138 139 :param stream_state: The state of the stream as returned by get_stream_state 140 """ 141 self._cursor = ( 142 stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None # type: ignore [union-attr] 143 ) 144 145 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 146 """ 147 Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read. 148 149 :param stream_slice: The current slice, which may or may not contain the most recently observed record 150 :param record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the 151 stream state may need to be deferred depending on whether the source reliably orders records by the cursor field. 
152 """ 153 record_cursor_value = record.get(self.cursor_field.eval(self.config)) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 154 # if the current record has no cursor value, we cannot meaningfully update the state based on it, so there is nothing more to do 155 if not record_cursor_value: 156 return 157 158 start_field = self._partition_field_start.eval(self.config) 159 end_field = self._partition_field_end.eval(self.config) 160 is_highest_observed_cursor_value = ( 161 not self._highest_observed_cursor_field_value 162 or self.parse_date(record_cursor_value) 163 > self.parse_date(self._highest_observed_cursor_field_value) 164 ) 165 if ( 166 self._is_within_daterange_boundaries( 167 record, 168 stream_slice.get(start_field), # type: ignore [arg-type] 169 stream_slice.get(end_field), # type: ignore [arg-type] 170 ) 171 and is_highest_observed_cursor_value 172 ): 173 self._highest_observed_cursor_field_value = record_cursor_value 174 175 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 176 if stream_slice.partition: 177 raise ValueError( 178 f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}." 179 ) 180 cursor_value_str_by_cursor_value_datetime = dict( 181 map( 182 # we need to ensure the cursor value is preserved as is in the state else the CATs might complain of something like 183 # 2023-01-04T17:30:19.000Z' <= '2023-01-04T17:30:19.000000Z' 184 lambda datetime_str: (self.parse_date(datetime_str), datetime_str), # type: ignore # because of the filter on the next line, this will only be called with a str 185 filter( 186 lambda item: item, [self._cursor, self._highest_observed_cursor_field_value] 187 ), 188 ) 189 ) 190 self._cursor = ( 191 cursor_value_str_by_cursor_value_datetime[ 192 max(cursor_value_str_by_cursor_value_datetime.keys()) 193 ] 194 if cursor_value_str_by_cursor_value_datetime 195 else None 196 ) 197 198 def stream_slices(self) -> Iterable[StreamSlice]: 199 """ 200 Partition the daterange into slices of size = step. 201 202 The start of the window is the minimum datetime between start_datetime - lookback_window and the stream_state's datetime 203 The end of the window is the minimum datetime between the start of the window and end_datetime. 204 205 :return: 206 """ 207 end_datetime = self.select_best_end_datetime() 208 start_datetime = self._calculate_earliest_possible_value(self.select_best_end_datetime()) 209 return self._partition_daterange(start_datetime, end_datetime, self._step) 210 211 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 212 # Datetime based cursors operate over slices made up of datetime ranges. Stream state is based on the progress 213 # through each slice and does not belong to a specific slice. We just return stream state as it is. 214 return self.get_stream_state() 215 216 def _calculate_earliest_possible_value( 217 self, end_datetime: datetime.datetime 218 ) -> datetime.datetime: 219 lookback_delta = self._parse_timedelta( 220 self._lookback_window.eval(self.config) if self._lookback_window else "P0D" 221 ) 222 earliest_possible_start_datetime = min( 223 self._start_datetime.get_datetime(self.config), end_datetime 224 ) 225 try: 226 cursor_datetime = ( 227 self._calculate_cursor_datetime_from_state(self.get_stream_state()) - lookback_delta 228 ) 229 except OverflowError: 230 # cursor_datetime defers to the minimum date if it does not exist in the state. 
Trying to subtract 231 # a timedelta from the minimum datetime results in an OverflowError 232 cursor_datetime = self._calculate_cursor_datetime_from_state(self.get_stream_state()) 233 return max(earliest_possible_start_datetime, cursor_datetime) 234 235 def select_best_end_datetime(self) -> datetime.datetime: 236 """ 237 Returns the optimal end datetime. 238 This method compares the current datetime with a pre-configured end datetime 239 and returns the earlier of the two. If no pre-configured end datetime is set, 240 the current datetime is returned. 241 242 :return datetime.datetime: The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier. 243 """ 244 now = datetime.datetime.now(tz=self._timezone) 245 if not self._end_datetime: 246 return now 247 return min(self._end_datetime.get_datetime(self.config), now) 248 249 def _calculate_cursor_datetime_from_state( 250 self, stream_state: Mapping[str, Any] 251 ) -> datetime.datetime: 252 if self.cursor_field.eval(self.config, stream_state=stream_state) in stream_state: # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 253 return self.parse_date(stream_state[self.cursor_field.eval(self.config)]) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 254 return datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) 255 256 def _format_datetime(self, dt: datetime.datetime) -> str: 257 return self._parser.format(dt, self.datetime_format) 258 259 def _partition_daterange( 260 self, 261 start: datetime.datetime, 262 end: datetime.datetime, 263 step: Union[datetime.timedelta, Duration], 264 ) -> List[StreamSlice]: 265 start_field = self._partition_field_start.eval(self.config) 266 end_field = self._partition_field_end.eval(self.config) 267 dates = [] 268 269 while self._is_within_date_range(start, end): 270 next_start = self._evaluate_next_start_date_safely(start, step) 271 end_date = self._get_date(next_start - self._cursor_granularity, end, min) 272 dates.append( 273 StreamSlice( 274 partition={}, 275 cursor_slice={ 276 start_field: self._format_datetime(start), 277 end_field: self._format_datetime(end_date), 278 }, 279 ) 280 ) 281 start = next_start 282 return dates 283 284 def _is_within_date_range(self, start: datetime.datetime, end: datetime.datetime) -> bool: 285 if self.is_compare_strictly: 286 return start < end 287 return start <= end 288 289 def _evaluate_next_start_date_safely( 290 self, start: datetime.datetime, step: datetime.timedelta 291 ) -> datetime.datetime: 292 """ 293 Given that we set the default step at datetime.timedelta.max, we will generate an OverflowError when evaluating the next start_date 294 This method assumes that users would never enter a step that would generate an overflow. Given that would be the case, the code 295 would have broken anyway. 
296 """ 297 try: 298 return start + step 299 except OverflowError: 300 return datetime.datetime.max.replace(tzinfo=datetime.timezone.utc) 301 302 def _get_date( 303 self, 304 cursor_value: datetime.datetime, 305 default_date: datetime.datetime, 306 comparator: Callable[[datetime.datetime, datetime.datetime], datetime.datetime], 307 ) -> datetime.datetime: 308 cursor_date = cursor_value or default_date 309 return comparator(cursor_date, default_date) 310 311 def parse_date(self, date: str) -> datetime.datetime: 312 for datetime_format in self.cursor_datetime_formats + [self.datetime_format]: 313 try: 314 return self._parser.parse(date, datetime_format) 315 except ValueError: 316 pass 317 raise ValueError(f"No format in {self.cursor_datetime_formats} matching {date}") 318 319 @classmethod 320 def _parse_timedelta(cls, time_str: Optional[str]) -> Union[datetime.timedelta, Duration]: 321 """ 322 :return Parses an ISO 8601 durations into datetime.timedelta or Duration objects. 323 """ 324 if not time_str: 325 return datetime.timedelta(0) 326 return parse_duration(time_str) 327 328 def get_request_params( 329 self, 330 *, 331 stream_state: Optional[StreamState] = None, 332 stream_slice: Optional[StreamSlice] = None, 333 next_page_token: Optional[Mapping[str, Any]] = None, 334 ) -> Mapping[str, Any]: 335 return self._get_request_options(RequestOptionType.request_parameter, stream_slice) 336 337 def get_request_headers( 338 self, 339 *, 340 stream_state: Optional[StreamState] = None, 341 stream_slice: Optional[StreamSlice] = None, 342 next_page_token: Optional[Mapping[str, Any]] = None, 343 ) -> Mapping[str, Any]: 344 return self._get_request_options(RequestOptionType.header, stream_slice) 345 346 def get_request_body_data( 347 self, 348 *, 349 stream_state: Optional[StreamState] = None, 350 stream_slice: Optional[StreamSlice] = None, 351 next_page_token: Optional[Mapping[str, Any]] = None, 352 ) -> Mapping[str, Any]: 353 return self._get_request_options(RequestOptionType.body_data, stream_slice) 354 355 def get_request_body_json( 356 self, 357 *, 358 stream_state: Optional[StreamState] = None, 359 stream_slice: Optional[StreamSlice] = None, 360 next_page_token: Optional[Mapping[str, Any]] = None, 361 ) -> Mapping[str, Any]: 362 return self._get_request_options(RequestOptionType.body_json, stream_slice) 363 364 def request_kwargs(self) -> Mapping[str, Any]: 365 # Never update kwargs 366 return {} 367 368 def _get_request_options( 369 self, option_type: RequestOptionType, stream_slice: Optional[StreamSlice] 370 ) -> Mapping[str, Any]: 371 options: MutableMapping[str, Any] = {} 372 if not stream_slice: 373 return options 374 375 if self.start_time_option and self.start_time_option.inject_into == option_type: 376 start_time_value = stream_slice.get(self._partition_field_start.eval(self.config)) 377 self.start_time_option.inject_into_request(options, start_time_value, self.config) 378 379 if self.end_time_option and self.end_time_option.inject_into == option_type: 380 end_time_value = stream_slice.get(self._partition_field_end.eval(self.config)) 381 self.end_time_option.inject_into_request(options, end_time_value, self.config) 382 383 return options 384 385 def should_be_synced(self, record: Record) -> bool: 386 cursor_field = self.cursor_field.eval(self.config) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 387 record_cursor_value = record.get(cursor_field) 388 if not record_cursor_value: 389 self._send_log( 390 Level.WARN, 391 f"Could not find cursor field 
`{cursor_field}` in record. The incremental sync will assume it needs to be synced", 392 ) 393 return True 394 latest_possible_cursor_value = self.select_best_end_datetime() 395 earliest_possible_cursor_value = self._calculate_earliest_possible_value( 396 latest_possible_cursor_value 397 ) 398 return self._is_within_daterange_boundaries( 399 record, earliest_possible_cursor_value, latest_possible_cursor_value 400 ) 401 402 def _is_within_daterange_boundaries( 403 self, 404 record: Record, 405 start_datetime_boundary: Union[datetime.datetime, str], 406 end_datetime_boundary: Union[datetime.datetime, str], 407 ) -> bool: 408 cursor_field = self.cursor_field.eval(self.config) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 409 record_cursor_value = record.get(cursor_field) 410 if not record_cursor_value: 411 self._send_log( 412 Level.WARN, 413 f"Could not find cursor field `{cursor_field}` in record. The record will not be considered when emitting sync state", 414 ) 415 return False 416 if isinstance(start_datetime_boundary, str): 417 start_datetime_boundary = self.parse_date(start_datetime_boundary) 418 if isinstance(end_datetime_boundary, str): 419 end_datetime_boundary = self.parse_date(end_datetime_boundary) 420 return ( 421 start_datetime_boundary <= self.parse_date(record_cursor_value) <= end_datetime_boundary 422 ) 423 424 def _send_log(self, level: Level, message: str) -> None: 425 if self.message_repository: 426 self.message_repository.emit_message( 427 AirbyteMessage( 428 type=Type.LOG, 429 log=AirbyteLogMessage(level=level, message=message), 430 ) 431 ) 432 433 def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: 434 cursor_field = self.cursor_field.eval(self.config) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 435 first_cursor_value = first.get(cursor_field) 436 second_cursor_value = second.get(cursor_field) 437 if first_cursor_value and second_cursor_value: 438 return self.parse_date(first_cursor_value) >= self.parse_date(second_cursor_value) 439 elif first_cursor_value: 440 return True 441 else: 442 return False 443 444 def set_runtime_lookback_window(self, lookback_window_in_seconds: int) -> None: 445 """ 446 Updates the lookback window based on a given number of seconds if the new duration 447 is greater than the currently configured lookback window. 448 449 :param lookback_window_in_seconds: The lookback duration in seconds to potentially update to. 450 """ 451 runtime_lookback_window = duration_isoformat(timedelta(seconds=lookback_window_in_seconds)) 452 config_lookback = parse_duration( 453 self._lookback_window.eval(self.config) if self._lookback_window else "P0D" 454 ) 455 456 # Check if the new runtime lookback window is greater than the current config lookback 457 if parse_duration(runtime_lookback_window) > config_lookback: 458 self._lookback_window = InterpolatedString.create( 459 runtime_lookback_window, parameters={} 460 )
Slices the stream over a datetime range and creates a state of the form {<cursor_field>: <datetime>}.
Given a start time, end time, a step function, and an optional lookback window, the stream slicer will partition the date range from start time - lookback window to end time.
The step is defined as an ISO 8601 duration string.
The timestamp format accepts the same format codes as datetime.strftime, which are all the format codes required by the 1989 C standard. Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html. A minimal configuration sketch follows the attribute list below.
Attributes:
- start_datetime (Union[MinMaxDatetime, str]): the datetime that determines the earliest record that should be synced
- end_datetime (Optional[Union[MinMaxDatetime, str]]): the datetime that determines the last record that should be synced
- cursor_field (Union[InterpolatedString, str]): record's cursor field
- datetime_format (str): format of the datetime
- step (Optional[str]): size of the timewindow (ISO8601 duration)
- cursor_granularity (Optional[str]): smallest increment the datetime_format has (ISO 8601 duration) that will be used to ensure that the start of a slice does not overlap with the end of the previous one
- config (Config): connection config
- start_time_option (Optional[RequestOption]): request option for start time
- end_time_option (Optional[RequestOption]): request option for end time
- partition_field_start (Optional[str]): partition start time field
- partition_field_end (Optional[str]): stream slice end time field
- lookback_window (Optional[InterpolatedString]): how far before start_datetime to read data (ISO 8601 duration)
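To make these attributes concrete, here is a minimal, hypothetical sketch of constructing and slicing a DatetimeBasedCursor directly in Python. Every concrete value (the start_date config key, the updated_at cursor field, the 30-day step) is an illustrative assumption, not something prescribed by the CDK.

```python
# Hypothetical sketch only: all concrete values (config keys, cursor field, step size)
# are illustrative assumptions.
from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime
from airbyte_cdk.sources.declarative.incremental import DatetimeBasedCursor

config = {"start_date": "2021-01-01T00:00:00Z"}

cursor = DatetimeBasedCursor(
    start_datetime=MinMaxDatetime(
        datetime="{{ config['start_date'] }}",
        datetime_format="%Y-%m-%dT%H:%M:%SZ",
        parameters={},
    ),
    cursor_field="updated_at",
    datetime_format="%Y-%m-%dT%H:%M:%SZ",
    step="P30D",                # partition the range into 30-day windows
    cursor_granularity="PT1S",  # keep consecutive windows from overlapping
    lookback_window="P3D",      # re-read the trailing 3 days on every sync
    config=config,
    parameters={},
)

cursor.set_initial_state({})  # cursors must be given their state before anything else
for stream_slice in cursor.stream_slices():
    # each slice carries its formatted window boundaries in cursor_slice,
    # keyed by partition_field_start / partition_field_end
    print(stream_slice.cursor_slice)
```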
131 def get_stream_state(self) -> StreamState: 132 return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {} # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
Returns the current stream state. We would like to restrict its usage since it exposes internals of the state. As of 2023-06-14, it is used for the following:
- Interpolation of the requests
- Transformation of records
- Saving the state
For the first case, we are probably stuck with exposing the stream state. For the second, we can probably expose a method that allows for emitting the state to the platform.
134 def set_initial_state(self, stream_state: StreamState) -> None: 135 """ 136 Cursors are not initialized with their state. As state is needed in order to function properly, this method should be called 137 before calling anything else 138 139 :param stream_state: The state of the stream as returned by get_stream_state 140 """ 141 self._cursor = ( 142 stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None # type: ignore [union-attr] 143 )
Cursors are not initialized with their state. As state is needed for the cursor to function properly, this method should be called before anything else.
Parameters
- stream_state: The state of the stream as returned by get_stream_state
145 def observe(self, stream_slice: StreamSlice, record: Record) -> None: 146 """ 147 Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read. 148 149 :param stream_slice: The current slice, which may or may not contain the most recently observed record 150 :param record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the 151 stream state may need to be deferred depending on whether the source reliably orders records by the cursor field. 152 """ 153 record_cursor_value = record.get(self.cursor_field.eval(self.config)) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 154 # if the current record has no cursor value, we cannot meaningfully update the state based on it, so there is nothing more to do 155 if not record_cursor_value: 156 return 157 158 start_field = self._partition_field_start.eval(self.config) 159 end_field = self._partition_field_end.eval(self.config) 160 is_highest_observed_cursor_value = ( 161 not self._highest_observed_cursor_field_value 162 or self.parse_date(record_cursor_value) 163 > self.parse_date(self._highest_observed_cursor_field_value) 164 ) 165 if ( 166 self._is_within_daterange_boundaries( 167 record, 168 stream_slice.get(start_field), # type: ignore [arg-type] 169 stream_slice.get(end_field), # type: ignore [arg-type] 170 ) 171 and is_highest_observed_cursor_value 172 ): 173 self._highest_observed_cursor_field_value = record_cursor_value
Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress stream read.
Parameters
- stream_slice: The current slice, which may or may not contain the most recently observed record
- record: the most recently-read record, which the cursor can use to update the stream state. Outwardly-visible changes to the stream state may need to be deferred depending on whether the source reliably orders records by the cursor field.
175 def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: 176 if stream_slice.partition: 177 raise ValueError( 178 f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}." 179 ) 180 cursor_value_str_by_cursor_value_datetime = dict( 181 map( 182 # we need to ensure the cursor value is preserved as is in the state else the CATs might complain of something like 183 # 2023-01-04T17:30:19.000Z' <= '2023-01-04T17:30:19.000000Z' 184 lambda datetime_str: (self.parse_date(datetime_str), datetime_str), # type: ignore # because of the filter on the next line, this will only be called with a str 185 filter( 186 lambda item: item, [self._cursor, self._highest_observed_cursor_field_value] 187 ), 188 ) 189 ) 190 self._cursor = ( 191 cursor_value_str_by_cursor_value_datetime[ 192 max(cursor_value_str_by_cursor_value_datetime.keys()) 193 ] 194 if cursor_value_str_by_cursor_value_datetime 195 else None 196 )
Update state based on the stream slice. Note that stream_slice.cursor_slice and most_recent_record.associated_slice are expected to be the same, but we make it explicit here that stream_slice should be leveraged to update the state. We do not pass in the latest record, since cursor instances should maintain the relevant internal state on their own.
Parameters
- stream_slice: slice to close
198 def stream_slices(self) -> Iterable[StreamSlice]: 199 """ 200 Partition the daterange into slices of size = step. 201 202 The start of the window is the minimum datetime between start_datetime - lookback_window and the stream_state's datetime 203 The end of the window is the minimum datetime between the start of the window and end_datetime. 204 205 :return: 206 """ 207 end_datetime = self.select_best_end_datetime() 208 start_datetime = self._calculate_earliest_possible_value(self.select_best_end_datetime()) 209 return self._partition_daterange(start_datetime, end_datetime, self._step)
Partition the daterange into slices of size = step.
The start of the window is the later of the configured start_datetime and the stream state's cursor datetime minus the lookback window. The end of the window is the earlier of the configured end_datetime and the current time.
211 def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: 212 # Datetime based cursors operate over slices made up of datetime ranges. Stream state is based on the progress 213 # through each slice and does not belong to a specific slice. We just return stream state as it is. 214 return self.get_stream_state()
Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of a specific parent delineated by the incoming slice's partition object.
235 def select_best_end_datetime(self) -> datetime.datetime: 236 """ 237 Returns the optimal end datetime. 238 This method compares the current datetime with a pre-configured end datetime 239 and returns the earlier of the two. If no pre-configured end datetime is set, 240 the current datetime is returned. 241 242 :return datetime.datetime: The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier. 243 """ 244 now = datetime.datetime.now(tz=self._timezone) 245 if not self._end_datetime: 246 return now 247 return min(self._end_datetime.get_datetime(self.config), now)
Returns the optimal end datetime. This method compares the current datetime with a pre-configured end datetime and returns the earlier of the two. If no pre-configured end datetime is set, the current datetime is returned.
Returns
The best end datetime, which is either the current datetime or the pre-configured end datetime, whichever is earlier.
311 def parse_date(self, date: str) -> datetime.datetime: 312 for datetime_format in self.cursor_datetime_formats + [self.datetime_format]: 313 try: 314 return self._parser.parse(date, datetime_format) 315 except ValueError: 316 pass 317 raise ValueError(f"No format in {self.cursor_datetime_formats} matching {date}")
328 def get_request_params( 329 self, 330 *, 331 stream_state: Optional[StreamState] = None, 332 stream_slice: Optional[StreamSlice] = None, 333 next_page_token: Optional[Mapping[str, Any]] = None, 334 ) -> Mapping[str, Any]: 335 return self._get_request_options(RequestOptionType.request_parameter, stream_slice)
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g.: you might want to define query parameters for paging if next_page_token is not None. The sketch below shows how the cursor's start_time_option and end_time_option feed these parameters.
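The cursor fills these query parameters from its start_time_option and end_time_option. The sketch below is hypothetical: the since/until parameter names are assumptions about the upstream API, not CDK defaults.

```python
# Sketch under assumptions: the API accepts "since"/"until" query parameters.
from airbyte_cdk.sources.declarative.requesters.request_option import (
    RequestOption,
    RequestOptionType,
)

start_time_option = RequestOption(
    field_name="since",
    inject_into=RequestOptionType.request_parameter,
    parameters={},
)
end_time_option = RequestOption(
    field_name="until",
    inject_into=RequestOptionType.request_parameter,
    parameters={},
)

# When passed as start_time_option / end_time_option on the DatetimeBasedCursor above,
# get_request_params(stream_slice=stream_slice) would return something like
# {"since": "<slice start>", "until": "<slice end>"} for each slice.
```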
337 def get_request_headers( 338 self, 339 *, 340 stream_state: Optional[StreamState] = None, 341 stream_slice: Optional[StreamSlice] = None, 342 next_page_token: Optional[Mapping[str, Any]] = None, 343 ) -> Mapping[str, Any]: 344 return self._get_request_options(RequestOptionType.header, stream_slice)
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
346 def get_request_body_data( 347 self, 348 *, 349 stream_state: Optional[StreamState] = None, 350 stream_slice: Optional[StreamSlice] = None, 351 next_page_token: Optional[Mapping[str, Any]] = None, 352 ) -> Mapping[str, Any]: 353 return self._get_request_options(RequestOptionType.body_data, stream_slice)
Specifies how to populate the body of the request with a non-JSON payload.
If it returns plain text, it will be sent as-is. If it returns a dict, it will be converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2".
Only one of the 'request_body_data' and 'request_body_json' functions may be overridden.
355 def get_request_body_json( 356 self, 357 *, 358 stream_state: Optional[StreamState] = None, 359 stream_slice: Optional[StreamSlice] = None, 360 next_page_token: Optional[Mapping[str, Any]] = None, 361 ) -> Mapping[str, Any]: 362 return self._get_request_options(RequestOptionType.body_json, stream_slice)
Specifies how to populate the body of the request with a JSON payload.
Only one of the 'request_body_data' and 'request_body_json' functions may be overridden.
385 def should_be_synced(self, record: Record) -> bool: 386 cursor_field = self.cursor_field.eval(self.config) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 387 record_cursor_value = record.get(cursor_field) 388 if not record_cursor_value: 389 self._send_log( 390 Level.WARN, 391 f"Could not find cursor field `{cursor_field}` in record. The incremental sync will assume it needs to be synced", 392 ) 393 return True 394 latest_possible_cursor_value = self.select_best_end_datetime() 395 earliest_possible_cursor_value = self._calculate_earliest_possible_value( 396 latest_possible_cursor_value 397 ) 398 return self._is_within_daterange_boundaries( 399 record, earliest_possible_cursor_value, latest_possible_cursor_value 400 )
Evaluates whether a record should be synced. This enables record filtering and serves as a stop condition for pagination.
433 def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: 434 cursor_field = self.cursor_field.eval(self.config) # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__ 435 first_cursor_value = first.get(cursor_field) 436 second_cursor_value = second.get(cursor_field) 437 if first_cursor_value and second_cursor_value: 438 return self.parse_date(first_cursor_value) >= self.parse_date(second_cursor_value) 439 elif first_cursor_value: 440 return True 441 else: 442 return False
Evaluates which record is greater in terms of its cursor value. This is used to avoid having to capture all the records in order to close a slice.
444 def set_runtime_lookback_window(self, lookback_window_in_seconds: int) -> None: 445 """ 446 Updates the lookback window based on a given number of seconds if the new duration 447 is greater than the currently configured lookback window. 448 449 :param lookback_window_in_seconds: The lookback duration in seconds to potentially update to. 450 """ 451 runtime_lookback_window = duration_isoformat(timedelta(seconds=lookback_window_in_seconds)) 452 config_lookback = parse_duration( 453 self._lookback_window.eval(self.config) if self._lookback_window else "P0D" 454 ) 455 456 # Check if the new runtime lookback window is greater than the current config lookback 457 if parse_duration(runtime_lookback_window) > config_lookback: 458 self._lookback_window = InterpolatedString.create( 459 runtime_lookback_window, parameters={} 460 )
Updates the lookback window based on a given number of seconds if the new duration is greater than the currently configured lookback window.
Parameters
- lookback_window_in_seconds: The lookback duration in seconds to potentially update to.
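Continuing the hypothetical cursor sketch earlier on this page, a caller (for example, a coordinating parent-stream cursor) might widen the window at runtime:

```python
# Continuing the DatetimeBasedCursor sketch above; the six-hour value is illustrative.
# The lookback window only changes if the new duration exceeds the configured one.
cursor.set_runtime_lookback_window(lookback_window_in_seconds=6 * 60 * 60)
```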
14@dataclass 15class DeclarativeAuthenticator(AbstractHeaderAuthenticator): 16 """ 17 Interface used to associate which authenticators can be used as part of the declarative framework 18 """ 19 20 def get_request_params(self) -> Mapping[str, Any]: 21 """HTTP request parameter to add to the requests""" 22 return {} 23 24 def get_request_body_data(self) -> Union[Mapping[str, Any], str]: 25 """Form-encoded body data to set on the requests""" 26 return {} 27 28 def get_request_body_json(self) -> Mapping[str, Any]: 29 """JSON-encoded body data to set on the requests""" 30 return {}
Interface used to associate which authenticators can be used as part of the declarative framework
20 def get_request_params(self) -> Mapping[str, Any]: 21 """HTTP request parameter to add to the requests""" 22 return {}
HTTP request parameter to add to the requests
24 def get_request_body_data(self) -> Union[Mapping[str, Any], str]: 25 """Form-encoded body data to set on the requests""" 26 return {}
Form-encoded body data to set on the requests
28 def get_request_body_json(self) -> Mapping[str, Any]: 29 """JSON-encoded body data to set on the requests""" 30 return {}
JSON-encoded body data to set on the requests
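As a hedged illustration of this interface, the sketch below defines a static bearer-token authenticator. It assumes the AbstractHeaderAuthenticator contract of auth_header/token properties; the class name and api_token field are hypothetical.

```python
# Hypothetical sketch: a static bearer-token authenticator built on the declarative
# authenticator interface.
from dataclasses import dataclass

from airbyte_cdk.sources.declarative.auth.declarative_authenticator import (
    DeclarativeAuthenticator,
)


@dataclass
class StaticTokenAuthenticator(DeclarativeAuthenticator):
    api_token: str

    @property
    def auth_header(self) -> str:
        # Name of the header the token is injected into.
        return "Authorization"

    @property
    def token(self) -> str:
        return f"Bearer {self.api_token}"


auth = StaticTokenAuthenticator(api_token="my-secret-token")
# auth.get_auth_header()  # -> {"Authorization": "Bearer my-secret-token"}
```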
24@dataclass 25class DeclarativeOauth2Authenticator(AbstractOauth2Authenticator, DeclarativeAuthenticator): 26 """ 27 Generates OAuth2.0 access tokens from an OAuth2.0 refresh token and client credentials based on 28 a declarative connector configuration file. Credentials can be defined explicitly or via interpolation 29 at runtime. The generated access token is attached to each request via the Authorization header. 30 31 Attributes: 32 token_refresh_endpoint (Union[InterpolatedString, str]): The endpoint to refresh the access token 33 client_id (Union[InterpolatedString, str]): The client id 34 client_secret (Union[InterpolatedString, str]): Client secret 35 refresh_token (Union[InterpolatedString, str]): The token used to refresh the access token 36 access_token_name (Union[InterpolatedString, str]): THe field to extract access token from in the response 37 expires_in_name (Union[InterpolatedString, str]): The field to extract expires_in from in the response 38 config (Mapping[str, Any]): The user-provided configuration as specified by the source's spec 39 scopes (Optional[List[str]]): The scopes to request 40 token_expiry_date (Optional[Union[InterpolatedString, str]]): The access token expiration date 41 token_expiry_date_format str: format of the datetime; provide it if expires_in is returned in datetime instead of seconds 42 token_expiry_is_time_of_expiration bool: set True it if expires_in is returned as time of expiration instead of the number seconds until expiration 43 refresh_request_body (Optional[Mapping[str, Any]]): The request body to send in the refresh request 44 refresh_request_headers (Optional[Mapping[str, Any]]): The request headers to send in the refresh request 45 grant_type: The grant_type to request for access_token. If set to refresh_token, the refresh_token parameter has to be provided 46 message_repository (MessageRepository): the message repository used to emit logs on HTTP requests 47 """ 48 49 config: Mapping[str, Any] 50 parameters: InitVar[Mapping[str, Any]] 51 client_id: Optional[Union[InterpolatedString, str]] = None 52 client_secret: Optional[Union[InterpolatedString, str]] = None 53 token_refresh_endpoint: Optional[Union[InterpolatedString, str]] = None 54 refresh_token: Optional[Union[InterpolatedString, str]] = None 55 scopes: Optional[List[str]] = None 56 token_expiry_date: Optional[Union[InterpolatedString, str]] = None 57 _token_expiry_date: Optional[AirbyteDateTime] = field(init=False, repr=False, default=None) 58 token_expiry_date_format: Optional[str] = None 59 token_expiry_is_time_of_expiration: bool = False 60 access_token_name: Union[InterpolatedString, str] = "access_token" 61 access_token_value: Optional[Union[InterpolatedString, str]] = None 62 client_id_name: Union[InterpolatedString, str] = "client_id" 63 client_secret_name: Union[InterpolatedString, str] = "client_secret" 64 expires_in_name: Union[InterpolatedString, str] = "expires_in" 65 refresh_token_name: Union[InterpolatedString, str] = "refresh_token" 66 refresh_request_body: Optional[Mapping[str, Any]] = None 67 refresh_request_headers: Optional[Mapping[str, Any]] = None 68 grant_type_name: Union[InterpolatedString, str] = "grant_type" 69 grant_type: Union[InterpolatedString, str] = "refresh_token" 70 message_repository: MessageRepository = NoopMessageRepository() 71 profile_assertion: Optional[DeclarativeAuthenticator] = None 72 use_profile_assertion: Optional[Union[InterpolatedBoolean, str, bool]] = False 73 74 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 75 
super().__init__() 76 if self.token_refresh_endpoint is not None: 77 self._token_refresh_endpoint: Optional[InterpolatedString] = InterpolatedString.create( 78 self.token_refresh_endpoint, parameters=parameters 79 ) 80 else: 81 self._token_refresh_endpoint = None 82 self._client_id_name = InterpolatedString.create(self.client_id_name, parameters=parameters) 83 self._client_id = ( 84 InterpolatedString.create(self.client_id, parameters=parameters) 85 if self.client_id 86 else self.client_id 87 ) 88 self._client_secret_name = InterpolatedString.create( 89 self.client_secret_name, parameters=parameters 90 ) 91 self._client_secret = ( 92 InterpolatedString.create(self.client_secret, parameters=parameters) 93 if self.client_secret 94 else self.client_secret 95 ) 96 self._refresh_token_name = InterpolatedString.create( 97 self.refresh_token_name, parameters=parameters 98 ) 99 if self.refresh_token is not None: 100 self._refresh_token: Optional[InterpolatedString] = InterpolatedString.create( 101 self.refresh_token, parameters=parameters 102 ) 103 else: 104 self._refresh_token = None 105 self.access_token_name = InterpolatedString.create( 106 self.access_token_name, parameters=parameters 107 ) 108 self.expires_in_name = InterpolatedString.create( 109 self.expires_in_name, parameters=parameters 110 ) 111 self.grant_type_name = InterpolatedString.create( 112 self.grant_type_name, parameters=parameters 113 ) 114 self.grant_type = InterpolatedString.create( 115 "urn:ietf:params:oauth:grant-type:jwt-bearer" 116 if self.use_profile_assertion 117 else self.grant_type, 118 parameters=parameters, 119 ) 120 self._refresh_request_body = InterpolatedMapping( 121 self.refresh_request_body or {}, parameters=parameters 122 ) 123 self._refresh_request_headers = InterpolatedMapping( 124 self.refresh_request_headers or {}, parameters=parameters 125 ) 126 try: 127 if ( 128 isinstance(self.token_expiry_date, (int, str)) 129 and str(self.token_expiry_date).isdigit() 130 ): 131 self._token_expiry_date = ab_datetime_parse(self.token_expiry_date) 132 else: 133 self._token_expiry_date = ( 134 ab_datetime_parse( 135 InterpolatedString.create( 136 self.token_expiry_date, parameters=parameters 137 ).eval(self.config) 138 ) 139 if self.token_expiry_date 140 else ab_datetime_now() - timedelta(days=1) 141 ) 142 except ValueError as e: 143 raise ValueError(f"Invalid token expiry date format: {e}") 144 self.use_profile_assertion = ( 145 InterpolatedBoolean(self.use_profile_assertion, parameters=parameters) 146 if isinstance(self.use_profile_assertion, str) 147 else self.use_profile_assertion 148 ) 149 self.assertion_name = "assertion" 150 151 if self.access_token_value is not None: 152 self._access_token_value = InterpolatedString.create( 153 self.access_token_value, parameters=parameters 154 ).eval(self.config) 155 else: 156 self._access_token_value = None 157 158 self._access_token: Optional[str] = ( 159 self._access_token_value if self.access_token_value else None 160 ) 161 162 if not self.use_profile_assertion and any( 163 client_creds is None for client_creds in [self.client_id, self.client_secret] 164 ): 165 raise ValueError( 166 "OAuthAuthenticator configuration error: Both 'client_id' and 'client_secret' are required for the " 167 "basic OAuth flow." 168 ) 169 if self.profile_assertion is None and self.use_profile_assertion: 170 raise ValueError( 171 "OAuthAuthenticator configuration error: 'profile_assertion' is required when using the profile assertion flow." 
172 ) 173 if self.get_grant_type() == "refresh_token" and self._refresh_token is None: 174 raise ValueError( 175 "OAuthAuthenticator configuration error: A 'refresh_token' is required when the 'grant_type' is set to 'refresh_token'." 176 ) 177 178 def get_token_refresh_endpoint(self) -> Optional[str]: 179 if self._token_refresh_endpoint is not None: 180 refresh_token_endpoint: str = self._token_refresh_endpoint.eval(self.config) 181 if not refresh_token_endpoint: 182 raise ValueError( 183 "OAuthAuthenticator was unable to evaluate token_refresh_endpoint parameter" 184 ) 185 return refresh_token_endpoint 186 return None 187 188 def get_client_id_name(self) -> str: 189 return self._client_id_name.eval(self.config) # type: ignore # eval returns a string in this context 190 191 def get_client_id(self) -> str: 192 client_id = self._client_id.eval(self.config) if self._client_id else self._client_id 193 if not client_id: 194 raise ValueError("OAuthAuthenticator was unable to evaluate client_id parameter") 195 return client_id # type: ignore # value will be returned as a string, or an error will be raised 196 197 def get_client_secret_name(self) -> str: 198 return self._client_secret_name.eval(self.config) # type: ignore # eval returns a string in this context 199 200 def get_client_secret(self) -> str: 201 client_secret = ( 202 self._client_secret.eval(self.config) if self._client_secret else self._client_secret 203 ) 204 if not client_secret: 205 raise ValueError("OAuthAuthenticator was unable to evaluate client_secret parameter") 206 return client_secret # type: ignore # value will be returned as a string, or an error will be raised 207 208 def get_refresh_token_name(self) -> str: 209 return self._refresh_token_name.eval(self.config) # type: ignore # eval returns a string in this context 210 211 def get_refresh_token(self) -> Optional[str]: 212 return None if self._refresh_token is None else str(self._refresh_token.eval(self.config)) 213 214 def get_scopes(self) -> List[str]: 215 return self.scopes or [] 216 217 def get_access_token_name(self) -> str: 218 return self.access_token_name.eval(self.config) # type: ignore # eval returns a string in this context 219 220 def get_expires_in_name(self) -> str: 221 return self.expires_in_name.eval(self.config) # type: ignore # eval returns a string in this context 222 223 def get_grant_type_name(self) -> str: 224 return self.grant_type_name.eval(self.config) # type: ignore # eval returns a string in this context 225 226 def get_grant_type(self) -> str: 227 return self.grant_type.eval(self.config) # type: ignore # eval returns a string in this context 228 229 def get_refresh_request_body(self) -> Mapping[str, Any]: 230 return self._refresh_request_body.eval(self.config) 231 232 def get_refresh_request_headers(self) -> Mapping[str, Any]: 233 return self._refresh_request_headers.eval(self.config) 234 235 def get_token_expiry_date(self) -> AirbyteDateTime: 236 if not self._has_access_token_been_initialized(): 237 return AirbyteDateTime.from_datetime(datetime.min) 238 return self._token_expiry_date # type: ignore # _token_expiry_date is an AirbyteDateTime. 
It is never None despite what mypy thinks 239 240 def _has_access_token_been_initialized(self) -> bool: 241 return self._access_token is not None 242 243 def set_token_expiry_date(self, value: Union[str, int]) -> None: 244 self._token_expiry_date = self._parse_token_expiration_date(value) 245 246 def get_assertion_name(self) -> str: 247 return self.assertion_name 248 249 def get_assertion(self) -> str: 250 if self.profile_assertion is None: 251 raise ValueError("profile_assertion is not set") 252 return self.profile_assertion.token 253 254 def build_refresh_request_body(self) -> Mapping[str, Any]: 255 """ 256 Returns the request body to set on the refresh request 257 258 Override to define additional parameters 259 """ 260 if self.use_profile_assertion: 261 return { 262 self.get_grant_type_name(): self.get_grant_type(), 263 self.get_assertion_name(): self.get_assertion(), 264 } 265 return super().build_refresh_request_body() 266 267 @property 268 def access_token(self) -> str: 269 if self._access_token is None: 270 raise ValueError("access_token is not set") 271 return self._access_token 272 273 @access_token.setter 274 def access_token(self, value: str) -> None: 275 self._access_token = value 276 277 @property 278 def _message_repository(self) -> MessageRepository: 279 """ 280 Overriding AbstractOauth2Authenticator._message_repository to allow for HTTP request logs 281 """ 282 return self.message_repository
Generates OAuth2.0 access tokens from an OAuth2.0 refresh token and client credentials based on a declarative connector configuration file. Credentials can be defined explicitly or via interpolation at runtime. The generated access token is attached to each request via the Authorization header.
Attributes:
- token_refresh_endpoint (Union[InterpolatedString, str]): The endpoint to refresh the access token
- client_id (Union[InterpolatedString, str]): The client id
- client_secret (Union[InterpolatedString, str]): Client secret
- refresh_token (Union[InterpolatedString, str]): The token used to refresh the access token
- access_token_name (Union[InterpolatedString, str]): The field to extract the access token from in the response
- expires_in_name (Union[InterpolatedString, str]): The field to extract expires_in from in the response
- config (Mapping[str, Any]): The user-provided configuration as specified by the source's spec
- scopes (Optional[List[str]]): The scopes to request
- token_expiry_date (Optional[Union[InterpolatedString, str]]): The access token expiration date
- token_expiry_date_format (str): format of the datetime; provide it if expires_in is returned as a datetime instead of seconds
- token_expiry_is_time_of_expiration (bool): set to True if expires_in is returned as the time of expiration instead of the number of seconds until expiration
- refresh_request_body (Optional[Mapping[str, Any]]): The request body to send in the refresh request
- refresh_request_headers (Optional[Mapping[str, Any]]): The request headers to send in the refresh request
- grant_type: The grant_type to request for access_token. If set to refresh_token, the refresh_token parameter has to be provided
- message_repository (MessageRepository): the message repository used to emit logs on HTTP requests
If all of refresh_token_error_status_codes, refresh_token_error_key, and refresh_token_error_values are set, then HTTP errors matching those parameters will be wrapped in AirbyteTracedException.
Format of the datetime; provide it if expires_in is returned as the expiration datetime instead of the number of seconds until expiration.
Indicates that the token expiry value is returned as the date until which the token will be valid, rather than the amount of time it will remain valid. A construction sketch follows.
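A minimal, hypothetical construction sketch. The endpoint URL and config keys are illustrative assumptions; real connectors interpolate whatever their spec defines.

```python
# Hypothetical sketch: the endpoint URL and config keys are placeholders.
from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator

config = {
    "client_id": "my-client-id",
    "client_secret": "my-client-secret",
    "refresh_token": "my-refresh-token",
}

authenticator = DeclarativeOauth2Authenticator(
    token_refresh_endpoint="https://api.example.com/oauth/token",
    client_id="{{ config['client_id'] }}",
    client_secret="{{ config['client_secret'] }}",
    refresh_token="{{ config['refresh_token'] }}",
    config=config,
    parameters={},
)

# Attaching the authenticator to an HTTP requester adds the Authorization header to every
# call; the first use triggers a token refresh against token_refresh_endpoint.
# authenticator.get_auth_header()  # -> {"Authorization": "Bearer <access_token>"}
```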
178 def get_token_refresh_endpoint(self) -> Optional[str]: 179 if self._token_refresh_endpoint is not None: 180 refresh_token_endpoint: str = self._token_refresh_endpoint.eval(self.config) 181 if not refresh_token_endpoint: 182 raise ValueError( 183 "OAuthAuthenticator was unable to evaluate token_refresh_endpoint parameter" 184 ) 185 return refresh_token_endpoint 186 return None
Returns the endpoint to refresh the access token
188 def get_client_id_name(self) -> str: 189 return self._client_id_name.eval(self.config) # type: ignore # eval returns a string in this context
The client id name to authenticate
191 def get_client_id(self) -> str: 192 client_id = self._client_id.eval(self.config) if self._client_id else self._client_id 193 if not client_id: 194 raise ValueError("OAuthAuthenticator was unable to evaluate client_id parameter") 195 return client_id # type: ignore # value will be returned as a string, or an error will be raised
The client id to authenticate
197 def get_client_secret_name(self) -> str: 198 return self._client_secret_name.eval(self.config) # type: ignore # eval returns a string in this context
The client secret name to authenticate
200 def get_client_secret(self) -> str: 201 client_secret = ( 202 self._client_secret.eval(self.config) if self._client_secret else self._client_secret 203 ) 204 if not client_secret: 205 raise ValueError("OAuthAuthenticator was unable to evaluate client_secret parameter") 206 return client_secret # type: ignore # value will be returned as a string, or an error will be raised
The client secret to authenticate
208 def get_refresh_token_name(self) -> str: 209 return self._refresh_token_name.eval(self.config) # type: ignore # eval returns a string in this context
The refresh token name to authenticate
211 def get_refresh_token(self) -> Optional[str]: 212 return None if self._refresh_token is None else str(self._refresh_token.eval(self.config))
The token used to refresh the access token when it expires
217 def get_access_token_name(self) -> str: 218 return self.access_token_name.eval(self.config) # type: ignore # eval returns a string in this context
Field to extract access token from in the response
220 def get_expires_in_name(self) -> str: 221 return self.expires_in_name.eval(self.config) # type: ignore # eval returns a string in this context
Returns the expires_in field name
223 def get_grant_type_name(self) -> str: 224 return self.grant_type_name.eval(self.config) # type: ignore # eval returns a string in this context
Returns grant_type specified name for requesting access_token
226 def get_grant_type(self) -> str: 227 return self.grant_type.eval(self.config) # type: ignore # eval returns a string in this context
Returns grant_type specified for requesting access_token
229 def get_refresh_request_body(self) -> Mapping[str, Any]: 230 return self._refresh_request_body.eval(self.config)
Returns the request body to set on the refresh request
232 def get_refresh_request_headers(self) -> Mapping[str, Any]: 233 return self._refresh_request_headers.eval(self.config)
Returns the request headers to set on the refresh request
235 def get_token_expiry_date(self) -> AirbyteDateTime: 236 if not self._has_access_token_been_initialized(): 237 return AirbyteDateTime.from_datetime(datetime.min) 238 return self._token_expiry_date # type: ignore # _token_expiry_date is an AirbyteDateTime. It is never None despite what mypy thinks
Expiration date of the access token
243 def set_token_expiry_date(self, value: Union[str, int]) -> None: 244 self._token_expiry_date = self._parse_token_expiration_date(value)
Setter for access token expiration date
254 def build_refresh_request_body(self) -> Mapping[str, Any]: 255 """ 256 Returns the request body to set on the refresh request 257 258 Override to define additional parameters 259 """ 260 if self.use_profile_assertion: 261 return { 262 self.get_grant_type_name(): self.get_grant_type(), 263 self.get_assertion_name(): self.get_assertion(), 264 } 265 return super().build_refresh_request_body()
Returns the request body to set on the refresh request
Override to define additional parameters
267 @property 268 def access_token(self) -> str: 269 if self._access_token is None: 270 raise ValueError("access_token is not set") 271 return self._access_token
Returns the access token
285@dataclass 286class DeclarativeSingleUseRefreshTokenOauth2Authenticator( 287 SingleUseRefreshTokenOauth2Authenticator, DeclarativeAuthenticator 288): 289 """ 290 Declarative version of SingleUseRefreshTokenOauth2Authenticator which can be used in declarative connectors. 291 """ 292 293 def __init__(self, *args: Any, **kwargs: Any) -> None: 294 super().__init__(*args, **kwargs)
Declarative version of SingleUseRefreshTokenOauth2Authenticator which can be used in declarative connectors. A construction sketch follows the argument list below.
Arguments:
- connector_config (Mapping[str, Any]): The full connector configuration
- token_refresh_endpoint (str): Full URL to the token refresh endpoint
- scopes (List[str], optional): List of OAuth scopes to pass in the refresh token request body. Defaults to None.
- access_token_name (str, optional): Name of the access token field, used to parse the refresh token response. Defaults to "access_token".
- expires_in_name (str, optional): Name of the field that indicates when the current access token will expire, used to parse the refresh token response. Defaults to "expires_in".
- refresh_token_name (str, optional): Name of the refresh token field, used to parse the refresh token response. Defaults to "refresh_token".
- refresh_request_body (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request body. Defaults to None.
- refresh_request_headers (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request headers. Defaults to None.
- grant_type (str, optional): OAuth grant type. Defaults to "refresh_token".
- client_id (Optional[str]): The client id to authenticate. If not specified, defaults to credentials.client_id in the config object.
- client_secret (Optional[str]): The client secret to authenticate. If not specified, defaults to credentials.client_secret in the config object.
- access_token_config_path (Sequence[str]): Dpath to the access_token field in the connector configuration. Defaults to ("credentials", "access_token").
- refresh_token_config_path (Sequence[str]): Dpath to the refresh_token field in the connector configuration. Defaults to ("credentials", "refresh_token").
- token_expiry_date_config_path (Sequence[str]): Dpath to the token_expiry_date field in the connector configuration. Defaults to ("credentials", "token_expiry_date").
- token_expiry_date_format (Optional[str]): Date format of the token expiry date field (set by expires_in_name). If not specified the token expiry date is interpreted as number of seconds until expiration.
- token_expiry_is_time_of_expiration (bool): set to True if expires_in is returned as the time of expiration instead of the number of seconds until expiration
- message_repository (MessageRepository): the message repository used to emit logs on HTTP requests and control message on config update
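A minimal, hypothetical construction sketch, assuming the default credentials.* config paths described above. The endpoint and credential values are placeholders, and the exact fields required at construction time are governed by the SingleUseRefreshTokenOauth2Authenticator base class.

```python
# Hypothetical sketch: the config values and endpoint are placeholders.
from airbyte_cdk import DeclarativeSingleUseRefreshTokenOauth2Authenticator

connector_config = {
    "credentials": {
        "client_id": "my-client-id",
        "client_secret": "my-client-secret",
        "refresh_token": "my-single-use-refresh-token",
    }
}

authenticator = DeclarativeSingleUseRefreshTokenOauth2Authenticator(
    connector_config,
    token_refresh_endpoint="https://api.example.com/oauth/token",
)
# On refresh, the new refresh token is written back into the connector config and a
# control message is emitted so the platform can persist the updated config.
```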
Inherited Members
- SingleUseRefreshTokenOauth2Authenticator
- access_token
- get_refresh_token
- set_refresh_token
- get_token_expiry_date
- set_token_expiry_date
- token_has_expired
- get_new_token_expiry_date
- get_access_token
- refresh_access_token
- Oauth2Authenticator
- get_token_refresh_endpoint
- get_client_id_name
- get_client_id
- get_client_secret_name
- get_client_secret
- get_refresh_token_name
- get_access_token_name
- get_scopes
- get_expires_in_name
- get_refresh_request_body
- get_refresh_request_headers
- get_grant_type_name
- get_grant_type
- token_expiry_is_time_of_expiration
- token_expiry_date_format
32@dataclass 33class DeclarativeStream(Stream): 34 """ 35 DeclarativeStream is a Stream that delegates most of its logic to its schema_load and retriever 36 37 Attributes: 38 name (str): stream name 39 primary_key (Optional[Union[str, List[str], List[List[str]]]]): the primary key of the stream 40 schema_loader (SchemaLoader): The schema loader 41 retriever (Retriever): The retriever 42 config (Config): The user-provided configuration as specified by the source's spec 43 stream_cursor_field (Optional[Union[InterpolatedString, str]]): The cursor field 44 stream. Transformations are applied in the order in which they are defined. 45 """ 46 47 retriever: Retriever 48 config: Config 49 parameters: InitVar[Mapping[str, Any]] 50 name: str 51 primary_key: Optional[Union[str, List[str], List[List[str]]]] 52 state_migrations: List[StateMigration] = field(repr=True, default_factory=list) 53 schema_loader: Optional[SchemaLoader] = None 54 _name: str = field(init=False, repr=False, default="") 55 _primary_key: str = field(init=False, repr=False, default="") 56 stream_cursor_field: Optional[Union[InterpolatedString, str]] = None 57 58 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 59 self._stream_cursor_field = ( 60 InterpolatedString.create(self.stream_cursor_field, parameters=parameters) 61 if isinstance(self.stream_cursor_field, str) 62 else self.stream_cursor_field 63 ) 64 self._schema_loader = ( 65 self.schema_loader 66 if self.schema_loader 67 else DefaultSchemaLoader(config=self.config, parameters=parameters) 68 ) 69 70 @property # type: ignore 71 def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: 72 return self._primary_key 73 74 @primary_key.setter 75 def primary_key(self, value: str) -> None: 76 if not isinstance(value, property): 77 self._primary_key = value 78 79 @property 80 def exit_on_rate_limit(self) -> bool: 81 if isinstance(self.retriever, AsyncRetriever): 82 return self.retriever.exit_on_rate_limit 83 84 return self.retriever.requester.exit_on_rate_limit # type: ignore # abstract Retriever class has not requester attribute 85 86 @exit_on_rate_limit.setter 87 def exit_on_rate_limit(self, value: bool) -> None: 88 if isinstance(self.retriever, AsyncRetriever): 89 self.retriever.exit_on_rate_limit = value 90 else: 91 self.retriever.requester.exit_on_rate_limit = value # type: ignore[attr-defined] 92 93 @property # type: ignore 94 def name(self) -> str: 95 """ 96 :return: Stream name. By default this is the implementing class name, but it can be overridden as needed. 
97 """ 98 return self._name 99 100 @name.setter 101 def name(self, value: str) -> None: 102 if not isinstance(value, property): 103 self._name = value 104 105 @property 106 def state(self) -> MutableMapping[str, Any]: 107 return self.retriever.state # type: ignore 108 109 @state.setter 110 def state(self, value: MutableMapping[str, Any]) -> None: 111 """State setter, accept state serialized by state getter.""" 112 state: Mapping[str, Any] = value 113 if self.state_migrations: 114 for migration in self.state_migrations: 115 if migration.should_migrate(state): 116 state = migration.migrate(state) 117 self.retriever.state = state 118 119 def get_updated_state( 120 self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any] 121 ) -> MutableMapping[str, Any]: 122 return self.state 123 124 @property 125 def cursor_field(self) -> Union[str, List[str]]: 126 """ 127 Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field. 128 :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor. 129 """ 130 cursor = self._stream_cursor_field.eval(self.config) # type: ignore # _stream_cursor_field is always cast to interpolated string 131 return cursor if cursor else [] 132 133 @property 134 def is_resumable(self) -> bool: 135 # Declarative sources always implement state getter/setter, but whether it supports checkpointing is based on 136 # if the retriever has a cursor defined. 137 return self.retriever.cursor is not None if hasattr(self.retriever, "cursor") else False 138 139 def read_records( 140 self, 141 sync_mode: SyncMode, 142 cursor_field: Optional[List[str]] = None, 143 stream_slice: Optional[Mapping[str, Any]] = None, 144 stream_state: Optional[Mapping[str, Any]] = None, 145 ) -> Iterable[Mapping[str, Any]]: 146 """ 147 :param: stream_state We knowingly avoid using stream_state as we want cursors to manage their own state. 148 """ 149 if stream_slice is None or ( 150 not isinstance(stream_slice, StreamSlice) and stream_slice == {} 151 ): 152 # As the parameter is Optional, many would just call `read_records(sync_mode)` during testing without specifying the field 153 # As part of the declarative model without custom components, this should never happen as the CDK would wire up a 154 # SinglePartitionRouter that would create this StreamSlice properly 155 # As part of the declarative model with custom components, a user that would return a `None` slice would now have the default 156 # empty slice which seems to make sense. 157 stream_slice = StreamSlice(partition={}, cursor_slice={}) 158 if not isinstance(stream_slice, StreamSlice): 159 raise ValueError( 160 f"DeclarativeStream does not support stream_slices that are not StreamSlice. Got {stream_slice}" 161 ) 162 yield from self.retriever.read_records(self.get_json_schema(), stream_slice) # type: ignore # records are of the correct type 163 164 def get_json_schema(self) -> Mapping[str, Any]: # type: ignore 165 """ 166 :return: A dict of the JSON schema representing this stream. 167 168 The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property. 169 Override as needed. 
170 """ 171 return self._schema_loader.get_json_schema() 172 173 def stream_slices( 174 self, 175 *, 176 sync_mode: SyncMode, 177 cursor_field: Optional[List[str]] = None, 178 stream_state: Optional[Mapping[str, Any]] = None, 179 ) -> Iterable[Optional[StreamSlice]]: 180 """ 181 Override to define the slices for this stream. See the stream slicing section of the docs for more information. 182 183 :param sync_mode: 184 :param cursor_field: 185 :param stream_state: we knowingly avoid using stream_state as we want cursors to manage their own state 186 :return: 187 """ 188 return self.retriever.stream_slices() 189 190 @property 191 def state_checkpoint_interval(self) -> Optional[int]: 192 """ 193 We explicitly disable checkpointing here. There are a couple reasons for that and not all are documented here but: 194 * In the case where records are not ordered, the granularity of what is ordered is the slice. Therefore, we will only update the 195 cursor value once at the end of every slice. 196 * Updating the state once every record would generate issues for data feed stop conditions or semi-incremental syncs where the 197 important state is the one at the beginning of the slice 198 """ 199 return None 200 201 def get_cursor(self) -> Optional[Cursor]: 202 if self.retriever and isinstance(self.retriever, SimpleRetriever): 203 return self.retriever.cursor 204 return None 205 206 def _get_checkpoint_reader( 207 self, 208 logger: logging.Logger, 209 cursor_field: Optional[List[str]], 210 sync_mode: SyncMode, 211 stream_state: MutableMapping[str, Any], 212 ) -> CheckpointReader: 213 """ 214 This method is overridden to prevent issues with stream slice classification for incremental streams that have parent streams. 215 216 The classification logic, when used with `itertools.tee`, creates a copy of the stream slices. When `stream_slices` is called 217 the second time, the parent records generated during the classification phase are lost. This occurs because `itertools.tee` 218 only buffers the results, meaning the logic in `simple_retriever` that observes and updates the cursor isn't executed again. 219 220 By overriding this method, we ensure that the stream slices are processed correctly and parent records are not lost, 221 allowing the cursor to function as expected. 222 """ 223 mappings_or_slices = self.stream_slices( 224 cursor_field=cursor_field, 225 sync_mode=sync_mode, # todo: change this interface to no longer rely on sync_mode for behavior 226 stream_state=stream_state, 227 ) 228 229 cursor = self.get_cursor() 230 checkpoint_mode = self._checkpoint_mode 231 232 if isinstance( 233 cursor, (GlobalSubstreamCursor, PerPartitionCursor, PerPartitionWithGlobalCursor) 234 ): 235 self.has_multiple_slices = True 236 return CursorBasedCheckpointReader( 237 stream_slices=mappings_or_slices, 238 cursor=cursor, 239 read_state_from_cursor=checkpoint_mode == CheckpointMode.RESUMABLE_FULL_REFRESH, 240 ) 241 242 return super()._get_checkpoint_reader(logger, cursor_field, sync_mode, stream_state)
DeclarativeStream is a Stream that delegates most of its logic to its schema_loader and retriever
Attributes:
- name (str): stream name
- primary_key (Optional[Union[str, List[str], List[List[str]]]]): the primary key of the stream
- schema_loader (SchemaLoader): The schema loader
- retriever (Retriever): The retriever
- config (Config): The user-provided configuration as specified by the source's spec
- stream_cursor_field (Optional[Union[InterpolatedString, str]]): The cursor field of the stream
Transformations are applied in the order in which they are defined.
93 @property # type: ignore 94 def name(self) -> str: 95 """ 96 :return: Stream name. By default this is the implementing class name, but it can be overridden as needed. 97 """ 98 return self._name
Returns
Stream name. By default this is the implementing class name, but it can be overridden as needed.
70 @property # type: ignore 71 def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: 72 return self._primary_key
Returns
string if single primary key, list of strings if composite primary key, list of list of strings if composite primary key consisting of nested fields. If the stream has no primary keys, return None.
79 @property 80 def exit_on_rate_limit(self) -> bool: 81 if isinstance(self.retriever, AsyncRetriever): 82 return self.retriever.exit_on_rate_limit 83 84 return self.retriever.requester.exit_on_rate_limit # type: ignore # abstract Retriever class has not requester attribute
Exit-on-rate-limit getter; returns a bool. False means the stream will retry endlessly when rate limited.
105 @property 106 def state(self) -> MutableMapping[str, Any]: 107 return self.retriever.state # type: ignore
State setter; accepts state serialized by the state getter. Any configured state migrations are applied before the state is handed to the retriever.
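As a hedged sketch of that migration hook: the class below assumes the StateMigration interface (should_migrate/migrate) at airbyte_cdk.sources.declarative.migrations.state_migration, and the legacy last_synced / new updated_at key names are purely illustrative.

```python
# Hypothetical sketch: a state migration applied when state is assigned to the stream.
# The import path and key names are assumptions for illustration.
from typing import Any, Mapping

from airbyte_cdk.sources.declarative.migrations.state_migration import StateMigration


class LegacyToUpdatedAtState(StateMigration):
    def should_migrate(self, stream_state: Mapping[str, Any]) -> bool:
        # Only migrate state blobs that still use the legacy key.
        return "last_synced" in stream_state

    def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
        # Rewrite the legacy key into the cursor field the stream uses today.
        return {"updated_at": stream_state["last_synced"]}
```

Instances of such migrations are passed in DeclarativeStream's state_migrations list and run in order before the retriever sees the state.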
119 def get_updated_state( 120 self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any] 121 ) -> MutableMapping[str, Any]: 122 return self.state
DEPRECATED. Please use the explicit state property instead; see the IncrementalMixin docs.
Override to extract state from the latest record. Needed to implement incremental sync.
Inspects the latest record extracted from the data source and the current state object and return an updated state object.
For example: if the state object is based on created_at timestamp, and the current state is {'created_at': 10}, and the latest_record is {'name': 'octavia', 'created_at': 20 } then this method would return {'created_at': 20} to indicate state should be updated to this object.
Parameters
- current_stream_state: The stream's current state object
- latest_record: The latest record extracted from the stream
Returns
An updated state object
124 @property 125 def cursor_field(self) -> Union[str, List[str]]: 126 """ 127 Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field. 128 :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor. 129 """ 130 cursor = self._stream_cursor_field.eval(self.config) # type: ignore # _stream_cursor_field is always cast to interpolated string 131 return cursor if cursor else []
Override to return the default cursor field used by this stream, e.g. an API entity might always use created_at as the cursor field.
Returns
The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
133 @property 134 def is_resumable(self) -> bool: 135 # Declarative sources always implement state getter/setter, but whether it supports checkpointing is based on 136 # if the retriever has a cursor defined. 137 return self.retriever.cursor is not None if hasattr(self.retriever, "cursor") else False
Returns
True if this stream allows the checkpointing of sync progress and can resume from it on subsequent attempts. This differs from supports_incremental because certain kinds of streams like those supporting resumable full refresh can checkpoint progress in between attempts for improved fault tolerance. However, they will start from the beginning on the next sync job.
139 def read_records( 140 self, 141 sync_mode: SyncMode, 142 cursor_field: Optional[List[str]] = None, 143 stream_slice: Optional[Mapping[str, Any]] = None, 144 stream_state: Optional[Mapping[str, Any]] = None, 145 ) -> Iterable[Mapping[str, Any]]: 146 """ 147 :param: stream_state We knowingly avoid using stream_state as we want cursors to manage their own state. 148 """ 149 if stream_slice is None or ( 150 not isinstance(stream_slice, StreamSlice) and stream_slice == {} 151 ): 152 # As the parameter is Optional, many would just call `read_records(sync_mode)` during testing without specifying the field 153 # As part of the declarative model without custom components, this should never happen as the CDK would wire up a 154 # SinglePartitionRouter that would create this StreamSlice properly 155 # As part of the declarative model with custom components, a user that would return a `None` slice would now have the default 156 # empty slice which seems to make sense. 157 stream_slice = StreamSlice(partition={}, cursor_slice={}) 158 if not isinstance(stream_slice, StreamSlice): 159 raise ValueError( 160 f"DeclarativeStream does not support stream_slices that are not StreamSlice. Got {stream_slice}" 161 ) 162 yield from self.retriever.read_records(self.get_json_schema(), stream_slice) # type: ignore # records are of the correct type
Parameters
- stream_state: We knowingly avoid using stream_state as we want cursors to manage their own state.
164 def get_json_schema(self) -> Mapping[str, Any]: # type: ignore 165 """ 166 :return: A dict of the JSON schema representing this stream. 167 168 The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property. 169 Override as needed. 170 """ 171 return self._schema_loader.get_json_schema()
Returns
A dict of the JSON schema representing this stream.
The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property. Override as needed.
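Where the file-based default is not a good fit, a stream can override this method and return the schema inline. A minimal sketch (the schema content is illustrative):

```python
from typing import Any, Mapping


def get_json_schema(self) -> Mapping[str, Any]:
    # Return the schema inline instead of loading a JSON file named after the stream.
    return {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            "id": {"type": "integer"},
            "created_at": {"type": ["null", "string"], "format": "date-time"},
        },
    }
```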
173 def stream_slices( 174 self, 175 *, 176 sync_mode: SyncMode, 177 cursor_field: Optional[List[str]] = None, 178 stream_state: Optional[Mapping[str, Any]] = None, 179 ) -> Iterable[Optional[StreamSlice]]: 180 """ 181 Override to define the slices for this stream. See the stream slicing section of the docs for more information. 182 183 :param sync_mode: 184 :param cursor_field: 185 :param stream_state: we knowingly avoid using stream_state as we want cursors to manage their own state 186 :return: 187 """ 188 return self.retriever.stream_slices()
Override to define the slices for this stream. See the stream slicing section of the docs for more information.
Parameters
- sync_mode:
- cursor_field:
- stream_state: We knowingly avoid using stream_state as we want cursors to manage their own state.
Returns
An iterable of stream slices, as produced by the retriever.
190 @property 191 def state_checkpoint_interval(self) -> Optional[int]: 192 """ 193 We explicitly disable checkpointing here. There are a couple reasons for that and not all are documented here but: 194 * In the case where records are not ordered, the granularity of what is ordered is the slice. Therefore, we will only update the 195 cursor value once at the end of every slice. 196 * Updating the state once every record would generate issues for data feed stop conditions or semi-incremental syncs where the 197 important state is the one at the beginning of the slice 198 """ 199 return None
We explicitly disable checkpointing here. There are a couple of reasons for this, not all of which are documented here:
- In the case where records are not ordered, the granularity of what is ordered is the slice. Therefore, we only update the cursor value once at the end of every slice.
- Updating the state once per record would cause issues for data feed stop conditions or semi-incremental syncs, where the important state is the one at the beginning of the slice.
201 def get_cursor(self) -> Optional[Cursor]: 202 if self.retriever and isinstance(self.retriever, SimpleRetriever): 203 return self.retriever.cursor 204 return None
A Cursor is an interface that a stream can implement to manage how its internal state is read and updated while reading records. Historically, Python connectors had no concept of a cursor to manage state. Python streams need to define a cursor implementation and override this method to manage state through a Cursor.
15@dataclass 16class Decoder: 17 """ 18 Decoder strategy to transform a requests.Response into a Mapping[str, Any] 19 """ 20 21 @abstractmethod 22 def is_stream_response(self) -> bool: 23 """ 24 Set to True if you'd like to use stream=True option in http requester 25 """ 26 27 @abstractmethod 28 def decode(self, response: requests.Response) -> DECODER_OUTPUT_TYPE: 29 """ 30 Decodes a requests.Response into a Mapping[str, Any] or an array 31 :param response: the response to decode 32 :return: Generator of Mapping describing the response 33 """
Decoder strategy to transform a requests.Response into a Mapping[str, Any]
21 @abstractmethod 22 def is_stream_response(self) -> bool: 23 """ 24 Set to True if you'd like to use stream=True option in http requester 25 """
Return True if you'd like to use the stream=True option in the HTTP requester.
27 @abstractmethod 28 def decode(self, response: requests.Response) -> DECODER_OUTPUT_TYPE: 29 """ 30 Decodes a requests.Response into a Mapping[str, Any] or an array 31 :param response: the response to decode 32 :return: Generator of Mapping describing the response 33 """
Decodes a requests.Response into a Mapping[str, Any] or an array
Parameters
- response: the response to decode
Returns
Generator of Mapping describing the response
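As a sketch of what a concrete implementation can look like, the hypothetical NdjsonDecoder below decodes a newline-delimited JSON response; the Decoder base class and requests come from the source shown above, everything else is illustrative:

```python
import json
from dataclasses import dataclass
from typing import Any, Generator, MutableMapping

import requests

from airbyte_cdk.sources.declarative.decoders import Decoder


@dataclass
class NdjsonDecoder(Decoder):
    def is_stream_response(self) -> bool:
        # Ask the HTTP requester to use stream=True so the body can be iterated lazily.
        return True

    def decode(
        self, response: requests.Response
    ) -> Generator[MutableMapping[str, Any], None, None]:
        # Yield one mapping per non-empty NDJSON line.
        for line in response.iter_lines():
            if line:
                yield json.loads(line)
```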
33@dataclass 34class DefaultPaginator(Paginator): 35 """ 36 Default paginator to request pages of results with a fixed size until the pagination strategy no longer returns a next_page_token 37 38 Examples: 39 1. 40 * fetches up to 10 records at a time by setting the "limit" request param to 10 41 * updates the request path with "{{ response._metadata.next }}" 42 ``` 43 paginator: 44 type: "DefaultPaginator" 45 page_size_option: 46 type: RequestOption 47 inject_into: request_parameter 48 field_name: limit 49 page_token_option: 50 type: RequestPath 51 path: "location" 52 pagination_strategy: 53 type: "CursorPagination" 54 cursor_value: "{{ response._metadata.next }}" 55 page_size: 10 56 ``` 57 58 2. 59 * fetches up to 5 records at a time by setting the "page_size" header to 5 60 * increments a record counter and set the request parameter "offset" to the value of the counter 61 ``` 62 paginator: 63 type: "DefaultPaginator" 64 page_size_option: 65 type: RequestOption 66 inject_into: header 67 field_name: page_size 68 pagination_strategy: 69 type: "OffsetIncrement" 70 page_size: 5 71 page_token_option: 72 option_type: "request_parameter" 73 field_name: "offset" 74 ``` 75 76 3. 77 * fetches up to 5 records at a time by setting the "page_size" request param to 5 78 * increments a page counter and set the request parameter "page" to the value of the counter 79 ``` 80 paginator: 81 type: "DefaultPaginator" 82 page_size_option: 83 type: RequestOption 84 inject_into: request_parameter 85 field_name: page_size 86 pagination_strategy: 87 type: "PageIncrement" 88 page_size: 5 89 page_token_option: 90 type: RequestOption 91 option_type: "request_parameter" 92 field_name: "page" 93 ``` 94 Attributes: 95 page_size_option (Optional[RequestOption]): the request option to set the page size. Cannot be injected in the path. 96 page_token_option (Optional[RequestPath, RequestOption]): the request option to set the page token 97 pagination_strategy (PaginationStrategy): Strategy defining how to get the next page token 98 config (Config): connection config 99 url_base (Union[InterpolatedString, str]): endpoint's base url 100 decoder (Decoder): decoder to decode the response 101 """ 102 103 pagination_strategy: PaginationStrategy 104 config: Config 105 url_base: Union[InterpolatedString, str] 106 parameters: InitVar[Mapping[str, Any]] 107 decoder: Decoder = field( 108 default_factory=lambda: PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 109 ) 110 page_size_option: Optional[RequestOption] = None 111 page_token_option: Optional[Union[RequestPath, RequestOption]] = None 112 113 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 114 if self.page_size_option and not self.pagination_strategy.get_page_size(): 115 raise ValueError( 116 "page_size_option cannot be set if the pagination strategy does not have a page_size" 117 ) 118 if isinstance(self.url_base, str): 119 self.url_base = InterpolatedString(string=self.url_base, parameters=parameters) 120 121 if self.page_token_option and not isinstance(self.page_token_option, RequestPath): 122 _validate_component_request_option_paths( 123 self.config, 124 self.page_size_option, 125 self.page_token_option, 126 ) 127 128 def get_initial_token(self) -> Optional[Any]: 129 """ 130 Return the page token that should be used for the first request of a stream 131 132 WARNING: get_initial_token() should not be used by streams that use RFR that perform checkpointing 133 of state using page numbers. 
Because paginators are stateless 134 """ 135 return self.pagination_strategy.initial_token 136 137 def next_page_token( 138 self, 139 response: requests.Response, 140 last_page_size: int, 141 last_record: Optional[Record], 142 last_page_token_value: Optional[Any] = None, 143 ) -> Optional[Mapping[str, Any]]: 144 next_page_token = self.pagination_strategy.next_page_token( 145 response=response, 146 last_page_size=last_page_size, 147 last_record=last_record, 148 last_page_token_value=last_page_token_value, 149 ) 150 if next_page_token: 151 return {"next_page_token": next_page_token} 152 else: 153 return None 154 155 def path( 156 self, 157 next_page_token: Optional[Mapping[str, Any]], 158 stream_state: Optional[Mapping[str, Any]] = None, 159 stream_slice: Optional[StreamSlice] = None, 160 ) -> Optional[str]: 161 token = next_page_token.get("next_page_token") if next_page_token else None 162 if token and self.page_token_option and isinstance(self.page_token_option, RequestPath): 163 # make additional interpolation context 164 interpolation_context = get_interpolation_context( 165 stream_state=stream_state, 166 stream_slice=stream_slice, 167 next_page_token=next_page_token, 168 ) 169 # Replace url base to only return the path 170 return str(token).replace(self.url_base.eval(self.config, **interpolation_context), "") # type: ignore # url_base is casted to a InterpolatedString in __post_init__ 171 else: 172 return None 173 174 def get_request_params( 175 self, 176 *, 177 stream_state: Optional[StreamState] = None, 178 stream_slice: Optional[StreamSlice] = None, 179 next_page_token: Optional[Mapping[str, Any]] = None, 180 ) -> MutableMapping[str, Any]: 181 return self._get_request_options(RequestOptionType.request_parameter, next_page_token) 182 183 def get_request_headers( 184 self, 185 *, 186 stream_state: Optional[StreamState] = None, 187 stream_slice: Optional[StreamSlice] = None, 188 next_page_token: Optional[Mapping[str, Any]] = None, 189 ) -> Mapping[str, str]: 190 return self._get_request_options(RequestOptionType.header, next_page_token) 191 192 def get_request_body_data( 193 self, 194 *, 195 stream_state: Optional[StreamState] = None, 196 stream_slice: Optional[StreamSlice] = None, 197 next_page_token: Optional[Mapping[str, Any]] = None, 198 ) -> Mapping[str, Any]: 199 return self._get_request_options(RequestOptionType.body_data, next_page_token) 200 201 def get_request_body_json( 202 self, 203 *, 204 stream_state: Optional[StreamState] = None, 205 stream_slice: Optional[StreamSlice] = None, 206 next_page_token: Optional[Mapping[str, Any]] = None, 207 ) -> Mapping[str, Any]: 208 return self._get_request_options(RequestOptionType.body_json, next_page_token) 209 210 def _get_request_options( 211 self, option_type: RequestOptionType, next_page_token: Optional[Mapping[str, Any]] 212 ) -> MutableMapping[str, Any]: 213 options: MutableMapping[str, Any] = {} 214 215 token = next_page_token.get("next_page_token") if next_page_token else None 216 if ( 217 self.page_token_option 218 and token is not None 219 and isinstance(self.page_token_option, RequestOption) 220 and self.page_token_option.inject_into == option_type 221 ): 222 self.page_token_option.inject_into_request(options, token, self.config) 223 224 if ( 225 self.page_size_option 226 and self.pagination_strategy.get_page_size() 227 and self.page_size_option.inject_into == option_type 228 ): 229 page_size = self.pagination_strategy.get_page_size() 230 self.page_size_option.inject_into_request(options, page_size, self.config) 231 232 return 
options
Default paginator to request pages of results with a fixed size until the pagination strategy no longer returns a next_page_token
Examples:
1.
- fetches up to 10 records at a time by setting the "limit" request param to 10
- updates the request path with "{{ response._metadata.next }}"
paginator: type: "DefaultPaginator" page_size_option: type: RequestOption inject_into: request_parameter field_name: limit page_token_option: type: RequestPath path: "location" pagination_strategy: type: "CursorPagination" cursor_value: "{{ response._metadata.next }}" page_size: 10
2.
- fetches up to 5 records at a time by setting the "page_size" header to 5
- increments a record counter and sets the request parameter "offset" to the value of the counter
paginator: type: "DefaultPaginator" page_size_option: type: RequestOption inject_into: header field_name: page_size pagination_strategy: type: "OffsetIncrement" page_size: 5 page_token_option: option_type: "request_parameter" field_name: "offset"
3.
- fetches up to 5 records at a time by setting the "page_size" request param to 5
- increments a page counter and sets the request parameter "page" to the value of the counter
paginator: type: "DefaultPaginator" page_size_option: type: RequestOption inject_into: request_parameter field_name: page_size pagination_strategy: type: "PageIncrement" page_size: 5 page_token_option: type: RequestOption option_type: "request_parameter" field_name: "page"
Attributes:
- page_size_option (Optional[RequestOption]): the request option to set the page size. Cannot be injected in the path.
- page_token_option (Optional[Union[RequestPath, RequestOption]]): the request option to set the page token
- pagination_strategy (PaginationStrategy): Strategy defining how to get the next page token
- config (Config): connection config
- url_base (Union[InterpolatedString, str]): endpoint's base url
- decoder (Decoder): decoder to decode the response
128 def get_initial_token(self) -> Optional[Any]: 129 """ 130 Return the page token that should be used for the first request of a stream 131 132 WARNING: get_initial_token() should not be used by streams that use RFR that perform checkpointing 133 of state using page numbers. Because paginators are stateless 134 """ 135 return self.pagination_strategy.initial_token
Return the page token that should be used for the first request of a stream
WARNING: get_initial_token() should not be used by streams that use resumable full refresh (RFR) and checkpoint state using page numbers, because paginators are stateless.
137 def next_page_token( 138 self, 139 response: requests.Response, 140 last_page_size: int, 141 last_record: Optional[Record], 142 last_page_token_value: Optional[Any] = None, 143 ) -> Optional[Mapping[str, Any]]: 144 next_page_token = self.pagination_strategy.next_page_token( 145 response=response, 146 last_page_size=last_page_size, 147 last_record=last_record, 148 last_page_token_value=last_page_token_value, 149 ) 150 if next_page_token: 151 return {"next_page_token": next_page_token} 152 else: 153 return None
Returns the next_page_token to use to fetch the next page of records.
Parameters
- response: the response to process
- last_page_size: the number of records read from the response
- last_record: the last record extracted from the response
- last_page_token_value: The current value of the page token made on the last request
Returns
A mapping {"next_page_token":
} for the next page from the input response object. Returning None means there are no more pages to read in this response.
155 def path( 156 self, 157 next_page_token: Optional[Mapping[str, Any]], 158 stream_state: Optional[Mapping[str, Any]] = None, 159 stream_slice: Optional[StreamSlice] = None, 160 ) -> Optional[str]: 161 token = next_page_token.get("next_page_token") if next_page_token else None 162 if token and self.page_token_option and isinstance(self.page_token_option, RequestPath): 163 # make additional interpolation context 164 interpolation_context = get_interpolation_context( 165 stream_state=stream_state, 166 stream_slice=stream_slice, 167 next_page_token=next_page_token, 168 ) 169 # Replace url base to only return the path 170 return str(token).replace(self.url_base.eval(self.config, **interpolation_context), "") # type: ignore # url_base is casted to a InterpolatedString in __post_init__ 171 else: 172 return None
Returns the URL path to hit to fetch the next page of records
e.g: if you wanted to hit https://myapi.com/v1/some_entity then this will return "some_entity"
Returns
The path to hit to fetch the next page of records. Returning None means the path is not defined by the next_page_token.
174 def get_request_params( 175 self, 176 *, 177 stream_state: Optional[StreamState] = None, 178 stream_slice: Optional[StreamSlice] = None, 179 next_page_token: Optional[Mapping[str, Any]] = None, 180 ) -> MutableMapping[str, Any]: 181 return self._get_request_options(RequestOptionType.request_parameter, next_page_token)
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g: you might want to define query parameters for paging if next_page_token is not None.
183 def get_request_headers( 184 self, 185 *, 186 stream_state: Optional[StreamState] = None, 187 stream_slice: Optional[StreamSlice] = None, 188 next_page_token: Optional[Mapping[str, Any]] = None, 189 ) -> Mapping[str, str]: 190 return self._get_request_options(RequestOptionType.header, next_page_token)
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
192 def get_request_body_data( 193 self, 194 *, 195 stream_state: Optional[StreamState] = None, 196 stream_slice: Optional[StreamSlice] = None, 197 next_page_token: Optional[Mapping[str, Any]] = None, 198 ) -> Mapping[str, Any]: 199 return self._get_request_options(RequestOptionType.body_data, next_page_token)
Specifies how to populate the body of the request with a non-JSON payload.
If it returns a string, it will be sent as is. If it returns a dict, it will be converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
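The dict-to-form conversion mentioned above is plain urlencoding; a quick, CDK-independent illustration:

```python
from urllib.parse import urlencode

# {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
print(urlencode({"key1": "value1", "key2": "value2"}))
```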
201 def get_request_body_json( 202 self, 203 *, 204 stream_state: Optional[StreamState] = None, 205 stream_slice: Optional[StreamSlice] = None, 206 next_page_token: Optional[Mapping[str, Any]] = None, 207 ) -> Mapping[str, Any]: 208 return self._get_request_options(RequestOptionType.body_json, next_page_token)
Specifies how to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
15@dataclass 16class DefaultRequestOptionsProvider(RequestOptionsProvider): 17 """ 18 Request options provider that extracts fields from the stream_slice and injects them into the respective location in the 19 outbound request being made 20 """ 21 22 parameters: InitVar[Mapping[str, Any]] 23 24 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 25 pass 26 27 def get_request_params( 28 self, 29 *, 30 stream_state: Optional[StreamState] = None, 31 stream_slice: Optional[StreamSlice] = None, 32 next_page_token: Optional[Mapping[str, Any]] = None, 33 ) -> Mapping[str, Any]: 34 return {} 35 36 def get_request_headers( 37 self, 38 *, 39 stream_state: Optional[StreamState] = None, 40 stream_slice: Optional[StreamSlice] = None, 41 next_page_token: Optional[Mapping[str, Any]] = None, 42 ) -> Mapping[str, Any]: 43 return {} 44 45 def get_request_body_data( 46 self, 47 *, 48 stream_state: Optional[StreamState] = None, 49 stream_slice: Optional[StreamSlice] = None, 50 next_page_token: Optional[Mapping[str, Any]] = None, 51 ) -> Union[Mapping[str, Any], str]: 52 return {} 53 54 def get_request_body_json( 55 self, 56 *, 57 stream_state: Optional[StreamState] = None, 58 stream_slice: Optional[StreamSlice] = None, 59 next_page_token: Optional[Mapping[str, Any]] = None, 60 ) -> Mapping[str, Any]: 61 return {}
Request options provider that extracts fields from the stream_slice and injects them into the respective location in the outbound request being made
27 def get_request_params( 28 self, 29 *, 30 stream_state: Optional[StreamState] = None, 31 stream_slice: Optional[StreamSlice] = None, 32 next_page_token: Optional[Mapping[str, Any]] = None, 33 ) -> Mapping[str, Any]: 34 return {}
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g: you might want to define query parameters for paging if next_page_token is not None.
36 def get_request_headers( 37 self, 38 *, 39 stream_state: Optional[StreamState] = None, 40 stream_slice: Optional[StreamSlice] = None, 41 next_page_token: Optional[Mapping[str, Any]] = None, 42 ) -> Mapping[str, Any]: 43 return {}
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
45 def get_request_body_data( 46 self, 47 *, 48 stream_state: Optional[StreamState] = None, 49 stream_slice: Optional[StreamSlice] = None, 50 next_page_token: Optional[Mapping[str, Any]] = None, 51 ) -> Union[Mapping[str, Any], str]: 52 return {}
Specifies how to populate the body of the request with a non-JSON payload.
If it returns a string, it will be sent as is. If it returns a dict, it will be converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
54 def get_request_body_json( 55 self, 56 *, 57 stream_state: Optional[StreamState] = None, 58 stream_slice: Optional[StreamSlice] = None, 59 next_page_token: Optional[Mapping[str, Any]] = None, 60 ) -> Mapping[str, Any]: 61 return {}
Specifies how to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
18@dataclass 19class DpathExtractor(RecordExtractor): 20 """ 21 Record extractor that searches a decoded response over a path defined as an array of fields. 22 23 If the field path points to an array, that array is returned. 24 If the field path points to an object, that object is returned wrapped as an array. 25 If the field path points to an empty object, an empty array is returned. 26 If the field path points to a non-existing path, an empty array is returned. 27 28 Examples of instantiating this transform: 29 ``` 30 extractor: 31 type: DpathExtractor 32 field_path: 33 - "root" 34 - "data" 35 ``` 36 37 ``` 38 extractor: 39 type: DpathExtractor 40 field_path: 41 - "root" 42 - "{{ parameters['field'] }}" 43 ``` 44 45 ``` 46 extractor: 47 type: DpathExtractor 48 field_path: [] 49 ``` 50 51 Attributes: 52 field_path (Union[InterpolatedString, str]): Path to the field that should be extracted 53 config (Config): The user-provided configuration as specified by the source's spec 54 decoder (Decoder): The decoder responsible to transfom the response in a Mapping 55 """ 56 57 field_path: List[Union[InterpolatedString, str]] 58 config: Config 59 parameters: InitVar[Mapping[str, Any]] 60 decoder: Decoder = field(default_factory=lambda: JsonDecoder(parameters={})) 61 62 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 63 self._field_path = [ 64 InterpolatedString.create(path, parameters=parameters) for path in self.field_path 65 ] 66 for path_index in range(len(self.field_path)): 67 if isinstance(self.field_path[path_index], str): 68 self._field_path[path_index] = InterpolatedString.create( 69 self.field_path[path_index], parameters=parameters 70 ) 71 72 def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]: 73 for body in self.decoder.decode(response): 74 if len(self._field_path) == 0: 75 extracted = body 76 else: 77 path = [path.eval(self.config) for path in self._field_path] 78 if "*" in path: 79 extracted = dpath.values(body, path) 80 else: 81 extracted = dpath.get(body, path, default=[]) # type: ignore # extracted will be a MutableMapping, given input data structure 82 if isinstance(extracted, list): 83 yield from extracted 84 elif extracted: 85 yield extracted 86 else: 87 yield from []
Record extractor that searches a decoded response over a path defined as an array of fields.
If the field path points to an array, that array is returned. If the field path points to an object, that object is returned wrapped as an array. If the field path points to an empty object, an empty array is returned. If the field path points to a non-existing path, an empty array is returned.
Examples of instantiating this transform:
extractor:
type: DpathExtractor
field_path:
- "root"
- "data"
extractor:
type: DpathExtractor
field_path:
- "root"
- "{{ parameters['field'] }}"
extractor:
type: DpathExtractor
field_path: []
Attributes:
- field_path (List[Union[InterpolatedString, str]]): Path to the field that should be extracted
- config (Config): The user-provided configuration as specified by the source's spec
- decoder (Decoder): The decoder responsible for transforming the response into a Mapping
72 def extract_records(self, response: requests.Response) -> Iterable[MutableMapping[Any, Any]]: 73 for body in self.decoder.decode(response): 74 if len(self._field_path) == 0: 75 extracted = body 76 else: 77 path = [path.eval(self.config) for path in self._field_path] 78 if "*" in path: 79 extracted = dpath.values(body, path) 80 else: 81 extracted = dpath.get(body, path, default=[]) # type: ignore # extracted will be a MutableMapping, given input data structure 82 if isinstance(extracted, list): 83 yield from extracted 84 elif extracted: 85 yield extracted 86 else: 87 yield from []
Selects records from the response
Parameters
- response: The response to extract the records from
Returns
List of Records extracted from the response
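To make the field path semantics concrete, here is a small, CDK-independent sketch that mimics how a path selects records from a decoded body (the real extractor uses the dpath library and interpolated path segments):

```python
from typing import Any, List, Mapping


def extract(body: Mapping[str, Any], field_path: List[str]) -> List[Any]:
    # Walk the path; a missing key behaves like an empty object.
    node: Any = body
    for key in field_path:
        node = node.get(key, {}) if isinstance(node, dict) else {}
    if isinstance(node, list):
        return node  # the path points to an array: return it as-is
    return [node] if node else []  # wrap objects; empty or missing => []


print(extract({"root": {"data": [{"id": 1}, {"id": 2}]}}, ["root", "data"]))
# [{'id': 1}, {'id': 2}]
```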
19class HttpMethod(Enum): 20 """ 21 Http Method to use when submitting an outgoing HTTP request 22 """ 23 24 DELETE = "DELETE" 25 GET = "GET" 26 PATCH = "PATCH" 27 POST = "POST"
HTTP method to use when submitting an outgoing HTTP request
38@dataclass 39class HttpRequester(Requester): 40 """ 41 Default implementation of a Requester 42 43 Attributes: 44 name (str): Name of the stream. Only used for request/response caching 45 url_base (Union[InterpolatedString, str]): Base url to send requests to 46 path (Union[InterpolatedString, str]): Path to send requests to 47 http_method (Union[str, HttpMethod]): HTTP method to use when sending requests 48 request_options_provider (Optional[InterpolatedRequestOptionsProvider]): request option provider defining the options to set on outgoing requests 49 authenticator (DeclarativeAuthenticator): Authenticator defining how to authenticate to the source 50 error_handler (Optional[ErrorHandler]): Error handler defining how to detect and handle errors 51 backoff_strategies (Optional[List[BackoffStrategy]]): List of backoff strategies to use when retrying requests 52 config (Config): The user-provided configuration as specified by the source's spec 53 use_cache (bool): Indicates that data should be cached for this stream 54 """ 55 56 name: str 57 url_base: Union[InterpolatedString, str] 58 config: Config 59 parameters: InitVar[Mapping[str, Any]] 60 61 path: Optional[Union[InterpolatedString, str]] = None 62 authenticator: Optional[DeclarativeAuthenticator] = None 63 http_method: Union[str, HttpMethod] = HttpMethod.GET 64 request_options_provider: Optional[InterpolatedRequestOptionsProvider] = None 65 error_handler: Optional[ErrorHandler] = None 66 api_budget: Optional[APIBudget] = None 67 disable_retries: bool = False 68 message_repository: MessageRepository = NoopMessageRepository() 69 use_cache: bool = False 70 _exit_on_rate_limit: bool = False 71 stream_response: bool = False 72 decoder: Decoder = field(default_factory=lambda: JsonDecoder(parameters={})) 73 74 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 75 self._url_base = InterpolatedString.create(self.url_base, parameters=parameters) 76 self._path = InterpolatedString.create( 77 self.path if self.path else EmptyString, parameters=parameters 78 ) 79 if self.request_options_provider is None: 80 self._request_options_provider = InterpolatedRequestOptionsProvider( 81 config=self.config, parameters=parameters 82 ) 83 elif isinstance(self.request_options_provider, dict): 84 self._request_options_provider = InterpolatedRequestOptionsProvider( 85 config=self.config, **self.request_options_provider 86 ) 87 else: 88 self._request_options_provider = self.request_options_provider 89 self._authenticator = self.authenticator or NoAuth(parameters=parameters) 90 self._http_method = ( 91 HttpMethod[self.http_method] if isinstance(self.http_method, str) else self.http_method 92 ) 93 self.error_handler = self.error_handler 94 self._parameters = parameters 95 96 if self.error_handler is not None and hasattr(self.error_handler, "backoff_strategies"): 97 backoff_strategies = self.error_handler.backoff_strategies # type: ignore 98 else: 99 backoff_strategies = None 100 101 self._http_client = HttpClient( 102 name=self.name, 103 logger=self.logger, 104 error_handler=self.error_handler, 105 api_budget=self.api_budget, 106 authenticator=self._authenticator, 107 use_cache=self.use_cache, 108 backoff_strategy=backoff_strategies, 109 disable_retries=self.disable_retries, 110 message_repository=self.message_repository, 111 ) 112 113 @property 114 def exit_on_rate_limit(self) -> bool: 115 return self._exit_on_rate_limit 116 117 @exit_on_rate_limit.setter 118 def exit_on_rate_limit(self, value: bool) -> None: 119 self._exit_on_rate_limit = value 120 
121 def get_authenticator(self) -> DeclarativeAuthenticator: 122 return self._authenticator 123 124 def get_url_base( 125 self, 126 *, 127 stream_state: Optional[StreamState] = None, 128 stream_slice: Optional[StreamSlice] = None, 129 next_page_token: Optional[Mapping[str, Any]] = None, 130 ) -> str: 131 interpolation_context = get_interpolation_context( 132 stream_state=stream_state, 133 stream_slice=stream_slice, 134 next_page_token=next_page_token, 135 ) 136 return str(self._url_base.eval(self.config, **interpolation_context)) 137 138 def get_path( 139 self, 140 *, 141 stream_state: Optional[StreamState] = None, 142 stream_slice: Optional[StreamSlice] = None, 143 next_page_token: Optional[Mapping[str, Any]] = None, 144 ) -> str: 145 interpolation_context = get_interpolation_context( 146 stream_state=stream_state, 147 stream_slice=stream_slice, 148 next_page_token=next_page_token, 149 ) 150 path = str(self._path.eval(self.config, **interpolation_context)) 151 return path.lstrip("/") 152 153 def get_method(self) -> HttpMethod: 154 return self._http_method 155 156 def get_request_params( 157 self, 158 *, 159 stream_state: Optional[StreamState] = None, 160 stream_slice: Optional[StreamSlice] = None, 161 next_page_token: Optional[Mapping[str, Any]] = None, 162 ) -> MutableMapping[str, Any]: 163 return self._request_options_provider.get_request_params( 164 stream_state=stream_state, 165 stream_slice=stream_slice, 166 next_page_token=next_page_token, 167 ) 168 169 def get_request_headers( 170 self, 171 *, 172 stream_state: Optional[StreamState] = None, 173 stream_slice: Optional[StreamSlice] = None, 174 next_page_token: Optional[Mapping[str, Any]] = None, 175 ) -> Mapping[str, Any]: 176 return self._request_options_provider.get_request_headers( 177 stream_state=stream_state, 178 stream_slice=stream_slice, 179 next_page_token=next_page_token, 180 ) 181 182 # fixing request options provider types has a lot of dependencies 183 def get_request_body_data( # type: ignore 184 self, 185 *, 186 stream_state: Optional[StreamState] = None, 187 stream_slice: Optional[StreamSlice] = None, 188 next_page_token: Optional[Mapping[str, Any]] = None, 189 ) -> Union[Mapping[str, Any], str]: 190 return ( 191 self._request_options_provider.get_request_body_data( 192 stream_state=stream_state, 193 stream_slice=stream_slice, 194 next_page_token=next_page_token, 195 ) 196 or {} 197 ) 198 199 # fixing request options provider types has a lot of dependencies 200 def get_request_body_json( # type: ignore 201 self, 202 *, 203 stream_state: Optional[StreamState] = None, 204 stream_slice: Optional[StreamSlice] = None, 205 next_page_token: Optional[Mapping[str, Any]] = None, 206 ) -> Optional[Mapping[str, Any]]: 207 return self._request_options_provider.get_request_body_json( 208 stream_state=stream_state, 209 stream_slice=stream_slice, 210 next_page_token=next_page_token, 211 ) 212 213 @property 214 def logger(self) -> logging.Logger: 215 return logging.getLogger(f"airbyte.HttpRequester.{self.name}") 216 217 def _get_request_options( 218 self, 219 stream_state: Optional[StreamState], 220 stream_slice: Optional[StreamSlice], 221 next_page_token: Optional[Mapping[str, Any]], 222 requester_method: Callable[..., Optional[Union[Mapping[str, Any], str]]], 223 auth_options_method: Callable[..., Optional[Union[Mapping[str, Any], str]]], 224 extra_options: Optional[Union[Mapping[str, Any], str]] = None, 225 ) -> Union[Mapping[str, Any], str]: 226 """ 227 Get the request_option from the requester, the authenticator and extra_options 
passed in. 228 Raise a ValueError if there's a key collision 229 Returned merged mapping otherwise 230 """ 231 232 is_body_json = requester_method.__name__ == "get_request_body_json" 233 234 return combine_mappings( 235 [ 236 requester_method( 237 stream_state=stream_state, 238 stream_slice=stream_slice, 239 next_page_token=next_page_token, 240 ), 241 auth_options_method(), 242 extra_options, 243 ], 244 allow_same_value_merge=is_body_json, 245 ) 246 247 def _request_headers( 248 self, 249 stream_state: Optional[StreamState] = None, 250 stream_slice: Optional[StreamSlice] = None, 251 next_page_token: Optional[Mapping[str, Any]] = None, 252 extra_headers: Optional[Mapping[str, Any]] = None, 253 ) -> Mapping[str, Any]: 254 """ 255 Specifies request headers. 256 Authentication headers will overwrite any overlapping headers returned from this method. 257 """ 258 headers = self._get_request_options( 259 stream_state, 260 stream_slice, 261 next_page_token, 262 self.get_request_headers, 263 self.get_authenticator().get_auth_header, 264 extra_headers, 265 ) 266 if isinstance(headers, str): 267 raise ValueError("Request headers cannot be a string") 268 return {str(k): str(v) for k, v in headers.items()} 269 270 def _request_params( 271 self, 272 stream_state: Optional[StreamState], 273 stream_slice: Optional[StreamSlice], 274 next_page_token: Optional[Mapping[str, Any]], 275 extra_params: Optional[Mapping[str, Any]] = None, 276 ) -> Mapping[str, Any]: 277 """ 278 Specifies the query parameters that should be set on an outgoing HTTP request given the inputs. 279 280 E.g: you might want to define query parameters for paging if next_page_token is not None. 281 """ 282 options = self._get_request_options( 283 stream_state, 284 stream_slice, 285 next_page_token, 286 self.get_request_params, 287 self.get_authenticator().get_request_params, 288 extra_params, 289 ) 290 if isinstance(options, str): 291 raise ValueError("Request params cannot be a string") 292 293 for k, v in options.items(): 294 if isinstance(v, (dict,)): 295 raise ValueError( 296 f"Invalid value for `{k}` parameter. The values of request params cannot be an object." 297 ) 298 299 return options 300 301 def _request_body_data( 302 self, 303 stream_state: Optional[StreamState], 304 stream_slice: Optional[StreamSlice], 305 next_page_token: Optional[Mapping[str, Any]], 306 extra_body_data: Optional[Union[Mapping[str, Any], str]] = None, 307 ) -> Optional[Union[Mapping[str, Any], str]]: 308 """ 309 Specifies how to populate the body of the request with a non-JSON payload. 310 311 If returns a ready text that it will be sent as is. 312 If returns a dict that it will be converted to a urlencoded form. 313 E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" 314 315 At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 316 """ 317 # Warning: use self.state instead of the stream_state passed as argument! 318 return self._get_request_options( 319 stream_state, 320 stream_slice, 321 next_page_token, 322 self.get_request_body_data, 323 self.get_authenticator().get_request_body_data, 324 extra_body_data, 325 ) 326 327 def _request_body_json( 328 self, 329 stream_state: Optional[StreamState], 330 stream_slice: Optional[StreamSlice], 331 next_page_token: Optional[Mapping[str, Any]], 332 extra_body_json: Optional[Mapping[str, Any]] = None, 333 ) -> Optional[Mapping[str, Any]]: 334 """ 335 Specifies how to populate the body of the request with a JSON payload. 
336 337 At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 338 """ 339 # Warning: use self.state instead of the stream_state passed as argument! 340 options = self._get_request_options( 341 stream_state, 342 stream_slice, 343 next_page_token, 344 self.get_request_body_json, 345 self.get_authenticator().get_request_body_json, 346 extra_body_json, 347 ) 348 if isinstance(options, str): 349 raise ValueError("Request body json cannot be a string") 350 return options 351 352 @classmethod 353 def _join_url(cls, url_base: str, path: str) -> str: 354 """ 355 Joins a base URL with a given path and returns the resulting URL with any trailing slash removed. 356 357 This method ensures that there are no duplicate slashes when concatenating the base URL and the path, 358 which is useful when the full URL is provided from an interpolation context. 359 360 Args: 361 url_base (str): The base URL to which the path will be appended. 362 path (str): The path to join with the base URL. 363 364 Returns: 365 str: The resulting joined URL. 366 367 Note: 368 Related issue: https://github.com/airbytehq/airbyte-internal-issues/issues/11869 369 - If the path is an empty string or None, the method returns the base URL with any trailing slash removed. 370 371 Example: 372 1) _join_url("https://example.com/api/", "endpoint") >> 'https://example.com/api/endpoint' 373 2) _join_url("https://example.com/api", "/endpoint") >> 'https://example.com/api/endpoint' 374 3) _join_url("https://example.com/api/", "") >> 'https://example.com/api/' 375 4) _join_url("https://example.com/api", None) >> 'https://example.com/api' 376 """ 377 378 # return a full-url if provided directly from interpolation context 379 if path == EmptyString or path is None: 380 return url_base 381 else: 382 # since we didn't provide a full-url, the url_base might not have a trailing slash 383 # so we join the url_base and path correctly 384 if not url_base.endswith("/"): 385 url_base += "/" 386 387 return urljoin(url_base, path) 388 389 def send_request( 390 self, 391 stream_state: Optional[StreamState] = None, 392 stream_slice: Optional[StreamSlice] = None, 393 next_page_token: Optional[Mapping[str, Any]] = None, 394 path: Optional[str] = None, 395 request_headers: Optional[Mapping[str, Any]] = None, 396 request_params: Optional[Mapping[str, Any]] = None, 397 request_body_data: Optional[Union[Mapping[str, Any], str]] = None, 398 request_body_json: Optional[Mapping[str, Any]] = None, 399 log_formatter: Optional[Callable[[requests.Response], Any]] = None, 400 ) -> Optional[requests.Response]: 401 request, response = self._http_client.send_request( 402 http_method=self.get_method().value, 403 url=self._join_url( 404 self.get_url_base( 405 stream_state=stream_state, 406 stream_slice=stream_slice, 407 next_page_token=next_page_token, 408 ), 409 path 410 or self.get_path( 411 stream_state=stream_state, 412 stream_slice=stream_slice, 413 next_page_token=next_page_token, 414 ), 415 ), 416 request_kwargs={"stream": self.stream_response}, 417 headers=self._request_headers( 418 stream_state, stream_slice, next_page_token, request_headers 419 ), 420 params=self._request_params( 421 stream_state, stream_slice, next_page_token, request_params 422 ), 423 json=self._request_body_json( 424 stream_state, stream_slice, next_page_token, request_body_json 425 ), 426 data=self._request_body_data( 427 stream_state, stream_slice, next_page_token, request_body_data 428 ), 429 dedupe_query_params=True, 430 
log_formatter=log_formatter, 431 exit_on_rate_limit=self._exit_on_rate_limit, 432 ) 433 434 return response
Default implementation of a Requester
Attributes:
- name (str): Name of the stream. Only used for request/response caching
- url_base (Union[InterpolatedString, str]): Base url to send requests to
- path (Union[InterpolatedString, str]): Path to send requests to
- http_method (Union[str, HttpMethod]): HTTP method to use when sending requests
- request_options_provider (Optional[InterpolatedRequestOptionsProvider]): request option provider defining the options to set on outgoing requests
- authenticator (DeclarativeAuthenticator): Authenticator defining how to authenticate to the source
- error_handler (Optional[ErrorHandler]): Error handler defining how to detect and handle errors
- backoff_strategies (Optional[List[BackoffStrategy]]): List of backoff strategies to use when retrying requests
- config (Config): The user-provided configuration as specified by the source's spec
- use_cache (bool): Indicates that data should be cached for this stream
124 def get_url_base( 125 self, 126 *, 127 stream_state: Optional[StreamState] = None, 128 stream_slice: Optional[StreamSlice] = None, 129 next_page_token: Optional[Mapping[str, Any]] = None, 130 ) -> str: 131 interpolation_context = get_interpolation_context( 132 stream_state=stream_state, 133 stream_slice=stream_slice, 134 next_page_token=next_page_token, 135 ) 136 return str(self._url_base.eval(self.config, **interpolation_context))
Returns
URL base for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/"
138 def get_path( 139 self, 140 *, 141 stream_state: Optional[StreamState] = None, 142 stream_slice: Optional[StreamSlice] = None, 143 next_page_token: Optional[Mapping[str, Any]] = None, 144 ) -> str: 145 interpolation_context = get_interpolation_context( 146 stream_state=stream_state, 147 stream_slice=stream_slice, 148 next_page_token=next_page_token, 149 ) 150 path = str(self._path.eval(self.config, **interpolation_context)) 151 return path.lstrip("/")
Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity"
156 def get_request_params( 157 self, 158 *, 159 stream_state: Optional[StreamState] = None, 160 stream_slice: Optional[StreamSlice] = None, 161 next_page_token: Optional[Mapping[str, Any]] = None, 162 ) -> MutableMapping[str, Any]: 163 return self._request_options_provider.get_request_params( 164 stream_state=stream_state, 165 stream_slice=stream_slice, 166 next_page_token=next_page_token, 167 )
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g: you might want to define query parameters for paging if next_page_token is not None.
169 def get_request_headers( 170 self, 171 *, 172 stream_state: Optional[StreamState] = None, 173 stream_slice: Optional[StreamSlice] = None, 174 next_page_token: Optional[Mapping[str, Any]] = None, 175 ) -> Mapping[str, Any]: 176 return self._request_options_provider.get_request_headers( 177 stream_state=stream_state, 178 stream_slice=stream_slice, 179 next_page_token=next_page_token, 180 )
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
183 def get_request_body_data( # type: ignore 184 self, 185 *, 186 stream_state: Optional[StreamState] = None, 187 stream_slice: Optional[StreamSlice] = None, 188 next_page_token: Optional[Mapping[str, Any]] = None, 189 ) -> Union[Mapping[str, Any], str]: 190 return ( 191 self._request_options_provider.get_request_body_data( 192 stream_state=stream_state, 193 stream_slice=stream_slice, 194 next_page_token=next_page_token, 195 ) 196 or {} 197 )
Specifies how to populate the body of the request with a non-JSON payload.
If it returns a string, it will be sent as is. If it returns a dict, it will be converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
200 def get_request_body_json( # type: ignore 201 self, 202 *, 203 stream_state: Optional[StreamState] = None, 204 stream_slice: Optional[StreamSlice] = None, 205 next_page_token: Optional[Mapping[str, Any]] = None, 206 ) -> Optional[Mapping[str, Any]]: 207 return self._request_options_provider.get_request_body_json( 208 stream_state=stream_state, 209 stream_slice=stream_slice, 210 next_page_token=next_page_token, 211 )
Specifies how to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
389 def send_request( 390 self, 391 stream_state: Optional[StreamState] = None, 392 stream_slice: Optional[StreamSlice] = None, 393 next_page_token: Optional[Mapping[str, Any]] = None, 394 path: Optional[str] = None, 395 request_headers: Optional[Mapping[str, Any]] = None, 396 request_params: Optional[Mapping[str, Any]] = None, 397 request_body_data: Optional[Union[Mapping[str, Any], str]] = None, 398 request_body_json: Optional[Mapping[str, Any]] = None, 399 log_formatter: Optional[Callable[[requests.Response], Any]] = None, 400 ) -> Optional[requests.Response]: 401 request, response = self._http_client.send_request( 402 http_method=self.get_method().value, 403 url=self._join_url( 404 self.get_url_base( 405 stream_state=stream_state, 406 stream_slice=stream_slice, 407 next_page_token=next_page_token, 408 ), 409 path 410 or self.get_path( 411 stream_state=stream_state, 412 stream_slice=stream_slice, 413 next_page_token=next_page_token, 414 ), 415 ), 416 request_kwargs={"stream": self.stream_response}, 417 headers=self._request_headers( 418 stream_state, stream_slice, next_page_token, request_headers 419 ), 420 params=self._request_params( 421 stream_state, stream_slice, next_page_token, request_params 422 ), 423 json=self._request_body_json( 424 stream_state, stream_slice, next_page_token, request_body_json 425 ), 426 data=self._request_body_data( 427 stream_state, stream_slice, next_page_token, request_body_data 428 ), 429 dedupe_query_params=True, 430 log_formatter=log_formatter, 431 exit_on_rate_limit=self._exit_on_rate_limit, 432 ) 433 434 return response
Sends a request and returns the response. Might return no response if the error handler chooses to ignore the response or throw an exception in case of an error. If path is set, the path configured on the requester itself is ignored. If header, params and body are set, they are merged with the ones configured on the requester itself.
If a log formatter is provided, it's used to log the performed request and response. If it's not provided, no logging is performed.
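A minimal sketch of wiring up a requester and sending a request. It assumes the import path airbyte_cdk.sources.declarative.requesters.http_requester and a placeholder API; the authenticator, error handler and decoder fall back to their defaults:

```python
from airbyte_cdk.sources.declarative.requesters.http_requester import HttpRequester

config = {"api_base": "https://api.example.com/v1/"}  # placeholder config

requester = HttpRequester(
    name="users",                         # only used for request/response caching
    url_base="{{ config['api_base'] }}",  # interpolated against the config
    path="users",
    http_method="GET",
    config=config,
    parameters={},
)

# Headers, params and body passed here are merged with the ones configured on the requester.
response = requester.send_request(request_params={"page_size": 50})
if response is not None:
    print(response.status_code)
```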
29@dataclass 30class InterpolatedBoolean: 31 f""" 32 Wrapper around a string to be evaluated to a boolean value. 33 The string will be evaluated as False if it interpolates to a value in {FALSE_VALUES} 34 35 Attributes: 36 condition (str): The string representing the condition to evaluate to a boolean 37 """ 38 condition: str 39 parameters: InitVar[Mapping[str, Any]] 40 41 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 42 self._default = "False" 43 self._interpolation = JinjaInterpolation() 44 self._parameters = parameters 45 46 def eval(self, config: Config, **additional_parameters: Any) -> bool: 47 """ 48 Interpolates the predicate condition string using the config and other optional arguments passed as parameter. 49 50 :param config: The user-provided configuration as specified by the source's spec 51 :param additional_parameters: Optional parameters used for interpolation 52 :return: The interpolated string 53 """ 54 if isinstance(self.condition, bool): 55 return self.condition 56 else: 57 evaluated = self._interpolation.eval( 58 self.condition, 59 config, 60 self._default, 61 parameters=self._parameters, 62 **additional_parameters, 63 ) 64 if evaluated in FALSE_VALUES: 65 return False 66 # The presence of a value is generally regarded as truthy, so we treat it as such 67 return True
46 def eval(self, config: Config, **additional_parameters: Any) -> bool: 47 """ 48 Interpolates the predicate condition string using the config and other optional arguments passed as parameter. 49 50 :param config: The user-provided configuration as specified by the source's spec 51 :param additional_parameters: Optional parameters used for interpolation 52 :return: The interpolated string 53 """ 54 if isinstance(self.condition, bool): 55 return self.condition 56 else: 57 evaluated = self._interpolation.eval( 58 self.condition, 59 config, 60 self._default, 61 parameters=self._parameters, 62 **additional_parameters, 63 ) 64 if evaluated in FALSE_VALUES: 65 return False 66 # The presence of a value is generally regarded as truthy, so we treat it as such 67 return True
Interpolates the predicate condition string using the config and other optional arguments passed as parameter.
Parameters
- config: The user-provided configuration as specified by the source's spec
- additional_parameters: Optional parameters used for interpolation
Returns
The evaluated boolean value.
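A small usage sketch, assuming the class lives at airbyte_cdk.sources.declarative.interpolation.interpolated_boolean (the config keys are illustrative):

```python
from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import (
    InterpolatedBoolean,
)

condition = InterpolatedBoolean(
    condition="{{ config['start_date'] is defined }}",
    parameters={},
)

print(condition.eval({"start_date": "2021-01-01"}))  # True
print(condition.eval({}))  # False: the template interpolates to a value in FALSE_VALUES
```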
14@dataclass 15class InterpolatedRequestInputProvider: 16 """ 17 Helper class that generically performs string interpolation on the provided dictionary or string input 18 """ 19 20 parameters: InitVar[Mapping[str, Any]] 21 request_inputs: Optional[Union[str, Mapping[str, str]]] = field(default=None) 22 config: Config = field(default_factory=dict) 23 _interpolator: Optional[Union[InterpolatedString, InterpolatedMapping]] = field( 24 init=False, repr=False, default=None 25 ) 26 _request_inputs: Optional[Union[str, Mapping[str, str]]] = field( 27 init=False, repr=False, default=None 28 ) 29 30 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 31 self._request_inputs = self.request_inputs or {} 32 if isinstance(self._request_inputs, str): 33 self._interpolator = InterpolatedString( 34 self._request_inputs, default="", parameters=parameters 35 ) 36 else: 37 self._interpolator = InterpolatedMapping(self._request_inputs, parameters=parameters) 38 39 def eval_request_inputs( 40 self, 41 stream_slice: Optional[StreamSlice] = None, 42 next_page_token: Optional[Mapping[str, Any]] = None, 43 valid_key_types: Optional[Tuple[Type[Any]]] = None, 44 valid_value_types: Optional[Tuple[Type[Any], ...]] = None, 45 ) -> Mapping[str, Any]: 46 """ 47 Returns the request inputs to set on an outgoing HTTP request 48 49 :param stream_slice: The stream slice 50 :param next_page_token: The pagination token 51 :param valid_key_types: A tuple of types that the interpolator should allow 52 :param valid_value_types: A tuple of types that the interpolator should allow 53 :return: The request inputs to set on an outgoing HTTP request 54 """ 55 kwargs = { 56 "stream_slice": stream_slice, 57 "next_page_token": next_page_token, 58 } 59 interpolated_value = self._interpolator.eval( # type: ignore # self._interpolator is always initialized with a value and will not be None 60 self.config, 61 valid_key_types=valid_key_types, 62 valid_value_types=valid_value_types, 63 **kwargs, 64 ) 65 66 if isinstance(interpolated_value, dict): 67 non_null_tokens = {k: v for k, v in interpolated_value.items() if v is not None} 68 return non_null_tokens 69 return interpolated_value # type: ignore[no-any-return]
Helper class that generically performs string interpolation on the provided dictionary or string input
39 def eval_request_inputs( 40 self, 41 stream_slice: Optional[StreamSlice] = None, 42 next_page_token: Optional[Mapping[str, Any]] = None, 43 valid_key_types: Optional[Tuple[Type[Any]]] = None, 44 valid_value_types: Optional[Tuple[Type[Any], ...]] = None, 45 ) -> Mapping[str, Any]: 46 """ 47 Returns the request inputs to set on an outgoing HTTP request 48 49 :param stream_slice: The stream slice 50 :param next_page_token: The pagination token 51 :param valid_key_types: A tuple of types that the interpolator should allow 52 :param valid_value_types: A tuple of types that the interpolator should allow 53 :return: The request inputs to set on an outgoing HTTP request 54 """ 55 kwargs = { 56 "stream_slice": stream_slice, 57 "next_page_token": next_page_token, 58 } 59 interpolated_value = self._interpolator.eval( # type: ignore # self._interpolator is always initialized with a value and will not be None 60 self.config, 61 valid_key_types=valid_key_types, 62 valid_value_types=valid_value_types, 63 **kwargs, 64 ) 65 66 if isinstance(interpolated_value, dict): 67 non_null_tokens = {k: v for k, v in interpolated_value.items() if v is not None} 68 return non_null_tokens 69 return interpolated_value # type: ignore[no-any-return]
Returns the request inputs to set on an outgoing HTTP request
Parameters
- stream_slice: The stream slice
- next_page_token: The pagination token
- valid_key_types: A tuple of types that the interpolator should allow
- valid_value_types: A tuple of types that the interpolator should allow
Returns
The request inputs to set on an outgoing HTTP request
13@dataclass 14class InterpolatedString: 15 """ 16 Wrapper around a raw string to be interpolated with the Jinja2 templating engine 17 18 Attributes: 19 string (str): The string to evalute 20 default (Optional[str]): The default value to return if the evaluation returns an empty string 21 parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation 22 """ 23 24 string: str 25 parameters: InitVar[Mapping[str, Any]] 26 default: Optional[str] = None 27 28 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 29 self.default = self.default or self.string 30 self._interpolation = JinjaInterpolation() 31 self._parameters = parameters 32 # indicates whether passed string is just a plain string, not Jinja template 33 # This allows for optimization, but we do not know it yet at this stage 34 self._is_plain_string = None 35 36 def eval(self, config: Config, **kwargs: Any) -> Any: 37 """ 38 Interpolates the input string using the config and other optional arguments passed as parameter. 39 40 :param config: The user-provided configuration as specified by the source's spec 41 :param kwargs: Optional parameters used for interpolation 42 :return: The interpolated string 43 """ 44 if self._is_plain_string: 45 return self.string 46 if self._is_plain_string is None: 47 # Let's check whether output from evaluation is the same as input. 48 # This indicates occurrence of a plain string, not a template and we can skip Jinja in subsequent runs. 49 evaluated = self._interpolation.eval( 50 self.string, config, self.default, parameters=self._parameters, **kwargs 51 ) 52 self._is_plain_string = self.string == evaluated 53 return evaluated 54 return self._interpolation.eval( 55 self.string, config, self.default, parameters=self._parameters, **kwargs 56 ) 57 58 def __eq__(self, other: Any) -> bool: 59 if not isinstance(other, InterpolatedString): 60 return False 61 return self.string == other.string and self.default == other.default 62 63 @classmethod 64 def create( 65 cls, 66 string_or_interpolated: Union["InterpolatedString", str], 67 *, 68 parameters: Mapping[str, Any], 69 ) -> "InterpolatedString": 70 """ 71 Helper function to obtain an InterpolatedString from either a raw string or an InterpolatedString. 72 73 :param string_or_interpolated: either a raw string or an InterpolatedString. 74 :param parameters: parameters propagated from parent component 75 :return: InterpolatedString representing the input string. 76 """ 77 if isinstance(string_or_interpolated, str): 78 return InterpolatedString(string=string_or_interpolated, parameters=parameters) 79 else: 80 return string_or_interpolated
Wrapper around a raw string to be interpolated with the Jinja2 templating engine
Attributes:
- string (str): The string to evaluate
- default (Optional[str]): The default value to return if the evaluation returns an empty string
- parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
36 def eval(self, config: Config, **kwargs: Any) -> Any: 37 """ 38 Interpolates the input string using the config and other optional arguments passed as parameter. 39 40 :param config: The user-provided configuration as specified by the source's spec 41 :param kwargs: Optional parameters used for interpolation 42 :return: The interpolated string 43 """ 44 if self._is_plain_string: 45 return self.string 46 if self._is_plain_string is None: 47 # Let's check whether output from evaluation is the same as input. 48 # This indicates occurrence of a plain string, not a template and we can skip Jinja in subsequent runs. 49 evaluated = self._interpolation.eval( 50 self.string, config, self.default, parameters=self._parameters, **kwargs 51 ) 52 self._is_plain_string = self.string == evaluated 53 return evaluated 54 return self._interpolation.eval( 55 self.string, config, self.default, parameters=self._parameters, **kwargs 56 )
Interpolates the input string using the config and other optional arguments passed as parameter.
Parameters
- config: The user-provided configuration as specified by the source's spec
- kwargs: Optional parameters used for interpolation
Returns
The interpolated string
63 @classmethod 64 def create( 65 cls, 66 string_or_interpolated: Union["InterpolatedString", str], 67 *, 68 parameters: Mapping[str, Any], 69 ) -> "InterpolatedString": 70 """ 71 Helper function to obtain an InterpolatedString from either a raw string or an InterpolatedString. 72 73 :param string_or_interpolated: either a raw string or an InterpolatedString. 74 :param parameters: parameters propagated from parent component 75 :return: InterpolatedString representing the input string. 76 """ 77 if isinstance(string_or_interpolated, str): 78 return InterpolatedString(string=string_or_interpolated, parameters=parameters) 79 else: 80 return string_or_interpolated
Helper function to obtain an InterpolatedString from either a raw string or an InterpolatedString.
Parameters
- string_or_interpolated: either a raw string or an InterpolatedString.
- parameters: parameters propagated from parent component
Returns
InterpolatedString representing the input string.
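Illustrative usage sketch (the config key api_version and the paths are placeholders, not part of the CDK; the import path follows this module):

    from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString

    config = {"api_version": "v2"}  # hypothetical user config

    # A Jinja template referencing the config is rendered at eval() time.
    path = InterpolatedString.create("/api/{{ config['api_version'] }}/users", parameters={})
    print(path.eval(config))  # expected: /api/v2/users

    # A plain string is returned as-is, and the Jinja engine is skipped on later calls.
    plain = InterpolatedString.create("/api/users", parameters={})
    print(plain.eval(config))  # expected: /api/users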
20class JsonDecoder(Decoder): 21 """ 22 Decoder strategy that returns the json-encoded content of a response, if any. 23 24 Usually, we would try to instantiate the equivalent `CompositeRawDecoder(parser=JsonParser(), stream_response=False)` but there were specific historical behaviors related to the JsonDecoder that we didn't know if we could remove like the fallback on {} in case of errors. 25 """ 26 27 def __init__(self, parameters: Mapping[str, Any]): 28 self._decoder = CompositeRawDecoder(parser=JsonParser(), stream_response=False) 29 30 def is_stream_response(self) -> bool: 31 return self._decoder.is_stream_response() 32 33 def decode( 34 self, response: requests.Response 35 ) -> Generator[MutableMapping[str, Any], None, None]: 36 """ 37 Given the response is an empty string or an emtpy list, the function will return a generator with an empty mapping. 38 """ 39 has_yielded = False 40 try: 41 for element in self._decoder.decode(response): 42 yield element 43 has_yielded = True 44 except Exception: 45 yield {} 46 47 if not has_yielded: 48 yield {}
Decoder strategy that returns the json-encoded content of a response, if any.
Usually, we would instantiate the equivalent CompositeRawDecoder(parser=JsonParser(), stream_response=False),
but the JsonDecoder has specific historical behaviors, such as the fallback to {} on errors, that we are not sure can be removed.
Set to True if you'd like to use the stream=True option in the HTTP requester
33 def decode( 34 self, response: requests.Response 35 ) -> Generator[MutableMapping[str, Any], None, None]: 36 """ 37 Given the response is an empty string or an emtpy list, the function will return a generator with an empty mapping. 38 """ 39 has_yielded = False 40 try: 41 for element in self._decoder.decode(response): 42 yield element 43 has_yielded = True 44 except Exception: 45 yield {} 46 47 if not has_yielded: 48 yield {}
If the response is an empty string or an empty list, the function will return a generator with an empty mapping.
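A small illustration of the behavior described above, using a stubbed requests.Response (setting the private _content attribute is a test-style shortcut for illustration only, not part of the CDK API):

    import requests

    from airbyte_cdk.sources.declarative.decoders import JsonDecoder

    def fake_response(body: bytes) -> requests.Response:
        # Test-style stub: attach a raw body to a bare Response object.
        response = requests.Response()
        response.status_code = 200
        response._content = body
        return response

    decoder = JsonDecoder(parameters={})

    # A JSON object body is yielded as a single mapping.
    print(list(decoder.decode(fake_response(b'{"id": 1, "name": "example"}'))))
    # Empty or invalid bodies fall back to an empty mapping, per the docstring above.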
33@dataclass 34class JsonFileSchemaLoader(ResourceSchemaLoader, SchemaLoader): 35 """ 36 Loads the schema from a json file 37 38 Attributes: 39 file_path (Union[InterpolatedString, str]): The path to the json file describing the schema 40 name (str): The stream's name 41 config (Config): The user-provided configuration as specified by the source's spec 42 parameters (Mapping[str, Any]): Additional arguments to pass to the string interpolation if needed 43 """ 44 45 config: Config 46 parameters: InitVar[Mapping[str, Any]] 47 file_path: Union[InterpolatedString, str] = field(default="") 48 49 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 50 if not self.file_path: 51 self.file_path = _default_file_path() 52 self.file_path = InterpolatedString.create(self.file_path, parameters=parameters) 53 54 def get_json_schema(self) -> Mapping[str, Any]: 55 # todo: It is worth revisiting if we can replace file_path with just file_name if every schema is in the /schemas directory 56 # this would require that we find a creative solution to store or retrieve source_name in here since the files are mounted there 57 json_schema_path = self._get_json_filepath() 58 resource, schema_path = self.extract_resource_and_schema_path(json_schema_path) 59 raw_json_file = pkgutil.get_data(resource, schema_path) 60 61 if not raw_json_file: 62 raise IOError(f"Cannot find file {json_schema_path}") 63 try: 64 raw_schema = json.loads(raw_json_file) 65 except ValueError as err: 66 raise RuntimeError(f"Invalid JSON file format for file {json_schema_path}") from err 67 self.package_name = resource 68 return self._resolve_schema_references(raw_schema) 69 70 def _get_json_filepath(self) -> Any: 71 return self.file_path.eval(self.config) # type: ignore # file_path is always cast to an interpolated string 72 73 @staticmethod 74 def extract_resource_and_schema_path(json_schema_path: str) -> Tuple[str, str]: 75 """ 76 When the connector is running on a docker container, package_data is accessible from the resource (source_<name>), so we extract 77 the resource from the first part of the schema path and the remaining path is used to find the schema file. This is a slight 78 hack to identify the source name while we are in the airbyte_cdk module. 79 :param json_schema_path: The path to the schema JSON file 80 :return: Tuple of the resource name and the path to the schema file 81 """ 82 split_path = json_schema_path.split("/") 83 84 if split_path[0] == "" or split_path[0] == ".": 85 split_path = split_path[1:] 86 87 if len(split_path) == 0: 88 return "", "" 89 90 if len(split_path) == 1: 91 return "", split_path[0] 92 93 return split_path[0], "/".join(split_path[1:])
Loads the schema from a json file
Attributes:
- file_path (Union[InterpolatedString, str]): The path to the json file describing the schema
- name (str): The stream's name
- config (Config): The user-provided configuration as specified by the source's spec
- parameters (Mapping[str, Any]): Additional arguments to pass to the string interpolation if needed
54 def get_json_schema(self) -> Mapping[str, Any]: 55 # todo: It is worth revisiting if we can replace file_path with just file_name if every schema is in the /schemas directory 56 # this would require that we find a creative solution to store or retrieve source_name in here since the files are mounted there 57 json_schema_path = self._get_json_filepath() 58 resource, schema_path = self.extract_resource_and_schema_path(json_schema_path) 59 raw_json_file = pkgutil.get_data(resource, schema_path) 60 61 if not raw_json_file: 62 raise IOError(f"Cannot find file {json_schema_path}") 63 try: 64 raw_schema = json.loads(raw_json_file) 65 except ValueError as err: 66 raise RuntimeError(f"Invalid JSON file format for file {json_schema_path}") from err 67 self.package_name = resource 68 return self._resolve_schema_references(raw_schema)
Returns a mapping describing the stream's schema
73 @staticmethod 74 def extract_resource_and_schema_path(json_schema_path: str) -> Tuple[str, str]: 75 """ 76 When the connector is running on a docker container, package_data is accessible from the resource (source_<name>), so we extract 77 the resource from the first part of the schema path and the remaining path is used to find the schema file. This is a slight 78 hack to identify the source name while we are in the airbyte_cdk module. 79 :param json_schema_path: The path to the schema JSON file 80 :return: Tuple of the resource name and the path to the schema file 81 """ 82 split_path = json_schema_path.split("/") 83 84 if split_path[0] == "" or split_path[0] == ".": 85 split_path = split_path[1:] 86 87 if len(split_path) == 0: 88 return "", "" 89 90 if len(split_path) == 1: 91 return "", split_path[0] 92 93 return split_path[0], "/".join(split_path[1:])
When the connector is running in a Docker container, package_data is accessible from the resource (source_<name>), so we extract the resource from the first part of the schema path and use the remaining path to find the schema file. This is a slight hack to identify the source name while we are in the airbyte_cdk module.
Parameters
- json_schema_path: The path to the schema JSON file
Returns
Tuple of the resource name and the path to the schema file
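For illustration, the splitting behavior of the static method shown above (the paths are hypothetical; the class is assumed importable from airbyte_cdk.sources.declarative.schema):

    from airbyte_cdk.sources.declarative.schema import JsonFileSchemaLoader

    # A leading "./" (or "/") segment is dropped; the first remaining segment is the resource name.
    print(JsonFileSchemaLoader.extract_resource_and_schema_path("./source_example/schemas/users.json"))
    # expected: ('source_example', 'schemas/users.json')

    # A bare file name has no resource component.
    print(JsonFileSchemaLoader.extract_resource_and_schema_path("users.json"))
    # expected: ('', 'users.json')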
20class LegacyToPerPartitionStateMigration(StateMigration): 21 """ 22 Transforms the input state for per-partitioned streams from the legacy format to the low-code format. 23 The cursor field and partition ID fields are automatically extracted from the stream's DatetimebasedCursor and SubstreamPartitionRouter. 24 25 Example input state: 26 { 27 "13506132": { 28 "last_changed": "2022-12-27T08:34:39+00:00" 29 } 30 Example output state: 31 { 32 "partition": {"id": "13506132"}, 33 "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"} 34 } 35 """ 36 37 def __init__( 38 self, 39 partition_router: SubstreamPartitionRouter, 40 cursor: CustomIncrementalSync | DatetimeBasedCursor, 41 config: Mapping[str, Any], 42 parameters: Mapping[str, Any], 43 ): 44 self._partition_router = partition_router 45 self._cursor = cursor 46 self._config = config 47 self._parameters = parameters 48 self._partition_key_field = InterpolatedString.create( 49 self._get_partition_field(self._partition_router), parameters=self._parameters 50 ).eval(self._config) 51 self._cursor_field = InterpolatedString.create( 52 self._cursor.cursor_field, parameters=self._parameters 53 ).eval(self._config) 54 55 def _get_partition_field(self, partition_router: SubstreamPartitionRouter) -> str: 56 parent_stream_config = partition_router.parent_stream_configs[0] 57 58 # Retrieve the partition field with a condition, as properties are returned as a dictionary for custom components. 59 partition_field = ( 60 parent_stream_config.partition_field 61 if isinstance(parent_stream_config, ParentStreamConfig) 62 else parent_stream_config.get("partition_field") # type: ignore # See above comment on why parent_stream_config might be a dict 63 ) 64 65 return partition_field 66 67 def should_migrate(self, stream_state: Mapping[str, Any]) -> bool: 68 if _is_already_migrated(stream_state): 69 return False 70 71 # There is exactly one parent stream 72 number_of_parent_streams = len(self._partition_router.parent_stream_configs) # type: ignore # custom partition will introduce this attribute if needed 73 if number_of_parent_streams != 1: 74 # There should be exactly one parent stream 75 return False 76 """ 77 The expected state format is 78 "<parent_key_id>" : { 79 "<cursor_field>" : "<cursor_value>" 80 } 81 """ 82 if stream_state: 83 for key, value in stream_state.items(): 84 if isinstance(value, dict): 85 keys = list(value.keys()) 86 if len(keys) != 1: 87 # The input partitioned state should only have one key 88 return False 89 if keys[0] != self._cursor_field: 90 # Unexpected key. Found {keys[0]}. Expected {self._cursor.cursor_field} 91 return False 92 return True 93 94 def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]: 95 states = [ 96 {"partition": {self._partition_key_field: key}, "cursor": value} 97 for key, value in stream_state.items() 98 ] 99 return {"states": states}
Transforms the input state for per-partitioned streams from the legacy format to the low-code format. The cursor field and partition ID fields are automatically extracted from the stream's DatetimeBasedCursor and SubstreamPartitionRouter.
Example input state:
    {
      "13506132": {
        "last_changed": "2022-12-27T08:34:39+00:00"
      }
    }
Example output state:
    {
      "partition": {"id": "13506132"},
      "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}
    }
37 def __init__( 38 self, 39 partition_router: SubstreamPartitionRouter, 40 cursor: CustomIncrementalSync | DatetimeBasedCursor, 41 config: Mapping[str, Any], 42 parameters: Mapping[str, Any], 43 ): 44 self._partition_router = partition_router 45 self._cursor = cursor 46 self._config = config 47 self._parameters = parameters 48 self._partition_key_field = InterpolatedString.create( 49 self._get_partition_field(self._partition_router), parameters=self._parameters 50 ).eval(self._config) 51 self._cursor_field = InterpolatedString.create( 52 self._cursor.cursor_field, parameters=self._parameters 53 ).eval(self._config)
67 def should_migrate(self, stream_state: Mapping[str, Any]) -> bool: 68 if _is_already_migrated(stream_state): 69 return False 70 71 # There is exactly one parent stream 72 number_of_parent_streams = len(self._partition_router.parent_stream_configs) # type: ignore # custom partition will introduce this attribute if needed 73 if number_of_parent_streams != 1: 74 # There should be exactly one parent stream 75 return False 76 """ 77 The expected state format is 78 "<parent_key_id>" : { 79 "<cursor_field>" : "<cursor_value>" 80 } 81 """ 82 if stream_state: 83 for key, value in stream_state.items(): 84 if isinstance(value, dict): 85 keys = list(value.keys()) 86 if len(keys) != 1: 87 # The input partitioned state should only have one key 88 return False 89 if keys[0] != self._cursor_field: 90 # Unexpected key. Found {keys[0]}. Expected {self._cursor.cursor_field} 91 return False 92 return True
Check if the stream_state should be migrated
Parameters
- stream_state: The stream_state to potentially migrate
Returns
true if the state is of the expected format and should be migrated. False otherwise.
94 def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]: 95 states = [ 96 {"partition": {self._partition_key_field: key}, "cursor": value} 97 for key, value in stream_state.items() 98 ] 99 return {"states": states}
Migrate the stream_state. Assumes should_migrate(stream_state) returned True.
Parameters
- stream_state: The stream_state to migrate
Returns
The migrated stream_state
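A sketch of the shape of this transformation, using the docstring's example plus a second, hypothetical partition; the "id" partition key comes from the partition router's partition_field:

    legacy_state = {
        "13506132": {"last_changed": "2022-12-27T08:34:39+00:00"},
        "14351124": {"last_changed": "2022-12-28T10:11:12+00:00"},  # hypothetical second partition
    }

    # migrate() wraps each legacy entry into the low-code per-partition format:
    migrated_state = {
        "states": [
            {"partition": {"id": "13506132"}, "cursor": {"last_changed": "2022-12-27T08:34:39+00:00"}},
            {"partition": {"id": "14351124"}, "cursor": {"last_changed": "2022-12-28T10:11:12+00:00"}},
        ]
    }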
61class ManifestDeclarativeSource(DeclarativeSource): 62 """Declarative source defined by a manifest of low-code components that define source connector behavior""" 63 64 def __init__( 65 self, 66 source_config: ConnectionDefinition, 67 *, 68 config: Mapping[str, Any] | None = None, 69 debug: bool = False, 70 emit_connector_builder_messages: bool = False, 71 component_factory: Optional[ModelToComponentFactory] = None, 72 ): 73 """ 74 Args: 75 config: The provided config dict. 76 source_config: The manifest of low-code components that describe the source connector. 77 debug: True if debug mode is enabled. 78 emit_connector_builder_messages: True if messages should be emitted to the connector builder. 79 component_factory: optional factory if ModelToComponentFactory's default behavior needs to be tweaked. 80 """ 81 self.logger = logging.getLogger(f"airbyte.{self.name}") 82 # For ease of use we don't require the type to be specified at the top level manifest, but it should be included during processing 83 manifest = dict(source_config) 84 if "type" not in manifest: 85 manifest["type"] = "DeclarativeSource" 86 87 # If custom components are needed, locate and/or register them. 88 self.components_module: ModuleType | None = get_registered_components_module(config=config) 89 90 resolved_source_config = ManifestReferenceResolver().preprocess_manifest(manifest) 91 propagated_source_config = ManifestComponentTransformer().propagate_types_and_parameters( 92 "", resolved_source_config, {} 93 ) 94 self._source_config = propagated_source_config 95 self._debug = debug 96 self._emit_connector_builder_messages = emit_connector_builder_messages 97 self._constructor = ( 98 component_factory 99 if component_factory 100 else ModelToComponentFactory( 101 emit_connector_builder_messages, 102 max_concurrent_async_job_count=source_config.get("max_concurrent_async_job_count"), 103 ) 104 ) 105 self._message_repository = self._constructor.get_message_repository() 106 self._slice_logger: SliceLogger = ( 107 AlwaysLogSliceLogger() if emit_connector_builder_messages else DebugSliceLogger() 108 ) 109 110 self._validate_source() 111 112 @property 113 def resolved_manifest(self) -> Mapping[str, Any]: 114 return self._source_config 115 116 @property 117 def message_repository(self) -> MessageRepository: 118 return self._message_repository 119 120 @property 121 def connection_checker(self) -> ConnectionChecker: 122 check = self._source_config["check"] 123 if "type" not in check: 124 check["type"] = "CheckStream" 125 check_stream = self._constructor.create_component( 126 COMPONENTS_CHECKER_TYPE_MAPPING[check["type"]], 127 check, 128 dict(), 129 emit_connector_builder_messages=self._emit_connector_builder_messages, 130 ) 131 if isinstance(check_stream, ConnectionChecker): 132 return check_stream 133 else: 134 raise ValueError( 135 f"Expected to generate a ConnectionChecker component, but received {check_stream.__class__}" 136 ) 137 138 def streams(self, config: Mapping[str, Any]) -> List[Stream]: 139 self._emit_manifest_debug_message( 140 extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)} 141 ) 142 143 stream_configs = self._stream_configs(self._source_config) + self._dynamic_stream_configs( 144 self._source_config, config 145 ) 146 147 api_budget_model = self._source_config.get("api_budget") 148 if api_budget_model: 149 self._constructor.set_api_budget(api_budget_model, config) 150 151 source_streams = [ 152 self._constructor.create_component( 153 StateDelegatingStreamModel 154 if 
stream_config.get("type") == StateDelegatingStreamModel.__name__ 155 else DeclarativeStreamModel, 156 stream_config, 157 config, 158 emit_connector_builder_messages=self._emit_connector_builder_messages, 159 ) 160 for stream_config in self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) 161 ] 162 163 return source_streams 164 165 @staticmethod 166 def _initialize_cache_for_parent_streams( 167 stream_configs: List[Dict[str, Any]], 168 ) -> List[Dict[str, Any]]: 169 parent_streams = set() 170 171 def update_with_cache_parent_configs(parent_configs: list[dict[str, Any]]) -> None: 172 for parent_config in parent_configs: 173 parent_streams.add(parent_config["stream"]["name"]) 174 if parent_config["stream"]["type"] == "StateDelegatingStream": 175 parent_config["stream"]["full_refresh_stream"]["retriever"]["requester"][ 176 "use_cache" 177 ] = True 178 parent_config["stream"]["incremental_stream"]["retriever"]["requester"][ 179 "use_cache" 180 ] = True 181 else: 182 parent_config["stream"]["retriever"]["requester"]["use_cache"] = True 183 184 for stream_config in stream_configs: 185 if stream_config.get("incremental_sync", {}).get("parent_stream"): 186 parent_streams.add(stream_config["incremental_sync"]["parent_stream"]["name"]) 187 stream_config["incremental_sync"]["parent_stream"]["retriever"]["requester"][ 188 "use_cache" 189 ] = True 190 191 elif stream_config.get("retriever", {}).get("partition_router", {}): 192 partition_router = stream_config["retriever"]["partition_router"] 193 194 if isinstance(partition_router, dict) and partition_router.get( 195 "parent_stream_configs" 196 ): 197 update_with_cache_parent_configs(partition_router["parent_stream_configs"]) 198 elif isinstance(partition_router, list): 199 for router in partition_router: 200 if router.get("parent_stream_configs"): 201 update_with_cache_parent_configs(router["parent_stream_configs"]) 202 203 for stream_config in stream_configs: 204 if stream_config["name"] in parent_streams: 205 if stream_config["type"] == "StateDelegatingStream": 206 stream_config["full_refresh_stream"]["retriever"]["requester"]["use_cache"] = ( 207 True 208 ) 209 stream_config["incremental_stream"]["retriever"]["requester"]["use_cache"] = ( 210 True 211 ) 212 else: 213 stream_config["retriever"]["requester"]["use_cache"] = True 214 215 return stream_configs 216 217 def spec(self, logger: logging.Logger) -> ConnectorSpecification: 218 """ 219 Returns the connector specification (spec) as defined in the Airbyte Protocol. The spec is an object describing the possible 220 configurations (e.g: username and password) which can be configured when running this connector. For low-code connectors, this 221 will first attempt to load the spec from the manifest's spec block, otherwise it will load it from "spec.yaml" or "spec.json" 222 in the project root. 
223 """ 224 self._configure_logger_level(logger) 225 self._emit_manifest_debug_message( 226 extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)} 227 ) 228 229 spec = self._source_config.get("spec") 230 if spec: 231 if "type" not in spec: 232 spec["type"] = "Spec" 233 spec_component = self._constructor.create_component(SpecModel, spec, dict()) 234 return spec_component.generate_spec() 235 else: 236 return super().spec(logger) 237 238 def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus: 239 self._configure_logger_level(logger) 240 return super().check(logger, config) 241 242 def read( 243 self, 244 logger: logging.Logger, 245 config: Mapping[str, Any], 246 catalog: ConfiguredAirbyteCatalog, 247 state: Optional[List[AirbyteStateMessage]] = None, 248 ) -> Iterator[AirbyteMessage]: 249 self._configure_logger_level(logger) 250 yield from super().read(logger, config, catalog, state) 251 252 def _configure_logger_level(self, logger: logging.Logger) -> None: 253 """ 254 Set the log level to logging.DEBUG if debug mode is enabled 255 """ 256 if self._debug: 257 logger.setLevel(logging.DEBUG) 258 259 def _validate_source(self) -> None: 260 """ 261 Validates the connector manifest against the declarative component schema 262 """ 263 try: 264 raw_component_schema = pkgutil.get_data( 265 "airbyte_cdk", "sources/declarative/declarative_component_schema.yaml" 266 ) 267 if raw_component_schema is not None: 268 declarative_component_schema = yaml.load( 269 raw_component_schema, Loader=yaml.SafeLoader 270 ) 271 else: 272 raise RuntimeError( 273 "Failed to read manifest component json schema required for validation" 274 ) 275 except FileNotFoundError as e: 276 raise FileNotFoundError( 277 f"Failed to read manifest component json schema required for validation: {e}" 278 ) 279 280 streams = self._source_config.get("streams") 281 dynamic_streams = self._source_config.get("dynamic_streams") 282 if not (streams or dynamic_streams): 283 raise ValidationError( 284 f"A valid manifest should have at least one stream defined. Got {streams}" 285 ) 286 287 try: 288 validate(self._source_config, declarative_component_schema) 289 except ValidationError as e: 290 raise ValidationError( 291 "Validation against json schema defined in declarative_component_schema.yaml schema failed" 292 ) from e 293 294 cdk_version_str = metadata.version("airbyte_cdk") 295 cdk_version = self._parse_version(cdk_version_str, "airbyte-cdk") 296 manifest_version_str = self._source_config.get("version") 297 if manifest_version_str is None: 298 raise RuntimeError( 299 "Manifest version is not defined in the manifest. This is unexpected since it should be a required field. Please contact support." 300 ) 301 manifest_version = self._parse_version(manifest_version_str, "manifest") 302 303 if (cdk_version.major, cdk_version.minor, cdk_version.micro) == (0, 0, 0): 304 # Skipping version compatibility check on unreleased dev branch 305 pass 306 elif (cdk_version.major, cdk_version.minor) < ( 307 manifest_version.major, 308 manifest_version.minor, 309 ): 310 raise ValidationError( 311 f"The manifest version {manifest_version!s} is greater than the airbyte-cdk package version ({cdk_version!s}). Your " 312 f"manifest may contain features that are not in the current CDK version." 
313 ) 314 elif (manifest_version.major, manifest_version.minor) < (0, 29): 315 raise ValidationError( 316 f"The low-code framework was promoted to Beta in airbyte-cdk version 0.29.0 and contains many breaking changes to the " 317 f"language. The manifest version {manifest_version!s} is incompatible with the airbyte-cdk package version " 318 f"{cdk_version!s} which contains these breaking changes." 319 ) 320 321 @staticmethod 322 def _parse_version( 323 version: str, 324 version_type: str, 325 ) -> Version: 326 """Takes a semantic version represented as a string and splits it into a tuple. 327 328 The fourth part (prerelease) is not returned in the tuple. 329 330 Returns: 331 Version: the parsed version object 332 """ 333 try: 334 parsed_version = Version(version) 335 except InvalidVersion as ex: 336 raise ValidationError( 337 f"The {version_type} version '{version}' is not a valid version format." 338 ) from ex 339 else: 340 # No exception 341 return parsed_version 342 343 def _stream_configs(self, manifest: Mapping[str, Any]) -> List[Dict[str, Any]]: 344 # This has a warning flag for static, but after we finish part 4 we'll replace manifest with self._source_config 345 stream_configs: List[Dict[str, Any]] = manifest.get("streams", []) 346 for s in stream_configs: 347 if "type" not in s: 348 s["type"] = "DeclarativeStream" 349 return stream_configs 350 351 def _dynamic_stream_configs( 352 self, manifest: Mapping[str, Any], config: Mapping[str, Any] 353 ) -> List[Dict[str, Any]]: 354 dynamic_stream_definitions: List[Dict[str, Any]] = manifest.get("dynamic_streams", []) 355 dynamic_stream_configs: List[Dict[str, Any]] = [] 356 seen_dynamic_streams: Set[str] = set() 357 358 for dynamic_definition in dynamic_stream_definitions: 359 components_resolver_config = dynamic_definition["components_resolver"] 360 361 if not components_resolver_config: 362 raise ValueError( 363 f"Missing 'components_resolver' in dynamic definition: {dynamic_definition}" 364 ) 365 366 resolver_type = components_resolver_config.get("type") 367 if not resolver_type: 368 raise ValueError( 369 f"Missing 'type' in components resolver configuration: {components_resolver_config}" 370 ) 371 372 if resolver_type not in COMPONENTS_RESOLVER_TYPE_MAPPING: 373 raise ValueError( 374 f"Invalid components resolver type '{resolver_type}'. " 375 f"Expected one of {list(COMPONENTS_RESOLVER_TYPE_MAPPING.keys())}." 376 ) 377 378 if "retriever" in components_resolver_config: 379 components_resolver_config["retriever"]["requester"]["use_cache"] = True 380 381 # Create a resolver for dynamic components based on type 382 components_resolver = self._constructor.create_component( 383 COMPONENTS_RESOLVER_TYPE_MAPPING[resolver_type], components_resolver_config, config 384 ) 385 386 stream_template_config = dynamic_definition["stream_template"] 387 388 for dynamic_stream in components_resolver.resolve_components( 389 stream_template_config=stream_template_config 390 ): 391 if "type" not in dynamic_stream: 392 dynamic_stream["type"] = "DeclarativeStream" 393 394 # Ensure that each stream is created with a unique name 395 name = dynamic_stream.get("name") 396 397 if not isinstance(name, str): 398 raise ValueError( 399 f"Expected stream name {name} to be a string, got {type(name)}." 400 ) 401 402 if name in seen_dynamic_streams: 403 error_message = f"Dynamic streams list contains a duplicate name: {name}. Please contact Airbyte Support." 
404 failure_type = FailureType.system_error 405 406 if resolver_type == "ConfigComponentsResolver": 407 error_message = f"Dynamic streams list contains a duplicate name: {name}. Please check your configuration." 408 failure_type = FailureType.config_error 409 410 raise AirbyteTracedException( 411 message=error_message, 412 internal_message=error_message, 413 failure_type=failure_type, 414 ) 415 416 seen_dynamic_streams.add(name) 417 dynamic_stream_configs.append(dynamic_stream) 418 419 return dynamic_stream_configs 420 421 def _emit_manifest_debug_message(self, extra_args: dict[str, Any]) -> None: 422 self.logger.debug("declarative source created from manifest", extra=extra_args)
Declarative source defined by a manifest of low-code components that define source connector behavior
64 def __init__( 65 self, 66 source_config: ConnectionDefinition, 67 *, 68 config: Mapping[str, Any] | None = None, 69 debug: bool = False, 70 emit_connector_builder_messages: bool = False, 71 component_factory: Optional[ModelToComponentFactory] = None, 72 ): 73 """ 74 Args: 75 config: The provided config dict. 76 source_config: The manifest of low-code components that describe the source connector. 77 debug: True if debug mode is enabled. 78 emit_connector_builder_messages: True if messages should be emitted to the connector builder. 79 component_factory: optional factory if ModelToComponentFactory's default behavior needs to be tweaked. 80 """ 81 self.logger = logging.getLogger(f"airbyte.{self.name}") 82 # For ease of use we don't require the type to be specified at the top level manifest, but it should be included during processing 83 manifest = dict(source_config) 84 if "type" not in manifest: 85 manifest["type"] = "DeclarativeSource" 86 87 # If custom components are needed, locate and/or register them. 88 self.components_module: ModuleType | None = get_registered_components_module(config=config) 89 90 resolved_source_config = ManifestReferenceResolver().preprocess_manifest(manifest) 91 propagated_source_config = ManifestComponentTransformer().propagate_types_and_parameters( 92 "", resolved_source_config, {} 93 ) 94 self._source_config = propagated_source_config 95 self._debug = debug 96 self._emit_connector_builder_messages = emit_connector_builder_messages 97 self._constructor = ( 98 component_factory 99 if component_factory 100 else ModelToComponentFactory( 101 emit_connector_builder_messages, 102 max_concurrent_async_job_count=source_config.get("max_concurrent_async_job_count"), 103 ) 104 ) 105 self._message_repository = self._constructor.get_message_repository() 106 self._slice_logger: SliceLogger = ( 107 AlwaysLogSliceLogger() if emit_connector_builder_messages else DebugSliceLogger() 108 ) 109 110 self._validate_source()
Arguments:
- config: The provided config dict.
- source_config: The manifest of low-code components that describe the source connector.
- debug: True if debug mode is enabled.
- emit_connector_builder_messages: True if messages should be emitted to the connector builder.
- component_factory: optional factory if ModelToComponentFactory's default behavior needs to be tweaked.
120 @property 121 def connection_checker(self) -> ConnectionChecker: 122 check = self._source_config["check"] 123 if "type" not in check: 124 check["type"] = "CheckStream" 125 check_stream = self._constructor.create_component( 126 COMPONENTS_CHECKER_TYPE_MAPPING[check["type"]], 127 check, 128 dict(), 129 emit_connector_builder_messages=self._emit_connector_builder_messages, 130 ) 131 if isinstance(check_stream, ConnectionChecker): 132 return check_stream 133 else: 134 raise ValueError( 135 f"Expected to generate a ConnectionChecker component, but received {check_stream.__class__}" 136 )
Returns the ConnectionChecker to use for the check operation
138 def streams(self, config: Mapping[str, Any]) -> List[Stream]: 139 self._emit_manifest_debug_message( 140 extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)} 141 ) 142 143 stream_configs = self._stream_configs(self._source_config) + self._dynamic_stream_configs( 144 self._source_config, config 145 ) 146 147 api_budget_model = self._source_config.get("api_budget") 148 if api_budget_model: 149 self._constructor.set_api_budget(api_budget_model, config) 150 151 source_streams = [ 152 self._constructor.create_component( 153 StateDelegatingStreamModel 154 if stream_config.get("type") == StateDelegatingStreamModel.__name__ 155 else DeclarativeStreamModel, 156 stream_config, 157 config, 158 emit_connector_builder_messages=self._emit_connector_builder_messages, 159 ) 160 for stream_config in self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) 161 ] 162 163 return source_streams
Parameters
- config: The user-provided configuration as specified by the source's spec. Any stream construction related operation should happen here.
Returns
A list of the streams in this source connector.
217 def spec(self, logger: logging.Logger) -> ConnectorSpecification: 218 """ 219 Returns the connector specification (spec) as defined in the Airbyte Protocol. The spec is an object describing the possible 220 configurations (e.g: username and password) which can be configured when running this connector. For low-code connectors, this 221 will first attempt to load the spec from the manifest's spec block, otherwise it will load it from "spec.yaml" or "spec.json" 222 in the project root. 223 """ 224 self._configure_logger_level(logger) 225 self._emit_manifest_debug_message( 226 extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)} 227 ) 228 229 spec = self._source_config.get("spec") 230 if spec: 231 if "type" not in spec: 232 spec["type"] = "Spec" 233 spec_component = self._constructor.create_component(SpecModel, spec, dict()) 234 return spec_component.generate_spec() 235 else: 236 return super().spec(logger)
Returns the connector specification (spec) as defined in the Airbyte Protocol. The spec is an object describing the possible configurations (e.g: username and password) which can be configured when running this connector. For low-code connectors, this will first attempt to load the spec from the manifest's spec block, otherwise it will load it from "spec.yaml" or "spec.json" in the project root.
238 def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus: 239 self._configure_logger_level(logger) 240 return super().check(logger, config)
Implements the Check Connection operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/#check.
242 def read( 243 self, 244 logger: logging.Logger, 245 config: Mapping[str, Any], 246 catalog: ConfiguredAirbyteCatalog, 247 state: Optional[List[AirbyteStateMessage]] = None, 248 ) -> Iterator[AirbyteMessage]: 249 self._configure_logger_level(logger) 250 yield from super().read(logger, config, catalog, state)
Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.com/understanding-airbyte/airbyte-protocol/.
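A hedged sketch of wiring a manifest into this class; the manifest.yaml path and the config contents are placeholders, and a real manifest must also satisfy the declarative component schema (version, check, streams, ...):

    import logging

    import yaml

    from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource

    with open("manifest.yaml") as f:  # placeholder path
        manifest = yaml.safe_load(f)

    config = {"api_key": "..."}  # placeholder config

    source = ManifestDeclarativeSource(source_config=manifest, config=config)

    # The spec comes from the manifest's spec block, falling back to spec.yaml/spec.json.
    spec = source.spec(logging.getLogger("airbyte"))

    # Streams are built from the manifest's streams and dynamic_streams sections.
    print([stream.name for stream in source.streams(config)])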
14@dataclass 15class MinMaxDatetime: 16 """ 17 Compares the provided date against optional minimum or maximum times. If date is earlier than 18 min_date, then min_date is returned. If date is greater than max_date, then max_date is returned. 19 If neither, the input date is returned. 20 21 The timestamp format accepts the same format codes as datetime.strfptime, which are 22 all the format codes required by the 1989 C standard. 23 Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html 24 25 Attributes: 26 datetime (Union[InterpolatedString, str]): InterpolatedString or string representing the datetime in the format specified by `datetime_format` 27 datetime_format (str): Format of the datetime passed as argument 28 min_datetime (Union[InterpolatedString, str]): Represents the minimum allowed datetime value. 29 max_datetime (Union[InterpolatedString, str]): Represents the maximum allowed datetime value. 30 """ 31 32 datetime: Union[InterpolatedString, str] 33 parameters: InitVar[Mapping[str, Any]] 34 # datetime_format is a unique case where we inherit it from the parent if it is not specified before using the default value 35 # which is why we need dedicated getter/setter methods and private dataclass field 36 datetime_format: str 37 _datetime_format: str = field(init=False, repr=False, default="") 38 min_datetime: Union[InterpolatedString, str] = "" 39 max_datetime: Union[InterpolatedString, str] = "" 40 41 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 42 self.datetime = InterpolatedString.create(self.datetime, parameters=parameters or {}) 43 self._parser = DatetimeParser() 44 self.min_datetime = ( 45 InterpolatedString.create(self.min_datetime, parameters=parameters) # type: ignore [assignment] # expression has type "InterpolatedString | None", variable has type "InterpolatedString | str" 46 if self.min_datetime 47 else None 48 ) # type: ignore 49 self.max_datetime = ( 50 InterpolatedString.create(self.max_datetime, parameters=parameters) # type: ignore [assignment] # expression has type "InterpolatedString | None", variable has type "InterpolatedString | str" 51 if self.max_datetime 52 else None 53 ) # type: ignore 54 55 def get_datetime( 56 self, config: Mapping[str, Any], **additional_parameters: Mapping[str, Any] 57 ) -> dt.datetime: 58 """ 59 Evaluates and returns the datetime 60 :param config: The user-provided configuration as specified by the source's spec 61 :param additional_parameters: Additional arguments to be passed to the strings for interpolation 62 :return: The evaluated datetime 63 """ 64 # We apply a default datetime format here instead of at instantiation, so it can be set by the parent first 65 datetime_format = self._datetime_format 66 if not datetime_format: 67 datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z" 68 69 time = self._parser.parse( 70 str( 71 self.datetime.eval( # type: ignore[union-attr] # str has no attribute "eval" 72 config, 73 **additional_parameters, 74 ) 75 ), 76 datetime_format, 77 ) # type: ignore # datetime is always cast to an interpolated string 78 79 if self.min_datetime: 80 min_time = str(self.min_datetime.eval(config, **additional_parameters)) # type: ignore # min_datetime is always cast to an interpolated string 81 if min_time: 82 min_datetime = self._parser.parse(min_time, datetime_format) # type: ignore # min_datetime is always cast to an interpolated string 83 time = max(time, min_datetime) 84 if self.max_datetime: 85 max_time = str(self.max_datetime.eval(config, **additional_parameters)) # 
type: ignore # max_datetime is always cast to an interpolated string 86 if max_time: 87 max_datetime = self._parser.parse(max_time, datetime_format) 88 time = min(time, max_datetime) 89 return time 90 91 @property # type: ignore # properties don't play well with dataclasses... 92 def datetime_format(self) -> str: 93 """The format of the string representing the datetime""" 94 return self._datetime_format 95 96 @datetime_format.setter 97 def datetime_format(self, value: str) -> None: 98 """Setter for the datetime format""" 99 # Covers the case where datetime_format is not provided in the constructor, which causes the property object 100 # to be set which we need to avoid doing 101 if not isinstance(value, property): 102 self._datetime_format = value 103 104 @classmethod 105 def create( 106 cls, 107 interpolated_string_or_min_max_datetime: Union[InterpolatedString, str, "MinMaxDatetime"], 108 parameters: Optional[Mapping[str, Any]] = None, 109 ) -> "MinMaxDatetime": 110 if parameters is None: 111 parameters = {} 112 if isinstance(interpolated_string_or_min_max_datetime, InterpolatedString) or isinstance( 113 interpolated_string_or_min_max_datetime, str 114 ): 115 return MinMaxDatetime( # type: ignore [call-arg] 116 datetime=interpolated_string_or_min_max_datetime, parameters=parameters 117 ) 118 else: 119 return interpolated_string_or_min_max_datetime
Compares the provided date against optional minimum or maximum times. If date is earlier than min_date, then min_date is returned. If date is greater than max_date, then max_date is returned. If neither, the input date is returned.
The timestamp format accepts the same format codes as datetime.strftime, which are all the format codes required by the 1989 C standard. Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html
Attributes:
- datetime (Union[InterpolatedString, str]): InterpolatedString or string representing the datetime in the format specified by datetime_format
- datetime_format (str): Format of the datetime passed as argument
- min_datetime (Union[InterpolatedString, str]): Represents the minimum allowed datetime value.
- max_datetime (Union[InterpolatedString, str]): Represents the maximum allowed datetime value.
91 @property # type: ignore # properties don't play well with dataclasses... 92 def datetime_format(self) -> str: 93 """The format of the string representing the datetime""" 94 return self._datetime_format
The format of the string representing the datetime
55 def get_datetime( 56 self, config: Mapping[str, Any], **additional_parameters: Mapping[str, Any] 57 ) -> dt.datetime: 58 """ 59 Evaluates and returns the datetime 60 :param config: The user-provided configuration as specified by the source's spec 61 :param additional_parameters: Additional arguments to be passed to the strings for interpolation 62 :return: The evaluated datetime 63 """ 64 # We apply a default datetime format here instead of at instantiation, so it can be set by the parent first 65 datetime_format = self._datetime_format 66 if not datetime_format: 67 datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z" 68 69 time = self._parser.parse( 70 str( 71 self.datetime.eval( # type: ignore[union-attr] # str has no attribute "eval" 72 config, 73 **additional_parameters, 74 ) 75 ), 76 datetime_format, 77 ) # type: ignore # datetime is always cast to an interpolated string 78 79 if self.min_datetime: 80 min_time = str(self.min_datetime.eval(config, **additional_parameters)) # type: ignore # min_datetime is always cast to an interpolated string 81 if min_time: 82 min_datetime = self._parser.parse(min_time, datetime_format) # type: ignore # min_datetime is always cast to an interpolated string 83 time = max(time, min_datetime) 84 if self.max_datetime: 85 max_time = str(self.max_datetime.eval(config, **additional_parameters)) # type: ignore # max_datetime is always cast to an interpolated string 86 if max_time: 87 max_datetime = self._parser.parse(max_time, datetime_format) 88 time = min(time, max_datetime) 89 return time
Evaluates and returns the datetime
Parameters
- config: The user-provided configuration as specified by the source's spec
- additional_parameters: Additional arguments to be passed to the strings for interpolation
Returns
The evaluated datetime
104 @classmethod 105 def create( 106 cls, 107 interpolated_string_or_min_max_datetime: Union[InterpolatedString, str, "MinMaxDatetime"], 108 parameters: Optional[Mapping[str, Any]] = None, 109 ) -> "MinMaxDatetime": 110 if parameters is None: 111 parameters = {} 112 if isinstance(interpolated_string_or_min_max_datetime, InterpolatedString) or isinstance( 113 interpolated_string_or_min_max_datetime, str 114 ): 115 return MinMaxDatetime( # type: ignore [call-arg] 116 datetime=interpolated_string_or_min_max_datetime, parameters=parameters 117 ) 118 else: 119 return interpolated_string_or_min_max_datetime
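A usage sketch, assuming a hypothetical start_date config key whose value is in the default %Y-%m-%dT%H:%M:%S.%f%z format:

    from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime

    config = {"start_date": "2021-06-01T00:00:00.000000+0000"}  # hypothetical config key

    clamped = MinMaxDatetime(
        datetime="{{ config['start_date'] }}",
        parameters={},
        min_datetime="2022-01-01T00:00:00.000000+0000",
    )

    # start_date is earlier than min_datetime, so the minimum is returned instead.
    print(clamped.get_datetime(config))  # expected: 2022-01-01 00:00:00+00:00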
33@dataclass 34class NoAuth(DeclarativeAuthenticator): 35 parameters: InitVar[Mapping[str, Any]] 36 37 @property 38 def auth_header(self) -> str: 39 return "" 40 41 @property 42 def token(self) -> str: 43 return ""
23@dataclass 24class OffsetIncrement(PaginationStrategy): 25 """ 26 Pagination strategy that returns the number of records reads so far and returns it as the next page token 27 Examples: 28 # page_size to be a constant integer value 29 pagination_strategy: 30 type: OffsetIncrement 31 page_size: 2 32 33 # page_size to be a constant string value 34 pagination_strategy: 35 type: OffsetIncrement 36 page_size: "2" 37 38 # page_size to be an interpolated string value 39 pagination_strategy: 40 type: OffsetIncrement 41 page_size: "{{ parameters['items_per_page'] }}" 42 43 Attributes: 44 page_size (InterpolatedString): the number of records to request 45 """ 46 47 config: Config 48 page_size: Optional[Union[str, int]] 49 parameters: InitVar[Mapping[str, Any]] 50 decoder: Decoder = field( 51 default_factory=lambda: PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 52 ) 53 inject_on_first_request: bool = False 54 55 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 56 page_size = str(self.page_size) if isinstance(self.page_size, int) else self.page_size 57 if page_size: 58 self._page_size: Optional[InterpolatedString] = InterpolatedString( 59 page_size, parameters=parameters 60 ) 61 else: 62 self._page_size = None 63 64 @property 65 def initial_token(self) -> Optional[Any]: 66 if self.inject_on_first_request: 67 return 0 68 return None 69 70 def next_page_token( 71 self, 72 response: requests.Response, 73 last_page_size: int, 74 last_record: Optional[Record], 75 last_page_token_value: Optional[Any] = None, 76 ) -> Optional[Any]: 77 decoded_response = next(self.decoder.decode(response)) 78 79 # Stop paginating when there are fewer records than the page size or the current page has no records 80 if ( 81 self._page_size 82 and last_page_size < self._page_size.eval(self.config, response=decoded_response) 83 ) or last_page_size == 0: 84 return None 85 elif last_page_token_value is None: 86 # If the OffsetIncrement strategy does not inject on the first request, the incoming last_page_token_value 87 # will be None. For this case, we assume that None was the first page and progress to the next offset 88 return 0 + last_page_size 89 elif not isinstance(last_page_token_value, int): 90 raise ValueError( 91 f"Last page token value {last_page_token_value} for OffsetIncrement pagination strategy was not an integer" 92 ) 93 else: 94 return last_page_token_value + last_page_size 95 96 def get_page_size(self) -> Optional[int]: 97 if self._page_size: 98 page_size = self._page_size.eval(self.config) 99 if not isinstance(page_size, int): 100 raise Exception(f"{page_size} is of type {type(page_size)}. Expected {int}") 101 return page_size 102 else: 103 return None
Pagination strategy that returns the number of records read so far and uses it as the next page token
Examples:
# page_size to be a constant integer value
pagination_strategy:
  type: OffsetIncrement
  page_size: 2

# page_size to be a constant string value
pagination_strategy:
  type: OffsetIncrement
  page_size: "2"

# page_size to be an interpolated string value
pagination_strategy:
  type: OffsetIncrement
  page_size: "{{ parameters['items_per_page'] }}"
Attributes:
- page_size (InterpolatedString): the number of records to request
64 @property 65 def initial_token(self) -> Optional[Any]: 66 if self.inject_on_first_request: 67 return 0 68 return None
Return the initial value of the token
70 def next_page_token( 71 self, 72 response: requests.Response, 73 last_page_size: int, 74 last_record: Optional[Record], 75 last_page_token_value: Optional[Any] = None, 76 ) -> Optional[Any]: 77 decoded_response = next(self.decoder.decode(response)) 78 79 # Stop paginating when there are fewer records than the page size or the current page has no records 80 if ( 81 self._page_size 82 and last_page_size < self._page_size.eval(self.config, response=decoded_response) 83 ) or last_page_size == 0: 84 return None 85 elif last_page_token_value is None: 86 # If the OffsetIncrement strategy does not inject on the first request, the incoming last_page_token_value 87 # will be None. For this case, we assume that None was the first page and progress to the next offset 88 return 0 + last_page_size 89 elif not isinstance(last_page_token_value, int): 90 raise ValueError( 91 f"Last page token value {last_page_token_value} for OffsetIncrement pagination strategy was not an integer" 92 ) 93 else: 94 return last_page_token_value + last_page_size
Parameters
- response: response to process
- last_page_size: the number of records read from the response
- last_record: the last record extracted from the response
- last_page_token_value: The current value of the page token made on the last request
Returns
next page token. Returns None if there are no more pages to fetch
96 def get_page_size(self) -> Optional[int]: 97 if self._page_size: 98 page_size = self._page_size.eval(self.config) 99 if not isinstance(page_size, int): 100 raise Exception(f"{page_size} is of type {type(page_size)}. Expected {int}") 101 return page_size 102 else: 103 return None
Returns
page size: The number of records to fetch in a page. Returns None if unspecified
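A small sketch of the offset behavior; the strategy is normally built from the manifest rather than instantiated directly, the page size is a placeholder, and the interpolation layer is assumed to cast numeric page sizes back to int:

    from airbyte_cdk.sources.declarative.requesters.paginators.strategies.offset_increment import (
        OffsetIncrement,
    )

    strategy = OffsetIncrement(
        config={},
        page_size=100,  # placeholder
        parameters={},
        inject_on_first_request=True,
    )

    print(strategy.initial_token)    # 0, because inject_on_first_request is True
    print(strategy.get_page_size())  # expected: 100
    # next_page_token() then advances the offset by last_page_size on each full page
    # and returns None once a page has fewer than page_size records.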
18@dataclass 19class PageIncrement(PaginationStrategy): 20 """ 21 Pagination strategy that returns the number of pages reads so far and returns it as the next page token 22 23 Attributes: 24 page_size (int): the number of records to request 25 start_from_page (int): number of the initial page 26 """ 27 28 config: Config 29 page_size: Optional[Union[str, int]] 30 parameters: InitVar[Mapping[str, Any]] 31 start_from_page: int = 0 32 inject_on_first_request: bool = False 33 34 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 35 if isinstance(self.page_size, int) or (self.page_size is None): 36 self._page_size = self.page_size 37 else: 38 page_size = InterpolatedString(self.page_size, parameters=parameters).eval(self.config) 39 if not isinstance(page_size, int): 40 raise Exception(f"{page_size} is of type {type(page_size)}. Expected {int}") 41 self._page_size = page_size 42 43 @property 44 def initial_token(self) -> Optional[Any]: 45 if self.inject_on_first_request: 46 return self.start_from_page 47 return None 48 49 def next_page_token( 50 self, 51 response: requests.Response, 52 last_page_size: int, 53 last_record: Optional[Record], 54 last_page_token_value: Optional[Any], 55 ) -> Optional[Any]: 56 # Stop paginating when there are fewer records than the page size or the current page has no records 57 if (self._page_size and last_page_size < self._page_size) or last_page_size == 0: 58 return None 59 elif last_page_token_value is None: 60 # If the PageIncrement strategy does not inject on the first request, the incoming last_page_token_value 61 # may be None. When this is the case, we assume we've already requested the first page specified by 62 # start_from_page and must now get the next page 63 return self.start_from_page + 1 64 elif not isinstance(last_page_token_value, int): 65 raise ValueError( 66 f"Last page token value {last_page_token_value} for PageIncrement pagination strategy was not an integer" 67 ) 68 else: 69 return last_page_token_value + 1 70 71 def get_page_size(self) -> Optional[int]: 72 return self._page_size
Pagination strategy that returns the number of pages read so far and uses it as the next page token
Attributes:
- page_size (int): the number of records to request
- start_from_page (int): number of the initial page
43 @property 44 def initial_token(self) -> Optional[Any]: 45 if self.inject_on_first_request: 46 return self.start_from_page 47 return None
Return the initial value of the token
49 def next_page_token( 50 self, 51 response: requests.Response, 52 last_page_size: int, 53 last_record: Optional[Record], 54 last_page_token_value: Optional[Any], 55 ) -> Optional[Any]: 56 # Stop paginating when there are fewer records than the page size or the current page has no records 57 if (self._page_size and last_page_size < self._page_size) or last_page_size == 0: 58 return None 59 elif last_page_token_value is None: 60 # If the PageIncrement strategy does not inject on the first request, the incoming last_page_token_value 61 # may be None. When this is the case, we assume we've already requested the first page specified by 62 # start_from_page and must now get the next page 63 return self.start_from_page + 1 64 elif not isinstance(last_page_token_value, int): 65 raise ValueError( 66 f"Last page token value {last_page_token_value} for PageIncrement pagination strategy was not an integer" 67 ) 68 else: 69 return last_page_token_value + 1
Parameters
- response: response to process
- last_page_size: the number of records read from the response
- last_record: the last record extracted from the response
- last_page_token_value: The current value of the page token made on the last request
Returns
next page token. Returns None if there are no more pages to fetch
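A small sketch of the page arithmetic, with placeholder values:

    from airbyte_cdk.sources.declarative.requesters.paginators.strategies.page_increment import (
        PageIncrement,
    )

    strategy = PageIncrement(
        config={},
        page_size=50,  # placeholder
        parameters={},
        start_from_page=1,
        inject_on_first_request=True,
    )

    print(strategy.initial_token)    # 1: the first request asks for start_from_page
    print(strategy.get_page_size())  # 50
    # next_page_token() then returns last_page_token_value + 1 for every full page,
    # and None once a page comes back with fewer than page_size records.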
15@dataclass 16class PaginationStrategy: 17 """ 18 Defines how to get the next page token 19 """ 20 21 @property 22 @abstractmethod 23 def initial_token(self) -> Optional[Any]: 24 """ 25 Return the initial value of the token 26 """ 27 28 @abstractmethod 29 def next_page_token( 30 self, 31 response: requests.Response, 32 last_page_size: int, 33 last_record: Optional[Record], 34 last_page_token_value: Optional[Any], 35 ) -> Optional[Any]: 36 """ 37 :param response: response to process 38 :param last_page_size: the number of records read from the response 39 :param last_record: the last record extracted from the response 40 :param last_page_token_value: The current value of the page token made on the last request 41 :return: next page token. Returns None if there are no more pages to fetch 42 """ 43 pass 44 45 @abstractmethod 46 def get_page_size(self) -> Optional[int]: 47 """ 48 :return: page size: The number of records to fetch in a page. Returns None if unspecified 49 """
Defines how to get the next page token
21 @property 22 @abstractmethod 23 def initial_token(self) -> Optional[Any]: 24 """ 25 Return the initial value of the token 26 """
Return the initial value of the token
28 @abstractmethod 29 def next_page_token( 30 self, 31 response: requests.Response, 32 last_page_size: int, 33 last_record: Optional[Record], 34 last_page_token_value: Optional[Any], 35 ) -> Optional[Any]: 36 """ 37 :param response: response to process 38 :param last_page_size: the number of records read from the response 39 :param last_record: the last record extracted from the response 40 :param last_page_token_value: The current value of the page token made on the last request 41 :return: next page token. Returns None if there are no more pages to fetch 42 """ 43 pass
Parameters
- response: response to process
- last_page_size: the number of records read from the response
- last_record: the last record extracted from the response
- last_page_token_value: The current value of the page token made on the last request
Returns
next page token. Returns None if there are no more pages to fetch
45 @abstractmethod 46 def get_page_size(self) -> Optional[int]: 47 """ 48 :return: page size: The number of records to fetch in a page. Returns None if unspecified 49 """
Returns
page size: The number of records to fetch in a page. Returns None if unspecified
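A minimal sketch of a custom strategy implementing this interface; the class, its max_pages field, and the stopping rule are made up for illustration, and the import path is assumed to follow this module:

    from dataclasses import dataclass
    from typing import Any, Optional

    import requests

    from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import (
        PaginationStrategy,
    )


    @dataclass
    class FixedPageCount(PaginationStrategy):
        """Hypothetical strategy: page numbers 0, 1, ..., capped at max_pages."""

        max_pages: int = 3

        @property
        def initial_token(self) -> Optional[Any]:
            return 0

        def next_page_token(
            self,
            response: requests.Response,
            last_page_size: int,
            last_record: Optional[Any],  # Optional[Record] in the real signature
            last_page_token_value: Optional[Any],
        ) -> Optional[Any]:
            next_page = (last_page_token_value or 0) + 1
            # Stop when a page comes back empty or the cap is reached.
            if last_page_size == 0 or next_page >= self.max_pages:
                return None
            return next_page

        def get_page_size(self) -> Optional[int]:
            return None  # page size is left unspecified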
31@dataclass 32class ParentStreamConfig: 33 """ 34 Describes how to create a stream slice from a parent stream 35 36 stream: The stream to read records from 37 parent_key: The key of the parent stream's records that will be the stream slice key 38 partition_field: The partition key 39 extra_fields: Additional field paths to include in the stream slice 40 request_option: How to inject the slice value on an outgoing HTTP request 41 incremental_dependency (bool): Indicates if the parent stream should be read incrementally. 42 """ 43 44 stream: "DeclarativeStream" # Parent streams must be DeclarativeStream because we can't know which part of the stream slice is a partition for regular Stream 45 parent_key: Union[InterpolatedString, str] 46 partition_field: Union[InterpolatedString, str] 47 config: Config 48 parameters: InitVar[Mapping[str, Any]] 49 extra_fields: Optional[Union[List[List[str]], List[List[InterpolatedString]]]] = ( 50 None # List of field paths (arrays of strings) 51 ) 52 request_option: Optional[RequestOption] = None 53 incremental_dependency: bool = False 54 lazy_read_pointer: Optional[List[Union[InterpolatedString, str]]] = None 55 56 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 57 self.parent_key = InterpolatedString.create(self.parent_key, parameters=parameters) 58 self.partition_field = InterpolatedString.create( 59 self.partition_field, parameters=parameters 60 ) 61 if self.extra_fields: 62 # Create InterpolatedString for each field path in extra_keys 63 self.extra_fields = [ 64 [InterpolatedString.create(path, parameters=parameters) for path in key_path] 65 for key_path in self.extra_fields 66 ] 67 68 self.lazy_read_pointer = ( 69 [ 70 InterpolatedString.create(path, parameters=parameters) 71 if isinstance(path, str) 72 else path 73 for path in self.lazy_read_pointer 74 ] 75 if self.lazy_read_pointer 76 else None 77 )
Describes how to create a stream slice from a parent stream
- stream: The stream to read records from
- parent_key: The key of the parent stream's records that will be the stream slice key
- partition_field: The partition key
- extra_fields: Additional field paths to include in the stream slice
- request_option: How to inject the slice value on an outgoing HTTP request
- incremental_dependency (bool): Indicates if the parent stream should be read incrementally.
7class ReadException(Exception): 8 """ 9 Raise when there is an error reading data from an API Source 10 """
Raised when there is an error reading data from an API Source
12@dataclass 13class RecordExtractor: 14 """ 15 Responsible for translating an HTTP response into a list of records by extracting records from the response. 16 """ 17 18 @abstractmethod 19 def extract_records( 20 self, 21 response: requests.Response, 22 ) -> Iterable[Mapping[str, Any]]: 23 """ 24 Selects records from the response 25 :param response: The response to extract the records from 26 :return: List of Records extracted from the response 27 """ 28 pass
Responsible for translating an HTTP response into a list of records by extracting records from the response.
18 @abstractmethod 19 def extract_records( 20 self, 21 response: requests.Response, 22 ) -> Iterable[Mapping[str, Any]]: 23 """ 24 Selects records from the response 25 :param response: The response to extract the records from 26 :return: List of Records extracted from the response 27 """ 28 pass
Selects records from the response
Parameters
- response: The response to extract the records from
Returns
List of Records extracted from the response
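Since RecordExtractor is an interface, a concrete implementation only has to yield mappings from the response. Below is a minimal, hedged sketch; the top-level "items" key is an assumption about a particular API, and DpathExtractor (documented elsewhere on this page) is the generic declarative alternative.

from dataclasses import dataclass
from typing import Any, Iterable, Mapping

import requests


@dataclass
class ItemsExtractor(RecordExtractor):
    """Illustrative extractor that reads records from a top-level "items" array."""

    def extract_records(self, response: requests.Response) -> Iterable[Mapping[str, Any]]:
        # Yield each element found under the assumed "items" key of the JSON body.
        yield from response.json().get("items", [])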
17@dataclass 18class RecordFilter: 19 """ 20 Filter applied on a list of Records 21 22 config (Config): The user-provided configuration as specified by the source's spec 23 condition (str): The string representing the predicate to filter a record. Records will be removed if evaluated to False 24 """ 25 26 parameters: InitVar[Mapping[str, Any]] 27 config: Config 28 condition: str = "" 29 30 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 31 self._filter_interpolator = InterpolatedBoolean( 32 condition=self.condition, parameters=parameters 33 ) 34 35 def filter_records( 36 self, 37 records: Iterable[Mapping[str, Any]], 38 stream_state: StreamState, 39 stream_slice: Optional[StreamSlice] = None, 40 next_page_token: Optional[Mapping[str, Any]] = None, 41 ) -> Iterable[Mapping[str, Any]]: 42 kwargs = { 43 "stream_state": stream_state, 44 "stream_slice": stream_slice, 45 "next_page_token": next_page_token, 46 "stream_slice.extra_fields": stream_slice.extra_fields if stream_slice else {}, 47 } 48 for record in records: 49 if self._filter_interpolator.eval(self.config, record=record, **kwargs): 50 yield record
Filter applied on a list of Records
- config (Config): The user-provided configuration as specified by the source's spec
- condition (str): The string predicate used to filter a record; records are removed when it evaluates to False
35 def filter_records( 36 self, 37 records: Iterable[Mapping[str, Any]], 38 stream_state: StreamState, 39 stream_slice: Optional[StreamSlice] = None, 40 next_page_token: Optional[Mapping[str, Any]] = None, 41 ) -> Iterable[Mapping[str, Any]]: 42 kwargs = { 43 "stream_state": stream_state, 44 "stream_slice": stream_slice, 45 "next_page_token": next_page_token, 46 "stream_slice.extra_fields": stream_slice.extra_fields if stream_slice else {}, 47 } 48 for record in records: 49 if self._filter_interpolator.eval(self.config, record=record, **kwargs): 50 yield record
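A hedged usage sketch: the condition below is an interpolated boolean expression evaluated against each record, and the "updated_at" field and sample data are illustrative assumptions.

record_filter = RecordFilter(
    parameters={},
    config={},
    condition="{{ record['updated_at'] > stream_state.get('updated_at', '') }}",
)
kept = list(
    record_filter.filter_records(
        records=[
            {"id": 1, "updated_at": "2024-02-01"},
            {"id": 2, "updated_at": "2023-01-01"},
        ],
        stream_state={"updated_at": "2024-01-01"},
    )
)
# Only the record updated after the stream state's cursor value is kept.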
24@dataclass 25class RecordSelector(HttpSelector): 26 """ 27 Responsible for translating an HTTP response into a list of records by extracting records from the response and optionally filtering 28 records based on a heuristic. 29 30 Attributes: 31 extractor (RecordExtractor): The record extractor responsible for extracting records from a response 32 schema_normalization (TypeTransformer): The record normalizer responsible for casting record values to stream schema types 33 record_filter (RecordFilter): The record filter responsible for filtering extracted records 34 transformations (List[RecordTransformation]): The transformations to be done on the records 35 """ 36 37 extractor: RecordExtractor 38 config: Config 39 parameters: InitVar[Mapping[str, Any]] 40 schema_normalization: Union[TypeTransformer, DeclarativeTypeTransformer] 41 name: str 42 _name: Union[InterpolatedString, str] = field(init=False, repr=False, default="") 43 record_filter: Optional[RecordFilter] = None 44 transformations: List[RecordTransformation] = field(default_factory=lambda: []) 45 transform_before_filtering: bool = False 46 47 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 48 self._parameters = parameters 49 self._name = ( 50 InterpolatedString(self._name, parameters=parameters) 51 if isinstance(self._name, str) 52 else self._name 53 ) 54 55 @property # type: ignore 56 def name(self) -> str: 57 """ 58 :return: Stream name 59 """ 60 return ( 61 str(self._name.eval(self.config)) 62 if isinstance(self._name, InterpolatedString) 63 else self._name 64 ) 65 66 @name.setter 67 def name(self, value: str) -> None: 68 if not isinstance(value, property): 69 self._name = value 70 71 def select_records( 72 self, 73 response: requests.Response, 74 stream_state: StreamState, 75 records_schema: Mapping[str, Any], 76 stream_slice: Optional[StreamSlice] = None, 77 next_page_token: Optional[Mapping[str, Any]] = None, 78 ) -> Iterable[Record]: 79 """ 80 Selects records from the response 81 :param response: The response to select the records from 82 :param stream_state: The stream state 83 :param records_schema: json schema of records to return 84 :param stream_slice: The stream slice 85 :param next_page_token: The paginator token 86 :return: List of Records selected from the response 87 """ 88 all_data: Iterable[Mapping[str, Any]] = self.extractor.extract_records(response) 89 yield from self.filter_and_transform( 90 all_data, stream_state, records_schema, stream_slice, next_page_token 91 ) 92 93 def filter_and_transform( 94 self, 95 all_data: Iterable[Mapping[str, Any]], 96 stream_state: StreamState, 97 records_schema: Mapping[str, Any], 98 stream_slice: Optional[StreamSlice] = None, 99 next_page_token: Optional[Mapping[str, Any]] = None, 100 ) -> Iterable[Record]: 101 """ 102 There is an issue with the selector as of 2024-08-30: it does technology-agnostic processing like filtering, transformation and 103 normalization with an API that is technology-specific (as requests.Response is only for HTTP communication using the requests 104 library). 105 106 Until we decide to move this logic away from the selector, we made this method public so that users like AsyncJobRetriever could 107 share the logic of doing transformations on a set of records. 
108 """ 109 if self.transform_before_filtering: 110 transformed_data = self._transform(all_data, stream_state, stream_slice) 111 transformed_filtered_data = self._filter( 112 transformed_data, stream_state, stream_slice, next_page_token 113 ) 114 else: 115 filtered_data = self._filter(all_data, stream_state, stream_slice, next_page_token) 116 transformed_filtered_data = self._transform(filtered_data, stream_state, stream_slice) 117 normalized_data = self._normalize_by_schema( 118 transformed_filtered_data, schema=records_schema 119 ) 120 for data in normalized_data: 121 yield Record(data=data, stream_name=self.name, associated_slice=stream_slice) 122 123 def _normalize_by_schema( 124 self, records: Iterable[Mapping[str, Any]], schema: Optional[Mapping[str, Any]] 125 ) -> Iterable[Mapping[str, Any]]: 126 if schema: 127 # record has type Mapping[str, Any], but dict[str, Any] expected 128 for record in records: 129 normalized_record = dict(record) 130 self.schema_normalization.transform(normalized_record, schema) 131 yield normalized_record 132 else: 133 yield from records 134 135 def _filter( 136 self, 137 records: Iterable[Mapping[str, Any]], 138 stream_state: StreamState, 139 stream_slice: Optional[StreamSlice], 140 next_page_token: Optional[Mapping[str, Any]], 141 ) -> Iterable[Mapping[str, Any]]: 142 if self.record_filter: 143 yield from self.record_filter.filter_records( 144 records, 145 stream_state=stream_state, 146 stream_slice=stream_slice, 147 next_page_token=next_page_token, 148 ) 149 else: 150 yield from records 151 152 def _transform( 153 self, 154 records: Iterable[Mapping[str, Any]], 155 stream_state: StreamState, 156 stream_slice: Optional[StreamSlice] = None, 157 ) -> Iterable[Mapping[str, Any]]: 158 for record in records: 159 for transformation in self.transformations: 160 transformation.transform( 161 record, # type: ignore # record has type Mapping[str, Any], but Dict[str, Any] expected 162 config=self.config, 163 stream_state=stream_state, 164 stream_slice=stream_slice, 165 ) 166 yield record
Responsible for translating an HTTP response into a list of records by extracting records from the response and optionally filtering records based on a heuristic.
Attributes:
- extractor (RecordExtractor): The record extractor responsible for extracting records from a response
- schema_normalization (TypeTransformer): The record normalizer responsible for casting record values to stream schema types
- record_filter (RecordFilter): The record filter responsible for filtering extracted records
- transformations (List[RecordTransformation]): The transformations to be done on the records
55 @property # type: ignore 56 def name(self) -> str: 57 """ 58 :return: Stream name 59 """ 60 return ( 61 str(self._name.eval(self.config)) 62 if isinstance(self._name, InterpolatedString) 63 else self._name 64 )
Returns
Stream name
71 def select_records( 72 self, 73 response: requests.Response, 74 stream_state: StreamState, 75 records_schema: Mapping[str, Any], 76 stream_slice: Optional[StreamSlice] = None, 77 next_page_token: Optional[Mapping[str, Any]] = None, 78 ) -> Iterable[Record]: 79 """ 80 Selects records from the response 81 :param response: The response to select the records from 82 :param stream_state: The stream state 83 :param records_schema: json schema of records to return 84 :param stream_slice: The stream slice 85 :param next_page_token: The paginator token 86 :return: List of Records selected from the response 87 """ 88 all_data: Iterable[Mapping[str, Any]] = self.extractor.extract_records(response) 89 yield from self.filter_and_transform( 90 all_data, stream_state, records_schema, stream_slice, next_page_token 91 )
Selects records from the response
Parameters
- response: The response to select the records from
- stream_state: The stream state
- records_schema: json schema of records to return
- stream_slice: The stream slice
- next_page_token: The paginator token
Returns
List of Records selected from the response
93 def filter_and_transform( 94 self, 95 all_data: Iterable[Mapping[str, Any]], 96 stream_state: StreamState, 97 records_schema: Mapping[str, Any], 98 stream_slice: Optional[StreamSlice] = None, 99 next_page_token: Optional[Mapping[str, Any]] = None, 100 ) -> Iterable[Record]: 101 """ 102 There is an issue with the selector as of 2024-08-30: it does technology-agnostic processing like filtering, transformation and 103 normalization with an API that is technology-specific (as requests.Response is only for HTTP communication using the requests 104 library). 105 106 Until we decide to move this logic away from the selector, we made this method public so that users like AsyncJobRetriever could 107 share the logic of doing transformations on a set of records. 108 """ 109 if self.transform_before_filtering: 110 transformed_data = self._transform(all_data, stream_state, stream_slice) 111 transformed_filtered_data = self._filter( 112 transformed_data, stream_state, stream_slice, next_page_token 113 ) 114 else: 115 filtered_data = self._filter(all_data, stream_state, stream_slice, next_page_token) 116 transformed_filtered_data = self._transform(filtered_data, stream_state, stream_slice) 117 normalized_data = self._normalize_by_schema( 118 transformed_filtered_data, schema=records_schema 119 ) 120 for data in normalized_data: 121 yield Record(data=data, stream_name=self.name, associated_slice=stream_slice)
There is an issue with the selector as of 2024-08-30: it does technology-agnostic processing like filtering, transformation and normalization with an API that is technology-specific (as requests.Response is only for HTTP communication using the requests library).
Until we decide to move this logic away from the selector, we made this method public so that users like AsyncJobRetriever could share the logic of doing transformations on a set of records.
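The sketch below wires a RecordSelector by hand and reuses filter_and_transform on an in-memory list, which is exactly the reuse case described above. The field path, stream name, and sample data are assumptions, and in practice the declarative factory builds the selector from the manifest; TypeTransformer/TransformConfig are the CDK's schema normalization utilities.

from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordSelector
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer

selector = RecordSelector(
    extractor=DpathExtractor(field_path=["results"], config={}, parameters={}),
    config={},
    parameters={},
    schema_normalization=TypeTransformer(TransformConfig.NoTransform),
    name="users",
)

# filter_and_transform accepts already-extracted mappings, so it can be shared
# by callers that do not go through an HTTP response at all.
records = list(
    selector.filter_and_transform(
        all_data=[{"id": 1}, {"id": 2}],
        stream_state={},
        records_schema={},
    )
)
# -> Record objects with stream_name "users"; an empty schema skips normalization.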
13@dataclass 14class RecordTransformation: 15 """ 16 Implementations of this class define transformations that can be applied to records of a stream. 17 """ 18 19 @abstractmethod 20 def transform( 21 self, 22 record: Dict[str, Any], 23 config: Optional[Config] = None, 24 stream_state: Optional[StreamState] = None, 25 stream_slice: Optional[StreamSlice] = None, 26 ) -> None: 27 """ 28 Transform a record by adding, deleting, or mutating fields directly from the record reference passed in argument. 29 30 :param record: The input record to be transformed 31 :param config: The user-provided configuration as specified by the source's spec 32 :param stream_state: The stream state 33 :param stream_slice: The stream slice 34 :return: The transformed record 35 """ 36 37 def __eq__(self, other: object) -> bool: 38 return other.__dict__ == self.__dict__
Implementations of this class define transformations that can be applied to records of a stream.
19 @abstractmethod 20 def transform( 21 self, 22 record: Dict[str, Any], 23 config: Optional[Config] = None, 24 stream_state: Optional[StreamState] = None, 25 stream_slice: Optional[StreamSlice] = None, 26 ) -> None: 27 """ 28 Transform a record by adding, deleting, or mutating fields directly from the record reference passed in argument. 29 30 :param record: The input record to be transformed 31 :param config: The user-provided configuration as specified by the source's spec 32 :param stream_state: The stream state 33 :param stream_slice: The stream slice 34 :return: The transformed record 35 """
Transform a record by adding, deleting, or mutating fields directly from the record reference passed in argument.
Parameters
- record: The input record to be transformed
- config: The user-provided configuration as specified by the source's spec
- stream_state: The stream state
- stream_slice: The stream slice
Returns
The transformed record
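As a hedged sketch, a custom transformation simply mutates the record in place and returns nothing; the "partition" field name is an illustrative choice, and type annotations are omitted for brevity.

from dataclasses import dataclass


@dataclass
class AddPartitionToRecord(RecordTransformation):
    """Illustrative transformation that copies the slice's partition onto each record."""

    def transform(self, record, config=None, stream_state=None, stream_slice=None) -> None:
        # Mutate the record in place; callers keep iterating over the same object.
        record["partition"] = dict(stream_slice.partition) if stream_slice else {}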
25@dataclass 26class RequestOption: 27 """ 28 Describes an option to set on a request 29 30 Attributes: 31 field_name (str): Describes the name of the parameter to inject. Mutually exclusive with field_path. 32 field_path (list(str)): Describes the path to a nested field as a list of field names. 33 Only valid for body_json injection type, and mutually exclusive with field_name. 34 inject_into (RequestOptionType): Describes where in the HTTP request to inject the parameter 35 """ 36 37 inject_into: RequestOptionType 38 parameters: InitVar[Mapping[str, Any]] 39 field_name: Optional[Union[InterpolatedString, str]] = None 40 field_path: Optional[List[Union[InterpolatedString, str]]] = None 41 42 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 43 # Validate inputs. We should expect either field_name or field_path, but not both 44 if self.field_name is None and self.field_path is None: 45 raise ValueError("RequestOption requires either a field_name or field_path") 46 47 if self.field_name is not None and self.field_path is not None: 48 raise ValueError( 49 "Only one of field_name or field_path can be provided to RequestOption" 50 ) 51 52 # Nested field injection is only supported for body JSON injection 53 if self.field_path is not None and self.inject_into != RequestOptionType.body_json: 54 raise ValueError( 55 "Nested field injection is only supported for body JSON injection. Please use a top-level field_name for other injection types." 56 ) 57 58 # Convert field_name and field_path into InterpolatedString objects if they are strings 59 if self.field_name is not None: 60 self.field_name = InterpolatedString.create(self.field_name, parameters=parameters) 61 elif self.field_path is not None: 62 self.field_path = [ 63 InterpolatedString.create(segment, parameters=parameters) 64 for segment in self.field_path 65 ] 66 67 @property 68 def _is_field_path(self) -> bool: 69 """Returns whether this option is a field path (ie, a nested field)""" 70 return self.field_path is not None 71 72 def inject_into_request( 73 self, 74 target: MutableMapping[str, Any], 75 value: Any, 76 config: Config, 77 ) -> None: 78 """ 79 Inject a request option value into a target request structure using either field_name or field_path. 80 For non-body-json injection, only top-level field names are supported. 81 For body-json injection, both field names and nested field paths are supported. 82 83 Args: 84 target: The request structure to inject the value into 85 value: The value to inject 86 config: The config object to use for interpolation 87 """ 88 if self._is_field_path: 89 if self.inject_into != RequestOptionType.body_json: 90 raise ValueError( 91 "Nested field injection is only supported for body JSON injection. Please use a top-level field_name for other injection types." 
92 ) 93 94 assert self.field_path is not None # for type checker 95 current = target 96 # Convert path segments into strings, evaluating any interpolated segments 97 # Example: ["data", "{{ config[user_type] }}", "id"] -> ["data", "admin", "id"] 98 *path_parts, final_key = [ 99 str( 100 segment.eval(config=config) 101 if isinstance(segment, InterpolatedString) 102 else segment 103 ) 104 for segment in self.field_path 105 ] 106 107 # Build a nested dictionary structure and set the final value at the deepest level 108 for part in path_parts: 109 current = current.setdefault(part, {}) 110 current[final_key] = value 111 else: 112 # For non-nested fields, evaluate the field name if it's an interpolated string 113 key = ( 114 self.field_name.eval(config=config) 115 if isinstance(self.field_name, InterpolatedString) 116 else self.field_name 117 ) 118 target[str(key)] = value
Describes an option to set on a request
Attributes:
- field_name (str): Describes the name of the parameter to inject. Mutually exclusive with field_path.
- field_path (list(str)): Describes the path to a nested field as a list of field names. Only valid for body_json injection type, and mutually exclusive with field_name.
- inject_into (RequestOptionType): Describes where in the HTTP request to inject the parameter
72 def inject_into_request( 73 self, 74 target: MutableMapping[str, Any], 75 value: Any, 76 config: Config, 77 ) -> None: 78 """ 79 Inject a request option value into a target request structure using either field_name or field_path. 80 For non-body-json injection, only top-level field names are supported. 81 For body-json injection, both field names and nested field paths are supported. 82 83 Args: 84 target: The request structure to inject the value into 85 value: The value to inject 86 config: The config object to use for interpolation 87 """ 88 if self._is_field_path: 89 if self.inject_into != RequestOptionType.body_json: 90 raise ValueError( 91 "Nested field injection is only supported for body JSON injection. Please use a top-level field_name for other injection types." 92 ) 93 94 assert self.field_path is not None # for type checker 95 current = target 96 # Convert path segments into strings, evaluating any interpolated segments 97 # Example: ["data", "{{ config[user_type] }}", "id"] -> ["data", "admin", "id"] 98 *path_parts, final_key = [ 99 str( 100 segment.eval(config=config) 101 if isinstance(segment, InterpolatedString) 102 else segment 103 ) 104 for segment in self.field_path 105 ] 106 107 # Build a nested dictionary structure and set the final value at the deepest level 108 for part in path_parts: 109 current = current.setdefault(part, {}) 110 current[final_key] = value 111 else: 112 # For non-nested fields, evaluate the field name if it's an interpolated string 113 key = ( 114 self.field_name.eval(config=config) 115 if isinstance(self.field_name, InterpolatedString) 116 else self.field_name 117 ) 118 target[str(key)] = value
Inject a request option value into a target request structure using either field_name or field_path. For non-body-json injection, only top-level field names are supported. For body-json injection, both field names and nested field paths are supported.
Arguments:
- target: The request structure to inject the value into
- value: The value to inject
- config: The config object to use for interpolation
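Two small usage sketches based on the validation rules above; the parameter name and the nested path are illustrative.

# Top-level field injected into the query string.
page_param = RequestOption(
    inject_into=RequestOptionType.request_parameter,
    field_name="page_size",
    parameters={},
)
params: dict = {}
page_param.inject_into_request(params, 100, config={})
# params == {"page_size": 100}

# Nested field path, which is only valid for body_json injection.
nested_option = RequestOption(
    inject_into=RequestOptionType.body_json,
    field_path=["filters", "updated_at", "gte"],
    parameters={},
)
body: dict = {}
nested_option.inject_into_request(body, "2024-01-01", config={})
# body == {"filters": {"updated_at": {"gte": "2024-01-01"}}}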
14class RequestOptionType(Enum): 15 """ 16 Describes where to set a value on a request 17 """ 18 19 request_parameter = "request_parameter" 20 header = "header" 21 body_data = "body_data" 22 body_json = "body_json"
Describes where to set a value on a request
30class Requester(RequestOptionsProvider): 31 @abstractmethod 32 def get_authenticator(self) -> DeclarativeAuthenticator: 33 """ 34 Specifies the authenticator to use when submitting requests 35 """ 36 pass 37 38 @abstractmethod 39 def get_url_base( 40 self, 41 *, 42 stream_state: Optional[StreamState], 43 stream_slice: Optional[StreamSlice], 44 next_page_token: Optional[Mapping[str, Any]], 45 ) -> str: 46 """ 47 :return: URL base for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/" 48 """ 49 50 @abstractmethod 51 def get_path( 52 self, 53 *, 54 stream_state: Optional[StreamState], 55 stream_slice: Optional[StreamSlice], 56 next_page_token: Optional[Mapping[str, Any]], 57 ) -> str: 58 """ 59 Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity" 60 """ 61 62 @abstractmethod 63 def get_method(self) -> HttpMethod: 64 """ 65 Specifies the HTTP method to use 66 """ 67 68 @abstractmethod 69 def get_request_params( 70 self, 71 *, 72 stream_state: Optional[StreamState] = None, 73 stream_slice: Optional[StreamSlice] = None, 74 next_page_token: Optional[Mapping[str, Any]] = None, 75 ) -> MutableMapping[str, Any]: 76 """ 77 Specifies the query parameters that should be set on an outgoing HTTP request given the inputs. 78 79 E.g: you might want to define query parameters for paging if next_page_token is not None. 80 """ 81 82 @abstractmethod 83 def get_request_headers( 84 self, 85 *, 86 stream_state: Optional[StreamState] = None, 87 stream_slice: Optional[StreamSlice] = None, 88 next_page_token: Optional[Mapping[str, Any]] = None, 89 ) -> Mapping[str, Any]: 90 """ 91 Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method. 92 """ 93 94 @abstractmethod 95 def get_request_body_data( 96 self, 97 *, 98 stream_state: Optional[StreamState] = None, 99 stream_slice: Optional[StreamSlice] = None, 100 next_page_token: Optional[Mapping[str, Any]] = None, 101 ) -> Union[Mapping[str, Any], str]: 102 """ 103 Specifies how to populate the body of the request with a non-JSON payload. 104 105 If returns a ready text that it will be sent as is. 106 If returns a dict that it will be converted to a urlencoded form. 107 E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" 108 109 At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 110 """ 111 112 @abstractmethod 113 def get_request_body_json( 114 self, 115 *, 116 stream_state: Optional[StreamState] = None, 117 stream_slice: Optional[StreamSlice] = None, 118 next_page_token: Optional[Mapping[str, Any]] = None, 119 ) -> Mapping[str, Any]: 120 """ 121 Specifies how to populate the body of the request with a JSON payload. 122 123 At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 
124 """ 125 126 @abstractmethod 127 def send_request( 128 self, 129 stream_state: Optional[StreamState] = None, 130 stream_slice: Optional[StreamSlice] = None, 131 next_page_token: Optional[Mapping[str, Any]] = None, 132 path: Optional[str] = None, 133 request_headers: Optional[Mapping[str, Any]] = None, 134 request_params: Optional[Mapping[str, Any]] = None, 135 request_body_data: Optional[Union[Mapping[str, Any], str]] = None, 136 request_body_json: Optional[Mapping[str, Any]] = None, 137 log_formatter: Optional[Callable[[requests.Response], Any]] = None, 138 ) -> Optional[requests.Response]: 139 """ 140 Sends a request and returns the response. Might return no response if the error handler chooses to ignore the response or throw an exception in case of an error. 141 If path is set, the path configured on the requester itself is ignored. 142 If header, params and body are set, they are merged with the ones configured on the requester itself. 143 144 If a log formatter is provided, it's used to log the performed request and response. If it's not provided, no logging is performed. 145 """
Defines the request options to set on an outgoing HTTP request
Options can be passed by
- request parameter
- request headers
- body data
- json content
31 @abstractmethod 32 def get_authenticator(self) -> DeclarativeAuthenticator: 33 """ 34 Specifies the authenticator to use when submitting requests 35 """ 36 pass
Specifies the authenticator to use when submitting requests
38 @abstractmethod 39 def get_url_base( 40 self, 41 *, 42 stream_state: Optional[StreamState], 43 stream_slice: Optional[StreamSlice], 44 next_page_token: Optional[Mapping[str, Any]], 45 ) -> str: 46 """ 47 :return: URL base for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/" 48 """
Returns
URL base for the API endpoint, e.g. if you want to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/"
50 @abstractmethod 51 def get_path( 52 self, 53 *, 54 stream_state: Optional[StreamState], 55 stream_slice: Optional[StreamSlice], 56 next_page_token: Optional[Mapping[str, Any]], 57 ) -> str: 58 """ 59 Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity" 60 """
Returns the URL path for the API endpoint, e.g. if you want to hit https://myapi.com/v1/some_entity then this should return "some_entity"
62 @abstractmethod 63 def get_method(self) -> HttpMethod: 64 """ 65 Specifies the HTTP method to use 66 """
Specifies the HTTP method to use
68 @abstractmethod 69 def get_request_params( 70 self, 71 *, 72 stream_state: Optional[StreamState] = None, 73 stream_slice: Optional[StreamSlice] = None, 74 next_page_token: Optional[Mapping[str, Any]] = None, 75 ) -> MutableMapping[str, Any]: 76 """ 77 Specifies the query parameters that should be set on an outgoing HTTP request given the inputs. 78 79 E.g: you might want to define query parameters for paging if next_page_token is not None. 80 """
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
For example, you might want to define query parameters for paging if next_page_token is not None.
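A hedged sketch of what a concrete implementation might do; the "per_page" default and the shape of next_page_token are assumptions about a specific API and paginator, not CDK defaults.

def get_request_params(self, *, stream_state=None, stream_slice=None, next_page_token=None):
    params = {"per_page": 100}
    if next_page_token:
        # Merge whatever paging parameters the paginator produced, e.g. {"page": 3}.
        params.update(next_page_token)
    return params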
82 @abstractmethod 83 def get_request_headers( 84 self, 85 *, 86 stream_state: Optional[StreamState] = None, 87 stream_slice: Optional[StreamSlice] = None, 88 next_page_token: Optional[Mapping[str, Any]] = None, 89 ) -> Mapping[str, Any]: 90 """ 91 Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method. 92 """
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
94 @abstractmethod 95 def get_request_body_data( 96 self, 97 *, 98 stream_state: Optional[StreamState] = None, 99 stream_slice: Optional[StreamSlice] = None, 100 next_page_token: Optional[Mapping[str, Any]] = None, 101 ) -> Union[Mapping[str, Any], str]: 102 """ 103 Specifies how to populate the body of the request with a non-JSON payload. 104 105 If returns a ready text that it will be sent as is. 106 If returns a dict that it will be converted to a urlencoded form. 107 E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" 108 109 At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 110 """
Specifies how to populate the body of the request with a non-JSON payload.
If it returns a string, the string is sent as-is. If it returns a dict, it is converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
Only one of the 'request_body_data' and 'request_body_json' functions may be overridden at a time.
112 @abstractmethod 113 def get_request_body_json( 114 self, 115 *, 116 stream_state: Optional[StreamState] = None, 117 stream_slice: Optional[StreamSlice] = None, 118 next_page_token: Optional[Mapping[str, Any]] = None, 119 ) -> Mapping[str, Any]: 120 """ 121 Specifies how to populate the body of the request with a JSON payload. 122 123 At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 124 """
Specifies how to populate the body of the request with a JSON payload.
Only one of the 'request_body_data' and 'request_body_json' functions may be overridden at a time.
126 @abstractmethod 127 def send_request( 128 self, 129 stream_state: Optional[StreamState] = None, 130 stream_slice: Optional[StreamSlice] = None, 131 next_page_token: Optional[Mapping[str, Any]] = None, 132 path: Optional[str] = None, 133 request_headers: Optional[Mapping[str, Any]] = None, 134 request_params: Optional[Mapping[str, Any]] = None, 135 request_body_data: Optional[Union[Mapping[str, Any], str]] = None, 136 request_body_json: Optional[Mapping[str, Any]] = None, 137 log_formatter: Optional[Callable[[requests.Response], Any]] = None, 138 ) -> Optional[requests.Response]: 139 """ 140 Sends a request and returns the response. Might return no response if the error handler chooses to ignore the response or throw an exception in case of an error. 141 If path is set, the path configured on the requester itself is ignored. 142 If header, params and body are set, they are merged with the ones configured on the requester itself. 143 144 If a log formatter is provided, it's used to log the performed request and response. If it's not provided, no logging is performed. 145 """
Sends a request and returns the response. Might return no response if the error handler chooses to ignore the response or throw an exception in case of an error. If path is set, the path configured on the requester itself is ignored. If header, params and body are set, they are merged with the ones configured on the requester itself.
If a log formatter is provided, it's used to log the performed request and response. If it's not provided, no logging is performed.
51@dataclass 52class SimpleRetriever(Retriever): 53 """ 54 Retrieves records by synchronously sending requests to fetch records. 55 56 The retriever acts as an orchestrator between the requester, the record selector, the paginator, and the stream slicer. 57 58 For each stream slice, submit requests until there are no more pages of records to fetch. 59 60 This retriever currently inherits from HttpStream to reuse the request submission and pagination machinery. 61 As a result, some of the parameters passed to some methods are unused. 62 The two will be decoupled in a future release. 63 64 Attributes: 65 stream_name (str): The stream's name 66 stream_primary_key (Optional[Union[str, List[str], List[List[str]]]]): The stream's primary key 67 requester (Requester): The HTTP requester 68 record_selector (HttpSelector): The record selector 69 paginator (Optional[Paginator]): The paginator 70 stream_slicer (Optional[StreamSlicer]): The stream slicer 71 cursor (Optional[cursor]): The cursor 72 parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation 73 """ 74 75 requester: Requester 76 record_selector: HttpSelector 77 config: Config 78 parameters: InitVar[Mapping[str, Any]] 79 name: str 80 _name: Union[InterpolatedString, str] = field(init=False, repr=False, default="") 81 primary_key: Optional[Union[str, List[str], List[List[str]]]] 82 _primary_key: str = field(init=False, repr=False, default="") 83 paginator: Optional[Paginator] = None 84 stream_slicer: StreamSlicer = field( 85 default_factory=lambda: SinglePartitionRouter(parameters={}) 86 ) 87 request_option_provider: RequestOptionsProvider = field( 88 default_factory=lambda: DefaultRequestOptionsProvider(parameters={}) 89 ) 90 cursor: Optional[DeclarativeCursor] = None 91 ignore_stream_slicer_parameters_on_paginated_requests: bool = False 92 93 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 94 self._paginator = self.paginator or NoPagination(parameters=parameters) 95 self._parameters = parameters 96 self._name = ( 97 InterpolatedString(self._name, parameters=parameters) 98 if isinstance(self._name, str) 99 else self._name 100 ) 101 102 @property # type: ignore 103 def name(self) -> str: 104 """ 105 :return: Stream name 106 """ 107 return ( 108 str(self._name.eval(self.config)) 109 if isinstance(self._name, InterpolatedString) 110 else self._name 111 ) 112 113 @name.setter 114 def name(self, value: str) -> None: 115 if not isinstance(value, property): 116 self._name = value 117 118 def _get_mapping( 119 self, method: Callable[..., Optional[Union[Mapping[str, Any], str]]], **kwargs: Any 120 ) -> Tuple[Union[Mapping[str, Any], str], Set[str]]: 121 """ 122 Get mapping from the provided method, and get the keys of the mapping. 123 If the method returns a string, it will return the string and an empty set. 124 If the method returns a dict, it will return the dict and its keys. 125 """ 126 mapping = method(**kwargs) or {} 127 keys = set(mapping.keys()) if not isinstance(mapping, str) else set() 128 return mapping, keys 129 130 def _get_request_options( 131 self, 132 stream_state: Optional[StreamData], 133 stream_slice: Optional[StreamSlice], 134 next_page_token: Optional[Mapping[str, Any]], 135 paginator_method: Callable[..., Optional[Union[Mapping[str, Any], str]]], 136 stream_slicer_method: Callable[..., Optional[Union[Mapping[str, Any], str]]], 137 ) -> Union[Mapping[str, Any], str]: 138 """ 139 Get the request_option from the paginator and the stream slicer. 
140 Raise a ValueError if there's a key collision 141 Returned merged mapping otherwise 142 """ 143 # FIXME we should eventually remove the usage of stream_state as part of the interpolation 144 145 is_body_json = paginator_method.__name__ == "get_request_body_json" 146 147 mappings = [ 148 paginator_method( 149 stream_slice=stream_slice, 150 next_page_token=next_page_token, 151 ), 152 ] 153 if not next_page_token or not self.ignore_stream_slicer_parameters_on_paginated_requests: 154 mappings.append( 155 stream_slicer_method( 156 stream_slice=stream_slice, 157 next_page_token=next_page_token, 158 ) 159 ) 160 return combine_mappings(mappings, allow_same_value_merge=is_body_json) 161 162 def _request_headers( 163 self, 164 stream_state: Optional[StreamData] = None, 165 stream_slice: Optional[StreamSlice] = None, 166 next_page_token: Optional[Mapping[str, Any]] = None, 167 ) -> Mapping[str, Any]: 168 """ 169 Specifies request headers. 170 Authentication headers will overwrite any overlapping headers returned from this method. 171 """ 172 headers = self._get_request_options( 173 stream_state, 174 stream_slice, 175 next_page_token, 176 self._paginator.get_request_headers, 177 self.request_option_provider.get_request_headers, 178 ) 179 if isinstance(headers, str): 180 raise ValueError("Request headers cannot be a string") 181 return {str(k): str(v) for k, v in headers.items()} 182 183 def _request_params( 184 self, 185 stream_state: Optional[StreamData] = None, 186 stream_slice: Optional[StreamSlice] = None, 187 next_page_token: Optional[Mapping[str, Any]] = None, 188 ) -> Mapping[str, Any]: 189 """ 190 Specifies the query parameters that should be set on an outgoing HTTP request given the inputs. 191 192 E.g: you might want to define query parameters for paging if next_page_token is not None. 193 """ 194 params = self._get_request_options( 195 stream_state, 196 stream_slice, 197 next_page_token, 198 self._paginator.get_request_params, 199 self.request_option_provider.get_request_params, 200 ) 201 if isinstance(params, str): 202 raise ValueError("Request params cannot be a string") 203 return params 204 205 def _request_body_data( 206 self, 207 stream_state: Optional[StreamData] = None, 208 stream_slice: Optional[StreamSlice] = None, 209 next_page_token: Optional[Mapping[str, Any]] = None, 210 ) -> Union[Mapping[str, Any], str]: 211 """ 212 Specifies how to populate the body of the request with a non-JSON payload. 213 214 If returns a ready text that it will be sent as is. 215 If returns a dict that it will be converted to a urlencoded form. 216 E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" 217 218 At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 219 """ 220 return self._get_request_options( 221 stream_state, 222 stream_slice, 223 next_page_token, 224 self._paginator.get_request_body_data, 225 self.request_option_provider.get_request_body_data, 226 ) 227 228 def _request_body_json( 229 self, 230 stream_state: Optional[StreamData] = None, 231 stream_slice: Optional[StreamSlice] = None, 232 next_page_token: Optional[Mapping[str, Any]] = None, 233 ) -> Optional[Mapping[str, Any]]: 234 """ 235 Specifies how to populate the body of the request with a JSON payload. 236 237 At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 
238 """ 239 body_json = self._get_request_options( 240 stream_state, 241 stream_slice, 242 next_page_token, 243 self._paginator.get_request_body_json, 244 self.request_option_provider.get_request_body_json, 245 ) 246 if isinstance(body_json, str): 247 raise ValueError("Request body json cannot be a string") 248 return body_json 249 250 def _paginator_path( 251 self, 252 next_page_token: Optional[Mapping[str, Any]] = None, 253 stream_state: Optional[Mapping[str, Any]] = None, 254 stream_slice: Optional[StreamSlice] = None, 255 ) -> Optional[str]: 256 """ 257 If the paginator points to a path, follow it, else return nothing so the requester is used. 258 :param next_page_token: 259 :return: 260 """ 261 return self._paginator.path( 262 next_page_token=next_page_token, 263 stream_state=stream_state, 264 stream_slice=stream_slice, 265 ) 266 267 def _parse_response( 268 self, 269 response: Optional[requests.Response], 270 stream_state: StreamState, 271 records_schema: Mapping[str, Any], 272 stream_slice: Optional[StreamSlice] = None, 273 next_page_token: Optional[Mapping[str, Any]] = None, 274 ) -> Iterable[Record]: 275 if not response: 276 yield from [] 277 else: 278 yield from self.record_selector.select_records( 279 response=response, 280 stream_state=stream_state, 281 records_schema=records_schema, 282 stream_slice=stream_slice, 283 next_page_token=next_page_token, 284 ) 285 286 @property # type: ignore 287 def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: 288 """The stream's primary key""" 289 return self._primary_key 290 291 @primary_key.setter 292 def primary_key(self, value: str) -> None: 293 if not isinstance(value, property): 294 self._primary_key = value 295 296 def _next_page_token( 297 self, 298 response: requests.Response, 299 last_page_size: int, 300 last_record: Optional[Record], 301 last_page_token_value: Optional[Any], 302 ) -> Optional[Mapping[str, Any]]: 303 """ 304 Specifies a pagination strategy. 305 306 The value returned from this method is passed to most other methods in this class. Use it to form a request e.g: set headers or query params. 307 308 :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response. 
309 """ 310 return self._paginator.next_page_token( 311 response=response, 312 last_page_size=last_page_size, 313 last_record=last_record, 314 last_page_token_value=last_page_token_value, 315 ) 316 317 def _fetch_next_page( 318 self, 319 stream_state: Mapping[str, Any], 320 stream_slice: StreamSlice, 321 next_page_token: Optional[Mapping[str, Any]] = None, 322 ) -> Optional[requests.Response]: 323 return self.requester.send_request( 324 path=self._paginator_path( 325 next_page_token=next_page_token, 326 stream_state=stream_state, 327 stream_slice=stream_slice, 328 ), 329 stream_state=stream_state, 330 stream_slice=stream_slice, 331 next_page_token=next_page_token, 332 request_headers=self._request_headers( 333 stream_state=stream_state, 334 stream_slice=stream_slice, 335 next_page_token=next_page_token, 336 ), 337 request_params=self._request_params( 338 stream_state=stream_state, 339 stream_slice=stream_slice, 340 next_page_token=next_page_token, 341 ), 342 request_body_data=self._request_body_data( 343 stream_state=stream_state, 344 stream_slice=stream_slice, 345 next_page_token=next_page_token, 346 ), 347 request_body_json=self._request_body_json( 348 stream_state=stream_state, 349 stream_slice=stream_slice, 350 next_page_token=next_page_token, 351 ), 352 ) 353 354 # This logic is similar to _read_pages in the HttpStream class. When making changes here, consider making changes there as well. 355 def _read_pages( 356 self, 357 records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], 358 stream_state: Mapping[str, Any], 359 stream_slice: StreamSlice, 360 ) -> Iterable[Record]: 361 pagination_complete = False 362 initial_token = self._paginator.get_initial_token() 363 next_page_token: Optional[Mapping[str, Any]] = ( 364 {"next_page_token": initial_token} if initial_token else None 365 ) 366 while not pagination_complete: 367 response = self._fetch_next_page(stream_state, stream_slice, next_page_token) 368 369 last_page_size = 0 370 last_record: Optional[Record] = None 371 for record in records_generator_fn(response): 372 last_page_size += 1 373 last_record = record 374 yield record 375 376 if not response: 377 pagination_complete = True 378 else: 379 last_page_token_value = ( 380 next_page_token.get("next_page_token") if next_page_token else None 381 ) 382 next_page_token = self._next_page_token( 383 response=response, 384 last_page_size=last_page_size, 385 last_record=last_record, 386 last_page_token_value=last_page_token_value, 387 ) 388 if not next_page_token: 389 pagination_complete = True 390 391 # Always return an empty generator just in case no records were ever yielded 392 yield from [] 393 394 def _read_single_page( 395 self, 396 records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], 397 stream_state: Mapping[str, Any], 398 stream_slice: StreamSlice, 399 ) -> Iterable[StreamData]: 400 initial_token = stream_state.get("next_page_token") 401 if initial_token is None: 402 initial_token = self._paginator.get_initial_token() 403 next_page_token: Optional[Mapping[str, Any]] = ( 404 {"next_page_token": initial_token} if initial_token else None 405 ) 406 407 response = self._fetch_next_page(stream_state, stream_slice, next_page_token) 408 409 last_page_size = 0 410 last_record: Optional[Record] = None 411 for record in records_generator_fn(response): 412 last_page_size += 1 413 last_record = record 414 yield record 415 416 if not response: 417 next_page_token = {FULL_REFRESH_SYNC_COMPLETE_KEY: True} 418 else: 419 last_page_token_value = ( 
420 next_page_token.get("next_page_token") if next_page_token else None 421 ) 422 next_page_token = self._next_page_token( 423 response=response, 424 last_page_size=last_page_size, 425 last_record=last_record, 426 last_page_token_value=last_page_token_value, 427 ) or {FULL_REFRESH_SYNC_COMPLETE_KEY: True} 428 429 if self.cursor: 430 self.cursor.close_slice( 431 StreamSlice(cursor_slice=next_page_token, partition=stream_slice.partition) 432 ) 433 434 # Always return an empty generator just in case no records were ever yielded 435 yield from [] 436 437 def read_records( 438 self, 439 records_schema: Mapping[str, Any], 440 stream_slice: Optional[StreamSlice] = None, 441 ) -> Iterable[StreamData]: 442 """ 443 Fetch a stream's records from an HTTP API source 444 445 :param records_schema: json schema to describe record 446 :param stream_slice: The stream slice to read data for 447 :return: The records read from the API source 448 """ 449 _slice = stream_slice or StreamSlice(partition={}, cursor_slice={}) # None-check 450 451 most_recent_record_from_slice = None 452 record_generator = partial( 453 self._parse_records, 454 stream_slice=stream_slice, 455 stream_state=self.state or {}, 456 records_schema=records_schema, 457 ) 458 459 if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor): 460 stream_state = self.state 461 462 # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to 463 # fetch more records. The platform deletes stream state for full refresh streams before starting a 464 # new job, so we don't need to worry about this value existing for the initial attempt 465 if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY): 466 return 467 468 yield from self._read_single_page(record_generator, stream_state, _slice) 469 else: 470 for stream_data in self._read_pages(record_generator, self.state, _slice): 471 current_record = self._extract_record(stream_data, _slice) 472 if self.cursor and current_record: 473 self.cursor.observe(_slice, current_record) 474 475 # Latest record read, not necessarily within slice boundaries. 476 # TODO Remove once all custom components implement `observe` method. 477 # https://github.com/airbytehq/airbyte-internal-issues/issues/6955 478 most_recent_record_from_slice = self._get_most_recent_record( 479 most_recent_record_from_slice, current_record, _slice 480 ) 481 yield stream_data 482 483 if self.cursor: 484 self.cursor.close_slice(_slice, most_recent_record_from_slice) 485 return 486 487 def _get_most_recent_record( 488 self, 489 current_most_recent: Optional[Record], 490 current_record: Optional[Record], 491 stream_slice: StreamSlice, 492 ) -> Optional[Record]: 493 if self.cursor and current_record: 494 if not current_most_recent: 495 return current_record 496 else: 497 return ( 498 current_most_recent 499 if self.cursor.is_greater_than_or_equal(current_most_recent, current_record) 500 else current_record 501 ) 502 else: 503 return None 504 505 def _extract_record( 506 self, stream_data: StreamData, stream_slice: StreamSlice 507 ) -> Optional[Record]: 508 """ 509 As we allow the output of _read_pages to be StreamData, it can be multiple things. Therefore, we need to filter out and normalize 510 to data to streamline the rest of the process. 
511 """ 512 if isinstance(stream_data, Record): 513 # Record is not part of `StreamData` but is the most common implementation of `Mapping[str, Any]` which is part of `StreamData` 514 return stream_data 515 elif isinstance(stream_data, (dict, Mapping)): 516 return Record( 517 data=dict(stream_data), associated_slice=stream_slice, stream_name=self.name 518 ) 519 elif isinstance(stream_data, AirbyteMessage) and stream_data.record: 520 return Record( 521 data=stream_data.record.data, # type:ignore # AirbyteMessage always has record.data 522 associated_slice=stream_slice, 523 stream_name=self.name, 524 ) 525 return None 526 527 # stream_slices is defined with arguments on http stream and fixing this has a long tail of dependencies. Will be resolved by the decoupling of http stream and simple retriever 528 def stream_slices(self) -> Iterable[Optional[StreamSlice]]: # type: ignore 529 """ 530 Specifies the slices for this stream. See the stream slicing section of the docs for more information. 531 532 :param sync_mode: 533 :param cursor_field: 534 :param stream_state: 535 :return: 536 """ 537 return self.stream_slicer.stream_slices() 538 539 @property 540 def state(self) -> Mapping[str, Any]: 541 return self.cursor.get_stream_state() if self.cursor else {} 542 543 @state.setter 544 def state(self, value: StreamState) -> None: 545 """State setter, accept state serialized by state getter.""" 546 if self.cursor: 547 self.cursor.set_initial_state(value) 548 549 def _parse_records( 550 self, 551 response: Optional[requests.Response], 552 stream_state: Mapping[str, Any], 553 records_schema: Mapping[str, Any], 554 stream_slice: Optional[StreamSlice], 555 ) -> Iterable[Record]: 556 yield from self._parse_response( 557 response, 558 stream_slice=stream_slice, 559 stream_state=stream_state, 560 records_schema=records_schema, 561 ) 562 563 def must_deduplicate_query_params(self) -> bool: 564 return True 565 566 @staticmethod 567 def _to_partition_key(to_serialize: Any) -> str: 568 # separators have changed in Python 3.4. To avoid being impacted by further change, we explicitly specify our own value 569 return json.dumps(to_serialize, indent=None, separators=(",", ":"), sort_keys=True)
Retrieves records by synchronously sending requests to fetch records.
The retriever acts as an orchestrator between the requester, the record selector, the paginator, and the stream slicer.
For each stream slice, submit requests until there are no more pages of records to fetch.
This retriever currently inherits from HttpStream to reuse the request submission and pagination machinery. As a result, some of the parameters passed to some methods are unused. The two will be decoupled in a future release.
Attributes:
- stream_name (str): The stream's name
- stream_primary_key (Optional[Union[str, List[str], List[List[str]]]]): The stream's primary key
- requester (Requester): The HTTP requester
- record_selector (HttpSelector): The record selector
- paginator (Optional[Paginator]): The paginator
- stream_slicer (Optional[StreamSlicer]): The stream slicer
- cursor (Optional[cursor]): The cursor
- parameters (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation
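A hedged usage sketch: `retriever` stands for an already-assembled SimpleRetriever (the declarative factory normally wires the requester, record selector, and paginator from the manifest), and the empty records_schema is only there to satisfy the signature.

for stream_slice in retriever.stream_slices():
    # For each slice, the retriever pages through the API until the paginator
    # returns no further token, yielding one record at a time.
    for record in retriever.read_records(records_schema={}, stream_slice=stream_slice):
        print(record)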
102 @property # type: ignore 103 def name(self) -> str: 104 """ 105 :return: Stream name 106 """ 107 return ( 108 str(self._name.eval(self.config)) 109 if isinstance(self._name, InterpolatedString) 110 else self._name 111 )
Returns
Stream name
286 @property # type: ignore 287 def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: 288 """The stream's primary key""" 289 return self._primary_key
The stream's primary key
437 def read_records( 438 self, 439 records_schema: Mapping[str, Any], 440 stream_slice: Optional[StreamSlice] = None, 441 ) -> Iterable[StreamData]: 442 """ 443 Fetch a stream's records from an HTTP API source 444 445 :param records_schema: json schema to describe record 446 :param stream_slice: The stream slice to read data for 447 :return: The records read from the API source 448 """ 449 _slice = stream_slice or StreamSlice(partition={}, cursor_slice={}) # None-check 450 451 most_recent_record_from_slice = None 452 record_generator = partial( 453 self._parse_records, 454 stream_slice=stream_slice, 455 stream_state=self.state or {}, 456 records_schema=records_schema, 457 ) 458 459 if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor): 460 stream_state = self.state 461 462 # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to 463 # fetch more records. The platform deletes stream state for full refresh streams before starting a 464 # new job, so we don't need to worry about this value existing for the initial attempt 465 if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY): 466 return 467 468 yield from self._read_single_page(record_generator, stream_state, _slice) 469 else: 470 for stream_data in self._read_pages(record_generator, self.state, _slice): 471 current_record = self._extract_record(stream_data, _slice) 472 if self.cursor and current_record: 473 self.cursor.observe(_slice, current_record) 474 475 # Latest record read, not necessarily within slice boundaries. 476 # TODO Remove once all custom components implement `observe` method. 477 # https://github.com/airbytehq/airbyte-internal-issues/issues/6955 478 most_recent_record_from_slice = self._get_most_recent_record( 479 most_recent_record_from_slice, current_record, _slice 480 ) 481 yield stream_data 482 483 if self.cursor: 484 self.cursor.close_slice(_slice, most_recent_record_from_slice) 485 return
Fetch a stream's records from an HTTP API source
Parameters
- records_schema: json schema to describe record
- stream_slice: The stream slice to read data for
Returns
The records read from the API source
528 def stream_slices(self) -> Iterable[Optional[StreamSlice]]: # type: ignore 529 """ 530 Specifies the slices for this stream. See the stream slicing section of the docs for more information. 531 532 :param sync_mode: 533 :param cursor_field: 534 :param stream_state: 535 :return: 536 """ 537 return self.stream_slicer.stream_slices()
Specifies the slices for this stream. See the stream slicing section of the docs for more information.
Returns
An iterable of stream slices
539 @property 540 def state(self) -> Mapping[str, Any]: 541 return self.cursor.get_stream_state() if self.cursor else {}
State getter; it should return the state in a form that can be serialized to a string and sent to the output as a STATE AirbyteMessage.
A good example of a state is a cursor value: { self.cursor_field: "cursor_value" }
State should be as small as possible while remaining descriptive enough to resume the sync from the point where it stopped.
13@dataclass 14class SinglePartitionRouter(PartitionRouter): 15 """Partition router returning only a stream slice""" 16 17 parameters: InitVar[Mapping[str, Any]] 18 19 def get_request_params( 20 self, 21 stream_state: Optional[StreamState] = None, 22 stream_slice: Optional[StreamSlice] = None, 23 next_page_token: Optional[Mapping[str, Any]] = None, 24 ) -> Mapping[str, Any]: 25 return {} 26 27 def get_request_headers( 28 self, 29 stream_state: Optional[StreamState] = None, 30 stream_slice: Optional[StreamSlice] = None, 31 next_page_token: Optional[Mapping[str, Any]] = None, 32 ) -> Mapping[str, Any]: 33 return {} 34 35 def get_request_body_data( 36 self, 37 stream_state: Optional[StreamState] = None, 38 stream_slice: Optional[StreamSlice] = None, 39 next_page_token: Optional[Mapping[str, Any]] = None, 40 ) -> Mapping[str, Any]: 41 return {} 42 43 def get_request_body_json( 44 self, 45 stream_state: Optional[StreamState] = None, 46 stream_slice: Optional[StreamSlice] = None, 47 next_page_token: Optional[Mapping[str, Any]] = None, 48 ) -> Mapping[str, Any]: 49 return {} 50 51 def stream_slices(self) -> Iterable[StreamSlice]: 52 yield StreamSlice(partition={}, cursor_slice={}) 53 54 def set_initial_state(self, stream_state: StreamState) -> None: 55 """ 56 SinglePartitionRouter doesn't have parent streams 57 """ 58 pass 59 60 def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: 61 """ 62 SinglePartitionRouter doesn't have parent streams 63 """ 64 pass
Partition router that returns a single stream slice
19 def get_request_params( 20 self, 21 stream_state: Optional[StreamState] = None, 22 stream_slice: Optional[StreamSlice] = None, 23 next_page_token: Optional[Mapping[str, Any]] = None, 24 ) -> Mapping[str, Any]: 25 return {}
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
For example, you might want to define query parameters for paging if next_page_token is not None.
27 def get_request_headers( 28 self, 29 stream_state: Optional[StreamState] = None, 30 stream_slice: Optional[StreamSlice] = None, 31 next_page_token: Optional[Mapping[str, Any]] = None, 32 ) -> Mapping[str, Any]: 33 return {}
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
35 def get_request_body_data( 36 self, 37 stream_state: Optional[StreamState] = None, 38 stream_slice: Optional[StreamSlice] = None, 39 next_page_token: Optional[Mapping[str, Any]] = None, 40 ) -> Mapping[str, Any]: 41 return {}
Specifies how to populate the body of the request with a non-JSON payload.
If it returns a string, the string is sent as-is. If it returns a dict, it is converted to a urlencoded form, e.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
Only one of the 'request_body_data' and 'request_body_json' functions may be overridden at a time.
43 def get_request_body_json( 44 self, 45 stream_state: Optional[StreamState] = None, 46 stream_slice: Optional[StreamSlice] = None, 47 next_page_token: Optional[Mapping[str, Any]] = None, 48 ) -> Mapping[str, Any]: 49 return {}
Specifies how to populate the body of the request with a JSON payload.
Only one of the 'request_body_data' and 'request_body_json' functions may be overridden at a time.
51 def stream_slices(self) -> Iterable[StreamSlice]: 52 yield StreamSlice(partition={}, cursor_slice={})
Defines stream slices
Returns
An iterable of stream slices
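A short sketch confirming the behavior described above: the router always yields exactly one empty slice, which keeps the read loop uniform for unpartitioned streams.

router = SinglePartitionRouter(parameters={})
slices = list(router.stream_slices())
# A single slice with an empty partition and an empty cursor slice.
assert len(slices) == 1 and not slices[0].partition and not slices[0].cursor_slice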
42class StopConditionPaginationStrategyDecorator(PaginationStrategy): 43 def __init__(self, _delegate: PaginationStrategy, stop_condition: PaginationStopCondition): 44 self._delegate = _delegate 45 self._stop_condition = stop_condition 46 47 def next_page_token( 48 self, 49 response: requests.Response, 50 last_page_size: int, 51 last_record: Optional[Record], 52 last_page_token_value: Optional[Any] = None, 53 ) -> Optional[Any]: 54 # We evaluate in reverse order because the assumption is that most of the APIs using data feed structure 55 # will return records in descending order. In terms of performance/memory, we return the records lazily 56 if last_record and self._stop_condition.is_met(last_record): 57 return None 58 return self._delegate.next_page_token( 59 response, last_page_size, last_record, last_page_token_value 60 ) 61 62 def get_page_size(self) -> Optional[int]: 63 return self._delegate.get_page_size() 64 65 @property 66 def initial_token(self) -> Optional[Any]: 67 return self._delegate.initial_token
Defines how to get the next page token
47 def next_page_token( 48 self, 49 response: requests.Response, 50 last_page_size: int, 51 last_record: Optional[Record], 52 last_page_token_value: Optional[Any] = None, 53 ) -> Optional[Any]: 54 # We evaluate in reverse order because the assumption is that most of the APIs using data feed structure 55 # will return records in descending order. In terms of performance/memory, we return the records lazily 56 if last_record and self._stop_condition.is_met(last_record): 57 return None 58 return self._delegate.next_page_token( 59 response, last_page_size, last_record, last_page_token_value 60 )
Parameters
- response: response to process
- last_page_size: the number of records read from the response
- last_record: the last record extracted from the response
- last_page_token_value: The current value of the page token made on the last request
Returns
next page token. Returns None if there are no more pages to fetch
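The following sketch uses hypothetical stand-in classes (not CDK types) to illustrate the decorator's short-circuit behavior: the stop condition is consulted first, and only if it is not met is the delegate strategy asked for the next token.

from typing import Any, Optional


class AlreadySyncedStopCondition:
    # Hypothetical stop condition: stop once a record is not newer than the cursor.
    def __init__(self, cursor_value: str) -> None:
        self._cursor_value = cursor_value

    def is_met(self, record: dict) -> bool:
        return record.get("updated_at", "") <= self._cursor_value


class OffsetStrategy:
    # Hypothetical delegate strategy: plain offset pagination.
    def next_page_token(
        self,
        response: Any,
        last_page_size: int,
        last_record: Optional[dict],
        last_page_token_value: Optional[Any] = None,
    ) -> Optional[Any]:
        return (last_page_token_value or 0) + last_page_size


stop_condition = AlreadySyncedStopCondition(cursor_value="2023-05-27T00:00:00Z")
delegate = OffsetStrategy()
last_record = {"updated_at": "2023-01-01T00:00:00Z"}

# Mirrors the decorator's logic: evaluate the stop condition first, then fall back to the delegate.
token = None if stop_condition.is_met(last_record) else delegate.next_page_token(None, 100, last_record, 0)
print(token)  # None -> pagination stops because the record predates the cursor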
67class StreamSlice(Mapping[str, Any]): 68 def __init__( 69 self, 70 *, 71 partition: Mapping[str, Any], 72 cursor_slice: Mapping[str, Any], 73 extra_fields: Optional[Mapping[str, Any]] = None, 74 ) -> None: 75 """ 76 :param partition: The partition keys representing a unique partition in the stream. 77 :param cursor_slice: The incremental cursor slice keys, such as dates or pagination tokens. 78 :param extra_fields: Additional fields that should not be part of the partition but passed along, such as metadata from the parent stream. 79 """ 80 self._partition = partition 81 self._cursor_slice = cursor_slice 82 self._extra_fields = extra_fields or {} 83 84 # Ensure that partition keys do not overlap with cursor slice keys 85 if partition.keys() & cursor_slice.keys(): 86 raise ValueError("Keys for partition and incremental sync cursor should not overlap") 87 88 self._stream_slice = dict(partition) | dict(cursor_slice) 89 90 @property 91 def partition(self) -> Mapping[str, Any]: 92 """Returns the partition portion of the stream slice.""" 93 p = self._partition 94 while isinstance(p, StreamSlice): 95 p = p.partition 96 return p 97 98 @property 99 def cursor_slice(self) -> Mapping[str, Any]: 100 """Returns the cursor slice portion of the stream slice.""" 101 c = self._cursor_slice 102 while isinstance(c, StreamSlice): 103 c = c.cursor_slice 104 return c 105 106 @property 107 def extra_fields(self) -> Mapping[str, Any]: 108 """Returns the extra fields that are not part of the partition.""" 109 return self._extra_fields 110 111 def __repr__(self) -> str: 112 return repr(self._stream_slice) 113 114 def __setitem__(self, key: str, value: Any) -> None: 115 raise ValueError("StreamSlice is immutable") 116 117 def __getitem__(self, key: str) -> Any: 118 return self._stream_slice[key] 119 120 def __len__(self) -> int: 121 return len(self._stream_slice) 122 123 def __iter__(self) -> Iterator[str]: 124 return iter(self._stream_slice) 125 126 def __contains__(self, item: Any) -> bool: 127 return item in self._stream_slice 128 129 def keys(self) -> KeysView[str]: 130 return self._stream_slice.keys() 131 132 def items(self) -> ItemsView[str, Any]: 133 return self._stream_slice.items() 134 135 def values(self) -> ValuesView[Any]: 136 return self._stream_slice.values() 137 138 def get(self, key: str, default: Any = None) -> Optional[Any]: 139 return self._stream_slice.get(key, default) 140 141 def __eq__(self, other: Any) -> bool: 142 if isinstance(other, dict): 143 return self._stream_slice == other 144 if isinstance(other, StreamSlice): 145 # noinspection PyProtectedMember 146 return self._partition == other._partition and self._cursor_slice == other._cursor_slice 147 return False 148 149 def __ne__(self, other: Any) -> bool: 150 return not self.__eq__(other) 151 152 def __json_serializable__(self) -> Any: 153 return self._stream_slice 154 155 def __hash__(self) -> int: 156 return SliceHasher.hash( 157 stream_slice=self._stream_slice 158 ) # no need to provide stream_name here as this is used for slicing the cursor 159 160 def __bool__(self) -> bool: 161 return bool(self._stream_slice) or bool(self._extra_fields)
A Mapping is a generic container for associating key/value pairs.
This class provides concrete generic implementations of all methods except for __getitem__, __iter__, and __len__.
68 def __init__( 69 self, 70 *, 71 partition: Mapping[str, Any], 72 cursor_slice: Mapping[str, Any], 73 extra_fields: Optional[Mapping[str, Any]] = None, 74 ) -> None: 75 """ 76 :param partition: The partition keys representing a unique partition in the stream. 77 :param cursor_slice: The incremental cursor slice keys, such as dates or pagination tokens. 78 :param extra_fields: Additional fields that should not be part of the partition but passed along, such as metadata from the parent stream. 79 """ 80 self._partition = partition 81 self._cursor_slice = cursor_slice 82 self._extra_fields = extra_fields or {} 83 84 # Ensure that partition keys do not overlap with cursor slice keys 85 if partition.keys() & cursor_slice.keys(): 86 raise ValueError("Keys for partition and incremental sync cursor should not overlap") 87 88 self._stream_slice = dict(partition) | dict(cursor_slice)
Parameters
- partition: The partition keys representing a unique partition in the stream.
- cursor_slice: The incremental cursor slice keys, such as dates or pagination tokens.
- extra_fields: Additional fields that should not be part of the partition but passed along, such as metadata from the parent stream.
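A construction sketch, assuming StreamSlice is importable from airbyte_cdk.sources.types; the keys and values below are placeholders:

from airbyte_cdk.sources.types import StreamSlice  # assumed import path

stream_slice = StreamSlice(
    partition={"account_id": "abc"},
    cursor_slice={"start": "2023-05-01", "end": "2023-05-31"},
    extra_fields={"parent_name": "accounts"},
)
print(stream_slice["account_id"])  # abc -- keys from both mappings are readable
print(stream_slice.cursor_slice)   # {'start': '2023-05-01', 'end': '2023-05-31'}
print(stream_slice.extra_fields)   # {'parent_name': 'accounts'} -- carried along, not part of the mapping
# stream_slice["account_id"] = "x"                        # would raise ValueError: StreamSlice is immutable
# StreamSlice(partition={"a": 1}, cursor_slice={"a": 2})  # would raise ValueError: overlapping keys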
90 @property 91 def partition(self) -> Mapping[str, Any]: 92 """Returns the partition portion of the stream slice.""" 93 p = self._partition 94 while isinstance(p, StreamSlice): 95 p = p.partition 96 return p
Returns the partition portion of the stream slice.
98 @property 99 def cursor_slice(self) -> Mapping[str, Any]: 100 """Returns the cursor slice portion of the stream slice.""" 101 c = self._cursor_slice 102 while isinstance(c, StreamSlice): 103 c = c.cursor_slice 104 return c
Returns the cursor slice portion of the stream slice.
106 @property 107 def extra_fields(self) -> Mapping[str, Any]: 108 """Returns the extra fields that are not part of the partition.""" 109 return self._extra_fields
Returns the extra fields that are not part of the partition.
D.items() -> a set-like object providing a view on D's items
80@dataclass 81class SubstreamPartitionRouter(PartitionRouter): 82 """ 83 Partition router that iterates over the parent's stream records and emits slices 84 Will populate the state with `partition_field` and `parent_slice` so they can be accessed by other components 85 86 Attributes: 87 parent_stream_configs (List[ParentStreamConfig]): parent streams to iterate over and their config 88 """ 89 90 parent_stream_configs: List[ParentStreamConfig] 91 config: Config 92 parameters: InitVar[Mapping[str, Any]] 93 94 def __post_init__(self, parameters: Mapping[str, Any]) -> None: 95 if not self.parent_stream_configs: 96 raise ValueError("SubstreamPartitionRouter needs at least 1 parent stream") 97 self._parameters = parameters 98 99 def get_request_params( 100 self, 101 stream_state: Optional[StreamState] = None, 102 stream_slice: Optional[StreamSlice] = None, 103 next_page_token: Optional[Mapping[str, Any]] = None, 104 ) -> Mapping[str, Any]: 105 # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response 106 return self._get_request_option(RequestOptionType.request_parameter, stream_slice) 107 108 def get_request_headers( 109 self, 110 stream_state: Optional[StreamState] = None, 111 stream_slice: Optional[StreamSlice] = None, 112 next_page_token: Optional[Mapping[str, Any]] = None, 113 ) -> Mapping[str, Any]: 114 # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response 115 return self._get_request_option(RequestOptionType.header, stream_slice) 116 117 def get_request_body_data( 118 self, 119 stream_state: Optional[StreamState] = None, 120 stream_slice: Optional[StreamSlice] = None, 121 next_page_token: Optional[Mapping[str, Any]] = None, 122 ) -> Mapping[str, Any]: 123 # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response 124 return self._get_request_option(RequestOptionType.body_data, stream_slice) 125 126 def get_request_body_json( 127 self, 128 stream_state: Optional[StreamState] = None, 129 stream_slice: Optional[StreamSlice] = None, 130 next_page_token: Optional[Mapping[str, Any]] = None, 131 ) -> Mapping[str, Any]: 132 # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response 133 return self._get_request_option(RequestOptionType.body_json, stream_slice) 134 135 def _get_request_option( 136 self, option_type: RequestOptionType, stream_slice: Optional[StreamSlice] 137 ) -> Mapping[str, Any]: 138 params: MutableMapping[str, Any] = {} 139 if stream_slice: 140 for parent_config in self.parent_stream_configs: 141 if ( 142 parent_config.request_option 143 and parent_config.request_option.inject_into == option_type 144 ): 145 key = parent_config.partition_field.eval(self.config) # type: ignore # partition_field is always casted to an interpolated string 146 value = stream_slice.get(key) 147 if value: 148 parent_config.request_option.inject_into_request(params, value, self.config) 149 return params 150 151 def stream_slices(self) -> Iterable[StreamSlice]: 152 """ 153 Iterate over each parent stream's record and create a StreamSlice for each record. 154 155 For each stream, iterate over its stream_slices. 156 For each stream slice, iterate over each record. 157 yield a stream slice for each such records. 158 159 If a parent slice contains no record, emit a slice with parent_record=None. 
160 161 The template string can interpolate the following values: 162 - parent_stream_slice: mapping representing the parent's stream slice 163 - parent_record: mapping representing the parent record 164 - parent_stream_name: string representing the parent stream name 165 """ 166 if not self.parent_stream_configs: 167 yield from [] 168 else: 169 for parent_stream_config in self.parent_stream_configs: 170 parent_stream = parent_stream_config.stream 171 parent_field = parent_stream_config.parent_key.eval(self.config) # type: ignore # parent_key is always casted to an interpolated string 172 partition_field = parent_stream_config.partition_field.eval(self.config) # type: ignore # partition_field is always casted to an interpolated string 173 extra_fields = None 174 if parent_stream_config.extra_fields: 175 extra_fields = [ 176 [field_path_part.eval(self.config) for field_path_part in field_path] # type: ignore [union-attr] 177 for field_path in parent_stream_config.extra_fields 178 ] 179 180 # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does 181 # not support either substreams or RFR, but something that needs to be considered once we do 182 for parent_record in parent_stream.read_only_records(): 183 parent_partition = None 184 # Skip non-records (eg AirbyteLogMessage) 185 if isinstance(parent_record, AirbyteMessage): 186 self.logger.warning( 187 f"Parent stream {parent_stream.name} returns records of type AirbyteMessage. This SubstreamPartitionRouter is not able to checkpoint incremental parent state." 188 ) 189 if parent_record.type == MessageType.RECORD: 190 parent_record = parent_record.record.data # type: ignore[union-attr, assignment] # record is always a Record 191 else: 192 continue 193 elif isinstance(parent_record, Record): 194 parent_partition = ( 195 parent_record.associated_slice.partition 196 if parent_record.associated_slice 197 else {} 198 ) 199 parent_record = parent_record.data 200 elif not isinstance(parent_record, Mapping): 201 # The parent_record should only take the form of a Record, AirbyteMessage, or Mapping. 
Anything else is invalid 202 raise AirbyteTracedException( 203 message=f"Parent stream returned records as invalid type {type(parent_record)}" 204 ) 205 try: 206 partition_value = dpath.get( 207 parent_record, # type: ignore [arg-type] 208 parent_field, 209 ) 210 except KeyError: 211 continue 212 213 # Add extra fields 214 extracted_extra_fields = self._extract_extra_fields(parent_record, extra_fields) 215 216 if parent_stream_config.lazy_read_pointer: 217 extracted_extra_fields = { 218 "child_response": self._extract_child_response( 219 parent_record, 220 parent_stream_config.lazy_read_pointer, # type: ignore[arg-type] # lazy_read_pointer type handeled in __post_init__ of parent_stream_config 221 ), 222 **extracted_extra_fields, 223 } 224 225 yield StreamSlice( 226 partition={ 227 partition_field: partition_value, 228 "parent_slice": parent_partition or {}, 229 }, 230 cursor_slice={}, 231 extra_fields=extracted_extra_fields, 232 ) 233 234 def _extract_child_response( 235 self, parent_record: Mapping[str, Any] | AirbyteMessage, pointer: List[InterpolatedString] 236 ) -> requests.Response: 237 """Extract child records from a parent record based on lazy pointers.""" 238 239 def _create_response(data: MutableMapping[str, Any]) -> SafeResponse: 240 """Create a SafeResponse with the given data.""" 241 response = SafeResponse() 242 response.content = json.dumps(data).encode("utf-8") 243 response.status_code = 200 244 return response 245 246 path = [path.eval(self.config) for path in pointer] 247 return _create_response(dpath.get(parent_record, path, default=[])) # type: ignore # argunet will be a MutableMapping, given input data structure 248 249 def _extract_extra_fields( 250 self, 251 parent_record: Mapping[str, Any] | AirbyteMessage, 252 extra_fields: Optional[List[List[str]]] = None, 253 ) -> Mapping[str, Any]: 254 """ 255 Extracts additional fields specified by their paths from the parent record. 256 257 Args: 258 parent_record (Mapping[str, Any]): The record from the parent stream to extract fields from. 259 extra_fields (Optional[List[List[str]]]): A list of field paths (as lists of strings) to extract from the parent record. 260 261 Returns: 262 Mapping[str, Any]: A dictionary containing the extracted fields. 263 The keys are the joined field paths, and the values are the corresponding extracted values. 264 """ 265 extracted_extra_fields = {} 266 if extra_fields: 267 for extra_field_path in extra_fields: 268 try: 269 extra_field_value = dpath.get( 270 parent_record, # type: ignore [arg-type] 271 extra_field_path, 272 ) 273 self.logger.debug( 274 f"Extracted extra_field_path: {extra_field_path} with value: {extra_field_value}" 275 ) 276 except KeyError: 277 self.logger.debug(f"Failed to extract extra_field_path: {extra_field_path}") 278 extra_field_value = None 279 extracted_extra_fields[".".join(extra_field_path)] = extra_field_value 280 return extracted_extra_fields 281 282 def set_initial_state(self, stream_state: StreamState) -> None: 283 """ 284 Set the state of the parent streams. 285 286 If the `parent_state` key is missing from `stream_state`, migrate the child stream state to the parent stream's state format. 287 This migration applies only to parent streams with incremental dependencies. 288 289 Args: 290 stream_state (StreamState): The state of the streams to be set. 
291 292 Example of state format: 293 { 294 "parent_state": { 295 "parent_stream_name1": { 296 "last_updated": "2023-05-27T00:00:00Z" 297 }, 298 "parent_stream_name2": { 299 "last_updated": "2023-05-27T00:00:00Z" 300 } 301 } 302 } 303 304 Example of migrating to parent state format: 305 - Initial state: 306 { 307 "updated_at": "2023-05-27T00:00:00Z" 308 } 309 - After migration: 310 { 311 "updated_at": "2023-05-27T00:00:00Z", 312 "parent_state": { 313 "parent_stream_name": { 314 "parent_stream_cursor": "2023-05-27T00:00:00Z" 315 } 316 } 317 } 318 """ 319 if not stream_state: 320 return 321 322 parent_state = stream_state.get("parent_state", {}) 323 324 # Set state for each parent stream with an incremental dependency 325 for parent_config in self.parent_stream_configs: 326 if ( 327 not parent_state.get(parent_config.stream.name, {}) 328 and parent_config.incremental_dependency 329 ): 330 # Migrate child state to parent state format 331 parent_state = self._migrate_child_state_to_parent_state(stream_state) 332 333 if parent_config.incremental_dependency: 334 parent_config.stream.state = parent_state.get(parent_config.stream.name, {}) 335 336 def _migrate_child_state_to_parent_state(self, stream_state: StreamState) -> StreamState: 337 """ 338 Migrate the child or global stream state into the parent stream's state format. 339 340 This method converts the child stream state—or, if present, the global state—into a format that is 341 compatible with parent streams that use incremental synchronization. The migration occurs only for 342 parent streams with incremental dependencies. It filters out per-partition states and retains only the 343 global state in the form {cursor_field: cursor_value}. 344 345 The method supports multiple input formats: 346 - A simple global state, e.g.: 347 {"updated_at": "2023-05-27T00:00:00Z"} 348 - A state object that contains a "state" key (which is assumed to hold the global state), e.g.: 349 {"state": {"updated_at": "2023-05-27T00:00:00Z"}, ...} 350 In this case, the migration uses the first value from the "state" dictionary. 351 - Any per-partition state formats or other non-simple structures are ignored during migration. 352 353 Args: 354 stream_state (StreamState): The state to migrate. Expected formats include: 355 - {"updated_at": "2023-05-27T00:00:00Z"} 356 - {"state": {"updated_at": "2023-05-27T00:00:00Z"}, ...} 357 (In this format, only the first global state value is used, and per-partition states are ignored.) 358 359 Returns: 360 StreamState: A migrated state for parent streams in the format: 361 { 362 "parent_stream_name": {"parent_stream_cursor": "2023-05-27T00:00:00Z"} 363 } 364 where each parent stream with an incremental dependency is assigned its corresponding cursor value. 365 366 Example: 367 Input: {"updated_at": "2023-05-27T00:00:00Z"} 368 Output: { 369 "parent_stream_name": {"parent_stream_cursor": "2023-05-27T00:00:00Z"} 370 } 371 """ 372 substream_state_values = list(stream_state.values()) 373 substream_state = substream_state_values[0] if substream_state_values else {} 374 375 # Ignore per-partition states or invalid formats. 376 if isinstance(substream_state, (list, dict)) or len(substream_state_values) != 1: 377 # If a global state is present under the key "state", use its first value. 378 if "state" in stream_state and isinstance(stream_state["state"], dict): 379 substream_state = list(stream_state["state"].values())[0] 380 else: 381 return {} 382 383 # Build the parent state for all parent streams with incremental dependencies. 
384 parent_state = {} 385 if substream_state: 386 for parent_config in self.parent_stream_configs: 387 if parent_config.incremental_dependency: 388 parent_state[parent_config.stream.name] = { 389 parent_config.stream.cursor_field: substream_state 390 } 391 392 return parent_state 393 394 def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: 395 """ 396 Get the state of the parent streams. 397 398 Returns: 399 StreamState: The current state of the parent streams. 400 401 Example of state format: 402 { 403 "parent_stream_name1": { 404 "last_updated": "2023-05-27T00:00:00Z" 405 }, 406 "parent_stream_name2": { 407 "last_updated": "2023-05-27T00:00:00Z" 408 } 409 } 410 """ 411 parent_state = {} 412 for parent_config in self.parent_stream_configs: 413 if parent_config.incremental_dependency: 414 parent_state[parent_config.stream.name] = copy.deepcopy(parent_config.stream.state) 415 return parent_state 416 417 @property 418 def logger(self) -> logging.Logger: 419 return logging.getLogger("airbyte.SubstreamPartitionRouter")
Partition router that iterates over the parent's stream records and emits slices
Will populate the state with `partition_field` and `parent_slice` so they can be accessed by other components
Attributes:
- parent_stream_configs (List[ParentStreamConfig]): parent streams to iterate over and their config
99 def get_request_params( 100 self, 101 stream_state: Optional[StreamState] = None, 102 stream_slice: Optional[StreamSlice] = None, 103 next_page_token: Optional[Mapping[str, Any]] = None, 104 ) -> Mapping[str, Any]: 105 # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response 106 return self._get_request_option(RequestOptionType.request_parameter, stream_slice)
Specifies the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g.: you might want to define query parameters for paging if next_page_token is not None.
108 def get_request_headers( 109 self, 110 stream_state: Optional[StreamState] = None, 111 stream_slice: Optional[StreamSlice] = None, 112 next_page_token: Optional[Mapping[str, Any]] = None, 113 ) -> Mapping[str, Any]: 114 # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response 115 return self._get_request_option(RequestOptionType.header, stream_slice)
Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
117 def get_request_body_data( 118 self, 119 stream_state: Optional[StreamState] = None, 120 stream_slice: Optional[StreamSlice] = None, 121 next_page_token: Optional[Mapping[str, Any]] = None, 122 ) -> Mapping[str, Any]: 123 # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response 124 return self._get_request_option(RequestOptionType.body_data, stream_slice)
Specifies how to populate the body of the request with a non-JSON payload.
If it returns a string, the string is sent as is; if it returns a dict, the dict is converted to a urlencoded form. E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
Note that only one of the 'request_body_data' and 'request_body_json' functions may be overridden.
126 def get_request_body_json( 127 self, 128 stream_state: Optional[StreamState] = None, 129 stream_slice: Optional[StreamSlice] = None, 130 next_page_token: Optional[Mapping[str, Any]] = None, 131 ) -> Mapping[str, Any]: 132 # Pass the stream_slice from the argument, not the cursor because the cursor is updated after processing the response 133 return self._get_request_option(RequestOptionType.body_json, stream_slice)
Specifies how to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' functions may be overridden.
151 def stream_slices(self) -> Iterable[StreamSlice]: 152 """ 153 Iterate over each parent stream's record and create a StreamSlice for each record. 154 155 For each stream, iterate over its stream_slices. 156 For each stream slice, iterate over each record. 157 yield a stream slice for each such records. 158 159 If a parent slice contains no record, emit a slice with parent_record=None. 160 161 The template string can interpolate the following values: 162 - parent_stream_slice: mapping representing the parent's stream slice 163 - parent_record: mapping representing the parent record 164 - parent_stream_name: string representing the parent stream name 165 """ 166 if not self.parent_stream_configs: 167 yield from [] 168 else: 169 for parent_stream_config in self.parent_stream_configs: 170 parent_stream = parent_stream_config.stream 171 parent_field = parent_stream_config.parent_key.eval(self.config) # type: ignore # parent_key is always casted to an interpolated string 172 partition_field = parent_stream_config.partition_field.eval(self.config) # type: ignore # partition_field is always casted to an interpolated string 173 extra_fields = None 174 if parent_stream_config.extra_fields: 175 extra_fields = [ 176 [field_path_part.eval(self.config) for field_path_part in field_path] # type: ignore [union-attr] 177 for field_path in parent_stream_config.extra_fields 178 ] 179 180 # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does 181 # not support either substreams or RFR, but something that needs to be considered once we do 182 for parent_record in parent_stream.read_only_records(): 183 parent_partition = None 184 # Skip non-records (eg AirbyteLogMessage) 185 if isinstance(parent_record, AirbyteMessage): 186 self.logger.warning( 187 f"Parent stream {parent_stream.name} returns records of type AirbyteMessage. This SubstreamPartitionRouter is not able to checkpoint incremental parent state." 188 ) 189 if parent_record.type == MessageType.RECORD: 190 parent_record = parent_record.record.data # type: ignore[union-attr, assignment] # record is always a Record 191 else: 192 continue 193 elif isinstance(parent_record, Record): 194 parent_partition = ( 195 parent_record.associated_slice.partition 196 if parent_record.associated_slice 197 else {} 198 ) 199 parent_record = parent_record.data 200 elif not isinstance(parent_record, Mapping): 201 # The parent_record should only take the form of a Record, AirbyteMessage, or Mapping. Anything else is invalid 202 raise AirbyteTracedException( 203 message=f"Parent stream returned records as invalid type {type(parent_record)}" 204 ) 205 try: 206 partition_value = dpath.get( 207 parent_record, # type: ignore [arg-type] 208 parent_field, 209 ) 210 except KeyError: 211 continue 212 213 # Add extra fields 214 extracted_extra_fields = self._extract_extra_fields(parent_record, extra_fields) 215 216 if parent_stream_config.lazy_read_pointer: 217 extracted_extra_fields = { 218 "child_response": self._extract_child_response( 219 parent_record, 220 parent_stream_config.lazy_read_pointer, # type: ignore[arg-type] # lazy_read_pointer type handeled in __post_init__ of parent_stream_config 221 ), 222 **extracted_extra_fields, 223 } 224 225 yield StreamSlice( 226 partition={ 227 partition_field: partition_value, 228 "parent_slice": parent_partition or {}, 229 }, 230 cursor_slice={}, 231 extra_fields=extracted_extra_fields, 232 )
Iterate over each parent stream's record and create a StreamSlice for each record.
For each stream, iterate over its stream_slices. For each stream slice, iterate over each record. Yield a stream slice for each such record.
If a parent slice contains no record, emit a slice with parent_record=None.
The template string can interpolate the following values:
- parent_stream_slice: mapping representing the parent's stream slice
- parent_record: mapping representing the parent record
- parent_stream_name: string representing the parent stream name
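To make the shape of the emitted slices concrete, here is an illustrative example with hypothetical values, assuming parent_key "id", partition_field "repository_id", no extra fields, and a parent stream that is not itself partitioned (so parent_slice is empty):

from airbyte_cdk.sources.types import StreamSlice  # assumed import path

parent_record = {"id": 42, "name": "airbyte", "updated_at": "2023-05-27T00:00:00Z"}

# For this parent record, stream_slices() would yield a slice equivalent to:
expected = StreamSlice(
    partition={"repository_id": 42, "parent_slice": {}},
    cursor_slice={},
    extra_fields={},
)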
282 def set_initial_state(self, stream_state: StreamState) -> None: 283 """ 284 Set the state of the parent streams. 285 286 If the `parent_state` key is missing from `stream_state`, migrate the child stream state to the parent stream's state format. 287 This migration applies only to parent streams with incremental dependencies. 288 289 Args: 290 stream_state (StreamState): The state of the streams to be set. 291 292 Example of state format: 293 { 294 "parent_state": { 295 "parent_stream_name1": { 296 "last_updated": "2023-05-27T00:00:00Z" 297 }, 298 "parent_stream_name2": { 299 "last_updated": "2023-05-27T00:00:00Z" 300 } 301 } 302 } 303 304 Example of migrating to parent state format: 305 - Initial state: 306 { 307 "updated_at": "2023-05-27T00:00:00Z" 308 } 309 - After migration: 310 { 311 "updated_at": "2023-05-27T00:00:00Z", 312 "parent_state": { 313 "parent_stream_name": { 314 "parent_stream_cursor": "2023-05-27T00:00:00Z" 315 } 316 } 317 } 318 """ 319 if not stream_state: 320 return 321 322 parent_state = stream_state.get("parent_state", {}) 323 324 # Set state for each parent stream with an incremental dependency 325 for parent_config in self.parent_stream_configs: 326 if ( 327 not parent_state.get(parent_config.stream.name, {}) 328 and parent_config.incremental_dependency 329 ): 330 # Migrate child state to parent state format 331 parent_state = self._migrate_child_state_to_parent_state(stream_state) 332 333 if parent_config.incremental_dependency: 334 parent_config.stream.state = parent_state.get(parent_config.stream.name, {})
Set the state of the parent streams.
If the `parent_state` key is missing from `stream_state`, migrate the child stream state to the parent stream's state format.
This migration applies only to parent streams with incremental dependencies.
Arguments:
- stream_state (StreamState): The state of the streams to be set.
Example of state format:
{
    "parent_state": {
        "parent_stream_name1": {
            "last_updated": "2023-05-27T00:00:00Z"
        },
        "parent_stream_name2": {
            "last_updated": "2023-05-27T00:00:00Z"
        }
    }
}
Example of migrating to parent state format:
- Initial state: { "updated_at": "2023-05-27T00:00:00Z" }
- After migration: { "updated_at": "2023-05-27T00:00:00Z", "parent_state": { "parent_stream_name": { "parent_stream_cursor": "2023-05-27T00:00:00Z" } } }
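A pure-Python sketch of the migration described above (not a call into the CDK), assuming a single parent stream named "projects" whose cursor field is "updated_at" and which has incremental_dependency enabled:

# Legacy child state, with no "parent_state" key:
child_state = {"updated_at": "2023-05-27T00:00:00Z"}

# set_initial_state() migrates it into the parent state format ...
migrated_parent_state = {"projects": {"updated_at": "2023-05-27T00:00:00Z"}}

# ... and then assigns migrated_parent_state["projects"] as the "projects"
# parent stream's state, so the parent resumes from the child's cursor value.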
394 def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: 395 """ 396 Get the state of the parent streams. 397 398 Returns: 399 StreamState: The current state of the parent streams. 400 401 Example of state format: 402 { 403 "parent_stream_name1": { 404 "last_updated": "2023-05-27T00:00:00Z" 405 }, 406 "parent_stream_name2": { 407 "last_updated": "2023-05-27T00:00:00Z" 408 } 409 } 410 """ 411 parent_state = {} 412 for parent_config in self.parent_stream_configs: 413 if parent_config.incremental_dependency: 414 parent_state[parent_config.stream.name] = copy.deepcopy(parent_config.stream.state) 415 return parent_state
Get the state of the parent streams.
Returns:
StreamState: The current state of the parent streams.
Example of state format:
{
    "parent_stream_name1": {
        "last_updated": "2023-05-27T00:00:00Z"
    },
    "parent_stream_name2": {
        "last_updated": "2023-05-27T00:00:00Z"
    }
}
18class YamlDeclarativeSource(ConcurrentDeclarativeSource[List[AirbyteStateMessage]]): 19 """Declarative source defined by a yaml file""" 20 21 def __init__( 22 self, 23 path_to_yaml: str, 24 debug: bool = False, 25 catalog: Optional[ConfiguredAirbyteCatalog] = None, 26 config: Optional[Mapping[str, Any]] = None, 27 state: Optional[List[AirbyteStateMessage]] = None, 28 ) -> None: 29 """ 30 :param path_to_yaml: Path to the yaml file describing the source 31 """ 32 self._path_to_yaml = path_to_yaml 33 source_config = self._read_and_parse_yaml_file(path_to_yaml) 34 35 super().__init__( 36 catalog=catalog or ConfiguredAirbyteCatalog(streams=[]), 37 config=config or {}, 38 state=state or [], 39 source_config=source_config, 40 ) 41 42 def _read_and_parse_yaml_file(self, path_to_yaml_file: str) -> ConnectionDefinition: 43 try: 44 # For testing purposes, we want to allow to just pass a file 45 with open(path_to_yaml_file, "r") as f: 46 return yaml.safe_load(f) # type: ignore # we assume the yaml represents a ConnectionDefinition 47 except FileNotFoundError: 48 # Running inside the container, the working directory during an operation is not structured the same as the static files 49 package = self.__class__.__module__.split(".")[0] 50 51 yaml_config = pkgutil.get_data(package, path_to_yaml_file) 52 if yaml_config: 53 decoded_yaml = yaml_config.decode() 54 return self._parse(decoded_yaml) 55 return {} 56 57 def _emit_manifest_debug_message(self, extra_args: dict[str, Any]) -> None: 58 extra_args["path_to_yaml"] = self._path_to_yaml 59 60 @staticmethod 61 def _parse(connection_definition_str: str) -> ConnectionDefinition: 62 """ 63 Parses a yaml file into a manifest. Component references still exist in the manifest which will be 64 resolved during the creating of the DeclarativeSource. 65 :param connection_definition_str: yaml string to parse 66 :return: The ConnectionDefinition parsed from connection_definition_str 67 """ 68 return yaml.safe_load(connection_definition_str) # type: ignore # yaml.safe_load doesn't return a type but know it is a Mapping
Declarative source defined by a yaml file
21 def __init__( 22 self, 23 path_to_yaml: str, 24 debug: bool = False, 25 catalog: Optional[ConfiguredAirbyteCatalog] = None, 26 config: Optional[Mapping[str, Any]] = None, 27 state: Optional[List[AirbyteStateMessage]] = None, 28 ) -> None: 29 """ 30 :param path_to_yaml: Path to the yaml file describing the source 31 """ 32 self._path_to_yaml = path_to_yaml 33 source_config = self._read_and_parse_yaml_file(path_to_yaml) 34 35 super().__init__( 36 catalog=catalog or ConfiguredAirbyteCatalog(streams=[]), 37 config=config or {}, 38 state=state or [], 39 source_config=source_config, 40 )
Parameters
- path_to_yaml: Path to the yaml file describing the source
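A minimal loading sketch, where "manifest.yaml" is a placeholder path to a valid declarative manifest bundled with the connector:

import logging

from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource

source = YamlDeclarativeSource(path_to_yaml="manifest.yaml")
print(source.spec(logging.getLogger("airbyte")))  # connector specification resolved from the manifest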
Inherited Members
336def launch(source: Source, args: List[str]) -> None: 337 source_entrypoint = AirbyteEntrypoint(source) 338 parsed_args = source_entrypoint.parse_args(args) 339 # temporarily removes the PrintBuffer because we're seeing weird print behavior for concurrent syncs 340 # Refer to: https://github.com/airbytehq/oncall/issues/6235 341 with PRINT_BUFFER: 342 for message in source_entrypoint.run(parsed_args): 343 # simply printing is creating issues for concurrent CDK as Python uses different two instructions to print: one for the message and 344 # the other for the break line. Adding `\n` to the message ensure that both are printed at the same time 345 print(f"{message}\n", end="")
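This is the function a connector's entrypoint script typically calls. A sketch of a main.py, where source_example and SourceExample are placeholder names for your connector package and Source implementation:

import sys

from airbyte_cdk.entrypoint import launch

from source_example import SourceExample  # hypothetical connector package

if __name__ == "__main__":
    launch(SourceExample(), sys.argv[1:])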
54class AirbyteEntrypoint(object): 55 def __init__(self, source: Source): 56 init_uncaught_exception_handler(logger) 57 58 # Deployment mode is read when instantiating the entrypoint because it is the common path shared by syncs and connector builder test requests 59 if is_cloud_environment(): 60 _init_internal_request_filter() 61 62 self.source = source 63 self.logger = logging.getLogger(f"airbyte.{getattr(source, 'name', '')}") 64 65 @staticmethod 66 def parse_args(args: List[str]) -> argparse.Namespace: 67 # set up parent parsers 68 parent_parser = argparse.ArgumentParser(add_help=False) 69 parent_parser.add_argument( 70 "--debug", action="store_true", help="enables detailed debug logs related to the sync" 71 ) 72 main_parser = argparse.ArgumentParser() 73 subparsers = main_parser.add_subparsers(title="commands", dest="command") 74 75 # spec 76 subparsers.add_parser( 77 "spec", help="outputs the json configuration specification", parents=[parent_parser] 78 ) 79 80 # check 81 check_parser = subparsers.add_parser( 82 "check", help="checks the config can be used to connect", parents=[parent_parser] 83 ) 84 required_check_parser = check_parser.add_argument_group("required named arguments") 85 required_check_parser.add_argument( 86 "--config", type=str, required=True, help="path to the json configuration file" 87 ) 88 89 # discover 90 discover_parser = subparsers.add_parser( 91 "discover", 92 help="outputs a catalog describing the source's schema", 93 parents=[parent_parser], 94 ) 95 required_discover_parser = discover_parser.add_argument_group("required named arguments") 96 required_discover_parser.add_argument( 97 "--config", type=str, required=True, help="path to the json configuration file" 98 ) 99 100 # read 101 read_parser = subparsers.add_parser( 102 "read", help="reads the source and outputs messages to STDOUT", parents=[parent_parser] 103 ) 104 105 read_parser.add_argument( 106 "--state", type=str, required=False, help="path to the json-encoded state file" 107 ) 108 required_read_parser = read_parser.add_argument_group("required named arguments") 109 required_read_parser.add_argument( 110 "--config", type=str, required=True, help="path to the json configuration file" 111 ) 112 required_read_parser.add_argument( 113 "--catalog", 114 type=str, 115 required=True, 116 help="path to the catalog used to determine which data to read", 117 ) 118 119 return main_parser.parse_args(args) 120 121 def run(self, parsed_args: argparse.Namespace) -> Iterable[str]: 122 cmd = parsed_args.command 123 if not cmd: 124 raise Exception("No command passed") 125 126 if hasattr(parsed_args, "debug") and parsed_args.debug: 127 self.logger.setLevel(logging.DEBUG) 128 logger.setLevel(logging.DEBUG) 129 self.logger.debug("Debug logs enabled") 130 else: 131 self.logger.setLevel(logging.INFO) 132 133 source_spec: ConnectorSpecification = self.source.spec(self.logger) 134 try: 135 with tempfile.TemporaryDirectory( 136 # Cleanup can fail on Windows due to file locks. Ignore if so, 137 # rather than failing the whole process. 
138 ignore_cleanup_errors=True, 139 ) as temp_dir: 140 os.environ[ENV_REQUEST_CACHE_PATH] = ( 141 temp_dir # set this as default directory for request_cache to store *.sqlite files 142 ) 143 if cmd == "spec": 144 message = AirbyteMessage(type=Type.SPEC, spec=source_spec) 145 yield from [ 146 self.airbyte_message_to_string(queued_message) 147 for queued_message in self._emit_queued_messages(self.source) 148 ] 149 yield self.airbyte_message_to_string(message) 150 else: 151 raw_config = self.source.read_config(parsed_args.config) 152 config = self.source.configure(raw_config, temp_dir) 153 154 yield from [ 155 self.airbyte_message_to_string(queued_message) 156 for queued_message in self._emit_queued_messages(self.source) 157 ] 158 if cmd == "check": 159 yield from map( 160 AirbyteEntrypoint.airbyte_message_to_string, 161 self.check(source_spec, config), 162 ) 163 elif cmd == "discover": 164 yield from map( 165 AirbyteEntrypoint.airbyte_message_to_string, 166 self.discover(source_spec, config), 167 ) 168 elif cmd == "read": 169 config_catalog = self.source.read_catalog(parsed_args.catalog) 170 state = self.source.read_state(parsed_args.state) 171 172 yield from map( 173 AirbyteEntrypoint.airbyte_message_to_string, 174 self.read(source_spec, config, config_catalog, state), 175 ) 176 else: 177 raise Exception("Unexpected command " + cmd) 178 finally: 179 yield from [ 180 self.airbyte_message_to_string(queued_message) 181 for queued_message in self._emit_queued_messages(self.source) 182 ] 183 184 def check( 185 self, source_spec: ConnectorSpecification, config: TConfig 186 ) -> Iterable[AirbyteMessage]: 187 self.set_up_secret_filter(config, source_spec.connectionSpecification) 188 try: 189 self.validate_connection(source_spec, config) 190 except AirbyteTracedException as traced_exc: 191 connection_status = traced_exc.as_connection_status_message() 192 # The platform uses the exit code to surface unexpected failures so we raise the exception if the failure type not a config error 193 # If the failure is not exceptional, we'll emit a failed connection status message and return 194 if traced_exc.failure_type != FailureType.config_error: 195 raise traced_exc 196 if connection_status: 197 yield from self._emit_queued_messages(self.source) 198 yield connection_status 199 return 200 201 try: 202 check_result = self.source.check(self.logger, config) 203 except AirbyteTracedException as traced_exc: 204 yield traced_exc.as_airbyte_message() 205 # The platform uses the exit code to surface unexpected failures so we raise the exception if the failure type not a config error 206 # If the failure is not exceptional, we'll emit a failed connection status message and return 207 if traced_exc.failure_type != FailureType.config_error: 208 raise traced_exc 209 else: 210 yield AirbyteMessage( 211 type=Type.CONNECTION_STATUS, 212 connectionStatus=AirbyteConnectionStatus( 213 status=Status.FAILED, message=traced_exc.message 214 ), 215 ) 216 return 217 if check_result.status == Status.SUCCEEDED: 218 self.logger.info("Check succeeded") 219 else: 220 self.logger.error("Check failed") 221 222 yield from self._emit_queued_messages(self.source) 223 yield AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=check_result) 224 225 def discover( 226 self, source_spec: ConnectorSpecification, config: TConfig 227 ) -> Iterable[AirbyteMessage]: 228 self.set_up_secret_filter(config, source_spec.connectionSpecification) 229 if self.source.check_config_against_spec: 230 self.validate_connection(source_spec, config) 231 
catalog = self.source.discover(self.logger, config) 232 233 yield from self._emit_queued_messages(self.source) 234 yield AirbyteMessage(type=Type.CATALOG, catalog=catalog) 235 236 def read( 237 self, source_spec: ConnectorSpecification, config: TConfig, catalog: Any, state: list[Any] 238 ) -> Iterable[AirbyteMessage]: 239 self.set_up_secret_filter(config, source_spec.connectionSpecification) 240 if self.source.check_config_against_spec: 241 self.validate_connection(source_spec, config) 242 243 # The Airbyte protocol dictates that counts be expressed as float/double to better protect against integer overflows 244 stream_message_counter: DefaultDict[HashableStreamDescriptor, float] = defaultdict(float) 245 for message in self.source.read(self.logger, config, catalog, state): 246 yield self.handle_record_counts(message, stream_message_counter) 247 for message in self._emit_queued_messages(self.source): 248 yield self.handle_record_counts(message, stream_message_counter) 249 250 @staticmethod 251 def handle_record_counts( 252 message: AirbyteMessage, stream_message_count: DefaultDict[HashableStreamDescriptor, float] 253 ) -> AirbyteMessage: 254 match message.type: 255 case Type.RECORD: 256 if message.record is None: 257 raise ValueError("Record message must have a record attribute") 258 259 stream_message_count[ 260 HashableStreamDescriptor( 261 name=message.record.stream, # type: ignore[union-attr] # record has `stream` 262 namespace=message.record.namespace, # type: ignore[union-attr] # record has `namespace` 263 ) 264 ] += 1.0 265 case Type.STATE: 266 if message.state is None: 267 raise ValueError("State message must have a state attribute") 268 269 stream_descriptor = message_utils.get_stream_descriptor(message) 270 271 # Set record count from the counter onto the state message 272 message.state.sourceStats = message.state.sourceStats or AirbyteStateStats() # type: ignore[union-attr] # state has `sourceStats` 273 message.state.sourceStats.recordCount = stream_message_count.get( # type: ignore[union-attr] # state has `sourceStats` 274 stream_descriptor, 0.0 275 ) 276 277 # Reset the counter 278 stream_message_count[stream_descriptor] = 0.0 279 return message 280 281 @staticmethod 282 def validate_connection(source_spec: ConnectorSpecification, config: TConfig) -> None: 283 # Remove internal flags from config before validating so 284 # jsonschema's additionalProperties flag won't fail the validation 285 connector_config, _ = split_config(config) 286 check_config_against_spec_or_exit(connector_config, source_spec) 287 288 @staticmethod 289 def set_up_secret_filter(config: TConfig, connection_specification: Mapping[str, Any]) -> None: 290 # Now that we have the config, we can use it to get a list of ai airbyte_secrets 291 # that we should filter in logging to avoid leaking secrets 292 config_secrets = get_secrets(connection_specification, config) 293 update_secrets(config_secrets) 294 295 @staticmethod 296 def airbyte_message_to_string(airbyte_message: AirbyteMessage) -> str: 297 global _HAS_LOGGED_FOR_SERIALIZATION_ERROR 298 serialized_message = AirbyteMessageSerializer.dump(airbyte_message) 299 try: 300 return orjson.dumps(serialized_message).decode() 301 except Exception as exception: 302 if not _HAS_LOGGED_FOR_SERIALIZATION_ERROR: 303 logger.warning( 304 f"There was an error during the serialization of an AirbyteMessage: `{exception}`. This might impact the sync performances." 
305 ) 306 _HAS_LOGGED_FOR_SERIALIZATION_ERROR = True 307 return json.dumps(serialized_message) 308 309 @classmethod 310 def extract_state(cls, args: List[str]) -> Optional[Any]: 311 parsed_args = cls.parse_args(args) 312 if hasattr(parsed_args, "state"): 313 return parsed_args.state 314 return None 315 316 @classmethod 317 def extract_catalog(cls, args: List[str]) -> Optional[Any]: 318 parsed_args = cls.parse_args(args) 319 if hasattr(parsed_args, "catalog"): 320 return parsed_args.catalog 321 return None 322 323 @classmethod 324 def extract_config(cls, args: List[str]) -> Optional[Any]: 325 parsed_args = cls.parse_args(args) 326 if hasattr(parsed_args, "config"): 327 return parsed_args.config 328 return None 329 330 def _emit_queued_messages(self, source: Source) -> Iterable[AirbyteMessage]: 331 if hasattr(source, "message_repository") and source.message_repository: 332 yield from source.message_repository.consume_queue() 333 return
55 def __init__(self, source: Source): 56 init_uncaught_exception_handler(logger) 57 58 # Deployment mode is read when instantiating the entrypoint because it is the common path shared by syncs and connector builder test requests 59 if is_cloud_environment(): 60 _init_internal_request_filter() 61 62 self.source = source 63 self.logger = logging.getLogger(f"airbyte.{getattr(source, 'name', '')}")
65 @staticmethod 66 def parse_args(args: List[str]) -> argparse.Namespace: 67 # set up parent parsers 68 parent_parser = argparse.ArgumentParser(add_help=False) 69 parent_parser.add_argument( 70 "--debug", action="store_true", help="enables detailed debug logs related to the sync" 71 ) 72 main_parser = argparse.ArgumentParser() 73 subparsers = main_parser.add_subparsers(title="commands", dest="command") 74 75 # spec 76 subparsers.add_parser( 77 "spec", help="outputs the json configuration specification", parents=[parent_parser] 78 ) 79 80 # check 81 check_parser = subparsers.add_parser( 82 "check", help="checks the config can be used to connect", parents=[parent_parser] 83 ) 84 required_check_parser = check_parser.add_argument_group("required named arguments") 85 required_check_parser.add_argument( 86 "--config", type=str, required=True, help="path to the json configuration file" 87 ) 88 89 # discover 90 discover_parser = subparsers.add_parser( 91 "discover", 92 help="outputs a catalog describing the source's schema", 93 parents=[parent_parser], 94 ) 95 required_discover_parser = discover_parser.add_argument_group("required named arguments") 96 required_discover_parser.add_argument( 97 "--config", type=str, required=True, help="path to the json configuration file" 98 ) 99 100 # read 101 read_parser = subparsers.add_parser( 102 "read", help="reads the source and outputs messages to STDOUT", parents=[parent_parser] 103 ) 104 105 read_parser.add_argument( 106 "--state", type=str, required=False, help="path to the json-encoded state file" 107 ) 108 required_read_parser = read_parser.add_argument_group("required named arguments") 109 required_read_parser.add_argument( 110 "--config", type=str, required=True, help="path to the json configuration file" 111 ) 112 required_read_parser.add_argument( 113 "--catalog", 114 type=str, 115 required=True, 116 help="path to the catalog used to determine which data to read", 117 ) 118 119 return main_parser.parse_args(args)
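Because parse_args is a staticmethod, it can also be invoked directly, for example in tests; the file paths below are placeholders:

from airbyte_cdk.entrypoint import AirbyteEntrypoint

parsed = AirbyteEntrypoint.parse_args(
    ["read", "--config", "secrets/config.json", "--catalog", "integration_tests/configured_catalog.json"]
)
print(parsed.command)  # read
print(parsed.config)   # secrets/config.json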
121 def run(self, parsed_args: argparse.Namespace) -> Iterable[str]: 122 cmd = parsed_args.command 123 if not cmd: 124 raise Exception("No command passed") 125 126 if hasattr(parsed_args, "debug") and parsed_args.debug: 127 self.logger.setLevel(logging.DEBUG) 128 logger.setLevel(logging.DEBUG) 129 self.logger.debug("Debug logs enabled") 130 else: 131 self.logger.setLevel(logging.INFO) 132 133 source_spec: ConnectorSpecification = self.source.spec(self.logger) 134 try: 135 with tempfile.TemporaryDirectory( 136 # Cleanup can fail on Windows due to file locks. Ignore if so, 137 # rather than failing the whole process. 138 ignore_cleanup_errors=True, 139 ) as temp_dir: 140 os.environ[ENV_REQUEST_CACHE_PATH] = ( 141 temp_dir # set this as default directory for request_cache to store *.sqlite files 142 ) 143 if cmd == "spec": 144 message = AirbyteMessage(type=Type.SPEC, spec=source_spec) 145 yield from [ 146 self.airbyte_message_to_string(queued_message) 147 for queued_message in self._emit_queued_messages(self.source) 148 ] 149 yield self.airbyte_message_to_string(message) 150 else: 151 raw_config = self.source.read_config(parsed_args.config) 152 config = self.source.configure(raw_config, temp_dir) 153 154 yield from [ 155 self.airbyte_message_to_string(queued_message) 156 for queued_message in self._emit_queued_messages(self.source) 157 ] 158 if cmd == "check": 159 yield from map( 160 AirbyteEntrypoint.airbyte_message_to_string, 161 self.check(source_spec, config), 162 ) 163 elif cmd == "discover": 164 yield from map( 165 AirbyteEntrypoint.airbyte_message_to_string, 166 self.discover(source_spec, config), 167 ) 168 elif cmd == "read": 169 config_catalog = self.source.read_catalog(parsed_args.catalog) 170 state = self.source.read_state(parsed_args.state) 171 172 yield from map( 173 AirbyteEntrypoint.airbyte_message_to_string, 174 self.read(source_spec, config, config_catalog, state), 175 ) 176 else: 177 raise Exception("Unexpected command " + cmd) 178 finally: 179 yield from [ 180 self.airbyte_message_to_string(queued_message) 181 for queued_message in self._emit_queued_messages(self.source) 182 ]
184 def check( 185 self, source_spec: ConnectorSpecification, config: TConfig 186 ) -> Iterable[AirbyteMessage]: 187 self.set_up_secret_filter(config, source_spec.connectionSpecification) 188 try: 189 self.validate_connection(source_spec, config) 190 except AirbyteTracedException as traced_exc: 191 connection_status = traced_exc.as_connection_status_message() 192 # The platform uses the exit code to surface unexpected failures so we raise the exception if the failure type not a config error 193 # If the failure is not exceptional, we'll emit a failed connection status message and return 194 if traced_exc.failure_type != FailureType.config_error: 195 raise traced_exc 196 if connection_status: 197 yield from self._emit_queued_messages(self.source) 198 yield connection_status 199 return 200 201 try: 202 check_result = self.source.check(self.logger, config) 203 except AirbyteTracedException as traced_exc: 204 yield traced_exc.as_airbyte_message() 205 # The platform uses the exit code to surface unexpected failures so we raise the exception if the failure type not a config error 206 # If the failure is not exceptional, we'll emit a failed connection status message and return 207 if traced_exc.failure_type != FailureType.config_error: 208 raise traced_exc 209 else: 210 yield AirbyteMessage( 211 type=Type.CONNECTION_STATUS, 212 connectionStatus=AirbyteConnectionStatus( 213 status=Status.FAILED, message=traced_exc.message 214 ), 215 ) 216 return 217 if check_result.status == Status.SUCCEEDED: 218 self.logger.info("Check succeeded") 219 else: 220 self.logger.error("Check failed") 221 222 yield from self._emit_queued_messages(self.source) 223 yield AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=check_result)
225 def discover( 226 self, source_spec: ConnectorSpecification, config: TConfig 227 ) -> Iterable[AirbyteMessage]: 228 self.set_up_secret_filter(config, source_spec.connectionSpecification) 229 if self.source.check_config_against_spec: 230 self.validate_connection(source_spec, config) 231 catalog = self.source.discover(self.logger, config) 232 233 yield from self._emit_queued_messages(self.source) 234 yield AirbyteMessage(type=Type.CATALOG, catalog=catalog)
236 def read( 237 self, source_spec: ConnectorSpecification, config: TConfig, catalog: Any, state: list[Any] 238 ) -> Iterable[AirbyteMessage]: 239 self.set_up_secret_filter(config, source_spec.connectionSpecification) 240 if self.source.check_config_against_spec: 241 self.validate_connection(source_spec, config) 242 243 # The Airbyte protocol dictates that counts be expressed as float/double to better protect against integer overflows 244 stream_message_counter: DefaultDict[HashableStreamDescriptor, float] = defaultdict(float) 245 for message in self.source.read(self.logger, config, catalog, state): 246 yield self.handle_record_counts(message, stream_message_counter) 247 for message in self._emit_queued_messages(self.source): 248 yield self.handle_record_counts(message, stream_message_counter)
250 @staticmethod 251 def handle_record_counts( 252 message: AirbyteMessage, stream_message_count: DefaultDict[HashableStreamDescriptor, float] 253 ) -> AirbyteMessage: 254 match message.type: 255 case Type.RECORD: 256 if message.record is None: 257 raise ValueError("Record message must have a record attribute") 258 259 stream_message_count[ 260 HashableStreamDescriptor( 261 name=message.record.stream, # type: ignore[union-attr] # record has `stream` 262 namespace=message.record.namespace, # type: ignore[union-attr] # record has `namespace` 263 ) 264 ] += 1.0 265 case Type.STATE: 266 if message.state is None: 267 raise ValueError("State message must have a state attribute") 268 269 stream_descriptor = message_utils.get_stream_descriptor(message) 270 271 # Set record count from the counter onto the state message 272 message.state.sourceStats = message.state.sourceStats or AirbyteStateStats() # type: ignore[union-attr] # state has `sourceStats` 273 message.state.sourceStats.recordCount = stream_message_count.get( # type: ignore[union-attr] # state has `sourceStats` 274 stream_descriptor, 0.0 275 ) 276 277 # Reset the counter 278 stream_message_count[stream_descriptor] = 0.0 279 return message
281 @staticmethod 282 def validate_connection(source_spec: ConnectorSpecification, config: TConfig) -> None: 283 # Remove internal flags from config before validating so 284 # jsonschema's additionalProperties flag won't fail the validation 285 connector_config, _ = split_config(config) 286 check_config_against_spec_or_exit(connector_config, source_spec)
288 @staticmethod 289 def set_up_secret_filter(config: TConfig, connection_specification: Mapping[str, Any]) -> None: 290 # Now that we have the config, we can use it to get a list of ai airbyte_secrets 291 # that we should filter in logging to avoid leaking secrets 292 config_secrets = get_secrets(connection_specification, config) 293 update_secrets(config_secrets)
295 @staticmethod 296 def airbyte_message_to_string(airbyte_message: AirbyteMessage) -> str: 297 global _HAS_LOGGED_FOR_SERIALIZATION_ERROR 298 serialized_message = AirbyteMessageSerializer.dump(airbyte_message) 299 try: 300 return orjson.dumps(serialized_message).decode() 301 except Exception as exception: 302 if not _HAS_LOGGED_FOR_SERIALIZATION_ERROR: 303 logger.warning( 304 f"There was an error during the serialization of an AirbyteMessage: `{exception}`. This might impact the sync performances." 305 ) 306 _HAS_LOGGED_FOR_SERIALIZATION_ERROR = True 307 return json.dumps(serialized_message)
479class AbstractAPIBudget(abc.ABC): 480 """Interface to some API where a client allowed to have N calls per T interval. 481 482 Important: APIBudget is not doing any API calls, the end user code is responsible to call this interface 483 to respect call rate limitation of the API. 484 485 It supports multiple policies applied to different group of requests. To distinct these groups we use RequestMatchers. 486 Individual policy represented by MovingWindowCallRatePolicy and currently supports only moving window strategy. 487 """ 488 489 @abc.abstractmethod 490 def acquire_call( 491 self, request: Any, block: bool = True, timeout: Optional[float] = None 492 ) -> None: 493 """Try to get a call from budget, will block by default 494 495 :param request: 496 :param block: when true (default) will block the current thread until call credit is available 497 :param timeout: if set will limit maximum time in block, otherwise will wait until credit is available 498 :raises: CallRateLimitHit - when no credits left and if timeout was set the waiting time exceed the timeout 499 """ 500 501 @abc.abstractmethod 502 def get_matching_policy(self, request: Any) -> Optional[AbstractCallRatePolicy]: 503 """Find matching call rate policy for specific request""" 504 505 @abc.abstractmethod 506 def update_from_response(self, request: Any, response: Any) -> None: 507 """Update budget information based on response from API 508 509 :param request: the initial request that triggered this response 510 :param response: response from the API 511 """
Interface to some API where a client is allowed to make N calls per T interval.
Important: APIBudget does not make any API calls itself; the end-user code is responsible for calling this interface to respect the API's call rate limits.
It supports multiple policies applied to different groups of requests. To distinguish these groups we use RequestMatchers. An individual policy is represented by MovingWindowCallRatePolicy, which currently supports only a moving window strategy.
489 @abc.abstractmethod 490 def acquire_call( 491 self, request: Any, block: bool = True, timeout: Optional[float] = None 492 ) -> None: 493 """Try to get a call from budget, will block by default 494 495 :param request: 496 :param block: when true (default) will block the current thread until call credit is available 497 :param timeout: if set will limit maximum time in block, otherwise will wait until credit is available 498 :raises: CallRateLimitHit - when no credits left and if timeout was set the waiting time exceed the timeout 499 """
Try to acquire a call from the budget; blocks by default.
Parameters
- request:
- block: when True (default), blocks the current thread until call credit is available
- timeout: if set, limits the maximum time spent blocking; otherwise waits until credit is available
Raises
- CallRateLimitHit - raised when no credits are left and, if a timeout was set, the waiting time would exceed it
501 @abc.abstractmethod 502 def get_matching_policy(self, request: Any) -> Optional[AbstractCallRatePolicy]: 503 """Find matching call rate policy for specific request"""
Find the matching call rate policy for a specific request
505 @abc.abstractmethod 506 def update_from_response(self, request: Any, response: Any) -> None: 507 """Update budget information based on response from API 508 509 :param request: the initial request that triggered this response 510 :param response: response from the API 511 """
Update budget information based on the response from the API
Parameters
- request: the initial request that triggered this response
- response: response from the API
13class AbstractHeaderAuthenticator(AuthBase): 14 """Abstract class for an header-based authenticators that add a header to outgoing HTTP requests.""" 15 16 def __call__(self, request: requests.PreparedRequest) -> Any: 17 """Attach the HTTP headers required to authenticate on the HTTP request""" 18 request.headers.update(self.get_auth_header()) 19 return request 20 21 def get_auth_header(self) -> Mapping[str, Any]: 22 """The header to set on outgoing HTTP requests""" 23 if self.auth_header: 24 return {self.auth_header: self.token} 25 return {} 26 27 @property 28 @abstractmethod 29 def auth_header(self) -> str: 30 """HTTP header to set on the requests""" 31 32 @property 33 @abstractmethod 34 def token(self) -> str: 35 """The header value to set on outgoing HTTP requests"""
Abstract class for header-based authenticators that add a header to outgoing HTTP requests.
21 def get_auth_header(self) -> Mapping[str, Any]: 22 """The header to set on outgoing HTTP requests""" 23 if self.auth_header: 24 return {self.auth_header: self.token} 25 return {}
The header to set on outgoing HTTP requests
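A minimal sketch of a concrete subclass that attaches a standard bearer token; the import path below is assumed to be where AbstractHeaderAuthenticator lives in this CDK version.

from airbyte_cdk.sources.streams.http.requests_native_auth.abstract_token import (  # import path assumed
    AbstractHeaderAuthenticator,
)

class StaticBearerAuthenticator(AbstractHeaderAuthenticator):
    """Attaches `Authorization: Bearer <token>` to every outgoing request."""

    def __init__(self, token: str):
        self._token = token

    @property
    def auth_header(self) -> str:
        return "Authorization"

    @property
    def token(self) -> str:
        return f"Bearer {self._token}"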
12class BaseBackoffException(requests.exceptions.HTTPError): 13 def __init__( 14 self, 15 request: requests.PreparedRequest, 16 response: Optional[Union[requests.Response, Exception]], 17 error_message: str = "", 18 ): 19 if isinstance(response, requests.Response): 20 error_message = ( 21 error_message 22 or f"Request URL: {request.url}, Response Code: {response.status_code}, Response Text: {response.text}" 23 ) 24 super().__init__(error_message, request=request, response=response) 25 else: 26 error_message = error_message or f"Request URL: {request.url}, Exception: {response}" 27 super().__init__(error_message, request=request, response=None)
An HTTP error occurred.
13 def __init__( 14 self, 15 request: requests.PreparedRequest, 16 response: Optional[Union[requests.Response, Exception]], 17 error_message: str = "", 18 ): 19 if isinstance(response, requests.Response): 20 error_message = ( 21 error_message 22 or f"Request URL: {request.url}, Response Code: {response.status_code}, Response Text: {response.text}" 23 ) 24 super().__init__(error_message, request=request, response=response) 25 else: 26 error_message = error_message or f"Request URL: {request.url}, Exception: {response}" 27 super().__init__(error_message, request=request, response=None)
704class CachedLimiterSession(requests_cache.CacheMixin, LimiterMixin, requests.Session): 705 """Session class with caching and rate-limiting behavior."""
Session class with caching and rate-limiting behavior.
An HTTP error occurred.
Inherited Members
34def default_backoff_handler( 35 max_tries: Optional[int], factor: float, max_time: Optional[int] = None, **kwargs: Any 36) -> Callable[[SendRequestCallableType], SendRequestCallableType]: 37 def log_retry_attempt(details: Mapping[str, Any]) -> None: 38 _, exc, _ = sys.exc_info() 39 if isinstance(exc, RequestException) and exc.response: 40 logger.info( 41 f"Status code: {exc.response.status_code!r}, Response Content: {exc.response.content!r}" 42 ) 43 logger.info( 44 f"Caught retryable error '{str(exc)}' after {details['tries']} tries. Waiting {details['wait']} seconds then retrying..." 45 ) 46 47 def should_give_up(exc: Exception) -> bool: 48 # If a non-rate-limiting related 4XX error makes it this far, it means it was unexpected and probably consistent, so we shouldn't back off 49 if isinstance(exc, RequestException): 50 if exc.response is not None: 51 give_up: bool = ( 52 exc.response is not None 53 and exc.response.status_code != codes.too_many_requests 54 and 400 <= exc.response.status_code < 500 55 ) 56 if give_up: 57 logger.info(f"Giving up for returned HTTP status: {exc.response.status_code!r}") 58 return give_up 59 # Only RequestExceptions are retryable, so if we get here, it's not retryable 60 return False 61 62 return backoff.on_exception( # type: ignore # Decorator function returns a function with a different signature than the input function, so mypy can't infer the type of the returned function 63 backoff.expo, 64 TRANSIENT_EXCEPTIONS, 65 jitter=None, 66 on_backoff=log_retry_attempt, 67 giveup=should_give_up, 68 max_tries=max_tries, 69 max_time=max_time, 70 factor=factor, 71 **kwargs, 72 )
631class HttpAPIBudget(APIBudget): 632 """Implementation of AbstractAPIBudget for HTTP""" 633 634 def __init__( 635 self, 636 ratelimit_reset_header: str = "ratelimit-reset", 637 ratelimit_remaining_header: str = "ratelimit-remaining", 638 status_codes_for_ratelimit_hit: list[int] = [429], 639 **kwargs: Any, 640 ): 641 """Constructor 642 643 :param ratelimit_reset_header: name of the header that has a timestamp of the next reset of call budget 644 :param ratelimit_remaining_header: name of the header that has the number of calls left 645 :param status_codes_for_ratelimit_hit: list of HTTP status codes that signal about rate limit being hit 646 """ 647 self._ratelimit_reset_header = ratelimit_reset_header 648 self._ratelimit_remaining_header = ratelimit_remaining_header 649 self._status_codes_for_ratelimit_hit = status_codes_for_ratelimit_hit 650 super().__init__(**kwargs) 651 652 def update_from_response(self, request: Any, response: Any) -> None: 653 policy = self.get_matching_policy(request) 654 if not policy: 655 return 656 657 if isinstance(response, requests.Response): 658 available_calls = self.get_calls_left_from_response(response) 659 reset_ts = self.get_reset_ts_from_response(response) 660 policy.update(available_calls=available_calls, call_reset_ts=reset_ts) 661 662 def get_reset_ts_from_response( 663 self, response: requests.Response 664 ) -> Optional[datetime.datetime]: 665 if response.headers.get(self._ratelimit_reset_header): 666 return datetime.datetime.fromtimestamp( 667 int(response.headers[self._ratelimit_reset_header]) 668 ) 669 return None 670 671 def get_calls_left_from_response(self, response: requests.Response) -> Optional[int]: 672 if response.headers.get(self._ratelimit_remaining_header): 673 return int(response.headers[self._ratelimit_remaining_header]) 674 675 if response.status_code in self._status_codes_for_ratelimit_hit: 676 return 0 677 678 return None
Implementation of AbstractAPIBudget for HTTP
634 def __init__( 635 self, 636 ratelimit_reset_header: str = "ratelimit-reset", 637 ratelimit_remaining_header: str = "ratelimit-remaining", 638 status_codes_for_ratelimit_hit: list[int] = [429], 639 **kwargs: Any, 640 ): 641 """Constructor 642 643 :param ratelimit_reset_header: name of the header that has a timestamp of the next reset of call budget 644 :param ratelimit_remaining_header: name of the header that has the number of calls left 645 :param status_codes_for_ratelimit_hit: list of HTTP status codes that signal about rate limit being hit 646 """ 647 self._ratelimit_reset_header = ratelimit_reset_header 648 self._ratelimit_remaining_header = ratelimit_remaining_header 649 self._status_codes_for_ratelimit_hit = status_codes_for_ratelimit_hit 650 super().__init__(**kwargs)
Constructor
Parameters
- ratelimit_reset_header: name of the header that contains the timestamp of the next call-budget reset
- ratelimit_remaining_header: name of the header that contains the number of calls left
- status_codes_for_ratelimit_hit: list of HTTP status codes that signal that the rate limit has been hit
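A hedged construction sketch: one moving-window policy of 100 calls per minute for a single host, read back through GitHub-style rate-limit headers. The class names come from this module (assumed importable from airbyte_cdk.sources.streams.call_rate); the host, header names, and limits are placeholders.

from datetime import timedelta

from airbyte_cdk.sources.streams.call_rate import (
    HttpAPIBudget,
    HttpRequestRegexMatcher,
    MovingWindowCallRatePolicy,
    Rate,
)

policy = MovingWindowCallRatePolicy(
    rates=[Rate(limit=100, interval=timedelta(minutes=1))],
    matchers=[HttpRequestRegexMatcher(url_base="https://api.example.com")],
)

api_budget = HttpAPIBudget(
    ratelimit_reset_header="X-RateLimit-Reset",
    ratelimit_remaining_header="X-RateLimit-Remaining",
    status_codes_for_ratelimit_hit=[429],
    policies=[policy],  # forwarded to the APIBudget base class
)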
652 def update_from_response(self, request: Any, response: Any) -> None: 653 policy = self.get_matching_policy(request) 654 if not policy: 655 return 656 657 if isinstance(response, requests.Response): 658 available_calls = self.get_calls_left_from_response(response) 659 reset_ts = self.get_reset_ts_from_response(response) 660 policy.update(available_calls=available_calls, call_reset_ts=reset_ts)
Update budget information based on the API response.
Parameters
- request: the initial request that triggered this response
- response: response from the API
671 def get_calls_left_from_response(self, response: requests.Response) -> Optional[int]: 672 if response.headers.get(self._ratelimit_remaining_header): 673 return int(response.headers[self._ratelimit_remaining_header]) 674 675 if response.status_code in self._status_codes_for_ratelimit_hit: 676 return 0 677 678 return None
Inherited Members
103class HttpRequestMatcher(RequestMatcher): 104 """Simple implementation of RequestMatcher for HTTP requests using HttpRequestRegexMatcher under the hood.""" 105 106 def __init__( 107 self, 108 method: Optional[str] = None, 109 url: Optional[str] = None, 110 params: Optional[Mapping[str, Any]] = None, 111 headers: Optional[Mapping[str, Any]] = None, 112 ): 113 """Constructor 114 115 :param method: HTTP method (e.g., "GET", "POST"). 116 :param url: Full URL to match. 117 :param params: Dictionary of query parameters to match. 118 :param headers: Dictionary of headers to match. 119 """ 120 # Parse the URL to extract the base and path 121 if url: 122 parsed_url = parse.urlsplit(url) 123 url_base = f"{parsed_url.scheme}://{parsed_url.netloc}" 124 url_path = parsed_url.path if parsed_url.path != "/" else None 125 else: 126 url_base = None 127 url_path = None 128 129 # Use HttpRequestRegexMatcher under the hood 130 self._regex_matcher = HttpRequestRegexMatcher( 131 method=method, 132 url_base=url_base, 133 url_path_pattern=re.escape(url_path) if url_path else None, 134 params=params, 135 headers=headers, 136 ) 137 138 def __call__(self, request: Any) -> bool: 139 """ 140 :param request: A requests.Request or requests.PreparedRequest instance. 141 :return: True if the request matches all provided criteria; False otherwise. 142 """ 143 return self._regex_matcher(request) 144 145 def __str__(self) -> str: 146 return ( 147 f"HttpRequestMatcher(method={self._regex_matcher._method}, " 148 f"url={self._regex_matcher._url_base}{self._regex_matcher._url_path_pattern.pattern if self._regex_matcher._url_path_pattern else ''}, " 149 f"params={self._regex_matcher._params}, headers={self._regex_matcher._headers})" 150 )
Simple implementation of RequestMatcher for HTTP requests using HttpRequestRegexMatcher under the hood.
106 def __init__( 107 self, 108 method: Optional[str] = None, 109 url: Optional[str] = None, 110 params: Optional[Mapping[str, Any]] = None, 111 headers: Optional[Mapping[str, Any]] = None, 112 ): 113 """Constructor 114 115 :param method: HTTP method (e.g., "GET", "POST"). 116 :param url: Full URL to match. 117 :param params: Dictionary of query parameters to match. 118 :param headers: Dictionary of headers to match. 119 """ 120 # Parse the URL to extract the base and path 121 if url: 122 parsed_url = parse.urlsplit(url) 123 url_base = f"{parsed_url.scheme}://{parsed_url.netloc}" 124 url_path = parsed_url.path if parsed_url.path != "/" else None 125 else: 126 url_base = None 127 url_path = None 128 129 # Use HttpRequestRegexMatcher under the hood 130 self._regex_matcher = HttpRequestRegexMatcher( 131 method=method, 132 url_base=url_base, 133 url_path_pattern=re.escape(url_path) if url_path else None, 134 params=params, 135 headers=headers, 136 )
Constructor
Parameters
- method: HTTP method (e.g., "GET", "POST").
- url: Full URL to match.
- params: Dictionary of query parameters to match.
- headers: Dictionary of headers to match.
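A small usage sketch: the matcher is called with a requests.Request or requests.PreparedRequest and returns True only if every provided criterion matches. The endpoint and query parameters are placeholders for illustration.

import requests

from airbyte_cdk.sources.streams.call_rate import HttpRequestMatcher

matcher = HttpRequestMatcher(
    method="GET",
    url="https://api.example.com/v1/users",
    params={"page": "1"},
)

request = requests.Request(
    "GET", "https://api.example.com/v1/users", params={"page": "1"}
).prepare()

assert matcher(request)  # method, URL, and params all match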
45class HttpStream(Stream, CheckpointMixin, ABC): 46 """ 47 Base abstract class for an Airbyte Stream using the HTTP protocol. Basic building block for users building an Airbyte source for a HTTP API. 48 """ 49 50 source_defined_cursor = True # Most HTTP streams use a source defined cursor (i.e: the user can't configure it like on a SQL table) 51 page_size: Optional[int] = ( 52 None # Use this variable to define page size for API http requests with pagination support 53 ) 54 55 def __init__( 56 self, authenticator: Optional[AuthBase] = None, api_budget: Optional[APIBudget] = None 57 ): 58 self._exit_on_rate_limit: bool = False 59 self._http_client = HttpClient( 60 name=self.name, 61 logger=self.logger, 62 error_handler=self.get_error_handler(), 63 api_budget=api_budget or APIBudget(policies=[]), 64 authenticator=authenticator, 65 use_cache=self.use_cache, 66 backoff_strategy=self.get_backoff_strategy(), 67 message_repository=InMemoryMessageRepository(), 68 ) 69 70 # There are three conditions that dictate if RFR should automatically be applied to a stream 71 # 1. Streams that explicitly initialize their own cursor should defer to it and not automatically apply RFR 72 # 2. Streams with at least one cursor_field are incremental and thus a superior sync to RFR. 73 # 3. Streams overriding read_records() do not guarantee that they will call the parent implementation which can perform 74 # per-page checkpointing so RFR is only supported if a stream use the default `HttpStream.read_records()` method 75 if ( 76 not self.cursor 77 and len(self.cursor_field) == 0 78 and type(self).read_records is HttpStream.read_records 79 ): 80 self.cursor = ResumableFullRefreshCursor() 81 82 @property 83 def exit_on_rate_limit(self) -> bool: 84 """ 85 :return: False if the stream will retry endlessly when rate limited 86 """ 87 return self._exit_on_rate_limit 88 89 @exit_on_rate_limit.setter 90 def exit_on_rate_limit(self, value: bool) -> None: 91 self._exit_on_rate_limit = value 92 93 @property 94 def cache_filename(self) -> str: 95 """ 96 Override if needed. Return the name of cache file 97 Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only. 98 """ 99 return f"{self.name}.sqlite" 100 101 @property 102 def use_cache(self) -> bool: 103 """ 104 Override if needed. If True, all records will be cached. 105 Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only. 106 """ 107 return False 108 109 @property 110 @abstractmethod 111 def url_base(self) -> str: 112 """ 113 :return: URL base for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/" 114 """ 115 116 @property 117 def http_method(self) -> str: 118 """ 119 Override if needed. See get_request_data/get_request_json if using POST/PUT/PATCH. 120 """ 121 return "GET" 122 123 @property 124 @deprecated( 125 "Deprecated as of CDK version 3.0.0. " 126 "You should set error_handler explicitly in HttpStream.get_error_handler() instead." 127 ) 128 def raise_on_http_errors(self) -> bool: 129 """ 130 Override if needed. If set to False, allows opting-out of raising HTTP code exception. 131 """ 132 return True 133 134 @property 135 @deprecated( 136 "Deprecated as of CDK version 3.0.0. " 137 "You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead." 138 ) 139 def max_retries(self) -> Union[int, None]: 140 """ 141 Override if needed. 
Specifies maximum amount of retries for backoff policy. Return None for no limit. 142 """ 143 return 5 144 145 @property 146 @deprecated( 147 "Deprecated as of CDK version 3.0.0. " 148 "You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead." 149 ) 150 def max_time(self) -> Union[int, None]: 151 """ 152 Override if needed. Specifies maximum total waiting time (in seconds) for backoff policy. Return None for no limit. 153 """ 154 return 60 * 10 155 156 @property 157 @deprecated( 158 "Deprecated as of CDK version 3.0.0. " 159 "You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead." 160 ) 161 def retry_factor(self) -> float: 162 """ 163 Override if needed. Specifies factor for backoff policy. 164 """ 165 return 5 166 167 @abstractmethod 168 def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: 169 """ 170 Override this method to define a pagination strategy. 171 172 The value returned from this method is passed to most other methods in this class. Use it to form a request e.g: set headers or query params. 173 174 :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response. 175 """ 176 177 @abstractmethod 178 def path( 179 self, 180 *, 181 stream_state: Optional[Mapping[str, Any]] = None, 182 stream_slice: Optional[Mapping[str, Any]] = None, 183 next_page_token: Optional[Mapping[str, Any]] = None, 184 ) -> str: 185 """ 186 Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity" 187 """ 188 189 def request_params( 190 self, 191 stream_state: Optional[Mapping[str, Any]], 192 stream_slice: Optional[Mapping[str, Any]] = None, 193 next_page_token: Optional[Mapping[str, Any]] = None, 194 ) -> MutableMapping[str, Any]: 195 """ 196 Override this method to define the query parameters that should be set on an outgoing HTTP request given the inputs. 197 198 E.g: you might want to define query parameters for paging if next_page_token is not None. 199 """ 200 return {} 201 202 def request_headers( 203 self, 204 stream_state: Optional[Mapping[str, Any]], 205 stream_slice: Optional[Mapping[str, Any]] = None, 206 next_page_token: Optional[Mapping[str, Any]] = None, 207 ) -> Mapping[str, Any]: 208 """ 209 Override to return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method. 210 """ 211 return {} 212 213 def request_body_data( 214 self, 215 stream_state: Optional[Mapping[str, Any]], 216 stream_slice: Optional[Mapping[str, Any]] = None, 217 next_page_token: Optional[Mapping[str, Any]] = None, 218 ) -> Optional[Union[Mapping[str, Any], str]]: 219 """ 220 Override when creating POST/PUT/PATCH requests to populate the body of the request with a non-JSON payload. 221 222 If returns a ready text that it will be sent as is. 223 If returns a dict that it will be converted to a urlencoded form. 224 E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" 225 226 At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 
227 """ 228 return None 229 230 def request_body_json( 231 self, 232 stream_state: Optional[Mapping[str, Any]], 233 stream_slice: Optional[Mapping[str, Any]] = None, 234 next_page_token: Optional[Mapping[str, Any]] = None, 235 ) -> Optional[Mapping[str, Any]]: 236 """ 237 Override when creating POST/PUT/PATCH requests to populate the body of the request with a JSON payload. 238 239 At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 240 """ 241 return None 242 243 def request_kwargs( 244 self, 245 stream_state: Optional[Mapping[str, Any]], 246 stream_slice: Optional[Mapping[str, Any]] = None, 247 next_page_token: Optional[Mapping[str, Any]] = None, 248 ) -> Mapping[str, Any]: 249 """ 250 Override to return a mapping of keyword arguments to be used when creating the HTTP request. 251 Any option listed in https://docs.python-requests.org/en/latest/api/#requests.adapters.BaseAdapter.send for can be returned from 252 this method. Note that these options do not conflict with request-level options such as headers, request params, etc.. 253 """ 254 return {} 255 256 @abstractmethod 257 def parse_response( 258 self, 259 response: requests.Response, 260 *, 261 stream_state: Mapping[str, Any], 262 stream_slice: Optional[Mapping[str, Any]] = None, 263 next_page_token: Optional[Mapping[str, Any]] = None, 264 ) -> Iterable[Mapping[str, Any]]: 265 """ 266 Parses the raw response object into a list of records. 267 By default, this returns an iterable containing the input. Override to parse differently. 268 :param response: 269 :param stream_state: 270 :param stream_slice: 271 :param next_page_token: 272 :return: An iterable containing the parsed response 273 """ 274 275 def get_backoff_strategy(self) -> Optional[Union[BackoffStrategy, List[BackoffStrategy]]]: 276 """ 277 Used to initialize Adapter to avoid breaking changes. 278 If Stream has a `backoff_time` method implementation, we know this stream uses old (pre-HTTPClient) backoff handlers and thus an adapter is needed. 279 280 Override to provide custom BackoffStrategy 281 :return Optional[BackoffStrategy]: 282 """ 283 if hasattr(self, "backoff_time"): 284 return HttpStreamAdapterBackoffStrategy(self) 285 else: 286 return None 287 288 def get_error_handler(self) -> Optional[ErrorHandler]: 289 """ 290 Used to initialize Adapter to avoid breaking changes. 291 If Stream has a `should_retry` method implementation, we know this stream uses old (pre-HTTPClient) error handlers and thus an adapter is needed. 292 293 Override to provide custom ErrorHandler 294 :return Optional[ErrorHandler]: 295 """ 296 if hasattr(self, "should_retry"): 297 error_handler = HttpStreamAdapterHttpStatusErrorHandler( 298 stream=self, 299 logger=logging.getLogger(), 300 max_retries=self.max_retries, 301 max_time=timedelta(seconds=self.max_time or 0), 302 ) 303 return error_handler 304 else: 305 return None 306 307 @classmethod 308 def _join_url(cls, url_base: str, path: str) -> str: 309 return urljoin(url_base, path) 310 311 @classmethod 312 def parse_response_error_message(cls, response: requests.Response) -> Optional[str]: 313 """ 314 Parses the raw response object from a failed request into a user-friendly error message. 315 By default, this method tries to grab the error message from JSON responses by following common API patterns. Override to parse differently. 
316 317 :param response: 318 :return: A user-friendly message that indicates the cause of the error 319 """ 320 321 # default logic to grab error from common fields 322 def _try_get_error(value: Optional[JsonType]) -> Optional[str]: 323 if isinstance(value, str): 324 return value 325 elif isinstance(value, list): 326 errors_in_value = [_try_get_error(v) for v in value] 327 return ", ".join(v for v in errors_in_value if v is not None) 328 elif isinstance(value, dict): 329 new_value = ( 330 value.get("message") 331 or value.get("messages") 332 or value.get("error") 333 or value.get("errors") 334 or value.get("failures") 335 or value.get("failure") 336 or value.get("detail") 337 ) 338 return _try_get_error(new_value) 339 return None 340 341 try: 342 body = response.json() 343 return _try_get_error(body) 344 except requests.exceptions.JSONDecodeError: 345 return None 346 347 def get_error_display_message(self, exception: BaseException) -> Optional[str]: 348 """ 349 Retrieves the user-friendly display message that corresponds to an exception. 350 This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage. 351 352 The default implementation of this method only handles HTTPErrors by passing the response to self.parse_response_error_message(). 353 The method should be overriden as needed to handle any additional exception types. 354 355 :param exception: The exception that was raised 356 :return: A user-friendly message that indicates the cause of the error 357 """ 358 if isinstance(exception, requests.HTTPError) and exception.response is not None: 359 return self.parse_response_error_message(exception.response) 360 return None 361 362 def read_records( 363 self, 364 sync_mode: SyncMode, 365 cursor_field: Optional[List[str]] = None, 366 stream_slice: Optional[Mapping[str, Any]] = None, 367 stream_state: Optional[Mapping[str, Any]] = None, 368 ) -> Iterable[StreamData]: 369 # A cursor_field indicates this is an incremental stream which offers better checkpointing than RFR enabled via the cursor 370 if self.cursor_field or not isinstance(self.get_cursor(), ResumableFullRefreshCursor): 371 yield from self._read_pages( 372 lambda req, res, state, _slice: self.parse_response( 373 res, stream_slice=_slice, stream_state=state 374 ), 375 stream_slice, 376 stream_state, 377 ) 378 else: 379 yield from self._read_single_page( 380 lambda req, res, state, _slice: self.parse_response( 381 res, stream_slice=_slice, stream_state=state 382 ), 383 stream_slice, 384 stream_state, 385 ) 386 387 @property 388 def state(self) -> MutableMapping[str, Any]: 389 cursor = self.get_cursor() 390 if cursor: 391 return cursor.get_stream_state() # type: ignore 392 return self._state 393 394 @state.setter 395 def state(self, value: MutableMapping[str, Any]) -> None: 396 cursor = self.get_cursor() 397 if cursor: 398 cursor.set_initial_state(value) 399 self._state = value 400 401 def get_cursor(self) -> Optional[Cursor]: 402 # I don't love that this is semi-stateful but not sure what else to do. We don't know exactly what type of cursor to 403 # instantiate when creating the class. We can make a few assumptions like if there is a cursor_field which implies 404 # incremental, but we don't know until runtime if this is a substream. 
Ideally, a stream should explicitly define 405 # its cursor, but because we're trying to automatically apply RFR we're stuck with this logic where we replace the 406 # cursor at runtime once we detect this is a substream based on self.has_multiple_slices being reassigned 407 if self.has_multiple_slices and isinstance(self.cursor, ResumableFullRefreshCursor): 408 self.cursor = SubstreamResumableFullRefreshCursor() 409 return self.cursor 410 else: 411 return self.cursor 412 413 def _read_pages( 414 self, 415 records_generator_fn: Callable[ 416 [ 417 requests.PreparedRequest, 418 requests.Response, 419 Mapping[str, Any], 420 Optional[Mapping[str, Any]], 421 ], 422 Iterable[StreamData], 423 ], 424 stream_slice: Optional[Mapping[str, Any]] = None, 425 stream_state: Optional[Mapping[str, Any]] = None, 426 ) -> Iterable[StreamData]: 427 stream_state = stream_state or {} 428 pagination_complete = False 429 next_page_token = None 430 while not pagination_complete: 431 request, response = self._fetch_next_page(stream_slice, stream_state, next_page_token) 432 yield from records_generator_fn(request, response, stream_state, stream_slice) 433 434 next_page_token = self.next_page_token(response) 435 if not next_page_token: 436 pagination_complete = True 437 438 cursor = self.get_cursor() 439 if cursor and isinstance(cursor, SubstreamResumableFullRefreshCursor): 440 partition, _, _ = self._extract_slice_fields(stream_slice=stream_slice) 441 # Substreams checkpoint state by marking an entire parent partition as completed so that on the subsequent attempt 442 # after a failure, completed parents are skipped and the sync can make progress 443 cursor.close_slice(StreamSlice(cursor_slice={}, partition=partition)) 444 445 # Always return an empty generator just in case no records were ever yielded 446 yield from [] 447 448 def _read_single_page( 449 self, 450 records_generator_fn: Callable[ 451 [ 452 requests.PreparedRequest, 453 requests.Response, 454 Mapping[str, Any], 455 Optional[Mapping[str, Any]], 456 ], 457 Iterable[StreamData], 458 ], 459 stream_slice: Optional[Mapping[str, Any]] = None, 460 stream_state: Optional[Mapping[str, Any]] = None, 461 ) -> Iterable[StreamData]: 462 partition, cursor_slice, remaining_slice = self._extract_slice_fields( 463 stream_slice=stream_slice 464 ) 465 stream_state = stream_state or {} 466 next_page_token = cursor_slice or None 467 468 request, response = self._fetch_next_page(remaining_slice, stream_state, next_page_token) 469 yield from records_generator_fn(request, response, stream_state, remaining_slice) 470 471 next_page_token = self.next_page_token(response) or { 472 "__ab_full_refresh_sync_complete": True 473 } 474 475 cursor = self.get_cursor() 476 if cursor: 477 cursor.close_slice(StreamSlice(cursor_slice=next_page_token, partition=partition)) 478 479 # Always return an empty generator just in case no records were ever yielded 480 yield from [] 481 482 @staticmethod 483 def _extract_slice_fields( 484 stream_slice: Optional[Mapping[str, Any]], 485 ) -> tuple[Mapping[str, Any], Mapping[str, Any], Mapping[str, Any]]: 486 if not stream_slice: 487 return {}, {}, {} 488 489 if isinstance(stream_slice, StreamSlice): 490 partition = stream_slice.partition 491 cursor_slice = stream_slice.cursor_slice 492 remaining = {k: v for k, v in stream_slice.items()} 493 else: 494 # RFR streams that implement stream_slices() to generate stream slices in the legacy mapping format are converted into a 495 # structured stream slice mapping by the LegacyCursorBasedCheckpointReader. 
The structured mapping object has separate 496 # fields for the partition and cursor_slice value 497 partition = stream_slice.get("partition", {}) 498 cursor_slice = stream_slice.get("cursor_slice", {}) 499 remaining = { 500 key: val 501 for key, val in stream_slice.items() 502 if key != "partition" and key != "cursor_slice" 503 } 504 return partition, cursor_slice, remaining 505 506 def _fetch_next_page( 507 self, 508 stream_slice: Optional[Mapping[str, Any]] = None, 509 stream_state: Optional[Mapping[str, Any]] = None, 510 next_page_token: Optional[Mapping[str, Any]] = None, 511 ) -> Tuple[requests.PreparedRequest, requests.Response]: 512 request, response = self._http_client.send_request( 513 http_method=self.http_method, 514 url=self._join_url( 515 self.url_base, 516 self.path( 517 stream_state=stream_state, 518 stream_slice=stream_slice, 519 next_page_token=next_page_token, 520 ), 521 ), 522 request_kwargs=self.request_kwargs( 523 stream_state=stream_state, 524 stream_slice=stream_slice, 525 next_page_token=next_page_token, 526 ), 527 headers=self.request_headers( 528 stream_state=stream_state, 529 stream_slice=stream_slice, 530 next_page_token=next_page_token, 531 ), 532 params=self.request_params( 533 stream_state=stream_state, 534 stream_slice=stream_slice, 535 next_page_token=next_page_token, 536 ), 537 json=self.request_body_json( 538 stream_state=stream_state, 539 stream_slice=stream_slice, 540 next_page_token=next_page_token, 541 ), 542 data=self.request_body_data( 543 stream_state=stream_state, 544 stream_slice=stream_slice, 545 next_page_token=next_page_token, 546 ), 547 dedupe_query_params=True, 548 log_formatter=self.get_log_formatter(), 549 exit_on_rate_limit=self.exit_on_rate_limit, 550 ) 551 552 return request, response 553 554 def get_log_formatter(self) -> Optional[Callable[[requests.Response], Any]]: 555 """ 556 557 :return Optional[Callable[[requests.Response], Any]]: Function that will be used in logging inside HttpClient 558 """ 559 return None
Base abstract class for an Airbyte Stream using the HTTP protocol. Basic building block for users building an Airbyte source for an HTTP API.
82 @property 83 def exit_on_rate_limit(self) -> bool: 84 """ 85 :return: False if the stream will retry endlessly when rate limited 86 """ 87 return self._exit_on_rate_limit
Returns
False if the stream will retry endlessly when rate limited
93 @property 94 def cache_filename(self) -> str: 95 """ 96 Override if needed. Return the name of cache file 97 Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only. 98 """ 99 return f"{self.name}.sqlite"
Override if needed. Return the name of the cache file. Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only.
101 @property 102 def use_cache(self) -> bool: 103 """ 104 Override if needed. If True, all records will be cached. 105 Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only. 106 """ 107 return False
Override if needed. If True, all records will be cached. Note that if the environment variable REQUEST_CACHE_PATH is not set, the cache will be in-memory only.
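Opting into caching is just a property override, as sketched below (shown as it would appear inside an HttpStream subclass); whether the cache is persisted to disk depends on the REQUEST_CACHE_PATH environment variable.

@property
def use_cache(self) -> bool:
    return True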
109 @property 110 @abstractmethod 111 def url_base(self) -> str: 112 """ 113 :return: URL base for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/" 114 """
Returns
URL base for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/"
116 @property 117 def http_method(self) -> str: 118 """ 119 Override if needed. See get_request_data/get_request_json if using POST/PUT/PATCH. 120 """ 121 return "GET"
Override if needed. See get_request_data/get_request_json if using POST/PUT/PATCH.
123 @property 124 @deprecated( 125 "Deprecated as of CDK version 3.0.0. " 126 "You should set error_handler explicitly in HttpStream.get_error_handler() instead." 127 ) 128 def raise_on_http_errors(self) -> bool: 129 """ 130 Override if needed. If set to False, allows opting-out of raising HTTP code exception. 131 """ 132 return True
Override if needed. If set to False, allows opting out of raising exceptions on HTTP error status codes.
134 @property 135 @deprecated( 136 "Deprecated as of CDK version 3.0.0. " 137 "You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead." 138 ) 139 def max_retries(self) -> Union[int, None]: 140 """ 141 Override if needed. Specifies maximum amount of retries for backoff policy. Return None for no limit. 142 """ 143 return 5
Override if needed. Specifies the maximum number of retries for the backoff policy. Return None for no limit.
145 @property 146 @deprecated( 147 "Deprecated as of CDK version 3.0.0. " 148 "You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead." 149 ) 150 def max_time(self) -> Union[int, None]: 151 """ 152 Override if needed. Specifies maximum total waiting time (in seconds) for backoff policy. Return None for no limit. 153 """ 154 return 60 * 10
Override if needed. Specifies maximum total waiting time (in seconds) for backoff policy. Return None for no limit.
156 @property 157 @deprecated( 158 "Deprecated as of CDK version 3.0.0. " 159 "You should set backoff_strategies explicitly in HttpStream.get_backoff_strategy() instead." 160 ) 161 def retry_factor(self) -> float: 162 """ 163 Override if needed. Specifies factor for backoff policy. 164 """ 165 return 5
Override if needed. Specifies factor for backoff policy.
167 @abstractmethod 168 def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: 169 """ 170 Override this method to define a pagination strategy. 171 172 The value returned from this method is passed to most other methods in this class. Use it to form a request e.g: set headers or query params. 173 174 :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response. 175 """
Override this method to define a pagination strategy.
The value returned from this method is passed to most other methods in this class. Use it to form a request e.g: set headers or query params.
Returns
The token for the next page from the input response object. Returning None means there are no more pages to read in this response.
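A hedged sketch of a cursor-based pagination strategy, assuming the API returns a JSON body with an optional next_cursor field (a made-up field name). The method would live on your HttpStream subclass.

from typing import Any, Mapping, Optional

import requests

def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
    next_cursor = response.json().get("next_cursor")
    # Returning None signals that there are no more pages to read.
    return {"cursor": next_cursor} if next_cursor else None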
177 @abstractmethod 178 def path( 179 self, 180 *, 181 stream_state: Optional[Mapping[str, Any]] = None, 182 stream_slice: Optional[Mapping[str, Any]] = None, 183 next_page_token: Optional[Mapping[str, Any]] = None, 184 ) -> str: 185 """ 186 Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity" 187 """
Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity"
189 def request_params( 190 self, 191 stream_state: Optional[Mapping[str, Any]], 192 stream_slice: Optional[Mapping[str, Any]] = None, 193 next_page_token: Optional[Mapping[str, Any]] = None, 194 ) -> MutableMapping[str, Any]: 195 """ 196 Override this method to define the query parameters that should be set on an outgoing HTTP request given the inputs. 197 198 E.g: you might want to define query parameters for paging if next_page_token is not None. 199 """ 200 return {}
Override this method to define the query parameters that should be set on an outgoing HTTP request given the inputs.
E.g: you might want to define query parameters for paging if next_page_token is not None.
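For example, a sketch that feeds the pagination token back as query parameters; the limit parameter name is an assumption, and the method would live on your HttpStream subclass.

from typing import Any, Mapping, MutableMapping, Optional

def request_params(
    self,
    stream_state: Optional[Mapping[str, Any]],
    stream_slice: Optional[Mapping[str, Any]] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> MutableMapping[str, Any]:
    params: MutableMapping[str, Any] = {"limit": 100}
    if next_page_token:
        # Pass through whatever next_page_token() returned, e.g. {"cursor": "..."}.
        params.update(next_page_token)
    return params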
202 def request_headers( 203 self, 204 stream_state: Optional[Mapping[str, Any]], 205 stream_slice: Optional[Mapping[str, Any]] = None, 206 next_page_token: Optional[Mapping[str, Any]] = None, 207 ) -> Mapping[str, Any]: 208 """ 209 Override to return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method. 210 """ 211 return {}
Override to return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.
213 def request_body_data( 214 self, 215 stream_state: Optional[Mapping[str, Any]], 216 stream_slice: Optional[Mapping[str, Any]] = None, 217 next_page_token: Optional[Mapping[str, Any]] = None, 218 ) -> Optional[Union[Mapping[str, Any], str]]: 219 """ 220 Override when creating POST/PUT/PATCH requests to populate the body of the request with a non-JSON payload. 221 222 If returns a ready text that it will be sent as is. 223 If returns a dict that it will be converted to a urlencoded form. 224 E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" 225 226 At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 227 """ 228 return None
Override when creating POST/PUT/PATCH requests to populate the body of the request with a non-JSON payload.
If a string is returned, it will be sent as-is. If a dict is returned, it will be converted to a urlencoded form. E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2"
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
230 def request_body_json( 231 self, 232 stream_state: Optional[Mapping[str, Any]], 233 stream_slice: Optional[Mapping[str, Any]] = None, 234 next_page_token: Optional[Mapping[str, Any]] = None, 235 ) -> Optional[Mapping[str, Any]]: 236 """ 237 Override when creating POST/PUT/PATCH requests to populate the body of the request with a JSON payload. 238 239 At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 240 """ 241 return None
Override when creating POST/PUT/PATCH requests to populate the body of the request with a JSON payload.
Note that only one of the 'request_body_data' and 'request_body_json' functions can be overridden.
243 def request_kwargs( 244 self, 245 stream_state: Optional[Mapping[str, Any]], 246 stream_slice: Optional[Mapping[str, Any]] = None, 247 next_page_token: Optional[Mapping[str, Any]] = None, 248 ) -> Mapping[str, Any]: 249 """ 250 Override to return a mapping of keyword arguments to be used when creating the HTTP request. 251 Any option listed in https://docs.python-requests.org/en/latest/api/#requests.adapters.BaseAdapter.send for can be returned from 252 this method. Note that these options do not conflict with request-level options such as headers, request params, etc.. 253 """ 254 return {}
Override to return a mapping of keyword arguments to be used when creating the HTTP request. Any option listed in https://docs.python-requests.org/en/latest/api/#requests.adapters.BaseAdapter.send can be returned from this method. Note that these options do not conflict with request-level options such as headers, request params, etc.
256 @abstractmethod 257 def parse_response( 258 self, 259 response: requests.Response, 260 *, 261 stream_state: Mapping[str, Any], 262 stream_slice: Optional[Mapping[str, Any]] = None, 263 next_page_token: Optional[Mapping[str, Any]] = None, 264 ) -> Iterable[Mapping[str, Any]]: 265 """ 266 Parses the raw response object into a list of records. 267 By default, this returns an iterable containing the input. Override to parse differently. 268 :param response: 269 :param stream_state: 270 :param stream_slice: 271 :param next_page_token: 272 :return: An iterable containing the parsed response 273 """
Parses the raw response object into a list of records. By default, this returns an iterable containing the input. Override to parse differently.
Parameters
- response:
- stream_state:
- stream_slice:
- next_page_token:
Returns
An iterable containing the parsed response
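Putting the abstract members together, here is a minimal single-page stream sketch; the base URL, endpoint, and the "data" field are assumptions for illustration, not a real API.

from typing import Any, Iterable, Mapping, Optional

import requests

from airbyte_cdk.sources.streams.http import HttpStream

class Users(HttpStream):
    url_base = "https://api.example.com/v1/"
    primary_key = "id"

    def path(self, **kwargs: Any) -> str:
        return "users"

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        return None  # single page: no pagination

    def parse_response(
        self, response: requests.Response, **kwargs: Any
    ) -> Iterable[Mapping[str, Any]]:
        # One record per element of the top-level "data" array.
        yield from response.json().get("data", [])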
275 def get_backoff_strategy(self) -> Optional[Union[BackoffStrategy, List[BackoffStrategy]]]: 276 """ 277 Used to initialize Adapter to avoid breaking changes. 278 If Stream has a `backoff_time` method implementation, we know this stream uses old (pre-HTTPClient) backoff handlers and thus an adapter is needed. 279 280 Override to provide custom BackoffStrategy 281 :return Optional[BackoffStrategy]: 282 """ 283 if hasattr(self, "backoff_time"): 284 return HttpStreamAdapterBackoffStrategy(self) 285 else: 286 return None
Used to initialize Adapter to avoid breaking changes.
If the Stream has a backoff_time method implementation, we know this stream uses old (pre-HTTPClient) backoff handlers and thus an adapter is needed.
Override to provide custom BackoffStrategy
Returns
288 def get_error_handler(self) -> Optional[ErrorHandler]: 289 """ 290 Used to initialize Adapter to avoid breaking changes. 291 If Stream has a `should_retry` method implementation, we know this stream uses old (pre-HTTPClient) error handlers and thus an adapter is needed. 292 293 Override to provide custom ErrorHandler 294 :return Optional[ErrorHandler]: 295 """ 296 if hasattr(self, "should_retry"): 297 error_handler = HttpStreamAdapterHttpStatusErrorHandler( 298 stream=self, 299 logger=logging.getLogger(), 300 max_retries=self.max_retries, 301 max_time=timedelta(seconds=self.max_time or 0), 302 ) 303 return error_handler 304 else: 305 return None
Used to initialize Adapter to avoid breaking changes.
If the Stream has a should_retry method implementation, we know this stream uses old (pre-HTTPClient) error handlers and thus an adapter is needed.
Override to provide custom ErrorHandler
Returns
311 @classmethod 312 def parse_response_error_message(cls, response: requests.Response) -> Optional[str]: 313 """ 314 Parses the raw response object from a failed request into a user-friendly error message. 315 By default, this method tries to grab the error message from JSON responses by following common API patterns. Override to parse differently. 316 317 :param response: 318 :return: A user-friendly message that indicates the cause of the error 319 """ 320 321 # default logic to grab error from common fields 322 def _try_get_error(value: Optional[JsonType]) -> Optional[str]: 323 if isinstance(value, str): 324 return value 325 elif isinstance(value, list): 326 errors_in_value = [_try_get_error(v) for v in value] 327 return ", ".join(v for v in errors_in_value if v is not None) 328 elif isinstance(value, dict): 329 new_value = ( 330 value.get("message") 331 or value.get("messages") 332 or value.get("error") 333 or value.get("errors") 334 or value.get("failures") 335 or value.get("failure") 336 or value.get("detail") 337 ) 338 return _try_get_error(new_value) 339 return None 340 341 try: 342 body = response.json() 343 return _try_get_error(body) 344 except requests.exceptions.JSONDecodeError: 345 return None
Parses the raw response object from a failed request into a user-friendly error message. By default, this method tries to grab the error message from JSON responses by following common API patterns. Override to parse differently.
Parameters
- response:
Returns
A user-friendly message that indicates the cause of the error
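A sketch of an override for an API that nests its error text under an assumed error.description field, shown as it would appear inside an HttpStream subclass.

from typing import Optional

import requests

@classmethod
def parse_response_error_message(cls, response: requests.Response) -> Optional[str]:
    try:
        error = response.json().get("error")
        return error.get("description") if isinstance(error, dict) else None
    except requests.exceptions.JSONDecodeError:
        return None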
347 def get_error_display_message(self, exception: BaseException) -> Optional[str]: 348 """ 349 Retrieves the user-friendly display message that corresponds to an exception. 350 This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage. 351 352 The default implementation of this method only handles HTTPErrors by passing the response to self.parse_response_error_message(). 353 The method should be overriden as needed to handle any additional exception types. 354 355 :param exception: The exception that was raised 356 :return: A user-friendly message that indicates the cause of the error 357 """ 358 if isinstance(exception, requests.HTTPError) and exception.response is not None: 359 return self.parse_response_error_message(exception.response) 360 return None
Retrieves the user-friendly display message that corresponds to an exception. This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage.
The default implementation of this method only handles HTTPErrors by passing the response to self.parse_response_error_message(). The method should be overridden as needed to handle any additional exception types.
Parameters
- exception: The exception that was raised
Returns
A user-friendly message that indicates the cause of the error
362 def read_records( 363 self, 364 sync_mode: SyncMode, 365 cursor_field: Optional[List[str]] = None, 366 stream_slice: Optional[Mapping[str, Any]] = None, 367 stream_state: Optional[Mapping[str, Any]] = None, 368 ) -> Iterable[StreamData]: 369 # A cursor_field indicates this is an incremental stream which offers better checkpointing than RFR enabled via the cursor 370 if self.cursor_field or not isinstance(self.get_cursor(), ResumableFullRefreshCursor): 371 yield from self._read_pages( 372 lambda req, res, state, _slice: self.parse_response( 373 res, stream_slice=_slice, stream_state=state 374 ), 375 stream_slice, 376 stream_state, 377 ) 378 else: 379 yield from self._read_single_page( 380 lambda req, res, state, _slice: self.parse_response( 381 res, stream_slice=_slice, stream_state=state 382 ), 383 stream_slice, 384 stream_state, 385 )
This method should be overridden by subclasses to read records based on the inputs.
387 @property 388 def state(self) -> MutableMapping[str, Any]: 389 cursor = self.get_cursor() 390 if cursor: 391 return cursor.get_stream_state() # type: ignore 392 return self._state
State getter; should return the state in a form that can be serialized to a string and sent to the output as a STATE AirbyteMessage.
A good example of a state is a cursor_value: { self.cursor_field: "cursor_value" }
State should be as small as possible but at the same time descriptive enough to restore the syncing process from the point where it stopped.
401 def get_cursor(self) -> Optional[Cursor]: 402 # I don't love that this is semi-stateful but not sure what else to do. We don't know exactly what type of cursor to 403 # instantiate when creating the class. We can make a few assumptions like if there is a cursor_field which implies 404 # incremental, but we don't know until runtime if this is a substream. Ideally, a stream should explicitly define 405 # its cursor, but because we're trying to automatically apply RFR we're stuck with this logic where we replace the 406 # cursor at runtime once we detect this is a substream based on self.has_multiple_slices being reassigned 407 if self.has_multiple_slices and isinstance(self.cursor, ResumableFullRefreshCursor): 408 self.cursor = SubstreamResumableFullRefreshCursor() 409 return self.cursor 410 else: 411 return self.cursor
A Cursor is an interface that a stream can implement to manage how its internal state is read and updated while reading records. Historically, Python connectors had no concept of a cursor to manage state. Python streams need to define a cursor implementation and override this method to manage state through a Cursor.
554 def get_log_formatter(self) -> Optional[Callable[[requests.Response], Any]]: 555 """ 556 557 :return Optional[Callable[[requests.Response], Any]]: Function that will be used in logging inside HttpClient 558 """ 559 return None
Returns
Function that will be used in logging inside HttpClient
Inherited Members
562class HttpSubStream(HttpStream, ABC): 563 def __init__(self, parent: HttpStream, **kwargs: Any): 564 """ 565 :param parent: should be the instance of HttpStream class 566 """ 567 super().__init__(**kwargs) 568 self.parent = parent 569 self.has_multiple_slices = ( 570 True # Substreams are based on parent records which implies there are multiple slices 571 ) 572 573 # There are three conditions that dictate if RFR should automatically be applied to a stream 574 # 1. Streams that explicitly initialize their own cursor should defer to it and not automatically apply RFR 575 # 2. Streams with at least one cursor_field are incremental and thus a superior sync to RFR. 576 # 3. Streams overriding read_records() do not guarantee that they will call the parent implementation which can perform 577 # per-page checkpointing so RFR is only supported if a stream use the default `HttpStream.read_records()` method 578 if ( 579 not self.cursor 580 and len(self.cursor_field) == 0 581 and type(self).read_records is HttpStream.read_records 582 ): 583 self.cursor = SubstreamResumableFullRefreshCursor() 584 585 def stream_slices( 586 self, 587 sync_mode: SyncMode, 588 cursor_field: Optional[List[str]] = None, 589 stream_state: Optional[Mapping[str, Any]] = None, 590 ) -> Iterable[Optional[Mapping[str, Any]]]: 591 # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does 592 # not support either substreams or RFR, but something that needs to be considered once we do 593 for parent_record in self.parent.read_only_records(stream_state): 594 # Skip non-records (eg AirbyteLogMessage) 595 if isinstance(parent_record, AirbyteMessage): 596 if parent_record.type == MessageType.RECORD: 597 parent_record = parent_record.record.data # type: ignore [assignment, union-attr] # Incorrect type for assignment 598 else: 599 continue 600 elif isinstance(parent_record, Record): 601 parent_record = parent_record.data 602 yield {"parent": parent_record}
Base abstract class for an Airbyte Stream using the HTTP protocol. Basic building block for users building an Airbyte source for an HTTP API.
563 def __init__(self, parent: HttpStream, **kwargs: Any): 564 """ 565 :param parent: should be the instance of HttpStream class 566 """ 567 super().__init__(**kwargs) 568 self.parent = parent 569 self.has_multiple_slices = ( 570 True # Substreams are based on parent records which implies there are multiple slices 571 ) 572 573 # There are three conditions that dictate if RFR should automatically be applied to a stream 574 # 1. Streams that explicitly initialize their own cursor should defer to it and not automatically apply RFR 575 # 2. Streams with at least one cursor_field are incremental and thus a superior sync to RFR. 576 # 3. Streams overriding read_records() do not guarantee that they will call the parent implementation which can perform 577 # per-page checkpointing so RFR is only supported if a stream use the default `HttpStream.read_records()` method 578 if ( 579 not self.cursor 580 and len(self.cursor_field) == 0 581 and type(self).read_records is HttpStream.read_records 582 ): 583 self.cursor = SubstreamResumableFullRefreshCursor()
Parameters
- parent: should be an instance of the HttpStream class
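A hedged sketch of a substream whose path depends on the parent record; the users/{id}/posts endpoint and field names are assumptions, and the parent is expected to be an already-constructed HttpStream instance (e.g. UserPosts(parent=Users())).

from typing import Any, Iterable, Mapping, Optional

import requests

from airbyte_cdk.sources.streams.http import HttpSubStream

class UserPosts(HttpSubStream):
    url_base = "https://api.example.com/v1/"
    primary_key = "id"

    def path(self, stream_slice: Optional[Mapping[str, Any]] = None, **kwargs: Any) -> str:
        # stream_slices() yields {"parent": <parent record>} for each parent record.
        return f"users/{stream_slice['parent']['id']}/posts"

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        return None

    def parse_response(
        self, response: requests.Response, **kwargs: Any
    ) -> Iterable[Mapping[str, Any]]:
        yield from response.json().get("data", [])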
585 def stream_slices( 586 self, 587 sync_mode: SyncMode, 588 cursor_field: Optional[List[str]] = None, 589 stream_state: Optional[Mapping[str, Any]] = None, 590 ) -> Iterable[Optional[Mapping[str, Any]]]: 591 # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does 592 # not support either substreams or RFR, but something that needs to be considered once we do 593 for parent_record in self.parent.read_only_records(stream_state): 594 # Skip non-records (eg AirbyteLogMessage) 595 if isinstance(parent_record, AirbyteMessage): 596 if parent_record.type == MessageType.RECORD: 597 parent_record = parent_record.record.data # type: ignore [assignment, union-attr] # Incorrect type for assignment 598 else: 599 continue 600 elif isinstance(parent_record, Record): 601 parent_record = parent_record.data 602 yield {"parent": parent_record}
Override to define the slices for this stream. See the stream slicing section of the docs for more information.
Parameters
- sync_mode:
- cursor_field:
- stream_state:
Returns
Inherited Members
- HttpStream
- source_defined_cursor
- page_size
- exit_on_rate_limit
- cache_filename
- use_cache
- url_base
- http_method
- raise_on_http_errors
- max_retries
- max_time
- retry_factor
- next_page_token
- path
- request_params
- request_headers
- request_body_data
- request_body_json
- request_kwargs
- parse_response
- get_backoff_strategy
- get_error_handler
- parse_response_error_message
- get_error_display_message
- read_records
- state
- get_cursor
- get_log_formatter
700class LimiterSession(LimiterMixin, requests.Session): 701 """Session that adds rate-limiting behavior to requests."""
Session that adds rate-limiting behavior to requests.
Inherited Members
396class MovingWindowCallRatePolicy(BaseCallRatePolicy): 397 """ 398 Policy to control requests rate implemented on top of PyRateLimiter lib. 399 The main difference between this policy and FixedWindowCallRatePolicy is that the rate-limiting window 400 is moving along requests that we made, and there is no moment when we reset an available number of calls. 401 This strategy requires saving of timestamps of all requests within a window. 402 """ 403 404 def __init__(self, rates: list[Rate], matchers: list[RequestMatcher]): 405 """Constructor 406 407 :param rates: list of rates, the order is important and must be ascending 408 :param matchers: 409 """ 410 if not rates: 411 raise ValueError("The list of rates can not be empty") 412 pyrate_rates = [ 413 PyRateRate(limit=rate.limit, interval=int(rate.interval.total_seconds() * 1000)) 414 for rate in rates 415 ] 416 self._bucket = InMemoryBucket(pyrate_rates) 417 # Limiter will create the background task that clears old requests in the bucket 418 self._limiter = Limiter(self._bucket) 419 super().__init__(matchers=matchers) 420 421 def try_acquire(self, request: Any, weight: int) -> None: 422 if not self.matches(request): 423 raise ValueError("Request does not match the policy") 424 425 try: 426 self._limiter.try_acquire(request, weight=weight) 427 except BucketFullException as exc: 428 item = self._limiter.bucket_factory.wrap_item(request, weight) 429 assert isinstance(item, RateItem) 430 431 with self._limiter.lock: 432 time_to_wait = self._bucket.waiting(item) 433 assert isinstance(time_to_wait, int) 434 435 raise CallRateLimitHit( 436 error=str(exc.meta_info["error"]), 437 item=request, 438 weight=int(exc.meta_info["weight"]), 439 rate=str(exc.meta_info["rate"]), 440 time_to_wait=timedelta(milliseconds=time_to_wait), 441 ) 442 443 def update( 444 self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime] 445 ) -> None: 446 """Adjust call bucket to reflect the state of the API server 447 448 :param available_calls: 449 :param call_reset_ts: 450 :return: 451 """ 452 if ( 453 available_calls is not None and call_reset_ts is None 454 ): # we do our best to sync buckets with API 455 if available_calls == 0: 456 with self._limiter.lock: 457 items_to_add = self._bucket.count() < self._bucket.rates[0].limit 458 if items_to_add > 0: 459 now: int = TimeClock().now() # type: ignore[no-untyped-call] 460 self._bucket.put(RateItem(name="dummy", timestamp=now, weight=items_to_add)) 461 # TODO: add support if needed, it might be that it is not possible to make a good solution for this case 462 # if available_calls is not None and call_reset_ts is not None: 463 # ts = call_reset_ts.timestamp() 464 465 def __str__(self) -> str: 466 """Return a human-friendly description of the moving window rate policy for logging purposes.""" 467 rates_info = ", ".join( 468 f"{rate.limit} per {timedelta(milliseconds=rate.interval)}" 469 for rate in self._bucket.rates 470 ) 471 current_bucket_count = self._bucket.count() 472 matcher_str = ", ".join(f"{matcher}" for matcher in self._matchers) 473 return ( 474 f"MovingWindowCallRatePolicy(rates=[{rates_info}], current_bucket_count={current_bucket_count}, " 475 f"matchers=[{matcher_str}])" 476 )
Policy to control the request rate, implemented on top of the PyRateLimiter library. The main difference between this policy and FixedWindowCallRatePolicy is that the rate-limiting window moves along with the requests that were made, and there is no moment at which the available number of calls is reset. This strategy requires saving the timestamps of all requests within a window.
404 def __init__(self, rates: list[Rate], matchers: list[RequestMatcher]): 405 """Constructor 406 407 :param rates: list of rates, the order is important and must be ascending 408 :param matchers: 409 """ 410 if not rates: 411 raise ValueError("The list of rates can not be empty") 412 pyrate_rates = [ 413 PyRateRate(limit=rate.limit, interval=int(rate.interval.total_seconds() * 1000)) 414 for rate in rates 415 ] 416 self._bucket = InMemoryBucket(pyrate_rates) 417 # Limiter will create the background task that clears old requests in the bucket 418 self._limiter = Limiter(self._bucket) 419 super().__init__(matchers=matchers)
Constructor
Parameters
- rates: list of rates, the order is important and must be ascending
- matchers: list of request matchers that determine which requests this policy applies to
421 def try_acquire(self, request: Any, weight: int) -> None: 422 if not self.matches(request): 423 raise ValueError("Request does not match the policy") 424 425 try: 426 self._limiter.try_acquire(request, weight=weight) 427 except BucketFullException as exc: 428 item = self._limiter.bucket_factory.wrap_item(request, weight) 429 assert isinstance(item, RateItem) 430 431 with self._limiter.lock: 432 time_to_wait = self._bucket.waiting(item) 433 assert isinstance(time_to_wait, int) 434 435 raise CallRateLimitHit( 436 error=str(exc.meta_info["error"]), 437 item=request, 438 weight=int(exc.meta_info["weight"]), 439 rate=str(exc.meta_info["rate"]), 440 time_to_wait=timedelta(milliseconds=time_to_wait), 441 )
Try to acquire capacity for a request; raises CallRateLimitHit if the call rate limit has been reached.
Parameters
- request: a request object representing a single call to API
- weight: number of requests to deduct from credit
Returns
443 def update( 444 self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime] 445 ) -> None: 446 """Adjust call bucket to reflect the state of the API server 447 448 :param available_calls: 449 :param call_reset_ts: 450 :return: 451 """ 452 if ( 453 available_calls is not None and call_reset_ts is None 454 ): # we do our best to sync buckets with API 455 if available_calls == 0: 456 with self._limiter.lock: 457 items_to_add = self._bucket.count() < self._bucket.rates[0].limit 458 if items_to_add > 0: 459 now: int = TimeClock().now() # type: ignore[no-untyped-call] 460 self._bucket.put(RateItem(name="dummy", timestamp=now, weight=items_to_add)) 461 # TODO: add support if needed, it might be that it is not possible to make a good solution for this case 462 # if available_calls is not None and call_reset_ts is not None: 463 # ts = call_reset_ts.timestamp()
Adjust call bucket to reflect the state of the API server
Parameters
- available_calls: number of calls still allowed by the API server, if reported
- call_reset_ts: timestamp at which the API server resets its call allowance, if reported
Returns
26class Oauth2Authenticator(AbstractOauth2Authenticator): 27 """ 28 Generates OAuth2.0 access tokens from an OAuth2.0 refresh token and client credentials. 29 The generated access token is attached to each request via the Authorization header. 30 If a connector_config is provided any mutation of it's value in the scope of this class will emit AirbyteControlConnectorConfigMessage. 31 """ 32 33 def __init__( 34 self, 35 token_refresh_endpoint: str, 36 client_id: str, 37 client_secret: str, 38 refresh_token: str, 39 client_id_name: str = "client_id", 40 client_secret_name: str = "client_secret", 41 refresh_token_name: str = "refresh_token", 42 scopes: List[str] | None = None, 43 token_expiry_date: AirbyteDateTime | None = None, 44 token_expiry_date_format: str | None = None, 45 access_token_name: str = "access_token", 46 expires_in_name: str = "expires_in", 47 refresh_request_body: Mapping[str, Any] | None = None, 48 refresh_request_headers: Mapping[str, Any] | None = None, 49 grant_type_name: str = "grant_type", 50 grant_type: str = "refresh_token", 51 token_expiry_is_time_of_expiration: bool = False, 52 refresh_token_error_status_codes: Tuple[int, ...] = (), 53 refresh_token_error_key: str = "", 54 refresh_token_error_values: Tuple[str, ...] = (), 55 ) -> None: 56 self._token_refresh_endpoint = token_refresh_endpoint 57 self._client_secret_name = client_secret_name 58 self._client_secret = client_secret 59 self._client_id_name = client_id_name 60 self._client_id = client_id 61 self._refresh_token_name = refresh_token_name 62 self._refresh_token = refresh_token 63 self._scopes = scopes 64 self._access_token_name = access_token_name 65 self._expires_in_name = expires_in_name 66 self._refresh_request_body = refresh_request_body 67 self._refresh_request_headers = refresh_request_headers 68 self._grant_type_name = grant_type_name 69 self._grant_type = grant_type 70 71 self._token_expiry_date = token_expiry_date or (ab_datetime_now() - timedelta(days=1)) 72 self._token_expiry_date_format = token_expiry_date_format 73 self._token_expiry_is_time_of_expiration = token_expiry_is_time_of_expiration 74 self._access_token = None 75 super().__init__( 76 refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values 77 ) 78 79 def get_token_refresh_endpoint(self) -> str: 80 return self._token_refresh_endpoint 81 82 def get_client_id_name(self) -> str: 83 return self._client_id_name 84 85 def get_client_id(self) -> str: 86 return self._client_id 87 88 def get_client_secret_name(self) -> str: 89 return self._client_secret_name 90 91 def get_client_secret(self) -> str: 92 return self._client_secret 93 94 def get_refresh_token_name(self) -> str: 95 return self._refresh_token_name 96 97 def get_refresh_token(self) -> str: 98 return self._refresh_token 99 100 def get_access_token_name(self) -> str: 101 return self._access_token_name 102 103 def get_scopes(self) -> list[str]: 104 return self._scopes # type: ignore[return-value] 105 106 def get_expires_in_name(self) -> str: 107 return self._expires_in_name 108 109 def get_refresh_request_body(self) -> Mapping[str, Any]: 110 return self._refresh_request_body # type: ignore[return-value] 111 112 def get_refresh_request_headers(self) -> Mapping[str, Any]: 113 return self._refresh_request_headers # type: ignore[return-value] 114 115 def get_grant_type_name(self) -> str: 116 return self._grant_type_name 117 118 def get_grant_type(self) -> str: 119 return self._grant_type 120 121 def get_token_expiry_date(self) -> AirbyteDateTime: 122 return 
self._token_expiry_date 123 124 def set_token_expiry_date(self, value: Union[str, int]) -> None: 125 self._token_expiry_date = self._parse_token_expiration_date(value) 126 127 @property 128 def token_expiry_is_time_of_expiration(self) -> bool: 129 return self._token_expiry_is_time_of_expiration 130 131 @property 132 def token_expiry_date_format(self) -> Optional[str]: 133 return self._token_expiry_date_format 134 135 @property 136 def access_token(self) -> str: 137 return self._access_token # type: ignore[return-value] 138 139 @access_token.setter 140 def access_token(self, value: str) -> None: 141 self._access_token = value # type: ignore[assignment] # Incorrect type for assignment
Generates OAuth2.0 access tokens from an OAuth2.0 refresh token and client credentials. The generated access token is attached to each request via the Authorization header. If a connector_config is provided, any mutation of its value within the scope of this class will emit an AirbyteControlConnectorConfigMessage.
33 def __init__( 34 self, 35 token_refresh_endpoint: str, 36 client_id: str, 37 client_secret: str, 38 refresh_token: str, 39 client_id_name: str = "client_id", 40 client_secret_name: str = "client_secret", 41 refresh_token_name: str = "refresh_token", 42 scopes: List[str] | None = None, 43 token_expiry_date: AirbyteDateTime | None = None, 44 token_expiry_date_format: str | None = None, 45 access_token_name: str = "access_token", 46 expires_in_name: str = "expires_in", 47 refresh_request_body: Mapping[str, Any] | None = None, 48 refresh_request_headers: Mapping[str, Any] | None = None, 49 grant_type_name: str = "grant_type", 50 grant_type: str = "refresh_token", 51 token_expiry_is_time_of_expiration: bool = False, 52 refresh_token_error_status_codes: Tuple[int, ...] = (), 53 refresh_token_error_key: str = "", 54 refresh_token_error_values: Tuple[str, ...] = (), 55 ) -> None: 56 self._token_refresh_endpoint = token_refresh_endpoint 57 self._client_secret_name = client_secret_name 58 self._client_secret = client_secret 59 self._client_id_name = client_id_name 60 self._client_id = client_id 61 self._refresh_token_name = refresh_token_name 62 self._refresh_token = refresh_token 63 self._scopes = scopes 64 self._access_token_name = access_token_name 65 self._expires_in_name = expires_in_name 66 self._refresh_request_body = refresh_request_body 67 self._refresh_request_headers = refresh_request_headers 68 self._grant_type_name = grant_type_name 69 self._grant_type = grant_type 70 71 self._token_expiry_date = token_expiry_date or (ab_datetime_now() - timedelta(days=1)) 72 self._token_expiry_date_format = token_expiry_date_format 73 self._token_expiry_is_time_of_expiration = token_expiry_is_time_of_expiration 74 self._access_token = None 75 super().__init__( 76 refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values 77 )
If refresh_token_error_status_codes, refresh_token_error_key, and refresh_token_error_values are all set, then HTTP errors matching those parameters will be wrapped in an AirbyteTracedException.
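As a rough illustration, the sketch below builds an Oauth2Authenticator from the constructor shown above and attaches it to a plain requests session. The import path, the placeholder endpoint and credentials, and the use of the authenticator as a requests auth object are assumptions.

```python
import requests

# Import path assumed; Oauth2Authenticator is defined as shown above.
from airbyte_cdk.sources.streams.http.requests_native_auth import Oauth2Authenticator

authenticator = Oauth2Authenticator(
    token_refresh_endpoint="https://example.com/oauth/token",  # hypothetical endpoint
    client_id="<client_id>",
    client_secret="<client_secret>",
    refresh_token="<refresh_token>",
    scopes=["read"],
)

# Per the docstring above, the access token is attached to each request via the
# Authorization header; here the authenticator is passed as a requests auth object.
session = requests.Session()
response = session.get("https://example.com/api/items", auth=authenticator)
```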
109 def get_refresh_request_body(self) -> Mapping[str, Any]: 110 return self._refresh_request_body # type: ignore[return-value]
Returns the request body to set on the refresh request
112 def get_refresh_request_headers(self) -> Mapping[str, Any]: 113 return self._refresh_request_headers # type: ignore[return-value]
Returns the request headers to set on the refresh request
124 def set_token_expiry_date(self, value: Union[str, int]) -> None: 125 self._token_expiry_date = self._parse_token_expiration_date(value)
Setter for access token expiration date
127 @property 128 def token_expiry_is_time_of_expiration(self) -> bool: 129 return self._token_expiry_is_time_of_expiration
Indicates that the token expiry value is the date until which the token will be valid, rather than the amount of time the token will be valid for.
131 @property 132 def token_expiry_date_format(self) -> Optional[str]: 133 return self._token_expiry_date_format
Format of the expiry datetime; used if expires_in is returned as the expiration datetime instead of the number of seconds until expiration.
33@dataclasses.dataclass 34class Rate: 35 """Call rate limit""" 36 37 limit: int 38 interval: timedelta
Call rate limit
144class SingleUseRefreshTokenOauth2Authenticator(Oauth2Authenticator): 145 """ 146 Authenticator that should be used for API implementing single use refresh tokens: 147 when refreshing access token some API returns a new refresh token that needs to used in the next refresh flow. 148 This authenticator updates the configuration with new refresh token by emitting Airbyte control message from an observed mutation. 149 By default, this authenticator expects a connector config with a "credentials" field with the following nested fields: client_id, 150 client_secret, refresh_token. This behavior can be changed by defining custom config path (using dpath paths) in client_id_config_path, 151 client_secret_config_path, refresh_token_config_path constructor arguments. 152 """ 153 154 def __init__( 155 self, 156 connector_config: Mapping[str, Any], 157 token_refresh_endpoint: str, 158 scopes: List[str] | None = None, 159 access_token_name: str = "access_token", 160 expires_in_name: str = "expires_in", 161 refresh_token_name: str = "refresh_token", 162 refresh_request_body: Mapping[str, Any] | None = None, 163 refresh_request_headers: Mapping[str, Any] | None = None, 164 grant_type_name: str = "grant_type", 165 grant_type: str = "refresh_token", 166 client_id_name: str = "client_id", 167 client_id: Optional[str] = None, 168 client_secret_name: str = "client_secret", 169 client_secret: Optional[str] = None, 170 access_token_config_path: Sequence[str] = ("credentials", "access_token"), 171 refresh_token_config_path: Sequence[str] = ("credentials", "refresh_token"), 172 token_expiry_date_config_path: Sequence[str] = ("credentials", "token_expiry_date"), 173 token_expiry_date_format: Optional[str] = None, 174 message_repository: MessageRepository = NoopMessageRepository(), 175 token_expiry_is_time_of_expiration: bool = False, 176 refresh_token_error_status_codes: Tuple[int, ...] = (), 177 refresh_token_error_key: str = "", 178 refresh_token_error_values: Tuple[str, ...] = (), 179 ) -> None: 180 """ 181 Args: 182 connector_config (Mapping[str, Any]): The full connector configuration 183 token_refresh_endpoint (str): Full URL to the token refresh endpoint 184 scopes (List[str], optional): List of OAuth scopes to pass in the refresh token request body. Defaults to None. 185 access_token_name (str, optional): Name of the access token field, used to parse the refresh token response. Defaults to "access_token". 186 expires_in_name (str, optional): Name of the name of the field that characterizes when the current access token will expire, used to parse the refresh token response. Defaults to "expires_in". 187 refresh_token_name (str, optional): Name of the name of the refresh token field, used to parse the refresh token response. Defaults to "refresh_token". 188 refresh_request_body (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request body. Defaults to None. 189 refresh_request_headers (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request headers. Defaults to None. 190 grant_type (str, optional): OAuth grant type. Defaults to "refresh_token". 191 client_id (Optional[str]): The client id to authenticate. If not specified, defaults to credentials.client_id in the config object. 192 client_secret (Optional[str]): The client secret to authenticate. If not specified, defaults to credentials.client_secret in the config object. 
193 access_token_config_path (Sequence[str]): Dpath to the access_token field in the connector configuration. Defaults to ("credentials", "access_token"). 194 refresh_token_config_path (Sequence[str]): Dpath to the refresh_token field in the connector configuration. Defaults to ("credentials", "refresh_token"). 195 token_expiry_date_config_path (Sequence[str]): Dpath to the token_expiry_date field in the connector configuration. Defaults to ("credentials", "token_expiry_date"). 196 token_expiry_date_format (Optional[str]): Date format of the token expiry date field (set by expires_in_name). If not specified the token expiry date is interpreted as number of seconds until expiration. 197 token_expiry_is_time_of_expiration bool: set True it if expires_in is returned as time of expiration instead of the number seconds until expiration 198 message_repository (MessageRepository): the message repository used to emit logs on HTTP requests and control message on config update 199 """ 200 self._connector_config = connector_config 201 self._client_id: str = self._get_config_value_by_path( 202 ("credentials", "client_id"), client_id 203 ) 204 self._client_secret: str = self._get_config_value_by_path( 205 ("credentials", "client_secret"), client_secret 206 ) 207 self._client_id_name = client_id_name 208 self._client_secret_name = client_secret_name 209 self._access_token_config_path = access_token_config_path 210 self._refresh_token_config_path = refresh_token_config_path 211 self._token_expiry_date_config_path = token_expiry_date_config_path 212 self._token_expiry_date_format = token_expiry_date_format 213 self._refresh_token_name = refresh_token_name 214 self._grant_type_name = grant_type_name 215 self._connector_config = connector_config 216 self.__message_repository = message_repository 217 super().__init__( 218 token_refresh_endpoint=token_refresh_endpoint, 219 client_id_name=self._client_id_name, 220 client_id=self._client_id, 221 client_secret_name=self._client_secret_name, 222 client_secret=self._client_secret, 223 refresh_token=self.get_refresh_token(), 224 refresh_token_name=self._refresh_token_name, 225 scopes=scopes, 226 token_expiry_date=self.get_token_expiry_date(), 227 access_token_name=access_token_name, 228 expires_in_name=expires_in_name, 229 refresh_request_body=refresh_request_body, 230 refresh_request_headers=refresh_request_headers, 231 grant_type_name=self._grant_type_name, 232 grant_type=grant_type, 233 token_expiry_date_format=token_expiry_date_format, 234 token_expiry_is_time_of_expiration=token_expiry_is_time_of_expiration, 235 refresh_token_error_status_codes=refresh_token_error_status_codes, 236 refresh_token_error_key=refresh_token_error_key, 237 refresh_token_error_values=refresh_token_error_values, 238 ) 239 240 @property 241 def access_token(self) -> str: 242 """ 243 Retrieve the access token from the configuration. 244 245 Returns: 246 str: The access token. 247 """ 248 return self._get_config_value_by_path(self._access_token_config_path) # type: ignore[return-value] 249 250 @access_token.setter 251 def access_token(self, new_access_token: str) -> None: 252 """ 253 Sets a new access token. 254 255 Args: 256 new_access_token (str): The new access token to be set. 257 """ 258 self._set_config_value_by_path(self._access_token_config_path, new_access_token) 259 260 def get_refresh_token(self) -> str: 261 """ 262 Retrieve the refresh token from the configuration. 
263 264 This method fetches the refresh token using the configuration path specified 265 by `_refresh_token_config_path`. 266 267 Returns: 268 str: The refresh token as a string. 269 """ 270 return self._get_config_value_by_path(self._refresh_token_config_path) # type: ignore[return-value] 271 272 def set_refresh_token(self, new_refresh_token: str) -> None: 273 """ 274 Updates the refresh token in the configuration. 275 276 Args: 277 new_refresh_token (str): The new refresh token to be set. 278 """ 279 self._set_config_value_by_path(self._refresh_token_config_path, new_refresh_token) 280 281 def get_token_expiry_date(self) -> AirbyteDateTime: 282 """ 283 Retrieves the token expiry date from the configuration. 284 285 This method fetches the token expiry date from the configuration using the specified path. 286 If the expiry date is an empty string, it returns the current date and time minus one day. 287 Otherwise, it parses the expiry date string into an AirbyteDateTime object. 288 289 Returns: 290 AirbyteDateTime: The parsed or calculated token expiry date. 291 292 Raises: 293 TypeError: If the result is not an instance of AirbyteDateTime. 294 """ 295 expiry_date = self._get_config_value_by_path(self._token_expiry_date_config_path) 296 result = ( 297 ab_datetime_now() - timedelta(days=1) 298 if expiry_date == "" 299 else ab_datetime_parse(str(expiry_date)) 300 ) 301 if isinstance(result, AirbyteDateTime): 302 return result 303 raise TypeError("Invalid datetime conversion") 304 305 def set_token_expiry_date(self, new_token_expiry_date: AirbyteDateTime) -> None: # type: ignore[override] 306 """ 307 Sets the token expiry date in the configuration. 308 309 Args: 310 new_token_expiry_date (AirbyteDateTime): The new expiry date for the token. 311 """ 312 self._set_config_value_by_path( 313 self._token_expiry_date_config_path, str(new_token_expiry_date) 314 ) 315 316 def token_has_expired(self) -> bool: 317 """Returns True if the token is expired""" 318 return ab_datetime_now() > self.get_token_expiry_date() 319 320 @staticmethod 321 def get_new_token_expiry_date( 322 access_token_expires_in: str, 323 token_expiry_date_format: str | None = None, 324 ) -> AirbyteDateTime: 325 """ 326 Calculate the new token expiry date based on the provided expiration duration or format. 327 328 Args: 329 access_token_expires_in (str): The duration (in seconds) until the access token expires, or the expiry date in a specific format. 330 token_expiry_date_format (str | None, optional): The format of the expiry date if provided. Defaults to None. 331 332 Returns: 333 AirbyteDateTime: The calculated expiry date of the access token. 334 """ 335 if token_expiry_date_format: 336 return ab_datetime_parse(access_token_expires_in) 337 else: 338 return ab_datetime_now() + timedelta(seconds=int(access_token_expires_in)) 339 340 def get_access_token(self) -> str: 341 """Retrieve new access and refresh token if the access token has expired. 342 The new refresh token is persisted with the set_refresh_token function 343 Returns: 344 str: The current access_token, updated if it was previously expired. 
345 """ 346 if self.token_has_expired(): 347 new_access_token, access_token_expires_in, new_refresh_token = ( 348 self.refresh_access_token() 349 ) 350 new_token_expiry_date: AirbyteDateTime = self.get_new_token_expiry_date( 351 access_token_expires_in, self._token_expiry_date_format 352 ) 353 self.access_token = new_access_token 354 self.set_refresh_token(new_refresh_token) 355 self.set_token_expiry_date(new_token_expiry_date) 356 self._emit_control_message() 357 return self.access_token 358 359 def refresh_access_token(self) -> Tuple[str, str, str]: # type: ignore[override] 360 """ 361 Refreshes the access token by making a handled request and extracting the necessary token information. 362 363 Returns: 364 Tuple[str, str, str]: A tuple containing the new access token, token expiry date, and refresh token. 365 """ 366 response_json = self._make_handled_request() 367 return ( 368 self._extract_access_token(response_json), 369 self._extract_token_expiry_date(response_json), 370 self._extract_refresh_token(response_json), 371 ) 372 373 def _set_config_value_by_path(self, config_path: Union[str, Sequence[str]], value: Any) -> None: 374 """ 375 Set a value in the connector configuration at the specified path. 376 377 Args: 378 config_path (Union[str, Sequence[str]]): The path within the configuration where the value should be set. 379 This can be a string representing a single key or a sequence of strings representing a nested path. 380 value (Any): The value to set at the specified path in the configuration. 381 382 Returns: 383 None 384 """ 385 dpath.new(self._connector_config, config_path, value) # type: ignore[arg-type] 386 387 def _get_config_value_by_path( 388 self, config_path: Union[str, Sequence[str]], default: Optional[str] = None 389 ) -> str | Any: 390 """ 391 Retrieve a value from the connector configuration using a specified path. 392 393 Args: 394 config_path (Union[str, Sequence[str]]): The path to the desired configuration value. This can be a string or a sequence of strings. 395 default (Optional[str], optional): The default value to return if the specified path does not exist in the configuration. Defaults to None. 396 397 Returns: 398 Any: The value from the configuration at the specified path, or the default value if the path does not exist. 399 """ 400 return dpath.get( 401 self._connector_config, # type: ignore[arg-type] 402 config_path, 403 default=default if default is not None else "", 404 ) 405 406 def _emit_control_message(self) -> None: 407 """ 408 Emits a control message based on the connector configuration. 409 410 This method checks if the message repository is not a NoopMessageRepository. 411 If it is not, it emits a message using the message repository. Otherwise, 412 it falls back to emitting the configuration as an Airbyte control message 413 directly to the console for backward compatibility. 414 415 Note: 416 The function `emit_configuration_as_airbyte_control_message` has been deprecated 417 in favor of the package `airbyte_cdk.sources.message`. 418 419 Raises: 420 TypeError: If the argument types are incorrect. 
421 """ 422 # FIXME emit_configuration_as_airbyte_control_message as been deprecated in favor of package airbyte_cdk.sources.message 423 # Usually, a class shouldn't care about the implementation details but to keep backward compatibility where we print the 424 # message directly in the console, this is needed 425 if not isinstance(self._message_repository, NoopMessageRepository): 426 self._message_repository.emit_message( 427 create_connector_config_control_message(self._connector_config) # type: ignore[arg-type] 428 ) 429 else: 430 emit_configuration_as_airbyte_control_message(self._connector_config) # type: ignore[arg-type] 431 432 @property 433 def _message_repository(self) -> MessageRepository: 434 """ 435 Overriding AbstractOauth2Authenticator._message_repository to allow for HTTP request logs 436 """ 437 return self.__message_repository
Authenticator that should be used for APIs implementing single-use refresh tokens: when refreshing the access token, some APIs return a new refresh token that must be used in the next refresh flow. This authenticator updates the configuration with the new refresh token by emitting an Airbyte control message from an observed mutation. By default, this authenticator expects a connector config with a "credentials" field containing the following nested fields: client_id, client_secret, refresh_token. This behavior can be changed by defining custom config paths (using dpath paths) via the client_id_config_path, client_secret_config_path, and refresh_token_config_path constructor arguments.
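A minimal construction sketch follows, assuming the default "credentials"-based config layout described above; the import path, the endpoint URL, and the credential values are placeholders for illustration.

```python
# Import path assumed; the class is defined as shown below.
from airbyte_cdk.sources.streams.http.requests_native_auth import (
    SingleUseRefreshTokenOauth2Authenticator,
)

# By default the authenticator reads credentials from config["credentials"].
connector_config = {
    "credentials": {
        "client_id": "<client_id>",
        "client_secret": "<client_secret>",
        "refresh_token": "<refresh_token>",
        "access_token": "",
        "token_expiry_date": "",
    }
}

authenticator = SingleUseRefreshTokenOauth2Authenticator(
    connector_config=connector_config,
    token_refresh_endpoint="https://example.com/oauth/token",  # hypothetical endpoint
)

# get_access_token() would refresh against the (placeholder) endpoint once the
# token has expired, persist the new refresh token back into connector_config,
# and emit an Airbyte control message with the updated configuration.
token = authenticator.get_access_token()
```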
154 def __init__( 155 self, 156 connector_config: Mapping[str, Any], 157 token_refresh_endpoint: str, 158 scopes: List[str] | None = None, 159 access_token_name: str = "access_token", 160 expires_in_name: str = "expires_in", 161 refresh_token_name: str = "refresh_token", 162 refresh_request_body: Mapping[str, Any] | None = None, 163 refresh_request_headers: Mapping[str, Any] | None = None, 164 grant_type_name: str = "grant_type", 165 grant_type: str = "refresh_token", 166 client_id_name: str = "client_id", 167 client_id: Optional[str] = None, 168 client_secret_name: str = "client_secret", 169 client_secret: Optional[str] = None, 170 access_token_config_path: Sequence[str] = ("credentials", "access_token"), 171 refresh_token_config_path: Sequence[str] = ("credentials", "refresh_token"), 172 token_expiry_date_config_path: Sequence[str] = ("credentials", "token_expiry_date"), 173 token_expiry_date_format: Optional[str] = None, 174 message_repository: MessageRepository = NoopMessageRepository(), 175 token_expiry_is_time_of_expiration: bool = False, 176 refresh_token_error_status_codes: Tuple[int, ...] = (), 177 refresh_token_error_key: str = "", 178 refresh_token_error_values: Tuple[str, ...] = (), 179 ) -> None: 180 """ 181 Args: 182 connector_config (Mapping[str, Any]): The full connector configuration 183 token_refresh_endpoint (str): Full URL to the token refresh endpoint 184 scopes (List[str], optional): List of OAuth scopes to pass in the refresh token request body. Defaults to None. 185 access_token_name (str, optional): Name of the access token field, used to parse the refresh token response. Defaults to "access_token". 186 expires_in_name (str, optional): Name of the name of the field that characterizes when the current access token will expire, used to parse the refresh token response. Defaults to "expires_in". 187 refresh_token_name (str, optional): Name of the name of the refresh token field, used to parse the refresh token response. Defaults to "refresh_token". 188 refresh_request_body (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request body. Defaults to None. 189 refresh_request_headers (Mapping[str, Any], optional): Custom key value pair that will be added to the refresh token request headers. Defaults to None. 190 grant_type (str, optional): OAuth grant type. Defaults to "refresh_token". 191 client_id (Optional[str]): The client id to authenticate. If not specified, defaults to credentials.client_id in the config object. 192 client_secret (Optional[str]): The client secret to authenticate. If not specified, defaults to credentials.client_secret in the config object. 193 access_token_config_path (Sequence[str]): Dpath to the access_token field in the connector configuration. Defaults to ("credentials", "access_token"). 194 refresh_token_config_path (Sequence[str]): Dpath to the refresh_token field in the connector configuration. Defaults to ("credentials", "refresh_token"). 195 token_expiry_date_config_path (Sequence[str]): Dpath to the token_expiry_date field in the connector configuration. Defaults to ("credentials", "token_expiry_date"). 196 token_expiry_date_format (Optional[str]): Date format of the token expiry date field (set by expires_in_name). If not specified the token expiry date is interpreted as number of seconds until expiration. 
197 token_expiry_is_time_of_expiration bool: set True it if expires_in is returned as time of expiration instead of the number seconds until expiration 198 message_repository (MessageRepository): the message repository used to emit logs on HTTP requests and control message on config update 199 """ 200 self._connector_config = connector_config 201 self._client_id: str = self._get_config_value_by_path( 202 ("credentials", "client_id"), client_id 203 ) 204 self._client_secret: str = self._get_config_value_by_path( 205 ("credentials", "client_secret"), client_secret 206 ) 207 self._client_id_name = client_id_name 208 self._client_secret_name = client_secret_name 209 self._access_token_config_path = access_token_config_path 210 self._refresh_token_config_path = refresh_token_config_path 211 self._token_expiry_date_config_path = token_expiry_date_config_path 212 self._token_expiry_date_format = token_expiry_date_format 213 self._refresh_token_name = refresh_token_name 214 self._grant_type_name = grant_type_name 215 self._connector_config = connector_config 216 self.__message_repository = message_repository 217 super().__init__( 218 token_refresh_endpoint=token_refresh_endpoint, 219 client_id_name=self._client_id_name, 220 client_id=self._client_id, 221 client_secret_name=self._client_secret_name, 222 client_secret=self._client_secret, 223 refresh_token=self.get_refresh_token(), 224 refresh_token_name=self._refresh_token_name, 225 scopes=scopes, 226 token_expiry_date=self.get_token_expiry_date(), 227 access_token_name=access_token_name, 228 expires_in_name=expires_in_name, 229 refresh_request_body=refresh_request_body, 230 refresh_request_headers=refresh_request_headers, 231 grant_type_name=self._grant_type_name, 232 grant_type=grant_type, 233 token_expiry_date_format=token_expiry_date_format, 234 token_expiry_is_time_of_expiration=token_expiry_is_time_of_expiration, 235 refresh_token_error_status_codes=refresh_token_error_status_codes, 236 refresh_token_error_key=refresh_token_error_key, 237 refresh_token_error_values=refresh_token_error_values, 238 )
Arguments:
- connector_config (Mapping[str, Any]): The full connector configuration
- token_refresh_endpoint (str): Full URL to the token refresh endpoint
- scopes (List[str], optional): List of OAuth scopes to pass in the refresh token request body. Defaults to None.
- access_token_name (str, optional): Name of the access token field, used to parse the refresh token response. Defaults to "access_token".
- expires_in_name (str, optional): Name of the field that indicates when the current access token will expire, used to parse the refresh token response. Defaults to "expires_in".
- refresh_token_name (str, optional): Name of the refresh token field, used to parse the refresh token response. Defaults to "refresh_token".
- refresh_request_body (Mapping[str, Any], optional): Custom key-value pairs that will be added to the refresh token request body. Defaults to None.
- refresh_request_headers (Mapping[str, Any], optional): Custom key-value pairs that will be added to the refresh token request headers. Defaults to None.
- grant_type (str, optional): OAuth grant type. Defaults to "refresh_token".
- client_id (Optional[str]): The client id to authenticate. If not specified, defaults to credentials.client_id in the config object.
- client_secret (Optional[str]): The client secret to authenticate. If not specified, defaults to credentials.client_secret in the config object.
- access_token_config_path (Sequence[str]): Dpath to the access_token field in the connector configuration. Defaults to ("credentials", "access_token").
- refresh_token_config_path (Sequence[str]): Dpath to the refresh_token field in the connector configuration. Defaults to ("credentials", "refresh_token").
- token_expiry_date_config_path (Sequence[str]): Dpath to the token_expiry_date field in the connector configuration. Defaults to ("credentials", "token_expiry_date").
- token_expiry_date_format (Optional[str]): Date format of the token expiry date field (set by expires_in_name). If not specified, the token expiry date is interpreted as the number of seconds until expiration.
- token_expiry_is_time_of_expiration (bool): Set to True if expires_in is returned as the time of expiration instead of the number of seconds until expiration.
- message_repository (MessageRepository): The message repository used to emit logs on HTTP requests and control messages on config updates.
240 @property 241 def access_token(self) -> str: 242 """ 243 Retrieve the access token from the configuration. 244 245 Returns: 246 str: The access token. 247 """ 248 return self._get_config_value_by_path(self._access_token_config_path) # type: ignore[return-value]
Retrieve the access token from the configuration.
Returns:
str: The access token.
260 def get_refresh_token(self) -> str: 261 """ 262 Retrieve the refresh token from the configuration. 263 264 This method fetches the refresh token using the configuration path specified 265 by `_refresh_token_config_path`. 266 267 Returns: 268 str: The refresh token as a string. 269 """ 270 return self._get_config_value_by_path(self._refresh_token_config_path) # type: ignore[return-value]
Retrieve the refresh token from the configuration.
This method fetches the refresh token using the configuration path specified by _refresh_token_config_path.
Returns:
str: The refresh token as a string.
272 def set_refresh_token(self, new_refresh_token: str) -> None: 273 """ 274 Updates the refresh token in the configuration. 275 276 Args: 277 new_refresh_token (str): The new refresh token to be set. 278 """ 279 self._set_config_value_by_path(self._refresh_token_config_path, new_refresh_token)
Updates the refresh token in the configuration.
Arguments:
- new_refresh_token (str): The new refresh token to be set.
281 def get_token_expiry_date(self) -> AirbyteDateTime: 282 """ 283 Retrieves the token expiry date from the configuration. 284 285 This method fetches the token expiry date from the configuration using the specified path. 286 If the expiry date is an empty string, it returns the current date and time minus one day. 287 Otherwise, it parses the expiry date string into an AirbyteDateTime object. 288 289 Returns: 290 AirbyteDateTime: The parsed or calculated token expiry date. 291 292 Raises: 293 TypeError: If the result is not an instance of AirbyteDateTime. 294 """ 295 expiry_date = self._get_config_value_by_path(self._token_expiry_date_config_path) 296 result = ( 297 ab_datetime_now() - timedelta(days=1) 298 if expiry_date == "" 299 else ab_datetime_parse(str(expiry_date)) 300 ) 301 if isinstance(result, AirbyteDateTime): 302 return result 303 raise TypeError("Invalid datetime conversion")
Retrieves the token expiry date from the configuration.
This method fetches the token expiry date from the configuration using the specified path. If the expiry date is an empty string, it returns the current date and time minus one day. Otherwise, it parses the expiry date string into an AirbyteDateTime object.
Returns:
AirbyteDateTime: The parsed or calculated token expiry date.
Raises:
- TypeError: If the result is not an instance of AirbyteDateTime.
305 def set_token_expiry_date(self, new_token_expiry_date: AirbyteDateTime) -> None: # type: ignore[override] 306 """ 307 Sets the token expiry date in the configuration. 308 309 Args: 310 new_token_expiry_date (AirbyteDateTime): The new expiry date for the token. 311 """ 312 self._set_config_value_by_path( 313 self._token_expiry_date_config_path, str(new_token_expiry_date) 314 )
Sets the token expiry date in the configuration.
Arguments:
- new_token_expiry_date (AirbyteDateTime): The new expiry date for the token.
316 def token_has_expired(self) -> bool: 317 """Returns True if the token is expired""" 318 return ab_datetime_now() > self.get_token_expiry_date()
Returns True if the token is expired
320 @staticmethod 321 def get_new_token_expiry_date( 322 access_token_expires_in: str, 323 token_expiry_date_format: str | None = None, 324 ) -> AirbyteDateTime: 325 """ 326 Calculate the new token expiry date based on the provided expiration duration or format. 327 328 Args: 329 access_token_expires_in (str): The duration (in seconds) until the access token expires, or the expiry date in a specific format. 330 token_expiry_date_format (str | None, optional): The format of the expiry date if provided. Defaults to None. 331 332 Returns: 333 AirbyteDateTime: The calculated expiry date of the access token. 334 """ 335 if token_expiry_date_format: 336 return ab_datetime_parse(access_token_expires_in) 337 else: 338 return ab_datetime_now() + timedelta(seconds=int(access_token_expires_in))
Calculate the new token expiry date based on the provided expiration duration or format.
Arguments:
- access_token_expires_in (str): The duration (in seconds) until the access token expires, or the expiry date in a specific format.
- token_expiry_date_format (str | None, optional): The format of the expiry date if provided. Defaults to None.
Returns:
AirbyteDateTime: The calculated expiry date of the access token.
340 def get_access_token(self) -> str: 341 """Retrieve new access and refresh token if the access token has expired. 342 The new refresh token is persisted with the set_refresh_token function 343 Returns: 344 str: The current access_token, updated if it was previously expired. 345 """ 346 if self.token_has_expired(): 347 new_access_token, access_token_expires_in, new_refresh_token = ( 348 self.refresh_access_token() 349 ) 350 new_token_expiry_date: AirbyteDateTime = self.get_new_token_expiry_date( 351 access_token_expires_in, self._token_expiry_date_format 352 ) 353 self.access_token = new_access_token 354 self.set_refresh_token(new_refresh_token) 355 self.set_token_expiry_date(new_token_expiry_date) 356 self._emit_control_message() 357 return self.access_token
Retrieve new access and refresh tokens if the access token has expired. The new refresh token is persisted with the set_refresh_token function.
Returns:
str: The current access_token, updated if it was previously expired.
359 def refresh_access_token(self) -> Tuple[str, str, str]: # type: ignore[override] 360 """ 361 Refreshes the access token by making a handled request and extracting the necessary token information. 362 363 Returns: 364 Tuple[str, str, str]: A tuple containing the new access token, token expiry date, and refresh token. 365 """ 366 response_json = self._make_handled_request() 367 return ( 368 self._extract_access_token(response_json), 369 self._extract_token_expiry_date(response_json), 370 self._extract_refresh_token(response_json), 371 )
Refreshes the access token by making a handled request and extracting the necessary token information.
Returns:
Tuple[str, str, str]: A tuple containing the new access token, token expiry date, and refresh token.
Inherited Members
- Oauth2Authenticator
- get_token_refresh_endpoint
- get_client_id_name
- get_client_id
- get_client_secret_name
- get_client_secret
- get_refresh_token_name
- get_access_token_name
- get_scopes
- get_expires_in_name
- get_refresh_request_body
- get_refresh_request_headers
- get_grant_type_name
- get_grant_type
- token_expiry_is_time_of_expiration
- token_expiry_date_format
39class TokenAuthenticator(AbstractHeaderAuthenticator): 40 """ 41 Builds auth header, based on the token provided. 42 The token is attached to each request via the `auth_header` header. 43 """ 44 45 @property 46 def auth_header(self) -> str: 47 return self._auth_header 48 49 @property 50 def token(self) -> str: 51 return f"{self._auth_method} {self._token}" 52 53 def __init__(self, token: str, auth_method: str = "Bearer", auth_header: str = "Authorization"): 54 self._auth_header = auth_header 55 self._auth_method = auth_method 56 self._token = token
Builds the auth header based on the token provided. The token is attached to each request via the auth_header header.
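A minimal usage sketch, assuming the import path shown below; the API URL and token are placeholders.

```python
import requests

# Import path assumed; TokenAuthenticator is defined as shown above.
from airbyte_cdk.sources.streams.http.requests_native_auth import TokenAuthenticator

authenticator = TokenAuthenticator(token="my-api-token")
print(authenticator.auth_header)  # "Authorization"
print(authenticator.token)        # "Bearer my-api-token"

# The header name/value pair can be attached to any outgoing request.
response = requests.get(
    "https://api.example.com/items",  # placeholder URL
    headers={authenticator.auth_header: authenticator.token},
)
```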
36class UserDefinedBackoffException(BaseBackoffException): 37 """ 38 An exception that exposes how long it attempted to backoff 39 """ 40 41 def __init__( 42 self, 43 backoff: Union[int, float], 44 request: requests.PreparedRequest, 45 response: Optional[Union[requests.Response, Exception]], 46 error_message: str = "", 47 ): 48 """ 49 :param backoff: how long to backoff in seconds 50 :param request: the request that triggered this backoff exception 51 :param response: the response that triggered the backoff exception 52 """ 53 self.backoff = backoff 54 super().__init__(request=request, response=response, error_message=error_message)
An exception that exposes how long it attempted to back off
41 def __init__( 42 self, 43 backoff: Union[int, float], 44 request: requests.PreparedRequest, 45 response: Optional[Union[requests.Response, Exception]], 46 error_message: str = "", 47 ): 48 """ 49 :param backoff: how long to backoff in seconds 50 :param request: the request that triggered this backoff exception 51 :param response: the response that triggered the backoff exception 52 """ 53 self.backoff = backoff 54 super().__init__(request=request, response=response, error_message=error_message)
Parameters
- backoff: how long to back off, in seconds
- request: the request that triggered this backoff exception
- response: the response that triggered the backoff exception
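The hedged sketch below shows how such an exception might be raised from custom retry logic; the import path, the placeholder URL, and the 429 scenario are assumptions for illustration.

```python
import requests

# Import path assumed; the exception class is defined as shown above.
from airbyte_cdk.sources.streams.http.exceptions import UserDefinedBackoffException

prepared = requests.Request("GET", "https://api.example.com/items").prepare()
response = requests.Response()  # placeholder response object for illustration
response.status_code = 429

# Signal to the caller how long to back off before retrying this request.
raise UserDefinedBackoffException(
    backoff=30,  # seconds
    request=prepared,
    response=response,
    error_message="Rate limited by the API; retry after 30 seconds",
)
```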
60class AirbyteLogFormatter(logging.Formatter): 61 """Output log records using AirbyteMessage""" 62 63 # Transforming Python log levels to Airbyte protocol log levels 64 level_mapping = { 65 logging.FATAL: Level.FATAL, 66 logging.ERROR: Level.ERROR, 67 logging.WARNING: Level.WARN, 68 logging.INFO: Level.INFO, 69 logging.DEBUG: Level.DEBUG, 70 } 71 72 def format(self, record: logging.LogRecord) -> str: 73 """Return a JSON representation of the log message""" 74 airbyte_level = self.level_mapping.get(record.levelno, "INFO") 75 if airbyte_level == Level.DEBUG: 76 extras = self.extract_extra_args_from_record(record) 77 debug_dict = {"type": "DEBUG", "message": record.getMessage(), "data": extras} 78 return filter_secrets(json.dumps(debug_dict)) 79 else: 80 message = super().format(record) 81 message = filter_secrets(message) 82 log_message = AirbyteMessage( 83 type=Type.LOG, log=AirbyteLogMessage(level=airbyte_level, message=message) 84 ) 85 return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode() 86 87 @staticmethod 88 def extract_extra_args_from_record(record: logging.LogRecord) -> Mapping[str, Any]: 89 """ 90 The python logger conflates default args with extra args. We use an empty log record and set operations 91 to isolate fields passed to the log record via extra by the developer. 92 """ 93 default_attrs = logging.LogRecord("", 0, "", 0, None, None, None).__dict__.keys() 94 extra_keys = set(record.__dict__.keys()) - default_attrs 95 return {k: str(getattr(record, k)) for k in extra_keys if hasattr(record, k)}
Output log records using AirbyteMessage
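A small sketch of wiring the formatter into the standard logging module; the logger name and log message are arbitrary.

```python
import logging

from airbyte_cdk import AirbyteLogFormatter  # exported at the package top level

handler = logging.StreamHandler()
handler.setFormatter(AirbyteLogFormatter())

logger = logging.getLogger("my_connector")
logger.setLevel(logging.INFO)
logger.addHandler(handler)

# Emitted as a JSON-serialized AirbyteMessage LOG line, with secrets filtered out.
logger.info("Starting sync for stream 'users'")
```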
72 def format(self, record: logging.LogRecord) -> str: 73 """Return a JSON representation of the log message""" 74 airbyte_level = self.level_mapping.get(record.levelno, "INFO") 75 if airbyte_level == Level.DEBUG: 76 extras = self.extract_extra_args_from_record(record) 77 debug_dict = {"type": "DEBUG", "message": record.getMessage(), "data": extras} 78 return filter_secrets(json.dumps(debug_dict)) 79 else: 80 message = super().format(record) 81 message = filter_secrets(message) 82 log_message = AirbyteMessage( 83 type=Type.LOG, log=AirbyteLogMessage(level=airbyte_level, message=message) 84 ) 85 return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode()
Return a JSON representation of the log message
87 @staticmethod 88 def extract_extra_args_from_record(record: logging.LogRecord) -> Mapping[str, Any]: 89 """ 90 The python logger conflates default args with extra args. We use an empty log record and set operations 91 to isolate fields passed to the log record via extra by the developer. 92 """ 93 default_attrs = logging.LogRecord("", 0, "", 0, None, None, None).__dict__.keys() 94 extra_keys = set(record.__dict__.keys()) - default_attrs 95 return {k: str(getattr(record, k)) for k in extra_keys if hasattr(record, k)}
The Python logger conflates default args with extra args. We use an empty log record and set operations to isolate the fields passed to the log record via extra by the developer.
44def init_logger(name: Optional[str] = None) -> logging.Logger: 45 """Initial set up of logger""" 46 logger = logging.getLogger(name) 47 logger.setLevel(logging.INFO) 48 logging.config.dictConfig(LOGGING_CONFIG) 49 return logger
Initial setup of the logger.
264@dataclass 265class AirbyteStream: 266 name: str 267 json_schema: Dict[str, Any] 268 supported_sync_modes: List[SyncMode] 269 source_defined_cursor: Optional[bool] = None 270 default_cursor_field: Optional[List[str]] = None 271 source_defined_primary_key: Optional[List[List[str]]] = None 272 namespace: Optional[str] = None 273 is_resumable: Optional[bool] = None
172@dataclass 173class AirbyteConnectionStatus: 174 status: Status 175 message: Optional[str] = None
81@dataclass 82class AirbyteMessage: 83 type: Type # type: ignore [name-defined] 84 log: Optional[AirbyteLogMessage] = None # type: ignore [name-defined] 85 spec: Optional[ConnectorSpecification] = None # type: ignore [name-defined] 86 connectionStatus: Optional[AirbyteConnectionStatus] = None # type: ignore [name-defined] 87 catalog: Optional[AirbyteCatalog] = None # type: ignore [name-defined] 88 record: Optional[Union[AirbyteFileTransferRecordMessage, AirbyteRecordMessage]] = None # type: ignore [name-defined] 89 state: Optional[AirbyteStateMessage] = None 90 trace: Optional[AirbyteTraceMessage] = None # type: ignore [name-defined] 91 control: Optional[AirbyteControlMessage] = None # type: ignore [name-defined]
An enumeration.
12class Type(Enum): 13 RECORD = 'RECORD' 14 STATE = 'STATE' 15 LOG = 'LOG' 16 SPEC = 'SPEC' 17 CONNECTION_STATUS = 'CONNECTION_STATUS' 18 CATALOG = 'CATALOG' 19 TRACE = 'TRACE' 20 CONTROL = 'CONTROL'
An enumeration.
An enumeration.
276@dataclass 277class ConfiguredAirbyteStream: 278 stream: AirbyteStream 279 sync_mode: SyncMode 280 destination_sync_mode: DestinationSyncMode 281 cursor_field: Optional[List[str]] = None 282 primary_key: Optional[List[List[str]]] = None 283 generation_id: Optional[int] = None 284 minimum_generation_id: Optional[int] = None 285 sync_id: Optional[int] = None
183class DestinationSyncMode(Enum): 184 append = 'append' 185 overwrite = 'overwrite' 186 append_dedup = 'append_dedup'
An enumeration.
An enumeration.
94class FailureType(Enum): 95 system_error = 'system_error' 96 config_error = 'config_error' 97 transient_error = 'transient_error'
An enumeration.
288@dataclass 289class AdvancedAuth: 290 auth_flow_type: Optional[AuthFlowType] = None 291 predicate_key: Optional[List[str]] = None 292 predicate_value: Optional[str] = None 293 oauth_config_specification: Optional[OAuthConfigSpecification] = None
80@dataclass 81class AirbyteLogMessage: 82 level: Level 83 message: str 84 stack_trace: Optional[str] = None
217@dataclass 218class OAuthConfigSpecification: 219 oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( 220 None 221 ) 222 oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = ( 223 None 224 ) 225 complete_oauth_output_specification: Optional[Dict[str, Any]] = None 226 complete_oauth_server_input_specification: Optional[Dict[str, Any]] = None 227 complete_oauth_server_output_specification: Optional[Dict[str, Any]] = None
296@dataclass 297class ConnectorSpecification: 298 connectionSpecification: Dict[str, Any] 299 documentationUrl: Optional[str] = None 300 changelogUrl: Optional[str] = None 301 supportsIncremental: Optional[bool] = None 302 supportsNormalization: Optional[bool] = False 303 supportsDBT: Optional[bool] = False 304 supported_destination_sync_modes: Optional[List[DestinationSyncMode]] = None 305 advanced_auth: Optional[AdvancedAuth] = None 306 protocol_version: Optional[str] = None
71class Level(Enum): 72 FATAL = 'FATAL' 73 ERROR = 'ERROR' 74 WARN = 'WARN' 75 INFO = 'INFO' 76 DEBUG = 'DEBUG' 77 TRACE = 'TRACE'
An enumeration.
309@dataclass 310class AirbyteRecordMessage: 311 stream: str 312 data: Dict[str, Any] 313 emitted_at: int 314 namespace: Optional[str] = None 315 meta: Optional[AirbyteRecordMessageMeta] = None
75class InMemoryMessageRepository(MessageRepository): 76 def __init__(self, log_level: Level = Level.INFO) -> None: 77 self._message_queue: Deque[AirbyteMessage] = deque() 78 self._log_level = log_level 79 80 def emit_message(self, message: AirbyteMessage) -> None: 81 self._message_queue.append(message) 82 83 def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None: 84 if _is_severe_enough(self._log_level, level): 85 self.emit_message( 86 AirbyteMessage( 87 type=Type.LOG, 88 log=AirbyteLogMessage( 89 level=level, message=filter_secrets(json.dumps(message_provider())) 90 ), 91 ) 92 ) 93 94 def consume_queue(self) -> Iterable[AirbyteMessage]: 95 while self._message_queue: 96 yield self._message_queue.popleft()
Helper class that provides a standard way to create an ABC using inheritance.
83 def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None: 84 if _is_severe_enough(self._log_level, level): 85 self.emit_message( 86 AirbyteMessage( 87 type=Type.LOG, 88 log=AirbyteLogMessage( 89 level=level, message=filter_secrets(json.dumps(message_provider())) 90 ), 91 ) 92 )
Computing messages can be resource-consuming. This method is specialized for logging because we want to allow for lazy evaluation if the log level is less severe than what is configured.
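A minimal sketch of the repository in use, assuming the airbyte_cdk.sources.message import path; the queued message and the lazy DEBUG provider are illustrative.

```python
# Import paths assumed; the classes are defined as shown on this page.
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type
from airbyte_cdk.sources.message import InMemoryMessageRepository

repository = InMemoryMessageRepository(log_level=Level.INFO)

# Messages are appended to an in-memory queue...
repository.emit_message(
    AirbyteMessage(
        type=Type.LOG,
        log=AirbyteLogMessage(level=Level.INFO, message="hello from the repository"),
    )
)

# DEBUG is less severe than the configured INFO level, so this lazy
# message provider is never evaluated.
repository.log_message(Level.DEBUG, lambda: {"detail": "expensive debug payload"})

# ...and drained later by whoever owns the repository.
for message in repository.consume_queue():
    print(message)
```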
46class MessageRepository(ABC): 47 @abstractmethod 48 def emit_message(self, message: AirbyteMessage) -> None: 49 raise NotImplementedError() 50 51 @abstractmethod 52 def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None: 53 """ 54 Computing messages can be resource consuming. This method is specialized for logging because we want to allow for lazy evaluation if 55 the log level is less severe than what is configured 56 """ 57 raise NotImplementedError() 58 59 @abstractmethod 60 def consume_queue(self) -> Iterable[AirbyteMessage]: 61 raise NotImplementedError()
Helper class that provides a standard way to create an ABC using inheritance.
51 @abstractmethod 52 def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None: 53 """ 54 Computing messages can be resource consuming. This method is specialized for logging because we want to allow for lazy evaluation if 55 the log level is less severe than what is configured 56 """ 57 raise NotImplementedError()
Computing messages can be resource-consuming. This method is specialized for logging because we want to allow for lazy evaluation if the log level is less severe than what is configured.
33class ConnectorStateManager: 34 """ 35 ConnectorStateManager consolidates the various forms of a stream's incoming state message (STREAM / GLOBAL) under a common 36 interface. It also provides methods to extract and update state 37 """ 38 39 def __init__(self, state: Optional[List[AirbyteStateMessage]] = None): 40 shared_state, per_stream_states = self._extract_from_state_message(state) 41 42 # We explicitly throw an error if we receive a GLOBAL state message that contains a shared_state because API sources are 43 # designed to checkpoint state independently of one another. API sources should never be emitting a state message where 44 # shared_state is populated. Rather than define how to handle shared_state without a clear use case, we're opting to throw an 45 # error instead and if/when we find one, we will then implement processing of the shared_state value. 46 if shared_state: 47 raise ValueError( 48 "Received a GLOBAL AirbyteStateMessage that contains a shared_state. This library only ever generates per-STREAM " 49 "STATE messages so this was not generated by this connector. This must be an orchestrator or platform error. GLOBAL " 50 "state messages with shared_state will not be processed correctly. " 51 ) 52 self.per_stream_states = per_stream_states 53 54 def get_stream_state( 55 self, stream_name: str, namespace: Optional[str] 56 ) -> MutableMapping[str, Any]: 57 """ 58 Retrieves the state of a given stream based on its descriptor (name + namespace). 59 :param stream_name: Name of the stream being fetched 60 :param namespace: Namespace of the stream being fetched 61 :return: The per-stream state for a stream 62 """ 63 stream_state: AirbyteStateBlob | None = self.per_stream_states.get( 64 HashableStreamDescriptor(name=stream_name, namespace=namespace) 65 ) 66 if stream_state: 67 return copy.deepcopy({k: v for k, v in stream_state.__dict__.items()}) 68 return {} 69 70 def update_state_for_stream( 71 self, stream_name: str, namespace: Optional[str], value: Mapping[str, Any] 72 ) -> None: 73 """ 74 Overwrites the state blob of a specific stream based on the provided stream name and optional namespace 75 :param stream_name: The name of the stream whose state is being updated 76 :param namespace: The namespace of the stream if it exists 77 :param value: A stream state mapping that is being updated for a stream 78 """ 79 stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace) 80 self.per_stream_states[stream_descriptor] = AirbyteStateBlob(value) 81 82 def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage: 83 """ 84 Generates an AirbyteMessage using the current per-stream state of a specified stream 85 :param stream_name: The name of the stream for the message that is being created 86 :param namespace: The namespace of the stream for the message that is being created 87 :return: The Airbyte state message to be emitted by the connector during a sync 88 """ 89 hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace) 90 stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob() 91 92 return AirbyteMessage( 93 type=MessageType.STATE, 94 state=AirbyteStateMessage( 95 type=AirbyteStateType.STREAM, 96 stream=AirbyteStreamState( 97 stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace), 98 stream_state=stream_state, 99 ), 100 ), 101 ) 102 103 @classmethod 104 def _extract_from_state_message( 105 cls, 106 state: Optional[List[AirbyteStateMessage]], 107 ) -> 
Tuple[ 108 Optional[AirbyteStateBlob], 109 MutableMapping[HashableStreamDescriptor, Optional[AirbyteStateBlob]], 110 ]: 111 """ 112 Takes an incoming list of state messages or a global state message and extracts state attributes according to 113 type which can then be assigned to the new state manager being instantiated 114 :param state: The incoming state input 115 :return: A tuple of shared state and per stream state assembled from the incoming state list 116 """ 117 if state is None: 118 return None, {} 119 120 is_global = cls._is_global_state(state) 121 122 if is_global: 123 # We already validate that this is a global state message, not None: 124 global_state = cast(AirbyteGlobalState, state[0].global_) 125 # global_state has shared_state, also not None: 126 shared_state: AirbyteStateBlob = cast( 127 AirbyteStateBlob, copy.deepcopy(global_state.shared_state, {}) 128 ) 129 streams = { 130 HashableStreamDescriptor( 131 name=per_stream_state.stream_descriptor.name, 132 namespace=per_stream_state.stream_descriptor.namespace, 133 ): per_stream_state.stream_state 134 for per_stream_state in global_state.stream_states # type: ignore[union-attr] # global_state has shared_state 135 } 136 return shared_state, streams 137 else: 138 streams = { 139 HashableStreamDescriptor( 140 name=per_stream_state.stream.stream_descriptor.name, # type: ignore[union-attr] # stream has stream_descriptor 141 namespace=per_stream_state.stream.stream_descriptor.namespace, # type: ignore[union-attr] # stream has stream_descriptor 142 ): per_stream_state.stream.stream_state # type: ignore[union-attr] # stream has stream_state 143 for per_stream_state in state 144 if per_stream_state.type == AirbyteStateType.STREAM 145 and hasattr(per_stream_state, "stream") # type: ignore # state is always a list of AirbyteStateMessage if is_per_stream is True 146 } 147 return None, streams 148 149 @staticmethod 150 def _is_global_state(state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]]) -> bool: 151 return ( 152 isinstance(state, List) 153 and len(state) == 1 154 and isinstance(state[0], AirbyteStateMessage) 155 and state[0].type == AirbyteStateType.GLOBAL 156 ) 157 158 @staticmethod 159 def _is_per_stream_state( 160 state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]], 161 ) -> bool: 162 return isinstance(state, List)
ConnectorStateManager consolidates the various forms of a stream's incoming state message (STREAM / GLOBAL) under a common interface. It also provides methods to extract and update state
39 def __init__(self, state: Optional[List[AirbyteStateMessage]] = None): 40 shared_state, per_stream_states = self._extract_from_state_message(state) 41 42 # We explicitly throw an error if we receive a GLOBAL state message that contains a shared_state because API sources are 43 # designed to checkpoint state independently of one another. API sources should never be emitting a state message where 44 # shared_state is populated. Rather than define how to handle shared_state without a clear use case, we're opting to throw an 45 # error instead and if/when we find one, we will then implement processing of the shared_state value. 46 if shared_state: 47 raise ValueError( 48 "Received a GLOBAL AirbyteStateMessage that contains a shared_state. This library only ever generates per-STREAM " 49 "STATE messages so this was not generated by this connector. This must be an orchestrator or platform error. GLOBAL " 50 "state messages with shared_state will not be processed correctly. " 51 ) 52 self.per_stream_states = per_stream_states
54 def get_stream_state( 55 self, stream_name: str, namespace: Optional[str] 56 ) -> MutableMapping[str, Any]: 57 """ 58 Retrieves the state of a given stream based on its descriptor (name + namespace). 59 :param stream_name: Name of the stream being fetched 60 :param namespace: Namespace of the stream being fetched 61 :return: The per-stream state for a stream 62 """ 63 stream_state: AirbyteStateBlob | None = self.per_stream_states.get( 64 HashableStreamDescriptor(name=stream_name, namespace=namespace) 65 ) 66 if stream_state: 67 return copy.deepcopy({k: v for k, v in stream_state.__dict__.items()}) 68 return {}
Retrieves the state of a given stream based on its descriptor (name + namespace).
Parameters
- stream_name: Name of the stream being fetched
- namespace: Namespace of the stream being fetched
Returns
The per-stream state for a stream
70 def update_state_for_stream( 71 self, stream_name: str, namespace: Optional[str], value: Mapping[str, Any] 72 ) -> None: 73 """ 74 Overwrites the state blob of a specific stream based on the provided stream name and optional namespace 75 :param stream_name: The name of the stream whose state is being updated 76 :param namespace: The namespace of the stream if it exists 77 :param value: A stream state mapping that is being updated for a stream 78 """ 79 stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace) 80 self.per_stream_states[stream_descriptor] = AirbyteStateBlob(value)
Overwrites the state blob of a specific stream based on the provided stream name and optional namespace
Parameters
- stream_name: The name of the stream whose state is being updated
- namespace: The namespace of the stream if it exists
- value: A stream state mapping that is being updated for a stream
82 def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage: 83 """ 84 Generates an AirbyteMessage using the current per-stream state of a specified stream 85 :param stream_name: The name of the stream for the message that is being created 86 :param namespace: The namespace of the stream for the message that is being created 87 :return: The Airbyte state message to be emitted by the connector during a sync 88 """ 89 hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace) 90 stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob() 91 92 return AirbyteMessage( 93 type=MessageType.STATE, 94 state=AirbyteStateMessage( 95 type=AirbyteStateType.STREAM, 96 stream=AirbyteStreamState( 97 stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace), 98 stream_state=stream_state, 99 ), 100 ), 101 )
Generates an AirbyteMessage using the current per-stream state of a specified stream
Parameters
- stream_name: The name of the stream for the message that is being created
- namespace: The namespace of the stream for the message that is being created
Returns
The Airbyte state message to be emitted by the connector during a sync
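As a minimal usage sketch (not from the source; the stream name and state values are illustrative), the three methods above can be combined like this:

    from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager

    # Typically instantiated from the state messages passed to the connector at startup;
    # here we start with no incoming state.
    state_manager = ConnectorStateManager(state=None)

    # Read the current per-stream state; an empty dict is returned if none was provided.
    current_state = state_manager.get_stream_state("customers", namespace=None)

    # Overwrite the stream's state blob after processing records, then build the
    # STATE message the connector would emit during a sync.
    state_manager.update_state_for_stream("customers", None, {"updated_at": "2024-01-01T00:00:00Z"})
    state_message = state_manager.create_state_message("customers", namespace=None)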
95@deprecated( 96 "Deprecated as of CDK version 0.87.0. " 97 "Deprecated in favor of the `CheckpointMixin` which offers similar functionality." 98) 99class IncrementalMixin(CheckpointMixin, ABC): 100 """Mixin to make stream incremental. 101 102 class IncrementalStream(Stream, IncrementalMixin): 103 @property 104 def state(self): 105 return self._state 106 107 @state.setter 108 def state(self, value): 109 self._state[self.cursor_field] = value[self.cursor_field] 110 """
Mixin to make stream incremental.
    class IncrementalStream(Stream, IncrementalMixin):
        @property
        def state(self):
            return self._state

        @state.setter
        def state(self, value):
            self._state[self.cursor_field] = value[self.cursor_field]
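Below is a hedged, fleshed-out version of the docstring example, written as a runnable sketch: the hardcoded records and the updated_at cursor are illustrative only, and IncrementalMixin and Stream are assumed to be importable from airbyte_cdk.sources.streams. New code should prefer CheckpointMixin, which exposes the same state getter/setter interface.

    from typing import Any, Iterable, List, Mapping, MutableMapping, Optional

    from airbyte_cdk.models import SyncMode
    from airbyte_cdk.sources.streams import IncrementalMixin, Stream


    class IncrementalStream(Stream, IncrementalMixin):
        primary_key = "id"
        cursor_field = "updated_at"

        def __init__(self) -> None:
            self._state: MutableMapping[str, Any] = {}

        @property
        def state(self) -> MutableMapping[str, Any]:
            return self._state

        @state.setter
        def state(self, value: MutableMapping[str, Any]) -> None:
            # Keep only the cursor value, mirroring the docstring example.
            self._state[self.cursor_field] = value[self.cursor_field]

        def read_records(
            self,
            sync_mode: SyncMode,
            cursor_field: Optional[List[str]] = None,
            stream_slice: Optional[Mapping[str, Any]] = None,
            stream_state: Optional[Mapping[str, Any]] = None,
        ) -> Iterable[Mapping[str, Any]]:
            # Hardcoded records for illustration; a real stream would query an API here.
            for record in [{"id": 1, "updated_at": "2024-01-01"}, {"id": 2, "updated_at": "2024-01-02"}]:
                yield record
                self.state = record  # the setter extracts the cursor value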
Inherited Members
119class Stream(ABC): 120 """ 121 Base abstract class for an Airbyte Stream. Makes no assumption of the Stream's underlying transport protocol. 122 """ 123 124 _configured_json_schema: Optional[Dict[str, Any]] = None 125 _exit_on_rate_limit: bool = False 126 127 # Use self.logger in subclasses to log any messages 128 @property 129 def logger(self) -> logging.Logger: 130 return logging.getLogger(f"airbyte.streams.{self.name}") 131 132 # TypeTransformer object to perform output data transformation 133 transformer: TypeTransformer = TypeTransformer(TransformConfig.NoTransform) 134 135 cursor: Optional[Cursor] = None 136 137 has_multiple_slices = False 138 139 @cached_property 140 def name(self) -> str: 141 """ 142 :return: Stream name. By default this is the implementing class name, but it can be overridden as needed. 143 """ 144 return casing.camel_to_snake(self.__class__.__name__) 145 146 def get_error_display_message(self, exception: BaseException) -> Optional[str]: 147 """ 148 Retrieves the user-friendly display message that corresponds to an exception. 149 This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage. 150 151 The default implementation of this method does not return user-friendly messages for any exception type, but it should be overriden as needed. 152 153 :param exception: The exception that was raised 154 :return: A user-friendly message that indicates the cause of the error 155 """ 156 return None 157 158 def read( # type: ignore # ignoring typing for ConnectorStateManager because of circular dependencies 159 self, 160 configured_stream: ConfiguredAirbyteStream, 161 logger: logging.Logger, 162 slice_logger: SliceLogger, 163 stream_state: MutableMapping[str, Any], 164 state_manager, 165 internal_config: InternalConfig, 166 ) -> Iterable[StreamData]: 167 sync_mode = configured_stream.sync_mode 168 cursor_field = configured_stream.cursor_field 169 self.configured_json_schema = configured_stream.stream.json_schema 170 171 # WARNING: When performing a read() that uses incoming stream state, we MUST use the self.state that is defined as 172 # opposed to the incoming stream_state value. Because some connectors like ones using the file-based CDK modify 173 # state before setting the value on the Stream attribute, the most up-to-date state is derived from Stream.state 174 # instead of the stream_state parameter. This does not apply to legacy connectors using get_updated_state(). 175 try: 176 stream_state = self.state # type: ignore # we know the field might not exist... 
177 except AttributeError: 178 pass 179 180 should_checkpoint = bool(state_manager) 181 checkpoint_reader = self._get_checkpoint_reader( 182 logger=logger, cursor_field=cursor_field, sync_mode=sync_mode, stream_state=stream_state 183 ) 184 185 next_slice = checkpoint_reader.next() 186 record_counter = 0 187 stream_state_tracker = copy.deepcopy(stream_state) 188 while next_slice is not None: 189 if slice_logger.should_log_slice_message(logger): 190 yield slice_logger.create_slice_log_message(next_slice) 191 records = self.read_records( 192 sync_mode=sync_mode, # todo: change this interface to no longer rely on sync_mode for behavior 193 stream_slice=next_slice, 194 stream_state=stream_state, 195 cursor_field=cursor_field or None, 196 ) 197 for record_data_or_message in records: 198 yield record_data_or_message 199 if isinstance(record_data_or_message, Mapping) or ( 200 hasattr(record_data_or_message, "type") 201 and record_data_or_message.type == MessageType.RECORD 202 ): 203 record_data = ( 204 record_data_or_message 205 if isinstance(record_data_or_message, Mapping) 206 else record_data_or_message.record 207 ) 208 209 # Thanks I hate it. RFR fundamentally doesn't fit with the concept of the legacy Stream.get_updated_state() 210 # method because RFR streams rely on pagination as a cursor. Stream.get_updated_state() was designed to make 211 # the CDK manage state using specifically the last seen record. don't @ brian.lai 212 # 213 # Also, because the legacy incremental state case decouples observing incoming records from emitting state, it 214 # requires that we separate CheckpointReader.observe() and CheckpointReader.get_checkpoint() which could 215 # otherwise be combined. 216 if self.cursor_field: 217 # Some connectors have streams that implement get_updated_state(), but do not define a cursor_field. This 218 # should be fixed on the stream implementation, but we should also protect against this in the CDK as well 219 stream_state_tracker = self.get_updated_state( 220 stream_state_tracker, 221 record_data, # type: ignore [arg-type] 222 ) 223 self._observe_state(checkpoint_reader, stream_state_tracker) 224 record_counter += 1 225 226 checkpoint_interval = self.state_checkpoint_interval 227 if ( 228 should_checkpoint 229 and checkpoint_interval 230 and record_counter % checkpoint_interval == 0 231 ): 232 checkpoint = checkpoint_reader.get_checkpoint() 233 if checkpoint: 234 airbyte_state_message = self._checkpoint_state( 235 checkpoint, state_manager=state_manager 236 ) 237 yield airbyte_state_message 238 239 if internal_config.is_limit_reached(record_counter): 240 break 241 self._observe_state(checkpoint_reader) 242 checkpoint_state = checkpoint_reader.get_checkpoint() 243 if should_checkpoint and checkpoint_state is not None: 244 airbyte_state_message = self._checkpoint_state( 245 checkpoint_state, state_manager=state_manager 246 ) 247 yield airbyte_state_message 248 249 next_slice = checkpoint_reader.next() 250 251 checkpoint = checkpoint_reader.get_checkpoint() 252 if should_checkpoint and checkpoint is not None: 253 airbyte_state_message = self._checkpoint_state(checkpoint, state_manager=state_manager) 254 yield airbyte_state_message 255 256 def read_only_records(self, state: Optional[Mapping[str, Any]] = None) -> Iterable[StreamData]: 257 """ 258 Helper method that performs a read on a stream with an optional state and emits records. 
If the parent stream supports 259 incremental, this operation does not update the stream's internal state (if it uses the modern state setter/getter) 260 or emit state messages. 261 """ 262 263 configured_stream = ConfiguredAirbyteStream( 264 stream=AirbyteStream( 265 name=self.name, 266 json_schema={}, 267 supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], 268 ), 269 sync_mode=SyncMode.incremental if state else SyncMode.full_refresh, 270 destination_sync_mode=DestinationSyncMode.append, 271 ) 272 273 yield from self.read( 274 configured_stream=configured_stream, 275 logger=self.logger, 276 slice_logger=DebugSliceLogger(), 277 stream_state=dict(state) 278 if state 279 else {}, # read() expects MutableMapping instead of Mapping which is used more often 280 state_manager=None, 281 internal_config=InternalConfig(), # type: ignore [call-arg] 282 ) 283 284 @abstractmethod 285 def read_records( 286 self, 287 sync_mode: SyncMode, 288 cursor_field: Optional[List[str]] = None, 289 stream_slice: Optional[Mapping[str, Any]] = None, 290 stream_state: Optional[Mapping[str, Any]] = None, 291 ) -> Iterable[StreamData]: 292 """ 293 This method should be overridden by subclasses to read records based on the inputs 294 """ 295 296 @lru_cache(maxsize=None) 297 def get_json_schema(self) -> Mapping[str, Any]: 298 """ 299 :return: A dict of the JSON schema representing this stream. 300 301 The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property. 302 Override as needed. 303 """ 304 # TODO show an example of using pydantic to define the JSON schema, or reading an OpenAPI spec 305 return ResourceSchemaLoader(package_name_from_class(self.__class__)).get_schema(self.name) 306 307 def as_airbyte_stream(self) -> AirbyteStream: 308 stream = AirbyteStream( 309 name=self.name, 310 json_schema=dict(self.get_json_schema()), 311 supported_sync_modes=[SyncMode.full_refresh], 312 is_resumable=self.is_resumable, 313 ) 314 315 if self.namespace: 316 stream.namespace = self.namespace 317 318 # If we can offer incremental we always should. RFR is always less reliable than incremental which uses a real cursor value 319 if self.supports_incremental: 320 stream.source_defined_cursor = self.source_defined_cursor 321 stream.supported_sync_modes.append(SyncMode.incremental) 322 stream.default_cursor_field = self._wrapped_cursor_field() 323 324 keys = Stream._wrapped_primary_key(self.primary_key) 325 if keys and len(keys) > 0: 326 stream.source_defined_primary_key = keys 327 328 return stream 329 330 @property 331 def supports_incremental(self) -> bool: 332 """ 333 :return: True if this stream supports incrementally reading data 334 """ 335 return len(self._wrapped_cursor_field()) > 0 336 337 @property 338 def is_resumable(self) -> bool: 339 """ 340 :return: True if this stream allows the checkpointing of sync progress and can resume from it on subsequent attempts. 341 This differs from supports_incremental because certain kinds of streams like those supporting resumable full refresh 342 can checkpoint progress in between attempts for improved fault tolerance. However, they will start from the beginning 343 on the next sync job. 344 """ 345 if self.supports_incremental: 346 return True 347 if self.has_multiple_slices: 348 # We temporarily gate substream to not support RFR because puts a pretty high burden on connector developers 349 # to structure stream state in a very specific way. 
We also can't check for issubclass(HttpSubStream) because 350 # not all substreams implement the interface and it would be a circular dependency so we use parent as a surrogate 351 return False 352 elif hasattr(type(self), "state") and getattr(type(self), "state").fset is not None: 353 # Modern case where a stream manages state using getter/setter 354 return True 355 else: 356 # Legacy case where the CDK manages state via the get_updated_state() method. This is determined by checking if 357 # the stream's get_updated_state() differs from the Stream class and therefore has been overridden 358 return type(self).get_updated_state != Stream.get_updated_state 359 360 def _wrapped_cursor_field(self) -> List[str]: 361 return [self.cursor_field] if isinstance(self.cursor_field, str) else self.cursor_field 362 363 @property 364 def cursor_field(self) -> Union[str, List[str]]: 365 """ 366 Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field. 367 :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor. 368 """ 369 return [] 370 371 @property 372 def namespace(self) -> Optional[str]: 373 """ 374 Override to return the namespace of this stream, e.g. the Postgres schema which this stream will emit records for. 375 :return: A string containing the name of the namespace. 376 """ 377 return None 378 379 @property 380 def source_defined_cursor(self) -> bool: 381 """ 382 Return False if the cursor can be configured by the user. 383 """ 384 return True 385 386 @property 387 def exit_on_rate_limit(self) -> bool: 388 """Exit on rate limit getter, should return bool value. False if the stream will retry endlessly when rate limited.""" 389 return self._exit_on_rate_limit 390 391 @exit_on_rate_limit.setter 392 def exit_on_rate_limit(self, value: bool) -> None: 393 """Exit on rate limit setter, accept bool value.""" 394 self._exit_on_rate_limit = value 395 396 @property 397 @abstractmethod 398 def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: 399 """ 400 :return: string if single primary key, list of strings if composite primary key, list of list of strings if composite primary key consisting of nested fields. 401 If the stream has no primary keys, return None. 402 """ 403 404 def stream_slices( 405 self, 406 *, 407 sync_mode: SyncMode, 408 cursor_field: Optional[List[str]] = None, 409 stream_state: Optional[Mapping[str, Any]] = None, 410 ) -> Iterable[Optional[Mapping[str, Any]]]: 411 """ 412 Override to define the slices for this stream. See the stream slicing section of the docs for more information. 413 414 :param sync_mode: 415 :param cursor_field: 416 :param stream_state: 417 :return: 418 """ 419 yield StreamSlice(partition={}, cursor_slice={}) 420 421 @property 422 def state_checkpoint_interval(self) -> Optional[int]: 423 """ 424 Decides how often to checkpoint state (i.e: emit a STATE message). E.g: if this returns a value of 100, then state is persisted after reading 425 100 records, then 200, 300, etc.. A good default value is 1000 although your mileage may vary depending on the underlying data source. 426 427 Checkpointing a stream avoids re-reading records in the case a sync is failed or cancelled. 428 429 return None if state should not be checkpointed e.g: because records returned from the underlying data source are not returned in 430 ascending order with respect to the cursor field. 
This can happen if the source does not support reading records in ascending order of 431 created_at date (or whatever the cursor is). In those cases, state must only be saved once the full stream has been read. 432 """ 433 return None 434 435 # Commented-out to avoid any runtime penalty, since this is used in a hot per-record codepath. 436 # To be evaluated for re-introduction here: https://github.com/airbytehq/airbyte-python-cdk/issues/116 437 # @deprecated( 438 # "Deprecated method `get_updated_state` as of CDK version 0.1.49. " 439 # "Please use explicit state property instead, see `IncrementalMixin` docs." 440 # ) 441 def get_updated_state( 442 self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any] 443 ) -> MutableMapping[str, Any]: 444 """DEPRECATED. Please use explicit state property instead, see `IncrementalMixin` docs. 445 446 Override to extract state from the latest record. Needed to implement incremental sync. 447 448 Inspects the latest record extracted from the data source and the current state object and return an updated state object. 449 450 For example: if the state object is based on created_at timestamp, and the current state is {'created_at': 10}, and the latest_record is 451 {'name': 'octavia', 'created_at': 20 } then this method would return {'created_at': 20} to indicate state should be updated to this object. 452 453 :param current_stream_state: The stream's current state object 454 :param latest_record: The latest record extracted from the stream 455 :return: An updated state object 456 """ 457 return {} 458 459 def get_cursor(self) -> Optional[Cursor]: 460 """ 461 A Cursor is an interface that a stream can implement to manage how its internal state is read and updated while 462 reading records. Historically, Python connectors had no concept of a cursor to manage state. Python streams need 463 to define a cursor implementation and override this method to manage state through a Cursor. 464 """ 465 return self.cursor 466 467 def _get_checkpoint_reader( 468 self, 469 logger: logging.Logger, 470 cursor_field: Optional[List[str]], 471 sync_mode: SyncMode, 472 stream_state: MutableMapping[str, Any], 473 ) -> CheckpointReader: 474 mappings_or_slices = self.stream_slices( 475 cursor_field=cursor_field, 476 sync_mode=sync_mode, # todo: change this interface to no longer rely on sync_mode for behavior 477 stream_state=stream_state, 478 ) 479 480 # Because of poor foresight, we wrote the default Stream.stream_slices() method to return [None] which is confusing and 481 # has now normalized this behavior for connector developers. Now some connectors return [None]. This is objectively 482 # misleading and a more ideal interface is [{}] to indicate we still want to iterate over one slice, but with no 483 # specific slice values. None is bad, and now I feel bad that I have to write this hack. 484 if mappings_or_slices == [None]: 485 mappings_or_slices = [{}] 486 487 slices_iterable_copy, iterable_for_detecting_format = itertools.tee(mappings_or_slices, 2) 488 stream_classification = self._classify_stream( 489 mappings_or_slices=iterable_for_detecting_format 490 ) 491 492 # Streams that override has_multiple_slices are explicitly indicating that they will iterate over 493 # multiple partitions. Inspecting slices to automatically apply the correct cursor is only needed as 494 # a backup. 
So if this value was already assigned to True by the stream, we don't need to reassign it 495 self.has_multiple_slices = ( 496 self.has_multiple_slices or stream_classification.has_multiple_slices 497 ) 498 499 cursor = self.get_cursor() 500 if cursor: 501 cursor.set_initial_state(stream_state=stream_state) 502 503 checkpoint_mode = self._checkpoint_mode 504 505 if cursor and stream_classification.is_legacy_format: 506 return LegacyCursorBasedCheckpointReader( 507 stream_slices=slices_iterable_copy, cursor=cursor, read_state_from_cursor=True 508 ) 509 elif cursor: 510 return CursorBasedCheckpointReader( 511 stream_slices=slices_iterable_copy, 512 cursor=cursor, 513 read_state_from_cursor=checkpoint_mode == CheckpointMode.RESUMABLE_FULL_REFRESH, 514 ) 515 elif checkpoint_mode == CheckpointMode.RESUMABLE_FULL_REFRESH: 516 # Resumable full refresh readers rely on the stream state dynamically being updated during pagination and does 517 # not iterate over a static set of slices. 518 return ResumableFullRefreshCheckpointReader(stream_state=stream_state) 519 elif checkpoint_mode == CheckpointMode.INCREMENTAL: 520 return IncrementalCheckpointReader( 521 stream_slices=slices_iterable_copy, stream_state=stream_state 522 ) 523 else: 524 return FullRefreshCheckpointReader(stream_slices=slices_iterable_copy) 525 526 @property 527 def _checkpoint_mode(self) -> CheckpointMode: 528 if self.is_resumable and len(self._wrapped_cursor_field()) > 0: 529 return CheckpointMode.INCREMENTAL 530 elif self.is_resumable: 531 return CheckpointMode.RESUMABLE_FULL_REFRESH 532 else: 533 return CheckpointMode.FULL_REFRESH 534 535 @staticmethod 536 def _classify_stream( 537 mappings_or_slices: Iterator[Optional[Union[Mapping[str, Any], StreamSlice]]], 538 ) -> StreamClassification: 539 """ 540 This is a bit of a crazy solution, but also the only way we can detect certain attributes about the stream since Python 541 streams do not follow consistent implementation patterns. We care about the following two attributes: 542 - is_substream: Helps to incrementally release changes since substreams w/ parents are much more complicated. Also 543 helps de-risk the release of changes that might impact all connectors 544 - uses_legacy_slice_format: Since the checkpoint reader must manage a complex state object, we opted to have it always 545 use the structured StreamSlice object. However, this requires backwards compatibility with Python sources that only 546 support the legacy mapping object 547 548 Both attributes can eventually be deprecated once stream's define this method deleted once substreams have been implemented and 549 legacy connectors all adhere to the StreamSlice object. 550 """ 551 if not mappings_or_slices: 552 raise ValueError("A stream should always have at least one slice") 553 try: 554 next_slice = next(mappings_or_slices) 555 if isinstance(next_slice, StreamSlice) and next_slice == StreamSlice( 556 partition={}, cursor_slice={} 557 ): 558 is_legacy_format = False 559 slice_has_value = False 560 elif next_slice == {}: 561 is_legacy_format = True 562 slice_has_value = False 563 elif isinstance(next_slice, StreamSlice): 564 is_legacy_format = False 565 slice_has_value = True 566 else: 567 is_legacy_format = True 568 slice_has_value = True 569 except StopIteration: 570 # If the stream has no slices, the format ultimately does not matter since no data will get synced. 
This is technically 571 # a valid case because it is up to the stream to define its slicing behavior 572 return StreamClassification(is_legacy_format=False, has_multiple_slices=False) 573 574 if slice_has_value: 575 # If the first slice contained a partition value from the result of stream_slices(), this is a substream that might 576 # have multiple parent records to iterate over 577 return StreamClassification( 578 is_legacy_format=is_legacy_format, has_multiple_slices=slice_has_value 579 ) 580 581 try: 582 # If stream_slices() returns multiple slices, this is also a substream that can potentially generate empty slices 583 next(mappings_or_slices) 584 return StreamClassification(is_legacy_format=is_legacy_format, has_multiple_slices=True) 585 except StopIteration: 586 # If the result of stream_slices() only returns a single empty stream slice, then we know this is a regular stream 587 return StreamClassification( 588 is_legacy_format=is_legacy_format, has_multiple_slices=False 589 ) 590 591 def log_stream_sync_configuration(self) -> None: 592 """ 593 Logs the configuration of this stream. 594 """ 595 self.logger.debug( 596 f"Syncing stream instance: {self.name}", 597 extra={ 598 "primary_key": self.primary_key, 599 "cursor_field": self.cursor_field, 600 }, 601 ) 602 603 @staticmethod 604 def _wrapped_primary_key( 605 keys: Optional[Union[str, List[str], List[List[str]]]], 606 ) -> Optional[List[List[str]]]: 607 """ 608 :return: wrap the primary_key property in a list of list of strings required by the Airbyte Stream object. 609 """ 610 if not keys: 611 return None 612 613 if isinstance(keys, str): 614 return [[keys]] 615 elif isinstance(keys, list): 616 wrapped_keys = [] 617 for component in keys: 618 if isinstance(component, str): 619 wrapped_keys.append([component]) 620 elif isinstance(component, list): 621 wrapped_keys.append(component) 622 else: 623 raise ValueError(f"Element must be either list or str. Got: {type(component)}") 624 return wrapped_keys 625 else: 626 raise ValueError(f"Element must be either list or str. Got: {type(keys)}") 627 628 def _observe_state( 629 self, checkpoint_reader: CheckpointReader, stream_state: Optional[Mapping[str, Any]] = None 630 ) -> None: 631 """ 632 Convenience method that attempts to read the Stream's state using the recommended way of connector's managing their 633 own state via state setter/getter. But if we get back an AttributeError, then the legacy Stream.get_updated_state() 634 method is used as a fallback method. 635 """ 636 637 # This is an inversion of the original logic that used to try state getter/setters first. As part of the work to 638 # automatically apply resumable full refresh to all streams, all HttpStream classes implement default state 639 # getter/setter methods, we should default to only using the incoming stream_state parameter value is {} which 640 # indicates the stream does not override the default get_updated_state() implementation. 
When the default method 641 # is not overridden, then the stream defers to self.state getter 642 if stream_state: 643 checkpoint_reader.observe(stream_state) 644 elif type(self).get_updated_state == Stream.get_updated_state: 645 # We only default to the state getter/setter if the stream does not use the legacy get_updated_state() method 646 try: 647 new_state = self.state # type: ignore # This will always exist on HttpStreams, but may not for Stream 648 if new_state: 649 checkpoint_reader.observe(new_state) 650 except AttributeError: 651 pass 652 653 def _checkpoint_state( # type: ignore # ignoring typing for ConnectorStateManager because of circular dependencies 654 self, 655 stream_state: Mapping[str, Any], 656 state_manager, 657 ) -> AirbyteMessage: 658 # todo: This can be consolidated into one ConnectorStateManager.update_and_create_state_message() method, but I want 659 # to reduce changes right now and this would span concurrent as well 660 state_manager.update_state_for_stream(self.name, self.namespace, stream_state) 661 return state_manager.create_state_message(self.name, self.namespace) # type: ignore [no-any-return] 662 663 @property 664 def configured_json_schema(self) -> Optional[Dict[str, Any]]: 665 """ 666 This property is set from the read method. 667 668 :return Optional[Dict]: JSON schema from configured catalog if provided, otherwise None. 669 """ 670 return self._configured_json_schema 671 672 @configured_json_schema.setter 673 def configured_json_schema(self, json_schema: Dict[str, Any]) -> None: 674 self._configured_json_schema = self._filter_schema_invalid_properties(json_schema) 675 676 def _filter_schema_invalid_properties( 677 self, configured_catalog_json_schema: Dict[str, Any] 678 ) -> Dict[str, Any]: 679 """ 680 Filters the properties in json_schema that are not present in the stream schema. 681 Configured Schemas can have very old fields, so we need to housekeeping ourselves. 682 """ 683 configured_schema: Any = configured_catalog_json_schema.get("properties", {}) 684 stream_schema_properties: Any = self.get_json_schema().get("properties", {}) 685 686 configured_keys = configured_schema.keys() 687 stream_keys = stream_schema_properties.keys() 688 invalid_properties = configured_keys - stream_keys 689 if not invalid_properties: 690 return configured_catalog_json_schema 691 692 self.logger.warning( 693 f"Stream {self.name}: the following fields are deprecated and cannot be synced. {invalid_properties}. Refresh the connection's source schema to resolve this warning." 694 ) 695 696 valid_configured_schema_properties_keys = stream_keys & configured_keys 697 valid_configured_schema_properties = {} 698 699 for configured_schema_property in valid_configured_schema_properties_keys: 700 valid_configured_schema_properties[configured_schema_property] = ( 701 stream_schema_properties[configured_schema_property] 702 ) 703 704 return {**configured_catalog_json_schema, "properties": valid_configured_schema_properties}
Base abstract class for an Airbyte Stream. Makes no assumptions about the Stream's underlying transport protocol.
139 @cached_property 140 def name(self) -> str: 141 """ 142 :return: Stream name. By default this is the implementing class name, but it can be overridden as needed. 143 """ 144 return casing.camel_to_snake(self.__class__.__name__)
Returns
Stream name. By default this is the implementing class name, but it can be overridden as needed.
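As an illustration (the class names are hypothetical; abstract members are omitted for brevity), the default conversion and an explicit override might look like this:

    from airbyte_cdk.sources.streams import Stream


    class EmployeeBenefits(Stream):
        # Default: self.name == "employee_benefits" (derived from the class name)
        ...


    class LegacyEmployees(Stream):
        # Shadow the derived name with a plain class attribute
        name = "employees_v1"
        ...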
146 def get_error_display_message(self, exception: BaseException) -> Optional[str]: 147 """ 148 Retrieves the user-friendly display message that corresponds to an exception. 149 This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage. 150 151 The default implementation of this method does not return user-friendly messages for any exception type, but it should be overriden as needed. 152 153 :param exception: The exception that was raised 154 :return: A user-friendly message that indicates the cause of the error 155 """ 156 return None
Retrieves the user-friendly display message that corresponds to an exception. This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage.
The default implementation of this method does not return user-friendly messages for any exception type, but it should be overridden as needed.
Parameters
- exception: The exception that was raised
Returns
A user-friendly message that indicates the cause of the error
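A hedged sketch of an override; the HTTP 403 check and the message text are illustrative, not from the source:

    from typing import Optional

    import requests

    from airbyte_cdk.sources.streams import Stream


    class MyStream(Stream):
        ...  # abstract members omitted for brevity

        def get_error_display_message(self, exception: BaseException) -> Optional[str]:
            if isinstance(exception, requests.exceptions.HTTPError) and exception.response is not None:
                if exception.response.status_code == 403:
                    return "The configured credentials are not permitted to read this stream."
            return None  # keep the default behavior for all other exceptions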
158 def read( # type: ignore # ignoring typing for ConnectorStateManager because of circular dependencies 159 self, 160 configured_stream: ConfiguredAirbyteStream, 161 logger: logging.Logger, 162 slice_logger: SliceLogger, 163 stream_state: MutableMapping[str, Any], 164 state_manager, 165 internal_config: InternalConfig, 166 ) -> Iterable[StreamData]: 167 sync_mode = configured_stream.sync_mode 168 cursor_field = configured_stream.cursor_field 169 self.configured_json_schema = configured_stream.stream.json_schema 170 171 # WARNING: When performing a read() that uses incoming stream state, we MUST use the self.state that is defined as 172 # opposed to the incoming stream_state value. Because some connectors like ones using the file-based CDK modify 173 # state before setting the value on the Stream attribute, the most up-to-date state is derived from Stream.state 174 # instead of the stream_state parameter. This does not apply to legacy connectors using get_updated_state(). 175 try: 176 stream_state = self.state # type: ignore # we know the field might not exist... 177 except AttributeError: 178 pass 179 180 should_checkpoint = bool(state_manager) 181 checkpoint_reader = self._get_checkpoint_reader( 182 logger=logger, cursor_field=cursor_field, sync_mode=sync_mode, stream_state=stream_state 183 ) 184 185 next_slice = checkpoint_reader.next() 186 record_counter = 0 187 stream_state_tracker = copy.deepcopy(stream_state) 188 while next_slice is not None: 189 if slice_logger.should_log_slice_message(logger): 190 yield slice_logger.create_slice_log_message(next_slice) 191 records = self.read_records( 192 sync_mode=sync_mode, # todo: change this interface to no longer rely on sync_mode for behavior 193 stream_slice=next_slice, 194 stream_state=stream_state, 195 cursor_field=cursor_field or None, 196 ) 197 for record_data_or_message in records: 198 yield record_data_or_message 199 if isinstance(record_data_or_message, Mapping) or ( 200 hasattr(record_data_or_message, "type") 201 and record_data_or_message.type == MessageType.RECORD 202 ): 203 record_data = ( 204 record_data_or_message 205 if isinstance(record_data_or_message, Mapping) 206 else record_data_or_message.record 207 ) 208 209 # Thanks I hate it. RFR fundamentally doesn't fit with the concept of the legacy Stream.get_updated_state() 210 # method because RFR streams rely on pagination as a cursor. Stream.get_updated_state() was designed to make 211 # the CDK manage state using specifically the last seen record. don't @ brian.lai 212 # 213 # Also, because the legacy incremental state case decouples observing incoming records from emitting state, it 214 # requires that we separate CheckpointReader.observe() and CheckpointReader.get_checkpoint() which could 215 # otherwise be combined. 216 if self.cursor_field: 217 # Some connectors have streams that implement get_updated_state(), but do not define a cursor_field. 
This 218 # should be fixed on the stream implementation, but we should also protect against this in the CDK as well 219 stream_state_tracker = self.get_updated_state( 220 stream_state_tracker, 221 record_data, # type: ignore [arg-type] 222 ) 223 self._observe_state(checkpoint_reader, stream_state_tracker) 224 record_counter += 1 225 226 checkpoint_interval = self.state_checkpoint_interval 227 if ( 228 should_checkpoint 229 and checkpoint_interval 230 and record_counter % checkpoint_interval == 0 231 ): 232 checkpoint = checkpoint_reader.get_checkpoint() 233 if checkpoint: 234 airbyte_state_message = self._checkpoint_state( 235 checkpoint, state_manager=state_manager 236 ) 237 yield airbyte_state_message 238 239 if internal_config.is_limit_reached(record_counter): 240 break 241 self._observe_state(checkpoint_reader) 242 checkpoint_state = checkpoint_reader.get_checkpoint() 243 if should_checkpoint and checkpoint_state is not None: 244 airbyte_state_message = self._checkpoint_state( 245 checkpoint_state, state_manager=state_manager 246 ) 247 yield airbyte_state_message 248 249 next_slice = checkpoint_reader.next() 250 251 checkpoint = checkpoint_reader.get_checkpoint() 252 if should_checkpoint and checkpoint is not None: 253 airbyte_state_message = self._checkpoint_state(checkpoint, state_manager=state_manager) 254 yield airbyte_state_message
256 def read_only_records(self, state: Optional[Mapping[str, Any]] = None) -> Iterable[StreamData]: 257 """ 258 Helper method that performs a read on a stream with an optional state and emits records. If the parent stream supports 259 incremental, this operation does not update the stream's internal state (if it uses the modern state setter/getter) 260 or emit state messages. 261 """ 262 263 configured_stream = ConfiguredAirbyteStream( 264 stream=AirbyteStream( 265 name=self.name, 266 json_schema={}, 267 supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], 268 ), 269 sync_mode=SyncMode.incremental if state else SyncMode.full_refresh, 270 destination_sync_mode=DestinationSyncMode.append, 271 ) 272 273 yield from self.read( 274 configured_stream=configured_stream, 275 logger=self.logger, 276 slice_logger=DebugSliceLogger(), 277 stream_state=dict(state) 278 if state 279 else {}, # read() expects MutableMapping instead of Mapping which is used more often 280 state_manager=None, 281 internal_config=InternalConfig(), # type: ignore [call-arg] 282 )
Helper method that performs a read on a stream with an optional state and emits records. If the parent stream supports incremental, this operation does not update the stream's internal state (if it uses the modern state setter/getter) or emit state messages.
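A minimal usage sketch; my_stream is assumed to be a fully implemented Stream instance:

    # Passing a state mapping triggers an incremental read; omitting it triggers a full refresh.
    for record_or_message in my_stream.read_only_records(state={"updated_at": "2024-01-01"}):
        print(record_or_message)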
284 @abstractmethod 285 def read_records( 286 self, 287 sync_mode: SyncMode, 288 cursor_field: Optional[List[str]] = None, 289 stream_slice: Optional[Mapping[str, Any]] = None, 290 stream_state: Optional[Mapping[str, Any]] = None, 291 ) -> Iterable[StreamData]: 292 """ 293 This method should be overridden by subclasses to read records based on the inputs 294 """
This method should be overridden by subclasses to read records based on the inputs
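A minimal implementation sketch with hardcoded records; a real connector would call its API here:

    from typing import Any, Iterable, List, Mapping, Optional

    from airbyte_cdk.models import SyncMode
    from airbyte_cdk.sources.streams import Stream


    class Colors(Stream):
        primary_key = None

        def read_records(
            self,
            sync_mode: SyncMode,
            cursor_field: Optional[List[str]] = None,
            stream_slice: Optional[Mapping[str, Any]] = None,
            stream_state: Optional[Mapping[str, Any]] = None,
        ) -> Iterable[Mapping[str, Any]]:
            yield {"name": "red"}
            yield {"name": "blue"}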
296 @lru_cache(maxsize=None) 297 def get_json_schema(self) -> Mapping[str, Any]: 298 """ 299 :return: A dict of the JSON schema representing this stream. 300 301 The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property. 302 Override as needed. 303 """ 304 # TODO show an example of using pydantic to define the JSON schema, or reading an OpenAPI spec 305 return ResourceSchemaLoader(package_name_from_class(self.__class__)).get_schema(self.name)
Returns
A dict of the JSON schema representing this stream.
The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property. Override as needed.
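Alternatively, a stream can override this method and return the schema directly. A hedged sketch (the field names are illustrative):

    from typing import Any, Mapping

    from airbyte_cdk.sources.streams import Stream


    class Customers(Stream):
        ...  # abstract members omitted for brevity

        def get_json_schema(self) -> Mapping[str, Any]:
            return {
                "$schema": "http://json-schema.org/draft-07/schema#",
                "type": "object",
                "properties": {
                    "id": {"type": "integer"},
                    "updated_at": {"type": "string", "format": "date-time"},
                },
            }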
307 def as_airbyte_stream(self) -> AirbyteStream: 308 stream = AirbyteStream( 309 name=self.name, 310 json_schema=dict(self.get_json_schema()), 311 supported_sync_modes=[SyncMode.full_refresh], 312 is_resumable=self.is_resumable, 313 ) 314 315 if self.namespace: 316 stream.namespace = self.namespace 317 318 # If we can offer incremental we always should. RFR is always less reliable than incremental which uses a real cursor value 319 if self.supports_incremental: 320 stream.source_defined_cursor = self.source_defined_cursor 321 stream.supported_sync_modes.append(SyncMode.incremental) 322 stream.default_cursor_field = self._wrapped_cursor_field() 323 324 keys = Stream._wrapped_primary_key(self.primary_key) 325 if keys and len(keys) > 0: 326 stream.source_defined_primary_key = keys 327 328 return stream
330 @property 331 def supports_incremental(self) -> bool: 332 """ 333 :return: True if this stream supports incrementally reading data 334 """ 335 return len(self._wrapped_cursor_field()) > 0
Returns
True if this stream supports incrementally reading data
337 @property 338 def is_resumable(self) -> bool: 339 """ 340 :return: True if this stream allows the checkpointing of sync progress and can resume from it on subsequent attempts. 341 This differs from supports_incremental because certain kinds of streams like those supporting resumable full refresh 342 can checkpoint progress in between attempts for improved fault tolerance. However, they will start from the beginning 343 on the next sync job. 344 """ 345 if self.supports_incremental: 346 return True 347 if self.has_multiple_slices: 348 # We temporarily gate substream to not support RFR because puts a pretty high burden on connector developers 349 # to structure stream state in a very specific way. We also can't check for issubclass(HttpSubStream) because 350 # not all substreams implement the interface and it would be a circular dependency so we use parent as a surrogate 351 return False 352 elif hasattr(type(self), "state") and getattr(type(self), "state").fset is not None: 353 # Modern case where a stream manages state using getter/setter 354 return True 355 else: 356 # Legacy case where the CDK manages state via the get_updated_state() method. This is determined by checking if 357 # the stream's get_updated_state() differs from the Stream class and therefore has been overridden 358 return type(self).get_updated_state != Stream.get_updated_state
Returns
True if this stream allows the checkpointing of sync progress and can resume from it on subsequent attempts. This differs from supports_incremental because certain kinds of streams like those supporting resumable full refresh can checkpoint progress in between attempts for improved fault tolerance. However, they will start from the beginning on the next sync job.
363 @property 364 def cursor_field(self) -> Union[str, List[str]]: 365 """ 366 Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field. 367 :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor. 368 """ 369 return []
Override to return the default cursor field used by this stream, e.g. an API entity might always use created_at as the cursor field.
Returns
The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
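Illustrative overrides (the field names are assumptions): a flat cursor returns a string, while a nested cursor returns the path to the field:

    from typing import List, Union

    from airbyte_cdk.sources.streams import Stream


    class Invoices(Stream):
        ...  # abstract members omitted for brevity

        @property
        def cursor_field(self) -> Union[str, List[str]]:
            return "updated_at"
            # For a cursor nested under "metadata", return ["metadata", "updated_at"] instead.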
371 @property 372 def namespace(self) -> Optional[str]: 373 """ 374 Override to return the namespace of this stream, e.g. the Postgres schema which this stream will emit records for. 375 :return: A string containing the name of the namespace. 376 """ 377 return None
Override to return the namespace of this stream, e.g. the Postgres schema which this stream will emit records for.
Returns
A string containing the name of the namespace.
379 @property 380 def source_defined_cursor(self) -> bool: 381 """ 382 Return False if the cursor can be configured by the user. 383 """ 384 return True
Return False if the cursor can be configured by the user.
386 @property 387 def exit_on_rate_limit(self) -> bool: 388 """Exit on rate limit getter, should return bool value. False if the stream will retry endlessly when rate limited.""" 389 return self._exit_on_rate_limit
Getter for exit_on_rate_limit; returns a bool. If False, the stream will retry endlessly when rate limited.
396 @property 397 @abstractmethod 398 def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: 399 """ 400 :return: string if single primary key, list of strings if composite primary key, list of list of strings if composite primary key consisting of nested fields. 401 If the stream has no primary keys, return None. 402 """
Returns
A string for a single primary key, a list of strings for a composite primary key, or a list of lists of strings for a composite primary key consisting of nested fields. If the stream has no primary key, return None.
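Illustrative values for each of the shapes described above (class and field names are assumptions; abstract members omitted):

    from airbyte_cdk.sources.streams import Stream


    class Users(Stream):
        primary_key = "id"  # single primary key
        ...


    class OrderLines(Stream):
        primary_key = ["order_id", "line_number"]  # composite primary key
        ...


    class Events(Stream):
        primary_key = [["payload", "id"], ["payload", "occurred_at"]]  # composite key of nested fields
        ...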
404 def stream_slices( 405 self, 406 *, 407 sync_mode: SyncMode, 408 cursor_field: Optional[List[str]] = None, 409 stream_state: Optional[Mapping[str, Any]] = None, 410 ) -> Iterable[Optional[Mapping[str, Any]]]: 411 """ 412 Override to define the slices for this stream. See the stream slicing section of the docs for more information. 413 414 :param sync_mode: 415 :param cursor_field: 416 :param stream_state: 417 :return: 418 """ 419 yield StreamSlice(partition={}, cursor_slice={})
Override to define the slices for this stream. See the stream slicing section of the docs for more information.
Parameters
- sync_mode: The sync mode (full refresh or incremental) used for this read
- cursor_field: The cursor field in use, if any
- stream_state: The stream's current state, used to resume slicing from a previous sync
Returns
An iterable of stream slices; each slice is a mapping of slice values (or None)
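A hedged sketch of a date-partitioned override; the month boundaries and the state key are illustrative:

    from typing import Any, Iterable, List, Mapping, Optional

    from airbyte_cdk.models import SyncMode
    from airbyte_cdk.sources.streams import Stream


    class DailyReports(Stream):
        ...  # abstract members omitted for brevity

        def stream_slices(
            self,
            *,
            sync_mode: SyncMode,
            cursor_field: Optional[List[str]] = None,
            stream_state: Optional[Mapping[str, Any]] = None,
        ) -> Iterable[Optional[Mapping[str, Any]]]:
            start = (stream_state or {}).get("date", "2024-01-01")
            for month_start in ("2024-01-01", "2024-02-01", "2024-03-01"):
                if month_start >= start:
                    yield {"start_date": month_start}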
421 @property 422 def state_checkpoint_interval(self) -> Optional[int]: 423 """ 424 Decides how often to checkpoint state (i.e: emit a STATE message). E.g: if this returns a value of 100, then state is persisted after reading 425 100 records, then 200, 300, etc.. A good default value is 1000 although your mileage may vary depending on the underlying data source. 426 427 Checkpointing a stream avoids re-reading records in the case a sync is failed or cancelled. 428 429 return None if state should not be checkpointed e.g: because records returned from the underlying data source are not returned in 430 ascending order with respect to the cursor field. This can happen if the source does not support reading records in ascending order of 431 created_at date (or whatever the cursor is). In those cases, state must only be saved once the full stream has been read. 432 """ 433 return None
Decides how often to checkpoint state (i.e. emit a STATE message). For example, if this returns 100, then state is persisted after reading 100 records, then 200, 300, and so on. A good default value is 1000, although your mileage may vary depending on the underlying data source.
Checkpointing a stream avoids re-reading records in case a sync fails or is cancelled.
Return None if state should not be checkpointed, e.g. because records returned from the underlying data source are not returned in ascending order with respect to the cursor field. This can happen if the source does not support reading records in ascending order of created_at date (or whatever the cursor is). In those cases, state must only be saved once the full stream has been read.
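An illustrative override using the suggested default above:

    from typing import Optional

    from airbyte_cdk.sources.streams import Stream


    class Transactions(Stream):
        ...  # abstract members omitted for brevity

        @property
        def state_checkpoint_interval(self) -> Optional[int]:
            return 1000  # emit a STATE message every 1000 records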
441 def get_updated_state( 442 self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any] 443 ) -> MutableMapping[str, Any]: 444 """DEPRECATED. Please use explicit state property instead, see `IncrementalMixin` docs. 445 446 Override to extract state from the latest record. Needed to implement incremental sync. 447 448 Inspects the latest record extracted from the data source and the current state object and return an updated state object. 449 450 For example: if the state object is based on created_at timestamp, and the current state is {'created_at': 10}, and the latest_record is 451 {'name': 'octavia', 'created_at': 20 } then this method would return {'created_at': 20} to indicate state should be updated to this object. 452 453 :param current_stream_state: The stream's current state object 454 :param latest_record: The latest record extracted from the stream 455 :return: An updated state object 456 """ 457 return {}
DEPRECATED. Please use the explicit state property instead; see the IncrementalMixin docs.
Override to extract state from the latest record. Needed to implement incremental sync.
Inspects the latest record extracted from the data source and the current state object, and returns an updated state object.
For example: if the state object is based on a created_at timestamp, the current state is {'created_at': 10}, and the latest_record is {'name': 'octavia', 'created_at': 20}, then this method would return {'created_at': 20} to indicate that state should be updated to this object.
Parameters
- current_stream_state: The stream's current state object
- latest_record: The latest record extracted from the stream
Returns
An updated state object
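A sketch of the created_at example from the docstring above (legacy pattern; new streams should prefer the state property):

    from typing import Any, Mapping, MutableMapping

    from airbyte_cdk.sources.streams import Stream


    class LegacyIncrementalStream(Stream):
        cursor_field = "created_at"
        ...  # abstract members omitted for brevity

        def get_updated_state(
            self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
        ) -> MutableMapping[str, Any]:
            latest_cursor = latest_record.get(self.cursor_field, 0)
            current_cursor = current_stream_state.get(self.cursor_field, 0)
            return {self.cursor_field: max(latest_cursor, current_cursor)}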
459 def get_cursor(self) -> Optional[Cursor]: 460 """ 461 A Cursor is an interface that a stream can implement to manage how its internal state is read and updated while 462 reading records. Historically, Python connectors had no concept of a cursor to manage state. Python streams need 463 to define a cursor implementation and override this method to manage state through a Cursor. 464 """ 465 return self.cursor
A Cursor is an interface that a stream can implement to manage how its internal state is read and updated while reading records. Historically, Python connectors had no concept of a cursor to manage state. Python streams need to define a cursor implementation and override this method to manage state through a Cursor.
591 def log_stream_sync_configuration(self) -> None: 592 """ 593 Logs the configuration of this stream. 594 """ 595 self.logger.debug( 596 f"Syncing stream instance: {self.name}", 597 extra={ 598 "primary_key": self.primary_key, 599 "cursor_field": self.cursor_field, 600 }, 601 )
Logs the configuration of this stream.
663 @property 664 def configured_json_schema(self) -> Optional[Dict[str, Any]]: 665 """ 666 This property is set from the read method. 667 668 :return Optional[Dict]: JSON schema from configured catalog if provided, otherwise None. 669 """ 670 return self._configured_json_schema
This property is set from the read method.
Returns
JSON schema from configured catalog if provided, otherwise None.
52def package_name_from_class(cls: object) -> str: 53 """Find the package name given a class name""" 54 module = inspect.getmodule(cls) 55 if module is not None: 56 return module.__name__.split(".")[0] 57 else: 58 raise ValueError(f"Could not find package name for class {cls}")
Find the package name given a class name
26class AirbyteTracedException(Exception): 27 """ 28 An exception that should be emitted as an AirbyteTraceMessage 29 """ 30 31 def __init__( 32 self, 33 internal_message: Optional[str] = None, 34 message: Optional[str] = None, 35 failure_type: FailureType = FailureType.system_error, 36 exception: Optional[BaseException] = None, 37 stream_descriptor: Optional[StreamDescriptor] = None, 38 ): 39 """ 40 :param internal_message: the internal error that caused the failure 41 :param message: a user-friendly message that indicates the cause of the error 42 :param failure_type: the type of error 43 :param exception: the exception that caused the error, from which the stack trace should be retrieved 44 :param stream_descriptor: describe the stream from which the exception comes from 45 """ 46 self.internal_message = internal_message 47 self.message = message 48 self.failure_type = failure_type 49 self._exception = exception 50 self._stream_descriptor = stream_descriptor 51 super().__init__(internal_message) 52 53 def as_airbyte_message( 54 self, stream_descriptor: Optional[StreamDescriptor] = None 55 ) -> AirbyteMessage: 56 """ 57 Builds an AirbyteTraceMessage from the exception 58 59 :param stream_descriptor is deprecated, please use the stream_description in `__init__ or `from_exception`. If many 60 stream_descriptors are defined, the one from `as_airbyte_message` will be discarded. 61 """ 62 now_millis = time.time_ns() // 1_000_000 63 64 trace_exc = self._exception or self 65 stack_trace_str = "".join(traceback.TracebackException.from_exception(trace_exc).format()) 66 67 trace_message = AirbyteTraceMessage( 68 type=TraceType.ERROR, 69 emitted_at=now_millis, 70 error=AirbyteErrorTraceMessage( 71 message=self.message 72 or "Something went wrong in the connector. See the logs for more details.", 73 internal_message=self.internal_message, 74 failure_type=self.failure_type, 75 stack_trace=stack_trace_str, 76 stream_descriptor=self._stream_descriptor 77 if self._stream_descriptor is not None 78 else stream_descriptor, 79 ), 80 ) 81 82 return AirbyteMessage(type=MessageType.TRACE, trace=trace_message) 83 84 def as_connection_status_message(self) -> Optional[AirbyteMessage]: 85 if self.failure_type == FailureType.config_error: 86 return AirbyteMessage( 87 type=MessageType.CONNECTION_STATUS, 88 connectionStatus=AirbyteConnectionStatus( 89 status=Status.FAILED, message=self.message 90 ), 91 ) 92 return None 93 94 def emit_message(self) -> None: 95 """ 96 Prints the exception as an AirbyteTraceMessage. 97 Note that this will be called automatically on uncaught exceptions when using the airbyte_cdk entrypoint. 
98 """ 99 message = orjson.dumps(AirbyteMessageSerializer.dump(self.as_airbyte_message())).decode() 100 filtered_message = filter_secrets(message) 101 print(filtered_message) 102 103 @classmethod 104 def from_exception( 105 cls, 106 exc: BaseException, 107 stream_descriptor: Optional[StreamDescriptor] = None, 108 *args: Any, 109 **kwargs: Any, 110 ) -> "AirbyteTracedException": 111 """ 112 Helper to create an AirbyteTracedException from an existing exception 113 :param exc: the exception that caused the error 114 :param stream_descriptor: describe the stream from which the exception comes from 115 """ 116 return cls( 117 internal_message=str(exc), 118 exception=exc, 119 stream_descriptor=stream_descriptor, 120 *args, 121 **kwargs, 122 ) # type: ignore # ignoring because of args and kwargs 123 124 def as_sanitized_airbyte_message( 125 self, stream_descriptor: Optional[StreamDescriptor] = None 126 ) -> AirbyteMessage: 127 """ 128 Builds an AirbyteTraceMessage from the exception and sanitizes any secrets from the message body 129 130 :param stream_descriptor is deprecated, please use the stream_description in `__init__ or `from_exception`. If many 131 stream_descriptors are defined, the one from `as_sanitized_airbyte_message` will be discarded. 132 """ 133 error_message = self.as_airbyte_message(stream_descriptor=stream_descriptor) 134 if error_message.trace.error.message: # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 135 error_message.trace.error.message = filter_secrets( # type: ignore[union-attr] 136 error_message.trace.error.message, # type: ignore[union-attr] 137 ) 138 if error_message.trace.error.internal_message: # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 139 error_message.trace.error.internal_message = filter_secrets( # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 140 error_message.trace.error.internal_message # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 141 ) 142 if error_message.trace.error.stack_trace: # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 143 error_message.trace.error.stack_trace = filter_secrets( # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 144 error_message.trace.error.stack_trace # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 145 ) 146 return error_message
An exception that should be emitted as an AirbyteTraceMessage
31 def __init__( 32 self, 33 internal_message: Optional[str] = None, 34 message: Optional[str] = None, 35 failure_type: FailureType = FailureType.system_error, 36 exception: Optional[BaseException] = None, 37 stream_descriptor: Optional[StreamDescriptor] = None, 38 ): 39 """ 40 :param internal_message: the internal error that caused the failure 41 :param message: a user-friendly message that indicates the cause of the error 42 :param failure_type: the type of error 43 :param exception: the exception that caused the error, from which the stack trace should be retrieved 44 :param stream_descriptor: describe the stream from which the exception comes from 45 """ 46 self.internal_message = internal_message 47 self.message = message 48 self.failure_type = failure_type 49 self._exception = exception 50 self._stream_descriptor = stream_descriptor 51 super().__init__(internal_message)
Parameters
- internal_message: the internal error that caused the failure
- message: a user-friendly message that indicates the cause of the error
- failure_type: the type of error
- exception: the exception that caused the error, from which the stack trace should be retrieved
- stream_descriptor: describe the stream from which the exception comes from
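A minimal usage sketch, assuming the usual import path airbyte_cdk.utils.traced_exception; the endpoint and message text are illustrative:

    from airbyte_cdk.models import FailureType
    from airbyte_cdk.utils.traced_exception import AirbyteTracedException

    raise AirbyteTracedException(
        internal_message="HTTP 401 returned by /v1/customers",
        message="The API key is invalid. Please re-enter your credentials.",
        failure_type=FailureType.config_error,
    )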
53 def as_airbyte_message( 54 self, stream_descriptor: Optional[StreamDescriptor] = None 55 ) -> AirbyteMessage: 56 """ 57 Builds an AirbyteTraceMessage from the exception 58 59 :param stream_descriptor is deprecated, please use the stream_description in `__init__ or `from_exception`. If many 60 stream_descriptors are defined, the one from `as_airbyte_message` will be discarded. 61 """ 62 now_millis = time.time_ns() // 1_000_000 63 64 trace_exc = self._exception or self 65 stack_trace_str = "".join(traceback.TracebackException.from_exception(trace_exc).format()) 66 67 trace_message = AirbyteTraceMessage( 68 type=TraceType.ERROR, 69 emitted_at=now_millis, 70 error=AirbyteErrorTraceMessage( 71 message=self.message 72 or "Something went wrong in the connector. See the logs for more details.", 73 internal_message=self.internal_message, 74 failure_type=self.failure_type, 75 stack_trace=stack_trace_str, 76 stream_descriptor=self._stream_descriptor 77 if self._stream_descriptor is not None 78 else stream_descriptor, 79 ), 80 ) 81 82 return AirbyteMessage(type=MessageType.TRACE, trace=trace_message)
Builds an AirbyteTraceMessage from the exception
The stream_descriptor parameter is deprecated; please use the stream_descriptor passed to __init__ or from_exception. If more than one stream descriptor is defined, the one passed to as_airbyte_message is discarded.
84 def as_connection_status_message(self) -> Optional[AirbyteMessage]: 85 if self.failure_type == FailureType.config_error: 86 return AirbyteMessage( 87 type=MessageType.CONNECTION_STATUS, 88 connectionStatus=AirbyteConnectionStatus( 89 status=Status.FAILED, message=self.message 90 ), 91 ) 92 return None
94 def emit_message(self) -> None: 95 """ 96 Prints the exception as an AirbyteTraceMessage. 97 Note that this will be called automatically on uncaught exceptions when using the airbyte_cdk entrypoint. 98 """ 99 message = orjson.dumps(AirbyteMessageSerializer.dump(self.as_airbyte_message())).decode() 100 filtered_message = filter_secrets(message) 101 print(filtered_message)
Prints the exception as an AirbyteTraceMessage. Note that this will be called automatically on uncaught exceptions when using the airbyte_cdk entrypoint.
103 @classmethod 104 def from_exception( 105 cls, 106 exc: BaseException, 107 stream_descriptor: Optional[StreamDescriptor] = None, 108 *args: Any, 109 **kwargs: Any, 110 ) -> "AirbyteTracedException": 111 """ 112 Helper to create an AirbyteTracedException from an existing exception 113 :param exc: the exception that caused the error 114 :param stream_descriptor: describe the stream from which the exception comes from 115 """ 116 return cls( 117 internal_message=str(exc), 118 exception=exc, 119 stream_descriptor=stream_descriptor, 120 *args, 121 **kwargs, 122 ) # type: ignore # ignoring because of args and kwargs
Helper to create an AirbyteTracedException from an existing exception
Parameters
- exc: the exception that caused the error
- stream_descriptor: describes the stream from which the exception comes
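For example, a sketch that wraps a parsing error and attaches an illustrative stream descriptor; both import paths are assumptions:

```python
import json

from airbyte_cdk.models import StreamDescriptor  # assumed import path
from airbyte_cdk.utils.traced_exception import AirbyteTracedException  # assumed import path

raw_body = '{"not valid json'  # illustrative malformed payload

try:
    json.loads(raw_body)
except ValueError as exc:
    traced = AirbyteTracedException.from_exception(
        exc,
        stream_descriptor=StreamDescriptor(name="users"),  # "users" is illustrative
    )
    traced.emit_message()  # prints the secret-filtered AirbyteTraceMessage to stdout
```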
124 def as_sanitized_airbyte_message( 125 self, stream_descriptor: Optional[StreamDescriptor] = None 126 ) -> AirbyteMessage: 127 """ 128 Builds an AirbyteTraceMessage from the exception and sanitizes any secrets from the message body 129 130 :param stream_descriptor is deprecated, please use the stream_description in `__init__ or `from_exception`. If many 131 stream_descriptors are defined, the one from `as_sanitized_airbyte_message` will be discarded. 132 """ 133 error_message = self.as_airbyte_message(stream_descriptor=stream_descriptor) 134 if error_message.trace.error.message: # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 135 error_message.trace.error.message = filter_secrets( # type: ignore[union-attr] 136 error_message.trace.error.message, # type: ignore[union-attr] 137 ) 138 if error_message.trace.error.internal_message: # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 139 error_message.trace.error.internal_message = filter_secrets( # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 140 error_message.trace.error.internal_message # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 141 ) 142 if error_message.trace.error.stack_trace: # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 143 error_message.trace.error.stack_trace = filter_secrets( # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 144 error_message.trace.error.stack_trace # type: ignore[union-attr] # AirbyteMessage with MessageType.TRACE has AirbyteTraceMessage 145 ) 146 return error_message
Builds an AirbyteTraceMessage from the exception and sanitizes any secrets from the message body
The stream_descriptor parameter is deprecated; please use the stream_descriptor passed to __init__ or from_exception. If more than one stream descriptor is defined, the one passed to as_sanitized_airbyte_message is discarded.
11def is_cloud_environment() -> bool: 12 """ 13 Returns True if the connector is running in a cloud environment, False otherwise. 14 15 The function checks the value of the DEPLOYMENT_MODE environment variable which is set by the platform. 16 This function can be used to determine whether stricter security measures should be applied. 17 """ 18 deployment_mode = os.environ.get("DEPLOYMENT_MODE", "") 19 return deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE
Returns True if the connector is running in a cloud environment, False otherwise.
The function checks the value of the DEPLOYMENT_MODE environment variable which is set by the platform. This function can be used to determine whether stricter security measures should be applied.
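As a sketch, a connector might branch on this at runtime. The import location of is_cloud_environment and the exact DEPLOYMENT_MODE value are assumptions here:

```python
import os

from airbyte_cdk.utils import is_cloud_environment  # assumed import path

# The platform normally sets DEPLOYMENT_MODE; this assignment only simulates it locally
# and assumes "CLOUD" is the value used for cloud deployments.
os.environ["DEPLOYMENT_MODE"] = "CLOUD"

if is_cloud_environment():
    # e.g. refuse plaintext HTTP endpoints, enforce TLS certificate checks, etc.
    strict_security = True
else:
    strict_security = False
print(strict_security)
```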
190class InternalConfig(BaseModel): 191 KEYWORDS: ClassVar[set[str]] = {"_limit", "_page_size"} 192 limit: int = Field(None, alias="_limit") 193 page_size: int = Field(None, alias="_page_size") 194 195 def dict(self, *args: Any, **kwargs: Any) -> dict[str, Any]: 196 kwargs["by_alias"] = True 197 kwargs["exclude_unset"] = True 198 return super().dict(*args, **kwargs) 199 200 def is_limit_reached(self, records_counter: int) -> bool: 201 """ 202 Check if record count reached limit set by internal config. 203 :param records_counter - number of records already red 204 :return True if limit reached, False otherwise 205 """ 206 if self.limit: 207 if records_counter >= self.limit: 208 return True 209 return False
195 def dict(self, *args: Any, **kwargs: Any) -> dict[str, Any]: 196 kwargs["by_alias"] = True 197 kwargs["exclude_unset"] = True 198 return super().dict(*args, **kwargs)
Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.
200 def is_limit_reached(self, records_counter: int) -> bool: 201 """ 202 Check if record count reached limit set by internal config. 203 :param records_counter - number of records already red 204 :return True if limit reached, False otherwise 205 """ 206 if self.limit: 207 if records_counter >= self.limit: 208 return True 209 return False
Check if the record count has reached the limit set by the internal config. :param records_counter - number of records already read :return True if the limit is reached, False otherwise
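A sketch of how the _limit keyword might be honoured in a read loop; the import path is assumed:

```python
from airbyte_cdk.sources.utils.schema_helpers import InternalConfig  # assumed import path

internal_config = InternalConfig.parse_obj({"_limit": 3})

records_read = 0
for record in ({"id": i} for i in range(10)):  # stand-in record source
    records_read += 1
    if internal_config.is_limit_reached(records_read):
        break

print(records_read)  # 3
```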
116class ResourceSchemaLoader: 117 """JSONSchema loader from package resources""" 118 119 def __init__(self, package_name: str): 120 self.package_name = package_name 121 122 def get_schema(self, name: str) -> dict[str, Any]: 123 """ 124 This method retrieves a JSON schema from the schemas/ folder. 125 126 127 The expected file structure is to have all top-level schemas (corresponding to streams) in the "schemas/" folder, with any shared $refs 128 living inside the "schemas/shared/" folder. For example: 129 130 schemas/shared/<shared_definition>.json 131 schemas/<name>.json # contains a $ref to shared_definition 132 schemas/<name2>.json # contains a $ref to shared_definition 133 """ 134 135 schema_filename = f"schemas/{name}.json" 136 raw_file = pkgutil.get_data(self.package_name, schema_filename) 137 if not raw_file: 138 raise IOError(f"Cannot find file {schema_filename}") 139 try: 140 raw_schema = json.loads(raw_file) 141 except ValueError as err: 142 raise RuntimeError(f"Invalid JSON file format for file {schema_filename}") from err 143 144 return self._resolve_schema_references(raw_schema) 145 146 def _resolve_schema_references(self, raw_schema: dict[str, Any]) -> dict[str, Any]: 147 """ 148 Resolve links to external references and move it to local "definitions" map. 149 150 :param raw_schema jsonschema to lookup for external links. 151 :return JSON serializable object with references without external dependencies. 152 """ 153 154 package = importlib.import_module(self.package_name) 155 if package.__file__: 156 base = os.path.dirname(package.__file__) + "/" 157 else: 158 raise ValueError(f"Package {package} does not have a valid __file__ field") 159 resolved = jsonref.JsonRef.replace_refs( 160 raw_schema, loader=JsonFileLoader(base, "schemas/shared"), base_uri=base 161 ) 162 resolved = resolve_ref_links(resolved) 163 if isinstance(resolved, dict): 164 return resolved 165 else: 166 raise ValueError(f"Expected resolved to be a dict. Got {resolved}")
JSONSchema loader from package resources
122 def get_schema(self, name: str) -> dict[str, Any]: 123 """ 124 This method retrieves a JSON schema from the schemas/ folder. 125 126 127 The expected file structure is to have all top-level schemas (corresponding to streams) in the "schemas/" folder, with any shared $refs 128 living inside the "schemas/shared/" folder. For example: 129 130 schemas/shared/<shared_definition>.json 131 schemas/<name>.json # contains a $ref to shared_definition 132 schemas/<name2>.json # contains a $ref to shared_definition 133 """ 134 135 schema_filename = f"schemas/{name}.json" 136 raw_file = pkgutil.get_data(self.package_name, schema_filename) 137 if not raw_file: 138 raise IOError(f"Cannot find file {schema_filename}") 139 try: 140 raw_schema = json.loads(raw_file) 141 except ValueError as err: 142 raise RuntimeError(f"Invalid JSON file format for file {schema_filename}") from err 143 144 return self._resolve_schema_references(raw_schema)
This method retrieves a JSON schema from the schemas/ folder.
The expected file structure is to have all top-level schemas (corresponding to streams) in the "schemas/" folder, with any shared $refs living inside the "schemas/shared/" folder. For example:
schemas/shared/&lt;shared_definition&gt;.json
schemas/&lt;name&gt;.json   # contains a $ref to shared_definition
schemas/&lt;name2&gt;.json  # contains a $ref to shared_definition
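For example, a sketch of loading a bundled stream schema; the package and stream names are illustrative and the import path is assumed:

```python
from airbyte_cdk.sources.utils.schema_helpers import ResourceSchemaLoader  # assumed import path

loader = ResourceSchemaLoader(package_name="source_example")
# Reads source_example/schemas/users.json and inlines any $refs from schemas/shared/.
users_schema = loader.get_schema("users")
print(users_schema["type"])
```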
169def check_config_against_spec_or_exit( 170 config: Mapping[str, Any], spec: ConnectorSpecification 171) -> None: 172 """ 173 Check config object against spec. In case of spec is invalid, throws 174 an exception with validation error description. 175 176 :param config - config loaded from file specified over command line 177 :param spec - spec object generated by connector 178 """ 179 spec_schema = spec.connectionSpecification 180 try: 181 validate(instance=config, schema=spec_schema) 182 except ValidationError as validation_error: 183 raise AirbyteTracedException( 184 message="Config validation error: " + validation_error.message, 185 internal_message=validation_error.message, 186 failure_type=FailureType.config_error, 187 ) from None # required to prevent logging config secrets from the ValidationError's stacktrace
Check the config object against the spec. If the config does not conform to the spec, an exception describing the validation error is raised.
:param config - config loaded from the file specified on the command line :param spec - spec object generated by the connector
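For example, a sketch that validates a config against a hand-built spec; the helper's import path is an assumption, and this call raises because the required api_key is missing:

```python
from airbyte_cdk.models import ConnectorSpecification
from airbyte_cdk.sources.utils.schema_helpers import check_config_against_spec_or_exit  # assumed import path

spec = ConnectorSpecification(
    connectionSpecification={
        "type": "object",
        "required": ["api_key"],
        "properties": {"api_key": {"type": "string"}},
    }
)

# Raises AirbyteTracedException with failure_type=config_error because "api_key" is missing.
check_config_against_spec_or_exit({"start_date": "2024-01-01"}, spec)
```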
212def split_config(config: Mapping[str, Any]) -> Tuple[dict[str, Any], InternalConfig]: 213 """ 214 Break config map object into 2 instances: first is a dict with user defined 215 configuration and second is internal config that contains private keys for 216 acceptance test configuration. 217 218 :param 219 config - Dict object that has been loaded from config file. 220 221 :return tuple of user defined config dict with filtered out internal 222 parameters and connector acceptance test internal config object. 223 """ 224 main_config = {} 225 internal_config = {} 226 for k, v in config.items(): 227 if k in InternalConfig.KEYWORDS: 228 internal_config[k] = v 229 else: 230 main_config[k] = v 231 return main_config, InternalConfig.parse_obj(internal_config)
Break the config mapping into two objects: a dict with the user-defined configuration, and an InternalConfig holding the private keys used for connector acceptance test configuration.
:param config - Dict object that has been loaded from the config file.
:return tuple of the user-defined config dict (with internal parameters filtered out) and the connector acceptance test InternalConfig object.
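A small sketch of the split; the import path is assumed:

```python
from airbyte_cdk.sources.utils.schema_helpers import split_config  # assumed import path

config = {"api_key": "****", "_limit": 50, "_page_size": 10}
user_config, internal_config = split_config(config)

print(user_config)               # {'api_key': '****'}
print(internal_config.limit)     # 50
print(internal_config.page_size) # 10
```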
48class TransformConfig(Flag): 49 """ 50 TypeTransformer class config. Configs can be combined using bitwise or operator e.g. 51 ``` 52 TransformConfig.DefaultSchemaNormalization | TransformConfig.CustomSchemaNormalization 53 ``` 54 """ 55 56 # No action taken, default behavior. Cannot be combined with any other options. 57 NoTransform = auto() 58 # Applies default type casting with default_convert method which converts 59 # values by applying simple type casting to specified jsonschema type. 60 DefaultSchemaNormalization = auto() 61 # Allow registering custom type transformation callback. Can be combined 62 # with DefaultSchemaNormalization. In this case default type casting would 63 # be applied before custom one. 64 CustomSchemaNormalization = auto()
TypeTransformer class config. Configs can be combined using the bitwise OR operator, e.g.
TransformConfig.DefaultSchemaNormalization | TransformConfig.CustomSchemaNormalization
67class TypeTransformer: 68 """ 69 Class for transforming object before output. 70 """ 71 72 _custom_normalizer: Optional[Callable[[Any, Dict[str, Any]], Any]] = None 73 74 def __init__(self, config: TransformConfig): 75 """ 76 Initialize TypeTransformer instance. 77 :param config Transform config that would be applied to object 78 """ 79 if TransformConfig.NoTransform in config and config != TransformConfig.NoTransform: 80 raise Exception("NoTransform option cannot be combined with other flags.") 81 self._config = config 82 all_validators = { 83 key: self.__get_normalizer(key, orig_validator) 84 for key, orig_validator in Draft7Validator.VALIDATORS.items() 85 # Do not validate field we do not transform for maximum performance. 86 if key in ["type", "array", "$ref", "properties", "items"] 87 } 88 self._normalizer = validators.create( 89 meta_schema=Draft7Validator.META_SCHEMA, validators=all_validators 90 ) 91 92 def registerCustomTransform( 93 self, normalization_callback: Callable[[Any, dict[str, Any]], Any] 94 ) -> Callable[[Any, dict[str, Any]], Any]: 95 """ 96 Register custom normalization callback. 97 :param normalization_callback function to be used for value 98 normalization. Takes original value and part type schema. Should return 99 normalized value. See docs/connector-development/cdk-python/schemas.md 100 for details. 101 :return Same callback, this is useful for using registerCustomTransform function as decorator. 102 """ 103 if TransformConfig.CustomSchemaNormalization not in self._config: 104 raise Exception( 105 "Please set TransformConfig.CustomSchemaNormalization config before registering custom normalizer" 106 ) 107 self._custom_normalizer = normalization_callback 108 return normalization_callback 109 110 def __normalize(self, original_item: Any, subschema: Dict[str, Any]) -> Any: 111 """ 112 Applies different transform function to object's field according to config. 113 :param original_item original value of field. 114 :param subschema part of the jsonschema containing field type/format data. 115 :return Final field value. 116 """ 117 if TransformConfig.DefaultSchemaNormalization in self._config: 118 original_item = self.default_convert(original_item, subschema) 119 120 if self._custom_normalizer: 121 original_item = self._custom_normalizer(original_item, subschema) 122 return original_item 123 124 @staticmethod 125 def default_convert(original_item: Any, subschema: Dict[str, Any]) -> Any: 126 """ 127 Default transform function that is used when TransformConfig.DefaultSchemaNormalization flag set. 128 :param original_item original value of field. 129 :param subschema part of the jsonschema containing field type/format data. 130 :return transformed field value. 131 """ 132 target_type = subschema.get("type", []) 133 if original_item is None and "null" in target_type: 134 return None 135 if isinstance(target_type, list): 136 # jsonschema type could either be a single string or array of type 137 # strings. In case if there is some disambigous and more than one 138 # type (except null) do not do any conversion and return original 139 # value. If type array has one type and null i.e. {"type": 140 # ["integer", "null"]}, convert value to specified type. 
141 target_type = [t for t in target_type if t != "null"] 142 if len(target_type) != 1: 143 return original_item 144 target_type = target_type[0] 145 try: 146 if target_type == "string": 147 return str(original_item) 148 elif target_type == "number": 149 return float(original_item) 150 elif target_type == "integer": 151 return int(original_item) 152 elif target_type == "boolean": 153 if isinstance(original_item, str): 154 return _strtobool(original_item) == 1 155 return bool(original_item) 156 elif target_type == "array": 157 item_types = set(subschema.get("items", {}).get("type", set())) 158 if ( 159 item_types.issubset(json_to_python_simple) 160 and type(original_item) in json_to_python_simple.values() 161 ): 162 return [original_item] 163 except (ValueError, TypeError): 164 return original_item 165 return original_item 166 167 def __get_normalizer( 168 self, 169 schema_key: str, 170 original_validator: Callable, # type: ignore[type-arg] 171 ) -> Callable[[Any, Any, Any, dict[str, Any]], Generator[Any, Any, None]]: 172 """ 173 Traverse through object fields using native jsonschema validator and apply normalization function. 174 :param schema_key related json schema key that currently being validated/normalized. 175 :original_validator: native jsonschema validator callback. 176 """ 177 178 def normalizator( 179 validator_instance: Validator, 180 property_value: Any, 181 instance: Any, 182 schema: Dict[str, Any], 183 ) -> Generator[Any, Any, None]: 184 """ 185 Jsonschema validator callable it uses for validating instance. We 186 override default Draft7Validator to perform value transformation 187 before validation take place. We do not take any action except 188 logging warn if object does not conform to json schema, just using 189 jsonschema algorithm to traverse through object fields. 190 Look 191 https://python-jsonschema.readthedocs.io/en/stable/creating/?highlight=validators.create#jsonschema.validators.create 192 validators parameter for detailed description. 193 : 194 """ 195 196 def resolve(subschema: dict[str, Any]) -> dict[str, Any]: 197 if "$ref" in subschema: 198 _, resolved = cast( 199 RefResolver, 200 validator_instance.resolver, 201 ).resolve(subschema["$ref"]) 202 return cast(dict[str, Any], resolved) 203 return subschema 204 205 # Transform object and array values before running json schema type checking for each element. 206 # Recursively normalize every value of the "instance" sub-object, 207 # if "instance" is an incorrect type - skip recursive normalization of "instance" 208 if schema_key == "properties" and isinstance(instance, dict): 209 for k, subschema in property_value.items(): 210 if k in instance: 211 subschema = resolve(subschema) 212 instance[k] = self.__normalize(instance[k], subschema) 213 # Recursively normalize every item of the "instance" sub-array, 214 # if "instance" is an incorrect type - skip recursive normalization of "instance" 215 elif schema_key == "items" and isinstance(instance, list): 216 subschema = resolve(property_value) 217 for index, item in enumerate(instance): 218 instance[index] = self.__normalize(item, subschema) 219 220 # Running native jsonschema traverse algorithm after field normalization is done. 221 yield from original_validator( 222 validator_instance, 223 property_value, 224 instance, 225 schema, 226 ) 227 228 return normalizator 229 230 def transform( 231 self, 232 record: Dict[str, Any], 233 schema: Mapping[str, Any], 234 ) -> None: 235 """ 236 Normalize and validate according to config. 
237 :param record: record instance for normalization/transformation. All modification are done by modifying existent object. 238 :param schema: object's jsonschema for normalization. 239 """ 240 if TransformConfig.NoTransform in self._config: 241 return 242 normalizer = self._normalizer(schema) 243 for e in normalizer.iter_errors(record): 244 """ 245 just calling normalizer.validate() would throw an exception on 246 first validation occurrences and stop processing rest of schema. 247 """ 248 logger.warning(self.get_error_message(e)) 249 250 def get_error_message(self, e: ValidationError) -> str: 251 """ 252 Construct a sanitized error message from a ValidationError instance. 253 """ 254 field_path = ".".join(map(str, e.path)) 255 type_structure = self._get_type_structure(e.instance) 256 257 return f"Failed to transform value from type '{type_structure}' to type '{e.validator_value}' at path: '{field_path}'" 258 259 def _get_type_structure(self, input_data: Any, current_depth: int = 0) -> Any: 260 """ 261 Get the structure of a given input data for use in error message construction. 262 """ 263 # Handle null values 264 if input_data is None: 265 return "null" 266 267 # Avoid recursing too deep 268 if current_depth >= MAX_NESTING_DEPTH: 269 return "object" if isinstance(input_data, dict) else python_to_json[type(input_data)] 270 271 if isinstance(input_data, dict): 272 return { 273 key: self._get_type_structure(field_value, current_depth + 1) 274 for key, field_value in input_data.items() 275 } 276 277 else: 278 return python_to_json[type(input_data)]
Class for transforming objects before output.
74 def __init__(self, config: TransformConfig): 75 """ 76 Initialize TypeTransformer instance. 77 :param config Transform config that would be applied to object 78 """ 79 if TransformConfig.NoTransform in config and config != TransformConfig.NoTransform: 80 raise Exception("NoTransform option cannot be combined with other flags.") 81 self._config = config 82 all_validators = { 83 key: self.__get_normalizer(key, orig_validator) 84 for key, orig_validator in Draft7Validator.VALIDATORS.items() 85 # Do not validate field we do not transform for maximum performance. 86 if key in ["type", "array", "$ref", "properties", "items"] 87 } 88 self._normalizer = validators.create( 89 meta_schema=Draft7Validator.META_SCHEMA, validators=all_validators 90 )
Initialize a TypeTransformer instance. :param config transform config to be applied to objects
92 def registerCustomTransform( 93 self, normalization_callback: Callable[[Any, dict[str, Any]], Any] 94 ) -> Callable[[Any, dict[str, Any]], Any]: 95 """ 96 Register custom normalization callback. 97 :param normalization_callback function to be used for value 98 normalization. Takes original value and part type schema. Should return 99 normalized value. See docs/connector-development/cdk-python/schemas.md 100 for details. 101 :return Same callback, this is useful for using registerCustomTransform function as decorator. 102 """ 103 if TransformConfig.CustomSchemaNormalization not in self._config: 104 raise Exception( 105 "Please set TransformConfig.CustomSchemaNormalization config before registering custom normalizer" 106 ) 107 self._custom_normalizer = normalization_callback 108 return normalization_callback
Register a custom normalization callback. :param normalization_callback function used for value normalization; it takes the original value and the relevant part of the type schema and should return the normalized value. See docs/connector-development/cdk-python/schemas.md for details. :return the same callback, which makes registerCustomTransform usable as a decorator.
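A sketch of registering a custom normalizer as a decorator (import path assumed); it trims whitespace from string fields after the default casting has run:

```python
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer  # assumed import path

transformer = TypeTransformer(
    TransformConfig.DefaultSchemaNormalization | TransformConfig.CustomSchemaNormalization
)


@transformer.registerCustomTransform
def strip_strings(original_value, field_schema):
    # field_schema is the part of the JSON schema describing this field.
    if isinstance(original_value, str) and "string" in field_schema.get("type", []):
        return original_value.strip()
    return original_value
```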
124 @staticmethod 125 def default_convert(original_item: Any, subschema: Dict[str, Any]) -> Any: 126 """ 127 Default transform function that is used when TransformConfig.DefaultSchemaNormalization flag set. 128 :param original_item original value of field. 129 :param subschema part of the jsonschema containing field type/format data. 130 :return transformed field value. 131 """ 132 target_type = subschema.get("type", []) 133 if original_item is None and "null" in target_type: 134 return None 135 if isinstance(target_type, list): 136 # jsonschema type could either be a single string or array of type 137 # strings. In case if there is some disambigous and more than one 138 # type (except null) do not do any conversion and return original 139 # value. If type array has one type and null i.e. {"type": 140 # ["integer", "null"]}, convert value to specified type. 141 target_type = [t for t in target_type if t != "null"] 142 if len(target_type) != 1: 143 return original_item 144 target_type = target_type[0] 145 try: 146 if target_type == "string": 147 return str(original_item) 148 elif target_type == "number": 149 return float(original_item) 150 elif target_type == "integer": 151 return int(original_item) 152 elif target_type == "boolean": 153 if isinstance(original_item, str): 154 return _strtobool(original_item) == 1 155 return bool(original_item) 156 elif target_type == "array": 157 item_types = set(subschema.get("items", {}).get("type", set())) 158 if ( 159 item_types.issubset(json_to_python_simple) 160 and type(original_item) in json_to_python_simple.values() 161 ): 162 return [original_item] 163 except (ValueError, TypeError): 164 return original_item 165 return original_item
Default transform function used when the TransformConfig.DefaultSchemaNormalization flag is set. :param original_item original value of the field. :param subschema part of the jsonschema containing the field's type/format data. :return transformed field value.
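A few illustrative calls, based on the listing above (the import path is assumed):

```python
from airbyte_cdk.sources.utils.transform import TypeTransformer  # assumed import path

TypeTransformer.default_convert("42", {"type": ["integer", "null"]})   # -> 42
TypeTransformer.default_convert(7, {"type": "string"})                 # -> "7"
TypeTransformer.default_convert("x", {"type": ["integer", "string"]})  # ambiguous types, returned unchanged
TypeTransformer.default_convert(None, {"type": ["string", "null"]})    # -> None
```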
230 def transform( 231 self, 232 record: Dict[str, Any], 233 schema: Mapping[str, Any], 234 ) -> None: 235 """ 236 Normalize and validate according to config. 237 :param record: record instance for normalization/transformation. All modification are done by modifying existent object. 238 :param schema: object's jsonschema for normalization. 239 """ 240 if TransformConfig.NoTransform in self._config: 241 return 242 normalizer = self._normalizer(schema) 243 for e in normalizer.iter_errors(record): 244 """ 245 just calling normalizer.validate() would throw an exception on 246 first validation occurrences and stop processing rest of schema. 247 """ 248 logger.warning(self.get_error_message(e))
Normalize and validate according to config.
Parameters
- record: record instance for normalization/transformation. All modifications are made in place on the existing object.
- schema: object's jsonschema for normalization.
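A minimal sketch of normalizing a record in place with default schema normalization; the import path is assumed:

```python
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer  # assumed import path

transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)

schema = {
    "type": "object",
    "properties": {
        "id": {"type": "integer"},
        "name": {"type": ["string", "null"]},
    },
}
record = {"id": "123", "name": 42}

transformer.transform(record, schema)  # mutates `record` in place
print(record)  # {'id': 123, 'name': '42'}
```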
250 def get_error_message(self, e: ValidationError) -> str: 251 """ 252 Construct a sanitized error message from a ValidationError instance. 253 """ 254 field_path = ".".join(map(str, e.path)) 255 type_structure = self._get_type_structure(e.instance) 256 257 return f"Failed to transform value from type '{type_structure}' to type '{e.validator_value}' at path: '{field_path}'"
Construct a sanitized error message from a ValidationError instance.
80@contextmanager 81def create_timer(name: str) -> Generator[EventTimer, Any, None]: 82 """ 83 Creates a new EventTimer as a context manager to improve code readability. 84 """ 85 a_timer = EventTimer(name) 86 yield a_timer
Creates a new EventTimer as a context manager to improve code readability.
9class OneOfOptionConfig: 10 """ 11 Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers. 12 13 Inherit from this class in the nested Config class in a model and set title and description (these show up in the UI) and discriminator (this is making sure it's marked as required in the schema). 14 15 Usage: 16 17 ```python 18 class OptionModel(BaseModel): 19 mode: Literal["option_a"] = Field("option_a", const=True) 20 option_a_field: str = Field(...) 21 22 class Config(OneOfOptionConfig): 23 title = "Option A" 24 description = "Option A description" 25 discriminator = "mode" 26 ``` 27 """ 28 29 @staticmethod 30 def schema_extra(schema: Dict[str, Any], model: Any) -> None: 31 if hasattr(model.Config, "description"): 32 schema["description"] = model.Config.description 33 if hasattr(model.Config, "discriminator"): 34 schema.setdefault("required", []).append(model.Config.discriminator)
Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.
Inherit from this class in the nested Config class of a model and set title and description (these show up in the UI) and discriminator (this ensures it is marked as required in the schema).
Usage:
class OptionModel(BaseModel):
    mode: Literal["option_a"] = Field("option_a", const=True)
    option_a_field: str = Field(...)

    class Config(OneOfOptionConfig):
        title = "Option A"
        description = "Option A description"
        discriminator = "mode"
13def resolve_refs(schema: dict[str, Any]) -> dict[str, Any]: 14 """ 15 For spec schemas generated using Pydantic models, the resulting JSON schema can contain refs between object 16 relationships. 17 """ 18 json_schema_ref_resolver = RefResolver.from_schema(schema) 19 str_schema = json.dumps(schema) 20 for ref_block in re.findall(r'{"\$ref": "#\/definitions\/.+?(?="})"}', str_schema): 21 ref = json.loads(ref_block)["$ref"] 22 str_schema = str_schema.replace( 23 ref_block, json.dumps(json_schema_ref_resolver.resolve(ref)[1]) 24 ) 25 pyschema: dict[str, Any] = json.loads(str_schema) 26 del pyschema["definitions"] 27 return pyschema
For spec schemas generated from Pydantic models, the resulting JSON schema can contain $ref links between related object definitions. This helper inlines those references and removes the top-level definitions block.
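A sketch with a hand-built schema; the import path for resolve_refs is an assumption:

```python
from airbyte_cdk.utils.spec_schema_transformations import resolve_refs  # assumed import path

schema = {
    "type": "object",
    "properties": {"credentials": {"$ref": "#/definitions/ApiKey"}},
    "definitions": {
        "ApiKey": {"type": "object", "properties": {"api_key": {"type": "string"}}},
    },
}

flat = resolve_refs(schema)
print(flat["properties"]["credentials"])  # the inlined ApiKey object
print("definitions" in flat)              # False, the definitions block is removed
```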
23def as_airbyte_message( 24 stream: Union[AirbyteStream, StreamDescriptor], 25 current_status: AirbyteStreamStatus, 26 reasons: Optional[List[AirbyteStreamStatusReason]] = None, 27) -> AirbyteMessage: 28 """ 29 Builds an AirbyteStreamStatusTraceMessage for the provided stream 30 """ 31 32 now_millis = datetime.now().timestamp() * 1000.0 33 34 trace_message = AirbyteTraceMessage( 35 type=TraceType.STREAM_STATUS, 36 emitted_at=now_millis, 37 stream_status=AirbyteStreamStatusTraceMessage( 38 stream_descriptor=StreamDescriptor(name=stream.name, namespace=stream.namespace), 39 status=current_status, 40 reasons=reasons, 41 ), 42 ) 43 44 return AirbyteMessage(type=MessageType.TRACE, trace=trace_message)
Builds an AirbyteStreamStatusTraceMessage for the provided stream
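A sketch of emitting a RUNNING status for an illustrative "users" stream; the helper's import path and the availability of AirbyteStreamStatus under airbyte_cdk.models are assumptions:

```python
from airbyte_cdk.models import AirbyteStream, AirbyteStreamStatus, SyncMode
from airbyte_cdk.utils.stream_status_utils import as_airbyte_message  # assumed import path

stream = AirbyteStream(
    name="users",
    json_schema={"type": "object"},
    supported_sync_modes=[SyncMode.full_refresh],
)

status_message = as_airbyte_message(stream, AirbyteStreamStatus.RUNNING)
print(status_message.trace.stream_status.status)  # AirbyteStreamStatus.RUNNING
```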
21class Record(Mapping[str, Any]): 22 def __init__( 23 self, 24 data: Mapping[str, Any], 25 stream_name: str, 26 associated_slice: Optional[StreamSlice] = None, 27 is_file_transfer_message: bool = False, 28 ): 29 self._data = data 30 self._associated_slice = associated_slice 31 self.stream_name = stream_name 32 self.is_file_transfer_message = is_file_transfer_message 33 34 @property 35 def data(self) -> Mapping[str, Any]: 36 return self._data 37 38 @property 39 def associated_slice(self) -> Optional[StreamSlice]: 40 return self._associated_slice 41 42 def __repr__(self) -> str: 43 return repr(self._data) 44 45 def __getitem__(self, key: str) -> Any: 46 return self._data[key] 47 48 def __len__(self) -> int: 49 return len(self._data) 50 51 def __iter__(self) -> Any: 52 return iter(self._data) 53 54 def __contains__(self, item: object) -> bool: 55 return item in self._data 56 57 def __eq__(self, other: object) -> bool: 58 if isinstance(other, Record): 59 # noinspection PyProtectedMember 60 return self._data == other._data 61 return False 62 63 def __ne__(self, other: object) -> bool: 64 return not self.__eq__(other)
A Mapping is a generic container for associating key/value pairs.
This class provides concrete generic implementations of all methods except for __getitem__, __iter__, and __len__.
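Because Record implements the Mapping protocol over its data payload, it can be passed to anything that expects a read-only dict. A small sketch, with the import path assumed:

```python
from airbyte_cdk.sources.types import Record  # assumed import path

record = Record(data={"id": 1, "name": "Ada"}, stream_name="users")

print(record["id"])        # 1, dict-style access via __getitem__
print(len(record))         # 2
print("name" in record)    # True
print(dict(record))        # {'id': 1, 'name': 'Ada'}, works with any Mapping consumer
print(record.stream_name)  # users
```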
22 def __init__( 23 self, 24 data: Mapping[str, Any], 25 stream_name: str, 26 associated_slice: Optional[StreamSlice] = None, 27 is_file_transfer_message: bool = False, 28 ): 29 self._data = data 30 self._associated_slice = associated_slice 31 self.stream_name = stream_name 32 self.is_file_transfer_message = is_file_transfer_message