airbyte_cdk.sources.declarative.parsers.model_to_component_factory
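ModelToComponentFactory is the bridge between the Pydantic models parsed from a low-code manifest and the runtime components that execute a sync: each model class registered in PYDANTIC_MODEL_TO_CONSTRUCTOR maps to one of the create_* methods in the source below. As a quick orientation before the full source, here is a minimal usage sketch; the manifest values are illustrative only and not taken from any real connector.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CheckStream as CheckStreamModel,
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()

# Hypothetical manifest snippet. "type" must match the model class name,
# otherwise create_component raises a ValueError.
check_stream_definition = {
    "type": "CheckStream",
    "stream_names": ["users"],
}

# create_component parses the mapping into a CheckStreamModel and then
# dispatches to create_check_stream through PYDANTIC_MODEL_TO_CONSTRUCTOR,
# returning the runtime CheckStream component.
connection_checker = factory.create_component(
    model_type=CheckStreamModel,
    component_definition=check_stream_definition,
    config={},  # the connector config provided by the user
)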
1# 2# Copyright (c) 2025 Airbyte, Inc., all rights reserved. 3# 4 5from __future__ import annotations 6 7import datetime 8import importlib 9import inspect 10import re 11from functools import partial 12from typing import ( 13 Any, 14 Callable, 15 Dict, 16 List, 17 Mapping, 18 MutableMapping, 19 Optional, 20 Type, 21 Union, 22 get_args, 23 get_origin, 24 get_type_hints, 25) 26 27from isodate import parse_duration 28from pydantic.v1 import BaseModel 29 30from airbyte_cdk.connector_builder.models import ( 31 LogMessage as ConnectorBuilderLogMessage, 32) 33from airbyte_cdk.models import FailureType, Level 34from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager 35from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator 36from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker 37from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository 38from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus 39from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator 40from airbyte_cdk.sources.declarative.auth.declarative_authenticator import ( 41 DeclarativeAuthenticator, 42 NoAuth, 43) 44from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm 45from airbyte_cdk.sources.declarative.auth.oauth import ( 46 DeclarativeSingleUseRefreshTokenOauth2Authenticator, 47) 48from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator 49from airbyte_cdk.sources.declarative.auth.token import ( 50 ApiKeyAuthenticator, 51 BasicHttpAuthenticator, 52 BearerAuthenticator, 53 LegacySessionTokenAuthenticator, 54) 55from airbyte_cdk.sources.declarative.auth.token_provider import ( 56 InterpolatedStringTokenProvider, 57 SessionTokenProvider, 58 TokenProvider, 59) 60from airbyte_cdk.sources.declarative.checks import ( 61 CheckDynamicStream, 62 CheckStream, 63 DynamicStreamCheckConfig, 64) 65from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel 66from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime 67from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream 68from airbyte_cdk.sources.declarative.decoders import ( 69 Decoder, 70 IterableDecoder, 71 JsonDecoder, 72 PaginationDecoderDecorator, 73 XmlDecoder, 74 ZipfileDecoder, 75) 76from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import ( 77 CompositeRawDecoder, 78 CsvParser, 79 GzipParser, 80 JsonLineParser, 81 JsonParser, 82 Parser, 83) 84from airbyte_cdk.sources.declarative.extractors import ( 85 DpathExtractor, 86 RecordFilter, 87 RecordSelector, 88 ResponseToFileExtractor, 89) 90from airbyte_cdk.sources.declarative.extractors.record_filter import ( 91 ClientSideIncrementalRecordFilterDecorator, 92) 93from airbyte_cdk.sources.declarative.incremental import ( 94 ChildPartitionResumableFullRefreshCursor, 95 ConcurrentCursorFactory, 96 ConcurrentPerPartitionCursor, 97 CursorFactory, 98 DatetimeBasedCursor, 99 DeclarativeCursor, 100 GlobalSubstreamCursor, 101 PerPartitionCursor, 102 PerPartitionWithGlobalCursor, 103 ResumableFullRefreshCursor, 104) 105from airbyte_cdk.sources.declarative.interpolation import InterpolatedString 106from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping 107from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import ( 108 LegacyToPerPartitionStateMigration, 109) 110from 
airbyte_cdk.sources.declarative.models import ( 111 CustomStateMigration, 112) 113from airbyte_cdk.sources.declarative.models.base_model_with_deprecations import ( 114 DEPRECATION_LOGS_TAG, 115 BaseModelWithDeprecations, 116) 117from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 118 AddedFieldDefinition as AddedFieldDefinitionModel, 119) 120from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 121 AddFields as AddFieldsModel, 122) 123from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 124 ApiKeyAuthenticator as ApiKeyAuthenticatorModel, 125) 126from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 127 AsyncJobStatusMap as AsyncJobStatusMapModel, 128) 129from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 130 AsyncRetriever as AsyncRetrieverModel, 131) 132from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 133 BasicHttpAuthenticator as BasicHttpAuthenticatorModel, 134) 135from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 136 BearerAuthenticator as BearerAuthenticatorModel, 137) 138from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 139 CheckDynamicStream as CheckDynamicStreamModel, 140) 141from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 142 CheckStream as CheckStreamModel, 143) 144from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 145 ComplexFieldType as ComplexFieldTypeModel, 146) 147from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 148 ComponentMappingDefinition as ComponentMappingDefinitionModel, 149) 150from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 151 CompositeErrorHandler as CompositeErrorHandlerModel, 152) 153from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 154 ConcurrencyLevel as ConcurrencyLevelModel, 155) 156from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 157 ConfigComponentsResolver as ConfigComponentsResolverModel, 158) 159from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 160 ConstantBackoffStrategy as ConstantBackoffStrategyModel, 161) 162from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 163 CsvDecoder as CsvDecoderModel, 164) 165from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 166 CursorPagination as CursorPaginationModel, 167) 168from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 169 CustomAuthenticator as CustomAuthenticatorModel, 170) 171from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 172 CustomBackoffStrategy as CustomBackoffStrategyModel, 173) 174from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 175 CustomDecoder as CustomDecoderModel, 176) 177from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 178 CustomErrorHandler as CustomErrorHandlerModel, 179) 180from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 181 CustomIncrementalSync as CustomIncrementalSyncModel, 182) 183from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 184 CustomPaginationStrategy as CustomPaginationStrategyModel, 185) 186from airbyte_cdk.sources.declarative.models.declarative_component_schema 
import ( 187 CustomPartitionRouter as CustomPartitionRouterModel, 188) 189from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 190 CustomRecordExtractor as CustomRecordExtractorModel, 191) 192from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 193 CustomRecordFilter as CustomRecordFilterModel, 194) 195from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 196 CustomRequester as CustomRequesterModel, 197) 198from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 199 CustomRetriever as CustomRetrieverModel, 200) 201from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 202 CustomSchemaLoader as CustomSchemaLoader, 203) 204from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 205 CustomSchemaNormalization as CustomSchemaNormalizationModel, 206) 207from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 208 CustomTransformation as CustomTransformationModel, 209) 210from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 211 DatetimeBasedCursor as DatetimeBasedCursorModel, 212) 213from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 214 DeclarativeStream as DeclarativeStreamModel, 215) 216from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 217 DefaultErrorHandler as DefaultErrorHandlerModel, 218) 219from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 220 DefaultPaginator as DefaultPaginatorModel, 221) 222from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 223 DpathExtractor as DpathExtractorModel, 224) 225from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 226 DpathFlattenFields as DpathFlattenFieldsModel, 227) 228from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 229 DynamicSchemaLoader as DynamicSchemaLoaderModel, 230) 231from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 232 DynamicStreamCheckConfig as DynamicStreamCheckConfigModel, 233) 234from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 235 ExponentialBackoffStrategy as ExponentialBackoffStrategyModel, 236) 237from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 238 FileUploader as FileUploaderModel, 239) 240from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 241 FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel, 242) 243from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 244 FlattenFields as FlattenFieldsModel, 245) 246from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 247 GroupByKeyMergeStrategy as GroupByKeyMergeStrategyModel, 248) 249from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 250 GroupingPartitionRouter as GroupingPartitionRouterModel, 251) 252from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 253 GzipDecoder as GzipDecoderModel, 254) 255from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 256 HTTPAPIBudget as HTTPAPIBudgetModel, 257) 258from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 259 HttpComponentsResolver as HttpComponentsResolverModel, 260) 261from airbyte_cdk.sources.declarative.models.declarative_component_schema 
import ( 262 HttpRequester as HttpRequesterModel, 263) 264from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 265 HttpRequestRegexMatcher as HttpRequestRegexMatcherModel, 266) 267from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 268 HttpResponseFilter as HttpResponseFilterModel, 269) 270from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 271 IncrementingCountCursor as IncrementingCountCursorModel, 272) 273from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 274 InlineSchemaLoader as InlineSchemaLoaderModel, 275) 276from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 277 IterableDecoder as IterableDecoderModel, 278) 279from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 280 JsonDecoder as JsonDecoderModel, 281) 282from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 283 JsonFileSchemaLoader as JsonFileSchemaLoaderModel, 284) 285from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 286 JsonlDecoder as JsonlDecoderModel, 287) 288from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 289 JwtAuthenticator as JwtAuthenticatorModel, 290) 291from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 292 JwtHeaders as JwtHeadersModel, 293) 294from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 295 JwtPayload as JwtPayloadModel, 296) 297from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 298 KeysReplace as KeysReplaceModel, 299) 300from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 301 KeysToLower as KeysToLowerModel, 302) 303from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 304 KeysToSnakeCase as KeysToSnakeCaseModel, 305) 306from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 307 LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel, 308) 309from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 310 LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel, 311) 312from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 313 ListPartitionRouter as ListPartitionRouterModel, 314) 315from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 316 MinMaxDatetime as MinMaxDatetimeModel, 317) 318from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 319 MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel, 320) 321from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 322 NoAuth as NoAuthModel, 323) 324from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 325 NoPagination as NoPaginationModel, 326) 327from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 328 OAuthAuthenticator as OAuthAuthenticatorModel, 329) 330from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 331 OffsetIncrement as OffsetIncrementModel, 332) 333from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 334 PageIncrement as PageIncrementModel, 335) 336from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 337 ParentStreamConfig as ParentStreamConfigModel, 338) 339from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 340 PropertiesFromEndpoint as PropertiesFromEndpointModel, 341) 342from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 343 PropertyChunking as PropertyChunkingModel, 344) 345from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 346 PropertyLimitType as PropertyLimitTypeModel, 347) 348from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 349 QueryProperties as QueryPropertiesModel, 350) 351from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 352 Rate as RateModel, 353) 354from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 355 RecordFilter as RecordFilterModel, 356) 357from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 358 RecordSelector as RecordSelectorModel, 359) 360from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 361 RemoveFields as RemoveFieldsModel, 362) 363from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 364 RequestOption as RequestOptionModel, 365) 366from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 367 RequestPath as RequestPathModel, 368) 369from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 370 ResponseToFileExtractor as ResponseToFileExtractorModel, 371) 372from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 373 SchemaNormalization as SchemaNormalizationModel, 374) 375from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 376 SchemaTypeIdentifier as SchemaTypeIdentifierModel, 377) 378from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 379 SelectiveAuthenticator as SelectiveAuthenticatorModel, 380) 381from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 382 SessionTokenAuthenticator as SessionTokenAuthenticatorModel, 383) 384from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 385 SimpleRetriever as SimpleRetrieverModel, 386) 387from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel 388from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 389 StateDelegatingStream as StateDelegatingStreamModel, 390) 391from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 392 StreamConfig as StreamConfigModel, 393) 394from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 395 SubstreamPartitionRouter as SubstreamPartitionRouterModel, 396) 397from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 398 TypesMap as TypesMapModel, 399) 400from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 401 UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel, 402) 403from airbyte_cdk.sources.declarative.models.declarative_component_schema import ValueType 404from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 405 WaitTimeFromHeader as WaitTimeFromHeaderModel, 406) 407from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 408 WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel, 409) 410from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 411 XmlDecoder as XmlDecoderModel, 412) 413from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 414 ZipfileDecoder as ZipfileDecoderModel, 415) 416from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import ( 417 COMPONENTS_MODULE_NAME, 418 SDM_COMPONENTS_MODULE_NAME, 419) 420from airbyte_cdk.sources.declarative.partition_routers import ( 421 CartesianProductStreamSlicer, 422 GroupingPartitionRouter, 423 ListPartitionRouter, 424 PartitionRouter, 425 SinglePartitionRouter, 426 SubstreamPartitionRouter, 427) 428from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import ( 429 AsyncJobPartitionRouter, 430) 431from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( 432 ParentStreamConfig, 433) 434from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption 435from airbyte_cdk.sources.declarative.requesters.error_handlers import ( 436 CompositeErrorHandler, 437 DefaultErrorHandler, 438 HttpResponseFilter, 439) 440from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import ( 441 ConstantBackoffStrategy, 442 ExponentialBackoffStrategy, 443 WaitTimeFromHeaderBackoffStrategy, 444 WaitUntilTimeFromHeaderBackoffStrategy, 445) 446from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository 447from airbyte_cdk.sources.declarative.requesters.paginators import ( 448 DefaultPaginator, 449 NoPagination, 450 PaginatorTestReadDecorator, 451) 452from airbyte_cdk.sources.declarative.requesters.paginators.strategies import ( 453 CursorPaginationStrategy, 454 CursorStopCondition, 455 OffsetIncrement, 456 PageIncrement, 457 StopConditionPaginationStrategyDecorator, 458) 459from airbyte_cdk.sources.declarative.requesters.query_properties import ( 460 PropertiesFromEndpoint, 461 PropertyChunking, 462 QueryProperties, 463) 464from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import ( 465 PropertyLimitType, 466) 467from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import ( 468 GroupByKey, 469) 470from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType 471from airbyte_cdk.sources.declarative.requesters.request_options import ( 472 DatetimeBasedRequestOptionsProvider, 473 DefaultRequestOptionsProvider, 474 InterpolatedRequestOptionsProvider, 475 RequestOptionsProvider, 476) 477from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath 478from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod 479from airbyte_cdk.sources.declarative.resolvers import ( 480 ComponentMappingDefinition, 481 ConfigComponentsResolver, 482 HttpComponentsResolver, 483 StreamConfig, 484) 485from airbyte_cdk.sources.declarative.retrievers import ( 486 AsyncRetriever, 487 LazySimpleRetriever, 488 SimpleRetriever, 489 SimpleRetrieverTestReadDecorator, 490) 491from airbyte_cdk.sources.declarative.retrievers.file_uploader import ( 492 ConnectorBuilderFileUploader, 493 DefaultFileUploader, 494 FileUploader, 495 LocalFileSystemFileWriter, 496 NoopFileWriter, 497) 498from airbyte_cdk.sources.declarative.schema import ( 499 ComplexFieldType, 500 DefaultSchemaLoader, 501 DynamicSchemaLoader, 502 InlineSchemaLoader, 503 JsonFileSchemaLoader, 504 SchemaTypeIdentifier, 505 TypesMap, 506) 507from airbyte_cdk.sources.declarative.schema.composite_schema_loader import CompositeSchemaLoader 508from airbyte_cdk.sources.declarative.spec import Spec 509from 
airbyte_cdk.sources.declarative.stream_slicers import StreamSlicer 510from airbyte_cdk.sources.declarative.transformations import ( 511 AddFields, 512 RecordTransformation, 513 RemoveFields, 514) 515from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition 516from airbyte_cdk.sources.declarative.transformations.dpath_flatten_fields import ( 517 DpathFlattenFields, 518 KeyTransformation, 519) 520from airbyte_cdk.sources.declarative.transformations.flatten_fields import ( 521 FlattenFields, 522) 523from airbyte_cdk.sources.declarative.transformations.keys_replace_transformation import ( 524 KeysReplaceTransformation, 525) 526from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import ( 527 KeysToLowerTransformation, 528) 529from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import ( 530 KeysToSnakeCaseTransformation, 531) 532from airbyte_cdk.sources.message import ( 533 InMemoryMessageRepository, 534 LogAppenderMessageRepositoryDecorator, 535 MessageRepository, 536 NoopMessageRepository, 537) 538from airbyte_cdk.sources.streams.call_rate import ( 539 APIBudget, 540 FixedWindowCallRatePolicy, 541 HttpAPIBudget, 542 HttpRequestRegexMatcher, 543 MovingWindowCallRatePolicy, 544 Rate, 545 UnlimitedCallRatePolicy, 546) 547from airbyte_cdk.sources.streams.concurrent.clamping import ( 548 ClampingEndProvider, 549 ClampingStrategy, 550 DayClampingStrategy, 551 MonthClampingStrategy, 552 NoClamping, 553 WeekClampingStrategy, 554 Weekday, 555) 556from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField 557from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( 558 CustomFormatConcurrentStreamStateConverter, 559 DateTimeStreamStateConverter, 560) 561from airbyte_cdk.sources.streams.concurrent.state_converters.incrementing_count_stream_state_converter import ( 562 IncrementingCountStreamStateConverter, 563) 564from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction 565from airbyte_cdk.sources.types import Config 566from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer 567 568ComponentDefinition = Mapping[str, Any] 569 570SCHEMA_TRANSFORMER_TYPE_MAPPING = { 571 SchemaNormalizationModel.None_: TransformConfig.NoTransform, 572 SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization, 573} 574 575 576class ModelToComponentFactory: 577 EPOCH_DATETIME_FORMAT = "%s" 578 579 def __init__( 580 self, 581 limit_pages_fetched_per_slice: Optional[int] = None, 582 limit_slices_fetched: Optional[int] = None, 583 emit_connector_builder_messages: bool = False, 584 disable_retries: bool = False, 585 disable_cache: bool = False, 586 disable_resumable_full_refresh: bool = False, 587 message_repository: Optional[MessageRepository] = None, 588 connector_state_manager: Optional[ConnectorStateManager] = None, 589 max_concurrent_async_job_count: Optional[int] = None, 590 ): 591 self._init_mappings() 592 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 593 self._limit_slices_fetched = limit_slices_fetched 594 self._emit_connector_builder_messages = emit_connector_builder_messages 595 self._disable_retries = disable_retries 596 self._disable_cache = disable_cache 597 self._disable_resumable_full_refresh = disable_resumable_full_refresh 598 self._message_repository = message_repository or InMemoryMessageRepository( 599 
self._evaluate_log_level(emit_connector_builder_messages) 600 ) 601 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 602 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 603 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 604 # placeholder for deprecation warnings 605 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 606 607 def _init_mappings(self) -> None: 608 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 609 AddedFieldDefinitionModel: self.create_added_field_definition, 610 AddFieldsModel: self.create_add_fields, 611 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 612 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 613 BearerAuthenticatorModel: self.create_bearer_authenticator, 614 CheckStreamModel: self.create_check_stream, 615 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 616 CheckDynamicStreamModel: self.create_check_dynamic_stream, 617 CompositeErrorHandlerModel: self.create_composite_error_handler, 618 ConcurrencyLevelModel: self.create_concurrency_level, 619 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 620 CsvDecoderModel: self.create_csv_decoder, 621 CursorPaginationModel: self.create_cursor_pagination, 622 CustomAuthenticatorModel: self.create_custom_component, 623 CustomBackoffStrategyModel: self.create_custom_component, 624 CustomDecoderModel: self.create_custom_component, 625 CustomErrorHandlerModel: self.create_custom_component, 626 CustomIncrementalSyncModel: self.create_custom_component, 627 CustomRecordExtractorModel: self.create_custom_component, 628 CustomRecordFilterModel: self.create_custom_component, 629 CustomRequesterModel: self.create_custom_component, 630 CustomRetrieverModel: self.create_custom_component, 631 CustomSchemaLoader: self.create_custom_component, 632 CustomSchemaNormalizationModel: self.create_custom_component, 633 CustomStateMigration: self.create_custom_component, 634 CustomPaginationStrategyModel: self.create_custom_component, 635 CustomPartitionRouterModel: self.create_custom_component, 636 CustomTransformationModel: self.create_custom_component, 637 DatetimeBasedCursorModel: self.create_datetime_based_cursor, 638 DeclarativeStreamModel: self.create_declarative_stream, 639 DefaultErrorHandlerModel: self.create_default_error_handler, 640 DefaultPaginatorModel: self.create_default_paginator, 641 DpathExtractorModel: self.create_dpath_extractor, 642 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 643 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 644 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 645 GroupByKeyMergeStrategyModel: self.create_group_by_key, 646 HttpRequesterModel: self.create_http_requester, 647 HttpResponseFilterModel: self.create_http_response_filter, 648 InlineSchemaLoaderModel: self.create_inline_schema_loader, 649 JsonDecoderModel: self.create_json_decoder, 650 JsonlDecoderModel: self.create_jsonl_decoder, 651 GzipDecoderModel: self.create_gzip_decoder, 652 KeysToLowerModel: self.create_keys_to_lower_transformation, 653 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 654 KeysReplaceModel: self.create_keys_replace_transformation, 655 FlattenFieldsModel: self.create_flatten_fields, 656 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 657 IterableDecoderModel: self.create_iterable_decoder, 658 
IncrementingCountCursorModel: self.create_incrementing_count_cursor, 659 XmlDecoderModel: self.create_xml_decoder, 660 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 661 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 662 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 663 TypesMapModel: self.create_types_map, 664 ComplexFieldTypeModel: self.create_complex_field_type, 665 JwtAuthenticatorModel: self.create_jwt_authenticator, 666 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 667 ListPartitionRouterModel: self.create_list_partition_router, 668 MinMaxDatetimeModel: self.create_min_max_datetime, 669 NoAuthModel: self.create_no_auth, 670 NoPaginationModel: self.create_no_pagination, 671 OAuthAuthenticatorModel: self.create_oauth_authenticator, 672 OffsetIncrementModel: self.create_offset_increment, 673 PageIncrementModel: self.create_page_increment, 674 ParentStreamConfigModel: self.create_parent_stream_config, 675 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 676 PropertyChunkingModel: self.create_property_chunking, 677 QueryPropertiesModel: self.create_query_properties, 678 RecordFilterModel: self.create_record_filter, 679 RecordSelectorModel: self.create_record_selector, 680 RemoveFieldsModel: self.create_remove_fields, 681 RequestPathModel: self.create_request_path, 682 RequestOptionModel: self.create_request_option, 683 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 684 SelectiveAuthenticatorModel: self.create_selective_authenticator, 685 SimpleRetrieverModel: self.create_simple_retriever, 686 StateDelegatingStreamModel: self.create_state_delegating_stream, 687 SpecModel: self.create_spec, 688 SubstreamPartitionRouterModel: self.create_substream_partition_router, 689 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 690 WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, 691 AsyncRetrieverModel: self.create_async_retriever, 692 HttpComponentsResolverModel: self.create_http_components_resolver, 693 ConfigComponentsResolverModel: self.create_config_components_resolver, 694 StreamConfigModel: self.create_stream_config, 695 ComponentMappingDefinitionModel: self.create_components_mapping_definition, 696 ZipfileDecoderModel: self.create_zipfile_decoder, 697 HTTPAPIBudgetModel: self.create_http_api_budget, 698 FileUploaderModel: self.create_file_uploader, 699 FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, 700 MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, 701 UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, 702 RateModel: self.create_rate, 703 HttpRequestRegexMatcherModel: self.create_http_request_matcher, 704 GroupingPartitionRouterModel: self.create_grouping_partition_router, 705 } 706 707 # Needed for the case where we need to perform a second parse on the fields of a custom component 708 self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR} 709 710 def create_component( 711 self, 712 model_type: Type[BaseModel], 713 component_definition: ComponentDefinition, 714 config: Config, 715 **kwargs: Any, 716 ) -> Any: 717 """ 718 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 719 subcomponents which will be used at runtime. 
This is done by first parsing the mapping into a Pydantic model and then 720 creating declarative components from that model. 721 722 :param model_type: The type of declarative component that is being initialized 723 :param component_definition: The mapping that represents a declarative component 724 :param config: The connector config that is provided by the customer 725 :return: The declarative component to be used at runtime 726 """ 727 728 component_type = component_definition.get("type") 729 if component_definition.get("type") != model_type.__name__: 730 raise ValueError( 731 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 732 ) 733 734 declarative_component_model = model_type.parse_obj(component_definition) 735 736 if not isinstance(declarative_component_model, model_type): 737 raise ValueError( 738 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 739 ) 740 741 return self._create_component_from_model( 742 model=declarative_component_model, config=config, **kwargs 743 ) 744 745 def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: 746 if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: 747 raise ValueError( 748 f"{model.__class__} with attributes {model} is not a valid component type" 749 ) 750 component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) 751 if not component_constructor: 752 raise ValueError(f"Could not find constructor for {model.__class__}") 753 754 # collect deprecation warnings for supported models. 755 if isinstance(model, BaseModelWithDeprecations): 756 self._collect_model_deprecations(model) 757 758 return component_constructor(model=model, config=config, **kwargs) 759 760 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 761 """ 762 Returns the deprecation warnings that were collected during the creation of components. 763 """ 764 return self._collected_deprecation_logs 765 766 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 767 """ 768 Collects deprecation logs from the given model and appends any new logs to the internal collection. 769 770 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 771 772 Args: 773 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 774 """ 775 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 776 for log in model._deprecation_logs: 777 # avoid duplicates for deprecation logs observed.
778 if log not in self._collected_deprecation_logs: 779 self._collected_deprecation_logs.append(log) 780 781 @staticmethod 782 def create_added_field_definition( 783 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 784 ) -> AddedFieldDefinition: 785 interpolated_value = InterpolatedString.create( 786 model.value, parameters=model.parameters or {} 787 ) 788 return AddedFieldDefinition( 789 path=model.path, 790 value=interpolated_value, 791 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 792 parameters=model.parameters or {}, 793 ) 794 795 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 796 added_field_definitions = [ 797 self._create_component_from_model( 798 model=added_field_definition_model, 799 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 800 added_field_definition_model.value_type 801 ), 802 config=config, 803 ) 804 for added_field_definition_model in model.fields 805 ] 806 return AddFields( 807 fields=added_field_definitions, 808 condition=model.condition or "", 809 parameters=model.parameters or {}, 810 ) 811 812 def create_keys_to_lower_transformation( 813 self, model: KeysToLowerModel, config: Config, **kwargs: Any 814 ) -> KeysToLowerTransformation: 815 return KeysToLowerTransformation() 816 817 def create_keys_to_snake_transformation( 818 self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any 819 ) -> KeysToSnakeCaseTransformation: 820 return KeysToSnakeCaseTransformation() 821 822 def create_keys_replace_transformation( 823 self, model: KeysReplaceModel, config: Config, **kwargs: Any 824 ) -> KeysReplaceTransformation: 825 return KeysReplaceTransformation( 826 old=model.old, new=model.new, parameters=model.parameters or {} 827 ) 828 829 def create_flatten_fields( 830 self, model: FlattenFieldsModel, config: Config, **kwargs: Any 831 ) -> FlattenFields: 832 return FlattenFields( 833 flatten_lists=model.flatten_lists if model.flatten_lists is not None else True 834 ) 835 836 def create_dpath_flatten_fields( 837 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 838 ) -> DpathFlattenFields: 839 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 840 key_transformation = ( 841 KeyTransformation( 842 config=config, 843 prefix=model.key_transformation.prefix, 844 suffix=model.key_transformation.suffix, 845 parameters=model.parameters or {}, 846 ) 847 if model.key_transformation is not None 848 else None 849 ) 850 return DpathFlattenFields( 851 config=config, 852 field_path=model_field_path, 853 delete_origin_value=model.delete_origin_value 854 if model.delete_origin_value is not None 855 else False, 856 replace_record=model.replace_record if model.replace_record is not None else False, 857 key_transformation=key_transformation, 858 parameters=model.parameters or {}, 859 ) 860 861 @staticmethod 862 def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]: 863 if not value_type: 864 return None 865 names_to_types = { 866 ValueType.string: str, 867 ValueType.number: float, 868 ValueType.integer: int, 869 ValueType.boolean: bool, 870 } 871 return names_to_types[value_type] 872 873 def create_api_key_authenticator( 874 self, 875 model: ApiKeyAuthenticatorModel, 876 config: Config, 877 token_provider: Optional[TokenProvider] = None, 878 **kwargs: Any, 879 ) -> ApiKeyAuthenticator: 880 if model.inject_into is None and model.header is None: 881 raise ValueError( 882 "Expected 
either inject_into or header to be set for ApiKeyAuthenticator" 883 ) 884 885 if model.inject_into is not None and model.header is not None: 886 raise ValueError( 887 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 888 ) 889 890 if token_provider is not None and model.api_token != "": 891 raise ValueError( 892 "If token_provider is set, api_token is ignored and has to be set to empty string." 893 ) 894 895 request_option = ( 896 self._create_component_from_model( 897 model.inject_into, config, parameters=model.parameters or {} 898 ) 899 if model.inject_into 900 else RequestOption( 901 inject_into=RequestOptionType.header, 902 field_name=model.header or "", 903 parameters=model.parameters or {}, 904 ) 905 ) 906 907 return ApiKeyAuthenticator( 908 token_provider=( 909 token_provider 910 if token_provider is not None 911 else InterpolatedStringTokenProvider( 912 api_token=model.api_token or "", 913 config=config, 914 parameters=model.parameters or {}, 915 ) 916 ), 917 request_option=request_option, 918 config=config, 919 parameters=model.parameters or {}, 920 ) 921 922 def create_legacy_to_per_partition_state_migration( 923 self, 924 model: LegacyToPerPartitionStateMigrationModel, 925 config: Mapping[str, Any], 926 declarative_stream: DeclarativeStreamModel, 927 ) -> LegacyToPerPartitionStateMigration: 928 retriever = declarative_stream.retriever 929 if not isinstance(retriever, SimpleRetrieverModel): 930 raise ValueError( 931 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever. Got {type(retriever)}" 932 ) 933 partition_router = retriever.partition_router 934 if not isinstance( 935 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 936 ): 937 raise ValueError( 938 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 939 ) 940 if not hasattr(partition_router, "parent_stream_configs"): 941 raise ValueError( 942 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 943 ) 944 945 if not hasattr(declarative_stream, "incremental_sync"): 946 raise ValueError( 947 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 948 ) 949 950 return LegacyToPerPartitionStateMigration( 951 partition_router, # type: ignore # was already checked above 952 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 
953 config, 954 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 955 ) 956 957 def create_session_token_authenticator( 958 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 959 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 960 decoder = ( 961 self._create_component_from_model(model=model.decoder, config=config) 962 if model.decoder 963 else JsonDecoder(parameters={}) 964 ) 965 login_requester = self._create_component_from_model( 966 model=model.login_requester, 967 config=config, 968 name=f"{name}_login_requester", 969 decoder=decoder, 970 ) 971 token_provider = SessionTokenProvider( 972 login_requester=login_requester, 973 session_token_path=model.session_token_path, 974 expiration_duration=parse_duration(model.expiration_duration) 975 if model.expiration_duration 976 else None, 977 parameters=model.parameters or {}, 978 message_repository=self._message_repository, 979 decoder=decoder, 980 ) 981 if model.request_authentication.type == "Bearer": 982 return ModelToComponentFactory.create_bearer_authenticator( 983 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 984 config, 985 token_provider=token_provider, 986 ) 987 else: 988 return self.create_api_key_authenticator( 989 ApiKeyAuthenticatorModel( 990 type="ApiKeyAuthenticator", 991 api_token="", 992 inject_into=model.request_authentication.inject_into, 993 ), # type: ignore # $parameters and headers default to None 994 config=config, 995 token_provider=token_provider, 996 ) 997 998 @staticmethod 999 def create_basic_http_authenticator( 1000 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1001 ) -> BasicHttpAuthenticator: 1002 return BasicHttpAuthenticator( 1003 password=model.password or "", 1004 username=model.username, 1005 config=config, 1006 parameters=model.parameters or {}, 1007 ) 1008 1009 @staticmethod 1010 def create_bearer_authenticator( 1011 model: BearerAuthenticatorModel, 1012 config: Config, 1013 token_provider: Optional[TokenProvider] = None, 1014 **kwargs: Any, 1015 ) -> BearerAuthenticator: 1016 if token_provider is not None and model.api_token != "": 1017 raise ValueError( 1018 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1019 ) 1020 return BearerAuthenticator( 1021 token_provider=( 1022 token_provider 1023 if token_provider is not None 1024 else InterpolatedStringTokenProvider( 1025 api_token=model.api_token or "", 1026 config=config, 1027 parameters=model.parameters or {}, 1028 ) 1029 ), 1030 config=config, 1031 parameters=model.parameters or {}, 1032 ) 1033 1034 @staticmethod 1035 def create_dynamic_stream_check_config( 1036 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1037 ) -> DynamicStreamCheckConfig: 1038 return DynamicStreamCheckConfig( 1039 dynamic_stream_name=model.dynamic_stream_name, 1040 stream_count=model.stream_count or 0, 1041 ) 1042 1043 def create_check_stream( 1044 self, model: CheckStreamModel, config: Config, **kwargs: Any 1045 ) -> CheckStream: 1046 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1047 raise ValueError( 1048 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1049 ) 1050 1051 dynamic_streams_check_configs = ( 1052 [ 1053 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1054 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1055 ] 1056 if model.dynamic_streams_check_configs 1057 else [] 1058 ) 1059 1060 return CheckStream( 1061 stream_names=model.stream_names or [], 1062 dynamic_streams_check_configs=dynamic_streams_check_configs, 1063 parameters={}, 1064 ) 1065 1066 @staticmethod 1067 def create_check_dynamic_stream( 1068 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1069 ) -> CheckDynamicStream: 1070 assert model.use_check_availability is not None # for mypy 1071 1072 use_check_availability = model.use_check_availability 1073 1074 return CheckDynamicStream( 1075 stream_count=model.stream_count, 1076 use_check_availability=use_check_availability, 1077 parameters={}, 1078 ) 1079 1080 def create_composite_error_handler( 1081 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1082 ) -> CompositeErrorHandler: 1083 error_handlers = [ 1084 self._create_component_from_model(model=error_handler_model, config=config) 1085 for error_handler_model in model.error_handlers 1086 ] 1087 return CompositeErrorHandler( 1088 error_handlers=error_handlers, parameters=model.parameters or {} 1089 ) 1090 1091 @staticmethod 1092 def create_concurrency_level( 1093 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1094 ) -> ConcurrencyLevel: 1095 return ConcurrencyLevel( 1096 default_concurrency=model.default_concurrency, 1097 max_concurrency=model.max_concurrency, 1098 config=config, 1099 parameters={}, 1100 ) 1101 1102 @staticmethod 1103 def apply_stream_state_migrations( 1104 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1105 ) -> MutableMapping[str, Any]: 1106 if stream_state_migrations: 1107 for state_migration in stream_state_migrations: 1108 if state_migration.should_migrate(stream_state): 1109 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
1110 stream_state = dict(state_migration.migrate(stream_state)) 1111 return stream_state 1112 1113 def create_concurrent_cursor_from_datetime_based_cursor( 1114 self, 1115 model_type: Type[BaseModel], 1116 component_definition: ComponentDefinition, 1117 stream_name: str, 1118 stream_namespace: Optional[str], 1119 config: Config, 1120 message_repository: Optional[MessageRepository] = None, 1121 runtime_lookback_window: Optional[datetime.timedelta] = None, 1122 stream_state_migrations: Optional[List[Any]] = None, 1123 **kwargs: Any, 1124 ) -> ConcurrentCursor: 1125 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1126 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1127 # incoming state and connector_state_manager that is initialized when the component factory is created 1128 stream_state = ( 1129 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1130 if "stream_state" not in kwargs 1131 else kwargs["stream_state"] 1132 ) 1133 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1134 1135 component_type = component_definition.get("type") 1136 if component_definition.get("type") != model_type.__name__: 1137 raise ValueError( 1138 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1139 ) 1140 1141 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1142 1143 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1144 raise ValueError( 1145 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1146 ) 1147 1148 interpolated_cursor_field = InterpolatedString.create( 1149 datetime_based_cursor_model.cursor_field, 1150 parameters=datetime_based_cursor_model.parameters or {}, 1151 ) 1152 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1153 1154 interpolated_partition_field_start = InterpolatedString.create( 1155 datetime_based_cursor_model.partition_field_start or "start_time", 1156 parameters=datetime_based_cursor_model.parameters or {}, 1157 ) 1158 interpolated_partition_field_end = InterpolatedString.create( 1159 datetime_based_cursor_model.partition_field_end or "end_time", 1160 parameters=datetime_based_cursor_model.parameters or {}, 1161 ) 1162 1163 slice_boundary_fields = ( 1164 interpolated_partition_field_start.eval(config=config), 1165 interpolated_partition_field_end.eval(config=config), 1166 ) 1167 1168 datetime_format = datetime_based_cursor_model.datetime_format 1169 1170 cursor_granularity = ( 1171 parse_duration(datetime_based_cursor_model.cursor_granularity) 1172 if datetime_based_cursor_model.cursor_granularity 1173 else None 1174 ) 1175 1176 lookback_window = None 1177 interpolated_lookback_window = ( 1178 InterpolatedString.create( 1179 datetime_based_cursor_model.lookback_window, 1180 parameters=datetime_based_cursor_model.parameters or {}, 1181 ) 1182 if datetime_based_cursor_model.lookback_window 1183 else None 1184 ) 1185 if interpolated_lookback_window: 1186 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1187 if evaluated_lookback_window: 1188 lookback_window = parse_duration(evaluated_lookback_window) 1189 1190 connector_state_converter: DateTimeStreamStateConverter 1191 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1192 datetime_format=datetime_format, 1193 
input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1194 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1195 cursor_granularity=cursor_granularity, 1196 ) 1197 1198 # Adjusts the stream state by applying the runtime lookback window. 1199 # This is used to ensure correct state handling in case of failed partitions. 1200 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1201 if runtime_lookback_window and stream_state_value: 1202 new_stream_state = ( 1203 connector_state_converter.parse_timestamp(stream_state_value) 1204 - runtime_lookback_window 1205 ) 1206 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1207 new_stream_state 1208 ) 1209 1210 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1211 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1212 start_date_runtime_value = self.create_min_max_datetime( 1213 model=datetime_based_cursor_model.start_datetime, config=config 1214 ) 1215 else: 1216 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1217 1218 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1219 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1220 end_date_runtime_value = self.create_min_max_datetime( 1221 model=datetime_based_cursor_model.end_datetime, config=config 1222 ) 1223 else: 1224 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1225 1226 interpolated_start_date = MinMaxDatetime.create( 1227 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1228 parameters=datetime_based_cursor_model.parameters, 1229 ) 1230 interpolated_end_date = ( 1231 None 1232 if not end_date_runtime_value 1233 else MinMaxDatetime.create( 1234 end_date_runtime_value, datetime_based_cursor_model.parameters 1235 ) 1236 ) 1237 1238 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1239 if not interpolated_start_date.datetime_format: 1240 interpolated_start_date.datetime_format = datetime_format 1241 if interpolated_end_date and not interpolated_end_date.datetime_format: 1242 interpolated_end_date.datetime_format = datetime_format 1243 1244 start_date = interpolated_start_date.get_datetime(config=config) 1245 end_date_provider = ( 1246 partial(interpolated_end_date.get_datetime, config) 1247 if interpolated_end_date 1248 else connector_state_converter.get_end_provider() 1249 ) 1250 1251 if ( 1252 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1253 ) or ( 1254 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1255 ): 1256 raise ValueError( 1257 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1258 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1259 ) 1260 1261 # When step is not defined, default to a step size from the starting date to the present moment 1262 step_length = datetime.timedelta.max 1263 interpolated_step = ( 1264 InterpolatedString.create( 1265 datetime_based_cursor_model.step, 1266 parameters=datetime_based_cursor_model.parameters or {}, 1267 ) 1268 if datetime_based_cursor_model.step 1269 else None 1270 ) 1271 if interpolated_step: 1272 evaluated_step = interpolated_step.eval(config) 1273 if evaluated_step: 1274 step_length = parse_duration(evaluated_step) 1275 1276 clamping_strategy: ClampingStrategy = NoClamping() 1277 if datetime_based_cursor_model.clamping: 1278 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1279 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1280 # object which we want to keep agnostic of being low-code 1281 target = InterpolatedString( 1282 string=datetime_based_cursor_model.clamping.target, 1283 parameters=datetime_based_cursor_model.parameters or {}, 1284 ) 1285 evaluated_target = target.eval(config=config) 1286 match evaluated_target: 1287 case "DAY": 1288 clamping_strategy = DayClampingStrategy() 1289 end_date_provider = ClampingEndProvider( 1290 DayClampingStrategy(is_ceiling=False), 1291 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1292 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1293 ) 1294 case "WEEK": 1295 if ( 1296 not datetime_based_cursor_model.clamping.target_details 1297 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1298 ): 1299 raise ValueError( 1300 "Given WEEK clamping, weekday needs to be provided as target_details" 1301 ) 1302 weekday = self._assemble_weekday( 1303 datetime_based_cursor_model.clamping.target_details["weekday"] 1304 ) 1305 clamping_strategy = WeekClampingStrategy(weekday) 1306 end_date_provider = ClampingEndProvider( 1307 WeekClampingStrategy(weekday, is_ceiling=False), 1308 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1309 granularity=cursor_granularity or datetime.timedelta(days=1), 1310 ) 1311 case "MONTH": 1312 clamping_strategy = MonthClampingStrategy() 1313 end_date_provider = ClampingEndProvider( 1314 MonthClampingStrategy(is_ceiling=False), 1315 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1316 granularity=cursor_granularity or datetime.timedelta(days=1), 1317 ) 1318 case _: 1319 raise ValueError( 1320 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1321 ) 1322 1323 return ConcurrentCursor( 1324 stream_name=stream_name, 1325 stream_namespace=stream_namespace, 1326 stream_state=stream_state, 1327 message_repository=message_repository or self._message_repository, 1328 connector_state_manager=self._connector_state_manager, 1329 connector_state_converter=connector_state_converter, 1330 cursor_field=cursor_field, 1331 slice_boundary_fields=slice_boundary_fields, 1332 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1333 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1334 lookback_window=lookback_window, 1335 slice_range=step_length, 1336 cursor_granularity=cursor_granularity, 1337 clamping_strategy=clamping_strategy, 1338 ) 1339 1340 def create_concurrent_cursor_from_incrementing_count_cursor( 1341 self, 1342 model_type: Type[BaseModel], 1343 component_definition: ComponentDefinition, 1344 stream_name: str, 1345 stream_namespace: Optional[str], 1346 config: Config, 1347 message_repository: Optional[MessageRepository] = None, 1348 **kwargs: Any, 1349 ) -> ConcurrentCursor: 1350 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1351 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1352 # incoming state and connector_state_manager that is initialized when the component factory is created 1353 stream_state = ( 1354 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1355 if "stream_state" not in kwargs 1356 else kwargs["stream_state"] 1357 ) 1358 1359 component_type = component_definition.get("type") 1360 if component_definition.get("type") != model_type.__name__: 1361 raise ValueError( 1362 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1363 ) 1364 1365 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1366 1367 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1368 raise ValueError( 1369 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1370 ) 1371 1372 interpolated_start_value = ( 1373 InterpolatedString.create( 1374 incrementing_count_cursor_model.start_value, # type: ignore 1375 parameters=incrementing_count_cursor_model.parameters or {}, 1376 ) 1377 if incrementing_count_cursor_model.start_value 1378 else 0 1379 ) 1380 1381 interpolated_cursor_field = InterpolatedString.create( 1382 incrementing_count_cursor_model.cursor_field, 1383 parameters=incrementing_count_cursor_model.parameters or {}, 1384 ) 1385 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1386 1387 connector_state_converter = IncrementingCountStreamStateConverter( 1388 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1389 ) 1390 1391 return ConcurrentCursor( 1392 stream_name=stream_name, 1393 stream_namespace=stream_namespace, 1394 stream_state=stream_state, 1395 message_repository=message_repository or self._message_repository, 
1396 connector_state_manager=self._connector_state_manager, 1397 connector_state_converter=connector_state_converter, 1398 cursor_field=cursor_field, 1399 slice_boundary_fields=None, 1400 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1401 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1402 ) 1403 1404 def _assemble_weekday(self, weekday: str) -> Weekday: 1405 match weekday: 1406 case "MONDAY": 1407 return Weekday.MONDAY 1408 case "TUESDAY": 1409 return Weekday.TUESDAY 1410 case "WEDNESDAY": 1411 return Weekday.WEDNESDAY 1412 case "THURSDAY": 1413 return Weekday.THURSDAY 1414 case "FRIDAY": 1415 return Weekday.FRIDAY 1416 case "SATURDAY": 1417 return Weekday.SATURDAY 1418 case "SUNDAY": 1419 return Weekday.SUNDAY 1420 case _: 1421 raise ValueError(f"Unknown weekday {weekday}") 1422 1423 def create_concurrent_cursor_from_perpartition_cursor( 1424 self, 1425 state_manager: ConnectorStateManager, 1426 model_type: Type[BaseModel], 1427 component_definition: ComponentDefinition, 1428 stream_name: str, 1429 stream_namespace: Optional[str], 1430 config: Config, 1431 stream_state: MutableMapping[str, Any], 1432 partition_router: PartitionRouter, 1433 stream_state_migrations: Optional[List[Any]] = None, 1434 **kwargs: Any, 1435 ) -> ConcurrentPerPartitionCursor: 1436 component_type = component_definition.get("type") 1437 if component_definition.get("type") != model_type.__name__: 1438 raise ValueError( 1439 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1440 ) 1441 1442 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1443 1444 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1445 raise ValueError( 1446 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1447 ) 1448 1449 interpolated_cursor_field = InterpolatedString.create( 1450 datetime_based_cursor_model.cursor_field, 1451 parameters=datetime_based_cursor_model.parameters or {}, 1452 ) 1453 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1454 1455 datetime_format = datetime_based_cursor_model.datetime_format 1456 1457 cursor_granularity = ( 1458 parse_duration(datetime_based_cursor_model.cursor_granularity) 1459 if datetime_based_cursor_model.cursor_granularity 1460 else None 1461 ) 1462 1463 connector_state_converter: DateTimeStreamStateConverter 1464 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1465 datetime_format=datetime_format, 1466 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1467 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1468 cursor_granularity=cursor_granularity, 1469 ) 1470 1471 # Create the cursor factory 1472 cursor_factory = ConcurrentCursorFactory( 1473 partial( 1474 self.create_concurrent_cursor_from_datetime_based_cursor, 1475 state_manager=state_manager, 1476 model_type=model_type, 1477 component_definition=component_definition, 1478 stream_name=stream_name, 1479 stream_namespace=stream_namespace, 1480 config=config, 1481 message_repository=NoopMessageRepository(), 1482 stream_state_migrations=stream_state_migrations, 1483 ) 1484 ) 1485 stream_state 
= self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1486 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1487 use_global_cursor = isinstance( 1488 partition_router, GroupingPartitionRouter 1489 ) or component_definition.get("global_substream_cursor", False) 1490 1491 # Return the concurrent cursor and state converter 1492 return ConcurrentPerPartitionCursor( 1493 cursor_factory=cursor_factory, 1494 partition_router=partition_router, 1495 stream_name=stream_name, 1496 stream_namespace=stream_namespace, 1497 stream_state=stream_state, 1498 message_repository=self._message_repository, # type: ignore 1499 connector_state_manager=state_manager, 1500 connector_state_converter=connector_state_converter, 1501 cursor_field=cursor_field, 1502 use_global_cursor=use_global_cursor, 1503 ) 1504 1505 @staticmethod 1506 def create_constant_backoff_strategy( 1507 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1508 ) -> ConstantBackoffStrategy: 1509 return ConstantBackoffStrategy( 1510 backoff_time_in_seconds=model.backoff_time_in_seconds, 1511 config=config, 1512 parameters=model.parameters or {}, 1513 ) 1514 1515 def create_cursor_pagination( 1516 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1517 ) -> CursorPaginationStrategy: 1518 if isinstance(decoder, PaginationDecoderDecorator): 1519 inner_decoder = decoder.decoder 1520 else: 1521 inner_decoder = decoder 1522 decoder = PaginationDecoderDecorator(decoder=decoder) 1523 1524 if self._is_supported_decoder_for_pagination(inner_decoder): 1525 decoder_to_use = decoder 1526 else: 1527 raise ValueError( 1528 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1529 ) 1530 1531 return CursorPaginationStrategy( 1532 cursor_value=model.cursor_value, 1533 decoder=decoder_to_use, 1534 page_size=model.page_size, 1535 stop_condition=model.stop_condition, 1536 config=config, 1537 parameters=model.parameters or {}, 1538 ) 1539 1540 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1541 """ 1542 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1543 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1544 :param model: The Pydantic model of the custom component being created 1545 :param config: The custom defined connector config 1546 :return: The declarative component built from the Pydantic model to be used at runtime 1547 """ 1548 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1549 component_fields = get_type_hints(custom_component_class) 1550 model_args = model.dict() 1551 model_args["config"] = config 1552 1553 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1554 # we defer to these arguments over the component's definition 1555 for key, arg in kwargs.items(): 1556 model_args[key] = arg 1557 1558 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1559 # defined in the schema. The fields and types are defined within the Python class implementation. 
Pydantic can only parse down to 1560 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1561 for model_field, model_value in model_args.items(): 1562 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1563 if ( 1564 isinstance(model_value, dict) 1565 and "type" not in model_value 1566 and model_field in component_fields 1567 ): 1568 derived_type = self._derive_component_type_from_type_hints( 1569 component_fields.get(model_field) 1570 ) 1571 if derived_type: 1572 model_value["type"] = derived_type 1573 1574 if self._is_component(model_value): 1575 model_args[model_field] = self._create_nested_component( 1576 model, model_field, model_value, config 1577 ) 1578 elif isinstance(model_value, list): 1579 vals = [] 1580 for v in model_value: 1581 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1582 derived_type = self._derive_component_type_from_type_hints( 1583 component_fields.get(model_field) 1584 ) 1585 if derived_type: 1586 v["type"] = derived_type 1587 if self._is_component(v): 1588 vals.append(self._create_nested_component(model, model_field, v, config)) 1589 else: 1590 vals.append(v) 1591 model_args[model_field] = vals 1592 1593 kwargs = { 1594 class_field: model_args[class_field] 1595 for class_field in component_fields.keys() 1596 if class_field in model_args 1597 } 1598 return custom_component_class(**kwargs) 1599 1600 @staticmethod 1601 def _get_class_from_fully_qualified_class_name( 1602 full_qualified_class_name: str, 1603 ) -> Any: 1604 """Get a class from its fully qualified name. 1605 1606 If a custom components module is needed, we assume it is already registered - probably 1607 as `source_declarative_manifest.components` or `components`. 1608 1609 Args: 1610 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1611 1612 Returns: 1613 Any: The class object. 1614 1615 Raises: 1616 ValueError: If the class cannot be loaded. 1617 """ 1618 split = full_qualified_class_name.split(".") 1619 module_name_full = ".".join(split[:-1]) 1620 class_name = split[-1] 1621 1622 try: 1623 module_ref = importlib.import_module(module_name_full) 1624 except ModuleNotFoundError as e: 1625 if split[0] == "source_declarative_manifest": 1626 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. 
In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1627 try: 1628 import os 1629 1630 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1631 module_ref = importlib.import_module( 1632 module_name_with_source_declarative_manifest 1633 ) 1634 except ModuleNotFoundError: 1635 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1636 else: 1637 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1638 1639 try: 1640 return getattr(module_ref, class_name) 1641 except AttributeError as e: 1642 raise ValueError( 1643 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1644 ) from e 1645 1646 @staticmethod 1647 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1648 interface = field_type 1649 while True: 1650 origin = get_origin(interface) 1651 if origin: 1652 # Unnest types until we reach the raw type 1653 # List[T] -> T 1654 # Optional[List[T]] -> T 1655 args = get_args(interface) 1656 interface = args[0] 1657 else: 1658 break 1659 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1660 return interface.__name__ 1661 return None 1662 1663 @staticmethod 1664 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1665 if not cls: 1666 return False 1667 return cls.__module__ == "builtins" 1668 1669 @staticmethod 1670 def _extract_missing_parameters(error: TypeError) -> List[str]: 1671 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1672 if parameter_search: 1673 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1674 else: 1675 return [] 1676 1677 def _create_nested_component( 1678 self, model: Any, model_field: str, model_value: Any, config: Config 1679 ) -> Any: 1680 type_name = model_value.get("type", None) 1681 if not type_name: 1682 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1683 return model_value 1684 1685 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1686 if model_type: 1687 parsed_model = model_type.parse_obj(model_value) 1688 try: 1689 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1690 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1691 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1692 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1693 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1694 # are needed by a component and could not be shared. 
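# Editor's illustrative note (addition, not part of the upstream source). A minimal, hypothetical
# sketch of the mechanism described in the comment above: a nested DefaultPaginator under a custom
# component cannot inherit `url_base` from its parent, so the manifest author supplies it via
# `$parameters`, and the matching below forwards it because `url_base` is a keyword-only argument of
# create_default_paginator. All names and values here are examples only:
#
#     model_value = {
#         "type": "DefaultPaginator",
#         "pagination_strategy": {"type": "PageIncrement", "page_size": 100},
#         "$parameters": {"url_base": "https://api.example.com/v1"},
#     }
#
# With a definition like this, `matching_parameters` below would resolve to
# {"url_base": "https://api.example.com/v1"} and be passed through to the component constructor.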
1695 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1696 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1697 model_parameters = model_value.get("$parameters", {}) 1698 matching_parameters = { 1699 kwarg: model_parameters[kwarg] 1700 for kwarg in constructor_kwargs 1701 if kwarg in model_parameters 1702 } 1703 return self._create_component_from_model( 1704 model=parsed_model, config=config, **matching_parameters 1705 ) 1706 except TypeError as error: 1707 missing_parameters = self._extract_missing_parameters(error) 1708 if missing_parameters: 1709 raise ValueError( 1710 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1711 + ", ".join( 1712 ( 1713 f"{type_name}.$parameters.{parameter}" 1714 for parameter in missing_parameters 1715 ) 1716 ) 1717 ) 1718 raise TypeError( 1719 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1720 ) 1721 else: 1722 raise ValueError( 1723 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1724 ) 1725 1726 @staticmethod 1727 def _is_component(model_value: Any) -> bool: 1728 return isinstance(model_value, dict) and model_value.get("type") is not None 1729 1730 def create_datetime_based_cursor( 1731 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1732 ) -> DatetimeBasedCursor: 1733 start_datetime: Union[str, MinMaxDatetime] = ( 1734 model.start_datetime 1735 if isinstance(model.start_datetime, str) 1736 else self.create_min_max_datetime(model.start_datetime, config) 1737 ) 1738 end_datetime: Union[str, MinMaxDatetime, None] = None 1739 if model.is_data_feed and model.end_datetime: 1740 raise ValueError("Data feed does not support end_datetime") 1741 if model.is_data_feed and model.is_client_side_incremental: 1742 raise ValueError( 1743 "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 
1744 ) 1745 if model.end_datetime: 1746 end_datetime = ( 1747 model.end_datetime 1748 if isinstance(model.end_datetime, str) 1749 else self.create_min_max_datetime(model.end_datetime, config) 1750 ) 1751 1752 end_time_option = ( 1753 self._create_component_from_model( 1754 model.end_time_option, config, parameters=model.parameters or {} 1755 ) 1756 if model.end_time_option 1757 else None 1758 ) 1759 start_time_option = ( 1760 self._create_component_from_model( 1761 model.start_time_option, config, parameters=model.parameters or {} 1762 ) 1763 if model.start_time_option 1764 else None 1765 ) 1766 1767 return DatetimeBasedCursor( 1768 cursor_field=model.cursor_field, 1769 cursor_datetime_formats=model.cursor_datetime_formats 1770 if model.cursor_datetime_formats 1771 else [], 1772 cursor_granularity=model.cursor_granularity, 1773 datetime_format=model.datetime_format, 1774 end_datetime=end_datetime, 1775 start_datetime=start_datetime, 1776 step=model.step, 1777 end_time_option=end_time_option, 1778 lookback_window=model.lookback_window, 1779 start_time_option=start_time_option, 1780 partition_field_end=model.partition_field_end, 1781 partition_field_start=model.partition_field_start, 1782 message_repository=self._message_repository, 1783 is_compare_strictly=model.is_compare_strictly, 1784 config=config, 1785 parameters=model.parameters or {}, 1786 ) 1787 1788 def create_declarative_stream( 1789 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1790 ) -> DeclarativeStream: 1791 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1792 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1793 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1794 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1795 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1796 1797 primary_key = model.primary_key.__root__ if model.primary_key else None 1798 stop_condition_on_cursor = ( 1799 model.incremental_sync 1800 and hasattr(model.incremental_sync, "is_data_feed") 1801 and model.incremental_sync.is_data_feed 1802 ) 1803 client_side_incremental_sync = None 1804 if ( 1805 model.incremental_sync 1806 and hasattr(model.incremental_sync, "is_client_side_incremental") 1807 and model.incremental_sync.is_client_side_incremental 1808 ): 1809 supported_slicers = ( 1810 DatetimeBasedCursor, 1811 GlobalSubstreamCursor, 1812 PerPartitionWithGlobalCursor, 1813 ) 1814 if combined_slicers and not isinstance(combined_slicers, supported_slicers): 1815 raise ValueError( 1816 "Unsupported Slicer is used. 
PerPartitionWithGlobalCursor should be used here instead" 1817 ) 1818 cursor = ( 1819 combined_slicers 1820 if isinstance( 1821 combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor) 1822 ) 1823 else self._create_component_from_model(model=model.incremental_sync, config=config) 1824 ) 1825 1826 client_side_incremental_sync = {"cursor": cursor} 1827 1828 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1829 cursor_model = model.incremental_sync 1830 1831 end_time_option = ( 1832 self._create_component_from_model( 1833 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1834 ) 1835 if cursor_model.end_time_option 1836 else None 1837 ) 1838 start_time_option = ( 1839 self._create_component_from_model( 1840 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1841 ) 1842 if cursor_model.start_time_option 1843 else None 1844 ) 1845 1846 request_options_provider = DatetimeBasedRequestOptionsProvider( 1847 start_time_option=start_time_option, 1848 end_time_option=end_time_option, 1849 partition_field_start=cursor_model.partition_field_end, 1850 partition_field_end=cursor_model.partition_field_end, 1851 config=config, 1852 parameters=model.parameters or {}, 1853 ) 1854 elif model.incremental_sync and isinstance( 1855 model.incremental_sync, IncrementingCountCursorModel 1856 ): 1857 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1858 1859 start_time_option = ( 1860 self._create_component_from_model( 1861 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1862 config, 1863 parameters=cursor_model.parameters or {}, 1864 ) 1865 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1866 else None 1867 ) 1868 1869 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1870 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 1871 partition_field_start = "start" 1872 1873 request_options_provider = DatetimeBasedRequestOptionsProvider( 1874 start_time_option=start_time_option, 1875 partition_field_start=partition_field_start, 1876 config=config, 1877 parameters=model.parameters or {}, 1878 ) 1879 else: 1880 request_options_provider = None 1881 1882 transformations = [] 1883 if model.transformations: 1884 for transformation_model in model.transformations: 1885 transformations.append( 1886 self._create_component_from_model(model=transformation_model, config=config) 1887 ) 1888 file_uploader = None 1889 if model.file_uploader: 1890 file_uploader = self._create_component_from_model( 1891 model=model.file_uploader, config=config 1892 ) 1893 1894 retriever = self._create_component_from_model( 1895 model=model.retriever, 1896 config=config, 1897 name=model.name, 1898 primary_key=primary_key, 1899 stream_slicer=combined_slicers, 1900 request_options_provider=request_options_provider, 1901 stop_condition_on_cursor=stop_condition_on_cursor, 1902 client_side_incremental_sync=client_side_incremental_sync, 1903 transformations=transformations, 1904 file_uploader=file_uploader, 1905 incremental_sync=model.incremental_sync, 1906 ) 1907 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 1908 1909 if model.state_migrations: 1910 state_transformations = [ 1911 self._create_component_from_model(state_migration, config, declarative_stream=model) 1912 for 
state_migration in model.state_migrations 1913 ] 1914 else: 1915 state_transformations = [] 1916 1917 schema_loader: Union[ 1918 CompositeSchemaLoader, 1919 DefaultSchemaLoader, 1920 DynamicSchemaLoader, 1921 InlineSchemaLoader, 1922 JsonFileSchemaLoader, 1923 ] 1924 if model.schema_loader and isinstance(model.schema_loader, list): 1925 nested_schema_loaders = [ 1926 self._create_component_from_model(model=nested_schema_loader, config=config) 1927 for nested_schema_loader in model.schema_loader 1928 ] 1929 schema_loader = CompositeSchemaLoader( 1930 schema_loaders=nested_schema_loaders, parameters={} 1931 ) 1932 elif model.schema_loader: 1933 schema_loader = self._create_component_from_model( 1934 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 1935 config=config, 1936 ) 1937 else: 1938 options = model.parameters or {} 1939 if "name" not in options: 1940 options["name"] = model.name 1941 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 1942 1943 return DeclarativeStream( 1944 name=model.name or "", 1945 primary_key=primary_key, 1946 retriever=retriever, 1947 schema_loader=schema_loader, 1948 stream_cursor_field=cursor_field or "", 1949 state_migrations=state_transformations, 1950 config=config, 1951 parameters=model.parameters or {}, 1952 ) 1953 1954 def _build_stream_slicer_from_partition_router( 1955 self, 1956 model: Union[ 1957 AsyncRetrieverModel, 1958 CustomRetrieverModel, 1959 SimpleRetrieverModel, 1960 ], 1961 config: Config, 1962 stream_name: Optional[str] = None, 1963 ) -> Optional[PartitionRouter]: 1964 if ( 1965 hasattr(model, "partition_router") 1966 and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel) 1967 and model.partition_router 1968 ): 1969 stream_slicer_model = model.partition_router 1970 if isinstance(stream_slicer_model, list): 1971 return CartesianProductStreamSlicer( 1972 [ 1973 self._create_component_from_model( 1974 model=slicer, config=config, stream_name=stream_name or "" 1975 ) 1976 for slicer in stream_slicer_model 1977 ], 1978 parameters={}, 1979 ) 1980 else: 1981 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 1982 model=stream_slicer_model, config=config, stream_name=stream_name or "" 1983 ) 1984 return None 1985 1986 def _build_incremental_cursor( 1987 self, 1988 model: DeclarativeStreamModel, 1989 stream_slicer: Optional[PartitionRouter], 1990 config: Config, 1991 ) -> Optional[StreamSlicer]: 1992 if model.incremental_sync and stream_slicer: 1993 if model.retriever.type == "AsyncRetriever": 1994 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 1995 state_manager=self._connector_state_manager, 1996 model_type=DatetimeBasedCursorModel, 1997 component_definition=model.incremental_sync.__dict__, 1998 stream_name=model.name or "", 1999 stream_namespace=None, 2000 config=config or {}, 2001 stream_state={}, 2002 partition_router=stream_slicer, 2003 ) 2004 2005 incremental_sync_model = model.incremental_sync 2006 cursor_component = self._create_component_from_model( 2007 model=incremental_sync_model, config=config 2008 ) 2009 is_global_cursor = ( 2010 hasattr(incremental_sync_model, "global_substream_cursor") 2011 and incremental_sync_model.global_substream_cursor 2012 ) 2013 2014 if is_global_cursor: 2015 return GlobalSubstreamCursor( 2016 stream_cursor=cursor_component, partition_router=stream_slicer 2017 ) 2018 return PerPartitionWithGlobalCursor( 2019 cursor_factory=CursorFactory( 2020 lambda: self._create_component_from_model( 2021 model=incremental_sync_model, config=config 2022 ), 2023 ), 2024 partition_router=stream_slicer, 2025 stream_cursor=cursor_component, 2026 ) 2027 elif model.incremental_sync: 2028 if model.retriever.type == "AsyncRetriever": 2029 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2030 model_type=DatetimeBasedCursorModel, 2031 component_definition=model.incremental_sync.__dict__, 2032 stream_name=model.name or "", 2033 stream_namespace=None, 2034 config=config or {}, 2035 stream_state_migrations=model.state_migrations, 2036 ) 2037 return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync 2038 return None 2039 2040 def _build_resumable_cursor( 2041 self, 2042 model: Union[ 2043 AsyncRetrieverModel, 2044 CustomRetrieverModel, 2045 SimpleRetrieverModel, 2046 ], 2047 stream_slicer: Optional[PartitionRouter], 2048 ) -> Optional[StreamSlicer]: 2049 if hasattr(model, "paginator") and model.paginator and not stream_slicer: 2050 # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` 2051 return ResumableFullRefreshCursor(parameters={}) 2052 elif stream_slicer: 2053 # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` 2054 return PerPartitionCursor( 2055 cursor_factory=CursorFactory( 2056 create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) 2057 ), 2058 partition_router=stream_slicer, 2059 ) 2060 return None 2061 2062 def _merge_stream_slicers( 2063 self, model: DeclarativeStreamModel, config: Config 2064 ) -> Optional[StreamSlicer]: 2065 retriever_model = model.retriever 2066 2067 stream_slicer = self._build_stream_slicer_from_partition_router( 2068 retriever_model, config, stream_name=model.name 2069 ) 2070 2071 if retriever_model.type == "AsyncRetriever": 2072 is_not_datetime_cursor = ( 2073 model.incremental_sync.type != "DatetimeBasedCursor" 2074 if model.incremental_sync 2075 else None 2076 ) 2077 is_partition_router = ( 2078 bool(retriever_model.partition_router) if model.incremental_sync else None 2079 ) 2080 2081 if 
is_not_datetime_cursor: 2082 # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the 2083 # support or unordered slices (for example, when we trigger reports for January and February, the report 2084 # in February can be completed first). Once we have support for custom concurrent cursor or have a new 2085 # implementation available in the CDK, we can enable more cursors here. 2086 raise ValueError( 2087 "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet." 2088 ) 2089 2090 if is_partition_router and not stream_slicer: 2091 # Note that this development is also done in parallel to the per partition development which once merged 2092 # we could support here by calling create_concurrent_cursor_from_perpartition_cursor 2093 raise ValueError("Per partition state is not supported yet for AsyncRetriever.") 2094 2095 if model.incremental_sync: 2096 return self._build_incremental_cursor(model, stream_slicer, config) 2097 2098 return ( 2099 stream_slicer 2100 if self._disable_resumable_full_refresh 2101 else self._build_resumable_cursor(retriever_model, stream_slicer) 2102 ) 2103 2104 def create_default_error_handler( 2105 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2106 ) -> DefaultErrorHandler: 2107 backoff_strategies = [] 2108 if model.backoff_strategies: 2109 for backoff_strategy_model in model.backoff_strategies: 2110 backoff_strategies.append( 2111 self._create_component_from_model(model=backoff_strategy_model, config=config) 2112 ) 2113 2114 response_filters = [] 2115 if model.response_filters: 2116 for response_filter_model in model.response_filters: 2117 response_filters.append( 2118 self._create_component_from_model(model=response_filter_model, config=config) 2119 ) 2120 response_filters.append( 2121 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2122 ) 2123 2124 return DefaultErrorHandler( 2125 backoff_strategies=backoff_strategies, 2126 max_retries=model.max_retries, 2127 response_filters=response_filters, 2128 config=config, 2129 parameters=model.parameters or {}, 2130 ) 2131 2132 def create_default_paginator( 2133 self, 2134 model: DefaultPaginatorModel, 2135 config: Config, 2136 *, 2137 url_base: str, 2138 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2139 decoder: Optional[Decoder] = None, 2140 cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, 2141 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2142 if decoder: 2143 if self._is_supported_decoder_for_pagination(decoder): 2144 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2145 else: 2146 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2147 else: 2148 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2149 page_size_option = ( 2150 self._create_component_from_model(model=model.page_size_option, config=config) 2151 if model.page_size_option 2152 else None 2153 ) 2154 page_token_option = ( 2155 self._create_component_from_model(model=model.page_token_option, config=config) 2156 if model.page_token_option 2157 else None 2158 ) 2159 pagination_strategy = self._create_component_from_model( 2160 model=model.pagination_strategy, 2161 config=config, 2162 decoder=decoder_to_use, 2163 extractor_model=extractor_model, 2164 ) 2165 if cursor_used_for_stop_condition: 2166 pagination_strategy = StopConditionPaginationStrategyDecorator( 2167 pagination_strategy, 
CursorStopCondition(cursor_used_for_stop_condition) 2168 ) 2169 paginator = DefaultPaginator( 2170 decoder=decoder_to_use, 2171 page_size_option=page_size_option, 2172 page_token_option=page_token_option, 2173 pagination_strategy=pagination_strategy, 2174 url_base=url_base, 2175 config=config, 2176 parameters=model.parameters or {}, 2177 ) 2178 if self._limit_pages_fetched_per_slice: 2179 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2180 return paginator 2181 2182 def create_dpath_extractor( 2183 self, 2184 model: DpathExtractorModel, 2185 config: Config, 2186 decoder: Optional[Decoder] = None, 2187 **kwargs: Any, 2188 ) -> DpathExtractor: 2189 if decoder: 2190 decoder_to_use = decoder 2191 else: 2192 decoder_to_use = JsonDecoder(parameters={}) 2193 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2194 return DpathExtractor( 2195 decoder=decoder_to_use, 2196 field_path=model_field_path, 2197 config=config, 2198 parameters=model.parameters or {}, 2199 ) 2200 2201 @staticmethod 2202 def create_response_to_file_extractor( 2203 model: ResponseToFileExtractorModel, 2204 **kwargs: Any, 2205 ) -> ResponseToFileExtractor: 2206 return ResponseToFileExtractor(parameters=model.parameters or {}) 2207 2208 @staticmethod 2209 def create_exponential_backoff_strategy( 2210 model: ExponentialBackoffStrategyModel, config: Config 2211 ) -> ExponentialBackoffStrategy: 2212 return ExponentialBackoffStrategy( 2213 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2214 ) 2215 2216 @staticmethod 2217 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2218 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2219 2220 def create_http_requester( 2221 self, 2222 model: HttpRequesterModel, 2223 config: Config, 2224 decoder: Decoder = JsonDecoder(parameters={}), 2225 query_properties_key: Optional[str] = None, 2226 use_cache: Optional[bool] = None, 2227 *, 2228 name: str, 2229 ) -> HttpRequester: 2230 authenticator = ( 2231 self._create_component_from_model( 2232 model=model.authenticator, 2233 config=config, 2234 url_base=model.url or model.url_base, 2235 name=name, 2236 decoder=decoder, 2237 ) 2238 if model.authenticator 2239 else None 2240 ) 2241 error_handler = ( 2242 self._create_component_from_model(model=model.error_handler, config=config) 2243 if model.error_handler 2244 else DefaultErrorHandler( 2245 backoff_strategies=[], 2246 response_filters=[], 2247 config=config, 2248 parameters=model.parameters or {}, 2249 ) 2250 ) 2251 2252 api_budget = self._api_budget 2253 2254 # Removes QueryProperties components from the interpolated mappings because it has been designed 2255 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2256 # instead of through jinja interpolation 2257 request_parameters: Optional[Union[str, Mapping[str, str]]] 2258 if isinstance(model.request_parameters, Mapping): 2259 request_parameters = self._remove_query_properties(model.request_parameters) 2260 else: 2261 request_parameters = model.request_parameters 2262 2263 request_options_provider = InterpolatedRequestOptionsProvider( 2264 request_body=model.request_body, 2265 request_body_data=model.request_body_data, 2266 request_body_json=model.request_body_json, 2267 request_headers=model.request_headers, 2268 request_parameters=request_parameters, 2269 query_properties_key=query_properties_key, 2270 config=config, 2271 
parameters=model.parameters or {}, 2272 ) 2273 2274 assert model.use_cache is not None # for mypy 2275 assert model.http_method is not None # for mypy 2276 2277 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2278 2279 return HttpRequester( 2280 name=name, 2281 url=model.url, 2282 url_base=model.url_base, 2283 path=model.path, 2284 authenticator=authenticator, 2285 error_handler=error_handler, 2286 api_budget=api_budget, 2287 http_method=HttpMethod[model.http_method.value], 2288 request_options_provider=request_options_provider, 2289 config=config, 2290 disable_retries=self._disable_retries, 2291 parameters=model.parameters or {}, 2292 message_repository=self._message_repository, 2293 use_cache=should_use_cache, 2294 decoder=decoder, 2295 stream_response=decoder.is_stream_response() if decoder else False, 2296 ) 2297 2298 @staticmethod 2299 def create_http_response_filter( 2300 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2301 ) -> HttpResponseFilter: 2302 if model.action: 2303 action = ResponseAction(model.action.value) 2304 else: 2305 action = None 2306 2307 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2308 2309 http_codes = ( 2310 set(model.http_codes) if model.http_codes else set() 2311 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2312 2313 return HttpResponseFilter( 2314 action=action, 2315 failure_type=failure_type, 2316 error_message=model.error_message or "", 2317 error_message_contains=model.error_message_contains or "", 2318 http_codes=http_codes, 2319 predicate=model.predicate or "", 2320 config=config, 2321 parameters=model.parameters or {}, 2322 ) 2323 2324 @staticmethod 2325 def create_inline_schema_loader( 2326 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2327 ) -> InlineSchemaLoader: 2328 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2329 2330 def create_complex_field_type( 2331 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2332 ) -> ComplexFieldType: 2333 items = ( 2334 self._create_component_from_model(model=model.items, config=config) 2335 if isinstance(model.items, ComplexFieldTypeModel) 2336 else model.items 2337 ) 2338 2339 return ComplexFieldType(field_type=model.field_type, items=items) 2340 2341 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2342 target_type = ( 2343 self._create_component_from_model(model=model.target_type, config=config) 2344 if isinstance(model.target_type, ComplexFieldTypeModel) 2345 else model.target_type 2346 ) 2347 2348 return TypesMap( 2349 target_type=target_type, 2350 current_type=model.current_type, 2351 condition=model.condition if model.condition is not None else "True", 2352 ) 2353 2354 def create_schema_type_identifier( 2355 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2356 ) -> SchemaTypeIdentifier: 2357 types_mapping = [] 2358 if model.types_mapping: 2359 types_mapping.extend( 2360 [ 2361 self._create_component_from_model(types_map, config=config) 2362 for types_map in model.types_mapping 2363 ] 2364 ) 2365 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2366 [x for x in model.schema_pointer] if model.schema_pointer else [] 2367 ) 2368 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2369 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2370 [x for x in model.type_pointer] if 
model.type_pointer else None 2371 ) 2372 2373 return SchemaTypeIdentifier( 2374 schema_pointer=model_schema_pointer, 2375 key_pointer=model_key_pointer, 2376 type_pointer=model_type_pointer, 2377 types_mapping=types_mapping, 2378 parameters=model.parameters or {}, 2379 ) 2380 2381 def create_dynamic_schema_loader( 2382 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2383 ) -> DynamicSchemaLoader: 2384 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2385 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2386 2387 schema_transformations = [] 2388 if model.schema_transformations: 2389 for transformation_model in model.schema_transformations: 2390 schema_transformations.append( 2391 self._create_component_from_model(model=transformation_model, config=config) 2392 ) 2393 2394 retriever = self._create_component_from_model( 2395 model=model.retriever, 2396 config=config, 2397 name="dynamic_properties", 2398 primary_key=None, 2399 stream_slicer=combined_slicers, 2400 transformations=[], 2401 use_cache=True, 2402 ) 2403 schema_type_identifier = self._create_component_from_model( 2404 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2405 ) 2406 return DynamicSchemaLoader( 2407 retriever=retriever, 2408 config=config, 2409 schema_transformations=schema_transformations, 2410 schema_type_identifier=schema_type_identifier, 2411 parameters=model.parameters or {}, 2412 ) 2413 2414 @staticmethod 2415 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2416 return JsonDecoder(parameters={}) 2417 2418 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2419 return CompositeRawDecoder( 2420 parser=ModelToComponentFactory._get_parser(model, config), 2421 stream_response=False if self._emit_connector_builder_messages else True, 2422 ) 2423 2424 def create_jsonl_decoder( 2425 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2426 ) -> Decoder: 2427 return CompositeRawDecoder( 2428 parser=ModelToComponentFactory._get_parser(model, config), 2429 stream_response=False if self._emit_connector_builder_messages else True, 2430 ) 2431 2432 def create_gzip_decoder( 2433 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2434 ) -> Decoder: 2435 _compressed_response_types = { 2436 "gzip", 2437 "x-gzip", 2438 "gzip, deflate", 2439 "x-gzip, deflate", 2440 "application/zip", 2441 "application/gzip", 2442 "application/x-gzip", 2443 "application/x-zip-compressed", 2444 } 2445 2446 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2447 2448 if self._emit_connector_builder_messages: 2449 # This is very surprising but if the response is not streamed, 2450 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2451 # which uses urllib3 directly and does not uncompress the data. 
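# Editor's illustrative note (addition, not part of the upstream source). The distinction described in
# the comment above is standard `requests` behavior; a rough sketch with hypothetical names:
#
#     response = requests.get(url)   # served with Content-Encoding: gzip
#     response.content               # already decompressed by requests (urllib3 decode_content)
#     response.raw.read()            # raw stream, still gzip-compressed
#
# which is why the non-streamed connector-builder path below can bypass the GzipParser and hand the
# payload directly to the inner parser.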
2452 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2453 2454 return CompositeRawDecoder.by_headers( 2455 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2456 stream_response=True, 2457 fallback_parser=gzip_parser.inner_parser, 2458 ) 2459 2460 @staticmethod 2461 def create_incrementing_count_cursor( 2462 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2463 ) -> DatetimeBasedCursor: 2464 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2465 # we still parse models into components. The issue is that there's no runtime implementation of a 2466 # IncrementingCountCursor. 2467 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 2468 return DatetimeBasedCursor( 2469 cursor_field=model.cursor_field, 2470 datetime_format="%Y-%m-%d", 2471 start_datetime="2024-12-12", 2472 config=config, 2473 parameters={}, 2474 ) 2475 2476 @staticmethod 2477 def create_iterable_decoder( 2478 model: IterableDecoderModel, config: Config, **kwargs: Any 2479 ) -> IterableDecoder: 2480 return IterableDecoder(parameters={}) 2481 2482 @staticmethod 2483 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2484 return XmlDecoder(parameters={}) 2485 2486 def create_zipfile_decoder( 2487 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2488 ) -> ZipfileDecoder: 2489 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2490 2491 @staticmethod 2492 def _get_parser(model: BaseModel, config: Config) -> Parser: 2493 if isinstance(model, JsonDecoderModel): 2494 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2495 return JsonParser() 2496 elif isinstance(model, JsonlDecoderModel): 2497 return JsonLineParser() 2498 elif isinstance(model, CsvDecoderModel): 2499 return CsvParser(encoding=model.encoding, delimiter=model.delimiter) 2500 elif isinstance(model, GzipDecoderModel): 2501 return GzipParser( 2502 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2503 ) 2504 elif isinstance( 2505 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2506 ): 2507 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2508 2509 raise ValueError(f"Unknown decoder type {model}") 2510 2511 @staticmethod 2512 def create_json_file_schema_loader( 2513 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2514 ) -> JsonFileSchemaLoader: 2515 return JsonFileSchemaLoader( 2516 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2517 ) 2518 2519 @staticmethod 2520 def create_jwt_authenticator( 2521 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2522 ) -> JwtAuthenticator: 2523 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2524 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2525 return JwtAuthenticator( 2526 config=config, 2527 parameters=model.parameters or {}, 2528 algorithm=JwtAlgorithm(model.algorithm.value), 2529 secret_key=model.secret_key, 2530 base64_encode_secret_key=model.base64_encode_secret_key, 2531 token_duration=model.token_duration, 2532 header_prefix=model.header_prefix, 2533 kid=jwt_headers.kid, 2534 typ=jwt_headers.typ, 2535 cty=jwt_headers.cty, 2536 
iss=jwt_payload.iss, 2537 sub=jwt_payload.sub, 2538 aud=jwt_payload.aud, 2539 additional_jwt_headers=model.additional_jwt_headers, 2540 additional_jwt_payload=model.additional_jwt_payload, 2541 ) 2542 2543 def create_list_partition_router( 2544 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2545 ) -> ListPartitionRouter: 2546 request_option = ( 2547 self._create_component_from_model(model.request_option, config) 2548 if model.request_option 2549 else None 2550 ) 2551 return ListPartitionRouter( 2552 cursor_field=model.cursor_field, 2553 request_option=request_option, 2554 values=model.values, 2555 config=config, 2556 parameters=model.parameters or {}, 2557 ) 2558 2559 @staticmethod 2560 def create_min_max_datetime( 2561 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2562 ) -> MinMaxDatetime: 2563 return MinMaxDatetime( 2564 datetime=model.datetime, 2565 datetime_format=model.datetime_format or "", 2566 max_datetime=model.max_datetime or "", 2567 min_datetime=model.min_datetime or "", 2568 parameters=model.parameters or {}, 2569 ) 2570 2571 @staticmethod 2572 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2573 return NoAuth(parameters=model.parameters or {}) 2574 2575 @staticmethod 2576 def create_no_pagination( 2577 model: NoPaginationModel, config: Config, **kwargs: Any 2578 ) -> NoPagination: 2579 return NoPagination(parameters={}) 2580 2581 def create_oauth_authenticator( 2582 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2583 ) -> DeclarativeOauth2Authenticator: 2584 profile_assertion = ( 2585 self._create_component_from_model(model.profile_assertion, config=config) 2586 if model.profile_assertion 2587 else None 2588 ) 2589 2590 if model.refresh_token_updater: 2591 # ignore type error because fixing it would have a lot of dependencies, revisit later 2592 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2593 config, 2594 InterpolatedString.create( 2595 model.token_refresh_endpoint, # type: ignore 2596 parameters=model.parameters or {}, 2597 ).eval(config), 2598 access_token_name=InterpolatedString.create( 2599 model.access_token_name or "access_token", parameters=model.parameters or {} 2600 ).eval(config), 2601 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2602 expires_in_name=InterpolatedString.create( 2603 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2604 ).eval(config), 2605 client_id_name=InterpolatedString.create( 2606 model.client_id_name or "client_id", parameters=model.parameters or {} 2607 ).eval(config), 2608 client_id=InterpolatedString.create( 2609 model.client_id, parameters=model.parameters or {} 2610 ).eval(config) 2611 if model.client_id 2612 else model.client_id, 2613 client_secret_name=InterpolatedString.create( 2614 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2615 ).eval(config), 2616 client_secret=InterpolatedString.create( 2617 model.client_secret, parameters=model.parameters or {} 2618 ).eval(config) 2619 if model.client_secret 2620 else model.client_secret, 2621 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2622 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2623 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2624 grant_type_name=InterpolatedString.create( 2625 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2626 ).eval(config), 2627 
grant_type=InterpolatedString.create( 2628 model.grant_type or "refresh_token", parameters=model.parameters or {} 2629 ).eval(config), 2630 refresh_request_body=InterpolatedMapping( 2631 model.refresh_request_body or {}, parameters=model.parameters or {} 2632 ).eval(config), 2633 refresh_request_headers=InterpolatedMapping( 2634 model.refresh_request_headers or {}, parameters=model.parameters or {} 2635 ).eval(config), 2636 scopes=model.scopes, 2637 token_expiry_date_format=model.token_expiry_date_format, 2638 message_repository=self._message_repository, 2639 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2640 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2641 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2642 ) 2643 # ignore type error because fixing it would have a lot of dependencies, revisit later 2644 return DeclarativeOauth2Authenticator( # type: ignore 2645 access_token_name=model.access_token_name or "access_token", 2646 access_token_value=model.access_token_value, 2647 client_id_name=model.client_id_name or "client_id", 2648 client_id=model.client_id, 2649 client_secret_name=model.client_secret_name or "client_secret", 2650 client_secret=model.client_secret, 2651 expires_in_name=model.expires_in_name or "expires_in", 2652 grant_type_name=model.grant_type_name or "grant_type", 2653 grant_type=model.grant_type or "refresh_token", 2654 refresh_request_body=model.refresh_request_body, 2655 refresh_request_headers=model.refresh_request_headers, 2656 refresh_token_name=model.refresh_token_name or "refresh_token", 2657 refresh_token=model.refresh_token, 2658 scopes=model.scopes, 2659 token_expiry_date=model.token_expiry_date, 2660 token_expiry_date_format=model.token_expiry_date_format, 2661 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2662 token_refresh_endpoint=model.token_refresh_endpoint, 2663 config=config, 2664 parameters=model.parameters or {}, 2665 message_repository=self._message_repository, 2666 profile_assertion=profile_assertion, 2667 use_profile_assertion=model.use_profile_assertion, 2668 ) 2669 2670 def create_offset_increment( 2671 self, 2672 model: OffsetIncrementModel, 2673 config: Config, 2674 decoder: Decoder, 2675 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2676 **kwargs: Any, 2677 ) -> OffsetIncrement: 2678 if isinstance(decoder, PaginationDecoderDecorator): 2679 inner_decoder = decoder.decoder 2680 else: 2681 inner_decoder = decoder 2682 decoder = PaginationDecoderDecorator(decoder=decoder) 2683 2684 if self._is_supported_decoder_for_pagination(inner_decoder): 2685 decoder_to_use = decoder 2686 else: 2687 raise ValueError( 2688 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2689 ) 2690 2691 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2692 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2693 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2694 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2695 # When we have more time to investigate we can look into reusing the same component. 
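# Editor's illustrative note (addition, not part of the upstream source). "Two separate extractors with
# identical behavior" means the same extractor model is instantiated twice, e.g. for a hypothetical
# DpathExtractor over ["results"]:
#
#     selector_extractor = DpathExtractor(field_path=["results"], config=config, parameters={})
#     paginator_extractor = DpathExtractor(field_path=["results"], config=config, parameters={})
#
# one owned by the RecordSelector and one owned by this OffsetIncrement, so the pagination strategy can
# evaluate the last page's records without reaching into the retriever's selector instance.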
2696 extractor = ( 2697 self._create_component_from_model( 2698 model=extractor_model, config=config, decoder=decoder_to_use 2699 ) 2700 if extractor_model 2701 else None 2702 ) 2703 2704 return OffsetIncrement( 2705 page_size=model.page_size, 2706 config=config, 2707 decoder=decoder_to_use, 2708 extractor=extractor, 2709 inject_on_first_request=model.inject_on_first_request or False, 2710 parameters=model.parameters or {}, 2711 ) 2712 2713 @staticmethod 2714 def create_page_increment( 2715 model: PageIncrementModel, config: Config, **kwargs: Any 2716 ) -> PageIncrement: 2717 return PageIncrement( 2718 page_size=model.page_size, 2719 config=config, 2720 start_from_page=model.start_from_page or 0, 2721 inject_on_first_request=model.inject_on_first_request or False, 2722 parameters=model.parameters or {}, 2723 ) 2724 2725 def create_parent_stream_config( 2726 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2727 ) -> ParentStreamConfig: 2728 declarative_stream = self._create_component_from_model( 2729 model.stream, config=config, **kwargs 2730 ) 2731 request_option = ( 2732 self._create_component_from_model(model.request_option, config=config) 2733 if model.request_option 2734 else None 2735 ) 2736 2737 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2738 raise ValueError( 2739 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2740 ) 2741 2742 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2743 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2744 ) 2745 2746 return ParentStreamConfig( 2747 parent_key=model.parent_key, 2748 request_option=request_option, 2749 stream=declarative_stream, 2750 partition_field=model.partition_field, 2751 config=config, 2752 incremental_dependency=model.incremental_dependency or False, 2753 parameters=model.parameters or {}, 2754 extra_fields=model.extra_fields, 2755 lazy_read_pointer=model_lazy_read_pointer, 2756 ) 2757 2758 def create_properties_from_endpoint( 2759 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2760 ) -> PropertiesFromEndpoint: 2761 retriever = self._create_component_from_model( 2762 model=model.retriever, 2763 config=config, 2764 name="dynamic_properties", 2765 primary_key=None, 2766 stream_slicer=None, 2767 transformations=[], 2768 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 2769 ) 2770 return PropertiesFromEndpoint( 2771 property_field_path=model.property_field_path, 2772 retriever=retriever, 2773 config=config, 2774 parameters=model.parameters or {}, 2775 ) 2776 2777 def create_property_chunking( 2778 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2779 ) -> PropertyChunking: 2780 record_merge_strategy = ( 2781 self._create_component_from_model( 2782 model=model.record_merge_strategy, config=config, **kwargs 2783 ) 2784 if model.record_merge_strategy 2785 else None 2786 ) 2787 2788 property_limit_type: PropertyLimitType 2789 match model.property_limit_type: 2790 case PropertyLimitTypeModel.property_count: 2791 property_limit_type = PropertyLimitType.property_count 2792 case PropertyLimitTypeModel.characters: 2793 property_limit_type = PropertyLimitType.characters 2794 case _: 2795 raise ValueError(f"Invalid PropertyLimitType {property_limit_type}") 2796 2797 return PropertyChunking( 2798 
property_limit_type=property_limit_type, 2799 property_limit=model.property_limit, 2800 record_merge_strategy=record_merge_strategy, 2801 config=config, 2802 parameters=model.parameters or {}, 2803 ) 2804 2805 def create_query_properties( 2806 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2807 ) -> QueryProperties: 2808 if isinstance(model.property_list, list): 2809 property_list = model.property_list 2810 else: 2811 property_list = self._create_component_from_model( 2812 model=model.property_list, config=config, **kwargs 2813 ) 2814 2815 property_chunking = ( 2816 self._create_component_from_model( 2817 model=model.property_chunking, config=config, **kwargs 2818 ) 2819 if model.property_chunking 2820 else None 2821 ) 2822 2823 return QueryProperties( 2824 property_list=property_list, 2825 always_include_properties=model.always_include_properties, 2826 property_chunking=property_chunking, 2827 config=config, 2828 parameters=model.parameters or {}, 2829 ) 2830 2831 @staticmethod 2832 def create_record_filter( 2833 model: RecordFilterModel, config: Config, **kwargs: Any 2834 ) -> RecordFilter: 2835 return RecordFilter( 2836 condition=model.condition or "", config=config, parameters=model.parameters or {} 2837 ) 2838 2839 @staticmethod 2840 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 2841 return RequestPath(parameters={}) 2842 2843 @staticmethod 2844 def create_request_option( 2845 model: RequestOptionModel, config: Config, **kwargs: Any 2846 ) -> RequestOption: 2847 inject_into = RequestOptionType(model.inject_into.value) 2848 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 2849 [ 2850 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 2851 for segment in model.field_path 2852 ] 2853 if model.field_path 2854 else None 2855 ) 2856 field_name = ( 2857 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 2858 if model.field_name 2859 else None 2860 ) 2861 return RequestOption( 2862 field_name=field_name, 2863 field_path=field_path, 2864 inject_into=inject_into, 2865 parameters=kwargs.get("parameters", {}), 2866 ) 2867 2868 def create_record_selector( 2869 self, 2870 model: RecordSelectorModel, 2871 config: Config, 2872 *, 2873 name: str, 2874 transformations: List[RecordTransformation] | None = None, 2875 decoder: Decoder | None = None, 2876 client_side_incremental_sync: Dict[str, Any] | None = None, 2877 file_uploader: Optional[DefaultFileUploader] = None, 2878 **kwargs: Any, 2879 ) -> RecordSelector: 2880 extractor = self._create_component_from_model( 2881 model=model.extractor, decoder=decoder, config=config 2882 ) 2883 record_filter = ( 2884 self._create_component_from_model(model.record_filter, config=config) 2885 if model.record_filter 2886 else None 2887 ) 2888 2889 transform_before_filtering = ( 2890 False if model.transform_before_filtering is None else model.transform_before_filtering 2891 ) 2892 if client_side_incremental_sync: 2893 record_filter = ClientSideIncrementalRecordFilterDecorator( 2894 config=config, 2895 parameters=model.parameters, 2896 condition=model.record_filter.condition 2897 if (model.record_filter and hasattr(model.record_filter, "condition")) 2898 else None, 2899 **client_side_incremental_sync, 2900 ) 2901 transform_before_filtering = ( 2902 True 2903 if model.transform_before_filtering is None 2904 else model.transform_before_filtering 2905 ) 2906 2907 if model.schema_normalization is None: 2908 # default to no schema 
normalization if not set 2909 model.schema_normalization = SchemaNormalizationModel.None_ 2910 2911 schema_normalization = ( 2912 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 2913 if isinstance(model.schema_normalization, SchemaNormalizationModel) 2914 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 2915 ) 2916 2917 return RecordSelector( 2918 extractor=extractor, 2919 name=name, 2920 config=config, 2921 record_filter=record_filter, 2922 transformations=transformations or [], 2923 file_uploader=file_uploader, 2924 schema_normalization=schema_normalization, 2925 parameters=model.parameters or {}, 2926 transform_before_filtering=transform_before_filtering, 2927 ) 2928 2929 @staticmethod 2930 def create_remove_fields( 2931 model: RemoveFieldsModel, config: Config, **kwargs: Any 2932 ) -> RemoveFields: 2933 return RemoveFields( 2934 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 2935 ) 2936 2937 def create_selective_authenticator( 2938 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 2939 ) -> DeclarativeAuthenticator: 2940 authenticators = { 2941 name: self._create_component_from_model(model=auth, config=config) 2942 for name, auth in model.authenticators.items() 2943 } 2944 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 2945 return SelectiveAuthenticator( # type: ignore[abstract] 2946 config=config, 2947 authenticators=authenticators, 2948 authenticator_selection_path=model.authenticator_selection_path, 2949 **kwargs, 2950 ) 2951 2952 @staticmethod 2953 def create_legacy_session_token_authenticator( 2954 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 2955 ) -> LegacySessionTokenAuthenticator: 2956 return LegacySessionTokenAuthenticator( 2957 api_url=url_base, 2958 header=model.header, 2959 login_url=model.login_url, 2960 password=model.password or "", 2961 session_token=model.session_token or "", 2962 session_token_response_key=model.session_token_response_key or "", 2963 username=model.username or "", 2964 validate_session_url=model.validate_session_url, 2965 config=config, 2966 parameters=model.parameters or {}, 2967 ) 2968 2969 def create_simple_retriever( 2970 self, 2971 model: SimpleRetrieverModel, 2972 config: Config, 2973 *, 2974 name: str, 2975 primary_key: Optional[Union[str, List[str], List[List[str]]]], 2976 stream_slicer: Optional[StreamSlicer], 2977 request_options_provider: Optional[RequestOptionsProvider] = None, 2978 stop_condition_on_cursor: bool = False, 2979 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 2980 transformations: List[RecordTransformation], 2981 file_uploader: Optional[DefaultFileUploader] = None, 2982 incremental_sync: Optional[ 2983 Union[ 2984 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 2985 ] 2986 ] = None, 2987 use_cache: Optional[bool] = None, 2988 **kwargs: Any, 2989 ) -> SimpleRetriever: 2990 def _get_url() -> str: 2991 """ 2992 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 2993 This is needed because the URL is not set until the requester is created. 
2994 """ 2995 2996 _url = ( 2997 model.requester.url 2998 if hasattr(model.requester, "url") and model.requester.url is not None 2999 else requester.get_url() 3000 ) 3001 _url_base = ( 3002 model.requester.url_base 3003 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3004 else requester.get_url_base() 3005 ) 3006 3007 return _url or _url_base 3008 3009 decoder = ( 3010 self._create_component_from_model(model=model.decoder, config=config) 3011 if model.decoder 3012 else JsonDecoder(parameters={}) 3013 ) 3014 record_selector = self._create_component_from_model( 3015 model=model.record_selector, 3016 name=name, 3017 config=config, 3018 decoder=decoder, 3019 transformations=transformations, 3020 client_side_incremental_sync=client_side_incremental_sync, 3021 file_uploader=file_uploader, 3022 ) 3023 3024 query_properties: Optional[QueryProperties] = None 3025 query_properties_key: Optional[str] = None 3026 if self._query_properties_in_request_parameters(model.requester): 3027 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3028 # places instead of default to request_parameters which isn't clearly documented 3029 if ( 3030 hasattr(model.requester, "fetch_properties_from_endpoint") 3031 and model.requester.fetch_properties_from_endpoint 3032 ): 3033 raise ValueError( 3034 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3035 ) 3036 3037 query_properties_definitions = [] 3038 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3039 if isinstance(request_parameter, QueryPropertiesModel): 3040 query_properties_key = key 3041 query_properties_definitions.append(request_parameter) 3042 3043 if len(query_properties_definitions) > 1: 3044 raise ValueError( 3045 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3046 ) 3047 3048 if len(query_properties_definitions) == 1: 3049 query_properties = self._create_component_from_model( 3050 model=query_properties_definitions[0], config=config 3051 ) 3052 elif ( 3053 hasattr(model.requester, "fetch_properties_from_endpoint") 3054 and model.requester.fetch_properties_from_endpoint 3055 ): 3056 query_properties_definition = QueryPropertiesModel( 3057 type="QueryProperties", 3058 property_list=model.requester.fetch_properties_from_endpoint, 3059 always_include_properties=None, 3060 property_chunking=None, 3061 ) # type: ignore # $parameters has a default value 3062 3063 query_properties = self.create_query_properties( 3064 model=query_properties_definition, 3065 config=config, 3066 ) 3067 3068 requester = self._create_component_from_model( 3069 model=model.requester, 3070 decoder=decoder, 3071 name=name, 3072 query_properties_key=query_properties_key, 3073 use_cache=use_cache, 3074 config=config, 3075 ) 3076 3077 # Define cursor only if per partition or common incremental support is needed 3078 cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None 3079 3080 if ( 3081 not isinstance(stream_slicer, DatetimeBasedCursor) 3082 or type(stream_slicer) is not DatetimeBasedCursor 3083 ): 3084 # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). 
3085 # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement 3086 # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor can still act as the SimpleRetriever's 3087 # request_options_provider 3088 request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) 3089 elif not request_options_provider: 3090 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3091 3092 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3093 3094 cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None 3095 paginator = ( 3096 self._create_component_from_model( 3097 model=model.paginator, 3098 config=config, 3099 url_base=_get_url(), 3100 extractor_model=model.record_selector.extractor, 3101 decoder=decoder, 3102 cursor_used_for_stop_condition=cursor_used_for_stop_condition, 3103 ) 3104 if model.paginator 3105 else NoPagination(parameters={}) 3106 ) 3107 3108 ignore_stream_slicer_parameters_on_paginated_requests = ( 3109 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3110 ) 3111 3112 if ( 3113 model.partition_router 3114 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3115 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3116 and any( 3117 parent_stream_config.lazy_read_pointer 3118 for parent_stream_config in model.partition_router.parent_stream_configs 3119 ) 3120 ): 3121 if incremental_sync: 3122 if incremental_sync.type != "DatetimeBasedCursor": 3123 raise ValueError( 3124 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3125 ) 3126 3127 elif incremental_sync.step or incremental_sync.cursor_granularity: 3128 raise ValueError( 3129 f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3130 ) 3131 3132 if model.decoder and model.decoder.type != "JsonDecoder": 3133 raise ValueError( 3134 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3135 ) 3136 3137 return LazySimpleRetriever( 3138 name=name, 3139 paginator=paginator, 3140 primary_key=primary_key, 3141 requester=requester, 3142 record_selector=record_selector, 3143 stream_slicer=stream_slicer, 3144 request_option_provider=request_options_provider, 3145 cursor=cursor, 3146 config=config, 3147 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3148 parameters=model.parameters or {}, 3149 ) 3150 3151 if self._limit_slices_fetched or self._emit_connector_builder_messages: 3152 return SimpleRetrieverTestReadDecorator( 3153 name=name, 3154 paginator=paginator, 3155 primary_key=primary_key, 3156 requester=requester, 3157 record_selector=record_selector, 3158 stream_slicer=stream_slicer, 3159 request_option_provider=request_options_provider, 3160 cursor=cursor, 3161 config=config, 3162 maximum_number_of_slices=self._limit_slices_fetched or 5, 3163 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3164 parameters=model.parameters or {}, 3165 ) 3166 return SimpleRetriever( 3167 name=name, 3168 paginator=paginator, 3169 primary_key=primary_key, 3170 requester=requester, 3171 record_selector=record_selector, 3172 stream_slicer=stream_slicer, 3173 request_option_provider=request_options_provider, 3174 cursor=cursor, 3175 config=config, 3176 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3177 additional_query_properties=query_properties, 3178 parameters=model.parameters or {}, 3179 ) 3180 3181 @staticmethod 3182 def _query_properties_in_request_parameters( 3183 requester: Union[HttpRequesterModel, CustomRequesterModel], 3184 ) -> bool: 3185 if not hasattr(requester, "request_parameters"): 3186 return False 3187 request_parameters = requester.request_parameters 3188 if request_parameters and isinstance(request_parameters, Mapping): 3189 for request_parameter in request_parameters.values(): 3190 if isinstance(request_parameter, QueryPropertiesModel): 3191 return True 3192 return False 3193 3194 @staticmethod 3195 def _remove_query_properties( 3196 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3197 ) -> Mapping[str, str]: 3198 return { 3199 parameter_field: request_parameter 3200 for parameter_field, request_parameter in request_parameters.items() 3201 if not isinstance(request_parameter, QueryPropertiesModel) 3202 } 3203 3204 def create_state_delegating_stream( 3205 self, 3206 model: StateDelegatingStreamModel, 3207 config: Config, 3208 has_parent_state: Optional[bool] = None, 3209 **kwargs: Any, 3210 ) -> DeclarativeStream: 3211 if ( 3212 model.full_refresh_stream.name != model.name 3213 or model.name != model.incremental_stream.name 3214 ): 3215 raise ValueError( 3216 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3217 ) 3218 3219 stream_model = ( 3220 model.incremental_stream 3221 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3222 else model.full_refresh_stream 3223 ) 3224 3225 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description 3226 3227 def _create_async_job_status_mapping( 3228 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3229 ) -> Mapping[str, AsyncJobStatus]: 3230 api_status_to_cdk_status = {} 3231 for cdk_status, api_statuses in model.dict().items(): 3232 if cdk_status == "type": 3233 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3234 continue 3235 3236 for status in api_statuses: 3237 if status in api_status_to_cdk_status: 3238 raise ValueError( 3239 f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once" 3240 ) 3241 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3242 return api_status_to_cdk_status 3243 3244 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3245 match status: 3246 case "running": 3247 return AsyncJobStatus.RUNNING 3248 case "completed": 3249 return AsyncJobStatus.COMPLETED 3250 case "failed": 3251 return AsyncJobStatus.FAILED 3252 case "timeout": 3253 return AsyncJobStatus.TIMED_OUT 3254 case _: 3255 raise ValueError(f"Unsupported CDK status {status}") 3256 3257 def create_async_retriever( 3258 self, 3259 model: AsyncRetrieverModel, 3260 config: Config, 3261 *, 3262 name: str, 3263 primary_key: Optional[ 3264 Union[str, List[str], List[List[str]]] 3265 ], # this seems to be needed to match create_simple_retriever 3266 stream_slicer: Optional[StreamSlicer], 3267 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3268 transformations: List[RecordTransformation], 3269 **kwargs: Any, 3270 ) -> AsyncRetriever: 3271 def _get_download_retriever() -> SimpleRetrieverTestReadDecorator | SimpleRetriever: 3272 record_selector = RecordSelector( 3273 extractor=download_extractor, 3274 name=name, 3275 record_filter=None, 3276 transformations=transformations, 3277 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3278 config=config, 3279 parameters={}, 3280 ) 3281 paginator = ( 3282 self._create_component_from_model( 3283 model=model.download_paginator, 3284 decoder=decoder, 3285 config=config, 3286 url_base="", 3287 ) 3288 if model.download_paginator 3289 else NoPagination(parameters={}) 3290 ) 3291 maximum_number_of_slices = self._limit_slices_fetched or 5 3292 3293 if self._limit_slices_fetched or self._emit_connector_builder_messages: 3294 return SimpleRetrieverTestReadDecorator( 3295 requester=download_requester, 3296 record_selector=record_selector, 3297 primary_key=None, 3298 name=job_download_components_name, 3299 paginator=paginator, 3300 config=config, 3301 parameters={}, 3302 maximum_number_of_slices=maximum_number_of_slices, 3303 ) 3304 3305 return SimpleRetriever( 3306 requester=download_requester, 3307 record_selector=record_selector, 3308 primary_key=None, 3309 name=job_download_components_name, 3310 paginator=paginator, 3311 config=config, 3312 parameters={}, 3313 ) 3314 3315 def _get_job_timeout() -> datetime.timedelta: 3316 user_defined_timeout: Optional[int] = ( 3317 int( 3318 InterpolatedString.create( 3319 str(model.polling_job_timeout), 3320 parameters={}, 3321 ).eval(config) 3322 ) 3323 if 
model.polling_job_timeout 3324 else None 3325 ) 3326 3327 # check for user defined timeout during the test read or 15 minutes 3328 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3329 # default value for non-connector builder is 60 minutes. 3330 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3331 3332 return ( 3333 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3334 ) 3335 3336 decoder = ( 3337 self._create_component_from_model(model=model.decoder, config=config) 3338 if model.decoder 3339 else JsonDecoder(parameters={}) 3340 ) 3341 record_selector = self._create_component_from_model( 3342 model=model.record_selector, 3343 config=config, 3344 decoder=decoder, 3345 name=name, 3346 transformations=transformations, 3347 client_side_incremental_sync=client_side_incremental_sync, 3348 ) 3349 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3350 creation_requester = self._create_component_from_model( 3351 model=model.creation_requester, 3352 decoder=decoder, 3353 config=config, 3354 name=f"job creation - {name}", 3355 ) 3356 polling_requester = self._create_component_from_model( 3357 model=model.polling_requester, 3358 decoder=decoder, 3359 config=config, 3360 name=f"job polling - {name}", 3361 ) 3362 job_download_components_name = f"job download - {name}" 3363 download_decoder = ( 3364 self._create_component_from_model(model=model.download_decoder, config=config) 3365 if model.download_decoder 3366 else JsonDecoder(parameters={}) 3367 ) 3368 download_extractor = ( 3369 self._create_component_from_model( 3370 model=model.download_extractor, 3371 config=config, 3372 decoder=download_decoder, 3373 parameters=model.parameters, 3374 ) 3375 if model.download_extractor 3376 else DpathExtractor( 3377 [], 3378 config=config, 3379 decoder=download_decoder, 3380 parameters=model.parameters or {}, 3381 ) 3382 ) 3383 download_requester = self._create_component_from_model( 3384 model=model.download_requester, 3385 decoder=download_decoder, 3386 config=config, 3387 name=job_download_components_name, 3388 ) 3389 download_retriever = _get_download_retriever() 3390 abort_requester = ( 3391 self._create_component_from_model( 3392 model=model.abort_requester, 3393 decoder=decoder, 3394 config=config, 3395 name=f"job abort - {name}", 3396 ) 3397 if model.abort_requester 3398 else None 3399 ) 3400 delete_requester = ( 3401 self._create_component_from_model( 3402 model=model.delete_requester, 3403 decoder=decoder, 3404 config=config, 3405 name=f"job delete - {name}", 3406 ) 3407 if model.delete_requester 3408 else None 3409 ) 3410 download_target_requester = ( 3411 self._create_component_from_model( 3412 model=model.download_target_requester, 3413 decoder=decoder, 3414 config=config, 3415 name=f"job extract_url - {name}", 3416 ) 3417 if model.download_target_requester 3418 else None 3419 ) 3420 status_extractor = self._create_component_from_model( 3421 model=model.status_extractor, decoder=decoder, config=config, name=name 3422 ) 3423 download_target_extractor = self._create_component_from_model( 3424 model=model.download_target_extractor, 3425 decoder=decoder, 3426 config=config, 3427 name=name, 3428 ) 3429 3430 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3431 creation_requester=creation_requester, 3432 polling_requester=polling_requester, 3433 download_retriever=download_retriever, 3434 download_target_requester=download_target_requester, 3435 abort_requester=abort_requester, 3436 
delete_requester=delete_requester, 3437 status_extractor=status_extractor, 3438 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3439 download_target_extractor=download_target_extractor, 3440 job_timeout=_get_job_timeout(), 3441 ) 3442 3443 async_job_partition_router = AsyncJobPartitionRouter( 3444 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3445 job_repository, 3446 stream_slices, 3447 self._job_tracker, 3448 self._message_repository, 3449 # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk 3450 has_bulk_parent=False, 3451 # set the `job_max_retry` to 1 for the `Connector Builder` use-case. 3452 # `None` == default retry is set to 3 attempts, under the hood. 3453 job_max_retry=1 if self._emit_connector_builder_messages else None, 3454 ), 3455 stream_slicer=stream_slicer, 3456 config=config, 3457 parameters=model.parameters or {}, 3458 ) 3459 3460 return AsyncRetriever( 3461 record_selector=record_selector, 3462 stream_slicer=async_job_partition_router, 3463 config=config, 3464 parameters=model.parameters or {}, 3465 ) 3466 3467 @staticmethod 3468 def create_spec(model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3469 return Spec( 3470 connection_specification=model.connection_specification, 3471 documentation_url=model.documentation_url, 3472 advanced_auth=model.advanced_auth, 3473 parameters={}, 3474 ) 3475 3476 def create_substream_partition_router( 3477 self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any 3478 ) -> SubstreamPartitionRouter: 3479 parent_stream_configs = [] 3480 if model.parent_stream_configs: 3481 parent_stream_configs.extend( 3482 [ 3483 self._create_message_repository_substream_wrapper( 3484 model=parent_stream_config, config=config, **kwargs 3485 ) 3486 for parent_stream_config in model.parent_stream_configs 3487 ] 3488 ) 3489 3490 return SubstreamPartitionRouter( 3491 parent_stream_configs=parent_stream_configs, 3492 parameters=model.parameters or {}, 3493 config=config, 3494 ) 3495 3496 def _create_message_repository_substream_wrapper( 3497 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 3498 ) -> Any: 3499 substream_factory = ModelToComponentFactory( 3500 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3501 limit_slices_fetched=self._limit_slices_fetched, 3502 emit_connector_builder_messages=self._emit_connector_builder_messages, 3503 disable_retries=self._disable_retries, 3504 disable_cache=self._disable_cache, 3505 message_repository=LogAppenderMessageRepositoryDecorator( 3506 {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, 3507 self._message_repository, 3508 self._evaluate_log_level(self._emit_connector_builder_messages), 3509 ), 3510 ) 3511 3512 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3513 has_parent_state = bool( 3514 self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) 3515 if model.incremental_dependency 3516 else False 3517 ) 3518 return substream_factory._create_component_from_model( 3519 model=model, config=config, has_parent_state=has_parent_state, **kwargs 3520 ) 3521 3522 @staticmethod 3523 def create_wait_time_from_header( 3524 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3525 ) -> WaitTimeFromHeaderBackoffStrategy: 3526 return WaitTimeFromHeaderBackoffStrategy( 3527 header=model.header, 3528 parameters=model.parameters or {}, 3529
config=config, 3530 regex=model.regex, 3531 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3532 if model.max_waiting_time_in_seconds is not None 3533 else None, 3534 ) 3535 3536 @staticmethod 3537 def create_wait_until_time_from_header( 3538 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3539 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3540 return WaitUntilTimeFromHeaderBackoffStrategy( 3541 header=model.header, 3542 parameters=model.parameters or {}, 3543 config=config, 3544 min_wait=model.min_wait, 3545 regex=model.regex, 3546 ) 3547 3548 def get_message_repository(self) -> MessageRepository: 3549 return self._message_repository 3550 3551 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3552 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3553 3554 @staticmethod 3555 def create_components_mapping_definition( 3556 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3557 ) -> ComponentMappingDefinition: 3558 interpolated_value = InterpolatedString.create( 3559 model.value, parameters=model.parameters or {} 3560 ) 3561 field_path = [ 3562 InterpolatedString.create(path, parameters=model.parameters or {}) 3563 for path in model.field_path 3564 ] 3565 return ComponentMappingDefinition( 3566 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3567 value=interpolated_value, 3568 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3569 parameters=model.parameters or {}, 3570 ) 3571 3572 def create_http_components_resolver( 3573 self, model: HttpComponentsResolverModel, config: Config 3574 ) -> Any: 3575 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3576 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3577 3578 retriever = self._create_component_from_model( 3579 model=model.retriever, 3580 config=config, 3581 name="", 3582 primary_key=None, 3583 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3584 transformations=[], 3585 ) 3586 3587 components_mapping = [ 3588 self._create_component_from_model( 3589 model=components_mapping_definition_model, 3590 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3591 components_mapping_definition_model.value_type 3592 ), 3593 config=config, 3594 ) 3595 for components_mapping_definition_model in model.components_mapping 3596 ] 3597 3598 return HttpComponentsResolver( 3599 retriever=retriever, 3600 config=config, 3601 components_mapping=components_mapping, 3602 parameters=model.parameters or {}, 3603 ) 3604 3605 @staticmethod 3606 def create_stream_config( 3607 model: StreamConfigModel, config: Config, **kwargs: Any 3608 ) -> StreamConfig: 3609 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3610 [x for x in model.configs_pointer] if model.configs_pointer else [] 3611 ) 3612 3613 return StreamConfig( 3614 configs_pointer=model_configs_pointer, 3615 parameters=model.parameters or {}, 3616 ) 3617 3618 def create_config_components_resolver( 3619 self, model: ConfigComponentsResolverModel, config: Config 3620 ) -> Any: 3621 stream_config = self._create_component_from_model( 3622 model.stream_config, config=config, parameters=model.parameters or {} 3623 ) 3624 3625 components_mapping = [ 3626 self._create_component_from_model( 3627 model=components_mapping_definition_model, 3628 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3629 
components_mapping_definition_model.value_type 3630 ), 3631 config=config, 3632 ) 3633 for components_mapping_definition_model in model.components_mapping 3634 ] 3635 3636 return ConfigComponentsResolver( 3637 stream_config=stream_config, 3638 config=config, 3639 components_mapping=components_mapping, 3640 parameters=model.parameters or {}, 3641 ) 3642 3643 _UNSUPPORTED_DECODER_ERROR = ( 3644 "Specified decoder of {decoder_type} is not supported for pagination." 3645 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 3646 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 3647 ) 3648 3649 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 3650 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 3651 return True 3652 elif isinstance(decoder, CompositeRawDecoder): 3653 return self._is_supported_parser_for_pagination(decoder.parser) 3654 else: 3655 return False 3656 3657 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 3658 if isinstance(parser, JsonParser): 3659 return True 3660 elif isinstance(parser, GzipParser): 3661 return isinstance(parser.inner_parser, JsonParser) 3662 else: 3663 return False 3664 3665 def create_http_api_budget( 3666 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3667 ) -> HttpAPIBudget: 3668 policies = [ 3669 self._create_component_from_model(model=policy, config=config) 3670 for policy in model.policies 3671 ] 3672 3673 return HttpAPIBudget( 3674 policies=policies, 3675 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 3676 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 3677 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 3678 ) 3679 3680 def create_fixed_window_call_rate_policy( 3681 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 3682 ) -> FixedWindowCallRatePolicy: 3683 matchers = [ 3684 self._create_component_from_model(model=matcher, config=config) 3685 for matcher in model.matchers 3686 ] 3687 3688 # Set the initial reset timestamp to 10 days from now. 3689 # This value will be updated by the first request. 
3690 return FixedWindowCallRatePolicy( 3691 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 3692 period=parse_duration(model.period), 3693 call_limit=model.call_limit, 3694 matchers=matchers, 3695 ) 3696 3697 def create_file_uploader( 3698 self, model: FileUploaderModel, config: Config, **kwargs: Any 3699 ) -> FileUploader: 3700 name = "File Uploader" 3701 requester = self._create_component_from_model( 3702 model=model.requester, 3703 config=config, 3704 name=name, 3705 **kwargs, 3706 ) 3707 download_target_extractor = self._create_component_from_model( 3708 model=model.download_target_extractor, 3709 config=config, 3710 name=name, 3711 **kwargs, 3712 ) 3713 emit_connector_builder_messages = self._emit_connector_builder_messages 3714 file_uploader = DefaultFileUploader( 3715 requester=requester, 3716 download_target_extractor=download_target_extractor, 3717 config=config, 3718 file_writer=NoopFileWriter() 3719 if emit_connector_builder_messages 3720 else LocalFileSystemFileWriter(), 3721 parameters=model.parameters or {}, 3722 filename_extractor=model.filename_extractor if model.filename_extractor else None, 3723 ) 3724 3725 return ( 3726 ConnectorBuilderFileUploader(file_uploader) 3727 if emit_connector_builder_messages 3728 else file_uploader 3729 ) 3730 3731 def create_moving_window_call_rate_policy( 3732 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 3733 ) -> MovingWindowCallRatePolicy: 3734 rates = [ 3735 self._create_component_from_model(model=rate, config=config) for rate in model.rates 3736 ] 3737 matchers = [ 3738 self._create_component_from_model(model=matcher, config=config) 3739 for matcher in model.matchers 3740 ] 3741 return MovingWindowCallRatePolicy( 3742 rates=rates, 3743 matchers=matchers, 3744 ) 3745 3746 def create_unlimited_call_rate_policy( 3747 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 3748 ) -> UnlimitedCallRatePolicy: 3749 matchers = [ 3750 self._create_component_from_model(model=matcher, config=config) 3751 for matcher in model.matchers 3752 ] 3753 3754 return UnlimitedCallRatePolicy( 3755 matchers=matchers, 3756 ) 3757 3758 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 3759 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 3760 return Rate( 3761 limit=int(interpolated_limit.eval(config=config)), 3762 interval=parse_duration(model.interval), 3763 ) 3764 3765 def create_http_request_matcher( 3766 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 3767 ) -> HttpRequestRegexMatcher: 3768 return HttpRequestRegexMatcher( 3769 method=model.method, 3770 url_base=model.url_base, 3771 url_path_pattern=model.url_path_pattern, 3772 params=model.params, 3773 headers=model.headers, 3774 ) 3775 3776 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 3777 self._api_budget = self.create_component( 3778 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 3779 ) 3780 3781 def create_grouping_partition_router( 3782 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 3783 ) -> GroupingPartitionRouter: 3784 underlying_router = self._create_component_from_model( 3785 model=model.underlying_partition_router, config=config 3786 ) 3787 if model.group_size < 1: 3788 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 3789 3790 # Request options in underlying partition routers are not supported for 
GroupingPartitionRouter 3791 # because they are specific to individual partitions and cannot be aggregated or handled 3792 # when grouping, potentially leading to incorrect API calls. Any request customization 3793 # should be managed at the stream level through the requester's configuration. 3794 if isinstance(underlying_router, SubstreamPartitionRouter): 3795 if any( 3796 parent_config.request_option 3797 for parent_config in underlying_router.parent_stream_configs 3798 ): 3799 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 3800 3801 if isinstance(underlying_router, ListPartitionRouter): 3802 if underlying_router.request_option: 3803 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 3804 3805 return GroupingPartitionRouter( 3806 group_size=model.group_size, 3807 underlying_partition_router=underlying_router, 3808 deduplicate=model.deduplicate if model.deduplicate is not None else True, 3809 config=config, 3810 )
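# Example (not part of the module source): a minimal, hypothetical sketch of how the
# rate-limiting factories above are wired together through set_api_budget. The manifest
# field names and values below are assumptions inferred from create_http_api_budget,
# create_moving_window_call_rate_policy, create_rate and create_http_request_matcher;
# they are not copied from a real connector manifest.
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()
factory.set_api_budget(
    component_definition={
        "type": "HTTPAPIBudget",
        "policies": [
            {
                "type": "MovingWindowCallRatePolicy",
                # Rate.limit is interpolated and cast to int; interval is an ISO-8601 duration.
                "rates": [{"type": "Rate", "limit": 100, "interval": "PT1M"}],
                "matchers": [
                    {"type": "HttpRequestRegexMatcher", "url_path_pattern": "^/v1/"}
                ],
            }
        ],
    },
    config={},
)
# The resulting HttpAPIBudget is kept on the factory (self._api_budget) so that HTTP
# requesters created afterwards can share the same call-rate budget.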
577class ModelToComponentFactory: 578 EPOCH_DATETIME_FORMAT = "%s" 579 580 def __init__( 581 self, 582 limit_pages_fetched_per_slice: Optional[int] = None, 583 limit_slices_fetched: Optional[int] = None, 584 emit_connector_builder_messages: bool = False, 585 disable_retries: bool = False, 586 disable_cache: bool = False, 587 disable_resumable_full_refresh: bool = False, 588 message_repository: Optional[MessageRepository] = None, 589 connector_state_manager: Optional[ConnectorStateManager] = None, 590 max_concurrent_async_job_count: Optional[int] = None, 591 ): 592 self._init_mappings() 593 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 594 self._limit_slices_fetched = limit_slices_fetched 595 self._emit_connector_builder_messages = emit_connector_builder_messages 596 self._disable_retries = disable_retries 597 self._disable_cache = disable_cache 598 self._disable_resumable_full_refresh = disable_resumable_full_refresh 599 self._message_repository = message_repository or InMemoryMessageRepository( 600 self._evaluate_log_level(emit_connector_builder_messages) 601 ) 602 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 603 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 604 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 605 # placeholder for deprecation warnings 606 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 607 608 def _init_mappings(self) -> None: 609 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 610 AddedFieldDefinitionModel: self.create_added_field_definition, 611 AddFieldsModel: self.create_add_fields, 612 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 613 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 614 BearerAuthenticatorModel: self.create_bearer_authenticator, 615 CheckStreamModel: self.create_check_stream, 616 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 617 CheckDynamicStreamModel: self.create_check_dynamic_stream, 618 CompositeErrorHandlerModel: self.create_composite_error_handler, 619 ConcurrencyLevelModel: self.create_concurrency_level, 620 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 621 CsvDecoderModel: self.create_csv_decoder, 622 CursorPaginationModel: self.create_cursor_pagination, 623 CustomAuthenticatorModel: self.create_custom_component, 624 CustomBackoffStrategyModel: self.create_custom_component, 625 CustomDecoderModel: self.create_custom_component, 626 CustomErrorHandlerModel: self.create_custom_component, 627 CustomIncrementalSyncModel: self.create_custom_component, 628 CustomRecordExtractorModel: self.create_custom_component, 629 CustomRecordFilterModel: self.create_custom_component, 630 CustomRequesterModel: self.create_custom_component, 631 CustomRetrieverModel: self.create_custom_component, 632 CustomSchemaLoader: self.create_custom_component, 633 CustomSchemaNormalizationModel: self.create_custom_component, 634 CustomStateMigration: self.create_custom_component, 635 CustomPaginationStrategyModel: self.create_custom_component, 636 CustomPartitionRouterModel: self.create_custom_component, 637 CustomTransformationModel: self.create_custom_component, 638 DatetimeBasedCursorModel: self.create_datetime_based_cursor, 639 DeclarativeStreamModel: self.create_declarative_stream, 640 DefaultErrorHandlerModel: self.create_default_error_handler, 641 DefaultPaginatorModel: self.create_default_paginator, 642 
DpathExtractorModel: self.create_dpath_extractor, 643 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 644 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 645 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 646 GroupByKeyMergeStrategyModel: self.create_group_by_key, 647 HttpRequesterModel: self.create_http_requester, 648 HttpResponseFilterModel: self.create_http_response_filter, 649 InlineSchemaLoaderModel: self.create_inline_schema_loader, 650 JsonDecoderModel: self.create_json_decoder, 651 JsonlDecoderModel: self.create_jsonl_decoder, 652 GzipDecoderModel: self.create_gzip_decoder, 653 KeysToLowerModel: self.create_keys_to_lower_transformation, 654 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 655 KeysReplaceModel: self.create_keys_replace_transformation, 656 FlattenFieldsModel: self.create_flatten_fields, 657 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 658 IterableDecoderModel: self.create_iterable_decoder, 659 IncrementingCountCursorModel: self.create_incrementing_count_cursor, 660 XmlDecoderModel: self.create_xml_decoder, 661 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 662 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 663 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 664 TypesMapModel: self.create_types_map, 665 ComplexFieldTypeModel: self.create_complex_field_type, 666 JwtAuthenticatorModel: self.create_jwt_authenticator, 667 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 668 ListPartitionRouterModel: self.create_list_partition_router, 669 MinMaxDatetimeModel: self.create_min_max_datetime, 670 NoAuthModel: self.create_no_auth, 671 NoPaginationModel: self.create_no_pagination, 672 OAuthAuthenticatorModel: self.create_oauth_authenticator, 673 OffsetIncrementModel: self.create_offset_increment, 674 PageIncrementModel: self.create_page_increment, 675 ParentStreamConfigModel: self.create_parent_stream_config, 676 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 677 PropertyChunkingModel: self.create_property_chunking, 678 QueryPropertiesModel: self.create_query_properties, 679 RecordFilterModel: self.create_record_filter, 680 RecordSelectorModel: self.create_record_selector, 681 RemoveFieldsModel: self.create_remove_fields, 682 RequestPathModel: self.create_request_path, 683 RequestOptionModel: self.create_request_option, 684 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 685 SelectiveAuthenticatorModel: self.create_selective_authenticator, 686 SimpleRetrieverModel: self.create_simple_retriever, 687 StateDelegatingStreamModel: self.create_state_delegating_stream, 688 SpecModel: self.create_spec, 689 SubstreamPartitionRouterModel: self.create_substream_partition_router, 690 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 691 WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, 692 AsyncRetrieverModel: self.create_async_retriever, 693 HttpComponentsResolverModel: self.create_http_components_resolver, 694 ConfigComponentsResolverModel: self.create_config_components_resolver, 695 StreamConfigModel: self.create_stream_config, 696 ComponentMappingDefinitionModel: self.create_components_mapping_definition, 697 ZipfileDecoderModel: self.create_zipfile_decoder, 698 HTTPAPIBudgetModel: self.create_http_api_budget, 699 FileUploaderModel: self.create_file_uploader, 700 FixedWindowCallRatePolicyModel: 
self.create_fixed_window_call_rate_policy, 701 MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, 702 UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, 703 RateModel: self.create_rate, 704 HttpRequestRegexMatcherModel: self.create_http_request_matcher, 705 GroupingPartitionRouterModel: self.create_grouping_partition_router, 706 } 707 708 # Needed for the case where we need to perform a second parse on the fields of a custom component 709 self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR} 710 711 def create_component( 712 self, 713 model_type: Type[BaseModel], 714 component_definition: ComponentDefinition, 715 config: Config, 716 **kwargs: Any, 717 ) -> Any: 718 """ 719 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 720 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 721 declarative components from that model. 722 723 :param model_type: The type of declarative component that is being initialized 724 :param component_definition: The mapping that represents a declarative component 725 :param config: The connector config that is provided by the customer 726 :return: The declarative component to be used at runtime 727 """ 728 729 component_type = component_definition.get("type") 730 if component_definition.get("type") != model_type.__name__: 731 raise ValueError( 732 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 733 ) 734 735 declarative_component_model = model_type.parse_obj(component_definition) 736 737 if not isinstance(declarative_component_model, model_type): 738 raise ValueError( 739 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 740 ) 741 742 return self._create_component_from_model( 743 model=declarative_component_model, config=config, **kwargs 744 ) 745 746 def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: 747 if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: 748 raise ValueError( 749 f"{model.__class__} with attributes {model} is not a valid component type" 750 ) 751 component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) 752 if not component_constructor: 753 raise ValueError(f"Could not find constructor for {model.__class__}") 754 755 # collect deprecation warnings for supported models. 756 if isinstance(model, BaseModelWithDeprecations): 757 self._collect_model_deprecations(model) 758 759 return component_constructor(model=model, config=config, **kwargs) 760 761 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 762 """ 763 Returns the deprecation warnings that were collected during the creation of components. 764 """ 765 return self._collected_deprecation_logs 766 767 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 768 """ 769 Collects deprecation logs from the given model and appends any new logs to the internal collection. 770 771 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided.
772 773 Args: 774 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 775 """ 776 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 777 for log in model._deprecation_logs: 778 # avoid duplicates for deprecation logs observed. 779 if log not in self._collected_deprecation_logs: 780 self._collected_deprecation_logs.append(log) 781 782 @staticmethod 783 def create_added_field_definition( 784 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 785 ) -> AddedFieldDefinition: 786 interpolated_value = InterpolatedString.create( 787 model.value, parameters=model.parameters or {} 788 ) 789 return AddedFieldDefinition( 790 path=model.path, 791 value=interpolated_value, 792 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 793 parameters=model.parameters or {}, 794 ) 795 796 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 797 added_field_definitions = [ 798 self._create_component_from_model( 799 model=added_field_definition_model, 800 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 801 added_field_definition_model.value_type 802 ), 803 config=config, 804 ) 805 for added_field_definition_model in model.fields 806 ] 807 return AddFields( 808 fields=added_field_definitions, 809 condition=model.condition or "", 810 parameters=model.parameters or {}, 811 ) 812 813 def create_keys_to_lower_transformation( 814 self, model: KeysToLowerModel, config: Config, **kwargs: Any 815 ) -> KeysToLowerTransformation: 816 return KeysToLowerTransformation() 817 818 def create_keys_to_snake_transformation( 819 self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any 820 ) -> KeysToSnakeCaseTransformation: 821 return KeysToSnakeCaseTransformation() 822 823 def create_keys_replace_transformation( 824 self, model: KeysReplaceModel, config: Config, **kwargs: Any 825 ) -> KeysReplaceTransformation: 826 return KeysReplaceTransformation( 827 old=model.old, new=model.new, parameters=model.parameters or {} 828 ) 829 830 def create_flatten_fields( 831 self, model: FlattenFieldsModel, config: Config, **kwargs: Any 832 ) -> FlattenFields: 833 return FlattenFields( 834 flatten_lists=model.flatten_lists if model.flatten_lists is not None else True 835 ) 836 837 def create_dpath_flatten_fields( 838 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 839 ) -> DpathFlattenFields: 840 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 841 key_transformation = ( 842 KeyTransformation( 843 config=config, 844 prefix=model.key_transformation.prefix, 845 suffix=model.key_transformation.suffix, 846 parameters=model.parameters or {}, 847 ) 848 if model.key_transformation is not None 849 else None 850 ) 851 return DpathFlattenFields( 852 config=config, 853 field_path=model_field_path, 854 delete_origin_value=model.delete_origin_value 855 if model.delete_origin_value is not None 856 else False, 857 replace_record=model.replace_record if model.replace_record is not None else False, 858 key_transformation=key_transformation, 859 parameters=model.parameters or {}, 860 ) 861 862 @staticmethod 863 def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]: 864 if not value_type: 865 return None 866 names_to_types = { 867 ValueType.string: str, 868 ValueType.number: float, 869 ValueType.integer: int, 870 ValueType.boolean: bool, 871 } 872 return names_to_types[value_type] 873 874 
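# Example (not part of the module source): a hypothetical sketch of create_component in
# action. It assumes the manifest "type" values match the aliased pydantic class names
# (e.g. "AddFields" and "AddedFieldDefinition" from the declarative component schema)
# and shows how a "value_type" of "integer" is resolved to the Python int type via
# _json_schema_type_name_to_type.
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    AddFields as AddFieldsModel,
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

add_fields = ModelToComponentFactory().create_component(
    model_type=AddFieldsModel,
    component_definition={
        "type": "AddFields",
        "fields": [
            {
                "type": "AddedFieldDefinition",
                "path": ["copied_id"],
                "value": "{{ record['id'] }}",
                "value_type": "integer",  # mapped to int by _json_schema_type_name_to_type
            }
        ],
    },
    config={},
)
# add_fields is an AddFields transformation whose single AddedFieldDefinition
# interpolates record['id'] and casts the result to int when applied to a record.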
def create_api_key_authenticator( 875 self, 876 model: ApiKeyAuthenticatorModel, 877 config: Config, 878 token_provider: Optional[TokenProvider] = None, 879 **kwargs: Any, 880 ) -> ApiKeyAuthenticator: 881 if model.inject_into is None and model.header is None: 882 raise ValueError( 883 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 884 ) 885 886 if model.inject_into is not None and model.header is not None: 887 raise ValueError( 888 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 889 ) 890 891 if token_provider is not None and model.api_token != "": 892 raise ValueError( 893 "If token_provider is set, api_token is ignored and has to be set to empty string." 894 ) 895 896 request_option = ( 897 self._create_component_from_model( 898 model.inject_into, config, parameters=model.parameters or {} 899 ) 900 if model.inject_into 901 else RequestOption( 902 inject_into=RequestOptionType.header, 903 field_name=model.header or "", 904 parameters=model.parameters or {}, 905 ) 906 ) 907 908 return ApiKeyAuthenticator( 909 token_provider=( 910 token_provider 911 if token_provider is not None 912 else InterpolatedStringTokenProvider( 913 api_token=model.api_token or "", 914 config=config, 915 parameters=model.parameters or {}, 916 ) 917 ), 918 request_option=request_option, 919 config=config, 920 parameters=model.parameters or {}, 921 ) 922 923 def create_legacy_to_per_partition_state_migration( 924 self, 925 model: LegacyToPerPartitionStateMigrationModel, 926 config: Mapping[str, Any], 927 declarative_stream: DeclarativeStreamModel, 928 ) -> LegacyToPerPartitionStateMigration: 929 retriever = declarative_stream.retriever 930 if not isinstance(retriever, SimpleRetrieverModel): 931 raise ValueError( 932 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever. Got {type(retriever)}" 933 ) 934 partition_router = retriever.partition_router 935 if not isinstance( 936 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 937 ): 938 raise ValueError( 939 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 940 ) 941 if not hasattr(partition_router, "parent_stream_configs"): 942 raise ValueError( 943 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 944 ) 945 946 if not hasattr(declarative_stream, "incremental_sync"): 947 raise ValueError( 948 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 949 ) 950 951 return LegacyToPerPartitionStateMigration( 952 partition_router, # type: ignore # was already checked above 953 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 
954 config, 955 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 956 ) 957 958 def create_session_token_authenticator( 959 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 960 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 961 decoder = ( 962 self._create_component_from_model(model=model.decoder, config=config) 963 if model.decoder 964 else JsonDecoder(parameters={}) 965 ) 966 login_requester = self._create_component_from_model( 967 model=model.login_requester, 968 config=config, 969 name=f"{name}_login_requester", 970 decoder=decoder, 971 ) 972 token_provider = SessionTokenProvider( 973 login_requester=login_requester, 974 session_token_path=model.session_token_path, 975 expiration_duration=parse_duration(model.expiration_duration) 976 if model.expiration_duration 977 else None, 978 parameters=model.parameters or {}, 979 message_repository=self._message_repository, 980 decoder=decoder, 981 ) 982 if model.request_authentication.type == "Bearer": 983 return ModelToComponentFactory.create_bearer_authenticator( 984 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 985 config, 986 token_provider=token_provider, 987 ) 988 else: 989 return self.create_api_key_authenticator( 990 ApiKeyAuthenticatorModel( 991 type="ApiKeyAuthenticator", 992 api_token="", 993 inject_into=model.request_authentication.inject_into, 994 ), # type: ignore # $parameters and headers default to None 995 config=config, 996 token_provider=token_provider, 997 ) 998 999 @staticmethod 1000 def create_basic_http_authenticator( 1001 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1002 ) -> BasicHttpAuthenticator: 1003 return BasicHttpAuthenticator( 1004 password=model.password or "", 1005 username=model.username, 1006 config=config, 1007 parameters=model.parameters or {}, 1008 ) 1009 1010 @staticmethod 1011 def create_bearer_authenticator( 1012 model: BearerAuthenticatorModel, 1013 config: Config, 1014 token_provider: Optional[TokenProvider] = None, 1015 **kwargs: Any, 1016 ) -> BearerAuthenticator: 1017 if token_provider is not None and model.api_token != "": 1018 raise ValueError( 1019 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1020 ) 1021 return BearerAuthenticator( 1022 token_provider=( 1023 token_provider 1024 if token_provider is not None 1025 else InterpolatedStringTokenProvider( 1026 api_token=model.api_token or "", 1027 config=config, 1028 parameters=model.parameters or {}, 1029 ) 1030 ), 1031 config=config, 1032 parameters=model.parameters or {}, 1033 ) 1034 1035 @staticmethod 1036 def create_dynamic_stream_check_config( 1037 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1038 ) -> DynamicStreamCheckConfig: 1039 return DynamicStreamCheckConfig( 1040 dynamic_stream_name=model.dynamic_stream_name, 1041 stream_count=model.stream_count or 0, 1042 ) 1043 1044 def create_check_stream( 1045 self, model: CheckStreamModel, config: Config, **kwargs: Any 1046 ) -> CheckStream: 1047 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1048 raise ValueError( 1049 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1050 ) 1051 1052 dynamic_streams_check_configs = ( 1053 [ 1054 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1055 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1056 ] 1057 if model.dynamic_streams_check_configs 1058 else [] 1059 ) 1060 1061 return CheckStream( 1062 stream_names=model.stream_names or [], 1063 dynamic_streams_check_configs=dynamic_streams_check_configs, 1064 parameters={}, 1065 ) 1066 1067 @staticmethod 1068 def create_check_dynamic_stream( 1069 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1070 ) -> CheckDynamicStream: 1071 assert model.use_check_availability is not None # for mypy 1072 1073 use_check_availability = model.use_check_availability 1074 1075 return CheckDynamicStream( 1076 stream_count=model.stream_count, 1077 use_check_availability=use_check_availability, 1078 parameters={}, 1079 ) 1080 1081 def create_composite_error_handler( 1082 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1083 ) -> CompositeErrorHandler: 1084 error_handlers = [ 1085 self._create_component_from_model(model=error_handler_model, config=config) 1086 for error_handler_model in model.error_handlers 1087 ] 1088 return CompositeErrorHandler( 1089 error_handlers=error_handlers, parameters=model.parameters or {} 1090 ) 1091 1092 @staticmethod 1093 def create_concurrency_level( 1094 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1095 ) -> ConcurrencyLevel: 1096 return ConcurrencyLevel( 1097 default_concurrency=model.default_concurrency, 1098 max_concurrency=model.max_concurrency, 1099 config=config, 1100 parameters={}, 1101 ) 1102 1103 @staticmethod 1104 def apply_stream_state_migrations( 1105 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1106 ) -> MutableMapping[str, Any]: 1107 if stream_state_migrations: 1108 for state_migration in stream_state_migrations: 1109 if state_migration.should_migrate(stream_state): 1110 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
1111 stream_state = dict(state_migration.migrate(stream_state)) 1112 return stream_state 1113 1114 def create_concurrent_cursor_from_datetime_based_cursor( 1115 self, 1116 model_type: Type[BaseModel], 1117 component_definition: ComponentDefinition, 1118 stream_name: str, 1119 stream_namespace: Optional[str], 1120 config: Config, 1121 message_repository: Optional[MessageRepository] = None, 1122 runtime_lookback_window: Optional[datetime.timedelta] = None, 1123 stream_state_migrations: Optional[List[Any]] = None, 1124 **kwargs: Any, 1125 ) -> ConcurrentCursor: 1126 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1127 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1128 # incoming state and connector_state_manager that is initialized when the component factory is created 1129 stream_state = ( 1130 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1131 if "stream_state" not in kwargs 1132 else kwargs["stream_state"] 1133 ) 1134 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1135 1136 component_type = component_definition.get("type") 1137 if component_definition.get("type") != model_type.__name__: 1138 raise ValueError( 1139 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1140 ) 1141 1142 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1143 1144 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1145 raise ValueError( 1146 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1147 ) 1148 1149 interpolated_cursor_field = InterpolatedString.create( 1150 datetime_based_cursor_model.cursor_field, 1151 parameters=datetime_based_cursor_model.parameters or {}, 1152 ) 1153 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1154 1155 interpolated_partition_field_start = InterpolatedString.create( 1156 datetime_based_cursor_model.partition_field_start or "start_time", 1157 parameters=datetime_based_cursor_model.parameters or {}, 1158 ) 1159 interpolated_partition_field_end = InterpolatedString.create( 1160 datetime_based_cursor_model.partition_field_end or "end_time", 1161 parameters=datetime_based_cursor_model.parameters or {}, 1162 ) 1163 1164 slice_boundary_fields = ( 1165 interpolated_partition_field_start.eval(config=config), 1166 interpolated_partition_field_end.eval(config=config), 1167 ) 1168 1169 datetime_format = datetime_based_cursor_model.datetime_format 1170 1171 cursor_granularity = ( 1172 parse_duration(datetime_based_cursor_model.cursor_granularity) 1173 if datetime_based_cursor_model.cursor_granularity 1174 else None 1175 ) 1176 1177 lookback_window = None 1178 interpolated_lookback_window = ( 1179 InterpolatedString.create( 1180 datetime_based_cursor_model.lookback_window, 1181 parameters=datetime_based_cursor_model.parameters or {}, 1182 ) 1183 if datetime_based_cursor_model.lookback_window 1184 else None 1185 ) 1186 if interpolated_lookback_window: 1187 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1188 if evaluated_lookback_window: 1189 lookback_window = parse_duration(evaluated_lookback_window) 1190 1191 connector_state_converter: DateTimeStreamStateConverter 1192 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1193 datetime_format=datetime_format, 1194 
input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1195 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1196 cursor_granularity=cursor_granularity, 1197 ) 1198 1199 # Adjusts the stream state by applying the runtime lookback window. 1200 # This is used to ensure correct state handling in case of failed partitions. 1201 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1202 if runtime_lookback_window and stream_state_value: 1203 new_stream_state = ( 1204 connector_state_converter.parse_timestamp(stream_state_value) 1205 - runtime_lookback_window 1206 ) 1207 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1208 new_stream_state 1209 ) 1210 1211 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1212 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1213 start_date_runtime_value = self.create_min_max_datetime( 1214 model=datetime_based_cursor_model.start_datetime, config=config 1215 ) 1216 else: 1217 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1218 1219 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1220 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1221 end_date_runtime_value = self.create_min_max_datetime( 1222 model=datetime_based_cursor_model.end_datetime, config=config 1223 ) 1224 else: 1225 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1226 1227 interpolated_start_date = MinMaxDatetime.create( 1228 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1229 parameters=datetime_based_cursor_model.parameters, 1230 ) 1231 interpolated_end_date = ( 1232 None 1233 if not end_date_runtime_value 1234 else MinMaxDatetime.create( 1235 end_date_runtime_value, datetime_based_cursor_model.parameters 1236 ) 1237 ) 1238 1239 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1240 if not interpolated_start_date.datetime_format: 1241 interpolated_start_date.datetime_format = datetime_format 1242 if interpolated_end_date and not interpolated_end_date.datetime_format: 1243 interpolated_end_date.datetime_format = datetime_format 1244 1245 start_date = interpolated_start_date.get_datetime(config=config) 1246 end_date_provider = ( 1247 partial(interpolated_end_date.get_datetime, config) 1248 if interpolated_end_date 1249 else connector_state_converter.get_end_provider() 1250 ) 1251 1252 if ( 1253 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1254 ) or ( 1255 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1256 ): 1257 raise ValueError( 1258 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1259 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1260 ) 1261 1262 # When step is not defined, default to a step size from the starting date to the present moment 1263 step_length = datetime.timedelta.max 1264 interpolated_step = ( 1265 InterpolatedString.create( 1266 datetime_based_cursor_model.step, 1267 parameters=datetime_based_cursor_model.parameters or {}, 1268 ) 1269 if datetime_based_cursor_model.step 1270 else None 1271 ) 1272 if interpolated_step: 1273 evaluated_step = interpolated_step.eval(config) 1274 if evaluated_step: 1275 step_length = parse_duration(evaluated_step) 1276 1277 clamping_strategy: ClampingStrategy = NoClamping() 1278 if datetime_based_cursor_model.clamping: 1279 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1280 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1281 # object which we want to keep agnostic of being low-code 1282 target = InterpolatedString( 1283 string=datetime_based_cursor_model.clamping.target, 1284 parameters=datetime_based_cursor_model.parameters or {}, 1285 ) 1286 evaluated_target = target.eval(config=config) 1287 match evaluated_target: 1288 case "DAY": 1289 clamping_strategy = DayClampingStrategy() 1290 end_date_provider = ClampingEndProvider( 1291 DayClampingStrategy(is_ceiling=False), 1292 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1293 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1294 ) 1295 case "WEEK": 1296 if ( 1297 not datetime_based_cursor_model.clamping.target_details 1298 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1299 ): 1300 raise ValueError( 1301 "Given WEEK clamping, weekday needs to be provided as target_details" 1302 ) 1303 weekday = self._assemble_weekday( 1304 datetime_based_cursor_model.clamping.target_details["weekday"] 1305 ) 1306 clamping_strategy = WeekClampingStrategy(weekday) 1307 end_date_provider = ClampingEndProvider( 1308 WeekClampingStrategy(weekday, is_ceiling=False), 1309 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1310 granularity=cursor_granularity or datetime.timedelta(days=1), 1311 ) 1312 case "MONTH": 1313 clamping_strategy = MonthClampingStrategy() 1314 end_date_provider = ClampingEndProvider( 1315 MonthClampingStrategy(is_ceiling=False), 1316 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1317 granularity=cursor_granularity or datetime.timedelta(days=1), 1318 ) 1319 case _: 1320 raise ValueError( 1321 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1322 ) 1323 1324 return ConcurrentCursor( 1325 stream_name=stream_name, 1326 stream_namespace=stream_namespace, 1327 stream_state=stream_state, 1328 message_repository=message_repository or self._message_repository, 1329 connector_state_manager=self._connector_state_manager, 1330 connector_state_converter=connector_state_converter, 1331 cursor_field=cursor_field, 1332 slice_boundary_fields=slice_boundary_fields, 1333 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1334 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1335 lookback_window=lookback_window, 1336 slice_range=step_length, 1337 cursor_granularity=cursor_granularity, 1338 clamping_strategy=clamping_strategy, 1339 ) 1340 1341 def create_concurrent_cursor_from_incrementing_count_cursor( 1342 self, 1343 model_type: Type[BaseModel], 1344 component_definition: ComponentDefinition, 1345 stream_name: str, 1346 stream_namespace: Optional[str], 1347 config: Config, 1348 message_repository: Optional[MessageRepository] = None, 1349 **kwargs: Any, 1350 ) -> ConcurrentCursor: 1351 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1352 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1353 # incoming state and connector_state_manager that is initialized when the component factory is created 1354 stream_state = ( 1355 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1356 if "stream_state" not in kwargs 1357 else kwargs["stream_state"] 1358 ) 1359 1360 component_type = component_definition.get("type") 1361 if component_definition.get("type") != model_type.__name__: 1362 raise ValueError( 1363 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1364 ) 1365 1366 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1367 1368 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1369 raise ValueError( 1370 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1371 ) 1372 1373 interpolated_start_value = ( 1374 InterpolatedString.create( 1375 incrementing_count_cursor_model.start_value, # type: ignore 1376 parameters=incrementing_count_cursor_model.parameters or {}, 1377 ) 1378 if incrementing_count_cursor_model.start_value 1379 else 0 1380 ) 1381 1382 interpolated_cursor_field = InterpolatedString.create( 1383 incrementing_count_cursor_model.cursor_field, 1384 parameters=incrementing_count_cursor_model.parameters or {}, 1385 ) 1386 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1387 1388 connector_state_converter = IncrementingCountStreamStateConverter( 1389 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1390 ) 1391 1392 return ConcurrentCursor( 1393 stream_name=stream_name, 1394 stream_namespace=stream_namespace, 1395 stream_state=stream_state, 1396 message_repository=message_repository or self._message_repository, 
1397 connector_state_manager=self._connector_state_manager, 1398 connector_state_converter=connector_state_converter, 1399 cursor_field=cursor_field, 1400 slice_boundary_fields=None, 1401 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1402 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1403 ) 1404 1405 def _assemble_weekday(self, weekday: str) -> Weekday: 1406 match weekday: 1407 case "MONDAY": 1408 return Weekday.MONDAY 1409 case "TUESDAY": 1410 return Weekday.TUESDAY 1411 case "WEDNESDAY": 1412 return Weekday.WEDNESDAY 1413 case "THURSDAY": 1414 return Weekday.THURSDAY 1415 case "FRIDAY": 1416 return Weekday.FRIDAY 1417 case "SATURDAY": 1418 return Weekday.SATURDAY 1419 case "SUNDAY": 1420 return Weekday.SUNDAY 1421 case _: 1422 raise ValueError(f"Unknown weekday {weekday}") 1423 1424 def create_concurrent_cursor_from_perpartition_cursor( 1425 self, 1426 state_manager: ConnectorStateManager, 1427 model_type: Type[BaseModel], 1428 component_definition: ComponentDefinition, 1429 stream_name: str, 1430 stream_namespace: Optional[str], 1431 config: Config, 1432 stream_state: MutableMapping[str, Any], 1433 partition_router: PartitionRouter, 1434 stream_state_migrations: Optional[List[Any]] = None, 1435 **kwargs: Any, 1436 ) -> ConcurrentPerPartitionCursor: 1437 component_type = component_definition.get("type") 1438 if component_definition.get("type") != model_type.__name__: 1439 raise ValueError( 1440 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1441 ) 1442 1443 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1444 1445 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1446 raise ValueError( 1447 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1448 ) 1449 1450 interpolated_cursor_field = InterpolatedString.create( 1451 datetime_based_cursor_model.cursor_field, 1452 parameters=datetime_based_cursor_model.parameters or {}, 1453 ) 1454 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1455 1456 datetime_format = datetime_based_cursor_model.datetime_format 1457 1458 cursor_granularity = ( 1459 parse_duration(datetime_based_cursor_model.cursor_granularity) 1460 if datetime_based_cursor_model.cursor_granularity 1461 else None 1462 ) 1463 1464 connector_state_converter: DateTimeStreamStateConverter 1465 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1466 datetime_format=datetime_format, 1467 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1468 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1469 cursor_granularity=cursor_granularity, 1470 ) 1471 1472 # Create the cursor factory 1473 cursor_factory = ConcurrentCursorFactory( 1474 partial( 1475 self.create_concurrent_cursor_from_datetime_based_cursor, 1476 state_manager=state_manager, 1477 model_type=model_type, 1478 component_definition=component_definition, 1479 stream_name=stream_name, 1480 stream_namespace=stream_namespace, 1481 config=config, 1482 message_repository=NoopMessageRepository(), 1483 stream_state_migrations=stream_state_migrations, 1484 ) 1485 ) 1486 stream_state 
= self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1487 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1488 use_global_cursor = isinstance( 1489 partition_router, GroupingPartitionRouter 1490 ) or component_definition.get("global_substream_cursor", False) 1491 1492 # Return the concurrent cursor and state converter 1493 return ConcurrentPerPartitionCursor( 1494 cursor_factory=cursor_factory, 1495 partition_router=partition_router, 1496 stream_name=stream_name, 1497 stream_namespace=stream_namespace, 1498 stream_state=stream_state, 1499 message_repository=self._message_repository, # type: ignore 1500 connector_state_manager=state_manager, 1501 connector_state_converter=connector_state_converter, 1502 cursor_field=cursor_field, 1503 use_global_cursor=use_global_cursor, 1504 ) 1505 1506 @staticmethod 1507 def create_constant_backoff_strategy( 1508 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1509 ) -> ConstantBackoffStrategy: 1510 return ConstantBackoffStrategy( 1511 backoff_time_in_seconds=model.backoff_time_in_seconds, 1512 config=config, 1513 parameters=model.parameters or {}, 1514 ) 1515 1516 def create_cursor_pagination( 1517 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1518 ) -> CursorPaginationStrategy: 1519 if isinstance(decoder, PaginationDecoderDecorator): 1520 inner_decoder = decoder.decoder 1521 else: 1522 inner_decoder = decoder 1523 decoder = PaginationDecoderDecorator(decoder=decoder) 1524 1525 if self._is_supported_decoder_for_pagination(inner_decoder): 1526 decoder_to_use = decoder 1527 else: 1528 raise ValueError( 1529 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1530 ) 1531 1532 return CursorPaginationStrategy( 1533 cursor_value=model.cursor_value, 1534 decoder=decoder_to_use, 1535 page_size=model.page_size, 1536 stop_condition=model.stop_condition, 1537 config=config, 1538 parameters=model.parameters or {}, 1539 ) 1540 1541 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1542 """ 1543 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1544 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1545 :param model: The Pydantic model of the custom component being created 1546 :param config: The custom defined connector config 1547 :return: The declarative component built from the Pydantic model to be used at runtime 1548 """ 1549 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1550 component_fields = get_type_hints(custom_component_class) 1551 model_args = model.dict() 1552 model_args["config"] = config 1553 1554 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1555 # we defer to these arguments over the component's definition 1556 for key, arg in kwargs.items(): 1557 model_args[key] = arg 1558 1559 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1560 # defined in the schema. The fields and types are defined within the Python class implementation. 
Pydantic can only parse down to 1561 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1562 for model_field, model_value in model_args.items(): 1563 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1564 if ( 1565 isinstance(model_value, dict) 1566 and "type" not in model_value 1567 and model_field in component_fields 1568 ): 1569 derived_type = self._derive_component_type_from_type_hints( 1570 component_fields.get(model_field) 1571 ) 1572 if derived_type: 1573 model_value["type"] = derived_type 1574 1575 if self._is_component(model_value): 1576 model_args[model_field] = self._create_nested_component( 1577 model, model_field, model_value, config 1578 ) 1579 elif isinstance(model_value, list): 1580 vals = [] 1581 for v in model_value: 1582 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1583 derived_type = self._derive_component_type_from_type_hints( 1584 component_fields.get(model_field) 1585 ) 1586 if derived_type: 1587 v["type"] = derived_type 1588 if self._is_component(v): 1589 vals.append(self._create_nested_component(model, model_field, v, config)) 1590 else: 1591 vals.append(v) 1592 model_args[model_field] = vals 1593 1594 kwargs = { 1595 class_field: model_args[class_field] 1596 for class_field in component_fields.keys() 1597 if class_field in model_args 1598 } 1599 return custom_component_class(**kwargs) 1600 1601 @staticmethod 1602 def _get_class_from_fully_qualified_class_name( 1603 full_qualified_class_name: str, 1604 ) -> Any: 1605 """Get a class from its fully qualified name. 1606 1607 If a custom components module is needed, we assume it is already registered - probably 1608 as `source_declarative_manifest.components` or `components`. 1609 1610 Args: 1611 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1612 1613 Returns: 1614 Any: The class object. 1615 1616 Raises: 1617 ValueError: If the class cannot be loaded. 1618 """ 1619 split = full_qualified_class_name.split(".") 1620 module_name_full = ".".join(split[:-1]) 1621 class_name = split[-1] 1622 1623 try: 1624 module_ref = importlib.import_module(module_name_full) 1625 except ModuleNotFoundError as e: 1626 if split[0] == "source_declarative_manifest": 1627 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. 
In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1628 try: 1629 import os 1630 1631 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1632 module_ref = importlib.import_module( 1633 module_name_with_source_declarative_manifest 1634 ) 1635 except ModuleNotFoundError: 1636 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1637 else: 1638 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1639 1640 try: 1641 return getattr(module_ref, class_name) 1642 except AttributeError as e: 1643 raise ValueError( 1644 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1645 ) from e 1646 1647 @staticmethod 1648 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1649 interface = field_type 1650 while True: 1651 origin = get_origin(interface) 1652 if origin: 1653 # Unnest types until we reach the raw type 1654 # List[T] -> T 1655 # Optional[List[T]] -> T 1656 args = get_args(interface) 1657 interface = args[0] 1658 else: 1659 break 1660 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1661 return interface.__name__ 1662 return None 1663 1664 @staticmethod 1665 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1666 if not cls: 1667 return False 1668 return cls.__module__ == "builtins" 1669 1670 @staticmethod 1671 def _extract_missing_parameters(error: TypeError) -> List[str]: 1672 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1673 if parameter_search: 1674 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1675 else: 1676 return [] 1677 1678 def _create_nested_component( 1679 self, model: Any, model_field: str, model_value: Any, config: Config 1680 ) -> Any: 1681 type_name = model_value.get("type", None) 1682 if not type_name: 1683 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1684 return model_value 1685 1686 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1687 if model_type: 1688 parsed_model = model_type.parse_obj(model_value) 1689 try: 1690 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1691 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1692 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1693 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1694 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1695 # are needed by a component and could not be shared. 
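# A hedged illustration of the mechanism (the child definition and URL below are made up): a custom retriever may declare a child such as
#   {"type": "DefaultPaginator", "$parameters": {"url_base": "https://api.example.com/v1"}}
# The lookup below inspects the keyword-only arguments of the factory method registered for that model (create_default_paginator accepts url_base as a keyword-only argument) and forwards the matching "$parameters" entries to it.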
1696 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1697 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1698 model_parameters = model_value.get("$parameters", {}) 1699 matching_parameters = { 1700 kwarg: model_parameters[kwarg] 1701 for kwarg in constructor_kwargs 1702 if kwarg in model_parameters 1703 } 1704 return self._create_component_from_model( 1705 model=parsed_model, config=config, **matching_parameters 1706 ) 1707 except TypeError as error: 1708 missing_parameters = self._extract_missing_parameters(error) 1709 if missing_parameters: 1710 raise ValueError( 1711 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1712 + ", ".join( 1713 ( 1714 f"{type_name}.$parameters.{parameter}" 1715 for parameter in missing_parameters 1716 ) 1717 ) 1718 ) 1719 raise TypeError( 1720 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1721 ) 1722 else: 1723 raise ValueError( 1724 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1725 ) 1726 1727 @staticmethod 1728 def _is_component(model_value: Any) -> bool: 1729 return isinstance(model_value, dict) and model_value.get("type") is not None 1730 1731 def create_datetime_based_cursor( 1732 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1733 ) -> DatetimeBasedCursor: 1734 start_datetime: Union[str, MinMaxDatetime] = ( 1735 model.start_datetime 1736 if isinstance(model.start_datetime, str) 1737 else self.create_min_max_datetime(model.start_datetime, config) 1738 ) 1739 end_datetime: Union[str, MinMaxDatetime, None] = None 1740 if model.is_data_feed and model.end_datetime: 1741 raise ValueError("Data feed does not support end_datetime") 1742 if model.is_data_feed and model.is_client_side_incremental: 1743 raise ValueError( 1744 "`Client side incremental` cannot be applied with `data feed`. Choose only one of them."
1745 ) 1746 if model.end_datetime: 1747 end_datetime = ( 1748 model.end_datetime 1749 if isinstance(model.end_datetime, str) 1750 else self.create_min_max_datetime(model.end_datetime, config) 1751 ) 1752 1753 end_time_option = ( 1754 self._create_component_from_model( 1755 model.end_time_option, config, parameters=model.parameters or {} 1756 ) 1757 if model.end_time_option 1758 else None 1759 ) 1760 start_time_option = ( 1761 self._create_component_from_model( 1762 model.start_time_option, config, parameters=model.parameters or {} 1763 ) 1764 if model.start_time_option 1765 else None 1766 ) 1767 1768 return DatetimeBasedCursor( 1769 cursor_field=model.cursor_field, 1770 cursor_datetime_formats=model.cursor_datetime_formats 1771 if model.cursor_datetime_formats 1772 else [], 1773 cursor_granularity=model.cursor_granularity, 1774 datetime_format=model.datetime_format, 1775 end_datetime=end_datetime, 1776 start_datetime=start_datetime, 1777 step=model.step, 1778 end_time_option=end_time_option, 1779 lookback_window=model.lookback_window, 1780 start_time_option=start_time_option, 1781 partition_field_end=model.partition_field_end, 1782 partition_field_start=model.partition_field_start, 1783 message_repository=self._message_repository, 1784 is_compare_strictly=model.is_compare_strictly, 1785 config=config, 1786 parameters=model.parameters or {}, 1787 ) 1788 1789 def create_declarative_stream( 1790 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1791 ) -> DeclarativeStream: 1792 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1793 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1794 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1795 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1796 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1797 1798 primary_key = model.primary_key.__root__ if model.primary_key else None 1799 stop_condition_on_cursor = ( 1800 model.incremental_sync 1801 and hasattr(model.incremental_sync, "is_data_feed") 1802 and model.incremental_sync.is_data_feed 1803 ) 1804 client_side_incremental_sync = None 1805 if ( 1806 model.incremental_sync 1807 and hasattr(model.incremental_sync, "is_client_side_incremental") 1808 and model.incremental_sync.is_client_side_incremental 1809 ): 1810 supported_slicers = ( 1811 DatetimeBasedCursor, 1812 GlobalSubstreamCursor, 1813 PerPartitionWithGlobalCursor, 1814 ) 1815 if combined_slicers and not isinstance(combined_slicers, supported_slicers): 1816 raise ValueError( 1817 "Unsupported Slicer is used. 
PerPartitionWithGlobalCursor should be used here instead" 1818 ) 1819 cursor = ( 1820 combined_slicers 1821 if isinstance( 1822 combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor) 1823 ) 1824 else self._create_component_from_model(model=model.incremental_sync, config=config) 1825 ) 1826 1827 client_side_incremental_sync = {"cursor": cursor} 1828 1829 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1830 cursor_model = model.incremental_sync 1831 1832 end_time_option = ( 1833 self._create_component_from_model( 1834 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1835 ) 1836 if cursor_model.end_time_option 1837 else None 1838 ) 1839 start_time_option = ( 1840 self._create_component_from_model( 1841 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1842 ) 1843 if cursor_model.start_time_option 1844 else None 1845 ) 1846 1847 request_options_provider = DatetimeBasedRequestOptionsProvider( 1848 start_time_option=start_time_option, 1849 end_time_option=end_time_option, 1850 partition_field_start=cursor_model.partition_field_end, 1851 partition_field_end=cursor_model.partition_field_end, 1852 config=config, 1853 parameters=model.parameters or {}, 1854 ) 1855 elif model.incremental_sync and isinstance( 1856 model.incremental_sync, IncrementingCountCursorModel 1857 ): 1858 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1859 1860 start_time_option = ( 1861 self._create_component_from_model( 1862 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1863 config, 1864 parameters=cursor_model.parameters or {}, 1865 ) 1866 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1867 else None 1868 ) 1869 1870 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1871 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 1872 partition_field_start = "start" 1873 1874 request_options_provider = DatetimeBasedRequestOptionsProvider( 1875 start_time_option=start_time_option, 1876 partition_field_start=partition_field_start, 1877 config=config, 1878 parameters=model.parameters or {}, 1879 ) 1880 else: 1881 request_options_provider = None 1882 1883 transformations = [] 1884 if model.transformations: 1885 for transformation_model in model.transformations: 1886 transformations.append( 1887 self._create_component_from_model(model=transformation_model, config=config) 1888 ) 1889 file_uploader = None 1890 if model.file_uploader: 1891 file_uploader = self._create_component_from_model( 1892 model=model.file_uploader, config=config 1893 ) 1894 1895 retriever = self._create_component_from_model( 1896 model=model.retriever, 1897 config=config, 1898 name=model.name, 1899 primary_key=primary_key, 1900 stream_slicer=combined_slicers, 1901 request_options_provider=request_options_provider, 1902 stop_condition_on_cursor=stop_condition_on_cursor, 1903 client_side_incremental_sync=client_side_incremental_sync, 1904 transformations=transformations, 1905 file_uploader=file_uploader, 1906 incremental_sync=model.incremental_sync, 1907 ) 1908 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 1909 1910 if model.state_migrations: 1911 state_transformations = [ 1912 self._create_component_from_model(state_migration, config, declarative_stream=model) 1913 for 
state_migration in model.state_migrations 1914 ] 1915 else: 1916 state_transformations = [] 1917 1918 schema_loader: Union[ 1919 CompositeSchemaLoader, 1920 DefaultSchemaLoader, 1921 DynamicSchemaLoader, 1922 InlineSchemaLoader, 1923 JsonFileSchemaLoader, 1924 ] 1925 if model.schema_loader and isinstance(model.schema_loader, list): 1926 nested_schema_loaders = [ 1927 self._create_component_from_model(model=nested_schema_loader, config=config) 1928 for nested_schema_loader in model.schema_loader 1929 ] 1930 schema_loader = CompositeSchemaLoader( 1931 schema_loaders=nested_schema_loaders, parameters={} 1932 ) 1933 elif model.schema_loader: 1934 schema_loader = self._create_component_from_model( 1935 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 1936 config=config, 1937 ) 1938 else: 1939 options = model.parameters or {} 1940 if "name" not in options: 1941 options["name"] = model.name 1942 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 1943 1944 return DeclarativeStream( 1945 name=model.name or "", 1946 primary_key=primary_key, 1947 retriever=retriever, 1948 schema_loader=schema_loader, 1949 stream_cursor_field=cursor_field or "", 1950 state_migrations=state_transformations, 1951 config=config, 1952 parameters=model.parameters or {}, 1953 ) 1954 1955 def _build_stream_slicer_from_partition_router( 1956 self, 1957 model: Union[ 1958 AsyncRetrieverModel, 1959 CustomRetrieverModel, 1960 SimpleRetrieverModel, 1961 ], 1962 config: Config, 1963 stream_name: Optional[str] = None, 1964 ) -> Optional[PartitionRouter]: 1965 if ( 1966 hasattr(model, "partition_router") 1967 and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel) 1968 and model.partition_router 1969 ): 1970 stream_slicer_model = model.partition_router 1971 if isinstance(stream_slicer_model, list): 1972 return CartesianProductStreamSlicer( 1973 [ 1974 self._create_component_from_model( 1975 model=slicer, config=config, stream_name=stream_name or "" 1976 ) 1977 for slicer in stream_slicer_model 1978 ], 1979 parameters={}, 1980 ) 1981 else: 1982 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 1983 model=stream_slicer_model, config=config, stream_name=stream_name or "" 1984 ) 1985 return None 1986 1987 def _build_incremental_cursor( 1988 self, 1989 model: DeclarativeStreamModel, 1990 stream_slicer: Optional[PartitionRouter], 1991 config: Config, 1992 ) -> Optional[StreamSlicer]: 1993 if model.incremental_sync and stream_slicer: 1994 if model.retriever.type == "AsyncRetriever": 1995 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 1996 state_manager=self._connector_state_manager, 1997 model_type=DatetimeBasedCursorModel, 1998 component_definition=model.incremental_sync.__dict__, 1999 stream_name=model.name or "", 2000 stream_namespace=None, 2001 config=config or {}, 2002 stream_state={}, 2003 partition_router=stream_slicer, 2004 ) 2005 2006 incremental_sync_model = model.incremental_sync 2007 cursor_component = self._create_component_from_model( 2008 model=incremental_sync_model, config=config 2009 ) 2010 is_global_cursor = ( 2011 hasattr(incremental_sync_model, "global_substream_cursor") 2012 and incremental_sync_model.global_substream_cursor 2013 ) 2014 2015 if is_global_cursor: 2016 return GlobalSubstreamCursor( 2017 stream_cursor=cursor_component, partition_router=stream_slicer 2018 ) 2019 return PerPartitionWithGlobalCursor( 2020 cursor_factory=CursorFactory( 2021 lambda: self._create_component_from_model( 2022 model=incremental_sync_model, config=config 2023 ), 2024 ), 2025 partition_router=stream_slicer, 2026 stream_cursor=cursor_component, 2027 ) 2028 elif model.incremental_sync: 2029 if model.retriever.type == "AsyncRetriever": 2030 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2031 model_type=DatetimeBasedCursorModel, 2032 component_definition=model.incremental_sync.__dict__, 2033 stream_name=model.name or "", 2034 stream_namespace=None, 2035 config=config or {}, 2036 stream_state_migrations=model.state_migrations, 2037 ) 2038 return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync 2039 return None 2040 2041 def _build_resumable_cursor( 2042 self, 2043 model: Union[ 2044 AsyncRetrieverModel, 2045 CustomRetrieverModel, 2046 SimpleRetrieverModel, 2047 ], 2048 stream_slicer: Optional[PartitionRouter], 2049 ) -> Optional[StreamSlicer]: 2050 if hasattr(model, "paginator") and model.paginator and not stream_slicer: 2051 # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` 2052 return ResumableFullRefreshCursor(parameters={}) 2053 elif stream_slicer: 2054 # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` 2055 return PerPartitionCursor( 2056 cursor_factory=CursorFactory( 2057 create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) 2058 ), 2059 partition_router=stream_slicer, 2060 ) 2061 return None 2062 2063 def _merge_stream_slicers( 2064 self, model: DeclarativeStreamModel, config: Config 2065 ) -> Optional[StreamSlicer]: 2066 retriever_model = model.retriever 2067 2068 stream_slicer = self._build_stream_slicer_from_partition_router( 2069 retriever_model, config, stream_name=model.name 2070 ) 2071 2072 if retriever_model.type == "AsyncRetriever": 2073 is_not_datetime_cursor = ( 2074 model.incremental_sync.type != "DatetimeBasedCursor" 2075 if model.incremental_sync 2076 else None 2077 ) 2078 is_partition_router = ( 2079 bool(retriever_model.partition_router) if model.incremental_sync else None 2080 ) 2081 2082 if 
is_not_datetime_cursor: 2083 # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the 2084 # support or unordered slices (for example, when we trigger reports for January and February, the report 2085 # in February can be completed first). Once we have support for custom concurrent cursor or have a new 2086 # implementation available in the CDK, we can enable more cursors here. 2087 raise ValueError( 2088 "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet." 2089 ) 2090 2091 if is_partition_router and not stream_slicer: 2092 # Note that this development is also done in parallel to the per partition development which once merged 2093 # we could support here by calling create_concurrent_cursor_from_perpartition_cursor 2094 raise ValueError("Per partition state is not supported yet for AsyncRetriever.") 2095 2096 if model.incremental_sync: 2097 return self._build_incremental_cursor(model, stream_slicer, config) 2098 2099 return ( 2100 stream_slicer 2101 if self._disable_resumable_full_refresh 2102 else self._build_resumable_cursor(retriever_model, stream_slicer) 2103 ) 2104 2105 def create_default_error_handler( 2106 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2107 ) -> DefaultErrorHandler: 2108 backoff_strategies = [] 2109 if model.backoff_strategies: 2110 for backoff_strategy_model in model.backoff_strategies: 2111 backoff_strategies.append( 2112 self._create_component_from_model(model=backoff_strategy_model, config=config) 2113 ) 2114 2115 response_filters = [] 2116 if model.response_filters: 2117 for response_filter_model in model.response_filters: 2118 response_filters.append( 2119 self._create_component_from_model(model=response_filter_model, config=config) 2120 ) 2121 response_filters.append( 2122 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2123 ) 2124 2125 return DefaultErrorHandler( 2126 backoff_strategies=backoff_strategies, 2127 max_retries=model.max_retries, 2128 response_filters=response_filters, 2129 config=config, 2130 parameters=model.parameters or {}, 2131 ) 2132 2133 def create_default_paginator( 2134 self, 2135 model: DefaultPaginatorModel, 2136 config: Config, 2137 *, 2138 url_base: str, 2139 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2140 decoder: Optional[Decoder] = None, 2141 cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, 2142 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2143 if decoder: 2144 if self._is_supported_decoder_for_pagination(decoder): 2145 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2146 else: 2147 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2148 else: 2149 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2150 page_size_option = ( 2151 self._create_component_from_model(model=model.page_size_option, config=config) 2152 if model.page_size_option 2153 else None 2154 ) 2155 page_token_option = ( 2156 self._create_component_from_model(model=model.page_token_option, config=config) 2157 if model.page_token_option 2158 else None 2159 ) 2160 pagination_strategy = self._create_component_from_model( 2161 model=model.pagination_strategy, 2162 config=config, 2163 decoder=decoder_to_use, 2164 extractor_model=extractor_model, 2165 ) 2166 if cursor_used_for_stop_condition: 2167 pagination_strategy = StopConditionPaginationStrategyDecorator( 2168 pagination_strategy, 
CursorStopCondition(cursor_used_for_stop_condition) 2169 ) 2170 paginator = DefaultPaginator( 2171 decoder=decoder_to_use, 2172 page_size_option=page_size_option, 2173 page_token_option=page_token_option, 2174 pagination_strategy=pagination_strategy, 2175 url_base=url_base, 2176 config=config, 2177 parameters=model.parameters or {}, 2178 ) 2179 if self._limit_pages_fetched_per_slice: 2180 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2181 return paginator 2182 2183 def create_dpath_extractor( 2184 self, 2185 model: DpathExtractorModel, 2186 config: Config, 2187 decoder: Optional[Decoder] = None, 2188 **kwargs: Any, 2189 ) -> DpathExtractor: 2190 if decoder: 2191 decoder_to_use = decoder 2192 else: 2193 decoder_to_use = JsonDecoder(parameters={}) 2194 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2195 return DpathExtractor( 2196 decoder=decoder_to_use, 2197 field_path=model_field_path, 2198 config=config, 2199 parameters=model.parameters or {}, 2200 ) 2201 2202 @staticmethod 2203 def create_response_to_file_extractor( 2204 model: ResponseToFileExtractorModel, 2205 **kwargs: Any, 2206 ) -> ResponseToFileExtractor: 2207 return ResponseToFileExtractor(parameters=model.parameters or {}) 2208 2209 @staticmethod 2210 def create_exponential_backoff_strategy( 2211 model: ExponentialBackoffStrategyModel, config: Config 2212 ) -> ExponentialBackoffStrategy: 2213 return ExponentialBackoffStrategy( 2214 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2215 ) 2216 2217 @staticmethod 2218 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2219 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2220 2221 def create_http_requester( 2222 self, 2223 model: HttpRequesterModel, 2224 config: Config, 2225 decoder: Decoder = JsonDecoder(parameters={}), 2226 query_properties_key: Optional[str] = None, 2227 use_cache: Optional[bool] = None, 2228 *, 2229 name: str, 2230 ) -> HttpRequester: 2231 authenticator = ( 2232 self._create_component_from_model( 2233 model=model.authenticator, 2234 config=config, 2235 url_base=model.url or model.url_base, 2236 name=name, 2237 decoder=decoder, 2238 ) 2239 if model.authenticator 2240 else None 2241 ) 2242 error_handler = ( 2243 self._create_component_from_model(model=model.error_handler, config=config) 2244 if model.error_handler 2245 else DefaultErrorHandler( 2246 backoff_strategies=[], 2247 response_filters=[], 2248 config=config, 2249 parameters=model.parameters or {}, 2250 ) 2251 ) 2252 2253 api_budget = self._api_budget 2254 2255 # Removes QueryProperties components from the interpolated mappings because it has been designed 2256 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2257 # instead of through jinja interpolation 2258 request_parameters: Optional[Union[str, Mapping[str, str]]] 2259 if isinstance(model.request_parameters, Mapping): 2260 request_parameters = self._remove_query_properties(model.request_parameters) 2261 else: 2262 request_parameters = model.request_parameters 2263 2264 request_options_provider = InterpolatedRequestOptionsProvider( 2265 request_body=model.request_body, 2266 request_body_data=model.request_body_data, 2267 request_body_json=model.request_body_json, 2268 request_headers=model.request_headers, 2269 request_parameters=request_parameters, 2270 query_properties_key=query_properties_key, 2271 config=config, 2272 
parameters=model.parameters or {}, 2273 ) 2274 2275 assert model.use_cache is not None # for mypy 2276 assert model.http_method is not None # for mypy 2277 2278 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2279 2280 return HttpRequester( 2281 name=name, 2282 url=model.url, 2283 url_base=model.url_base, 2284 path=model.path, 2285 authenticator=authenticator, 2286 error_handler=error_handler, 2287 api_budget=api_budget, 2288 http_method=HttpMethod[model.http_method.value], 2289 request_options_provider=request_options_provider, 2290 config=config, 2291 disable_retries=self._disable_retries, 2292 parameters=model.parameters or {}, 2293 message_repository=self._message_repository, 2294 use_cache=should_use_cache, 2295 decoder=decoder, 2296 stream_response=decoder.is_stream_response() if decoder else False, 2297 ) 2298 2299 @staticmethod 2300 def create_http_response_filter( 2301 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2302 ) -> HttpResponseFilter: 2303 if model.action: 2304 action = ResponseAction(model.action.value) 2305 else: 2306 action = None 2307 2308 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2309 2310 http_codes = ( 2311 set(model.http_codes) if model.http_codes else set() 2312 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2313 2314 return HttpResponseFilter( 2315 action=action, 2316 failure_type=failure_type, 2317 error_message=model.error_message or "", 2318 error_message_contains=model.error_message_contains or "", 2319 http_codes=http_codes, 2320 predicate=model.predicate or "", 2321 config=config, 2322 parameters=model.parameters or {}, 2323 ) 2324 2325 @staticmethod 2326 def create_inline_schema_loader( 2327 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2328 ) -> InlineSchemaLoader: 2329 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2330 2331 def create_complex_field_type( 2332 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2333 ) -> ComplexFieldType: 2334 items = ( 2335 self._create_component_from_model(model=model.items, config=config) 2336 if isinstance(model.items, ComplexFieldTypeModel) 2337 else model.items 2338 ) 2339 2340 return ComplexFieldType(field_type=model.field_type, items=items) 2341 2342 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2343 target_type = ( 2344 self._create_component_from_model(model=model.target_type, config=config) 2345 if isinstance(model.target_type, ComplexFieldTypeModel) 2346 else model.target_type 2347 ) 2348 2349 return TypesMap( 2350 target_type=target_type, 2351 current_type=model.current_type, 2352 condition=model.condition if model.condition is not None else "True", 2353 ) 2354 2355 def create_schema_type_identifier( 2356 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2357 ) -> SchemaTypeIdentifier: 2358 types_mapping = [] 2359 if model.types_mapping: 2360 types_mapping.extend( 2361 [ 2362 self._create_component_from_model(types_map, config=config) 2363 for types_map in model.types_mapping 2364 ] 2365 ) 2366 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2367 [x for x in model.schema_pointer] if model.schema_pointer else [] 2368 ) 2369 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2370 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2371 [x for x in model.type_pointer] if 
model.type_pointer else None 2372 ) 2373 2374 return SchemaTypeIdentifier( 2375 schema_pointer=model_schema_pointer, 2376 key_pointer=model_key_pointer, 2377 type_pointer=model_type_pointer, 2378 types_mapping=types_mapping, 2379 parameters=model.parameters or {}, 2380 ) 2381 2382 def create_dynamic_schema_loader( 2383 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2384 ) -> DynamicSchemaLoader: 2385 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2386 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2387 2388 schema_transformations = [] 2389 if model.schema_transformations: 2390 for transformation_model in model.schema_transformations: 2391 schema_transformations.append( 2392 self._create_component_from_model(model=transformation_model, config=config) 2393 ) 2394 2395 retriever = self._create_component_from_model( 2396 model=model.retriever, 2397 config=config, 2398 name="dynamic_properties", 2399 primary_key=None, 2400 stream_slicer=combined_slicers, 2401 transformations=[], 2402 use_cache=True, 2403 ) 2404 schema_type_identifier = self._create_component_from_model( 2405 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2406 ) 2407 return DynamicSchemaLoader( 2408 retriever=retriever, 2409 config=config, 2410 schema_transformations=schema_transformations, 2411 schema_type_identifier=schema_type_identifier, 2412 parameters=model.parameters or {}, 2413 ) 2414 2415 @staticmethod 2416 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2417 return JsonDecoder(parameters={}) 2418 2419 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2420 return CompositeRawDecoder( 2421 parser=ModelToComponentFactory._get_parser(model, config), 2422 stream_response=False if self._emit_connector_builder_messages else True, 2423 ) 2424 2425 def create_jsonl_decoder( 2426 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2427 ) -> Decoder: 2428 return CompositeRawDecoder( 2429 parser=ModelToComponentFactory._get_parser(model, config), 2430 stream_response=False if self._emit_connector_builder_messages else True, 2431 ) 2432 2433 def create_gzip_decoder( 2434 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2435 ) -> Decoder: 2436 _compressed_response_types = { 2437 "gzip", 2438 "x-gzip", 2439 "gzip, deflate", 2440 "x-gzip, deflate", 2441 "application/zip", 2442 "application/gzip", 2443 "application/x-gzip", 2444 "application/x-zip-compressed", 2445 } 2446 2447 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2448 2449 if self._emit_connector_builder_messages: 2450 # This is very surprising but if the response is not streamed, 2451 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2452 # which uses urllib3 directly and does not uncompress the data. 
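# Consequently, in connector-builder mode the payload reaching the parser has already been decompressed by requests, so the GzipParser layer is bypassed and its inner parser is used directly with stream_response=False. Outside the builder, the response is streamed and the GzipParser is only selected when the Content-Encoding/Content-Type headers advertise one of the compressed content types listed above; the inner parser acts as the fallback otherwise.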
2453 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2454 2455 return CompositeRawDecoder.by_headers( 2456 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2457 stream_response=True, 2458 fallback_parser=gzip_parser.inner_parser, 2459 ) 2460 2461 @staticmethod 2462 def create_incrementing_count_cursor( 2463 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2464 ) -> DatetimeBasedCursor: 2465 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2466 # we still parse models into components. The issue is that there's no runtime implementation of a 2467 # IncrementingCountCursor. 2468 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 2469 return DatetimeBasedCursor( 2470 cursor_field=model.cursor_field, 2471 datetime_format="%Y-%m-%d", 2472 start_datetime="2024-12-12", 2473 config=config, 2474 parameters={}, 2475 ) 2476 2477 @staticmethod 2478 def create_iterable_decoder( 2479 model: IterableDecoderModel, config: Config, **kwargs: Any 2480 ) -> IterableDecoder: 2481 return IterableDecoder(parameters={}) 2482 2483 @staticmethod 2484 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2485 return XmlDecoder(parameters={}) 2486 2487 def create_zipfile_decoder( 2488 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2489 ) -> ZipfileDecoder: 2490 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2491 2492 @staticmethod 2493 def _get_parser(model: BaseModel, config: Config) -> Parser: 2494 if isinstance(model, JsonDecoderModel): 2495 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2496 return JsonParser() 2497 elif isinstance(model, JsonlDecoderModel): 2498 return JsonLineParser() 2499 elif isinstance(model, CsvDecoderModel): 2500 return CsvParser(encoding=model.encoding, delimiter=model.delimiter) 2501 elif isinstance(model, GzipDecoderModel): 2502 return GzipParser( 2503 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2504 ) 2505 elif isinstance( 2506 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2507 ): 2508 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2509 2510 raise ValueError(f"Unknown decoder type {model}") 2511 2512 @staticmethod 2513 def create_json_file_schema_loader( 2514 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2515 ) -> JsonFileSchemaLoader: 2516 return JsonFileSchemaLoader( 2517 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2518 ) 2519 2520 @staticmethod 2521 def create_jwt_authenticator( 2522 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2523 ) -> JwtAuthenticator: 2524 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2525 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2526 return JwtAuthenticator( 2527 config=config, 2528 parameters=model.parameters or {}, 2529 algorithm=JwtAlgorithm(model.algorithm.value), 2530 secret_key=model.secret_key, 2531 base64_encode_secret_key=model.base64_encode_secret_key, 2532 token_duration=model.token_duration, 2533 header_prefix=model.header_prefix, 2534 kid=jwt_headers.kid, 2535 typ=jwt_headers.typ, 2536 cty=jwt_headers.cty, 2537 
iss=jwt_payload.iss, 2538 sub=jwt_payload.sub, 2539 aud=jwt_payload.aud, 2540 additional_jwt_headers=model.additional_jwt_headers, 2541 additional_jwt_payload=model.additional_jwt_payload, 2542 ) 2543 2544 def create_list_partition_router( 2545 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2546 ) -> ListPartitionRouter: 2547 request_option = ( 2548 self._create_component_from_model(model.request_option, config) 2549 if model.request_option 2550 else None 2551 ) 2552 return ListPartitionRouter( 2553 cursor_field=model.cursor_field, 2554 request_option=request_option, 2555 values=model.values, 2556 config=config, 2557 parameters=model.parameters or {}, 2558 ) 2559 2560 @staticmethod 2561 def create_min_max_datetime( 2562 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2563 ) -> MinMaxDatetime: 2564 return MinMaxDatetime( 2565 datetime=model.datetime, 2566 datetime_format=model.datetime_format or "", 2567 max_datetime=model.max_datetime or "", 2568 min_datetime=model.min_datetime or "", 2569 parameters=model.parameters or {}, 2570 ) 2571 2572 @staticmethod 2573 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2574 return NoAuth(parameters=model.parameters or {}) 2575 2576 @staticmethod 2577 def create_no_pagination( 2578 model: NoPaginationModel, config: Config, **kwargs: Any 2579 ) -> NoPagination: 2580 return NoPagination(parameters={}) 2581 2582 def create_oauth_authenticator( 2583 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2584 ) -> DeclarativeOauth2Authenticator: 2585 profile_assertion = ( 2586 self._create_component_from_model(model.profile_assertion, config=config) 2587 if model.profile_assertion 2588 else None 2589 ) 2590 2591 if model.refresh_token_updater: 2592 # ignore type error because fixing it would have a lot of dependencies, revisit later 2593 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2594 config, 2595 InterpolatedString.create( 2596 model.token_refresh_endpoint, # type: ignore 2597 parameters=model.parameters or {}, 2598 ).eval(config), 2599 access_token_name=InterpolatedString.create( 2600 model.access_token_name or "access_token", parameters=model.parameters or {} 2601 ).eval(config), 2602 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2603 expires_in_name=InterpolatedString.create( 2604 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2605 ).eval(config), 2606 client_id_name=InterpolatedString.create( 2607 model.client_id_name or "client_id", parameters=model.parameters or {} 2608 ).eval(config), 2609 client_id=InterpolatedString.create( 2610 model.client_id, parameters=model.parameters or {} 2611 ).eval(config) 2612 if model.client_id 2613 else model.client_id, 2614 client_secret_name=InterpolatedString.create( 2615 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2616 ).eval(config), 2617 client_secret=InterpolatedString.create( 2618 model.client_secret, parameters=model.parameters or {} 2619 ).eval(config) 2620 if model.client_secret 2621 else model.client_secret, 2622 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2623 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2624 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2625 grant_type_name=InterpolatedString.create( 2626 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2627 ).eval(config), 2628 
grant_type=InterpolatedString.create( 2629 model.grant_type or "refresh_token", parameters=model.parameters or {} 2630 ).eval(config), 2631 refresh_request_body=InterpolatedMapping( 2632 model.refresh_request_body or {}, parameters=model.parameters or {} 2633 ).eval(config), 2634 refresh_request_headers=InterpolatedMapping( 2635 model.refresh_request_headers or {}, parameters=model.parameters or {} 2636 ).eval(config), 2637 scopes=model.scopes, 2638 token_expiry_date_format=model.token_expiry_date_format, 2639 message_repository=self._message_repository, 2640 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2641 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2642 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2643 ) 2644 # ignore type error because fixing it would have a lot of dependencies, revisit later 2645 return DeclarativeOauth2Authenticator( # type: ignore 2646 access_token_name=model.access_token_name or "access_token", 2647 access_token_value=model.access_token_value, 2648 client_id_name=model.client_id_name or "client_id", 2649 client_id=model.client_id, 2650 client_secret_name=model.client_secret_name or "client_secret", 2651 client_secret=model.client_secret, 2652 expires_in_name=model.expires_in_name or "expires_in", 2653 grant_type_name=model.grant_type_name or "grant_type", 2654 grant_type=model.grant_type or "refresh_token", 2655 refresh_request_body=model.refresh_request_body, 2656 refresh_request_headers=model.refresh_request_headers, 2657 refresh_token_name=model.refresh_token_name or "refresh_token", 2658 refresh_token=model.refresh_token, 2659 scopes=model.scopes, 2660 token_expiry_date=model.token_expiry_date, 2661 token_expiry_date_format=model.token_expiry_date_format, 2662 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2663 token_refresh_endpoint=model.token_refresh_endpoint, 2664 config=config, 2665 parameters=model.parameters or {}, 2666 message_repository=self._message_repository, 2667 profile_assertion=profile_assertion, 2668 use_profile_assertion=model.use_profile_assertion, 2669 ) 2670 2671 def create_offset_increment( 2672 self, 2673 model: OffsetIncrementModel, 2674 config: Config, 2675 decoder: Decoder, 2676 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2677 **kwargs: Any, 2678 ) -> OffsetIncrement: 2679 if isinstance(decoder, PaginationDecoderDecorator): 2680 inner_decoder = decoder.decoder 2681 else: 2682 inner_decoder = decoder 2683 decoder = PaginationDecoderDecorator(decoder=decoder) 2684 2685 if self._is_supported_decoder_for_pagination(inner_decoder): 2686 decoder_to_use = decoder 2687 else: 2688 raise ValueError( 2689 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2690 ) 2691 2692 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2693 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2694 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2695 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2696 # When we have more time to investigate we can look into reusing the same component. 
2697 extractor = ( 2698 self._create_component_from_model( 2699 model=extractor_model, config=config, decoder=decoder_to_use 2700 ) 2701 if extractor_model 2702 else None 2703 ) 2704 2705 return OffsetIncrement( 2706 page_size=model.page_size, 2707 config=config, 2708 decoder=decoder_to_use, 2709 extractor=extractor, 2710 inject_on_first_request=model.inject_on_first_request or False, 2711 parameters=model.parameters or {}, 2712 ) 2713 2714 @staticmethod 2715 def create_page_increment( 2716 model: PageIncrementModel, config: Config, **kwargs: Any 2717 ) -> PageIncrement: 2718 return PageIncrement( 2719 page_size=model.page_size, 2720 config=config, 2721 start_from_page=model.start_from_page or 0, 2722 inject_on_first_request=model.inject_on_first_request or False, 2723 parameters=model.parameters or {}, 2724 ) 2725 2726 def create_parent_stream_config( 2727 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2728 ) -> ParentStreamConfig: 2729 declarative_stream = self._create_component_from_model( 2730 model.stream, config=config, **kwargs 2731 ) 2732 request_option = ( 2733 self._create_component_from_model(model.request_option, config=config) 2734 if model.request_option 2735 else None 2736 ) 2737 2738 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2739 raise ValueError( 2740 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2741 ) 2742 2743 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2744 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2745 ) 2746 2747 return ParentStreamConfig( 2748 parent_key=model.parent_key, 2749 request_option=request_option, 2750 stream=declarative_stream, 2751 partition_field=model.partition_field, 2752 config=config, 2753 incremental_dependency=model.incremental_dependency or False, 2754 parameters=model.parameters or {}, 2755 extra_fields=model.extra_fields, 2756 lazy_read_pointer=model_lazy_read_pointer, 2757 ) 2758 2759 def create_properties_from_endpoint( 2760 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2761 ) -> PropertiesFromEndpoint: 2762 retriever = self._create_component_from_model( 2763 model=model.retriever, 2764 config=config, 2765 name="dynamic_properties", 2766 primary_key=None, 2767 stream_slicer=None, 2768 transformations=[], 2769 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to be different 2770 ) 2771 return PropertiesFromEndpoint( 2772 property_field_path=model.property_field_path, 2773 retriever=retriever, 2774 config=config, 2775 parameters=model.parameters or {}, 2776 ) 2777 2778 def create_property_chunking( 2779 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2780 ) -> PropertyChunking: 2781 record_merge_strategy = ( 2782 self._create_component_from_model( 2783 model=model.record_merge_strategy, config=config, **kwargs 2784 ) 2785 if model.record_merge_strategy 2786 else None 2787 ) 2788 2789 property_limit_type: PropertyLimitType 2790 match model.property_limit_type: 2791 case PropertyLimitTypeModel.property_count: 2792 property_limit_type = PropertyLimitType.property_count 2793 case PropertyLimitTypeModel.characters: 2794 property_limit_type = PropertyLimitType.characters 2795 case _: 2796 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}") 2797 2798 return PropertyChunking( 2799
property_limit_type=property_limit_type, 2800 property_limit=model.property_limit, 2801 record_merge_strategy=record_merge_strategy, 2802 config=config, 2803 parameters=model.parameters or {}, 2804 ) 2805 2806 def create_query_properties( 2807 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2808 ) -> QueryProperties: 2809 if isinstance(model.property_list, list): 2810 property_list = model.property_list 2811 else: 2812 property_list = self._create_component_from_model( 2813 model=model.property_list, config=config, **kwargs 2814 ) 2815 2816 property_chunking = ( 2817 self._create_component_from_model( 2818 model=model.property_chunking, config=config, **kwargs 2819 ) 2820 if model.property_chunking 2821 else None 2822 ) 2823 2824 return QueryProperties( 2825 property_list=property_list, 2826 always_include_properties=model.always_include_properties, 2827 property_chunking=property_chunking, 2828 config=config, 2829 parameters=model.parameters or {}, 2830 ) 2831 2832 @staticmethod 2833 def create_record_filter( 2834 model: RecordFilterModel, config: Config, **kwargs: Any 2835 ) -> RecordFilter: 2836 return RecordFilter( 2837 condition=model.condition or "", config=config, parameters=model.parameters or {} 2838 ) 2839 2840 @staticmethod 2841 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 2842 return RequestPath(parameters={}) 2843 2844 @staticmethod 2845 def create_request_option( 2846 model: RequestOptionModel, config: Config, **kwargs: Any 2847 ) -> RequestOption: 2848 inject_into = RequestOptionType(model.inject_into.value) 2849 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 2850 [ 2851 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 2852 for segment in model.field_path 2853 ] 2854 if model.field_path 2855 else None 2856 ) 2857 field_name = ( 2858 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 2859 if model.field_name 2860 else None 2861 ) 2862 return RequestOption( 2863 field_name=field_name, 2864 field_path=field_path, 2865 inject_into=inject_into, 2866 parameters=kwargs.get("parameters", {}), 2867 ) 2868 2869 def create_record_selector( 2870 self, 2871 model: RecordSelectorModel, 2872 config: Config, 2873 *, 2874 name: str, 2875 transformations: List[RecordTransformation] | None = None, 2876 decoder: Decoder | None = None, 2877 client_side_incremental_sync: Dict[str, Any] | None = None, 2878 file_uploader: Optional[DefaultFileUploader] = None, 2879 **kwargs: Any, 2880 ) -> RecordSelector: 2881 extractor = self._create_component_from_model( 2882 model=model.extractor, decoder=decoder, config=config 2883 ) 2884 record_filter = ( 2885 self._create_component_from_model(model.record_filter, config=config) 2886 if model.record_filter 2887 else None 2888 ) 2889 2890 transform_before_filtering = ( 2891 False if model.transform_before_filtering is None else model.transform_before_filtering 2892 ) 2893 if client_side_incremental_sync: 2894 record_filter = ClientSideIncrementalRecordFilterDecorator( 2895 config=config, 2896 parameters=model.parameters, 2897 condition=model.record_filter.condition 2898 if (model.record_filter and hasattr(model.record_filter, "condition")) 2899 else None, 2900 **client_side_incremental_sync, 2901 ) 2902 transform_before_filtering = ( 2903 True 2904 if model.transform_before_filtering is None 2905 else model.transform_before_filtering 2906 ) 2907 2908 if model.schema_normalization is None: 2909 # default to no schema 
normalization if not set 2910 model.schema_normalization = SchemaNormalizationModel.None_ 2911 2912 schema_normalization = ( 2913 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 2914 if isinstance(model.schema_normalization, SchemaNormalizationModel) 2915 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 2916 ) 2917 2918 return RecordSelector( 2919 extractor=extractor, 2920 name=name, 2921 config=config, 2922 record_filter=record_filter, 2923 transformations=transformations or [], 2924 file_uploader=file_uploader, 2925 schema_normalization=schema_normalization, 2926 parameters=model.parameters or {}, 2927 transform_before_filtering=transform_before_filtering, 2928 ) 2929 2930 @staticmethod 2931 def create_remove_fields( 2932 model: RemoveFieldsModel, config: Config, **kwargs: Any 2933 ) -> RemoveFields: 2934 return RemoveFields( 2935 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 2936 ) 2937 2938 def create_selective_authenticator( 2939 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 2940 ) -> DeclarativeAuthenticator: 2941 authenticators = { 2942 name: self._create_component_from_model(model=auth, config=config) 2943 for name, auth in model.authenticators.items() 2944 } 2945 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 2946 return SelectiveAuthenticator( # type: ignore[abstract] 2947 config=config, 2948 authenticators=authenticators, 2949 authenticator_selection_path=model.authenticator_selection_path, 2950 **kwargs, 2951 ) 2952 2953 @staticmethod 2954 def create_legacy_session_token_authenticator( 2955 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 2956 ) -> LegacySessionTokenAuthenticator: 2957 return LegacySessionTokenAuthenticator( 2958 api_url=url_base, 2959 header=model.header, 2960 login_url=model.login_url, 2961 password=model.password or "", 2962 session_token=model.session_token or "", 2963 session_token_response_key=model.session_token_response_key or "", 2964 username=model.username or "", 2965 validate_session_url=model.validate_session_url, 2966 config=config, 2967 parameters=model.parameters or {}, 2968 ) 2969 2970 def create_simple_retriever( 2971 self, 2972 model: SimpleRetrieverModel, 2973 config: Config, 2974 *, 2975 name: str, 2976 primary_key: Optional[Union[str, List[str], List[List[str]]]], 2977 stream_slicer: Optional[StreamSlicer], 2978 request_options_provider: Optional[RequestOptionsProvider] = None, 2979 stop_condition_on_cursor: bool = False, 2980 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 2981 transformations: List[RecordTransformation], 2982 file_uploader: Optional[DefaultFileUploader] = None, 2983 incremental_sync: Optional[ 2984 Union[ 2985 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 2986 ] 2987 ] = None, 2988 use_cache: Optional[bool] = None, 2989 **kwargs: Any, 2990 ) -> SimpleRetriever: 2991 def _get_url() -> str: 2992 """ 2993 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 2994 This is needed because the URL is not set until the requester is created. 
2995 """ 2996 2997 _url = ( 2998 model.requester.url 2999 if hasattr(model.requester, "url") and model.requester.url is not None 3000 else requester.get_url() 3001 ) 3002 _url_base = ( 3003 model.requester.url_base 3004 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3005 else requester.get_url_base() 3006 ) 3007 3008 return _url or _url_base 3009 3010 decoder = ( 3011 self._create_component_from_model(model=model.decoder, config=config) 3012 if model.decoder 3013 else JsonDecoder(parameters={}) 3014 ) 3015 record_selector = self._create_component_from_model( 3016 model=model.record_selector, 3017 name=name, 3018 config=config, 3019 decoder=decoder, 3020 transformations=transformations, 3021 client_side_incremental_sync=client_side_incremental_sync, 3022 file_uploader=file_uploader, 3023 ) 3024 3025 query_properties: Optional[QueryProperties] = None 3026 query_properties_key: Optional[str] = None 3027 if self._query_properties_in_request_parameters(model.requester): 3028 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3029 # places instead of default to request_parameters which isn't clearly documented 3030 if ( 3031 hasattr(model.requester, "fetch_properties_from_endpoint") 3032 and model.requester.fetch_properties_from_endpoint 3033 ): 3034 raise ValueError( 3035 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3036 ) 3037 3038 query_properties_definitions = [] 3039 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3040 if isinstance(request_parameter, QueryPropertiesModel): 3041 query_properties_key = key 3042 query_properties_definitions.append(request_parameter) 3043 3044 if len(query_properties_definitions) > 1: 3045 raise ValueError( 3046 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3047 ) 3048 3049 if len(query_properties_definitions) == 1: 3050 query_properties = self._create_component_from_model( 3051 model=query_properties_definitions[0], config=config 3052 ) 3053 elif ( 3054 hasattr(model.requester, "fetch_properties_from_endpoint") 3055 and model.requester.fetch_properties_from_endpoint 3056 ): 3057 query_properties_definition = QueryPropertiesModel( 3058 type="QueryProperties", 3059 property_list=model.requester.fetch_properties_from_endpoint, 3060 always_include_properties=None, 3061 property_chunking=None, 3062 ) # type: ignore # $parameters has a default value 3063 3064 query_properties = self.create_query_properties( 3065 model=query_properties_definition, 3066 config=config, 3067 ) 3068 3069 requester = self._create_component_from_model( 3070 model=model.requester, 3071 decoder=decoder, 3072 name=name, 3073 query_properties_key=query_properties_key, 3074 use_cache=use_cache, 3075 config=config, 3076 ) 3077 3078 # Define cursor only if per partition or common incremental support is needed 3079 cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None 3080 3081 if ( 3082 not isinstance(stream_slicer, DatetimeBasedCursor) 3083 or type(stream_slicer) is not DatetimeBasedCursor 3084 ): 3085 # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). 
3086 # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement 3087 # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's 3088 # request_options_provider 3089 request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) 3090 elif not request_options_provider: 3091 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3092 3093 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3094 3095 cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None 3096 paginator = ( 3097 self._create_component_from_model( 3098 model=model.paginator, 3099 config=config, 3100 url_base=_get_url(), 3101 extractor_model=model.record_selector.extractor, 3102 decoder=decoder, 3103 cursor_used_for_stop_condition=cursor_used_for_stop_condition, 3104 ) 3105 if model.paginator 3106 else NoPagination(parameters={}) 3107 ) 3108 3109 ignore_stream_slicer_parameters_on_paginated_requests = ( 3110 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3111 ) 3112 3113 if ( 3114 model.partition_router 3115 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3116 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3117 and any( 3118 parent_stream_config.lazy_read_pointer 3119 for parent_stream_config in model.partition_router.parent_stream_configs 3120 ) 3121 ): 3122 if incremental_sync: 3123 if incremental_sync.type != "DatetimeBasedCursor": 3124 raise ValueError( 3125 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3126 ) 3127 3128 elif incremental_sync.step or incremental_sync.cursor_granularity: 3129 raise ValueError( 3130 f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3131 ) 3132 3133 if model.decoder and model.decoder.type != "JsonDecoder": 3134 raise ValueError( 3135 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3136 ) 3137 3138 return LazySimpleRetriever( 3139 name=name, 3140 paginator=paginator, 3141 primary_key=primary_key, 3142 requester=requester, 3143 record_selector=record_selector, 3144 stream_slicer=stream_slicer, 3145 request_option_provider=request_options_provider, 3146 cursor=cursor, 3147 config=config, 3148 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3149 parameters=model.parameters or {}, 3150 ) 3151 3152 if self._limit_slices_fetched or self._emit_connector_builder_messages: 3153 return SimpleRetrieverTestReadDecorator( 3154 name=name, 3155 paginator=paginator, 3156 primary_key=primary_key, 3157 requester=requester, 3158 record_selector=record_selector, 3159 stream_slicer=stream_slicer, 3160 request_option_provider=request_options_provider, 3161 cursor=cursor, 3162 config=config, 3163 maximum_number_of_slices=self._limit_slices_fetched or 5, 3164 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3165 parameters=model.parameters or {}, 3166 ) 3167 return SimpleRetriever( 3168 name=name, 3169 paginator=paginator, 3170 primary_key=primary_key, 3171 requester=requester, 3172 record_selector=record_selector, 3173 stream_slicer=stream_slicer, 3174 request_option_provider=request_options_provider, 3175 cursor=cursor, 3176 config=config, 3177 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3178 additional_query_properties=query_properties, 3179 parameters=model.parameters or {}, 3180 ) 3181 3182 @staticmethod 3183 def _query_properties_in_request_parameters( 3184 requester: Union[HttpRequesterModel, CustomRequesterModel], 3185 ) -> bool: 3186 if not hasattr(requester, "request_parameters"): 3187 return False 3188 request_parameters = requester.request_parameters 3189 if request_parameters and isinstance(request_parameters, Mapping): 3190 for request_parameter in request_parameters.values(): 3191 if isinstance(request_parameter, QueryPropertiesModel): 3192 return True 3193 return False 3194 3195 @staticmethod 3196 def _remove_query_properties( 3197 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3198 ) -> Mapping[str, str]: 3199 return { 3200 parameter_field: request_parameter 3201 for parameter_field, request_parameter in request_parameters.items() 3202 if not isinstance(request_parameter, QueryPropertiesModel) 3203 } 3204 3205 def create_state_delegating_stream( 3206 self, 3207 model: StateDelegatingStreamModel, 3208 config: Config, 3209 has_parent_state: Optional[bool] = None, 3210 **kwargs: Any, 3211 ) -> DeclarativeStream: 3212 if ( 3213 model.full_refresh_stream.name != model.name 3214 or model.name != model.incremental_stream.name 3215 ): 3216 raise ValueError( 3217 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3218 ) 3219 3220 stream_model = ( 3221 model.incremental_stream 3222 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3223 else model.full_refresh_stream 3224 ) 3225 3226 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description 3227 3228 def _create_async_job_status_mapping( 3229 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3230 ) -> Mapping[str, AsyncJobStatus]: 3231 api_status_to_cdk_status = {} 3232 for cdk_status, api_statuses in model.dict().items(): 3233 if cdk_status == "type": 3234 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3235 continue 3236 3237 for status in api_statuses: 3238 if status in api_status_to_cdk_status: 3239 raise ValueError( 3240 f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once" 3241 ) 3242 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3243 return api_status_to_cdk_status 3244 3245 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3246 match status: 3247 case "running": 3248 return AsyncJobStatus.RUNNING 3249 case "completed": 3250 return AsyncJobStatus.COMPLETED 3251 case "failed": 3252 return AsyncJobStatus.FAILED 3253 case "timeout": 3254 return AsyncJobStatus.TIMED_OUT 3255 case _: 3256 raise ValueError(f"Unsupported CDK status {status}") 3257 3258 def create_async_retriever( 3259 self, 3260 model: AsyncRetrieverModel, 3261 config: Config, 3262 *, 3263 name: str, 3264 primary_key: Optional[ 3265 Union[str, List[str], List[List[str]]] 3266 ], # this seems to be needed to match create_simple_retriever 3267 stream_slicer: Optional[StreamSlicer], 3268 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3269 transformations: List[RecordTransformation], 3270 **kwargs: Any, 3271 ) -> AsyncRetriever: 3272 def _get_download_retriever() -> SimpleRetrieverTestReadDecorator | SimpleRetriever: 3273 record_selector = RecordSelector( 3274 extractor=download_extractor, 3275 name=name, 3276 record_filter=None, 3277 transformations=transformations, 3278 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3279 config=config, 3280 parameters={}, 3281 ) 3282 paginator = ( 3283 self._create_component_from_model( 3284 model=model.download_paginator, 3285 decoder=decoder, 3286 config=config, 3287 url_base="", 3288 ) 3289 if model.download_paginator 3290 else NoPagination(parameters={}) 3291 ) 3292 maximum_number_of_slices = self._limit_slices_fetched or 5 3293 3294 if self._limit_slices_fetched or self._emit_connector_builder_messages: 3295 return SimpleRetrieverTestReadDecorator( 3296 requester=download_requester, 3297 record_selector=record_selector, 3298 primary_key=None, 3299 name=job_download_components_name, 3300 paginator=paginator, 3301 config=config, 3302 parameters={}, 3303 maximum_number_of_slices=maximum_number_of_slices, 3304 ) 3305 3306 return SimpleRetriever( 3307 requester=download_requester, 3308 record_selector=record_selector, 3309 primary_key=None, 3310 name=job_download_components_name, 3311 paginator=paginator, 3312 config=config, 3313 parameters={}, 3314 ) 3315 3316 def _get_job_timeout() -> datetime.timedelta: 3317 user_defined_timeout: Optional[int] = ( 3318 int( 3319 InterpolatedString.create( 3320 str(model.polling_job_timeout), 3321 parameters={}, 3322 ).eval(config) 3323 ) 3324 if 
model.polling_job_timeout 3325 else None 3326 ) 3327 3328 # check for user defined timeout during the test read or 15 minutes 3329 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3330 # default value for non-connector builder is 60 minutes. 3331 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3332 3333 return ( 3334 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3335 ) 3336 3337 decoder = ( 3338 self._create_component_from_model(model=model.decoder, config=config) 3339 if model.decoder 3340 else JsonDecoder(parameters={}) 3341 ) 3342 record_selector = self._create_component_from_model( 3343 model=model.record_selector, 3344 config=config, 3345 decoder=decoder, 3346 name=name, 3347 transformations=transformations, 3348 client_side_incremental_sync=client_side_incremental_sync, 3349 ) 3350 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3351 creation_requester = self._create_component_from_model( 3352 model=model.creation_requester, 3353 decoder=decoder, 3354 config=config, 3355 name=f"job creation - {name}", 3356 ) 3357 polling_requester = self._create_component_from_model( 3358 model=model.polling_requester, 3359 decoder=decoder, 3360 config=config, 3361 name=f"job polling - {name}", 3362 ) 3363 job_download_components_name = f"job download - {name}" 3364 download_decoder = ( 3365 self._create_component_from_model(model=model.download_decoder, config=config) 3366 if model.download_decoder 3367 else JsonDecoder(parameters={}) 3368 ) 3369 download_extractor = ( 3370 self._create_component_from_model( 3371 model=model.download_extractor, 3372 config=config, 3373 decoder=download_decoder, 3374 parameters=model.parameters, 3375 ) 3376 if model.download_extractor 3377 else DpathExtractor( 3378 [], 3379 config=config, 3380 decoder=download_decoder, 3381 parameters=model.parameters or {}, 3382 ) 3383 ) 3384 download_requester = self._create_component_from_model( 3385 model=model.download_requester, 3386 decoder=download_decoder, 3387 config=config, 3388 name=job_download_components_name, 3389 ) 3390 download_retriever = _get_download_retriever() 3391 abort_requester = ( 3392 self._create_component_from_model( 3393 model=model.abort_requester, 3394 decoder=decoder, 3395 config=config, 3396 name=f"job abort - {name}", 3397 ) 3398 if model.abort_requester 3399 else None 3400 ) 3401 delete_requester = ( 3402 self._create_component_from_model( 3403 model=model.delete_requester, 3404 decoder=decoder, 3405 config=config, 3406 name=f"job delete - {name}", 3407 ) 3408 if model.delete_requester 3409 else None 3410 ) 3411 download_target_requester = ( 3412 self._create_component_from_model( 3413 model=model.download_target_requester, 3414 decoder=decoder, 3415 config=config, 3416 name=f"job extract_url - {name}", 3417 ) 3418 if model.download_target_requester 3419 else None 3420 ) 3421 status_extractor = self._create_component_from_model( 3422 model=model.status_extractor, decoder=decoder, config=config, name=name 3423 ) 3424 download_target_extractor = self._create_component_from_model( 3425 model=model.download_target_extractor, 3426 decoder=decoder, 3427 config=config, 3428 name=name, 3429 ) 3430 3431 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3432 creation_requester=creation_requester, 3433 polling_requester=polling_requester, 3434 download_retriever=download_retriever, 3435 download_target_requester=download_target_requester, 3436 abort_requester=abort_requester, 3437 
delete_requester=delete_requester, 3438 status_extractor=status_extractor, 3439 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3440 download_target_extractor=download_target_extractor, 3441 job_timeout=_get_job_timeout(), 3442 ) 3443 3444 async_job_partition_router = AsyncJobPartitionRouter( 3445 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3446 job_repository, 3447 stream_slices, 3448 self._job_tracker, 3449 self._message_repository, 3450 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3451 has_bulk_parent=False, 3452 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3453 # `None` == default retry is set to 3 attempts, under the hood. 3454 job_max_retry=1 if self._emit_connector_builder_messages else None, 3455 ), 3456 stream_slicer=stream_slicer, 3457 config=config, 3458 parameters=model.parameters or {}, 3459 ) 3460 3461 return AsyncRetriever( 3462 record_selector=record_selector, 3463 stream_slicer=async_job_partition_router, 3464 config=config, 3465 parameters=model.parameters or {}, 3466 ) 3467 3468 @staticmethod 3469 def create_spec(model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3470 return Spec( 3471 connection_specification=model.connection_specification, 3472 documentation_url=model.documentation_url, 3473 advanced_auth=model.advanced_auth, 3474 parameters={}, 3475 ) 3476 3477 def create_substream_partition_router( 3478 self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any 3479 ) -> SubstreamPartitionRouter: 3480 parent_stream_configs = [] 3481 if model.parent_stream_configs: 3482 parent_stream_configs.extend( 3483 [ 3484 self._create_message_repository_substream_wrapper( 3485 model=parent_stream_config, config=config, **kwargs 3486 ) 3487 for parent_stream_config in model.parent_stream_configs 3488 ] 3489 ) 3490 3491 return SubstreamPartitionRouter( 3492 parent_stream_configs=parent_stream_configs, 3493 parameters=model.parameters or {}, 3494 config=config, 3495 ) 3496 3497 def _create_message_repository_substream_wrapper( 3498 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 3499 ) -> Any: 3500 substream_factory = ModelToComponentFactory( 3501 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3502 limit_slices_fetched=self._limit_slices_fetched, 3503 emit_connector_builder_messages=self._emit_connector_builder_messages, 3504 disable_retries=self._disable_retries, 3505 disable_cache=self._disable_cache, 3506 message_repository=LogAppenderMessageRepositoryDecorator( 3507 {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, 3508 self._message_repository, 3509 self._evaluate_log_level(self._emit_connector_builder_messages), 3510 ), 3511 ) 3512 3513 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3514 has_parent_state = bool( 3515 self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) 3516 if model.incremental_dependency 3517 else False 3518 ) 3519 return substream_factory._create_component_from_model( 3520 model=model, config=config, has_parent_state=has_parent_state, **kwargs 3521 ) 3522 3523 @staticmethod 3524 def create_wait_time_from_header( 3525 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3526 ) -> WaitTimeFromHeaderBackoffStrategy: 3527 return WaitTimeFromHeaderBackoffStrategy( 3528 header=model.header, 3529 parameters=model.parameters or {}, 3530 
config=config, 3531 regex=model.regex, 3532 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3533 if model.max_waiting_time_in_seconds is not None 3534 else None, 3535 ) 3536 3537 @staticmethod 3538 def create_wait_until_time_from_header( 3539 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3540 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3541 return WaitUntilTimeFromHeaderBackoffStrategy( 3542 header=model.header, 3543 parameters=model.parameters or {}, 3544 config=config, 3545 min_wait=model.min_wait, 3546 regex=model.regex, 3547 ) 3548 3549 def get_message_repository(self) -> MessageRepository: 3550 return self._message_repository 3551 3552 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3553 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3554 3555 @staticmethod 3556 def create_components_mapping_definition( 3557 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3558 ) -> ComponentMappingDefinition: 3559 interpolated_value = InterpolatedString.create( 3560 model.value, parameters=model.parameters or {} 3561 ) 3562 field_path = [ 3563 InterpolatedString.create(path, parameters=model.parameters or {}) 3564 for path in model.field_path 3565 ] 3566 return ComponentMappingDefinition( 3567 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3568 value=interpolated_value, 3569 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3570 parameters=model.parameters or {}, 3571 ) 3572 3573 def create_http_components_resolver( 3574 self, model: HttpComponentsResolverModel, config: Config 3575 ) -> Any: 3576 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3577 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3578 3579 retriever = self._create_component_from_model( 3580 model=model.retriever, 3581 config=config, 3582 name="", 3583 primary_key=None, 3584 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3585 transformations=[], 3586 ) 3587 3588 components_mapping = [ 3589 self._create_component_from_model( 3590 model=components_mapping_definition_model, 3591 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3592 components_mapping_definition_model.value_type 3593 ), 3594 config=config, 3595 ) 3596 for components_mapping_definition_model in model.components_mapping 3597 ] 3598 3599 return HttpComponentsResolver( 3600 retriever=retriever, 3601 config=config, 3602 components_mapping=components_mapping, 3603 parameters=model.parameters or {}, 3604 ) 3605 3606 @staticmethod 3607 def create_stream_config( 3608 model: StreamConfigModel, config: Config, **kwargs: Any 3609 ) -> StreamConfig: 3610 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3611 [x for x in model.configs_pointer] if model.configs_pointer else [] 3612 ) 3613 3614 return StreamConfig( 3615 configs_pointer=model_configs_pointer, 3616 parameters=model.parameters or {}, 3617 ) 3618 3619 def create_config_components_resolver( 3620 self, model: ConfigComponentsResolverModel, config: Config 3621 ) -> Any: 3622 stream_config = self._create_component_from_model( 3623 model.stream_config, config=config, parameters=model.parameters or {} 3624 ) 3625 3626 components_mapping = [ 3627 self._create_component_from_model( 3628 model=components_mapping_definition_model, 3629 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3630 
components_mapping_definition_model.value_type 3631 ), 3632 config=config, 3633 ) 3634 for components_mapping_definition_model in model.components_mapping 3635 ] 3636 3637 return ConfigComponentsResolver( 3638 stream_config=stream_config, 3639 config=config, 3640 components_mapping=components_mapping, 3641 parameters=model.parameters or {}, 3642 ) 3643 3644 _UNSUPPORTED_DECODER_ERROR = ( 3645 "Specified decoder of {decoder_type} is not supported for pagination." 3646 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 3647 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 3648 ) 3649 3650 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 3651 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 3652 return True 3653 elif isinstance(decoder, CompositeRawDecoder): 3654 return self._is_supported_parser_for_pagination(decoder.parser) 3655 else: 3656 return False 3657 3658 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 3659 if isinstance(parser, JsonParser): 3660 return True 3661 elif isinstance(parser, GzipParser): 3662 return isinstance(parser.inner_parser, JsonParser) 3663 else: 3664 return False 3665 3666 def create_http_api_budget( 3667 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3668 ) -> HttpAPIBudget: 3669 policies = [ 3670 self._create_component_from_model(model=policy, config=config) 3671 for policy in model.policies 3672 ] 3673 3674 return HttpAPIBudget( 3675 policies=policies, 3676 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 3677 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 3678 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 3679 ) 3680 3681 def create_fixed_window_call_rate_policy( 3682 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 3683 ) -> FixedWindowCallRatePolicy: 3684 matchers = [ 3685 self._create_component_from_model(model=matcher, config=config) 3686 for matcher in model.matchers 3687 ] 3688 3689 # Set the initial reset timestamp to 10 days from now. 3690 # This value will be updated by the first request. 
3691 return FixedWindowCallRatePolicy( 3692 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 3693 period=parse_duration(model.period), 3694 call_limit=model.call_limit, 3695 matchers=matchers, 3696 ) 3697 3698 def create_file_uploader( 3699 self, model: FileUploaderModel, config: Config, **kwargs: Any 3700 ) -> FileUploader: 3701 name = "File Uploader" 3702 requester = self._create_component_from_model( 3703 model=model.requester, 3704 config=config, 3705 name=name, 3706 **kwargs, 3707 ) 3708 download_target_extractor = self._create_component_from_model( 3709 model=model.download_target_extractor, 3710 config=config, 3711 name=name, 3712 **kwargs, 3713 ) 3714 emit_connector_builder_messages = self._emit_connector_builder_messages 3715 file_uploader = DefaultFileUploader( 3716 requester=requester, 3717 download_target_extractor=download_target_extractor, 3718 config=config, 3719 file_writer=NoopFileWriter() 3720 if emit_connector_builder_messages 3721 else LocalFileSystemFileWriter(), 3722 parameters=model.parameters or {}, 3723 filename_extractor=model.filename_extractor if model.filename_extractor else None, 3724 ) 3725 3726 return ( 3727 ConnectorBuilderFileUploader(file_uploader) 3728 if emit_connector_builder_messages 3729 else file_uploader 3730 ) 3731 3732 def create_moving_window_call_rate_policy( 3733 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 3734 ) -> MovingWindowCallRatePolicy: 3735 rates = [ 3736 self._create_component_from_model(model=rate, config=config) for rate in model.rates 3737 ] 3738 matchers = [ 3739 self._create_component_from_model(model=matcher, config=config) 3740 for matcher in model.matchers 3741 ] 3742 return MovingWindowCallRatePolicy( 3743 rates=rates, 3744 matchers=matchers, 3745 ) 3746 3747 def create_unlimited_call_rate_policy( 3748 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 3749 ) -> UnlimitedCallRatePolicy: 3750 matchers = [ 3751 self._create_component_from_model(model=matcher, config=config) 3752 for matcher in model.matchers 3753 ] 3754 3755 return UnlimitedCallRatePolicy( 3756 matchers=matchers, 3757 ) 3758 3759 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 3760 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 3761 return Rate( 3762 limit=int(interpolated_limit.eval(config=config)), 3763 interval=parse_duration(model.interval), 3764 ) 3765 3766 def create_http_request_matcher( 3767 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 3768 ) -> HttpRequestRegexMatcher: 3769 return HttpRequestRegexMatcher( 3770 method=model.method, 3771 url_base=model.url_base, 3772 url_path_pattern=model.url_path_pattern, 3773 params=model.params, 3774 headers=model.headers, 3775 ) 3776 3777 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 3778 self._api_budget = self.create_component( 3779 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 3780 ) 3781 3782 def create_grouping_partition_router( 3783 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 3784 ) -> GroupingPartitionRouter: 3785 underlying_router = self._create_component_from_model( 3786 model=model.underlying_partition_router, config=config 3787 ) 3788 if model.group_size < 1: 3789 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 3790 3791 # Request options in underlying partition routers are not supported for 
GroupingPartitionRouter 3792 # because they are specific to individual partitions and cannot be aggregated or handled 3793 # when grouping, potentially leading to incorrect API calls. Any request customization 3794 # should be managed at the stream level through the requester's configuration. 3795 if isinstance(underlying_router, SubstreamPartitionRouter): 3796 if any( 3797 parent_config.request_option 3798 for parent_config in underlying_router.parent_stream_configs 3799 ): 3800 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 3801 3802 if isinstance(underlying_router, ListPartitionRouter): 3803 if underlying_router.request_option: 3804 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 3805 3806 return GroupingPartitionRouter( 3807 group_size=model.group_size, 3808 underlying_partition_router=underlying_router, 3809 deduplicate=model.deduplicate if model.deduplicate is not None else True, 3810 config=config, 3811 )
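To make the validation above concrete, here is an illustrative sketch (not taken from the codebase) of a component definition that create_grouping_partition_router would reject, because its underlying ListPartitionRouter declares a request_option. The field names mirror the model attributes referenced in the code; the exact manifest schema and the models import path are assumptions.

    # Assumption: the generated Pydantic models are importable from this module path.
    from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
        GroupingPartitionRouter as GroupingPartitionRouterModel,
    )

    grouping_definition = {
        "type": "GroupingPartitionRouter",
        "group_size": 10,
        "underlying_partition_router": {
            "type": "ListPartitionRouter",
            "cursor_field": "board_id",
            "values": ["a", "b", "c"],
            # Dropping this request_option block would make the definition acceptable.
            "request_option": {
                "type": "RequestOption",
                "inject_into": "request_parameter",
                "field_name": "board_id",
            },
        },
    }

    # factory.create_component(GroupingPartitionRouterModel, grouping_definition, config)
    # would raise: ValueError("Request options are not supported for GroupingPartitionRouter.")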
580 def __init__( 581 self, 582 limit_pages_fetched_per_slice: Optional[int] = None, 583 limit_slices_fetched: Optional[int] = None, 584 emit_connector_builder_messages: bool = False, 585 disable_retries: bool = False, 586 disable_cache: bool = False, 587 disable_resumable_full_refresh: bool = False, 588 message_repository: Optional[MessageRepository] = None, 589 connector_state_manager: Optional[ConnectorStateManager] = None, 590 max_concurrent_async_job_count: Optional[int] = None, 591 ): 592 self._init_mappings() 593 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 594 self._limit_slices_fetched = limit_slices_fetched 595 self._emit_connector_builder_messages = emit_connector_builder_messages 596 self._disable_retries = disable_retries 597 self._disable_cache = disable_cache 598 self._disable_resumable_full_refresh = disable_resumable_full_refresh 599 self._message_repository = message_repository or InMemoryMessageRepository( 600 self._evaluate_log_level(emit_connector_builder_messages) 601 ) 602 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 603 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 604 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 605 # placeholder for deprecation warnings 606 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
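For orientation, a minimal construction sketch using only keyword arguments that appear in the signature above; the flag values are illustrative, roughly what a Connector Builder test read might use.

    from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
        ModelToComponentFactory,
    )

    # Illustrative values: cap test reads and emit Connector Builder log messages.
    factory = ModelToComponentFactory(
        limit_pages_fetched_per_slice=2,
        limit_slices_fetched=5,
        emit_connector_builder_messages=True,
        disable_retries=True,
        disable_cache=True,
    )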
711 def create_component( 712 self, 713 model_type: Type[BaseModel], 714 component_definition: ComponentDefinition, 715 config: Config, 716 **kwargs: Any, 717 ) -> Any: 718 """ 719 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 720 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 721 declarative components from that model. 722 723 :param model_type: The type of declarative component that is being initialized 724 :param component_definition: The mapping that represents a declarative component 725 :param config: The connector config that is provided by the customer 726 :return: The declarative component to be used at runtime 727 """ 728 729 component_type = component_definition.get("type") 730 if component_definition.get("type") != model_type.__name__: 731 raise ValueError( 732 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 733 ) 734 735 declarative_component_model = model_type.parse_obj(component_definition) 736 737 if not isinstance(declarative_component_model, model_type): 738 raise ValueError( 739 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 740 ) 741 742 return self._create_component_from_model( 743 model=declarative_component_model, config=config, **kwargs 744 )
Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating declarative components from that model.
Parameters
- model_type: The type of declarative component that is being initialized
- component_definition: The mapping that represents a declarative component
- config: The connector config that is provided by the customer
Returns
The declarative component to be used at runtime
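A minimal usage sketch, assuming the generated Pydantic models are importable from the models package (that import path is an assumption): the component definition's "type" must equal the model class name, otherwise the first ValueError in the code above is raised.

    from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel
    from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
        ModelToComponentFactory,
    )

    spec_definition = {
        "type": "Spec",  # must match SpecModel.__name__
        "connection_specification": {"type": "object", "properties": {}},
        "documentation_url": "https://example.com/docs",
    }

    factory = ModelToComponentFactory()
    spec = factory.create_component(
        model_type=SpecModel,
        component_definition=spec_definition,
        config={},
    )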
761 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 762 """ 763 Returns the deprecation warnings that were collected during the creation of components. 764 """ 765 return self._collected_deprecation_logs
Returns the deprecation warnings that were collected during the creation of components.
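A short sketch of how this is typically consumed, assuming `factory` is a ModelToComponentFactory that has already built components (for example via create_component above):

    for deprecation_log in factory.get_model_deprecations():
        # Each entry is a connector_builder LogMessage collected while building components.
        print(deprecation_log)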
782 @staticmethod 783 def create_added_field_definition( 784 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 785 ) -> AddedFieldDefinition: 786 interpolated_value = InterpolatedString.create( 787 model.value, parameters=model.parameters or {} 788 ) 789 return AddedFieldDefinition( 790 path=model.path, 791 value=interpolated_value, 792 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 793 parameters=model.parameters or {}, 794 )
796 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 797 added_field_definitions = [ 798 self._create_component_from_model( 799 model=added_field_definition_model, 800 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 801 added_field_definition_model.value_type 802 ), 803 config=config, 804 ) 805 for added_field_definition_model in model.fields 806 ] 807 return AddFields( 808 fields=added_field_definitions, 809 condition=model.condition or "", 810 parameters=model.parameters or {}, 811 )
837 def create_dpath_flatten_fields( 838 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 839 ) -> DpathFlattenFields: 840 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 841 key_transformation = ( 842 KeyTransformation( 843 config=config, 844 prefix=model.key_transformation.prefix, 845 suffix=model.key_transformation.suffix, 846 parameters=model.parameters or {}, 847 ) 848 if model.key_transformation is not None 849 else None 850 ) 851 return DpathFlattenFields( 852 config=config, 853 field_path=model_field_path, 854 delete_origin_value=model.delete_origin_value 855 if model.delete_origin_value is not None 856 else False, 857 replace_record=model.replace_record if model.replace_record is not None else False, 858 key_transformation=key_transformation, 859 parameters=model.parameters or {}, 860 )
874 def create_api_key_authenticator( 875 self, 876 model: ApiKeyAuthenticatorModel, 877 config: Config, 878 token_provider: Optional[TokenProvider] = None, 879 **kwargs: Any, 880 ) -> ApiKeyAuthenticator: 881 if model.inject_into is None and model.header is None: 882 raise ValueError( 883 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 884 ) 885 886 if model.inject_into is not None and model.header is not None: 887 raise ValueError( 888 "inject_into and header cannot both be set for ApiKeyAuthenticator - remove the deprecated header option" 889 ) 890 891 if token_provider is not None and model.api_token != "": 892 raise ValueError( 893 "If token_provider is set, api_token is ignored and has to be set to empty string." 894 ) 895 896 request_option = ( 897 self._create_component_from_model( 898 model.inject_into, config, parameters=model.parameters or {} 899 ) 900 if model.inject_into 901 else RequestOption( 902 inject_into=RequestOptionType.header, 903 field_name=model.header or "", 904 parameters=model.parameters or {}, 905 ) 906 ) 907 908 return ApiKeyAuthenticator( 909 token_provider=( 910 token_provider 911 if token_provider is not None 912 else InterpolatedStringTokenProvider( 913 api_token=model.api_token or "", 914 config=config, 915 parameters=model.parameters or {}, 916 ) 917 ), 918 request_option=request_option, 919 config=config, 920 parameters=model.parameters or {}, 921 )
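The two placement styles accepted by this factory method, sketched as component definitions (illustrative only; the interpolation syntax and exact manifest schema are assumptions, and the field names mirror the model attributes used in the code above):

    # Preferred form: placement is declared explicitly through inject_into.
    api_key_definition = {
        "type": "ApiKeyAuthenticator",
        "api_token": "{{ config['api_key'] }}",
        "inject_into": {
            "type": "RequestOption",
            "inject_into": "header",
            "field_name": "X-API-Key",
        },
    }

    # Deprecated shortcut: header only. Supplying both header and inject_into
    # triggers the second ValueError above.
    legacy_api_key_definition = {
        "type": "ApiKeyAuthenticator",
        "api_token": "{{ config['api_key'] }}",
        "header": "X-API-Key",
    }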
923 def create_legacy_to_per_partition_state_migration( 924 self, 925 model: LegacyToPerPartitionStateMigrationModel, 926 config: Mapping[str, Any], 927 declarative_stream: DeclarativeStreamModel, 928 ) -> LegacyToPerPartitionStateMigration: 929 retriever = declarative_stream.retriever 930 if not isinstance(retriever, SimpleRetrieverModel): 931 raise ValueError( 932 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever. Got {type(retriever)}" 933 ) 934 partition_router = retriever.partition_router 935 if not isinstance( 936 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 937 ): 938 raise ValueError( 939 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 940 ) 941 if not hasattr(partition_router, "parent_stream_configs"): 942 raise ValueError( 943 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 944 ) 945 946 if not hasattr(declarative_stream, "incremental_sync"): 947 raise ValueError( 948 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 949 ) 950 951 return LegacyToPerPartitionStateMigration( 952 partition_router, # type: ignore # was already checked above 953 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 954 config, 955 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 956 )
958 def create_session_token_authenticator( 959 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 960 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 961 decoder = ( 962 self._create_component_from_model(model=model.decoder, config=config) 963 if model.decoder 964 else JsonDecoder(parameters={}) 965 ) 966 login_requester = self._create_component_from_model( 967 model=model.login_requester, 968 config=config, 969 name=f"{name}_login_requester", 970 decoder=decoder, 971 ) 972 token_provider = SessionTokenProvider( 973 login_requester=login_requester, 974 session_token_path=model.session_token_path, 975 expiration_duration=parse_duration(model.expiration_duration) 976 if model.expiration_duration 977 else None, 978 parameters=model.parameters or {}, 979 message_repository=self._message_repository, 980 decoder=decoder, 981 ) 982 if model.request_authentication.type == "Bearer": 983 return ModelToComponentFactory.create_bearer_authenticator( 984 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 985 config, 986 token_provider=token_provider, 987 ) 988 else: 989 return self.create_api_key_authenticator( 990 ApiKeyAuthenticatorModel( 991 type="ApiKeyAuthenticator", 992 api_token="", 993 inject_into=model.request_authentication.inject_into, 994 ), # type: ignore # $parameters and headers default to None 995 config=config, 996 token_provider=token_provider, 997 )
999 @staticmethod 1000 def create_basic_http_authenticator( 1001 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1002 ) -> BasicHttpAuthenticator: 1003 return BasicHttpAuthenticator( 1004 password=model.password or "", 1005 username=model.username, 1006 config=config, 1007 parameters=model.parameters or {}, 1008 )
1010 @staticmethod 1011 def create_bearer_authenticator( 1012 model: BearerAuthenticatorModel, 1013 config: Config, 1014 token_provider: Optional[TokenProvider] = None, 1015 **kwargs: Any, 1016 ) -> BearerAuthenticator: 1017 if token_provider is not None and model.api_token != "": 1018 raise ValueError( 1019 "If token_provider is set, api_token is ignored and has to be set to empty string." 1020 ) 1021 return BearerAuthenticator( 1022 token_provider=( 1023 token_provider 1024 if token_provider is not None 1025 else InterpolatedStringTokenProvider( 1026 api_token=model.api_token or "", 1027 config=config, 1028 parameters=model.parameters or {}, 1029 ) 1030 ), 1031 config=config, 1032 parameters=model.parameters or {}, 1033 )
1035 @staticmethod 1036 def create_dynamic_stream_check_config( 1037 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1038 ) -> DynamicStreamCheckConfig: 1039 return DynamicStreamCheckConfig( 1040 dynamic_stream_name=model.dynamic_stream_name, 1041 stream_count=model.stream_count or 0, 1042 )
1044 def create_check_stream( 1045 self, model: CheckStreamModel, config: Config, **kwargs: Any 1046 ) -> CheckStream: 1047 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1048 raise ValueError( 1049 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1050 ) 1051 1052 dynamic_streams_check_configs = ( 1053 [ 1054 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1055 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1056 ] 1057 if model.dynamic_streams_check_configs 1058 else [] 1059 ) 1060 1061 return CheckStream( 1062 stream_names=model.stream_names or [], 1063 dynamic_streams_check_configs=dynamic_streams_check_configs, 1064 parameters={}, 1065 )
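As a small illustration, a CheckStream definition needs at least one of the two fields validated above; a hypothetical definition that probes a single stream might look like this (the stream name is made up):

    check_definition = {
        "type": "CheckStream",
        "stream_names": ["users"],
    }
    # factory.create_component(CheckStreamModel, check_definition, config) would then
    # build a CheckStream that checks only the "users" stream.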
1067 @staticmethod 1068 def create_check_dynamic_stream( 1069 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1070 ) -> CheckDynamicStream: 1071 assert model.use_check_availability is not None # for mypy 1072 1073 use_check_availability = model.use_check_availability 1074 1075 return CheckDynamicStream( 1076 stream_count=model.stream_count, 1077 use_check_availability=use_check_availability, 1078 parameters={}, 1079 )
1081 def create_composite_error_handler( 1082 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1083 ) -> CompositeErrorHandler: 1084 error_handlers = [ 1085 self._create_component_from_model(model=error_handler_model, config=config) 1086 for error_handler_model in model.error_handlers 1087 ] 1088 return CompositeErrorHandler( 1089 error_handlers=error_handlers, parameters=model.parameters or {} 1090 )
1092 @staticmethod 1093 def create_concurrency_level( 1094 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1095 ) -> ConcurrencyLevel: 1096 return ConcurrencyLevel( 1097 default_concurrency=model.default_concurrency, 1098 max_concurrency=model.max_concurrency, 1099 config=config, 1100 parameters={}, 1101 )
1103 @staticmethod 1104 def apply_stream_state_migrations( 1105 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1106 ) -> MutableMapping[str, Any]: 1107 if stream_state_migrations: 1108 for state_migration in stream_state_migrations: 1109 if state_migration.should_migrate(stream_state): 1110 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 1111 stream_state = dict(state_migration.migrate(stream_state)) 1112 return stream_state
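As a rough sketch of the duck-typed contract this helper relies on, a migration object only needs should_migrate and migrate methods. The LegacyCursorKeyMigration class below is hypothetical and exists purely for illustration.

from typing import Any, Mapping, MutableMapping

from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

class LegacyCursorKeyMigration:
    # Hypothetical migration: renames a legacy cursor key to the current one.
    def should_migrate(self, stream_state: Mapping[str, Any]) -> bool:
        return "last_updated" in stream_state

    def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
        return {"updated_at": stream_state["last_updated"]}

state: MutableMapping[str, Any] = {"last_updated": "2024-01-01T00:00:00Z"}
migrated = ModelToComponentFactory.apply_stream_state_migrations(
    [LegacyCursorKeyMigration()], state
)
# migrated == {"updated_at": "2024-01-01T00:00:00Z"}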
1114 def create_concurrent_cursor_from_datetime_based_cursor( 1115 self, 1116 model_type: Type[BaseModel], 1117 component_definition: ComponentDefinition, 1118 stream_name: str, 1119 stream_namespace: Optional[str], 1120 config: Config, 1121 message_repository: Optional[MessageRepository] = None, 1122 runtime_lookback_window: Optional[datetime.timedelta] = None, 1123 stream_state_migrations: Optional[List[Any]] = None, 1124 **kwargs: Any, 1125 ) -> ConcurrentCursor: 1126 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1127 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1128 # incoming state and connector_state_manager that is initialized when the component factory is created 1129 stream_state = ( 1130 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1131 if "stream_state" not in kwargs 1132 else kwargs["stream_state"] 1133 ) 1134 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1135 1136 component_type = component_definition.get("type") 1137 if component_definition.get("type") != model_type.__name__: 1138 raise ValueError( 1139 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1140 ) 1141 1142 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1143 1144 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1145 raise ValueError( 1146 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1147 ) 1148 1149 interpolated_cursor_field = InterpolatedString.create( 1150 datetime_based_cursor_model.cursor_field, 1151 parameters=datetime_based_cursor_model.parameters or {}, 1152 ) 1153 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1154 1155 interpolated_partition_field_start = InterpolatedString.create( 1156 datetime_based_cursor_model.partition_field_start or "start_time", 1157 parameters=datetime_based_cursor_model.parameters or {}, 1158 ) 1159 interpolated_partition_field_end = InterpolatedString.create( 1160 datetime_based_cursor_model.partition_field_end or "end_time", 1161 parameters=datetime_based_cursor_model.parameters or {}, 1162 ) 1163 1164 slice_boundary_fields = ( 1165 interpolated_partition_field_start.eval(config=config), 1166 interpolated_partition_field_end.eval(config=config), 1167 ) 1168 1169 datetime_format = datetime_based_cursor_model.datetime_format 1170 1171 cursor_granularity = ( 1172 parse_duration(datetime_based_cursor_model.cursor_granularity) 1173 if datetime_based_cursor_model.cursor_granularity 1174 else None 1175 ) 1176 1177 lookback_window = None 1178 interpolated_lookback_window = ( 1179 InterpolatedString.create( 1180 datetime_based_cursor_model.lookback_window, 1181 parameters=datetime_based_cursor_model.parameters or {}, 1182 ) 1183 if datetime_based_cursor_model.lookback_window 1184 else None 1185 ) 1186 if interpolated_lookback_window: 1187 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1188 if evaluated_lookback_window: 1189 lookback_window = parse_duration(evaluated_lookback_window) 1190 1191 connector_state_converter: DateTimeStreamStateConverter 1192 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1193 datetime_format=datetime_format, 1194 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1195 is_sequential_state=True, # 
ConcurrentPerPartitionCursor only works with sequential state 1196 cursor_granularity=cursor_granularity, 1197 ) 1198 1199 # Adjusts the stream state by applying the runtime lookback window. 1200 # This is used to ensure correct state handling in case of failed partitions. 1201 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1202 if runtime_lookback_window and stream_state_value: 1203 new_stream_state = ( 1204 connector_state_converter.parse_timestamp(stream_state_value) 1205 - runtime_lookback_window 1206 ) 1207 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1208 new_stream_state 1209 ) 1210 1211 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1212 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1213 start_date_runtime_value = self.create_min_max_datetime( 1214 model=datetime_based_cursor_model.start_datetime, config=config 1215 ) 1216 else: 1217 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1218 1219 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1220 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1221 end_date_runtime_value = self.create_min_max_datetime( 1222 model=datetime_based_cursor_model.end_datetime, config=config 1223 ) 1224 else: 1225 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1226 1227 interpolated_start_date = MinMaxDatetime.create( 1228 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1229 parameters=datetime_based_cursor_model.parameters, 1230 ) 1231 interpolated_end_date = ( 1232 None 1233 if not end_date_runtime_value 1234 else MinMaxDatetime.create( 1235 end_date_runtime_value, datetime_based_cursor_model.parameters 1236 ) 1237 ) 1238 1239 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1240 if not interpolated_start_date.datetime_format: 1241 interpolated_start_date.datetime_format = datetime_format 1242 if interpolated_end_date and not interpolated_end_date.datetime_format: 1243 interpolated_end_date.datetime_format = datetime_format 1244 1245 start_date = interpolated_start_date.get_datetime(config=config) 1246 end_date_provider = ( 1247 partial(interpolated_end_date.get_datetime, config) 1248 if interpolated_end_date 1249 else connector_state_converter.get_end_provider() 1250 ) 1251 1252 if ( 1253 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1254 ) or ( 1255 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1256 ): 1257 raise ValueError( 1258 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1259 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1260 ) 1261 1262 # When step is not defined, default to a step size from the starting date to the present moment 1263 step_length = datetime.timedelta.max 1264 interpolated_step = ( 1265 InterpolatedString.create( 1266 datetime_based_cursor_model.step, 1267 parameters=datetime_based_cursor_model.parameters or {}, 1268 ) 1269 if datetime_based_cursor_model.step 1270 else None 1271 ) 1272 if interpolated_step: 1273 evaluated_step = interpolated_step.eval(config) 1274 if evaluated_step: 1275 step_length = parse_duration(evaluated_step) 1276 1277 clamping_strategy: ClampingStrategy = NoClamping() 1278 if datetime_based_cursor_model.clamping: 1279 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1280 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1281 # object which we want to keep agnostic of being low-code 1282 target = InterpolatedString( 1283 string=datetime_based_cursor_model.clamping.target, 1284 parameters=datetime_based_cursor_model.parameters or {}, 1285 ) 1286 evaluated_target = target.eval(config=config) 1287 match evaluated_target: 1288 case "DAY": 1289 clamping_strategy = DayClampingStrategy() 1290 end_date_provider = ClampingEndProvider( 1291 DayClampingStrategy(is_ceiling=False), 1292 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1293 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1294 ) 1295 case "WEEK": 1296 if ( 1297 not datetime_based_cursor_model.clamping.target_details 1298 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1299 ): 1300 raise ValueError( 1301 "Given WEEK clamping, weekday needs to be provided as target_details" 1302 ) 1303 weekday = self._assemble_weekday( 1304 datetime_based_cursor_model.clamping.target_details["weekday"] 1305 ) 1306 clamping_strategy = WeekClampingStrategy(weekday) 1307 end_date_provider = ClampingEndProvider( 1308 WeekClampingStrategy(weekday, is_ceiling=False), 1309 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1310 granularity=cursor_granularity or datetime.timedelta(days=1), 1311 ) 1312 case "MONTH": 1313 clamping_strategy = MonthClampingStrategy() 1314 end_date_provider = ClampingEndProvider( 1315 MonthClampingStrategy(is_ceiling=False), 1316 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1317 granularity=cursor_granularity or datetime.timedelta(days=1), 1318 ) 1319 case _: 1320 raise ValueError( 1321 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1322 ) 1323 1324 return ConcurrentCursor( 1325 stream_name=stream_name, 1326 stream_namespace=stream_namespace, 1327 stream_state=stream_state, 1328 message_repository=message_repository or self._message_repository, 1329 connector_state_manager=self._connector_state_manager, 1330 connector_state_converter=connector_state_converter, 1331 cursor_field=cursor_field, 1332 slice_boundary_fields=slice_boundary_fields, 1333 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1334 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1335 lookback_window=lookback_window, 1336 slice_range=step_length, 1337 cursor_granularity=cursor_granularity, 1338 clamping_strategy=clamping_strategy, 1339 )
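A minimal sketch of driving this method with a dict-style component definition. It assumes the factory can be constructed with its default arguments and that an empty connector config and state are acceptable; the stream name, cursor field, and date values are illustrative only.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DatetimeBasedCursor as DatetimeBasedCursorModel,
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()  # assumed default construction
cursor = factory.create_concurrent_cursor_from_datetime_based_cursor(
    model_type=DatetimeBasedCursorModel,
    component_definition={
        "type": "DatetimeBasedCursor",
        "cursor_field": "updated_at",
        "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
        "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ"],
        "start_datetime": "2024-01-01T00:00:00Z",
        "step": "P30D",               # step and cursor_granularity must be set together
        "cursor_granularity": "PT1S",
    },
    stream_name="items",              # illustrative stream name
    stream_namespace=None,
    config={},
)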
1341 def create_concurrent_cursor_from_incrementing_count_cursor( 1342 self, 1343 model_type: Type[BaseModel], 1344 component_definition: ComponentDefinition, 1345 stream_name: str, 1346 stream_namespace: Optional[str], 1347 config: Config, 1348 message_repository: Optional[MessageRepository] = None, 1349 **kwargs: Any, 1350 ) -> ConcurrentCursor: 1351 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1352 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1353 # incoming state and connector_state_manager that is initialized when the component factory is created 1354 stream_state = ( 1355 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1356 if "stream_state" not in kwargs 1357 else kwargs["stream_state"] 1358 ) 1359 1360 component_type = component_definition.get("type") 1361 if component_definition.get("type") != model_type.__name__: 1362 raise ValueError( 1363 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1364 ) 1365 1366 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1367 1368 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1369 raise ValueError( 1370 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1371 ) 1372 1373 interpolated_start_value = ( 1374 InterpolatedString.create( 1375 incrementing_count_cursor_model.start_value, # type: ignore 1376 parameters=incrementing_count_cursor_model.parameters or {}, 1377 ) 1378 if incrementing_count_cursor_model.start_value 1379 else 0 1380 ) 1381 1382 interpolated_cursor_field = InterpolatedString.create( 1383 incrementing_count_cursor_model.cursor_field, 1384 parameters=incrementing_count_cursor_model.parameters or {}, 1385 ) 1386 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1387 1388 connector_state_converter = IncrementingCountStreamStateConverter( 1389 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1390 ) 1391 1392 return ConcurrentCursor( 1393 stream_name=stream_name, 1394 stream_namespace=stream_namespace, 1395 stream_state=stream_state, 1396 message_repository=message_repository or self._message_repository, 1397 connector_state_manager=self._connector_state_manager, 1398 connector_state_converter=connector_state_converter, 1399 cursor_field=cursor_field, 1400 slice_boundary_fields=None, 1401 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1402 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1403 )
1424 def create_concurrent_cursor_from_perpartition_cursor( 1425 self, 1426 state_manager: ConnectorStateManager, 1427 model_type: Type[BaseModel], 1428 component_definition: ComponentDefinition, 1429 stream_name: str, 1430 stream_namespace: Optional[str], 1431 config: Config, 1432 stream_state: MutableMapping[str, Any], 1433 partition_router: PartitionRouter, 1434 stream_state_migrations: Optional[List[Any]] = None, 1435 **kwargs: Any, 1436 ) -> ConcurrentPerPartitionCursor: 1437 component_type = component_definition.get("type") 1438 if component_definition.get("type") != model_type.__name__: 1439 raise ValueError( 1440 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1441 ) 1442 1443 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1444 1445 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1446 raise ValueError( 1447 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1448 ) 1449 1450 interpolated_cursor_field = InterpolatedString.create( 1451 datetime_based_cursor_model.cursor_field, 1452 parameters=datetime_based_cursor_model.parameters or {}, 1453 ) 1454 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1455 1456 datetime_format = datetime_based_cursor_model.datetime_format 1457 1458 cursor_granularity = ( 1459 parse_duration(datetime_based_cursor_model.cursor_granularity) 1460 if datetime_based_cursor_model.cursor_granularity 1461 else None 1462 ) 1463 1464 connector_state_converter: DateTimeStreamStateConverter 1465 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1466 datetime_format=datetime_format, 1467 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1468 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1469 cursor_granularity=cursor_granularity, 1470 ) 1471 1472 # Create the cursor factory 1473 cursor_factory = ConcurrentCursorFactory( 1474 partial( 1475 self.create_concurrent_cursor_from_datetime_based_cursor, 1476 state_manager=state_manager, 1477 model_type=model_type, 1478 component_definition=component_definition, 1479 stream_name=stream_name, 1480 stream_namespace=stream_namespace, 1481 config=config, 1482 message_repository=NoopMessageRepository(), 1483 stream_state_migrations=stream_state_migrations, 1484 ) 1485 ) 1486 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1487 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1488 use_global_cursor = isinstance( 1489 partition_router, GroupingPartitionRouter 1490 ) or component_definition.get("global_substream_cursor", False) 1491 1492 # Return the concurrent cursor and state converter 1493 return ConcurrentPerPartitionCursor( 1494 cursor_factory=cursor_factory, 1495 partition_router=partition_router, 1496 stream_name=stream_name, 1497 stream_namespace=stream_namespace, 1498 stream_state=stream_state, 1499 message_repository=self._message_repository, # type: ignore 1500 connector_state_manager=state_manager, 1501 connector_state_converter=connector_state_converter, 1502 cursor_field=cursor_field, 1503 use_global_cursor=use_global_cursor, 1504 )
1506 @staticmethod 1507 def create_constant_backoff_strategy( 1508 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1509 ) -> ConstantBackoffStrategy: 1510 return ConstantBackoffStrategy( 1511 backoff_time_in_seconds=model.backoff_time_in_seconds, 1512 config=config, 1513 parameters=model.parameters or {}, 1514 )
1516 def create_cursor_pagination( 1517 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1518 ) -> CursorPaginationStrategy: 1519 if isinstance(decoder, PaginationDecoderDecorator): 1520 inner_decoder = decoder.decoder 1521 else: 1522 inner_decoder = decoder 1523 decoder = PaginationDecoderDecorator(decoder=decoder) 1524 1525 if self._is_supported_decoder_for_pagination(inner_decoder): 1526 decoder_to_use = decoder 1527 else: 1528 raise ValueError( 1529 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1530 ) 1531 1532 return CursorPaginationStrategy( 1533 cursor_value=model.cursor_value, 1534 decoder=decoder_to_use, 1535 page_size=model.page_size, 1536 stop_condition=model.stop_condition, 1537 config=config, 1538 parameters=model.parameters or {}, 1539 )
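As a rough sketch (not taken from this module), a CursorPagination strategy can be built by passing a supported decoder explicitly; the response field referenced in the interpolated expressions is an assumption for this example.

from airbyte_cdk.sources.declarative.decoders import JsonDecoder
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CursorPagination as CursorPaginationModel,
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()
strategy = factory.create_cursor_pagination(
    CursorPaginationModel(
        type="CursorPagination",
        cursor_value="{{ response.next_page }}",                  # assumed response field
        stop_condition="{{ response.next_page is not defined }}",
    ),
    config={},
    decoder=JsonDecoder(parameters={}),  # JsonDecoder is a decoder supported for pagination
)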
1541 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1542 """ 1543 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1544 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1545 :param model: The Pydantic model of the custom component being created 1546 :param config: The custom defined connector config 1547 :return: The declarative component built from the Pydantic model to be used at runtime 1548 """ 1549 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1550 component_fields = get_type_hints(custom_component_class) 1551 model_args = model.dict() 1552 model_args["config"] = config 1553 1554 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1555 # we defer to these arguments over the component's definition 1556 for key, arg in kwargs.items(): 1557 model_args[key] = arg 1558 1559 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1560 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1561 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1562 for model_field, model_value in model_args.items(): 1563 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1564 if ( 1565 isinstance(model_value, dict) 1566 and "type" not in model_value 1567 and model_field in component_fields 1568 ): 1569 derived_type = self._derive_component_type_from_type_hints( 1570 component_fields.get(model_field) 1571 ) 1572 if derived_type: 1573 model_value["type"] = derived_type 1574 1575 if self._is_component(model_value): 1576 model_args[model_field] = self._create_nested_component( 1577 model, model_field, model_value, config 1578 ) 1579 elif isinstance(model_value, list): 1580 vals = [] 1581 for v in model_value: 1582 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1583 derived_type = self._derive_component_type_from_type_hints( 1584 component_fields.get(model_field) 1585 ) 1586 if derived_type: 1587 v["type"] = derived_type 1588 if self._is_component(v): 1589 vals.append(self._create_nested_component(model, model_field, v, config)) 1590 else: 1591 vals.append(v) 1592 model_args[model_field] = vals 1593 1594 kwargs = { 1595 class_field: model_args[class_field] 1596 for class_field in component_fields.keys() 1597 if class_field in model_args 1598 } 1599 return custom_component_class(**kwargs)
Generically creates a custom component based on the model type and a class_name reference to the custom Python class being instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor
Parameters
- model: The Pydantic model of the custom component being created
- config: The custom defined connector config
Returns
The declarative component built from the Pydantic model to be used at runtime
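For illustration, a hypothetical component definition of the shape this method consumes. The class_name path below is invented; only keys that match the custom class's type hints would be forwarded to its constructor, as described above.

# Hypothetical definition: "source_example.components.MyRecordExtractor" does not exist and
# is shown only to illustrate class_name-based resolution.
custom_extractor_definition = {
    "type": "CustomRecordExtractor",
    "class_name": "source_example.components.MyRecordExtractor",
    "field_path": ["data", "items"],  # forwarded only if MyRecordExtractor declares this field
    "unrelated_key": "ignored",       # dropped: not part of the class's type hints
}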
1731 def create_datetime_based_cursor( 1732 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1733 ) -> DatetimeBasedCursor: 1734 start_datetime: Union[str, MinMaxDatetime] = ( 1735 model.start_datetime 1736 if isinstance(model.start_datetime, str) 1737 else self.create_min_max_datetime(model.start_datetime, config) 1738 ) 1739 end_datetime: Union[str, MinMaxDatetime, None] = None 1740 if model.is_data_feed and model.end_datetime: 1741 raise ValueError("Data feed does not support end_datetime") 1742 if model.is_data_feed and model.is_client_side_incremental: 1743 raise ValueError( 1744 "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 1745 ) 1746 if model.end_datetime: 1747 end_datetime = ( 1748 model.end_datetime 1749 if isinstance(model.end_datetime, str) 1750 else self.create_min_max_datetime(model.end_datetime, config) 1751 ) 1752 1753 end_time_option = ( 1754 self._create_component_from_model( 1755 model.end_time_option, config, parameters=model.parameters or {} 1756 ) 1757 if model.end_time_option 1758 else None 1759 ) 1760 start_time_option = ( 1761 self._create_component_from_model( 1762 model.start_time_option, config, parameters=model.parameters or {} 1763 ) 1764 if model.start_time_option 1765 else None 1766 ) 1767 1768 return DatetimeBasedCursor( 1769 cursor_field=model.cursor_field, 1770 cursor_datetime_formats=model.cursor_datetime_formats 1771 if model.cursor_datetime_formats 1772 else [], 1773 cursor_granularity=model.cursor_granularity, 1774 datetime_format=model.datetime_format, 1775 end_datetime=end_datetime, 1776 start_datetime=start_datetime, 1777 step=model.step, 1778 end_time_option=end_time_option, 1779 lookback_window=model.lookback_window, 1780 start_time_option=start_time_option, 1781 partition_field_end=model.partition_field_end, 1782 partition_field_start=model.partition_field_start, 1783 message_repository=self._message_repository, 1784 is_compare_strictly=model.is_compare_strictly, 1785 config=config, 1786 parameters=model.parameters or {}, 1787 )
1789 def create_declarative_stream( 1790 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1791 ) -> DeclarativeStream: 1792 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1793 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1794 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1795 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1796 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1797 1798 primary_key = model.primary_key.__root__ if model.primary_key else None 1799 stop_condition_on_cursor = ( 1800 model.incremental_sync 1801 and hasattr(model.incremental_sync, "is_data_feed") 1802 and model.incremental_sync.is_data_feed 1803 ) 1804 client_side_incremental_sync = None 1805 if ( 1806 model.incremental_sync 1807 and hasattr(model.incremental_sync, "is_client_side_incremental") 1808 and model.incremental_sync.is_client_side_incremental 1809 ): 1810 supported_slicers = ( 1811 DatetimeBasedCursor, 1812 GlobalSubstreamCursor, 1813 PerPartitionWithGlobalCursor, 1814 ) 1815 if combined_slicers and not isinstance(combined_slicers, supported_slicers): 1816 raise ValueError( 1817 "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead" 1818 ) 1819 cursor = ( 1820 combined_slicers 1821 if isinstance( 1822 combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor) 1823 ) 1824 else self._create_component_from_model(model=model.incremental_sync, config=config) 1825 ) 1826 1827 client_side_incremental_sync = {"cursor": cursor} 1828 1829 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1830 cursor_model = model.incremental_sync 1831 1832 end_time_option = ( 1833 self._create_component_from_model( 1834 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1835 ) 1836 if cursor_model.end_time_option 1837 else None 1838 ) 1839 start_time_option = ( 1840 self._create_component_from_model( 1841 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1842 ) 1843 if cursor_model.start_time_option 1844 else None 1845 ) 1846 1847 request_options_provider = DatetimeBasedRequestOptionsProvider( 1848 start_time_option=start_time_option, 1849 end_time_option=end_time_option, 1850 partition_field_start=cursor_model.partition_field_end, 1851 partition_field_end=cursor_model.partition_field_end, 1852 config=config, 1853 parameters=model.parameters or {}, 1854 ) 1855 elif model.incremental_sync and isinstance( 1856 model.incremental_sync, IncrementingCountCursorModel 1857 ): 1858 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1859 1860 start_time_option = ( 1861 self._create_component_from_model( 1862 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1863 config, 1864 parameters=cursor_model.parameters or {}, 1865 ) 1866 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1867 else None 1868 ) 1869 1870 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1871 # the default DatetimeBasedRequestOptionsProvider() sets them to 
start_time/end_time 1872 partition_field_start = "start" 1873 1874 request_options_provider = DatetimeBasedRequestOptionsProvider( 1875 start_time_option=start_time_option, 1876 partition_field_start=partition_field_start, 1877 config=config, 1878 parameters=model.parameters or {}, 1879 ) 1880 else: 1881 request_options_provider = None 1882 1883 transformations = [] 1884 if model.transformations: 1885 for transformation_model in model.transformations: 1886 transformations.append( 1887 self._create_component_from_model(model=transformation_model, config=config) 1888 ) 1889 file_uploader = None 1890 if model.file_uploader: 1891 file_uploader = self._create_component_from_model( 1892 model=model.file_uploader, config=config 1893 ) 1894 1895 retriever = self._create_component_from_model( 1896 model=model.retriever, 1897 config=config, 1898 name=model.name, 1899 primary_key=primary_key, 1900 stream_slicer=combined_slicers, 1901 request_options_provider=request_options_provider, 1902 stop_condition_on_cursor=stop_condition_on_cursor, 1903 client_side_incremental_sync=client_side_incremental_sync, 1904 transformations=transformations, 1905 file_uploader=file_uploader, 1906 incremental_sync=model.incremental_sync, 1907 ) 1908 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 1909 1910 if model.state_migrations: 1911 state_transformations = [ 1912 self._create_component_from_model(state_migration, config, declarative_stream=model) 1913 for state_migration in model.state_migrations 1914 ] 1915 else: 1916 state_transformations = [] 1917 1918 schema_loader: Union[ 1919 CompositeSchemaLoader, 1920 DefaultSchemaLoader, 1921 DynamicSchemaLoader, 1922 InlineSchemaLoader, 1923 JsonFileSchemaLoader, 1924 ] 1925 if model.schema_loader and isinstance(model.schema_loader, list): 1926 nested_schema_loaders = [ 1927 self._create_component_from_model(model=nested_schema_loader, config=config) 1928 for nested_schema_loader in model.schema_loader 1929 ] 1930 schema_loader = CompositeSchemaLoader( 1931 schema_loaders=nested_schema_loaders, parameters={} 1932 ) 1933 elif model.schema_loader: 1934 schema_loader = self._create_component_from_model( 1935 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 1936 config=config, 1937 ) 1938 else: 1939 options = model.parameters or {} 1940 if "name" not in options: 1941 options["name"] = model.name 1942 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 1943 1944 return DeclarativeStream( 1945 name=model.name or "", 1946 primary_key=primary_key, 1947 retriever=retriever, 1948 schema_loader=schema_loader, 1949 stream_cursor_field=cursor_field or "", 1950 state_migrations=state_transformations, 1951 config=config, 1952 parameters=model.parameters or {}, 1953 )
2105 def create_default_error_handler( 2106 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2107 ) -> DefaultErrorHandler: 2108 backoff_strategies = [] 2109 if model.backoff_strategies: 2110 for backoff_strategy_model in model.backoff_strategies: 2111 backoff_strategies.append( 2112 self._create_component_from_model(model=backoff_strategy_model, config=config) 2113 ) 2114 2115 response_filters = [] 2116 if model.response_filters: 2117 for response_filter_model in model.response_filters: 2118 response_filters.append( 2119 self._create_component_from_model(model=response_filter_model, config=config) 2120 ) 2121 response_filters.append( 2122 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2123 ) 2124 2125 return DefaultErrorHandler( 2126 backoff_strategies=backoff_strategies, 2127 max_retries=model.max_retries, 2128 response_filters=response_filters, 2129 config=config, 2130 parameters=model.parameters or {}, 2131 )
2133 def create_default_paginator( 2134 self, 2135 model: DefaultPaginatorModel, 2136 config: Config, 2137 *, 2138 url_base: str, 2139 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2140 decoder: Optional[Decoder] = None, 2141 cursor_used_for_stop_condition: Optional[DeclarativeCursor] = None, 2142 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2143 if decoder: 2144 if self._is_supported_decoder_for_pagination(decoder): 2145 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2146 else: 2147 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2148 else: 2149 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2150 page_size_option = ( 2151 self._create_component_from_model(model=model.page_size_option, config=config) 2152 if model.page_size_option 2153 else None 2154 ) 2155 page_token_option = ( 2156 self._create_component_from_model(model=model.page_token_option, config=config) 2157 if model.page_token_option 2158 else None 2159 ) 2160 pagination_strategy = self._create_component_from_model( 2161 model=model.pagination_strategy, 2162 config=config, 2163 decoder=decoder_to_use, 2164 extractor_model=extractor_model, 2165 ) 2166 if cursor_used_for_stop_condition: 2167 pagination_strategy = StopConditionPaginationStrategyDecorator( 2168 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2169 ) 2170 paginator = DefaultPaginator( 2171 decoder=decoder_to_use, 2172 page_size_option=page_size_option, 2173 page_token_option=page_token_option, 2174 pagination_strategy=pagination_strategy, 2175 url_base=url_base, 2176 config=config, 2177 parameters=model.parameters or {}, 2178 ) 2179 if self._limit_pages_fetched_per_slice: 2180 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2181 return paginator
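A minimal sketch (not from this module) of assembling a DefaultPaginator from nested models. The page size, field name, and url_base are assumptions made for this example.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DefaultPaginator as DefaultPaginatorModel,
    PageIncrement as PageIncrementModel,
    RequestOption as RequestOptionModel,
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()
paginator = factory.create_default_paginator(
    model=DefaultPaginatorModel(
        type="DefaultPaginator",
        pagination_strategy=PageIncrementModel(type="PageIncrement", page_size=100),
        page_token_option=RequestOptionModel(
            type="RequestOption", inject_into="request_parameter", field_name="page"
        ),
    ),
    config={},
    url_base="https://api.example.com",  # illustrative base URL
)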
2183 def create_dpath_extractor( 2184 self, 2185 model: DpathExtractorModel, 2186 config: Config, 2187 decoder: Optional[Decoder] = None, 2188 **kwargs: Any, 2189 ) -> DpathExtractor: 2190 if decoder: 2191 decoder_to_use = decoder 2192 else: 2193 decoder_to_use = JsonDecoder(parameters={}) 2194 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2195 return DpathExtractor( 2196 decoder=decoder_to_use, 2197 field_path=model_field_path, 2198 config=config, 2199 parameters=model.parameters or {}, 2200 )
2221 def create_http_requester( 2222 self, 2223 model: HttpRequesterModel, 2224 config: Config, 2225 decoder: Decoder = JsonDecoder(parameters={}), 2226 query_properties_key: Optional[str] = None, 2227 use_cache: Optional[bool] = None, 2228 *, 2229 name: str, 2230 ) -> HttpRequester: 2231 authenticator = ( 2232 self._create_component_from_model( 2233 model=model.authenticator, 2234 config=config, 2235 url_base=model.url or model.url_base, 2236 name=name, 2237 decoder=decoder, 2238 ) 2239 if model.authenticator 2240 else None 2241 ) 2242 error_handler = ( 2243 self._create_component_from_model(model=model.error_handler, config=config) 2244 if model.error_handler 2245 else DefaultErrorHandler( 2246 backoff_strategies=[], 2247 response_filters=[], 2248 config=config, 2249 parameters=model.parameters or {}, 2250 ) 2251 ) 2252 2253 api_budget = self._api_budget 2254 2255 # Removes QueryProperties components from the interpolated mappings because it has been designed 2256 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2257 # instead of through jinja interpolation 2258 request_parameters: Optional[Union[str, Mapping[str, str]]] 2259 if isinstance(model.request_parameters, Mapping): 2260 request_parameters = self._remove_query_properties(model.request_parameters) 2261 else: 2262 request_parameters = model.request_parameters 2263 2264 request_options_provider = InterpolatedRequestOptionsProvider( 2265 request_body=model.request_body, 2266 request_body_data=model.request_body_data, 2267 request_body_json=model.request_body_json, 2268 request_headers=model.request_headers, 2269 request_parameters=request_parameters, 2270 query_properties_key=query_properties_key, 2271 config=config, 2272 parameters=model.parameters or {}, 2273 ) 2274 2275 assert model.use_cache is not None # for mypy 2276 assert model.http_method is not None # for mypy 2277 2278 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2279 2280 return HttpRequester( 2281 name=name, 2282 url=model.url, 2283 url_base=model.url_base, 2284 path=model.path, 2285 authenticator=authenticator, 2286 error_handler=error_handler, 2287 api_budget=api_budget, 2288 http_method=HttpMethod[model.http_method.value], 2289 request_options_provider=request_options_provider, 2290 config=config, 2291 disable_retries=self._disable_retries, 2292 parameters=model.parameters or {}, 2293 message_repository=self._message_repository, 2294 use_cache=should_use_cache, 2295 decoder=decoder, 2296 stream_response=decoder.is_stream_response() if decoder else False, 2297 )
2299 @staticmethod 2300 def create_http_response_filter( 2301 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2302 ) -> HttpResponseFilter: 2303 if model.action: 2304 action = ResponseAction(model.action.value) 2305 else: 2306 action = None 2307 2308 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2309 2310 http_codes = ( 2311 set(model.http_codes) if model.http_codes else set() 2312 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2313 2314 return HttpResponseFilter( 2315 action=action, 2316 failure_type=failure_type, 2317 error_message=model.error_message or "", 2318 error_message_contains=model.error_message_contains or "", 2319 http_codes=http_codes, 2320 predicate=model.predicate or "", 2321 config=config, 2322 parameters=model.parameters or {}, 2323 )
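For illustration (not part of the module), a filter that ignores 404 responses; the status code and action choice are assumptions for this sketch.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    HttpResponseFilter as HttpResponseFilterModel,
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

# Ignore 404 responses instead of failing the sync.
response_filter = ModelToComponentFactory.create_http_response_filter(
    HttpResponseFilterModel(type="HttpResponseFilter", action="IGNORE", http_codes=[404]),
    config={},
)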
2331 def create_complex_field_type( 2332 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2333 ) -> ComplexFieldType: 2334 items = ( 2335 self._create_component_from_model(model=model.items, config=config) 2336 if isinstance(model.items, ComplexFieldTypeModel) 2337 else model.items 2338 ) 2339 2340 return ComplexFieldType(field_type=model.field_type, items=items)
2342 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2343 target_type = ( 2344 self._create_component_from_model(model=model.target_type, config=config) 2345 if isinstance(model.target_type, ComplexFieldTypeModel) 2346 else model.target_type 2347 ) 2348 2349 return TypesMap( 2350 target_type=target_type, 2351 current_type=model.current_type, 2352 condition=model.condition if model.condition is not None else "True", 2353 )
2355 def create_schema_type_identifier( 2356 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2357 ) -> SchemaTypeIdentifier: 2358 types_mapping = [] 2359 if model.types_mapping: 2360 types_mapping.extend( 2361 [ 2362 self._create_component_from_model(types_map, config=config) 2363 for types_map in model.types_mapping 2364 ] 2365 ) 2366 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2367 [x for x in model.schema_pointer] if model.schema_pointer else [] 2368 ) 2369 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2370 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2371 [x for x in model.type_pointer] if model.type_pointer else None 2372 ) 2373 2374 return SchemaTypeIdentifier( 2375 schema_pointer=model_schema_pointer, 2376 key_pointer=model_key_pointer, 2377 type_pointer=model_type_pointer, 2378 types_mapping=types_mapping, 2379 parameters=model.parameters or {}, 2380 )
2382 def create_dynamic_schema_loader( 2383 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2384 ) -> DynamicSchemaLoader: 2385 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2386 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2387 2388 schema_transformations = [] 2389 if model.schema_transformations: 2390 for transformation_model in model.schema_transformations: 2391 schema_transformations.append( 2392 self._create_component_from_model(model=transformation_model, config=config) 2393 ) 2394 2395 retriever = self._create_component_from_model( 2396 model=model.retriever, 2397 config=config, 2398 name="dynamic_properties", 2399 primary_key=None, 2400 stream_slicer=combined_slicers, 2401 transformations=[], 2402 use_cache=True, 2403 ) 2404 schema_type_identifier = self._create_component_from_model( 2405 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2406 ) 2407 return DynamicSchemaLoader( 2408 retriever=retriever, 2409 config=config, 2410 schema_transformations=schema_transformations, 2411 schema_type_identifier=schema_type_identifier, 2412 parameters=model.parameters or {}, 2413 )
2433 def create_gzip_decoder( 2434 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2435 ) -> Decoder: 2436 _compressed_response_types = { 2437 "gzip", 2438 "x-gzip", 2439 "gzip, deflate", 2440 "x-gzip, deflate", 2441 "application/zip", 2442 "application/gzip", 2443 "application/x-gzip", 2444 "application/x-zip-compressed", 2445 } 2446 2447 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2448 2449 if self._emit_connector_builder_messages: 2450 # This is surprising, but if the response is not streamed, 2451 # CompositeRawDecoder calls response.content, and the requests library uncompresses the data, whereas response.raw, 2452 # which uses urllib3 directly, does not uncompress the data. 2453 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2454 2455 return CompositeRawDecoder.by_headers( 2456 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2457 stream_response=True, 2458 fallback_parser=gzip_parser.inner_parser, 2459 )
2461 @staticmethod 2462 def create_incrementing_count_cursor( 2463 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2464 ) -> DatetimeBasedCursor: 2465 # This should not actually be used anywhere at runtime, but it is needed to pass checks since 2466 # we still parse models into components. The issue is that there is no runtime implementation of an 2467 # IncrementingCountCursor. 2468 # A known and expected limitation of this stub is that running a check with a declared IncrementingCountCursor uses it without a ConcurrentCursor. 2469 return DatetimeBasedCursor( 2470 cursor_field=model.cursor_field, 2471 datetime_format="%Y-%m-%d", 2472 start_datetime="2024-12-12", 2473 config=config, 2474 parameters={}, 2475 )
2520 @staticmethod 2521 def create_jwt_authenticator( 2522 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2523 ) -> JwtAuthenticator: 2524 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2525 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2526 return JwtAuthenticator( 2527 config=config, 2528 parameters=model.parameters or {}, 2529 algorithm=JwtAlgorithm(model.algorithm.value), 2530 secret_key=model.secret_key, 2531 base64_encode_secret_key=model.base64_encode_secret_key, 2532 token_duration=model.token_duration, 2533 header_prefix=model.header_prefix, 2534 kid=jwt_headers.kid, 2535 typ=jwt_headers.typ, 2536 cty=jwt_headers.cty, 2537 iss=jwt_payload.iss, 2538 sub=jwt_payload.sub, 2539 aud=jwt_payload.aud, 2540 additional_jwt_headers=model.additional_jwt_headers, 2541 additional_jwt_payload=model.additional_jwt_payload, 2542 )
2544 def create_list_partition_router( 2545 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2546 ) -> ListPartitionRouter: 2547 request_option = ( 2548 self._create_component_from_model(model.request_option, config) 2549 if model.request_option 2550 else None 2551 ) 2552 return ListPartitionRouter( 2553 cursor_field=model.cursor_field, 2554 request_option=request_option, 2555 values=model.values, 2556 config=config, 2557 parameters=model.parameters or {}, 2558 )
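A minimal sketch (assumed names only) of building a ListPartitionRouter that fans a stream out over a fixed set of values.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ListPartitionRouter as ListPartitionRouterModel,
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()
router = factory.create_list_partition_router(
    ListPartitionRouterModel(
        type="ListPartitionRouter",
        cursor_field="region",        # illustrative partition field
        values=["us", "eu"],          # illustrative partition values
    ),
    config={},
)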
2560 @staticmethod 2561 def create_min_max_datetime( 2562 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2563 ) -> MinMaxDatetime: 2564 return MinMaxDatetime( 2565 datetime=model.datetime, 2566 datetime_format=model.datetime_format or "", 2567 max_datetime=model.max_datetime or "", 2568 min_datetime=model.min_datetime or "", 2569 parameters=model.parameters or {}, 2570 )
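For illustration, the static helper above can be exercised directly; the "start_date" config key is an assumption for this sketch.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    MinMaxDatetime as MinMaxDatetimeModel,
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

min_max = ModelToComponentFactory.create_min_max_datetime(
    MinMaxDatetimeModel(
        type="MinMaxDatetime",
        datetime="{{ config['start_date'] }}",   # resolved against the config below
        datetime_format="%Y-%m-%dT%H:%M:%SZ",
    ),
    config={"start_date": "2024-01-01T00:00:00Z"},
)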
2582 def create_oauth_authenticator( 2583 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2584 ) -> DeclarativeOauth2Authenticator: 2585 profile_assertion = ( 2586 self._create_component_from_model(model.profile_assertion, config=config) 2587 if model.profile_assertion 2588 else None 2589 ) 2590 2591 if model.refresh_token_updater: 2592 # ignore type error because fixing it would have a lot of dependencies, revisit later 2593 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2594 config, 2595 InterpolatedString.create( 2596 model.token_refresh_endpoint, # type: ignore 2597 parameters=model.parameters or {}, 2598 ).eval(config), 2599 access_token_name=InterpolatedString.create( 2600 model.access_token_name or "access_token", parameters=model.parameters or {} 2601 ).eval(config), 2602 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2603 expires_in_name=InterpolatedString.create( 2604 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2605 ).eval(config), 2606 client_id_name=InterpolatedString.create( 2607 model.client_id_name or "client_id", parameters=model.parameters or {} 2608 ).eval(config), 2609 client_id=InterpolatedString.create( 2610 model.client_id, parameters=model.parameters or {} 2611 ).eval(config) 2612 if model.client_id 2613 else model.client_id, 2614 client_secret_name=InterpolatedString.create( 2615 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2616 ).eval(config), 2617 client_secret=InterpolatedString.create( 2618 model.client_secret, parameters=model.parameters or {} 2619 ).eval(config) 2620 if model.client_secret 2621 else model.client_secret, 2622 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2623 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2624 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2625 grant_type_name=InterpolatedString.create( 2626 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2627 ).eval(config), 2628 grant_type=InterpolatedString.create( 2629 model.grant_type or "refresh_token", parameters=model.parameters or {} 2630 ).eval(config), 2631 refresh_request_body=InterpolatedMapping( 2632 model.refresh_request_body or {}, parameters=model.parameters or {} 2633 ).eval(config), 2634 refresh_request_headers=InterpolatedMapping( 2635 model.refresh_request_headers or {}, parameters=model.parameters or {} 2636 ).eval(config), 2637 scopes=model.scopes, 2638 token_expiry_date_format=model.token_expiry_date_format, 2639 message_repository=self._message_repository, 2640 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2641 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2642 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2643 ) 2644 # ignore type error because fixing it would have a lot of dependencies, revisit later 2645 return DeclarativeOauth2Authenticator( # type: ignore 2646 access_token_name=model.access_token_name or "access_token", 2647 access_token_value=model.access_token_value, 2648 client_id_name=model.client_id_name or "client_id", 2649 client_id=model.client_id, 2650 client_secret_name=model.client_secret_name or "client_secret", 2651 client_secret=model.client_secret, 2652 expires_in_name=model.expires_in_name or "expires_in", 2653 grant_type_name=model.grant_type_name or "grant_type", 2654 
grant_type=model.grant_type or "refresh_token", 2655 refresh_request_body=model.refresh_request_body, 2656 refresh_request_headers=model.refresh_request_headers, 2657 refresh_token_name=model.refresh_token_name or "refresh_token", 2658 refresh_token=model.refresh_token, 2659 scopes=model.scopes, 2660 token_expiry_date=model.token_expiry_date, 2661 token_expiry_date_format=model.token_expiry_date_format, 2662 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2663 token_refresh_endpoint=model.token_refresh_endpoint, 2664 config=config, 2665 parameters=model.parameters or {}, 2666 message_repository=self._message_repository, 2667 profile_assertion=profile_assertion, 2668 use_profile_assertion=model.use_profile_assertion, 2669 )
2671 def create_offset_increment( 2672 self, 2673 model: OffsetIncrementModel, 2674 config: Config, 2675 decoder: Decoder, 2676 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2677 **kwargs: Any, 2678 ) -> OffsetIncrement: 2679 if isinstance(decoder, PaginationDecoderDecorator): 2680 inner_decoder = decoder.decoder 2681 else: 2682 inner_decoder = decoder 2683 decoder = PaginationDecoderDecorator(decoder=decoder) 2684 2685 if self._is_supported_decoder_for_pagination(inner_decoder): 2686 decoder_to_use = decoder 2687 else: 2688 raise ValueError( 2689 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2690 ) 2691 2692 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2693 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2694 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2695 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2696 # When we have more time to investigate we can look into reusing the same component. 2697 extractor = ( 2698 self._create_component_from_model( 2699 model=extractor_model, config=config, decoder=decoder_to_use 2700 ) 2701 if extractor_model 2702 else None 2703 ) 2704 2705 return OffsetIncrement( 2706 page_size=model.page_size, 2707 config=config, 2708 decoder=decoder_to_use, 2709 extractor=extractor, 2710 inject_on_first_request=model.inject_on_first_request or False, 2711 parameters=model.parameters or {}, 2712 )
2714 @staticmethod 2715 def create_page_increment( 2716 model: PageIncrementModel, config: Config, **kwargs: Any 2717 ) -> PageIncrement: 2718 return PageIncrement( 2719 page_size=model.page_size, 2720 config=config, 2721 start_from_page=model.start_from_page or 0, 2722 inject_on_first_request=model.inject_on_first_request or False, 2723 parameters=model.parameters or {}, 2724 )
2726 def create_parent_stream_config( 2727 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2728 ) -> ParentStreamConfig: 2729 declarative_stream = self._create_component_from_model( 2730 model.stream, config=config, **kwargs 2731 ) 2732 request_option = ( 2733 self._create_component_from_model(model.request_option, config=config) 2734 if model.request_option 2735 else None 2736 ) 2737 2738 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2739 raise ValueError( 2740 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2741 ) 2742 2743 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2744 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2745 ) 2746 2747 return ParentStreamConfig( 2748 parent_key=model.parent_key, 2749 request_option=request_option, 2750 stream=declarative_stream, 2751 partition_field=model.partition_field, 2752 config=config, 2753 incremental_dependency=model.incremental_dependency or False, 2754 parameters=model.parameters or {}, 2755 extra_fields=model.extra_fields, 2756 lazy_read_pointer=model_lazy_read_pointer, 2757 )
2759 def create_properties_from_endpoint( 2760 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2761 ) -> PropertiesFromEndpoint: 2762 retriever = self._create_component_from_model( 2763 model=model.retriever, 2764 config=config, 2765 name="dynamic_properties", 2766 primary_key=None, 2767 stream_slicer=None, 2768 transformations=[], 2769 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to differ 2770 ) 2771 return PropertiesFromEndpoint( 2772 property_field_path=model.property_field_path, 2773 retriever=retriever, 2774 config=config, 2775 parameters=model.parameters or {}, 2776 )
2778 def create_property_chunking( 2779 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2780 ) -> PropertyChunking: 2781 record_merge_strategy = ( 2782 self._create_component_from_model( 2783 model=model.record_merge_strategy, config=config, **kwargs 2784 ) 2785 if model.record_merge_strategy 2786 else None 2787 ) 2788 2789 property_limit_type: PropertyLimitType 2790 match model.property_limit_type: 2791 case PropertyLimitTypeModel.property_count: 2792 property_limit_type = PropertyLimitType.property_count 2793 case PropertyLimitTypeModel.characters: 2794 property_limit_type = PropertyLimitType.characters 2795 case _: 2796 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}") 2797 2798 return PropertyChunking( 2799 property_limit_type=property_limit_type, 2800 property_limit=model.property_limit, 2801 record_merge_strategy=record_merge_strategy, 2802 config=config, 2803 parameters=model.parameters or {}, 2804 )
2806 def create_query_properties( 2807 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2808 ) -> QueryProperties: 2809 if isinstance(model.property_list, list): 2810 property_list = model.property_list 2811 else: 2812 property_list = self._create_component_from_model( 2813 model=model.property_list, config=config, **kwargs 2814 ) 2815 2816 property_chunking = ( 2817 self._create_component_from_model( 2818 model=model.property_chunking, config=config, **kwargs 2819 ) 2820 if model.property_chunking 2821 else None 2822 ) 2823 2824 return QueryProperties( 2825 property_list=property_list, 2826 always_include_properties=model.always_include_properties, 2827 property_chunking=property_chunking, 2828 config=config, 2829 parameters=model.parameters or {}, 2830 )
2844 @staticmethod 2845 def create_request_option( 2846 model: RequestOptionModel, config: Config, **kwargs: Any 2847 ) -> RequestOption: 2848 inject_into = RequestOptionType(model.inject_into.value) 2849 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 2850 [ 2851 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 2852 for segment in model.field_path 2853 ] 2854 if model.field_path 2855 else None 2856 ) 2857 field_name = ( 2858 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 2859 if model.field_name 2860 else None 2861 ) 2862 return RequestOption( 2863 field_name=field_name, 2864 field_path=field_path, 2865 inject_into=inject_into, 2866 parameters=kwargs.get("parameters", {}), 2867 )
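As a rough sketch, a nested injection into the JSON request body; the field names are assumptions for this example, and field_path targets nested injection, shown here with body_json.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    RequestOption as RequestOptionModel,
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

# Writes the injected value under {"filter": {"updated_at": ...}} in the JSON request body.
request_option = ModelToComponentFactory.create_request_option(
    RequestOptionModel(
        type="RequestOption",
        inject_into="body_json",
        field_path=["filter", "updated_at"],
    ),
    config={},
)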
2869 def create_record_selector( 2870 self, 2871 model: RecordSelectorModel, 2872 config: Config, 2873 *, 2874 name: str, 2875 transformations: List[RecordTransformation] | None = None, 2876 decoder: Decoder | None = None, 2877 client_side_incremental_sync: Dict[str, Any] | None = None, 2878 file_uploader: Optional[DefaultFileUploader] = None, 2879 **kwargs: Any, 2880 ) -> RecordSelector: 2881 extractor = self._create_component_from_model( 2882 model=model.extractor, decoder=decoder, config=config 2883 ) 2884 record_filter = ( 2885 self._create_component_from_model(model.record_filter, config=config) 2886 if model.record_filter 2887 else None 2888 ) 2889 2890 transform_before_filtering = ( 2891 False if model.transform_before_filtering is None else model.transform_before_filtering 2892 ) 2893 if client_side_incremental_sync: 2894 record_filter = ClientSideIncrementalRecordFilterDecorator( 2895 config=config, 2896 parameters=model.parameters, 2897 condition=model.record_filter.condition 2898 if (model.record_filter and hasattr(model.record_filter, "condition")) 2899 else None, 2900 **client_side_incremental_sync, 2901 ) 2902 transform_before_filtering = ( 2903 True 2904 if model.transform_before_filtering is None 2905 else model.transform_before_filtering 2906 ) 2907 2908 if model.schema_normalization is None: 2909 # default to no schema normalization if not set 2910 model.schema_normalization = SchemaNormalizationModel.None_ 2911 2912 schema_normalization = ( 2913 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 2914 if isinstance(model.schema_normalization, SchemaNormalizationModel) 2915 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 2916 ) 2917 2918 return RecordSelector( 2919 extractor=extractor, 2920 name=name, 2921 config=config, 2922 record_filter=record_filter, 2923 transformations=transformations or [], 2924 file_uploader=file_uploader, 2925 schema_normalization=schema_normalization, 2926 parameters=model.parameters or {}, 2927 transform_before_filtering=transform_before_filtering, 2928 )
2938 def create_selective_authenticator( 2939 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 2940 ) -> DeclarativeAuthenticator: 2941 authenticators = { 2942 name: self._create_component_from_model(model=auth, config=config) 2943 for name, auth in model.authenticators.items() 2944 } 2945 # SelectiveAuthenticator will return an instance of DeclarativeAuthenticator or raise a ValueError 2946 return SelectiveAuthenticator( # type: ignore[abstract] 2947 config=config, 2948 authenticators=authenticators, 2949 authenticator_selection_path=model.authenticator_selection_path, 2950 **kwargs, 2951 )
    @staticmethod
    def create_legacy_session_token_authenticator(
        model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any
    ) -> LegacySessionTokenAuthenticator:
        return LegacySessionTokenAuthenticator(
            api_url=url_base,
            header=model.header,
            login_url=model.login_url,
            password=model.password or "",
            session_token=model.session_token or "",
            session_token_response_key=model.session_token_response_key or "",
            username=model.username or "",
            validate_session_url=model.validate_session_url,
            config=config,
            parameters=model.parameters or {},
        )

    def create_simple_retriever(
        self,
        model: SimpleRetrieverModel,
        config: Config,
        *,
        name: str,
        primary_key: Optional[Union[str, List[str], List[List[str]]]],
        stream_slicer: Optional[StreamSlicer],
        request_options_provider: Optional[RequestOptionsProvider] = None,
        stop_condition_on_cursor: bool = False,
        client_side_incremental_sync: Optional[Dict[str, Any]] = None,
        transformations: List[RecordTransformation],
        file_uploader: Optional[DefaultFileUploader] = None,
        incremental_sync: Optional[
            Union[
                IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel
            ]
        ] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Any,
    ) -> SimpleRetriever:
        def _get_url() -> str:
            """
            Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever.
            This is needed because the URL is not set until the requester is created.
            """

            _url = (
                model.requester.url
                if hasattr(model.requester, "url") and model.requester.url is not None
                else requester.get_url()
            )
            _url_base = (
                model.requester.url_base
                if hasattr(model.requester, "url_base") and model.requester.url_base is not None
                else requester.get_url_base()
            )

            return _url or _url_base

        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        record_selector = self._create_component_from_model(
            model=model.record_selector,
            name=name,
            config=config,
            decoder=decoder,
            transformations=transformations,
            client_side_incremental_sync=client_side_incremental_sync,
            file_uploader=file_uploader,
        )

        query_properties: Optional[QueryProperties] = None
        query_properties_key: Optional[str] = None
        if self._query_properties_in_request_parameters(model.requester):
            # It is better to raise an explicit error if PropertiesFromEndpoint is defined in multiple
            # places than to silently default to request_parameters, which isn't clearly documented
            if (
                hasattr(model.requester, "fetch_properties_from_endpoint")
                and model.requester.fetch_properties_from_endpoint
            ):
                raise ValueError(
                    f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
                )

            query_properties_definitions = []
            for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters()
                if isinstance(request_parameter, QueryPropertiesModel):
                    query_properties_key = key
                    query_properties_definitions.append(request_parameter)

            if len(query_properties_definitions) > 1:
                raise ValueError(
                    f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
                )

            if len(query_properties_definitions) == 1:
                query_properties = self._create_component_from_model(
                    model=query_properties_definitions[0], config=config
                )
        elif (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            query_properties_definition = QueryPropertiesModel(
                type="QueryProperties",
                property_list=model.requester.fetch_properties_from_endpoint,
                always_include_properties=None,
                property_chunking=None,
            )  # type: ignore # $parameters has a default value

            query_properties = self.create_query_properties(
                model=query_properties_definition,
                config=config,
            )

        requester = self._create_component_from_model(
            model=model.requester,
            decoder=decoder,
            name=name,
            query_properties_key=query_properties_key,
            use_cache=use_cache,
            config=config,
        )

        # Define cursor only if per partition or common incremental support is needed
        cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None

        if (
            not isinstance(stream_slicer, DatetimeBasedCursor)
            or type(stream_slicer) is not DatetimeBasedCursor
        ):
            # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
            # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
            # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor can still act as the SimpleRetriever's
            # request_options_provider
            request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={})
        elif not request_options_provider:
            request_options_provider = DefaultRequestOptionsProvider(parameters={})

        stream_slicer = stream_slicer or SinglePartitionRouter(parameters={})

        cursor_used_for_stop_condition = cursor if stop_condition_on_cursor else None
        paginator = (
            self._create_component_from_model(
                model=model.paginator,
                config=config,
                url_base=_get_url(),
                extractor_model=model.record_selector.extractor,
                decoder=decoder,
                cursor_used_for_stop_condition=cursor_used_for_stop_condition,
            )
            if model.paginator
            else NoPagination(parameters={})
        )

        ignore_stream_slicer_parameters_on_paginated_requests = (
            model.ignore_stream_slicer_parameters_on_paginated_requests or False
        )

        if (
            model.partition_router
            and isinstance(model.partition_router, SubstreamPartitionRouterModel)
            and not bool(self._connector_state_manager.get_stream_state(name, None))
            and any(
                parent_stream_config.lazy_read_pointer
                for parent_stream_config in model.partition_router.parent_stream_configs
            )
        ):
            if incremental_sync:
                if incremental_sync.type != "DatetimeBasedCursor":
                    raise ValueError(
                        f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                    )

                elif incremental_sync.step or incremental_sync.cursor_granularity:
                    raise ValueError(
                        f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                    )

            if model.decoder and model.decoder.type != "JsonDecoder":
                raise ValueError(
                    f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
                )

            return LazySimpleRetriever(
                name=name,
                paginator=paginator,
                primary_key=primary_key,
                requester=requester,
                record_selector=record_selector,
                stream_slicer=stream_slicer,
                request_option_provider=request_options_provider,
                cursor=cursor,
                config=config,
                ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
                parameters=model.parameters or {},
            )

        if self._limit_slices_fetched or self._emit_connector_builder_messages:
            return SimpleRetrieverTestReadDecorator(
                name=name,
                paginator=paginator,
                primary_key=primary_key,
                requester=requester,
                record_selector=record_selector,
                stream_slicer=stream_slicer,
                request_option_provider=request_options_provider,
                cursor=cursor,
                config=config,
                maximum_number_of_slices=self._limit_slices_fetched or 5,
                ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
                parameters=model.parameters or {},
            )
        return SimpleRetriever(
            name=name,
            paginator=paginator,
            primary_key=primary_key,
            requester=requester,
            record_selector=record_selector,
            stream_slicer=stream_slicer,
            request_option_provider=request_options_provider,
            cursor=cursor,
            config=config,
            ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
            additional_query_properties=query_properties,
            parameters=model.parameters or {},
        )

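    # Summary of create_simple_retriever's selection logic (descriptive note, not executable code):
    #   - a SubstreamPartitionRouter with a lazy_read_pointer on a parent stream and no prior stream
    #     state yields a LazySimpleRetriever (which only supports DatetimeBasedCursor and JsonDecoder);
    #   - test reads (a slice limit or connector builder messages) yield a
    #     SimpleRetrieverTestReadDecorator capped at self._limit_slices_fetched or 5 slices;
    #   - otherwise a plain SimpleRetriever is returned.
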
    def create_state_delegating_stream(
        self,
        model: StateDelegatingStreamModel,
        config: Config,
        has_parent_state: Optional[bool] = None,
        **kwargs: Any,
    ) -> DeclarativeStream:
        if (
            model.full_refresh_stream.name != model.name
            or model.name != model.incremental_stream.name
        ):
            raise ValueError(
                f"state_delegating_stream, full_refresh_stream and incremental_stream must have equal names. Instead got {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
            )

        stream_model = (
            model.incremental_stream
            if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state
            else model.full_refresh_stream
        )

        return self._create_component_from_model(stream_model, config=config, **kwargs)  # type: ignore[no-any-return]  # stream_model describes a stream, so a DeclarativeStream will be created

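    # Illustrative sketch (assumed manifest values): a StateDelegatingStream declares two stream
    # definitions that share its name; the incremental_stream is used once stream state (or parent
    # state) exists, otherwise the full_refresh_stream is used.
    #
    #   type: StateDelegatingStream
    #   name: "orders"
    #   full_refresh_stream: { ... }   # DeclarativeStream named "orders", no incremental_sync
    #   incremental_stream: { ... }    # DeclarativeStream named "orders", with incremental_sync
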
    def create_async_retriever(
        self,
        model: AsyncRetrieverModel,
        config: Config,
        *,
        name: str,
        primary_key: Optional[
            Union[str, List[str], List[List[str]]]
        ],  # this seems to be needed to match create_simple_retriever
        stream_slicer: Optional[StreamSlicer],
        client_side_incremental_sync: Optional[Dict[str, Any]] = None,
        transformations: List[RecordTransformation],
        **kwargs: Any,
    ) -> AsyncRetriever:
        def _get_download_retriever() -> SimpleRetrieverTestReadDecorator | SimpleRetriever:
            record_selector = RecordSelector(
                extractor=download_extractor,
                name=name,
                record_filter=None,
                transformations=transformations,
                schema_normalization=TypeTransformer(TransformConfig.NoTransform),
                config=config,
                parameters={},
            )
            paginator = (
                self._create_component_from_model(
                    model=model.download_paginator,
                    decoder=decoder,
                    config=config,
                    url_base="",
                )
                if model.download_paginator
                else NoPagination(parameters={})
            )
            maximum_number_of_slices = self._limit_slices_fetched or 5

            if self._limit_slices_fetched or self._emit_connector_builder_messages:
                return SimpleRetrieverTestReadDecorator(
                    requester=download_requester,
                    record_selector=record_selector,
                    primary_key=None,
                    name=job_download_components_name,
                    paginator=paginator,
                    config=config,
                    parameters={},
                    maximum_number_of_slices=maximum_number_of_slices,
                )

            return SimpleRetriever(
                requester=download_requester,
                record_selector=record_selector,
                primary_key=None,
                name=job_download_components_name,
                paginator=paginator,
                config=config,
                parameters={},
            )

        def _get_job_timeout() -> datetime.timedelta:
            user_defined_timeout: Optional[int] = (
                int(
                    InterpolatedString.create(
                        str(model.polling_job_timeout),
                        parameters={},
                    ).eval(config)
                )
                if model.polling_job_timeout
                else None
            )

            # use the user-defined timeout during a test read, or default to 15 minutes
            test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15)
            # the default value outside the connector builder is 60 minutes
            default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60)

            return (
                test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout
            )

        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        record_selector = self._create_component_from_model(
            model=model.record_selector,
            config=config,
            decoder=decoder,
            name=name,
            transformations=transformations,
            client_side_incremental_sync=client_side_incremental_sync,
        )
        stream_slicer = stream_slicer or SinglePartitionRouter(parameters={})
        creation_requester = self._create_component_from_model(
            model=model.creation_requester,
            decoder=decoder,
            config=config,
            name=f"job creation - {name}",
        )
        polling_requester = self._create_component_from_model(
            model=model.polling_requester,
            decoder=decoder,
            config=config,
            name=f"job polling - {name}",
        )
        job_download_components_name = f"job download - {name}"
        download_decoder = (
            self._create_component_from_model(model=model.download_decoder, config=config)
            if model.download_decoder
            else JsonDecoder(parameters={})
        )
        download_extractor = (
            self._create_component_from_model(
                model=model.download_extractor,
                config=config,
                decoder=download_decoder,
                parameters=model.parameters,
            )
            if model.download_extractor
            else DpathExtractor(
                [],
                config=config,
                decoder=download_decoder,
                parameters=model.parameters or {},
            )
        )
        download_requester = self._create_component_from_model(
            model=model.download_requester,
            decoder=download_decoder,
            config=config,
            name=job_download_components_name,
        )
        download_retriever = _get_download_retriever()
        abort_requester = (
            self._create_component_from_model(
                model=model.abort_requester,
                decoder=decoder,
                config=config,
                name=f"job abort - {name}",
            )
            if model.abort_requester
            else None
        )
        delete_requester = (
            self._create_component_from_model(
                model=model.delete_requester,
                decoder=decoder,
                config=config,
                name=f"job delete - {name}",
            )
            if model.delete_requester
            else None
        )
        download_target_requester = (
            self._create_component_from_model(
                model=model.download_target_requester,
                decoder=decoder,
                config=config,
                name=f"job extract_url - {name}",
            )
            if model.download_target_requester
            else None
        )
        status_extractor = self._create_component_from_model(
            model=model.status_extractor, decoder=decoder, config=config, name=name
        )
        download_target_extractor = self._create_component_from_model(
            model=model.download_target_extractor,
            decoder=decoder,
            config=config,
            name=name,
        )

        job_repository: AsyncJobRepository = AsyncHttpJobRepository(
            creation_requester=creation_requester,
            polling_requester=polling_requester,
            download_retriever=download_retriever,
            download_target_requester=download_target_requester,
            abort_requester=abort_requester,
            delete_requester=delete_requester,
            status_extractor=status_extractor,
            status_mapping=self._create_async_job_status_mapping(model.status_mapping, config),
            download_target_extractor=download_target_extractor,
            job_timeout=_get_job_timeout(),
        )

        async_job_partition_router = AsyncJobPartitionRouter(
            job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
                job_repository,
                stream_slices,
                self._job_tracker,
                self._message_repository,
                # FIXME: work would need to be done here in order to detect if a stream has a parent stream that is bulk
                has_bulk_parent=False,
                # set `job_max_retry` to 1 for the Connector Builder use case;
                # `None` means the default of 3 retry attempts is used under the hood
                job_max_retry=1 if self._emit_connector_builder_messages else None,
            ),
            stream_slicer=stream_slicer,
            config=config,
            parameters=model.parameters or {},
        )

        return AsyncRetriever(
            record_selector=record_selector,
            stream_slicer=async_job_partition_router,
            config=config,
            parameters=model.parameters or {},
        )

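    # Descriptive note on the async flow assembled above (not executable code): the
    # AsyncHttpJobRepository drives job creation -> polling -> download (plus optional abort and
    # delete requesters), the AsyncJobPartitionRouter creates one orchestrated job per stream slice,
    # and the job timeout defaults to 15 minutes for connector builder test reads and 60 minutes for
    # regular syncs unless polling_job_timeout overrides it.
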
    def create_substream_partition_router(
        self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any
    ) -> SubstreamPartitionRouter:
        parent_stream_configs = []
        if model.parent_stream_configs:
            parent_stream_configs.extend(
                [
                    self._create_message_repository_substream_wrapper(
                        model=parent_stream_config, config=config, **kwargs
                    )
                    for parent_stream_config in model.parent_stream_configs
                ]
            )

        return SubstreamPartitionRouter(
            parent_stream_configs=parent_stream_configs,
            parameters=model.parameters or {},
            config=config,
        )

    @staticmethod
    def create_wait_time_from_header(
        model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any
    ) -> WaitTimeFromHeaderBackoffStrategy:
        return WaitTimeFromHeaderBackoffStrategy(
            header=model.header,
            parameters=model.parameters or {},
            config=config,
            regex=model.regex,
            max_waiting_time_in_seconds=model.max_waiting_time_in_seconds
            if model.max_waiting_time_in_seconds is not None
            else None,
        )

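    # Illustrative sketch (assumed manifest values): a WaitTimeFromHeader backoff strategy reads the
    # wait time in seconds from a response header, optionally capped by max_waiting_time_in_seconds.
    #
    #   backoff_strategies:
    #     - type: WaitTimeFromHeader
    #       header: "Retry-After"
    #       max_waiting_time_in_seconds: 300
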
    @staticmethod
    def create_wait_until_time_from_header(
        model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any
    ) -> WaitUntilTimeFromHeaderBackoffStrategy:
        return WaitUntilTimeFromHeaderBackoffStrategy(
            header=model.header,
            parameters=model.parameters or {},
            config=config,
            min_wait=model.min_wait,
            regex=model.regex,
        )

    @staticmethod
    def create_components_mapping_definition(
        model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any
    ) -> ComponentMappingDefinition:
        interpolated_value = InterpolatedString.create(
            model.value, parameters=model.parameters or {}
        )
        field_path = [
            InterpolatedString.create(path, parameters=model.parameters or {})
            for path in model.field_path
        ]
        return ComponentMappingDefinition(
            field_path=field_path,  # type: ignore[arg-type]  # field_path can be str and InterpolatedString
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            parameters=model.parameters or {},
        )

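    # Illustrative sketch (assumed manifest values, including the interpolation variable name): a
    # ComponentMappingDefinition names a field inside the resolved stream template and the
    # interpolated value to write there, e.g. filling in each dynamic stream's name from a record
    # returned by the resolver's retriever.
    #
    #   components_mapping:
    #     - type: ComponentMappingDefinition
    #       field_path: ["name"]
    #       value: "{{ components_values['name'] }}"
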
    def create_http_components_resolver(
        self, model: HttpComponentsResolverModel, config: Config
    ) -> Any:
        stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config)
        combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer)

        retriever = self._create_component_from_model(
            model=model.retriever,
            config=config,
            name="",
            primary_key=None,
            stream_slicer=stream_slicer if stream_slicer else combined_slicers,
            transformations=[],
        )

        components_mapping = [
            self._create_component_from_model(
                model=components_mapping_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    components_mapping_definition_model.value_type
                ),
                config=config,
            )
            for components_mapping_definition_model in model.components_mapping
        ]

        return HttpComponentsResolver(
            retriever=retriever,
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_stream_config(
        model: StreamConfigModel, config: Config, **kwargs: Any
    ) -> StreamConfig:
        model_configs_pointer: List[Union[InterpolatedString, str]] = (
            [x for x in model.configs_pointer] if model.configs_pointer else []
        )

        return StreamConfig(
            configs_pointer=model_configs_pointer,
            parameters=model.parameters or {},
        )

    def create_config_components_resolver(
        self, model: ConfigComponentsResolverModel, config: Config
    ) -> Any:
        stream_config = self._create_component_from_model(
            model.stream_config, config=config, parameters=model.parameters or {}
        )

        components_mapping = [
            self._create_component_from_model(
                model=components_mapping_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    components_mapping_definition_model.value_type
                ),
                config=config,
            )
            for components_mapping_definition_model in model.components_mapping
        ]

        return ConfigComponentsResolver(
            stream_config=stream_config,
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )

    def create_http_api_budget(
        self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any
    ) -> HttpAPIBudget:
        policies = [
            self._create_component_from_model(model=policy, config=config)
            for policy in model.policies
        ]

        return HttpAPIBudget(
            policies=policies,
            ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset",
            ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining",
            status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429],
        )

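    # Illustrative sketch (assumed manifest values): an HTTPAPIBudget groups one or more call rate
    # policies and can override the rate-limit headers it inspects (the fallbacks above are
    # "ratelimit-reset", "ratelimit-remaining" and status code 429).
    #
    #   api_budget:
    #     type: HTTPAPIBudget
    #     ratelimit_reset_header: "X-RateLimit-Reset"
    #     policies:
    #       - type: UnlimitedCallRatePolicy
    #         matchers: []
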
    def create_fixed_window_call_rate_policy(
        self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> FixedWindowCallRatePolicy:
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]

        # Set the initial reset timestamp to 10 days from now.
        # This value will be updated by the first request.
        return FixedWindowCallRatePolicy(
            next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10),
            period=parse_duration(model.period),
            call_limit=model.call_limit,
            matchers=matchers,
        )

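    # Illustrative sketch (assumed manifest values): `period` is an ISO 8601 duration parsed with
    # isodate.parse_duration, so a policy allowing 100 calls per minute on a given path could look like:
    #
    #   - type: FixedWindowCallRatePolicy
    #     period: "PT1M"
    #     call_limit: 100
    #     matchers:
    #       - type: HttpRequestRegexMatcher
    #         url_path_pattern: "/v1/items"
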
    def create_file_uploader(
        self, model: FileUploaderModel, config: Config, **kwargs: Any
    ) -> FileUploader:
        name = "File Uploader"
        requester = self._create_component_from_model(
            model=model.requester,
            config=config,
            name=name,
            **kwargs,
        )
        download_target_extractor = self._create_component_from_model(
            model=model.download_target_extractor,
            config=config,
            name=name,
            **kwargs,
        )
        emit_connector_builder_messages = self._emit_connector_builder_messages
        file_uploader = DefaultFileUploader(
            requester=requester,
            download_target_extractor=download_target_extractor,
            config=config,
            file_writer=NoopFileWriter()
            if emit_connector_builder_messages
            else LocalFileSystemFileWriter(),
            parameters=model.parameters or {},
            filename_extractor=model.filename_extractor if model.filename_extractor else None,
        )

        return (
            ConnectorBuilderFileUploader(file_uploader)
            if emit_connector_builder_messages
            else file_uploader
        )

    def create_moving_window_call_rate_policy(
        self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> MovingWindowCallRatePolicy:
        rates = [
            self._create_component_from_model(model=rate, config=config) for rate in model.rates
        ]
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]
        return MovingWindowCallRatePolicy(
            rates=rates,
            matchers=matchers,
        )

    def create_unlimited_call_rate_policy(
        self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> UnlimitedCallRatePolicy:
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]

        return UnlimitedCallRatePolicy(
            matchers=matchers,
        )

    def create_http_request_matcher(
        self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any
    ) -> HttpRequestRegexMatcher:
        return HttpRequestRegexMatcher(
            method=model.method,
            url_base=model.url_base,
            url_path_pattern=model.url_path_pattern,
            params=model.params,
            headers=model.headers,
        )

    def create_grouping_partition_router(
        self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any
    ) -> GroupingPartitionRouter:
        underlying_router = self._create_component_from_model(
            model=model.underlying_partition_router, config=config
        )
        if model.group_size < 1:
            raise ValueError(f"Group size must be greater than 0, got {model.group_size}")

        # Request options in underlying partition routers are not supported for GroupingPartitionRouter
        # because they are specific to individual partitions and cannot be aggregated or handled
        # when grouping, potentially leading to incorrect API calls. Any request customization
        # should be managed at the stream level through the requester's configuration.
        if isinstance(underlying_router, SubstreamPartitionRouter):
            if any(
                parent_config.request_option
                for parent_config in underlying_router.parent_stream_configs
            ):
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        if isinstance(underlying_router, ListPartitionRouter):
            if underlying_router.request_option:
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        return GroupingPartitionRouter(
            group_size=model.group_size,
            underlying_partition_router=underlying_router,
            deduplicate=model.deduplicate if model.deduplicate is not None else True,
            config=config,
        )

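    # Illustrative sketch (assumed manifest values): a GroupingPartitionRouter batches the partitions
    # emitted by its underlying router, e.g. grouping parent ids ten at a time so a single request can
    # cover several partitions; duplicate partitions are dropped unless deduplicate is set to false.
    #
    #   partition_router:
    #     type: GroupingPartitionRouter
    #     group_size: 10
    #     underlying_partition_router:
    #       type: SubstreamPartitionRouter
    #       parent_stream_configs: [ ... ]
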