airbyte_cdk.sources.declarative.parsers.model_to_component_factory

#
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
#

from __future__ import annotations

import datetime
import importlib
import inspect
import re
from functools import partial
from typing import (
    Any,
    Callable,
    Dict,
    List,
    Mapping,
    MutableMapping,
    Optional,
    Type,
    Union,
    cast,
    get_args,
    get_origin,
    get_type_hints,
)

from isodate import parse_duration
from pydantic.v1 import BaseModel
from requests import Response

from airbyte_cdk.connector_builder.models import (
    LogMessage as ConnectorBuilderLogMessage,
)
from airbyte_cdk.models import FailureType, Level
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator
from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker
from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository
from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus
from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator
from airbyte_cdk.sources.declarative.auth.declarative_authenticator import (
    DeclarativeAuthenticator,
    NoAuth,
)
from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm
from airbyte_cdk.sources.declarative.auth.oauth import (
    DeclarativeSingleUseRefreshTokenOauth2Authenticator,
)
from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator
from airbyte_cdk.sources.declarative.auth.token import (
    ApiKeyAuthenticator,
    BasicHttpAuthenticator,
    BearerAuthenticator,
    LegacySessionTokenAuthenticator,
)
from airbyte_cdk.sources.declarative.auth.token_provider import (
    InterpolatedStringTokenProvider,
    SessionTokenProvider,
    TokenProvider,
)
from airbyte_cdk.sources.declarative.checks import (
    CheckDynamicStream,
    CheckStream,
    DynamicStreamCheckConfig,
)
from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel
from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime
from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream
from airbyte_cdk.sources.declarative.decoders import (
    Decoder,
    IterableDecoder,
    JsonDecoder,
    PaginationDecoderDecorator,
    XmlDecoder,
    ZipfileDecoder,
)
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
    CompositeRawDecoder,
    CsvParser,
    GzipParser,
    JsonLineParser,
    JsonParser,
    Parser,
)
from airbyte_cdk.sources.declarative.extractors import (
    DpathExtractor,
    RecordFilter,
    RecordSelector,
    ResponseToFileExtractor,
)
from airbyte_cdk.sources.declarative.extractors.record_filter import (
    ClientSideIncrementalRecordFilterDecorator,
)
from airbyte_cdk.sources.declarative.incremental import (
    ChildPartitionResumableFullRefreshCursor,
    ConcurrentCursorFactory,
    ConcurrentPerPartitionCursor,
    CursorFactory,
    DatetimeBasedCursor,
    DeclarativeCursor,
    GlobalSubstreamCursor,
    PerPartitionCursor,
    PerPartitionWithGlobalCursor,
    ResumableFullRefreshCursor,
)
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping
from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import (
    LegacyToPerPartitionStateMigration,
)
from airbyte_cdk.sources.declarative.models import (
    CustomStateMigration,
)
from airbyte_cdk.sources.declarative.models.base_model_with_deprecations import (
    DEPRECATION_LOGS_TAG,
    BaseModelWithDeprecations,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    AddedFieldDefinition as AddedFieldDefinitionModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    AddFields as AddFieldsModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ApiKeyAuthenticator as ApiKeyAuthenticatorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    AsyncJobStatusMap as AsyncJobStatusMapModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    AsyncRetriever as AsyncRetrieverModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    BasicHttpAuthenticator as BasicHttpAuthenticatorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    BearerAuthenticator as BearerAuthenticatorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CheckDynamicStream as CheckDynamicStreamModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CheckStream as CheckStreamModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ComplexFieldType as ComplexFieldTypeModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ComponentMappingDefinition as ComponentMappingDefinitionModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CompositeErrorHandler as CompositeErrorHandlerModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ConcurrencyLevel as ConcurrencyLevelModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ConfigAddFields as ConfigAddFieldsModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ConfigComponentsResolver as ConfigComponentsResolverModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ConfigMigration as ConfigMigrationModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ConfigRemapField as ConfigRemapFieldModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ConfigRemoveFields as ConfigRemoveFieldsModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ConstantBackoffStrategy as ConstantBackoffStrategyModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CsvDecoder as CsvDecoderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CursorPagination as CursorPaginationModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomAuthenticator as CustomAuthenticatorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomBackoffStrategy as CustomBackoffStrategyModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomConfigTransformation as CustomConfigTransformationModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomDecoder as CustomDecoderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomErrorHandler as CustomErrorHandlerModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomIncrementalSync as CustomIncrementalSyncModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomPaginationStrategy as CustomPaginationStrategyModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomPartitionRouter as CustomPartitionRouterModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomRecordExtractor as CustomRecordExtractorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomRecordFilter as CustomRecordFilterModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomRequester as CustomRequesterModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomRetriever as CustomRetrieverModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomSchemaLoader as CustomSchemaLoader,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomSchemaNormalization as CustomSchemaNormalizationModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomTransformation as CustomTransformationModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CustomValidationStrategy as CustomValidationStrategyModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DatetimeBasedCursor as DatetimeBasedCursorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DeclarativeStream as DeclarativeStreamModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DefaultErrorHandler as DefaultErrorHandlerModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DefaultPaginator as DefaultPaginatorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DpathExtractor as DpathExtractorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DpathFlattenFields as DpathFlattenFieldsModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DpathValidator as DpathValidatorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DynamicSchemaLoader as DynamicSchemaLoaderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DynamicStreamCheckConfig as DynamicStreamCheckConfigModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    FileUploader as FileUploaderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    FlattenFields as FlattenFieldsModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    GroupByKeyMergeStrategy as GroupByKeyMergeStrategyModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    GroupingPartitionRouter as GroupingPartitionRouterModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    GzipDecoder as GzipDecoderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    HTTPAPIBudget as HTTPAPIBudgetModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    HttpComponentsResolver as HttpComponentsResolverModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    HttpRequester as HttpRequesterModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    HttpRequestRegexMatcher as HttpRequestRegexMatcherModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    HttpResponseFilter as HttpResponseFilterModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    IncrementingCountCursor as IncrementingCountCursorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    InlineSchemaLoader as InlineSchemaLoaderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    IterableDecoder as IterableDecoderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    JsonDecoder as JsonDecoderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    JsonFileSchemaLoader as JsonFileSchemaLoaderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    JsonlDecoder as JsonlDecoderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    JwtAuthenticator as JwtAuthenticatorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    JwtHeaders as JwtHeadersModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    JwtPayload as JwtPayloadModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    KeysReplace as KeysReplaceModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    KeysToLower as KeysToLowerModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    KeysToSnakeCase as KeysToSnakeCaseModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ListPartitionRouter as ListPartitionRouterModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    MinMaxDatetime as MinMaxDatetimeModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    NoAuth as NoAuthModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    NoPagination as NoPaginationModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    OAuthAuthenticator as OAuthAuthenticatorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    OffsetIncrement as OffsetIncrementModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    PageIncrement as PageIncrementModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ParametrizedComponentsResolver as ParametrizedComponentsResolverModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ParentStreamConfig as ParentStreamConfigModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    PredicateValidator as PredicateValidatorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    PropertiesFromEndpoint as PropertiesFromEndpointModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    PropertyChunking as PropertyChunkingModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    PropertyLimitType as PropertyLimitTypeModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    QueryProperties as QueryPropertiesModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    Rate as RateModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    RecordFilter as RecordFilterModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    RecordSelector as RecordSelectorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    RemoveFields as RemoveFieldsModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    RequestOption as RequestOptionModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    RequestPath as RequestPathModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ResponseToFileExtractor as ResponseToFileExtractorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    SchemaNormalization as SchemaNormalizationModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    SchemaTypeIdentifier as SchemaTypeIdentifierModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    SelectiveAuthenticator as SelectiveAuthenticatorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    SessionTokenAuthenticator as SessionTokenAuthenticatorModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    SimpleRetriever as SimpleRetrieverModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    StateDelegatingStream as StateDelegatingStreamModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    StreamConfig as StreamConfigModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    SubstreamPartitionRouter as SubstreamPartitionRouterModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    TypesMap as TypesMapModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ValidateAdheresToSchema as ValidateAdheresToSchemaModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import ValueType
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    WaitTimeFromHeader as WaitTimeFromHeaderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    XmlDecoder as XmlDecoderModel,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ZipfileDecoder as ZipfileDecoderModel,
)
from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
    COMPONENTS_MODULE_NAME,
    SDM_COMPONENTS_MODULE_NAME,
)
from airbyte_cdk.sources.declarative.partition_routers import (
    CartesianProductStreamSlicer,
    GroupingPartitionRouter,
    ListPartitionRouter,
    PartitionRouter,
    SinglePartitionRouter,
    SubstreamPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import (
    AsyncJobPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import (
    ParentStreamConfig,
)
from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption
from airbyte_cdk.sources.declarative.requesters.error_handlers import (
    CompositeErrorHandler,
    DefaultErrorHandler,
    HttpResponseFilter,
)
from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import (
    ConstantBackoffStrategy,
    ExponentialBackoffStrategy,
    WaitTimeFromHeaderBackoffStrategy,
    WaitUntilTimeFromHeaderBackoffStrategy,
)
from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository
from airbyte_cdk.sources.declarative.requesters.paginators import (
    DefaultPaginator,
    NoPagination,
    PaginatorTestReadDecorator,
)
from airbyte_cdk.sources.declarative.requesters.paginators.strategies import (
    CursorPaginationStrategy,
    CursorStopCondition,
    OffsetIncrement,
    PageIncrement,
    StopConditionPaginationStrategyDecorator,
)
from airbyte_cdk.sources.declarative.requesters.query_properties import (
    PropertiesFromEndpoint,
    PropertyChunking,
    QueryProperties,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import (
    PropertyLimitType,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import (
    GroupByKey,
)
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType
from airbyte_cdk.sources.declarative.requesters.request_options import (
    DatetimeBasedRequestOptionsProvider,
    DefaultRequestOptionsProvider,
    InterpolatedRequestOptionsProvider,
    RequestOptionsProvider,
)
from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester
from airbyte_cdk.sources.declarative.resolvers import (
    ComponentMappingDefinition,
    ConfigComponentsResolver,
    HttpComponentsResolver,
    ParametrizedComponentsResolver,
    StreamConfig,
    StreamParametersDefinition,
)
from airbyte_cdk.sources.declarative.retrievers import (
    AsyncRetriever,
    LazySimpleRetriever,
    SimpleRetriever,
)
from airbyte_cdk.sources.declarative.retrievers.file_uploader import (
    ConnectorBuilderFileUploader,
    DefaultFileUploader,
    FileUploader,
    LocalFileSystemFileWriter,
    NoopFileWriter,
)
from airbyte_cdk.sources.declarative.schema import (
    ComplexFieldType,
    DefaultSchemaLoader,
    DynamicSchemaLoader,
    InlineSchemaLoader,
    JsonFileSchemaLoader,
    SchemaTypeIdentifier,
    TypesMap,
)
from airbyte_cdk.sources.declarative.schema.composite_schema_loader import CompositeSchemaLoader
from airbyte_cdk.sources.declarative.spec import ConfigMigration, Spec
from airbyte_cdk.sources.declarative.stream_slicers import (
    StreamSlicer,
    StreamSlicerTestReadDecorator,
)
from airbyte_cdk.sources.declarative.transformations import (
    AddFields,
    RecordTransformation,
    RemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition
from airbyte_cdk.sources.declarative.transformations.config_transformations import (
    ConfigAddFields,
    ConfigRemapField,
    ConfigRemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.config_transformations.config_transformation import (
    ConfigTransformation,
)
from airbyte_cdk.sources.declarative.transformations.dpath_flatten_fields import (
    DpathFlattenFields,
    KeyTransformation,
)
from airbyte_cdk.sources.declarative.transformations.flatten_fields import (
    FlattenFields,
)
from airbyte_cdk.sources.declarative.transformations.keys_replace_transformation import (
    KeysReplaceTransformation,
)
from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import (
    KeysToLowerTransformation,
)
from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import (
    KeysToSnakeCaseTransformation,
)
from airbyte_cdk.sources.declarative.validators import (
    DpathValidator,
    PredicateValidator,
    ValidateAdheresToSchema,
)
from airbyte_cdk.sources.http_logger import format_http_message
from airbyte_cdk.sources.message import (
    InMemoryMessageRepository,
    LogAppenderMessageRepositoryDecorator,
    MessageRepository,
    NoopMessageRepository,
)
from airbyte_cdk.sources.streams.call_rate import (
    APIBudget,
    FixedWindowCallRatePolicy,
    HttpAPIBudget,
    HttpRequestRegexMatcher,
    MovingWindowCallRatePolicy,
    Rate,
    UnlimitedCallRatePolicy,
)
from airbyte_cdk.sources.streams.concurrent.clamping import (
    ClampingEndProvider,
    ClampingStrategy,
    DayClampingStrategy,
    MonthClampingStrategy,
    NoClamping,
    WeekClampingStrategy,
    Weekday,
)
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, Cursor, CursorField
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    CustomFormatConcurrentStreamStateConverter,
    DateTimeStreamStateConverter,
)
from airbyte_cdk.sources.streams.concurrent.state_converters.incrementing_count_stream_state_converter import (
    IncrementingCountStreamStateConverter,
)
from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction
from airbyte_cdk.sources.types import Config
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer

ComponentDefinition = Mapping[str, Any]

SCHEMA_TRANSFORMER_TYPE_MAPPING = {
    SchemaNormalizationModel.None_: TransformConfig.NoTransform,
    SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
}


class ModelToComponentFactory:
    EPOCH_DATETIME_FORMAT = "%s"

    def __init__(
        self,
        limit_pages_fetched_per_slice: Optional[int] = None,
        limit_slices_fetched: Optional[int] = None,
        emit_connector_builder_messages: bool = False,
        disable_retries: bool = False,
        disable_cache: bool = False,
        disable_resumable_full_refresh: bool = False,
        message_repository: Optional[MessageRepository] = None,
        connector_state_manager: Optional[ConnectorStateManager] = None,
        max_concurrent_async_job_count: Optional[int] = None,
    ):
        self._init_mappings()
        self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
        self._limit_slices_fetched = limit_slices_fetched
        self._emit_connector_builder_messages = emit_connector_builder_messages
        self._disable_retries = disable_retries
        self._disable_cache = disable_cache
        self._disable_resumable_full_refresh = disable_resumable_full_refresh
        self._message_repository = message_repository or InMemoryMessageRepository(
            self._evaluate_log_level(emit_connector_builder_messages)
        )
        self._connector_state_manager = connector_state_manager or ConnectorStateManager()
        self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None
        self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1)
        # placeholder for deprecation warnings
        self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []

    def _init_mappings(self) -> None:
        self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = {
            AddedFieldDefinitionModel: self.create_added_field_definition,
            AddFieldsModel: self.create_add_fields,
            ApiKeyAuthenticatorModel: self.create_api_key_authenticator,
            BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
            BearerAuthenticatorModel: self.create_bearer_authenticator,
            CheckStreamModel: self.create_check_stream,
            DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config,
            CheckDynamicStreamModel: self.create_check_dynamic_stream,
            CompositeErrorHandlerModel: self.create_composite_error_handler,
            ConcurrencyLevelModel: self.create_concurrency_level,
            ConfigMigrationModel: self.create_config_migration,
            ConfigAddFieldsModel: self.create_config_add_fields,
            ConfigRemapFieldModel: self.create_config_remap_field,
            ConfigRemoveFieldsModel: self.create_config_remove_fields,
            ConstantBackoffStrategyModel: self.create_constant_backoff_strategy,
            CsvDecoderModel: self.create_csv_decoder,
            CursorPaginationModel: self.create_cursor_pagination,
            CustomAuthenticatorModel: self.create_custom_component,
            CustomBackoffStrategyModel: self.create_custom_component,
            CustomDecoderModel: self.create_custom_component,
            CustomErrorHandlerModel: self.create_custom_component,
            CustomIncrementalSyncModel: self.create_custom_component,
            CustomRecordExtractorModel: self.create_custom_component,
            CustomRecordFilterModel: self.create_custom_component,
            CustomRequesterModel: self.create_custom_component,
            CustomRetrieverModel: self.create_custom_component,
            CustomSchemaLoader: self.create_custom_component,
            CustomSchemaNormalizationModel: self.create_custom_component,
            CustomStateMigration: self.create_custom_component,
            CustomPaginationStrategyModel: self.create_custom_component,
            CustomPartitionRouterModel: self.create_custom_component,
            CustomTransformationModel: self.create_custom_component,
            CustomValidationStrategyModel: self.create_custom_component,
            CustomConfigTransformationModel: self.create_custom_component,
            DatetimeBasedCursorModel: self.create_datetime_based_cursor,
            DeclarativeStreamModel: self.create_declarative_stream,
            DefaultErrorHandlerModel: self.create_default_error_handler,
            DefaultPaginatorModel: self.create_default_paginator,
            DpathExtractorModel: self.create_dpath_extractor,
            DpathValidatorModel: self.create_dpath_validator,
            ResponseToFileExtractorModel: self.create_response_to_file_extractor,
            ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy,
            SessionTokenAuthenticatorModel: self.create_session_token_authenticator,
            GroupByKeyMergeStrategyModel: self.create_group_by_key,
            HttpRequesterModel: self.create_http_requester,
            HttpResponseFilterModel: self.create_http_response_filter,
            InlineSchemaLoaderModel: self.create_inline_schema_loader,
            JsonDecoderModel: self.create_json_decoder,
            JsonlDecoderModel: self.create_jsonl_decoder,
            GzipDecoderModel: self.create_gzip_decoder,
            KeysToLowerModel: self.create_keys_to_lower_transformation,
            KeysToSnakeCaseModel: self.create_keys_to_snake_transformation,
            KeysReplaceModel: self.create_keys_replace_transformation,
            FlattenFieldsModel: self.create_flatten_fields,
            DpathFlattenFieldsModel: self.create_dpath_flatten_fields,
            IterableDecoderModel: self.create_iterable_decoder,
            IncrementingCountCursorModel: self.create_incrementing_count_cursor,
            XmlDecoderModel: self.create_xml_decoder,
            JsonFileSchemaLoaderModel: self.create_json_file_schema_loader,
            DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
            SchemaTypeIdentifierModel: self.create_schema_type_identifier,
            TypesMapModel: self.create_types_map,
            ComplexFieldTypeModel: self.create_complex_field_type,
            JwtAuthenticatorModel: self.create_jwt_authenticator,
            LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
            ListPartitionRouterModel: self.create_list_partition_router,
            MinMaxDatetimeModel: self.create_min_max_datetime,
            NoAuthModel: self.create_no_auth,
            NoPaginationModel: self.create_no_pagination,
            OAuthAuthenticatorModel: self.create_oauth_authenticator,
            OffsetIncrementModel: self.create_offset_increment,
            PageIncrementModel: self.create_page_increment,
            ParentStreamConfigModel: self.create_parent_stream_config,
            PredicateValidatorModel: self.create_predicate_validator,
            PropertiesFromEndpointModel: self.create_properties_from_endpoint,
            PropertyChunkingModel: self.create_property_chunking,
            QueryPropertiesModel: self.create_query_properties,
            RecordFilterModel: self.create_record_filter,
            RecordSelectorModel: self.create_record_selector,
            RemoveFieldsModel: self.create_remove_fields,
            RequestPathModel: self.create_request_path,
            RequestOptionModel: self.create_request_option,
            LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator,
            SelectiveAuthenticatorModel: self.create_selective_authenticator,
            SimpleRetrieverModel: self.create_simple_retriever,
            StateDelegatingStreamModel: self.create_state_delegating_stream,
            SpecModel: self.create_spec,
            SubstreamPartitionRouterModel: self.create_substream_partition_router,
            ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema,
            WaitTimeFromHeaderModel: self.create_wait_time_from_header,
            WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header,
            AsyncRetrieverModel: self.create_async_retriever,
            HttpComponentsResolverModel: self.create_http_components_resolver,
            ConfigComponentsResolverModel: self.create_config_components_resolver,
            ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver,
            StreamConfigModel: self.create_stream_config,
            ComponentMappingDefinitionModel: self.create_components_mapping_definition,
            ZipfileDecoderModel: self.create_zipfile_decoder,
            HTTPAPIBudgetModel: self.create_http_api_budget,
            FileUploaderModel: self.create_file_uploader,
            FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
            MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
            UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}
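
    # Illustrative sketch (not part of the module): constructing the factory with the
    # options typically used for Connector Builder test reads. The limit values below
    # are hypothetical and chosen only for the example.
    #
    #   factory = ModelToComponentFactory(
    #       limit_pages_fetched_per_slice=5,
    #       limit_slices_fetched=5,
    #       emit_connector_builder_messages=True,
    #       disable_cache=True,
    #   )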

    def create_component(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        config: Config,
        **kwargs: Any,
    ) -> Any:
        """
        Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and
        subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then
        creating declarative components from that model.

        :param model_type: The type of declarative component that is being initialized
        :param component_definition: The mapping that represents a declarative component
        :param config: The connector config that is provided by the customer
        :return: The declarative component to be used at runtime
        """

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        declarative_component_model = model_type.parse_obj(component_definition)

        if not isinstance(declarative_component_model, model_type):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}"
            )

        return self._create_component_from_model(
            model=declarative_component_model, config=config, **kwargs
        )
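
    # Illustrative sketch (not part of the module): turning a manifest component
    # definition into a runtime component via create_component. The manifest snippet
    # and connector config below are hypothetical, and some constructors require
    # additional kwargs (for example a stream `name`).
    #
    #   requester = factory.create_component(
    #       model_type=HttpRequesterModel,
    #       component_definition={
    #           "type": "HttpRequester",
    #           "url_base": "https://api.example.com",
    #           "path": "/v1/items",
    #           "http_method": "GET",
    #       },
    #       config={"api_key": "..."},
    #       name="items",
    #   )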

    def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any:
        if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR:
            raise ValueError(
                f"{model.__class__} with attributes {model} is not a valid component type"
            )
        component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__)
        if not component_constructor:
            raise ValueError(f"Could not find constructor for {model.__class__}")

        # collect deprecation warnings for supported models.
        if isinstance(model, BaseModelWithDeprecations):
            self._collect_model_deprecations(model)

        return component_constructor(model=model, config=config, **kwargs)

    def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]:
        """
        Returns the deprecation warnings that were collected during the creation of components.
        """
        return self._collected_deprecation_logs

    def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None:
        """
        Collects deprecation logs from the given model and appends any new logs to the internal collection.

        This method checks if the provided model has deprecation logs (identified by the presence of the
        DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each
        deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not
        already been collected, ensuring that duplicate logs are avoided.

        Args:
            model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs.
        """
        if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None:
            for log in model._deprecation_logs:
                # avoid duplicate deprecation logs
                if log not in self._collected_deprecation_logs:
                    self._collected_deprecation_logs.append(log)
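
    # Illustrative sketch (not part of the module): deprecation warnings collected while
    # building components can be retrieved afterwards, e.g. to surface them in the
    # Connector Builder. The stream definition is assumed to use a deprecated field.
    #
    #   stream = factory.create_component(DeclarativeStreamModel, stream_definition, config)
    #   for deprecation_log in factory.get_model_deprecations():
    #       print(deprecation_log.message)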

    def create_config_migration(
        self, model: ConfigMigrationModel, config: Config
    ) -> ConfigMigration:
        transformations: List[ConfigTransformation] = [
            self._create_component_from_model(transformation, config)
            for transformation in model.transformations
        ]

        return ConfigMigration(
            description=model.description,
            transformations=transformations,
        )

    def create_config_add_fields(
        self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigAddFields:
        fields = [self._create_component_from_model(field, config) for field in model.fields]
        return ConfigAddFields(
            fields=fields,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remove_fields(
        model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigRemoveFields:
        return ConfigRemoveFields(
            field_pointers=model.field_pointers,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remap_field(
        model: ConfigRemapFieldModel, config: Config, **kwargs: Any
    ) -> ConfigRemapField:
        mapping = cast(Mapping[str, Any], model.map)
        return ConfigRemapField(
            map=mapping,
            field_path=model.field_path,
            config=config,
        )

    def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return DpathValidator(
            field_path=model.field_path,
            strategy=strategy,
        )

    def create_predicate_validator(
        self, model: PredicateValidatorModel, config: Config
    ) -> PredicateValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return PredicateValidator(
            value=model.value,
            strategy=strategy,
        )

    @staticmethod
    def create_validate_adheres_to_schema(
        model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any
    ) -> ValidateAdheresToSchema:
        base_schema = cast(Mapping[str, Any], model.base_schema)
        return ValidateAdheresToSchema(
            schema=base_schema,
        )

    @staticmethod
    def create_added_field_definition(
        model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
    ) -> AddedFieldDefinition:
        interpolated_value = InterpolatedString.create(
            model.value, parameters=model.parameters or {}
        )
        return AddedFieldDefinition(
            path=model.path,
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            parameters=model.parameters or {},
        )

    def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields:
        added_field_definitions = [
            self._create_component_from_model(
                model=added_field_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    added_field_definition_model.value_type
                ),
                config=config,
            )
            for added_field_definition_model in model.fields
        ]
        return AddFields(
            fields=added_field_definitions,
            condition=model.condition or "",
            parameters=model.parameters or {},
        )

    def create_keys_to_lower_transformation(
        self, model: KeysToLowerModel, config: Config, **kwargs: Any
    ) -> KeysToLowerTransformation:
        return KeysToLowerTransformation()

    def create_keys_to_snake_transformation(
        self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any
    ) -> KeysToSnakeCaseTransformation:
        return KeysToSnakeCaseTransformation()

    def create_keys_replace_transformation(
        self, model: KeysReplaceModel, config: Config, **kwargs: Any
    ) -> KeysReplaceTransformation:
        return KeysReplaceTransformation(
            old=model.old, new=model.new, parameters=model.parameters or {}
        )

    def create_flatten_fields(
        self, model: FlattenFieldsModel, config: Config, **kwargs: Any
    ) -> FlattenFields:
        return FlattenFields(
            flatten_lists=model.flatten_lists if model.flatten_lists is not None else True
        )

    def create_dpath_flatten_fields(
        self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
    ) -> DpathFlattenFields:
        model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path]
        key_transformation = (
            KeyTransformation(
                config=config,
                prefix=model.key_transformation.prefix,
                suffix=model.key_transformation.suffix,
                parameters=model.parameters or {},
            )
            if model.key_transformation is not None
            else None
        )
        return DpathFlattenFields(
            config=config,
            field_path=model_field_path,
            delete_origin_value=model.delete_origin_value
            if model.delete_origin_value is not None
            else False,
            replace_record=model.replace_record if model.replace_record is not None else False,
            key_transformation=key_transformation,
            parameters=model.parameters or {},
        )

    @staticmethod
    def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
        if not value_type:
            return None
        names_to_types = {
            ValueType.string: str,
            ValueType.number: float,
            ValueType.integer: int,
            ValueType.boolean: bool,
        }
        return names_to_types[value_type]

    def create_api_key_authenticator(
        self,
        model: ApiKeyAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> ApiKeyAuthenticator:
        if model.inject_into is None and model.header is None:
            raise ValueError(
                "Expected either inject_into or header to be set for ApiKeyAuthenticator"
            )

        if model.inject_into is not None and model.header is not None:
            raise ValueError(
                "inject_into and header cannot both be set for ApiKeyAuthenticator - remove the deprecated header option"
            )

        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and must be set to an empty string."
            )

        request_option = (
            self._create_component_from_model(
                model.inject_into, config, parameters=model.parameters or {}
            )
            if model.inject_into
            else RequestOption(
                inject_into=RequestOptionType.header,
                field_name=model.header or "",
                parameters=model.parameters or {},
            )
        )

        return ApiKeyAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            request_option=request_option,
            config=config,
            parameters=model.parameters or {},
        )
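
    # Illustrative sketch (not part of the module): an ApiKeyAuthenticator manifest
    # definition. `inject_into` is the supported way to place the token; the top-level
    # `header` field is deprecated and cannot be combined with it. Values are hypothetical.
    #
    #   {
    #       "type": "ApiKeyAuthenticator",
    #       "api_token": "{{ config['api_key'] }}",
    #       "inject_into": {
    #           "type": "RequestOption",
    #           "inject_into": "header",
    #           "field_name": "X-API-Key",
    #       },
    #   }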

    def create_legacy_to_per_partition_state_migration(
        self,
        model: LegacyToPerPartitionStateMigrationModel,
        config: Mapping[str, Any],
        declarative_stream: DeclarativeStreamModel,
    ) -> LegacyToPerPartitionStateMigration:
        retriever = declarative_stream.retriever
        if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
            )
        partition_router = retriever.partition_router
        if not isinstance(
            partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
        ):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
            )
        if not hasattr(partition_router, "parent_stream_configs"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
            )

        if not hasattr(declarative_stream, "incremental_sync"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
            )

        return LegacyToPerPartitionStateMigration(
            partition_router,  # type: ignore # was already checked above
            declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
            config,
            declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
        )

    def create_session_token_authenticator(
        self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
    ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        login_requester = self._create_component_from_model(
            model=model.login_requester,
            config=config,
            name=f"{name}_login_requester",
            decoder=decoder,
        )
        token_provider = SessionTokenProvider(
            login_requester=login_requester,
            session_token_path=model.session_token_path,
            expiration_duration=parse_duration(model.expiration_duration)
            if model.expiration_duration
            else None,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            decoder=decoder,
        )
        if model.request_authentication.type == "Bearer":
            return ModelToComponentFactory.create_bearer_authenticator(
                BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
                config,
                token_provider=token_provider,
            )
        else:
            return self.create_api_key_authenticator(
                ApiKeyAuthenticatorModel(
                    type="ApiKeyAuthenticator",
                    api_token="",
                    inject_into=model.request_authentication.inject_into,
                ),  # type: ignore # $parameters and headers default to None
                config=config,
                token_provider=token_provider,
            )
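
    # Illustrative sketch (not part of the module): a SessionTokenAuthenticator manifest
    # definition. The login requester fetches a session token, which is then sent either
    # as a Bearer token or through an ApiKey-style request option, as handled above.
    # Field values are hypothetical and the field set is abridged.
    #
    #   {
    #       "type": "SessionTokenAuthenticator",
    #       "login_requester": {
    #           "type": "HttpRequester",
    #           "url_base": "https://api.example.com",
    #           "path": "/login",
    #           "http_method": "POST",
    #       },
    #       "session_token_path": ["token"],
    #       "expiration_duration": "PT1H",
    #       "request_authentication": {"type": "Bearer"},
    #   }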

    @staticmethod
    def create_basic_http_authenticator(
        model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
    ) -> BasicHttpAuthenticator:
        return BasicHttpAuthenticator(
            password=model.password or "",
            username=model.username,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_bearer_authenticator(
        model: BearerAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> BearerAuthenticator:
        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and must be set to an empty string."
            )
        return BearerAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_dynamic_stream_check_config(
        model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any
    ) -> DynamicStreamCheckConfig:
        return DynamicStreamCheckConfig(
            dynamic_stream_name=model.dynamic_stream_name,
            stream_count=model.stream_count or 0,
        )

    def create_check_stream(
        self, model: CheckStreamModel, config: Config, **kwargs: Any
    ) -> CheckStream:
        if model.dynamic_streams_check_configs is None and model.stream_names is None:
            raise ValueError(
                "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
            )

        dynamic_streams_check_configs = (
            [
                self._create_component_from_model(model=dynamic_stream_check_config, config=config)
                for dynamic_stream_check_config in model.dynamic_streams_check_configs
            ]
            if model.dynamic_streams_check_configs
            else []
        )

        return CheckStream(
            stream_names=model.stream_names or [],
            dynamic_streams_check_configs=dynamic_streams_check_configs,
            parameters={},
        )

    @staticmethod
    def create_check_dynamic_stream(
        model: CheckDynamicStreamModel, config: Config, **kwargs: Any
    ) -> CheckDynamicStream:
        assert model.use_check_availability is not None  # for mypy

        use_check_availability = model.use_check_availability

        return CheckDynamicStream(
            stream_count=model.stream_count,
            use_check_availability=use_check_availability,
            parameters={},
        )

    def create_composite_error_handler(
        self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any
    ) -> CompositeErrorHandler:
        error_handlers = [
            self._create_component_from_model(model=error_handler_model, config=config)
            for error_handler_model in model.error_handlers
        ]
        return CompositeErrorHandler(
            error_handlers=error_handlers, parameters=model.parameters or {}
        )

    @staticmethod
    def create_concurrency_level(
        model: ConcurrencyLevelModel, config: Config, **kwargs: Any
    ) -> ConcurrencyLevel:
        return ConcurrencyLevel(
            default_concurrency=model.default_concurrency,
            max_concurrency=model.max_concurrency,
            config=config,
            parameters={},
        )

    @staticmethod
    def apply_stream_state_migrations(
        stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any]
    ) -> MutableMapping[str, Any]:
        if stream_state_migrations:
            for state_migration in stream_state_migrations:
                if state_migration.should_migrate(stream_state):
                    # The state variable is expected to be mutable but the migrate method returns an immutable mapping.
                    stream_state = dict(state_migration.migrate(stream_state))
        return stream_state
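
    # Illustrative sketch (not part of the module): how state migrations are applied.
    # Any object exposing should_migrate/migrate can be passed; the hypothetical
    # migration below renames a legacy cursor key before the cursor is built.
    #
    #   class RenameCursorKeyMigration:
    #       def should_migrate(self, stream_state):
    #           return "updated" in stream_state
    #
    #       def migrate(self, stream_state):
    #           return {"updated_at": stream_state["updated"]}
    #
    #   state = ModelToComponentFactory.apply_stream_state_migrations(
    #       [RenameCursorKeyMigration()], {"updated": "2024-01-01T00:00:00Z"}
    #   )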

    def create_concurrent_cursor_from_datetime_based_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        runtime_lookback_window: Optional[datetime.timedelta] = None,
        stream_state_migrations: Optional[List[Any]] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        # Per-partition incremental streams can dynamically create child cursors which will pass their current
        # state via the stream_state keyword argument. Incremental syncs without parent streams use the
        # incoming state and connector_state_manager that is initialized when the component factory is created
        stream_state = (
            self._connector_state_manager.get_stream_state(stream_name, stream_namespace)
            if "stream_state" not in kwargs
            else kwargs["stream_state"]
        )
        stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state)

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        interpolated_cursor_field = InterpolatedString.create(
            datetime_based_cursor_model.cursor_field,
            parameters=datetime_based_cursor_model.parameters or {},
        )
        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))

        interpolated_partition_field_start = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_start or "start_time",
            parameters=datetime_based_cursor_model.parameters or {},
        )
        interpolated_partition_field_end = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_end or "end_time",
            parameters=datetime_based_cursor_model.parameters or {},
        )

        slice_boundary_fields = (
            interpolated_partition_field_start.eval(config=config),
            interpolated_partition_field_end.eval(config=config),
        )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        lookback_window = None
        interpolated_lookback_window = (
            InterpolatedString.create(
                datetime_based_cursor_model.lookback_window,
                parameters=datetime_based_cursor_model.parameters or {},
            )
            if datetime_based_cursor_model.lookback_window
            else None
        )
        if interpolated_lookback_window:
            evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
            if evaluated_lookback_window:
                lookback_window = parse_duration(evaluated_lookback_window)

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Adjusts the stream state by applying the runtime lookback window.
        # This is used to ensure correct state handling in case of failed partitions.
        stream_state_value = stream_state.get(cursor_field.cursor_field_key)
        if runtime_lookback_window and stream_state_value:
            new_stream_state = (
                connector_state_converter.parse_timestamp(stream_state_value)
                - runtime_lookback_window
            )
            stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
                new_stream_state
            )

        start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
        if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
            start_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.start_datetime, config=config
            )
        else:
            start_date_runtime_value = datetime_based_cursor_model.start_datetime

        end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]]
        if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel):
            end_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.end_datetime, config=config
            )
        else:
            end_date_runtime_value = datetime_based_cursor_model.end_datetime

        interpolated_start_date = MinMaxDatetime.create(
            interpolated_string_or_min_max_datetime=start_date_runtime_value,
            parameters=datetime_based_cursor_model.parameters,
        )
        interpolated_end_date = (
            None
            if not end_date_runtime_value
            else MinMaxDatetime.create(
                end_date_runtime_value, datetime_based_cursor_model.parameters
            )
        )

        # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
        if not interpolated_start_date.datetime_format:
            interpolated_start_date.datetime_format = datetime_format
        if interpolated_end_date and not interpolated_end_date.datetime_format:
            interpolated_end_date.datetime_format = datetime_format

        start_date = interpolated_start_date.get_datetime(config=config)
        end_date_provider = (
            partial(interpolated_end_date.get_datetime, config)
            if interpolated_end_date
            else connector_state_converter.get_end_provider()
        )

        if (
            datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity
        ) or (
            not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity
        ):
            raise ValueError(
                f"If step is defined, cursor_granularity should be as well and vice-versa. "
                f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`"
            )

        # When step is not defined, default to a step size from the starting date to the present moment
        step_length = datetime.timedelta.max
        interpolated_step = (
            InterpolatedString.create(
                datetime_based_cursor_model.step,
                parameters=datetime_based_cursor_model.parameters or {},
            )
            if datetime_based_cursor_model.step
            else None
        )
        if interpolated_step:
            evaluated_step = interpolated_step.eval(config)
            if evaluated_step:
                step_length = parse_duration(evaluated_step)
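
        # Illustrative sketch (not part of the module): the manifest fields consumed above
        # and below. `step` and `cursor_granularity` must be provided together, and the
        # optional `clamping` block aligns slice boundaries. Values are hypothetical.
        #
        #   {
        #       "type": "DatetimeBasedCursor",
        #       "cursor_field": "updated_at",
        #       "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
        #       "start_datetime": "{{ config['start_date'] }}",
        #       "step": "P1M",
        #       "cursor_granularity": "P1D",
        #       "clamping": {"target": "MONTH"},
        #   }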

        clamping_strategy: ClampingStrategy = NoClamping()
        if datetime_based_cursor_model.clamping:
            # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
            # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime
            # object which we want to keep agnostic of being low-code
            target = InterpolatedString(
                string=datetime_based_cursor_model.clamping.target,
                parameters=datetime_based_cursor_model.parameters or {},
            )
            evaluated_target = target.eval(config=config)
            match evaluated_target:
                case "DAY":
                    clamping_strategy = DayClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        DayClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(seconds=1),
                    )
                case "WEEK":
                    if (
                        not datetime_based_cursor_model.clamping.target_details
                        or "weekday" not in datetime_based_cursor_model.clamping.target_details
                    ):
                        raise ValueError(
                            "Given WEEK clamping, weekday needs to be provided as target_details"
                        )
                    weekday = self._assemble_weekday(
                        datetime_based_cursor_model.clamping.target_details["weekday"]
                    )
                    clamping_strategy = WeekClampingStrategy(weekday)
                    end_date_provider = ClampingEndProvider(
                        WeekClampingStrategy(weekday, is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case "MONTH":
                    clamping_strategy = MonthClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        MonthClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case _:
                    raise ValueError(
                        f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
                    )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=slice_boundary_fields,
            start=start_date,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            end_provider=end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            lookback_window=lookback_window,
            slice_range=step_length,
            cursor_granularity=cursor_granularity,
            clamping_strategy=clamping_strategy,
        )

    def create_concurrent_cursor_from_incrementing_count_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        stream_state_migrations: Optional[List[Any]] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        # Per-partition incremental streams can dynamically create child cursors which will pass their current
        # state via the stream_state keyword argument. Incremental syncs without parent streams use the
        # incoming state and connector_state_manager that is initialized when the component factory is created
        stream_state = (
            self._connector_state_manager.get_stream_state(stream_name, stream_namespace)
            if "stream_state" not in kwargs
            else kwargs["stream_state"]
        )
        stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state)

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        incrementing_count_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}"
            )

        interpolated_start_value = (
            InterpolatedString.create(
                incrementing_count_cursor_model.start_value,  # type: ignore
                parameters=incrementing_count_cursor_model.parameters or {},
            )
            if incrementing_count_cursor_model.start_value
            else 0
        )

        interpolated_cursor_field = InterpolatedString.create(
            incrementing_count_cursor_model.cursor_field,
            parameters=incrementing_count_cursor_model.parameters or {},
        )
        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))

        connector_state_converter = IncrementingCountStreamStateConverter(
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
        )

        return ConcurrentCursor(
stream_name=stream_name, 1524 stream_namespace=stream_namespace, 1525 stream_state=stream_state, 1526 message_repository=message_repository or self._message_repository, 1527 connector_state_manager=self._connector_state_manager, 1528 connector_state_converter=connector_state_converter, 1529 cursor_field=cursor_field, 1530 slice_boundary_fields=None, 1531 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1532 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1533 ) 1534 1535 def _assemble_weekday(self, weekday: str) -> Weekday: 1536 match weekday: 1537 case "MONDAY": 1538 return Weekday.MONDAY 1539 case "TUESDAY": 1540 return Weekday.TUESDAY 1541 case "WEDNESDAY": 1542 return Weekday.WEDNESDAY 1543 case "THURSDAY": 1544 return Weekday.THURSDAY 1545 case "FRIDAY": 1546 return Weekday.FRIDAY 1547 case "SATURDAY": 1548 return Weekday.SATURDAY 1549 case "SUNDAY": 1550 return Weekday.SUNDAY 1551 case _: 1552 raise ValueError(f"Unknown weekday {weekday}") 1553 1554 def create_concurrent_cursor_from_perpartition_cursor( 1555 self, 1556 state_manager: ConnectorStateManager, 1557 model_type: Type[BaseModel], 1558 component_definition: ComponentDefinition, 1559 stream_name: str, 1560 stream_namespace: Optional[str], 1561 config: Config, 1562 stream_state: MutableMapping[str, Any], 1563 partition_router: PartitionRouter, 1564 stream_state_migrations: Optional[List[Any]] = None, 1565 attempt_to_create_cursor_if_not_provided: bool = False, 1566 **kwargs: Any, 1567 ) -> ConcurrentPerPartitionCursor: 1568 component_type = component_definition.get("type") 1569 if component_definition.get("type") != model_type.__name__: 1570 raise ValueError( 1571 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1572 ) 1573 1574 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1575 1576 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1577 raise ValueError( 1578 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1579 ) 1580 1581 interpolated_cursor_field = InterpolatedString.create( 1582 datetime_based_cursor_model.cursor_field, 1583 parameters=datetime_based_cursor_model.parameters or {}, 1584 ) 1585 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1586 1587 datetime_format = datetime_based_cursor_model.datetime_format 1588 1589 cursor_granularity = ( 1590 parse_duration(datetime_based_cursor_model.cursor_granularity) 1591 if datetime_based_cursor_model.cursor_granularity 1592 else None 1593 ) 1594 1595 connector_state_converter: DateTimeStreamStateConverter 1596 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1597 datetime_format=datetime_format, 1598 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1599 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1600 cursor_granularity=cursor_granularity, 1601 ) 1602 1603 # Create the cursor factory 1604 cursor_factory = ConcurrentCursorFactory( 1605 partial( 1606 self.create_concurrent_cursor_from_datetime_based_cursor, 1607 state_manager=state_manager, 1608 model_type=model_type, 1609 
component_definition=component_definition, 1610 stream_name=stream_name, 1611 stream_namespace=stream_namespace, 1612 config=config, 1613 message_repository=NoopMessageRepository(), 1614 stream_state_migrations=stream_state_migrations, 1615 ) 1616 ) 1617 1618 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1619 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1620 use_global_cursor = isinstance( 1621 partition_router, GroupingPartitionRouter 1622 ) or component_definition.get("global_substream_cursor", False) 1623 1624 # Return the concurrent cursor and state converter 1625 return ConcurrentPerPartitionCursor( 1626 cursor_factory=cursor_factory, 1627 partition_router=partition_router, 1628 stream_name=stream_name, 1629 stream_namespace=stream_namespace, 1630 stream_state=stream_state, 1631 message_repository=self._message_repository, # type: ignore 1632 connector_state_manager=state_manager, 1633 connector_state_converter=connector_state_converter, 1634 cursor_field=cursor_field, 1635 use_global_cursor=use_global_cursor, 1636 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1637 ) 1638 1639 @staticmethod 1640 def create_constant_backoff_strategy( 1641 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1642 ) -> ConstantBackoffStrategy: 1643 return ConstantBackoffStrategy( 1644 backoff_time_in_seconds=model.backoff_time_in_seconds, 1645 config=config, 1646 parameters=model.parameters or {}, 1647 ) 1648 1649 def create_cursor_pagination( 1650 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1651 ) -> CursorPaginationStrategy: 1652 if isinstance(decoder, PaginationDecoderDecorator): 1653 inner_decoder = decoder.decoder 1654 else: 1655 inner_decoder = decoder 1656 decoder = PaginationDecoderDecorator(decoder=decoder) 1657 1658 if self._is_supported_decoder_for_pagination(inner_decoder): 1659 decoder_to_use = decoder 1660 else: 1661 raise ValueError( 1662 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1663 ) 1664 1665 return CursorPaginationStrategy( 1666 cursor_value=model.cursor_value, 1667 decoder=decoder_to_use, 1668 page_size=model.page_size, 1669 stop_condition=model.stop_condition, 1670 config=config, 1671 parameters=model.parameters or {}, 1672 ) 1673 1674 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1675 """ 1676 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1677 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1678 :param model: The Pydantic model of the custom component being created 1679 :param config: The custom defined connector config 1680 :return: The declarative component built from the Pydantic model to be used at runtime 1681 """ 1682 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1683 component_fields = get_type_hints(custom_component_class) 1684 model_args = model.dict() 1685 model_args["config"] = config 1686 1687 # There are cases where a parent component will pass arguments to a child component via kwargs. 
When there are field collisions 1688 # we defer to these arguments over the component's definition 1689 for key, arg in kwargs.items(): 1690 model_args[key] = arg 1691 1692 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1693 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1694 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1695 for model_field, model_value in model_args.items(): 1696 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1697 if ( 1698 isinstance(model_value, dict) 1699 and "type" not in model_value 1700 and model_field in component_fields 1701 ): 1702 derived_type = self._derive_component_type_from_type_hints( 1703 component_fields.get(model_field) 1704 ) 1705 if derived_type: 1706 model_value["type"] = derived_type 1707 1708 if self._is_component(model_value): 1709 model_args[model_field] = self._create_nested_component( 1710 model, model_field, model_value, config 1711 ) 1712 elif isinstance(model_value, list): 1713 vals = [] 1714 for v in model_value: 1715 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1716 derived_type = self._derive_component_type_from_type_hints( 1717 component_fields.get(model_field) 1718 ) 1719 if derived_type: 1720 v["type"] = derived_type 1721 if self._is_component(v): 1722 vals.append(self._create_nested_component(model, model_field, v, config)) 1723 else: 1724 vals.append(v) 1725 model_args[model_field] = vals 1726 1727 kwargs = { 1728 class_field: model_args[class_field] 1729 for class_field in component_fields.keys() 1730 if class_field in model_args 1731 } 1732 return custom_component_class(**kwargs) 1733 1734 @staticmethod 1735 def _get_class_from_fully_qualified_class_name( 1736 full_qualified_class_name: str, 1737 ) -> Any: 1738 """Get a class from its fully qualified name. 1739 1740 If a custom components module is needed, we assume it is already registered - probably 1741 as `source_declarative_manifest.components` or `components`. 1742 1743 Args: 1744 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1745 1746 Returns: 1747 Any: The class object. 1748 1749 Raises: 1750 ValueError: If the class cannot be loaded. 1751 """ 1752 split = full_qualified_class_name.split(".") 1753 module_name_full = ".".join(split[:-1]) 1754 class_name = split[-1] 1755 1756 try: 1757 module_ref = importlib.import_module(module_name_full) 1758 except ModuleNotFoundError as e: 1759 if split[0] == "source_declarative_manifest": 1760 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. 
In order to run the test, add the source folder to your PYTHONPATH or add it at runtime using sys.path.append 1761 try: 1762 import os 1763 1764 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1765 module_ref = importlib.import_module( 1766 module_name_with_source_declarative_manifest 1767 ) 1768 except ModuleNotFoundError: 1769 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1770 else: 1771 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1772 1773 try: 1774 return getattr(module_ref, class_name) 1775 except AttributeError as e: 1776 raise ValueError( 1777 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1778 ) from e 1779 1780 @staticmethod 1781 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1782 interface = field_type 1783 while True: 1784 origin = get_origin(interface) 1785 if origin: 1786 # Unnest types until we reach the raw type 1787 # List[T] -> T 1788 # Optional[List[T]] -> T 1789 args = get_args(interface) 1790 interface = args[0] 1791 else: 1792 break 1793 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1794 return interface.__name__ 1795 return None 1796 1797 @staticmethod 1798 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1799 if not cls: 1800 return False 1801 return cls.__module__ == "builtins" 1802 1803 @staticmethod 1804 def _extract_missing_parameters(error: TypeError) -> List[str]: 1805 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1806 if parameter_search: 1807 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1808 else: 1809 return [] 1810 1811 def _create_nested_component( 1812 self, model: Any, model_field: str, model_value: Any, config: Config 1813 ) -> Any: 1814 type_name = model_value.get("type", None) 1815 if not type_name: 1816 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1817 return model_value 1818 1819 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1820 if model_type: 1821 parsed_model = model_type.parse_obj(model_value) 1822 try: 1823 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1824 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1825 # components and passing them to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1826 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1827 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1828 # are needed by a component and could not otherwise be shared.
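# Illustrative sketch (hypothetical manifest fragment, not part of this module): a DefaultPaginator nested
# under a custom retriever does not receive the shared url_base automatically, so it can be declared explicitly:
#
#   paginator:
#     type: DefaultPaginator
#     $parameters:
#       url_base: "https://api.example.com/v1"   # hypothetical value
#
# The lookup below matches keys in $parameters against the keyword-only arguments of the component's create
# method (for example, create_default_paginator's url_base) and forwards only the keys that match.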
1829 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1830 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1831 model_parameters = model_value.get("$parameters", {}) 1832 matching_parameters = { 1833 kwarg: model_parameters[kwarg] 1834 for kwarg in constructor_kwargs 1835 if kwarg in model_parameters 1836 } 1837 return self._create_component_from_model( 1838 model=parsed_model, config=config, **matching_parameters 1839 ) 1840 except TypeError as error: 1841 missing_parameters = self._extract_missing_parameters(error) 1842 if missing_parameters: 1843 raise ValueError( 1844 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1845 + ", ".join( 1846 ( 1847 f"{type_name}.$parameters.{parameter}" 1848 for parameter in missing_parameters 1849 ) 1850 ) 1851 ) 1852 raise TypeError( 1853 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1854 ) 1855 else: 1856 raise ValueError( 1857 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1858 ) 1859 1860 @staticmethod 1861 def _is_component(model_value: Any) -> bool: 1862 return isinstance(model_value, dict) and model_value.get("type") is not None 1863 1864 def create_datetime_based_cursor( 1865 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1866 ) -> DatetimeBasedCursor: 1867 start_datetime: Union[str, MinMaxDatetime] = ( 1868 model.start_datetime 1869 if isinstance(model.start_datetime, str) 1870 else self.create_min_max_datetime(model.start_datetime, config) 1871 ) 1872 end_datetime: Union[str, MinMaxDatetime, None] = None 1873 if model.is_data_feed and model.end_datetime: 1874 raise ValueError("Data feed does not support end_datetime") 1875 if model.is_data_feed and model.is_client_side_incremental: 1876 raise ValueError( 1877 "`Client side incremental` cannot be applied with `data feed`. Choose only one of them."
1878 ) 1879 if model.end_datetime: 1880 end_datetime = ( 1881 model.end_datetime 1882 if isinstance(model.end_datetime, str) 1883 else self.create_min_max_datetime(model.end_datetime, config) 1884 ) 1885 1886 end_time_option = ( 1887 self._create_component_from_model( 1888 model.end_time_option, config, parameters=model.parameters or {} 1889 ) 1890 if model.end_time_option 1891 else None 1892 ) 1893 start_time_option = ( 1894 self._create_component_from_model( 1895 model.start_time_option, config, parameters=model.parameters or {} 1896 ) 1897 if model.start_time_option 1898 else None 1899 ) 1900 1901 return DatetimeBasedCursor( 1902 cursor_field=model.cursor_field, 1903 cursor_datetime_formats=model.cursor_datetime_formats 1904 if model.cursor_datetime_formats 1905 else [], 1906 cursor_granularity=model.cursor_granularity, 1907 datetime_format=model.datetime_format, 1908 end_datetime=end_datetime, 1909 start_datetime=start_datetime, 1910 step=model.step, 1911 end_time_option=end_time_option, 1912 lookback_window=model.lookback_window, 1913 start_time_option=start_time_option, 1914 partition_field_end=model.partition_field_end, 1915 partition_field_start=model.partition_field_start, 1916 message_repository=self._message_repository, 1917 is_compare_strictly=model.is_compare_strictly, 1918 config=config, 1919 parameters=model.parameters or {}, 1920 ) 1921 1922 def create_declarative_stream( 1923 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1924 ) -> DeclarativeStream: 1925 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1926 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1927 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1928 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 
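# Illustrative sketch (hypothetical manifest fragment, not taken from this module) of the merge described above:
# a stream that declares both a partition_router on its retriever and an incremental_sync cursor, e.g.
#
#   retriever:
#     type: SimpleRetriever
#     partition_router:
#       type: ListPartitionRouter
#       values: ["accounts", "invoices"]   # hypothetical values
#       cursor_field: "section"
#   incremental_sync:
#     type: DatetimeBasedCursor
#     cursor_field: "updated_at"
#     datetime_format: "%Y-%m-%dT%H:%M:%SZ"
#     start_datetime: "{{ config['start_date'] }}"
#
# is resolved by _merge_stream_slicers below into a single slicer (for example, a per-partition cursor wrapping
# the datetime cursor) that is then handed to the retriever as stream_slicer.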
1929 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1930 1931 primary_key = model.primary_key.__root__ if model.primary_key else None 1932 stop_condition_on_cursor = ( 1933 model.incremental_sync 1934 and hasattr(model.incremental_sync, "is_data_feed") 1935 and model.incremental_sync.is_data_feed 1936 ) 1937 client_side_filtering_enabled = ( 1938 model.incremental_sync 1939 and hasattr(model.incremental_sync, "is_client_side_incremental") 1940 and model.incremental_sync.is_client_side_incremental 1941 ) 1942 concurrent_cursor = None 1943 if stop_condition_on_cursor or client_side_filtering_enabled: 1944 stream_slicer = self._build_stream_slicer_from_partition_router( 1945 model.retriever, config, stream_name=model.name 1946 ) 1947 concurrent_cursor = self._build_concurrent_cursor(model, stream_slicer, config) 1948 1949 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1950 cursor_model = model.incremental_sync 1951 1952 end_time_option = ( 1953 self._create_component_from_model( 1954 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1955 ) 1956 if cursor_model.end_time_option 1957 else None 1958 ) 1959 start_time_option = ( 1960 self._create_component_from_model( 1961 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1962 ) 1963 if cursor_model.start_time_option 1964 else None 1965 ) 1966 1967 request_options_provider = DatetimeBasedRequestOptionsProvider( 1968 start_time_option=start_time_option, 1969 end_time_option=end_time_option, 1970 partition_field_start=cursor_model.partition_field_start, 1971 partition_field_end=cursor_model.partition_field_end, 1972 config=config, 1973 parameters=model.parameters or {}, 1974 ) 1975 elif model.incremental_sync and isinstance( 1976 model.incremental_sync, IncrementingCountCursorModel 1977 ): 1978 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1979 1980 start_time_option = ( 1981 self._create_component_from_model( 1982 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model is of type DatetimeBasedCursor 1983 config, 1984 parameters=cursor_model.parameters or {}, 1985 ) 1986 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model is of type DatetimeBasedCursor 1987 else None 1988 ) 1989 1990 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1991 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 1992 partition_field_start = "start" 1993 1994 request_options_provider = DatetimeBasedRequestOptionsProvider( 1995 start_time_option=start_time_option, 1996 partition_field_start=partition_field_start, 1997 config=config, 1998 parameters=model.parameters or {}, 1999 ) 2000 else: 2001 request_options_provider = None 2002 2003 transformations = [] 2004 if model.transformations: 2005 for transformation_model in model.transformations: 2006 transformations.append( 2007 self._create_component_from_model(model=transformation_model, config=config) 2008 ) 2009 file_uploader = None 2010 if model.file_uploader: 2011 file_uploader = self._create_component_from_model( 2012 model=model.file_uploader, config=config 2013 ) 2014 2015 retriever = self._create_component_from_model( 2016 model=model.retriever, 2017 config=config, 2018 name=model.name, 2019 primary_key=primary_key, 2020 stream_slicer=combined_slicers, 2021 request_options_provider=request_options_provider, 2022
stop_condition_cursor=concurrent_cursor, 2023 client_side_incremental_sync={"cursor": concurrent_cursor} 2024 if client_side_filtering_enabled 2025 else None, 2026 transformations=transformations, 2027 file_uploader=file_uploader, 2028 incremental_sync=model.incremental_sync, 2029 ) 2030 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 2031 2032 if model.state_migrations: 2033 state_transformations = [ 2034 self._create_component_from_model(state_migration, config, declarative_stream=model) 2035 for state_migration in model.state_migrations 2036 ] 2037 else: 2038 state_transformations = [] 2039 2040 schema_loader: Union[ 2041 CompositeSchemaLoader, 2042 DefaultSchemaLoader, 2043 DynamicSchemaLoader, 2044 InlineSchemaLoader, 2045 JsonFileSchemaLoader, 2046 ] 2047 if model.schema_loader and isinstance(model.schema_loader, list): 2048 nested_schema_loaders = [ 2049 self._create_component_from_model(model=nested_schema_loader, config=config) 2050 for nested_schema_loader in model.schema_loader 2051 ] 2052 schema_loader = CompositeSchemaLoader( 2053 schema_loaders=nested_schema_loaders, parameters={} 2054 ) 2055 elif model.schema_loader: 2056 schema_loader = self._create_component_from_model( 2057 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2058 config=config, 2059 ) 2060 else: 2061 options = model.parameters or {} 2062 if "name" not in options: 2063 options["name"] = model.name 2064 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2065 2066 return DeclarativeStream( 2067 name=model.name or "", 2068 primary_key=primary_key, 2069 retriever=retriever, 2070 schema_loader=schema_loader, 2071 stream_cursor_field=cursor_field or "", 2072 state_migrations=state_transformations, 2073 config=config, 2074 parameters=model.parameters or {}, 2075 ) 2076 2077 def _build_stream_slicer_from_partition_router( 2078 self, 2079 model: Union[ 2080 AsyncRetrieverModel, 2081 CustomRetrieverModel, 2082 SimpleRetrieverModel, 2083 ], 2084 config: Config, 2085 stream_name: Optional[str] = None, 2086 ) -> Optional[PartitionRouter]: 2087 if ( 2088 hasattr(model, "partition_router") 2089 and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel) 2090 and model.partition_router 2091 ): 2092 stream_slicer_model = model.partition_router 2093 if isinstance(stream_slicer_model, list): 2094 return CartesianProductStreamSlicer( 2095 [ 2096 self._create_component_from_model( 2097 model=slicer, config=config, stream_name=stream_name or "" 2098 ) 2099 for slicer in stream_slicer_model 2100 ], 2101 parameters={}, 2102 ) 2103 else: 2104 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2105 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2106 ) 2107 return None 2108 2109 def _build_incremental_cursor( 2110 self, 2111 model: DeclarativeStreamModel, 2112 stream_slicer: Optional[PartitionRouter], 2113 config: Config, 2114 ) -> Optional[StreamSlicer]: 2115 state_transformations = ( 2116 [ 2117 self._create_component_from_model(state_migration, config, declarative_stream=model) 2118 for state_migration in model.state_migrations 2119 ] 2120 if model.state_migrations 2121 else [] 2122 ) 2123 2124 if model.incremental_sync and stream_slicer: 2125 if model.retriever.type == "AsyncRetriever": 2126 stream_name = model.name or "" 2127 stream_namespace = 
None 2128 stream_state = self._connector_state_manager.get_stream_state( 2129 stream_name, stream_namespace 2130 ) 2131 2132 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2133 state_manager=self._connector_state_manager, 2134 model_type=DatetimeBasedCursorModel, 2135 component_definition=model.incremental_sync.__dict__, 2136 stream_name=stream_name, 2137 stream_namespace=stream_namespace, 2138 config=config or {}, 2139 stream_state=stream_state, 2140 stream_state_migrations=state_transformations, 2141 partition_router=stream_slicer, 2142 ) 2143 2144 incremental_sync_model = model.incremental_sync 2145 cursor_component = self._create_component_from_model( 2146 model=incremental_sync_model, config=config 2147 ) 2148 is_global_cursor = ( 2149 hasattr(incremental_sync_model, "global_substream_cursor") 2150 and incremental_sync_model.global_substream_cursor 2151 ) 2152 2153 if is_global_cursor: 2154 return GlobalSubstreamCursor( 2155 stream_cursor=cursor_component, partition_router=stream_slicer 2156 ) 2157 return PerPartitionWithGlobalCursor( 2158 cursor_factory=CursorFactory( 2159 lambda: self._create_component_from_model( 2160 model=incremental_sync_model, config=config 2161 ), 2162 ), 2163 partition_router=stream_slicer, 2164 stream_cursor=cursor_component, 2165 ) 2166 elif model.incremental_sync: 2167 if model.retriever.type == "AsyncRetriever": 2168 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2169 model_type=DatetimeBasedCursorModel, 2170 component_definition=model.incremental_sync.__dict__, 2171 stream_name=model.name or "", 2172 stream_namespace=None, 2173 config=config or {}, 2174 stream_state_migrations=state_transformations, 2175 ) 2176 return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync 2177 return None 2178 2179 def _build_concurrent_cursor( 2180 self, 2181 model: DeclarativeStreamModel, 2182 stream_slicer: Optional[PartitionRouter], 2183 config: Config, 2184 ) -> Optional[StreamSlicer]: 2185 stream_state = self._connector_state_manager.get_stream_state( 2186 stream_name=model.name or "", namespace=None 2187 ) 2188 2189 if model.state_migrations: 2190 state_transformations = [ 2191 self._create_component_from_model(state_migration, config, declarative_stream=model) 2192 for state_migration in model.state_migrations 2193 ] 2194 else: 2195 state_transformations = [] 2196 2197 if model.incremental_sync and stream_slicer: 2198 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2199 state_manager=self._connector_state_manager, 2200 model_type=DatetimeBasedCursorModel, 2201 component_definition=model.incremental_sync.__dict__, 2202 stream_name=model.name or "", 2203 stream_namespace=None, 2204 config=config or {}, 2205 stream_state=stream_state, 2206 stream_state_migrations=state_transformations, 2207 partition_router=stream_slicer, 2208 attempt_to_create_cursor_if_not_provided=True, 2209 ) 2210 elif model.incremental_sync: 2211 if type(model.incremental_sync) == IncrementingCountCursorModel: 2212 return self.create_concurrent_cursor_from_incrementing_count_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2213 model_type=IncrementingCountCursorModel, 2214 component_definition=model.incremental_sync.__dict__, 2215 stream_name=model.name or "", 2216 stream_namespace=None, 2217 config=config or {}, 2218 stream_state_migrations=state_transformations, 2219 ) 2220 elif type(model.incremental_sync) == DatetimeBasedCursorModel: 2221 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2222 model_type=type(model.incremental_sync), 2223 component_definition=model.incremental_sync.__dict__, 2224 stream_name=model.name or "", 2225 stream_namespace=None, 2226 config=config or {}, 2227 stream_state_migrations=state_transformations, 2228 attempt_to_create_cursor_if_not_provided=True, 2229 ) 2230 else: 2231 raise ValueError( 2232 f"Incremental sync of type {type(model.incremental_sync)} is not supported" 2233 ) 2234 return None 2235 2236 def _build_resumable_cursor( 2237 self, 2238 model: Union[ 2239 AsyncRetrieverModel, 2240 CustomRetrieverModel, 2241 SimpleRetrieverModel, 2242 ], 2243 stream_slicer: Optional[PartitionRouter], 2244 ) -> Optional[StreamSlicer]: 2245 if hasattr(model, "paginator") and model.paginator and not stream_slicer: 2246 # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` 2247 return ResumableFullRefreshCursor(parameters={}) 2248 elif stream_slicer: 2249 # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` 2250 return PerPartitionCursor( 2251 cursor_factory=CursorFactory( 2252 create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) 2253 ), 2254 partition_router=stream_slicer, 2255 ) 2256 return None 2257 2258 def _merge_stream_slicers( 2259 self, model: DeclarativeStreamModel, config: Config 2260 ) -> Optional[StreamSlicer]: 2261 retriever_model = model.retriever 2262 2263 stream_slicer = self._build_stream_slicer_from_partition_router( 2264 retriever_model, config, stream_name=model.name 2265 ) 2266 2267 if retriever_model.type == "AsyncRetriever": 2268 is_not_datetime_cursor = ( 2269 model.incremental_sync.type != "DatetimeBasedCursor" 2270 if model.incremental_sync 2271 else None 2272 ) 2273 is_partition_router = ( 2274 
bool(retriever_model.partition_router) if model.incremental_sync else None 2275 ) 2276 2277 if is_not_datetime_cursor: 2278 # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the 2279 # support of unordered slices (for example, when we trigger reports for January and February, the report 2280 # in February can be completed first). Once we have support for a custom concurrent cursor or have a new 2281 # implementation available in the CDK, we can enable more cursors here. 2282 raise ValueError( 2283 "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet." 2284 ) 2285 2286 if is_partition_router and not stream_slicer: 2287 # Note that this development is also being done in parallel to the per partition development; once that is merged, 2288 # we could support this here by calling create_concurrent_cursor_from_perpartition_cursor 2289 raise ValueError("Per partition state is not supported yet for AsyncRetriever.") 2290 2291 if model.incremental_sync: 2292 return self._build_incremental_cursor(model, stream_slicer, config) 2293 2294 return ( 2295 stream_slicer 2296 if self._disable_resumable_full_refresh 2297 else self._build_resumable_cursor(retriever_model, stream_slicer) 2298 ) 2299 2300 def create_default_error_handler( 2301 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2302 ) -> DefaultErrorHandler: 2303 backoff_strategies = [] 2304 if model.backoff_strategies: 2305 for backoff_strategy_model in model.backoff_strategies: 2306 backoff_strategies.append( 2307 self._create_component_from_model(model=backoff_strategy_model, config=config) 2308 ) 2309 2310 response_filters = [] 2311 if model.response_filters: 2312 for response_filter_model in model.response_filters: 2313 response_filters.append( 2314 self._create_component_from_model(model=response_filter_model, config=config) 2315 ) 2316 response_filters.append( 2317 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2318 ) 2319 2320 return DefaultErrorHandler( 2321 backoff_strategies=backoff_strategies, 2322 max_retries=model.max_retries, 2323 response_filters=response_filters, 2324 config=config, 2325 parameters=model.parameters or {}, 2326 ) 2327 2328 def create_default_paginator( 2329 self, 2330 model: DefaultPaginatorModel, 2331 config: Config, 2332 *, 2333 url_base: str, 2334 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2335 decoder: Optional[Decoder] = None, 2336 cursor_used_for_stop_condition: Optional[Cursor] = None, 2337 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2338 if decoder: 2339 if self._is_supported_decoder_for_pagination(decoder): 2340 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2341 else: 2342 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2343 else: 2344 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2345 page_size_option = ( 2346 self._create_component_from_model(model=model.page_size_option, config=config) 2347 if model.page_size_option 2348 else None 2349 ) 2350 page_token_option = ( 2351 self._create_component_from_model(model=model.page_token_option, config=config) 2352 if model.page_token_option 2353 else None 2354 ) 2355 pagination_strategy = self._create_component_from_model( 2356 model=model.pagination_strategy, 2357 config=config, 2358 decoder=decoder_to_use, 2359 extractor_model=extractor_model, 2360 ) 2361 if cursor_used_for_stop_condition: 2362 pagination_strategy =
StopConditionPaginationStrategyDecorator( 2363 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2364 ) 2365 paginator = DefaultPaginator( 2366 decoder=decoder_to_use, 2367 page_size_option=page_size_option, 2368 page_token_option=page_token_option, 2369 pagination_strategy=pagination_strategy, 2370 url_base=url_base, 2371 config=config, 2372 parameters=model.parameters or {}, 2373 ) 2374 if self._limit_pages_fetched_per_slice: 2375 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2376 return paginator 2377 2378 def create_dpath_extractor( 2379 self, 2380 model: DpathExtractorModel, 2381 config: Config, 2382 decoder: Optional[Decoder] = None, 2383 **kwargs: Any, 2384 ) -> DpathExtractor: 2385 if decoder: 2386 decoder_to_use = decoder 2387 else: 2388 decoder_to_use = JsonDecoder(parameters={}) 2389 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2390 return DpathExtractor( 2391 decoder=decoder_to_use, 2392 field_path=model_field_path, 2393 config=config, 2394 parameters=model.parameters or {}, 2395 ) 2396 2397 @staticmethod 2398 def create_response_to_file_extractor( 2399 model: ResponseToFileExtractorModel, 2400 **kwargs: Any, 2401 ) -> ResponseToFileExtractor: 2402 return ResponseToFileExtractor(parameters=model.parameters or {}) 2403 2404 @staticmethod 2405 def create_exponential_backoff_strategy( 2406 model: ExponentialBackoffStrategyModel, config: Config 2407 ) -> ExponentialBackoffStrategy: 2408 return ExponentialBackoffStrategy( 2409 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2410 ) 2411 2412 @staticmethod 2413 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2414 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2415 2416 def create_http_requester( 2417 self, 2418 model: HttpRequesterModel, 2419 config: Config, 2420 decoder: Decoder = JsonDecoder(parameters={}), 2421 query_properties_key: Optional[str] = None, 2422 use_cache: Optional[bool] = None, 2423 *, 2424 name: str, 2425 ) -> HttpRequester: 2426 authenticator = ( 2427 self._create_component_from_model( 2428 model=model.authenticator, 2429 config=config, 2430 url_base=model.url or model.url_base, 2431 name=name, 2432 decoder=decoder, 2433 ) 2434 if model.authenticator 2435 else None 2436 ) 2437 error_handler = ( 2438 self._create_component_from_model(model=model.error_handler, config=config) 2439 if model.error_handler 2440 else DefaultErrorHandler( 2441 backoff_strategies=[], 2442 response_filters=[], 2443 config=config, 2444 parameters=model.parameters or {}, 2445 ) 2446 ) 2447 2448 api_budget = self._api_budget 2449 2450 # Removes QueryProperties components from the interpolated mappings because it has been designed 2451 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2452 # instead of through jinja interpolation 2453 request_parameters: Optional[Union[str, Mapping[str, str]]] 2454 if isinstance(model.request_parameters, Mapping): 2455 request_parameters = self._remove_query_properties(model.request_parameters) 2456 else: 2457 request_parameters = model.request_parameters 2458 2459 request_options_provider = InterpolatedRequestOptionsProvider( 2460 request_body=model.request_body, 2461 request_body_data=model.request_body_data, 2462 request_body_json=model.request_body_json, 2463 request_headers=model.request_headers, 2464 request_parameters=request_parameters, 2465 
query_properties_key=query_properties_key, 2466 config=config, 2467 parameters=model.parameters or {}, 2468 ) 2469 2470 assert model.use_cache is not None # for mypy 2471 assert model.http_method is not None # for mypy 2472 2473 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2474 2475 return HttpRequester( 2476 name=name, 2477 url=model.url, 2478 url_base=model.url_base, 2479 path=model.path, 2480 authenticator=authenticator, 2481 error_handler=error_handler, 2482 api_budget=api_budget, 2483 http_method=HttpMethod[model.http_method.value], 2484 request_options_provider=request_options_provider, 2485 config=config, 2486 disable_retries=self._disable_retries, 2487 parameters=model.parameters or {}, 2488 message_repository=self._message_repository, 2489 use_cache=should_use_cache, 2490 decoder=decoder, 2491 stream_response=decoder.is_stream_response() if decoder else False, 2492 ) 2493 2494 @staticmethod 2495 def create_http_response_filter( 2496 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2497 ) -> HttpResponseFilter: 2498 if model.action: 2499 action = ResponseAction(model.action.value) 2500 else: 2501 action = None 2502 2503 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2504 2505 http_codes = ( 2506 set(model.http_codes) if model.http_codes else set() 2507 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2508 2509 return HttpResponseFilter( 2510 action=action, 2511 failure_type=failure_type, 2512 error_message=model.error_message or "", 2513 error_message_contains=model.error_message_contains or "", 2514 http_codes=http_codes, 2515 predicate=model.predicate or "", 2516 config=config, 2517 parameters=model.parameters or {}, 2518 ) 2519 2520 @staticmethod 2521 def create_inline_schema_loader( 2522 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2523 ) -> InlineSchemaLoader: 2524 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2525 2526 def create_complex_field_type( 2527 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2528 ) -> ComplexFieldType: 2529 items = ( 2530 self._create_component_from_model(model=model.items, config=config) 2531 if isinstance(model.items, ComplexFieldTypeModel) 2532 else model.items 2533 ) 2534 2535 return ComplexFieldType(field_type=model.field_type, items=items) 2536 2537 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2538 target_type = ( 2539 self._create_component_from_model(model=model.target_type, config=config) 2540 if isinstance(model.target_type, ComplexFieldTypeModel) 2541 else model.target_type 2542 ) 2543 2544 return TypesMap( 2545 target_type=target_type, 2546 current_type=model.current_type, 2547 condition=model.condition if model.condition is not None else "True", 2548 ) 2549 2550 def create_schema_type_identifier( 2551 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2552 ) -> SchemaTypeIdentifier: 2553 types_mapping = [] 2554 if model.types_mapping: 2555 types_mapping.extend( 2556 [ 2557 self._create_component_from_model(types_map, config=config) 2558 for types_map in model.types_mapping 2559 ] 2560 ) 2561 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2562 [x for x in model.schema_pointer] if model.schema_pointer else [] 2563 ) 2564 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2565 model_type_pointer: 
Optional[List[Union[InterpolatedString, str]]] = ( 2566 [x for x in model.type_pointer] if model.type_pointer else None 2567 ) 2568 2569 return SchemaTypeIdentifier( 2570 schema_pointer=model_schema_pointer, 2571 key_pointer=model_key_pointer, 2572 type_pointer=model_type_pointer, 2573 types_mapping=types_mapping, 2574 parameters=model.parameters or {}, 2575 ) 2576 2577 def create_dynamic_schema_loader( 2578 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2579 ) -> DynamicSchemaLoader: 2580 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2581 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2582 2583 schema_transformations = [] 2584 if model.schema_transformations: 2585 for transformation_model in model.schema_transformations: 2586 schema_transformations.append( 2587 self._create_component_from_model(model=transformation_model, config=config) 2588 ) 2589 name = "dynamic_properties" 2590 retriever = self._create_component_from_model( 2591 model=model.retriever, 2592 config=config, 2593 name=name, 2594 primary_key=None, 2595 stream_slicer=combined_slicers, 2596 transformations=[], 2597 use_cache=True, 2598 log_formatter=( 2599 lambda response: format_http_message( 2600 response, 2601 f"Schema loader '{name}' request", 2602 f"Request performed in order to extract schema.", 2603 name, 2604 is_auxiliary=True, 2605 ) 2606 ), 2607 ) 2608 schema_type_identifier = self._create_component_from_model( 2609 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2610 ) 2611 schema_filter = ( 2612 self._create_component_from_model( 2613 model.schema_filter, config=config, parameters=model.parameters or {} 2614 ) 2615 if model.schema_filter is not None 2616 else None 2617 ) 2618 2619 return DynamicSchemaLoader( 2620 retriever=retriever, 2621 config=config, 2622 schema_transformations=schema_transformations, 2623 schema_filter=schema_filter, 2624 schema_type_identifier=schema_type_identifier, 2625 parameters=model.parameters or {}, 2626 ) 2627 2628 @staticmethod 2629 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2630 return JsonDecoder(parameters={}) 2631 2632 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2633 return CompositeRawDecoder( 2634 parser=ModelToComponentFactory._get_parser(model, config), 2635 stream_response=False if self._emit_connector_builder_messages else True, 2636 ) 2637 2638 def create_jsonl_decoder( 2639 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2640 ) -> Decoder: 2641 return CompositeRawDecoder( 2642 parser=ModelToComponentFactory._get_parser(model, config), 2643 stream_response=False if self._emit_connector_builder_messages else True, 2644 ) 2645 2646 def create_gzip_decoder( 2647 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2648 ) -> Decoder: 2649 _compressed_response_types = { 2650 "gzip", 2651 "x-gzip", 2652 "gzip, deflate", 2653 "x-gzip, deflate", 2654 "application/zip", 2655 "application/gzip", 2656 "application/x-gzip", 2657 "application/x-zip-compressed", 2658 } 2659 2660 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2661 2662 if self._emit_connector_builder_messages: 2663 # This is very surprising but if the response is not streamed, 2664 # CompositeRawDecoder calls response.content and the requests library actually uncompress the 
data as opposed to response.raw, 2665 # which uses urllib3 directly and does not uncompress the data. 2666 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2667 2668 return CompositeRawDecoder.by_headers( 2669 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2670 stream_response=True, 2671 fallback_parser=gzip_parser.inner_parser, 2672 ) 2673 2674 @staticmethod 2675 def create_incrementing_count_cursor( 2676 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2677 ) -> DatetimeBasedCursor: 2678 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2679 # we still parse models into components. The issue is that there's no runtime implementation of a 2680 # IncrementingCountCursor. 2681 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 2682 return DatetimeBasedCursor( 2683 cursor_field=model.cursor_field, 2684 datetime_format="%Y-%m-%d", 2685 start_datetime="2024-12-12", 2686 config=config, 2687 parameters={}, 2688 ) 2689 2690 @staticmethod 2691 def create_iterable_decoder( 2692 model: IterableDecoderModel, config: Config, **kwargs: Any 2693 ) -> IterableDecoder: 2694 return IterableDecoder(parameters={}) 2695 2696 @staticmethod 2697 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2698 return XmlDecoder(parameters={}) 2699 2700 def create_zipfile_decoder( 2701 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2702 ) -> ZipfileDecoder: 2703 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2704 2705 @staticmethod 2706 def _get_parser(model: BaseModel, config: Config) -> Parser: 2707 if isinstance(model, JsonDecoderModel): 2708 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2709 return JsonParser() 2710 elif isinstance(model, JsonlDecoderModel): 2711 return JsonLineParser() 2712 elif isinstance(model, CsvDecoderModel): 2713 return CsvParser( 2714 encoding=model.encoding, 2715 delimiter=model.delimiter, 2716 set_values_to_none=model.set_values_to_none, 2717 ) 2718 elif isinstance(model, GzipDecoderModel): 2719 return GzipParser( 2720 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2721 ) 2722 elif isinstance( 2723 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2724 ): 2725 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2726 2727 raise ValueError(f"Unknown decoder type {model}") 2728 2729 @staticmethod 2730 def create_json_file_schema_loader( 2731 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2732 ) -> JsonFileSchemaLoader: 2733 return JsonFileSchemaLoader( 2734 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2735 ) 2736 2737 @staticmethod 2738 def create_jwt_authenticator( 2739 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2740 ) -> JwtAuthenticator: 2741 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2742 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2743 return JwtAuthenticator( 2744 config=config, 2745 parameters=model.parameters or {}, 2746 algorithm=JwtAlgorithm(model.algorithm.value), 2747 secret_key=model.secret_key, 2748 
base64_encode_secret_key=model.base64_encode_secret_key, 2749 token_duration=model.token_duration, 2750 header_prefix=model.header_prefix, 2751 kid=jwt_headers.kid, 2752 typ=jwt_headers.typ, 2753 cty=jwt_headers.cty, 2754 iss=jwt_payload.iss, 2755 sub=jwt_payload.sub, 2756 aud=jwt_payload.aud, 2757 additional_jwt_headers=model.additional_jwt_headers, 2758 additional_jwt_payload=model.additional_jwt_payload, 2759 ) 2760 2761 def create_list_partition_router( 2762 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2763 ) -> ListPartitionRouter: 2764 request_option = ( 2765 self._create_component_from_model(model.request_option, config) 2766 if model.request_option 2767 else None 2768 ) 2769 return ListPartitionRouter( 2770 cursor_field=model.cursor_field, 2771 request_option=request_option, 2772 values=model.values, 2773 config=config, 2774 parameters=model.parameters or {}, 2775 ) 2776 2777 @staticmethod 2778 def create_min_max_datetime( 2779 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2780 ) -> MinMaxDatetime: 2781 return MinMaxDatetime( 2782 datetime=model.datetime, 2783 datetime_format=model.datetime_format or "", 2784 max_datetime=model.max_datetime or "", 2785 min_datetime=model.min_datetime or "", 2786 parameters=model.parameters or {}, 2787 ) 2788 2789 @staticmethod 2790 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2791 return NoAuth(parameters=model.parameters or {}) 2792 2793 @staticmethod 2794 def create_no_pagination( 2795 model: NoPaginationModel, config: Config, **kwargs: Any 2796 ) -> NoPagination: 2797 return NoPagination(parameters={}) 2798 2799 def create_oauth_authenticator( 2800 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2801 ) -> DeclarativeOauth2Authenticator: 2802 profile_assertion = ( 2803 self._create_component_from_model(model.profile_assertion, config=config) 2804 if model.profile_assertion 2805 else None 2806 ) 2807 2808 if model.refresh_token_updater: 2809 # ignore type error because fixing it would have a lot of dependencies, revisit later 2810 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2811 config, 2812 InterpolatedString.create( 2813 model.token_refresh_endpoint, # type: ignore 2814 parameters=model.parameters or {}, 2815 ).eval(config), 2816 access_token_name=InterpolatedString.create( 2817 model.access_token_name or "access_token", parameters=model.parameters or {} 2818 ).eval(config), 2819 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2820 expires_in_name=InterpolatedString.create( 2821 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2822 ).eval(config), 2823 client_id_name=InterpolatedString.create( 2824 model.client_id_name or "client_id", parameters=model.parameters or {} 2825 ).eval(config), 2826 client_id=InterpolatedString.create( 2827 model.client_id, parameters=model.parameters or {} 2828 ).eval(config) 2829 if model.client_id 2830 else model.client_id, 2831 client_secret_name=InterpolatedString.create( 2832 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2833 ).eval(config), 2834 client_secret=InterpolatedString.create( 2835 model.client_secret, parameters=model.parameters or {} 2836 ).eval(config) 2837 if model.client_secret 2838 else model.client_secret, 2839 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2840 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2841 
token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2842 grant_type_name=InterpolatedString.create( 2843 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2844 ).eval(config), 2845 grant_type=InterpolatedString.create( 2846 model.grant_type or "refresh_token", parameters=model.parameters or {} 2847 ).eval(config), 2848 refresh_request_body=InterpolatedMapping( 2849 model.refresh_request_body or {}, parameters=model.parameters or {} 2850 ).eval(config), 2851 refresh_request_headers=InterpolatedMapping( 2852 model.refresh_request_headers or {}, parameters=model.parameters or {} 2853 ).eval(config), 2854 scopes=model.scopes, 2855 token_expiry_date_format=model.token_expiry_date_format, 2856 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2857 message_repository=self._message_repository, 2858 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2859 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2860 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2861 ) 2862 # ignore type error because fixing it would have a lot of dependencies, revisit later 2863 return DeclarativeOauth2Authenticator( # type: ignore 2864 access_token_name=model.access_token_name or "access_token", 2865 access_token_value=model.access_token_value, 2866 client_id_name=model.client_id_name or "client_id", 2867 client_id=model.client_id, 2868 client_secret_name=model.client_secret_name or "client_secret", 2869 client_secret=model.client_secret, 2870 expires_in_name=model.expires_in_name or "expires_in", 2871 grant_type_name=model.grant_type_name or "grant_type", 2872 grant_type=model.grant_type or "refresh_token", 2873 refresh_request_body=model.refresh_request_body, 2874 refresh_request_headers=model.refresh_request_headers, 2875 refresh_token_name=model.refresh_token_name or "refresh_token", 2876 refresh_token=model.refresh_token, 2877 scopes=model.scopes, 2878 token_expiry_date=model.token_expiry_date, 2879 token_expiry_date_format=model.token_expiry_date_format, 2880 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2881 token_refresh_endpoint=model.token_refresh_endpoint, 2882 config=config, 2883 parameters=model.parameters or {}, 2884 message_repository=self._message_repository, 2885 profile_assertion=profile_assertion, 2886 use_profile_assertion=model.use_profile_assertion, 2887 ) 2888 2889 def create_offset_increment( 2890 self, 2891 model: OffsetIncrementModel, 2892 config: Config, 2893 decoder: Decoder, 2894 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2895 **kwargs: Any, 2896 ) -> OffsetIncrement: 2897 if isinstance(decoder, PaginationDecoderDecorator): 2898 inner_decoder = decoder.decoder 2899 else: 2900 inner_decoder = decoder 2901 decoder = PaginationDecoderDecorator(decoder=decoder) 2902 2903 if self._is_supported_decoder_for_pagination(inner_decoder): 2904 decoder_to_use = decoder 2905 else: 2906 raise ValueError( 2907 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2908 ) 2909 2910 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2911 # so that it can be shared by OffSetIncrement and RecordSelector. 
However, due to how we instantiate the 2912 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2913 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2914 # When we have more time to investigate we can look into reusing the same component. 2915 extractor = ( 2916 self._create_component_from_model( 2917 model=extractor_model, config=config, decoder=decoder_to_use 2918 ) 2919 if extractor_model 2920 else None 2921 ) 2922 2923 return OffsetIncrement( 2924 page_size=model.page_size, 2925 config=config, 2926 decoder=decoder_to_use, 2927 extractor=extractor, 2928 inject_on_first_request=model.inject_on_first_request or False, 2929 parameters=model.parameters or {}, 2930 ) 2931 2932 @staticmethod 2933 def create_page_increment( 2934 model: PageIncrementModel, config: Config, **kwargs: Any 2935 ) -> PageIncrement: 2936 return PageIncrement( 2937 page_size=model.page_size, 2938 config=config, 2939 start_from_page=model.start_from_page or 0, 2940 inject_on_first_request=model.inject_on_first_request or False, 2941 parameters=model.parameters or {}, 2942 ) 2943 2944 def create_parent_stream_config( 2945 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2946 ) -> ParentStreamConfig: 2947 declarative_stream = self._create_component_from_model( 2948 model.stream, config=config, **kwargs 2949 ) 2950 request_option = ( 2951 self._create_component_from_model(model.request_option, config=config) 2952 if model.request_option 2953 else None 2954 ) 2955 2956 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2957 raise ValueError( 2958 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 
            )

        model_lazy_read_pointer: List[Union[InterpolatedString, str]] = (
            [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else []
        )

        return ParentStreamConfig(
            parent_key=model.parent_key,
            request_option=request_option,
            stream=declarative_stream,
            partition_field=model.partition_field,
            config=config,
            incremental_dependency=model.incremental_dependency or False,
            parameters=model.parameters or {},
            extra_fields=model.extra_fields,
            lazy_read_pointer=model_lazy_read_pointer,
        )

    def create_properties_from_endpoint(
        self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any
    ) -> PropertiesFromEndpoint:
        retriever = self._create_component_from_model(
            model=model.retriever,
            config=config,
            name="dynamic_properties",
            primary_key=None,
            stream_slicer=None,
            transformations=[],
            # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be
            # called for every slice being processed, and it is highly unlikely for the response to differ.
            use_cache=True,
        )
        return PropertiesFromEndpoint(
            property_field_path=model.property_field_path,
            retriever=retriever,
            config=config,
            parameters=model.parameters or {},
        )

    def create_property_chunking(
        self, model: PropertyChunkingModel, config: Config, **kwargs: Any
    ) -> PropertyChunking:
        record_merge_strategy = (
            self._create_component_from_model(
                model=model.record_merge_strategy, config=config, **kwargs
            )
            if model.record_merge_strategy
            else None
        )

        property_limit_type: PropertyLimitType
        match model.property_limit_type:
            case PropertyLimitTypeModel.property_count:
                property_limit_type = PropertyLimitType.property_count
            case PropertyLimitTypeModel.characters:
                property_limit_type = PropertyLimitType.characters
            case _:
                # Reference the model's value here; property_limit_type is never assigned on this branch.
                raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}")

        return PropertyChunking(
            property_limit_type=property_limit_type,
            property_limit=model.property_limit,
            record_merge_strategy=record_merge_strategy,
            config=config,
            parameters=model.parameters or {},
        )

    def create_query_properties(
        self, model: QueryPropertiesModel, config: Config, **kwargs: Any
    ) -> QueryProperties:
        if isinstance(model.property_list, list):
            property_list = model.property_list
        else:
            property_list = self._create_component_from_model(
                model=model.property_list, config=config, **kwargs
            )

        property_chunking = (
            self._create_component_from_model(
                model=model.property_chunking, config=config, **kwargs
            )
            if model.property_chunking
            else None
        )

        return QueryProperties(
            property_list=property_list,
            always_include_properties=model.always_include_properties,
            property_chunking=property_chunking,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_record_filter(
        model: RecordFilterModel, config: Config, **kwargs: Any
    ) -> RecordFilter:
        return RecordFilter(
            condition=model.condition or "", config=config, parameters=model.parameters or {}
        )

    @staticmethod
    def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath:
        return RequestPath(parameters={})

    @staticmethod
    def
create_request_option( 3064 model: RequestOptionModel, config: Config, **kwargs: Any 3065 ) -> RequestOption: 3066 inject_into = RequestOptionType(model.inject_into.value) 3067 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3068 [ 3069 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3070 for segment in model.field_path 3071 ] 3072 if model.field_path 3073 else None 3074 ) 3075 field_name = ( 3076 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3077 if model.field_name 3078 else None 3079 ) 3080 return RequestOption( 3081 field_name=field_name, 3082 field_path=field_path, 3083 inject_into=inject_into, 3084 parameters=kwargs.get("parameters", {}), 3085 ) 3086 3087 def create_record_selector( 3088 self, 3089 model: RecordSelectorModel, 3090 config: Config, 3091 *, 3092 name: str, 3093 transformations: List[RecordTransformation] | None = None, 3094 decoder: Decoder | None = None, 3095 client_side_incremental_sync: Dict[str, Any] | None = None, 3096 file_uploader: Optional[DefaultFileUploader] = None, 3097 **kwargs: Any, 3098 ) -> RecordSelector: 3099 extractor = self._create_component_from_model( 3100 model=model.extractor, decoder=decoder, config=config 3101 ) 3102 record_filter = ( 3103 self._create_component_from_model(model.record_filter, config=config) 3104 if model.record_filter 3105 else None 3106 ) 3107 3108 transform_before_filtering = ( 3109 False if model.transform_before_filtering is None else model.transform_before_filtering 3110 ) 3111 if client_side_incremental_sync: 3112 record_filter = ClientSideIncrementalRecordFilterDecorator( 3113 config=config, 3114 parameters=model.parameters, 3115 condition=model.record_filter.condition 3116 if (model.record_filter and hasattr(model.record_filter, "condition")) 3117 else None, 3118 **client_side_incremental_sync, 3119 ) 3120 transform_before_filtering = ( 3121 True 3122 if model.transform_before_filtering is None 3123 else model.transform_before_filtering 3124 ) 3125 3126 if model.schema_normalization is None: 3127 # default to no schema normalization if not set 3128 model.schema_normalization = SchemaNormalizationModel.None_ 3129 3130 schema_normalization = ( 3131 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3132 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3133 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3134 ) 3135 3136 return RecordSelector( 3137 extractor=extractor, 3138 name=name, 3139 config=config, 3140 record_filter=record_filter, 3141 transformations=transformations or [], 3142 file_uploader=file_uploader, 3143 schema_normalization=schema_normalization, 3144 parameters=model.parameters or {}, 3145 transform_before_filtering=transform_before_filtering, 3146 ) 3147 3148 @staticmethod 3149 def create_remove_fields( 3150 model: RemoveFieldsModel, config: Config, **kwargs: Any 3151 ) -> RemoveFields: 3152 return RemoveFields( 3153 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3154 ) 3155 3156 def create_selective_authenticator( 3157 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3158 ) -> DeclarativeAuthenticator: 3159 authenticators = { 3160 name: self._create_component_from_model(model=auth, config=config) 3161 for name, auth in model.authenticators.items() 3162 } 3163 # SelectiveAuthenticator will return instance of 
DeclarativeAuthenticator or raise ValueError error 3164 return SelectiveAuthenticator( # type: ignore[abstract] 3165 config=config, 3166 authenticators=authenticators, 3167 authenticator_selection_path=model.authenticator_selection_path, 3168 **kwargs, 3169 ) 3170 3171 @staticmethod 3172 def create_legacy_session_token_authenticator( 3173 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3174 ) -> LegacySessionTokenAuthenticator: 3175 return LegacySessionTokenAuthenticator( 3176 api_url=url_base, 3177 header=model.header, 3178 login_url=model.login_url, 3179 password=model.password or "", 3180 session_token=model.session_token or "", 3181 session_token_response_key=model.session_token_response_key or "", 3182 username=model.username or "", 3183 validate_session_url=model.validate_session_url, 3184 config=config, 3185 parameters=model.parameters or {}, 3186 ) 3187 3188 def create_simple_retriever( 3189 self, 3190 model: SimpleRetrieverModel, 3191 config: Config, 3192 *, 3193 name: str, 3194 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3195 stream_slicer: Optional[StreamSlicer], 3196 request_options_provider: Optional[RequestOptionsProvider] = None, 3197 stop_condition_cursor: Optional[Cursor] = None, 3198 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3199 transformations: List[RecordTransformation], 3200 file_uploader: Optional[DefaultFileUploader] = None, 3201 incremental_sync: Optional[ 3202 Union[ 3203 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 3204 ] 3205 ] = None, 3206 use_cache: Optional[bool] = None, 3207 log_formatter: Optional[Callable[[Response], Any]] = None, 3208 **kwargs: Any, 3209 ) -> SimpleRetriever: 3210 def _get_url() -> str: 3211 """ 3212 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3213 This is needed because the URL is not set until the requester is created. 
3214 """ 3215 3216 _url: str = ( 3217 model.requester.url 3218 if hasattr(model.requester, "url") and model.requester.url is not None 3219 else requester.get_url() 3220 ) 3221 _url_base: str = ( 3222 model.requester.url_base 3223 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3224 else requester.get_url_base() 3225 ) 3226 3227 return _url or _url_base 3228 3229 decoder = ( 3230 self._create_component_from_model(model=model.decoder, config=config) 3231 if model.decoder 3232 else JsonDecoder(parameters={}) 3233 ) 3234 record_selector = self._create_component_from_model( 3235 model=model.record_selector, 3236 name=name, 3237 config=config, 3238 decoder=decoder, 3239 transformations=transformations, 3240 client_side_incremental_sync=client_side_incremental_sync, 3241 file_uploader=file_uploader, 3242 ) 3243 3244 query_properties: Optional[QueryProperties] = None 3245 query_properties_key: Optional[str] = None 3246 if self._query_properties_in_request_parameters(model.requester): 3247 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3248 # places instead of default to request_parameters which isn't clearly documented 3249 if ( 3250 hasattr(model.requester, "fetch_properties_from_endpoint") 3251 and model.requester.fetch_properties_from_endpoint 3252 ): 3253 raise ValueError( 3254 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3255 ) 3256 3257 query_properties_definitions = [] 3258 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3259 if isinstance(request_parameter, QueryPropertiesModel): 3260 query_properties_key = key 3261 query_properties_definitions.append(request_parameter) 3262 3263 if len(query_properties_definitions) > 1: 3264 raise ValueError( 3265 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3266 ) 3267 3268 if len(query_properties_definitions) == 1: 3269 query_properties = self._create_component_from_model( 3270 model=query_properties_definitions[0], config=config 3271 ) 3272 elif ( 3273 hasattr(model.requester, "fetch_properties_from_endpoint") 3274 and model.requester.fetch_properties_from_endpoint 3275 ): 3276 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3277 query_properties_definition = QueryPropertiesModel( 3278 type="QueryProperties", 3279 property_list=model.requester.fetch_properties_from_endpoint, 3280 always_include_properties=None, 3281 property_chunking=None, 3282 ) # type: ignore # $parameters has a default value 3283 3284 query_properties = self.create_query_properties( 3285 model=query_properties_definition, 3286 config=config, 3287 ) 3288 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3289 query_properties = self.create_query_properties( 3290 model=model.requester.query_properties, 3291 config=config, 3292 ) 3293 3294 requester = self._create_component_from_model( 3295 model=model.requester, 3296 decoder=decoder, 3297 name=name, 3298 query_properties_key=query_properties_key, 3299 use_cache=use_cache, 3300 config=config, 3301 ) 3302 3303 # Define cursor only if per partition or common incremental support is needed 3304 cursor = stream_slicer if 
isinstance(stream_slicer, DeclarativeCursor) else None

        if (
            not isinstance(stream_slicer, DatetimeBasedCursor)
            or type(stream_slicer) is not DatetimeBasedCursor
        ):
            # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
            # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
            # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor can still act as the
            # SimpleRetriever's request_options_provider.
            request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={})
        elif not request_options_provider:
            request_options_provider = DefaultRequestOptionsProvider(parameters={})

        stream_slicer = stream_slicer or SinglePartitionRouter(parameters={})
        if self._should_limit_slices_fetched():
            stream_slicer = cast(
                StreamSlicer,
                StreamSlicerTestReadDecorator(
                    wrapped_slicer=stream_slicer,
                    maximum_number_of_slices=self._limit_slices_fetched or 5,
                ),
            )

        paginator = (
            self._create_component_from_model(
                model=model.paginator,
                config=config,
                url_base=_get_url(),
                extractor_model=model.record_selector.extractor,
                decoder=decoder,
                cursor_used_for_stop_condition=stop_condition_cursor or None,
            )
            if model.paginator
            else NoPagination(parameters={})
        )

        ignore_stream_slicer_parameters_on_paginated_requests = (
            model.ignore_stream_slicer_parameters_on_paginated_requests or False
        )

        if (
            model.partition_router
            and isinstance(model.partition_router, SubstreamPartitionRouterModel)
            and not bool(self._connector_state_manager.get_stream_state(name, None))
            and any(
                parent_stream_config.lazy_read_pointer
                for parent_stream_config in model.partition_router.parent_stream_configs
            )
        ):
            if incremental_sync:
                if incremental_sync.type != "DatetimeBasedCursor":
                    raise ValueError(
                        f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                    )

                elif incremental_sync.step or incremental_sync.cursor_granularity:
                    raise ValueError(
                        f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                    )

            if model.decoder and model.decoder.type != "JsonDecoder":
                raise ValueError(
                    f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3368 ) 3369 3370 return LazySimpleRetriever( 3371 name=name, 3372 paginator=paginator, 3373 primary_key=primary_key, 3374 requester=requester, 3375 record_selector=record_selector, 3376 stream_slicer=stream_slicer, 3377 request_option_provider=request_options_provider, 3378 cursor=cursor, 3379 config=config, 3380 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3381 parameters=model.parameters or {}, 3382 ) 3383 3384 return SimpleRetriever( 3385 name=name, 3386 paginator=paginator, 3387 primary_key=primary_key, 3388 requester=requester, 3389 record_selector=record_selector, 3390 stream_slicer=stream_slicer, 3391 request_option_provider=request_options_provider, 3392 cursor=cursor, 3393 config=config, 3394 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3395 additional_query_properties=query_properties, 3396 log_formatter=self._get_log_formatter(log_formatter, name), 3397 parameters=model.parameters or {}, 3398 ) 3399 3400 def _get_log_formatter( 3401 self, log_formatter: Callable[[Response], Any] | None, name: str 3402 ) -> Callable[[Response], Any] | None: 3403 if self._should_limit_slices_fetched(): 3404 return ( 3405 ( 3406 lambda response: format_http_message( 3407 response, 3408 f"Stream '{name}' request", 3409 f"Request performed in order to extract records for stream '{name}'", 3410 name, 3411 ) 3412 ) 3413 if not log_formatter 3414 else log_formatter 3415 ) 3416 return None 3417 3418 def _should_limit_slices_fetched(self) -> bool: 3419 """ 3420 Returns True if the number of slices fetched should be limited, False otherwise. 3421 This is used to limit the number of slices fetched during tests. 3422 """ 3423 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3424 3425 @staticmethod 3426 def _query_properties_in_request_parameters( 3427 requester: Union[HttpRequesterModel, CustomRequesterModel], 3428 ) -> bool: 3429 if not hasattr(requester, "request_parameters"): 3430 return False 3431 request_parameters = requester.request_parameters 3432 if request_parameters and isinstance(request_parameters, Mapping): 3433 for request_parameter in request_parameters.values(): 3434 if isinstance(request_parameter, QueryPropertiesModel): 3435 return True 3436 return False 3437 3438 @staticmethod 3439 def _remove_query_properties( 3440 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3441 ) -> Mapping[str, str]: 3442 return { 3443 parameter_field: request_parameter 3444 for parameter_field, request_parameter in request_parameters.items() 3445 if not isinstance(request_parameter, QueryPropertiesModel) 3446 } 3447 3448 def create_state_delegating_stream( 3449 self, 3450 model: StateDelegatingStreamModel, 3451 config: Config, 3452 has_parent_state: Optional[bool] = None, 3453 **kwargs: Any, 3454 ) -> DeclarativeStream: 3455 if ( 3456 model.full_refresh_stream.name != model.name 3457 or model.name != model.incremental_stream.name 3458 ): 3459 raise ValueError( 3460 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3461 ) 3462 3463 stream_model = ( 3464 model.incremental_stream 3465 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3466 else model.full_refresh_stream 3467 ) 3468 3469 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description 3470 3471 def _create_async_job_status_mapping( 3472 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3473 ) -> Mapping[str, AsyncJobStatus]: 3474 api_status_to_cdk_status = {} 3475 for cdk_status, api_statuses in model.dict().items(): 3476 if cdk_status == "type": 3477 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3478 continue 3479 3480 for status in api_statuses: 3481 if status in api_status_to_cdk_status: 3482 raise ValueError( 3483 f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once" 3484 ) 3485 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3486 return api_status_to_cdk_status 3487 3488 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3489 match status: 3490 case "running": 3491 return AsyncJobStatus.RUNNING 3492 case "completed": 3493 return AsyncJobStatus.COMPLETED 3494 case "failed": 3495 return AsyncJobStatus.FAILED 3496 case "timeout": 3497 return AsyncJobStatus.TIMED_OUT 3498 case _: 3499 raise ValueError(f"Unsupported CDK status {status}") 3500 3501 def create_async_retriever( 3502 self, 3503 model: AsyncRetrieverModel, 3504 config: Config, 3505 *, 3506 name: str, 3507 primary_key: Optional[ 3508 Union[str, List[str], List[List[str]]] 3509 ], # this seems to be needed to match create_simple_retriever 3510 stream_slicer: Optional[StreamSlicer], 3511 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3512 transformations: List[RecordTransformation], 3513 **kwargs: Any, 3514 ) -> AsyncRetriever: 3515 def _get_download_retriever() -> SimpleRetriever: 3516 # We create a record selector for the download retriever 3517 # with no schema normalization and no transformations, neither record filter 3518 # as all this occurs in the record_selector of the AsyncRetriever 3519 record_selector = RecordSelector( 3520 extractor=download_extractor, 3521 name=name, 3522 record_filter=None, 3523 transformations=[], 3524 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3525 config=config, 3526 parameters={}, 3527 ) 3528 paginator = ( 3529 self._create_component_from_model( 3530 model=model.download_paginator, 3531 decoder=decoder, 3532 config=config, 3533 url_base="", 3534 ) 3535 if model.download_paginator 3536 else NoPagination(parameters={}) 3537 ) 3538 3539 return SimpleRetriever( 3540 requester=download_requester, 3541 record_selector=record_selector, 3542 primary_key=None, 3543 name=name, 3544 paginator=paginator, 3545 config=config, 3546 parameters={}, 3547 log_formatter=self._get_log_formatter(None, name), 3548 ) 3549 3550 def _get_job_timeout() -> datetime.timedelta: 3551 user_defined_timeout: Optional[int] = ( 3552 int( 3553 InterpolatedString.create( 3554 str(model.polling_job_timeout), 3555 parameters={}, 3556 ).eval(config) 3557 ) 3558 if model.polling_job_timeout 3559 else None 3560 ) 3561 3562 # check for user defined timeout during the test read or 15 minutes 3563 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3564 # default value for non-connector builder is 60 minutes. 
3565 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3566 3567 return ( 3568 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3569 ) 3570 3571 decoder = ( 3572 self._create_component_from_model(model=model.decoder, config=config) 3573 if model.decoder 3574 else JsonDecoder(parameters={}) 3575 ) 3576 record_selector = self._create_component_from_model( 3577 model=model.record_selector, 3578 config=config, 3579 decoder=decoder, 3580 name=name, 3581 transformations=transformations, 3582 client_side_incremental_sync=client_side_incremental_sync, 3583 ) 3584 3585 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3586 if self._should_limit_slices_fetched(): 3587 stream_slicer = cast( 3588 StreamSlicer, 3589 StreamSlicerTestReadDecorator( 3590 wrapped_slicer=stream_slicer, 3591 maximum_number_of_slices=self._limit_slices_fetched or 5, 3592 ), 3593 ) 3594 3595 creation_requester = self._create_component_from_model( 3596 model=model.creation_requester, 3597 decoder=decoder, 3598 config=config, 3599 name=f"job creation - {name}", 3600 ) 3601 polling_requester = self._create_component_from_model( 3602 model=model.polling_requester, 3603 decoder=decoder, 3604 config=config, 3605 name=f"job polling - {name}", 3606 ) 3607 job_download_components_name = f"job download - {name}" 3608 download_decoder = ( 3609 self._create_component_from_model(model=model.download_decoder, config=config) 3610 if model.download_decoder 3611 else JsonDecoder(parameters={}) 3612 ) 3613 download_extractor = ( 3614 self._create_component_from_model( 3615 model=model.download_extractor, 3616 config=config, 3617 decoder=download_decoder, 3618 parameters=model.parameters, 3619 ) 3620 if model.download_extractor 3621 else DpathExtractor( 3622 [], 3623 config=config, 3624 decoder=download_decoder, 3625 parameters=model.parameters or {}, 3626 ) 3627 ) 3628 download_requester = self._create_component_from_model( 3629 model=model.download_requester, 3630 decoder=download_decoder, 3631 config=config, 3632 name=job_download_components_name, 3633 ) 3634 download_retriever = _get_download_retriever() 3635 abort_requester = ( 3636 self._create_component_from_model( 3637 model=model.abort_requester, 3638 decoder=decoder, 3639 config=config, 3640 name=f"job abort - {name}", 3641 ) 3642 if model.abort_requester 3643 else None 3644 ) 3645 delete_requester = ( 3646 self._create_component_from_model( 3647 model=model.delete_requester, 3648 decoder=decoder, 3649 config=config, 3650 name=f"job delete - {name}", 3651 ) 3652 if model.delete_requester 3653 else None 3654 ) 3655 download_target_requester = ( 3656 self._create_component_from_model( 3657 model=model.download_target_requester, 3658 decoder=decoder, 3659 config=config, 3660 name=f"job extract_url - {name}", 3661 ) 3662 if model.download_target_requester 3663 else None 3664 ) 3665 status_extractor = self._create_component_from_model( 3666 model=model.status_extractor, decoder=decoder, config=config, name=name 3667 ) 3668 download_target_extractor = self._create_component_from_model( 3669 model=model.download_target_extractor, 3670 decoder=decoder, 3671 config=config, 3672 name=name, 3673 ) 3674 3675 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3676 creation_requester=creation_requester, 3677 polling_requester=polling_requester, 3678 download_retriever=download_retriever, 3679 download_target_requester=download_target_requester, 3680 abort_requester=abort_requester, 3681 
delete_requester=delete_requester,
            status_extractor=status_extractor,
            status_mapping=self._create_async_job_status_mapping(model.status_mapping, config),
            download_target_extractor=download_target_extractor,
            job_timeout=_get_job_timeout(),
        )

        async_job_partition_router = AsyncJobPartitionRouter(
            job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
                job_repository,
                stream_slices,
                self._job_tracker,
                self._message_repository,
                # FIXME: work would need to be done here in order to detect if a stream has a parent stream that is bulk
                has_bulk_parent=False,
                # set `job_max_retry` to 1 for the `Connector Builder` use case;
                # `None` means the default of 3 retry attempts is used under the hood.
                job_max_retry=1 if self._emit_connector_builder_messages else None,
            ),
            stream_slicer=stream_slicer,
            config=config,
            parameters=model.parameters or {},
        )

        return AsyncRetriever(
            record_selector=record_selector,
            stream_slicer=async_job_partition_router,
            config=config,
            parameters=model.parameters or {},
        )

    def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec:
        config_migrations = [
            self._create_component_from_model(migration, config)
            for migration in (
                model.config_normalization_rules.config_migrations
                if (
                    model.config_normalization_rules
                    and model.config_normalization_rules.config_migrations
                )
                else []
            )
        ]
        config_transformations = [
            self._create_component_from_model(transformation, config)
            for transformation in (
                model.config_normalization_rules.transformations
                if (
                    model.config_normalization_rules
                    and model.config_normalization_rules.transformations
                )
                else []
            )
        ]
        config_validations = [
            self._create_component_from_model(validation, config)
            for validation in (
                model.config_normalization_rules.validations
                if (
                    model.config_normalization_rules
                    and model.config_normalization_rules.validations
                )
                else []
            )
        ]

        return Spec(
            connection_specification=model.connection_specification,
            documentation_url=model.documentation_url,
            advanced_auth=model.advanced_auth,
            parameters={},
            config_migrations=config_migrations,
            config_transformations=config_transformations,
            config_validations=config_validations,
        )

    def create_substream_partition_router(
        self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any
    ) -> SubstreamPartitionRouter:
        parent_stream_configs = []
        if model.parent_stream_configs:
            parent_stream_configs.extend(
                [
                    self._create_message_repository_substream_wrapper(
                        model=parent_stream_config, config=config, **kwargs
                    )
                    for parent_stream_config in model.parent_stream_configs
                ]
            )

        return SubstreamPartitionRouter(
            parent_stream_configs=parent_stream_configs,
            parameters=model.parameters or {},
            config=config,
        )

    def _create_message_repository_substream_wrapper(
        self, model: ParentStreamConfigModel, config: Config, **kwargs: Any
    ) -> Any:
        substream_factory = ModelToComponentFactory(
            limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice,
            limit_slices_fetched=self._limit_slices_fetched,
            emit_connector_builder_messages=self._emit_connector_builder_messages,
disable_retries=self._disable_retries, 3785 disable_cache=self._disable_cache, 3786 message_repository=LogAppenderMessageRepositoryDecorator( 3787 {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, 3788 self._message_repository, 3789 self._evaluate_log_level(self._emit_connector_builder_messages), 3790 ), 3791 ) 3792 3793 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3794 has_parent_state = bool( 3795 self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) 3796 if model.incremental_dependency 3797 else False 3798 ) 3799 return substream_factory._create_component_from_model( 3800 model=model, config=config, has_parent_state=has_parent_state, **kwargs 3801 ) 3802 3803 @staticmethod 3804 def create_wait_time_from_header( 3805 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3806 ) -> WaitTimeFromHeaderBackoffStrategy: 3807 return WaitTimeFromHeaderBackoffStrategy( 3808 header=model.header, 3809 parameters=model.parameters or {}, 3810 config=config, 3811 regex=model.regex, 3812 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3813 if model.max_waiting_time_in_seconds is not None 3814 else None, 3815 ) 3816 3817 @staticmethod 3818 def create_wait_until_time_from_header( 3819 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3820 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3821 return WaitUntilTimeFromHeaderBackoffStrategy( 3822 header=model.header, 3823 parameters=model.parameters or {}, 3824 config=config, 3825 min_wait=model.min_wait, 3826 regex=model.regex, 3827 ) 3828 3829 def get_message_repository(self) -> MessageRepository: 3830 return self._message_repository 3831 3832 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3833 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3834 3835 @staticmethod 3836 def create_components_mapping_definition( 3837 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3838 ) -> ComponentMappingDefinition: 3839 interpolated_value = InterpolatedString.create( 3840 model.value, parameters=model.parameters or {} 3841 ) 3842 field_path = [ 3843 InterpolatedString.create(path, parameters=model.parameters or {}) 3844 for path in model.field_path 3845 ] 3846 return ComponentMappingDefinition( 3847 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3848 value=interpolated_value, 3849 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3850 create_or_update=model.create_or_update, 3851 condition=model.condition, 3852 parameters=model.parameters or {}, 3853 ) 3854 3855 def create_http_components_resolver( 3856 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 3857 ) -> Any: 3858 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3859 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3860 3861 retriever = self._create_component_from_model( 3862 model=model.retriever, 3863 config=config, 3864 name=f"{stream_name if stream_name else '__http_components_resolver'}", 3865 primary_key=None, 3866 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3867 transformations=[], 3868 ) 3869 3870 components_mapping = [] 3871 for component_mapping_definition_model in model.components_mapping: 3872 if component_mapping_definition_model.condition: 3873 raise 
ValueError("`condition` is only supported for `ConfigComponentsResolver`") 3874 components_mapping.append( 3875 self._create_component_from_model( 3876 model=component_mapping_definition_model, 3877 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3878 component_mapping_definition_model.value_type 3879 ), 3880 config=config, 3881 ) 3882 ) 3883 3884 return HttpComponentsResolver( 3885 retriever=retriever, 3886 config=config, 3887 components_mapping=components_mapping, 3888 parameters=model.parameters or {}, 3889 ) 3890 3891 @staticmethod 3892 def create_stream_config( 3893 model: StreamConfigModel, config: Config, **kwargs: Any 3894 ) -> StreamConfig: 3895 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3896 [x for x in model.configs_pointer] if model.configs_pointer else [] 3897 ) 3898 3899 return StreamConfig( 3900 configs_pointer=model_configs_pointer, 3901 default_values=model.default_values, 3902 parameters=model.parameters or {}, 3903 ) 3904 3905 def create_config_components_resolver( 3906 self, 3907 model: ConfigComponentsResolverModel, 3908 config: Config, 3909 ) -> Any: 3910 model_stream_configs = ( 3911 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3912 ) 3913 3914 stream_configs = [ 3915 self._create_component_from_model( 3916 stream_config, config=config, parameters=model.parameters or {} 3917 ) 3918 for stream_config in model_stream_configs 3919 ] 3920 3921 components_mapping = [ 3922 self._create_component_from_model( 3923 model=components_mapping_definition_model, 3924 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3925 components_mapping_definition_model.value_type 3926 ), 3927 config=config, 3928 parameters=model.parameters, 3929 ) 3930 for components_mapping_definition_model in model.components_mapping 3931 ] 3932 3933 return ConfigComponentsResolver( 3934 stream_configs=stream_configs, 3935 config=config, 3936 components_mapping=components_mapping, 3937 parameters=model.parameters or {}, 3938 ) 3939 3940 def create_parametrized_components_resolver( 3941 self, 3942 model: ParametrizedComponentsResolverModel, 3943 config: Config, 3944 ) -> ParametrizedComponentsResolver: 3945 stream_parameters = StreamParametersDefinition( 3946 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 3947 ) 3948 3949 components_mapping = [] 3950 for components_mapping_definition_model in model.components_mapping: 3951 if components_mapping_definition_model.condition: 3952 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 3953 components_mapping.append( 3954 self._create_component_from_model( 3955 model=components_mapping_definition_model, 3956 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3957 components_mapping_definition_model.value_type 3958 ), 3959 config=config, 3960 ) 3961 ) 3962 return ParametrizedComponentsResolver( 3963 stream_parameters=stream_parameters, 3964 config=config, 3965 components_mapping=components_mapping, 3966 parameters=model.parameters or {}, 3967 ) 3968 3969 _UNSUPPORTED_DECODER_ERROR = ( 3970 "Specified decoder of {decoder_type} is not supported for pagination." 3971 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 3972 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 
3973 ) 3974 3975 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 3976 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 3977 return True 3978 elif isinstance(decoder, CompositeRawDecoder): 3979 return self._is_supported_parser_for_pagination(decoder.parser) 3980 else: 3981 return False 3982 3983 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 3984 if isinstance(parser, JsonParser): 3985 return True 3986 elif isinstance(parser, GzipParser): 3987 return isinstance(parser.inner_parser, JsonParser) 3988 else: 3989 return False 3990 3991 def create_http_api_budget( 3992 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3993 ) -> HttpAPIBudget: 3994 policies = [ 3995 self._create_component_from_model(model=policy, config=config) 3996 for policy in model.policies 3997 ] 3998 3999 return HttpAPIBudget( 4000 policies=policies, 4001 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4002 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4003 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4004 ) 4005 4006 def create_fixed_window_call_rate_policy( 4007 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4008 ) -> FixedWindowCallRatePolicy: 4009 matchers = [ 4010 self._create_component_from_model(model=matcher, config=config) 4011 for matcher in model.matchers 4012 ] 4013 4014 # Set the initial reset timestamp to 10 days from now. 4015 # This value will be updated by the first request. 4016 return FixedWindowCallRatePolicy( 4017 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4018 period=parse_duration(model.period), 4019 call_limit=model.call_limit, 4020 matchers=matchers, 4021 ) 4022 4023 def create_file_uploader( 4024 self, model: FileUploaderModel, config: Config, **kwargs: Any 4025 ) -> FileUploader: 4026 name = "File Uploader" 4027 requester = self._create_component_from_model( 4028 model=model.requester, 4029 config=config, 4030 name=name, 4031 **kwargs, 4032 ) 4033 download_target_extractor = self._create_component_from_model( 4034 model=model.download_target_extractor, 4035 config=config, 4036 name=name, 4037 **kwargs, 4038 ) 4039 emit_connector_builder_messages = self._emit_connector_builder_messages 4040 file_uploader = DefaultFileUploader( 4041 requester=requester, 4042 download_target_extractor=download_target_extractor, 4043 config=config, 4044 file_writer=NoopFileWriter() 4045 if emit_connector_builder_messages 4046 else LocalFileSystemFileWriter(), 4047 parameters=model.parameters or {}, 4048 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4049 ) 4050 4051 return ( 4052 ConnectorBuilderFileUploader(file_uploader) 4053 if emit_connector_builder_messages 4054 else file_uploader 4055 ) 4056 4057 def create_moving_window_call_rate_policy( 4058 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4059 ) -> MovingWindowCallRatePolicy: 4060 rates = [ 4061 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4062 ] 4063 matchers = [ 4064 self._create_component_from_model(model=matcher, config=config) 4065 for matcher in model.matchers 4066 ] 4067 return MovingWindowCallRatePolicy( 4068 rates=rates, 4069 matchers=matchers, 4070 ) 4071 4072 def create_unlimited_call_rate_policy( 4073 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4074 ) -> UnlimitedCallRatePolicy: 4075 matchers = [ 4076 
self._create_component_from_model(model=matcher, config=config) 4077 for matcher in model.matchers 4078 ] 4079 4080 return UnlimitedCallRatePolicy( 4081 matchers=matchers, 4082 ) 4083 4084 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4085 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4086 return Rate( 4087 limit=int(interpolated_limit.eval(config=config)), 4088 interval=parse_duration(model.interval), 4089 ) 4090 4091 def create_http_request_matcher( 4092 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4093 ) -> HttpRequestRegexMatcher: 4094 return HttpRequestRegexMatcher( 4095 method=model.method, 4096 url_base=model.url_base, 4097 url_path_pattern=model.url_path_pattern, 4098 params=model.params, 4099 headers=model.headers, 4100 ) 4101 4102 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4103 self._api_budget = self.create_component( 4104 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4105 ) 4106 4107 def create_grouping_partition_router( 4108 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 4109 ) -> GroupingPartitionRouter: 4110 underlying_router = self._create_component_from_model( 4111 model=model.underlying_partition_router, config=config 4112 ) 4113 if model.group_size < 1: 4114 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4115 4116 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4117 # because they are specific to individual partitions and cannot be aggregated or handled 4118 # when grouping, potentially leading to incorrect API calls. Any request customization 4119 # should be managed at the stream level through the requester's configuration. 4120 if isinstance(underlying_router, SubstreamPartitionRouter): 4121 if any( 4122 parent_config.request_option 4123 for parent_config in underlying_router.parent_stream_configs 4124 ): 4125 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4126 4127 if isinstance(underlying_router, ListPartitionRouter): 4128 if underlying_router.request_option: 4129 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4130 4131 return GroupingPartitionRouter( 4132 group_size=model.group_size, 4133 underlying_partition_router=underlying_router, 4134 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4135 config=config, 4136 )
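# A minimal usage sketch: create_component() builds a runtime component from a manifest-style
# definition whose "type" field matches the Pydantic model class name. It relies on the
# RecordFilterModel alias already imported by this module; the condition value is purely
# illustrative and not part of any real connector manifest.
def _example_create_record_filter() -> RecordFilter:
    factory = ModelToComponentFactory()
    return factory.create_component(
        model_type=RecordFilterModel,
        component_definition={
            "type": "RecordFilter",
            "condition": "{{ record['status'] == 'active' }}",
        },
        config={},
    )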
627class ModelToComponentFactory: 628 EPOCH_DATETIME_FORMAT = "%s" 629 630 def __init__( 631 self, 632 limit_pages_fetched_per_slice: Optional[int] = None, 633 limit_slices_fetched: Optional[int] = None, 634 emit_connector_builder_messages: bool = False, 635 disable_retries: bool = False, 636 disable_cache: bool = False, 637 disable_resumable_full_refresh: bool = False, 638 message_repository: Optional[MessageRepository] = None, 639 connector_state_manager: Optional[ConnectorStateManager] = None, 640 max_concurrent_async_job_count: Optional[int] = None, 641 ): 642 self._init_mappings() 643 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 644 self._limit_slices_fetched = limit_slices_fetched 645 self._emit_connector_builder_messages = emit_connector_builder_messages 646 self._disable_retries = disable_retries 647 self._disable_cache = disable_cache 648 self._disable_resumable_full_refresh = disable_resumable_full_refresh 649 self._message_repository = message_repository or InMemoryMessageRepository( 650 self._evaluate_log_level(emit_connector_builder_messages) 651 ) 652 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 653 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 654 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 655 # placeholder for deprecation warnings 656 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 657 658 def _init_mappings(self) -> None: 659 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 660 AddedFieldDefinitionModel: self.create_added_field_definition, 661 AddFieldsModel: self.create_add_fields, 662 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 663 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 664 BearerAuthenticatorModel: self.create_bearer_authenticator, 665 CheckStreamModel: self.create_check_stream, 666 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 667 CheckDynamicStreamModel: self.create_check_dynamic_stream, 668 CompositeErrorHandlerModel: self.create_composite_error_handler, 669 ConcurrencyLevelModel: self.create_concurrency_level, 670 ConfigMigrationModel: self.create_config_migration, 671 ConfigAddFieldsModel: self.create_config_add_fields, 672 ConfigRemapFieldModel: self.create_config_remap_field, 673 ConfigRemoveFieldsModel: self.create_config_remove_fields, 674 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 675 CsvDecoderModel: self.create_csv_decoder, 676 CursorPaginationModel: self.create_cursor_pagination, 677 CustomAuthenticatorModel: self.create_custom_component, 678 CustomBackoffStrategyModel: self.create_custom_component, 679 CustomDecoderModel: self.create_custom_component, 680 CustomErrorHandlerModel: self.create_custom_component, 681 CustomIncrementalSyncModel: self.create_custom_component, 682 CustomRecordExtractorModel: self.create_custom_component, 683 CustomRecordFilterModel: self.create_custom_component, 684 CustomRequesterModel: self.create_custom_component, 685 CustomRetrieverModel: self.create_custom_component, 686 CustomSchemaLoader: self.create_custom_component, 687 CustomSchemaNormalizationModel: self.create_custom_component, 688 CustomStateMigration: self.create_custom_component, 689 CustomPaginationStrategyModel: self.create_custom_component, 690 CustomPartitionRouterModel: self.create_custom_component, 691 CustomTransformationModel: self.create_custom_component, 692 
CustomValidationStrategyModel: self.create_custom_component, 693 CustomConfigTransformationModel: self.create_custom_component, 694 DatetimeBasedCursorModel: self.create_datetime_based_cursor, 695 DeclarativeStreamModel: self.create_declarative_stream, 696 DefaultErrorHandlerModel: self.create_default_error_handler, 697 DefaultPaginatorModel: self.create_default_paginator, 698 DpathExtractorModel: self.create_dpath_extractor, 699 DpathValidatorModel: self.create_dpath_validator, 700 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 701 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 702 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 703 GroupByKeyMergeStrategyModel: self.create_group_by_key, 704 HttpRequesterModel: self.create_http_requester, 705 HttpResponseFilterModel: self.create_http_response_filter, 706 InlineSchemaLoaderModel: self.create_inline_schema_loader, 707 JsonDecoderModel: self.create_json_decoder, 708 JsonlDecoderModel: self.create_jsonl_decoder, 709 GzipDecoderModel: self.create_gzip_decoder, 710 KeysToLowerModel: self.create_keys_to_lower_transformation, 711 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 712 KeysReplaceModel: self.create_keys_replace_transformation, 713 FlattenFieldsModel: self.create_flatten_fields, 714 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 715 IterableDecoderModel: self.create_iterable_decoder, 716 IncrementingCountCursorModel: self.create_incrementing_count_cursor, 717 XmlDecoderModel: self.create_xml_decoder, 718 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 719 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 720 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 721 TypesMapModel: self.create_types_map, 722 ComplexFieldTypeModel: self.create_complex_field_type, 723 JwtAuthenticatorModel: self.create_jwt_authenticator, 724 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 725 ListPartitionRouterModel: self.create_list_partition_router, 726 MinMaxDatetimeModel: self.create_min_max_datetime, 727 NoAuthModel: self.create_no_auth, 728 NoPaginationModel: self.create_no_pagination, 729 OAuthAuthenticatorModel: self.create_oauth_authenticator, 730 OffsetIncrementModel: self.create_offset_increment, 731 PageIncrementModel: self.create_page_increment, 732 ParentStreamConfigModel: self.create_parent_stream_config, 733 PredicateValidatorModel: self.create_predicate_validator, 734 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 735 PropertyChunkingModel: self.create_property_chunking, 736 QueryPropertiesModel: self.create_query_properties, 737 RecordFilterModel: self.create_record_filter, 738 RecordSelectorModel: self.create_record_selector, 739 RemoveFieldsModel: self.create_remove_fields, 740 RequestPathModel: self.create_request_path, 741 RequestOptionModel: self.create_request_option, 742 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 743 SelectiveAuthenticatorModel: self.create_selective_authenticator, 744 SimpleRetrieverModel: self.create_simple_retriever, 745 StateDelegatingStreamModel: self.create_state_delegating_stream, 746 SpecModel: self.create_spec, 747 SubstreamPartitionRouterModel: self.create_substream_partition_router, 748 ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema, 749 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 750 WaitUntilTimeFromHeaderModel: 
self.create_wait_until_time_from_header,
            AsyncRetrieverModel: self.create_async_retriever,
            HttpComponentsResolverModel: self.create_http_components_resolver,
            ConfigComponentsResolverModel: self.create_config_components_resolver,
            ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver,
            StreamConfigModel: self.create_stream_config,
            ComponentMappingDefinitionModel: self.create_components_mapping_definition,
            ZipfileDecoderModel: self.create_zipfile_decoder,
            HTTPAPIBudgetModel: self.create_http_api_budget,
            FileUploaderModel: self.create_file_uploader,
            FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
            MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
            UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}

    def create_component(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        config: Config,
        **kwargs: Any,
    ) -> Any:
        """
        Takes a given Pydantic model type and a Mapping representing a component definition, and creates the declarative
        component and subcomponents which will be used at runtime. This is done by first parsing the mapping into a
        Pydantic model and then creating declarative components from that model.

        :param model_type: The type of declarative component that is being initialized
        :param component_definition: The mapping that represents a declarative component
        :param config: The connector config that is provided by the customer
        :return: The declarative component to be used at runtime
        """

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        declarative_component_model = model_type.parse_obj(component_definition)

        if not isinstance(declarative_component_model, model_type):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}"
            )

        return self._create_component_from_model(
            model=declarative_component_model, config=config, **kwargs
        )

    def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any:
        if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR:
            raise ValueError(
                f"{model.__class__} with attributes {model} is not a valid component type"
            )
        component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__)
        if not component_constructor:
            raise ValueError(f"Could not find constructor for {model.__class__}")

        # collect deprecation warnings for supported models.
816 if isinstance(model, BaseModelWithDeprecations): 817 self._collect_model_deprecations(model) 818 819 return component_constructor(model=model, config=config, **kwargs) 820 821 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 822 """ 823 Returns the deprecation warnings that were collected during the creation of components. 824 """ 825 return self._collected_deprecation_logs 826 827 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 828 """ 829 Collects deprecation logs from the given model and appends any new logs to the internal collection. 830 831 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 832 833 Args: 834 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 835 """ 836 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 837 for log in model._deprecation_logs: 838 # avoid duplicates for deprecation logs observed. 839 if log not in self._collected_deprecation_logs: 840 self._collected_deprecation_logs.append(log) 841 842 def create_config_migration( 843 self, model: ConfigMigrationModel, config: Config 844 ) -> ConfigMigration: 845 transformations: List[ConfigTransformation] = [ 846 self._create_component_from_model(transformation, config) 847 for transformation in model.transformations 848 ] 849 850 return ConfigMigration( 851 description=model.description, 852 transformations=transformations, 853 ) 854 855 def create_config_add_fields( 856 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 857 ) -> ConfigAddFields: 858 fields = [self._create_component_from_model(field, config) for field in model.fields] 859 return ConfigAddFields( 860 fields=fields, 861 condition=model.condition or "", 862 ) 863 864 @staticmethod 865 def create_config_remove_fields( 866 model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any 867 ) -> ConfigRemoveFields: 868 return ConfigRemoveFields( 869 field_pointers=model.field_pointers, 870 condition=model.condition or "", 871 ) 872 873 @staticmethod 874 def create_config_remap_field( 875 model: ConfigRemapFieldModel, config: Config, **kwargs: Any 876 ) -> ConfigRemapField: 877 mapping = cast(Mapping[str, Any], model.map) 878 return ConfigRemapField( 879 map=mapping, 880 field_path=model.field_path, 881 config=config, 882 ) 883 884 def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator: 885 strategy = self._create_component_from_model(model.validation_strategy, config) 886 887 return DpathValidator( 888 field_path=model.field_path, 889 strategy=strategy, 890 ) 891 892 def create_predicate_validator( 893 self, model: PredicateValidatorModel, config: Config 894 ) -> PredicateValidator: 895 strategy = self._create_component_from_model(model.validation_strategy, config) 896 897 return PredicateValidator( 898 value=model.value, 899 strategy=strategy, 900 ) 901 902 @staticmethod 903 def create_validate_adheres_to_schema( 904 model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any 905 ) -> ValidateAdheresToSchema: 906 base_schema = cast(Mapping[str, Any], model.base_schema) 907 return ValidateAdheresToSchema( 908 schema=base_schema, 909 ) 910 911 @staticmethod 912 
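    # Illustrative sketch of a validator definition handled by create_dpath_validator
    # above, with its nested strategy resolved via _create_component_from_model (keys
    # mirror the model attributes used here; the concrete values are hypothetical):
    #
    #     validator_definition = {
    #         "type": "DpathValidator",
    #         "field_path": ["credentials", "account_id"],
    #         "validation_strategy": {
    #             "type": "ValidateAdheresToSchema",
    #             "base_schema": {"type": "string"},
    #         },
    #     }
    #     validator = factory.create_component(DpathValidatorModel, validator_definition, config={})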
def create_added_field_definition( 913 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 914 ) -> AddedFieldDefinition: 915 interpolated_value = InterpolatedString.create( 916 model.value, parameters=model.parameters or {} 917 ) 918 return AddedFieldDefinition( 919 path=model.path, 920 value=interpolated_value, 921 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 922 parameters=model.parameters or {}, 923 ) 924 925 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 926 added_field_definitions = [ 927 self._create_component_from_model( 928 model=added_field_definition_model, 929 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 930 added_field_definition_model.value_type 931 ), 932 config=config, 933 ) 934 for added_field_definition_model in model.fields 935 ] 936 return AddFields( 937 fields=added_field_definitions, 938 condition=model.condition or "", 939 parameters=model.parameters or {}, 940 ) 941 942 def create_keys_to_lower_transformation( 943 self, model: KeysToLowerModel, config: Config, **kwargs: Any 944 ) -> KeysToLowerTransformation: 945 return KeysToLowerTransformation() 946 947 def create_keys_to_snake_transformation( 948 self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any 949 ) -> KeysToSnakeCaseTransformation: 950 return KeysToSnakeCaseTransformation() 951 952 def create_keys_replace_transformation( 953 self, model: KeysReplaceModel, config: Config, **kwargs: Any 954 ) -> KeysReplaceTransformation: 955 return KeysReplaceTransformation( 956 old=model.old, new=model.new, parameters=model.parameters or {} 957 ) 958 959 def create_flatten_fields( 960 self, model: FlattenFieldsModel, config: Config, **kwargs: Any 961 ) -> FlattenFields: 962 return FlattenFields( 963 flatten_lists=model.flatten_lists if model.flatten_lists is not None else True 964 ) 965 966 def create_dpath_flatten_fields( 967 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 968 ) -> DpathFlattenFields: 969 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 970 key_transformation = ( 971 KeyTransformation( 972 config=config, 973 prefix=model.key_transformation.prefix, 974 suffix=model.key_transformation.suffix, 975 parameters=model.parameters or {}, 976 ) 977 if model.key_transformation is not None 978 else None 979 ) 980 return DpathFlattenFields( 981 config=config, 982 field_path=model_field_path, 983 delete_origin_value=model.delete_origin_value 984 if model.delete_origin_value is not None 985 else False, 986 replace_record=model.replace_record if model.replace_record is not None else False, 987 key_transformation=key_transformation, 988 parameters=model.parameters or {}, 989 ) 990 991 @staticmethod 992 def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]: 993 if not value_type: 994 return None 995 names_to_types = { 996 ValueType.string: str, 997 ValueType.number: float, 998 ValueType.integer: int, 999 ValueType.boolean: bool, 1000 } 1001 return names_to_types[value_type] 1002 1003 def create_api_key_authenticator( 1004 self, 1005 model: ApiKeyAuthenticatorModel, 1006 config: Config, 1007 token_provider: Optional[TokenProvider] = None, 1008 **kwargs: Any, 1009 ) -> ApiKeyAuthenticator: 1010 if model.inject_into is None and model.header is None: 1011 raise ValueError( 1012 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1013 ) 1014 1015 if model.inject_into is not None 
and model.header is not None: 1016 raise ValueError( 1017 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1018 ) 1019 1020 if token_provider is not None and model.api_token != "": 1021 raise ValueError( 1022 "If token_provider is set, api_token is ignored and has to be set to empty string." 1023 ) 1024 1025 request_option = ( 1026 self._create_component_from_model( 1027 model.inject_into, config, parameters=model.parameters or {} 1028 ) 1029 if model.inject_into 1030 else RequestOption( 1031 inject_into=RequestOptionType.header, 1032 field_name=model.header or "", 1033 parameters=model.parameters or {}, 1034 ) 1035 ) 1036 1037 return ApiKeyAuthenticator( 1038 token_provider=( 1039 token_provider 1040 if token_provider is not None 1041 else InterpolatedStringTokenProvider( 1042 api_token=model.api_token or "", 1043 config=config, 1044 parameters=model.parameters or {}, 1045 ) 1046 ), 1047 request_option=request_option, 1048 config=config, 1049 parameters=model.parameters or {}, 1050 ) 1051 1052 def create_legacy_to_per_partition_state_migration( 1053 self, 1054 model: LegacyToPerPartitionStateMigrationModel, 1055 config: Mapping[str, Any], 1056 declarative_stream: DeclarativeStreamModel, 1057 ) -> LegacyToPerPartitionStateMigration: 1058 retriever = declarative_stream.retriever 1059 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1060 raise ValueError( 1061 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1062 ) 1063 partition_router = retriever.partition_router 1064 if not isinstance( 1065 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1066 ): 1067 raise ValueError( 1068 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1069 ) 1070 if not hasattr(partition_router, "parent_stream_configs"): 1071 raise ValueError( 1072 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1073 ) 1074 1075 if not hasattr(declarative_stream, "incremental_sync"): 1076 raise ValueError( 1077 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1078 ) 1079 1080 return LegacyToPerPartitionStateMigration( 1081 partition_router, # type: ignore # was already checked above 1082 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 
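            # Illustrative manifest shape that satisfies the checks above (names and nested
            # fields are hypothetical, unrelated fields abbreviated with "..."):
            #
            #     "state_migrations": [{"type": "LegacyToPerPartitionStateMigration"}]
            #     # ...declared on a DeclarativeStream whose retriever carries
            #     "partition_router": {
            #         "type": "SubstreamPartitionRouter",
            #         "parent_stream_configs": [{"type": "ParentStreamConfig", ...}],
            #     }
            #     # ...and which also defines
            #     "incremental_sync": {"type": "DatetimeBasedCursor", ...}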
1083 config, 1084 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1085 ) 1086 1087 def create_session_token_authenticator( 1088 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1089 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1090 decoder = ( 1091 self._create_component_from_model(model=model.decoder, config=config) 1092 if model.decoder 1093 else JsonDecoder(parameters={}) 1094 ) 1095 login_requester = self._create_component_from_model( 1096 model=model.login_requester, 1097 config=config, 1098 name=f"{name}_login_requester", 1099 decoder=decoder, 1100 ) 1101 token_provider = SessionTokenProvider( 1102 login_requester=login_requester, 1103 session_token_path=model.session_token_path, 1104 expiration_duration=parse_duration(model.expiration_duration) 1105 if model.expiration_duration 1106 else None, 1107 parameters=model.parameters or {}, 1108 message_repository=self._message_repository, 1109 decoder=decoder, 1110 ) 1111 if model.request_authentication.type == "Bearer": 1112 return ModelToComponentFactory.create_bearer_authenticator( 1113 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1114 config, 1115 token_provider=token_provider, 1116 ) 1117 else: 1118 return self.create_api_key_authenticator( 1119 ApiKeyAuthenticatorModel( 1120 type="ApiKeyAuthenticator", 1121 api_token="", 1122 inject_into=model.request_authentication.inject_into, 1123 ), # type: ignore # $parameters and headers default to None 1124 config=config, 1125 token_provider=token_provider, 1126 ) 1127 1128 @staticmethod 1129 def create_basic_http_authenticator( 1130 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1131 ) -> BasicHttpAuthenticator: 1132 return BasicHttpAuthenticator( 1133 password=model.password or "", 1134 username=model.username, 1135 config=config, 1136 parameters=model.parameters or {}, 1137 ) 1138 1139 @staticmethod 1140 def create_bearer_authenticator( 1141 model: BearerAuthenticatorModel, 1142 config: Config, 1143 token_provider: Optional[TokenProvider] = None, 1144 **kwargs: Any, 1145 ) -> BearerAuthenticator: 1146 if token_provider is not None and model.api_token != "": 1147 raise ValueError( 1148 "If token_provider is set, api_token is ignored and has to be set to empty string." 
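            # Illustrative sketch of a SessionTokenAuthenticator definition that ends up
            # delegating to the bearer/api-key constructors above (endpoint, paths and
            # durations are hypothetical):
            #
            #     {
            #         "type": "SessionTokenAuthenticator",
            #         "login_requester": {
            #             "type": "HttpRequester",
            #             "url_base": "https://api.example.com",
            #             "path": "/login",
            #             "http_method": "POST",
            #         },
            #         "session_token_path": ["access_token"],
            #         "expiration_duration": "PT1H",
            #         "request_authentication": {"type": "Bearer"},
            #     }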
1149 ) 1150 return BearerAuthenticator( 1151 token_provider=( 1152 token_provider 1153 if token_provider is not None 1154 else InterpolatedStringTokenProvider( 1155 api_token=model.api_token or "", 1156 config=config, 1157 parameters=model.parameters or {}, 1158 ) 1159 ), 1160 config=config, 1161 parameters=model.parameters or {}, 1162 ) 1163 1164 @staticmethod 1165 def create_dynamic_stream_check_config( 1166 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1167 ) -> DynamicStreamCheckConfig: 1168 return DynamicStreamCheckConfig( 1169 dynamic_stream_name=model.dynamic_stream_name, 1170 stream_count=model.stream_count or 0, 1171 ) 1172 1173 def create_check_stream( 1174 self, model: CheckStreamModel, config: Config, **kwargs: Any 1175 ) -> CheckStream: 1176 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1177 raise ValueError( 1178 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1179 ) 1180 1181 dynamic_streams_check_configs = ( 1182 [ 1183 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1184 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1185 ] 1186 if model.dynamic_streams_check_configs 1187 else [] 1188 ) 1189 1190 return CheckStream( 1191 stream_names=model.stream_names or [], 1192 dynamic_streams_check_configs=dynamic_streams_check_configs, 1193 parameters={}, 1194 ) 1195 1196 @staticmethod 1197 def create_check_dynamic_stream( 1198 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1199 ) -> CheckDynamicStream: 1200 assert model.use_check_availability is not None # for mypy 1201 1202 use_check_availability = model.use_check_availability 1203 1204 return CheckDynamicStream( 1205 stream_count=model.stream_count, 1206 use_check_availability=use_check_availability, 1207 parameters={}, 1208 ) 1209 1210 def create_composite_error_handler( 1211 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1212 ) -> CompositeErrorHandler: 1213 error_handlers = [ 1214 self._create_component_from_model(model=error_handler_model, config=config) 1215 for error_handler_model in model.error_handlers 1216 ] 1217 return CompositeErrorHandler( 1218 error_handlers=error_handlers, parameters=model.parameters or {} 1219 ) 1220 1221 @staticmethod 1222 def create_concurrency_level( 1223 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1224 ) -> ConcurrencyLevel: 1225 return ConcurrencyLevel( 1226 default_concurrency=model.default_concurrency, 1227 max_concurrency=model.max_concurrency, 1228 config=config, 1229 parameters={}, 1230 ) 1231 1232 @staticmethod 1233 def apply_stream_state_migrations( 1234 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1235 ) -> MutableMapping[str, Any]: 1236 if stream_state_migrations: 1237 for state_migration in stream_state_migrations: 1238 if state_migration.should_migrate(stream_state): 1239 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
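            # Illustrative sketch of the duck-typed interface each migration must expose
            # (the migration class below is hypothetical, not a CDK class):
            #
            #     class RenameCursorKeyMigration:
            #         def should_migrate(self, stream_state):
            #             return "updated" in stream_state
            #
            #         def migrate(self, stream_state):
            #             return {"updated_at": stream_state["updated"]}
            #
            #     ModelToComponentFactory.apply_stream_state_migrations(
            #         [RenameCursorKeyMigration()], {"updated": "2024-01-01"}
            #     )
            #     # -> {"updated_at": "2024-01-01"}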
1240 stream_state = dict(state_migration.migrate(stream_state)) 1241 return stream_state 1242 1243 def create_concurrent_cursor_from_datetime_based_cursor( 1244 self, 1245 model_type: Type[BaseModel], 1246 component_definition: ComponentDefinition, 1247 stream_name: str, 1248 stream_namespace: Optional[str], 1249 config: Config, 1250 message_repository: Optional[MessageRepository] = None, 1251 runtime_lookback_window: Optional[datetime.timedelta] = None, 1252 stream_state_migrations: Optional[List[Any]] = None, 1253 **kwargs: Any, 1254 ) -> ConcurrentCursor: 1255 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1256 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1257 # incoming state and connector_state_manager that is initialized when the component factory is created 1258 stream_state = ( 1259 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1260 if "stream_state" not in kwargs 1261 else kwargs["stream_state"] 1262 ) 1263 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1264 1265 component_type = component_definition.get("type") 1266 if component_definition.get("type") != model_type.__name__: 1267 raise ValueError( 1268 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1269 ) 1270 1271 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1272 1273 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1274 raise ValueError( 1275 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1276 ) 1277 1278 interpolated_cursor_field = InterpolatedString.create( 1279 datetime_based_cursor_model.cursor_field, 1280 parameters=datetime_based_cursor_model.parameters or {}, 1281 ) 1282 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1283 1284 interpolated_partition_field_start = InterpolatedString.create( 1285 datetime_based_cursor_model.partition_field_start or "start_time", 1286 parameters=datetime_based_cursor_model.parameters or {}, 1287 ) 1288 interpolated_partition_field_end = InterpolatedString.create( 1289 datetime_based_cursor_model.partition_field_end or "end_time", 1290 parameters=datetime_based_cursor_model.parameters or {}, 1291 ) 1292 1293 slice_boundary_fields = ( 1294 interpolated_partition_field_start.eval(config=config), 1295 interpolated_partition_field_end.eval(config=config), 1296 ) 1297 1298 datetime_format = datetime_based_cursor_model.datetime_format 1299 1300 cursor_granularity = ( 1301 parse_duration(datetime_based_cursor_model.cursor_granularity) 1302 if datetime_based_cursor_model.cursor_granularity 1303 else None 1304 ) 1305 1306 lookback_window = None 1307 interpolated_lookback_window = ( 1308 InterpolatedString.create( 1309 datetime_based_cursor_model.lookback_window, 1310 parameters=datetime_based_cursor_model.parameters or {}, 1311 ) 1312 if datetime_based_cursor_model.lookback_window 1313 else None 1314 ) 1315 if interpolated_lookback_window: 1316 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1317 if evaluated_lookback_window: 1318 lookback_window = parse_duration(evaluated_lookback_window) 1319 1320 connector_state_converter: DateTimeStreamStateConverter 1321 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1322 datetime_format=datetime_format, 1323 
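            # Note: lookback_window and cursor_granularity are ISO-8601 durations parsed
            # with isodate.parse_duration (e.g. "P5D" -> timedelta(days=5), "PT1S" -> one
            # second), and the slice boundary keys default to "start_time"/"end_time"
            # unless partition_field_start / partition_field_end are provided.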
input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1324 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1325 cursor_granularity=cursor_granularity, 1326 ) 1327 1328 # Adjusts the stream state by applying the runtime lookback window. 1329 # This is used to ensure correct state handling in case of failed partitions. 1330 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1331 if runtime_lookback_window and stream_state_value: 1332 new_stream_state = ( 1333 connector_state_converter.parse_timestamp(stream_state_value) 1334 - runtime_lookback_window 1335 ) 1336 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1337 new_stream_state 1338 ) 1339 1340 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1341 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1342 start_date_runtime_value = self.create_min_max_datetime( 1343 model=datetime_based_cursor_model.start_datetime, config=config 1344 ) 1345 else: 1346 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1347 1348 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1349 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1350 end_date_runtime_value = self.create_min_max_datetime( 1351 model=datetime_based_cursor_model.end_datetime, config=config 1352 ) 1353 else: 1354 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1355 1356 interpolated_start_date = MinMaxDatetime.create( 1357 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1358 parameters=datetime_based_cursor_model.parameters, 1359 ) 1360 interpolated_end_date = ( 1361 None 1362 if not end_date_runtime_value 1363 else MinMaxDatetime.create( 1364 end_date_runtime_value, datetime_based_cursor_model.parameters 1365 ) 1366 ) 1367 1368 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1369 if not interpolated_start_date.datetime_format: 1370 interpolated_start_date.datetime_format = datetime_format 1371 if interpolated_end_date and not interpolated_end_date.datetime_format: 1372 interpolated_end_date.datetime_format = datetime_format 1373 1374 start_date = interpolated_start_date.get_datetime(config=config) 1375 end_date_provider = ( 1376 partial(interpolated_end_date.get_datetime, config) 1377 if interpolated_end_date 1378 else connector_state_converter.get_end_provider() 1379 ) 1380 1381 if ( 1382 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1383 ) or ( 1384 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1385 ): 1386 raise ValueError( 1387 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1388 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1389 ) 1390 1391 # When step is not defined, default to a step size from the starting date to the present moment 1392 step_length = datetime.timedelta.max 1393 interpolated_step = ( 1394 InterpolatedString.create( 1395 datetime_based_cursor_model.step, 1396 parameters=datetime_based_cursor_model.parameters or {}, 1397 ) 1398 if datetime_based_cursor_model.step 1399 else None 1400 ) 1401 if interpolated_step: 1402 evaluated_step = interpolated_step.eval(config) 1403 if evaluated_step: 1404 step_length = parse_duration(evaluated_step) 1405 1406 clamping_strategy: ClampingStrategy = NoClamping() 1407 if datetime_based_cursor_model.clamping: 1408 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1409 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1410 # object which we want to keep agnostic of being low-code 1411 target = InterpolatedString( 1412 string=datetime_based_cursor_model.clamping.target, 1413 parameters=datetime_based_cursor_model.parameters or {}, 1414 ) 1415 evaluated_target = target.eval(config=config) 1416 match evaluated_target: 1417 case "DAY": 1418 clamping_strategy = DayClampingStrategy() 1419 end_date_provider = ClampingEndProvider( 1420 DayClampingStrategy(is_ceiling=False), 1421 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1422 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1423 ) 1424 case "WEEK": 1425 if ( 1426 not datetime_based_cursor_model.clamping.target_details 1427 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1428 ): 1429 raise ValueError( 1430 "Given WEEK clamping, weekday needs to be provided as target_details" 1431 ) 1432 weekday = self._assemble_weekday( 1433 datetime_based_cursor_model.clamping.target_details["weekday"] 1434 ) 1435 clamping_strategy = WeekClampingStrategy(weekday) 1436 end_date_provider = ClampingEndProvider( 1437 WeekClampingStrategy(weekday, is_ceiling=False), 1438 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1439 granularity=cursor_granularity or datetime.timedelta(days=1), 1440 ) 1441 case "MONTH": 1442 clamping_strategy = MonthClampingStrategy() 1443 end_date_provider = ClampingEndProvider( 1444 MonthClampingStrategy(is_ceiling=False), 1445 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1446 granularity=cursor_granularity or datetime.timedelta(days=1), 1447 ) 1448 case _: 1449 raise ValueError( 1450 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1451 ) 1452 1453 return ConcurrentCursor( 1454 stream_name=stream_name, 1455 stream_namespace=stream_namespace, 1456 stream_state=stream_state, 1457 message_repository=message_repository or self._message_repository, 1458 connector_state_manager=self._connector_state_manager, 1459 connector_state_converter=connector_state_converter, 1460 cursor_field=cursor_field, 1461 slice_boundary_fields=slice_boundary_fields, 1462 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1463 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1464 lookback_window=lookback_window, 1465 slice_range=step_length, 1466 cursor_granularity=cursor_granularity, 1467 clamping_strategy=clamping_strategy, 1468 ) 1469 1470 def create_concurrent_cursor_from_incrementing_count_cursor( 1471 self, 1472 model_type: Type[BaseModel], 1473 component_definition: ComponentDefinition, 1474 stream_name: str, 1475 stream_namespace: Optional[str], 1476 config: Config, 1477 message_repository: Optional[MessageRepository] = None, 1478 stream_state_migrations: Optional[List[Any]] = None, 1479 **kwargs: Any, 1480 ) -> ConcurrentCursor: 1481 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1482 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1483 # incoming state and connector_state_manager that is initialized when the component factory is created 1484 stream_state = ( 1485 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1486 if "stream_state" not in kwargs 1487 else kwargs["stream_state"] 1488 ) 1489 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1490 1491 component_type = component_definition.get("type") 1492 if component_definition.get("type") != model_type.__name__: 1493 raise ValueError( 1494 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1495 ) 1496 1497 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1498 1499 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1500 raise ValueError( 1501 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1502 ) 1503 1504 interpolated_start_value = ( 1505 InterpolatedString.create( 1506 incrementing_count_cursor_model.start_value, # type: ignore 1507 parameters=incrementing_count_cursor_model.parameters or {}, 1508 ) 1509 if incrementing_count_cursor_model.start_value 1510 else 0 1511 ) 1512 1513 interpolated_cursor_field = InterpolatedString.create( 1514 incrementing_count_cursor_model.cursor_field, 1515 parameters=incrementing_count_cursor_model.parameters or {}, 1516 ) 1517 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1518 1519 connector_state_converter = IncrementingCountStreamStateConverter( 1520 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1521 ) 1522 1523 return ConcurrentCursor( 1524 
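            # Illustrative sketch of a DatetimeBasedCursor definition of the kind parsed by
            # create_concurrent_cursor_from_datetime_based_cursor above (field values are
            # hypothetical; step and cursor_granularity must be provided together):
            #
            #     {
            #         "type": "DatetimeBasedCursor",
            #         "cursor_field": "updated_at",
            #         "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
            #         "start_datetime": "{{ config['start_date'] }}",
            #         "step": "P30D",
            #         "cursor_granularity": "PT1S",
            #         "clamping": {"target": "WEEK", "target_details": {"weekday": "MONDAY"}},
            #     }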
stream_name=stream_name, 1525 stream_namespace=stream_namespace, 1526 stream_state=stream_state, 1527 message_repository=message_repository or self._message_repository, 1528 connector_state_manager=self._connector_state_manager, 1529 connector_state_converter=connector_state_converter, 1530 cursor_field=cursor_field, 1531 slice_boundary_fields=None, 1532 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1533 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1534 ) 1535 1536 def _assemble_weekday(self, weekday: str) -> Weekday: 1537 match weekday: 1538 case "MONDAY": 1539 return Weekday.MONDAY 1540 case "TUESDAY": 1541 return Weekday.TUESDAY 1542 case "WEDNESDAY": 1543 return Weekday.WEDNESDAY 1544 case "THURSDAY": 1545 return Weekday.THURSDAY 1546 case "FRIDAY": 1547 return Weekday.FRIDAY 1548 case "SATURDAY": 1549 return Weekday.SATURDAY 1550 case "SUNDAY": 1551 return Weekday.SUNDAY 1552 case _: 1553 raise ValueError(f"Unknown weekday {weekday}") 1554 1555 def create_concurrent_cursor_from_perpartition_cursor( 1556 self, 1557 state_manager: ConnectorStateManager, 1558 model_type: Type[BaseModel], 1559 component_definition: ComponentDefinition, 1560 stream_name: str, 1561 stream_namespace: Optional[str], 1562 config: Config, 1563 stream_state: MutableMapping[str, Any], 1564 partition_router: PartitionRouter, 1565 stream_state_migrations: Optional[List[Any]] = None, 1566 attempt_to_create_cursor_if_not_provided: bool = False, 1567 **kwargs: Any, 1568 ) -> ConcurrentPerPartitionCursor: 1569 component_type = component_definition.get("type") 1570 if component_definition.get("type") != model_type.__name__: 1571 raise ValueError( 1572 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1573 ) 1574 1575 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1576 1577 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1578 raise ValueError( 1579 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1580 ) 1581 1582 interpolated_cursor_field = InterpolatedString.create( 1583 datetime_based_cursor_model.cursor_field, 1584 parameters=datetime_based_cursor_model.parameters or {}, 1585 ) 1586 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1587 1588 datetime_format = datetime_based_cursor_model.datetime_format 1589 1590 cursor_granularity = ( 1591 parse_duration(datetime_based_cursor_model.cursor_granularity) 1592 if datetime_based_cursor_model.cursor_granularity 1593 else None 1594 ) 1595 1596 connector_state_converter: DateTimeStreamStateConverter 1597 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1598 datetime_format=datetime_format, 1599 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1600 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1601 cursor_granularity=cursor_granularity, 1602 ) 1603 1604 # Create the cursor factory 1605 cursor_factory = ConcurrentCursorFactory( 1606 partial( 1607 self.create_concurrent_cursor_from_datetime_based_cursor, 1608 state_manager=state_manager, 1609 model_type=model_type, 1610 
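            # Illustrative sketch of the IncrementingCountCursor definition handled by
            # create_concurrent_cursor_from_incrementing_count_cursor above (values are
            # hypothetical):
            #
            #     {"type": "IncrementingCountCursor", "cursor_field": "id", "start_value": 0}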
component_definition=component_definition, 1611 stream_name=stream_name, 1612 stream_namespace=stream_namespace, 1613 config=config, 1614 message_repository=NoopMessageRepository(), 1615 stream_state_migrations=stream_state_migrations, 1616 ) 1617 ) 1618 1619 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1620 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1621 use_global_cursor = isinstance( 1622 partition_router, GroupingPartitionRouter 1623 ) or component_definition.get("global_substream_cursor", False) 1624 1625 # Return the concurrent cursor and state converter 1626 return ConcurrentPerPartitionCursor( 1627 cursor_factory=cursor_factory, 1628 partition_router=partition_router, 1629 stream_name=stream_name, 1630 stream_namespace=stream_namespace, 1631 stream_state=stream_state, 1632 message_repository=self._message_repository, # type: ignore 1633 connector_state_manager=state_manager, 1634 connector_state_converter=connector_state_converter, 1635 cursor_field=cursor_field, 1636 use_global_cursor=use_global_cursor, 1637 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1638 ) 1639 1640 @staticmethod 1641 def create_constant_backoff_strategy( 1642 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1643 ) -> ConstantBackoffStrategy: 1644 return ConstantBackoffStrategy( 1645 backoff_time_in_seconds=model.backoff_time_in_seconds, 1646 config=config, 1647 parameters=model.parameters or {}, 1648 ) 1649 1650 def create_cursor_pagination( 1651 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1652 ) -> CursorPaginationStrategy: 1653 if isinstance(decoder, PaginationDecoderDecorator): 1654 inner_decoder = decoder.decoder 1655 else: 1656 inner_decoder = decoder 1657 decoder = PaginationDecoderDecorator(decoder=decoder) 1658 1659 if self._is_supported_decoder_for_pagination(inner_decoder): 1660 decoder_to_use = decoder 1661 else: 1662 raise ValueError( 1663 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1664 ) 1665 1666 return CursorPaginationStrategy( 1667 cursor_value=model.cursor_value, 1668 decoder=decoder_to_use, 1669 page_size=model.page_size, 1670 stop_condition=model.stop_condition, 1671 config=config, 1672 parameters=model.parameters or {}, 1673 ) 1674 1675 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1676 """ 1677 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1678 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1679 :param model: The Pydantic model of the custom component being created 1680 :param config: The custom defined connector config 1681 :return: The declarative component built from the Pydantic model to be used at runtime 1682 """ 1683 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1684 component_fields = get_type_hints(custom_component_class) 1685 model_args = model.dict() 1686 model_args["config"] = config 1687 1688 # There are cases where a parent component will pass arguments to a child component via kwargs. 
When there are field collisions 1689 # we defer to these arguments over the component's definition 1690 for key, arg in kwargs.items(): 1691 model_args[key] = arg 1692 1693 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1694 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1695 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1696 for model_field, model_value in model_args.items(): 1697 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1698 if ( 1699 isinstance(model_value, dict) 1700 and "type" not in model_value 1701 and model_field in component_fields 1702 ): 1703 derived_type = self._derive_component_type_from_type_hints( 1704 component_fields.get(model_field) 1705 ) 1706 if derived_type: 1707 model_value["type"] = derived_type 1708 1709 if self._is_component(model_value): 1710 model_args[model_field] = self._create_nested_component( 1711 model, model_field, model_value, config 1712 ) 1713 elif isinstance(model_value, list): 1714 vals = [] 1715 for v in model_value: 1716 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1717 derived_type = self._derive_component_type_from_type_hints( 1718 component_fields.get(model_field) 1719 ) 1720 if derived_type: 1721 v["type"] = derived_type 1722 if self._is_component(v): 1723 vals.append(self._create_nested_component(model, model_field, v, config)) 1724 else: 1725 vals.append(v) 1726 model_args[model_field] = vals 1727 1728 kwargs = { 1729 class_field: model_args[class_field] 1730 for class_field in component_fields.keys() 1731 if class_field in model_args 1732 } 1733 return custom_component_class(**kwargs) 1734 1735 @staticmethod 1736 def _get_class_from_fully_qualified_class_name( 1737 full_qualified_class_name: str, 1738 ) -> Any: 1739 """Get a class from its fully qualified name. 1740 1741 If a custom components module is needed, we assume it is already registered - probably 1742 as `source_declarative_manifest.components` or `components`. 1743 1744 Args: 1745 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1746 1747 Returns: 1748 Any: The class object. 1749 1750 Raises: 1751 ValueError: If the class cannot be loaded. 1752 """ 1753 split = full_qualified_class_name.split(".") 1754 module_name_full = ".".join(split[:-1]) 1755 class_name = split[-1] 1756 1757 try: 1758 module_ref = importlib.import_module(module_name_full) 1759 except ModuleNotFoundError as e: 1760 if split[0] == "source_declarative_manifest": 1761 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. 
In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1762 try: 1763 import os 1764 1765 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1766 module_ref = importlib.import_module( 1767 module_name_with_source_declarative_manifest 1768 ) 1769 except ModuleNotFoundError: 1770 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1771 else: 1772 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1773 1774 try: 1775 return getattr(module_ref, class_name) 1776 except AttributeError as e: 1777 raise ValueError( 1778 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1779 ) from e 1780 1781 @staticmethod 1782 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1783 interface = field_type 1784 while True: 1785 origin = get_origin(interface) 1786 if origin: 1787 # Unnest types until we reach the raw type 1788 # List[T] -> T 1789 # Optional[List[T]] -> T 1790 args = get_args(interface) 1791 interface = args[0] 1792 else: 1793 break 1794 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1795 return interface.__name__ 1796 return None 1797 1798 @staticmethod 1799 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1800 if not cls: 1801 return False 1802 return cls.__module__ == "builtins" 1803 1804 @staticmethod 1805 def _extract_missing_parameters(error: TypeError) -> List[str]: 1806 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1807 if parameter_search: 1808 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1809 else: 1810 return [] 1811 1812 def _create_nested_component( 1813 self, model: Any, model_field: str, model_value: Any, config: Config 1814 ) -> Any: 1815 type_name = model_value.get("type", None) 1816 if not type_name: 1817 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1818 return model_value 1819 1820 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1821 if model_type: 1822 parsed_model = model_type.parse_obj(model_value) 1823 try: 1824 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1825 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1826 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1827 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1828 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1829 # are needed by a component and could not be shared. 
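            # Illustrative sketch of the two mechanisms described above (module path, class
            # and field names are hypothetical):
            #
            #     # source_example/components.py
            #     @dataclass
            #     class MyCustomRetriever(SimpleRetriever):
            #         extra_requester: Optional[HttpRequester] = None  # nested dict gets its "type" derived from this hint
            #
            #     # manifest snippet
            #     {
            #         "type": "CustomRetriever",
            #         "class_name": "source_example.components.MyCustomRetriever",
            #         "extra_requester": {"url_base": "https://api.example.com/", "path": "items"},
            #         "paginator": {
            #             "type": "DefaultPaginator",
            #             # url_base cannot be inherited from a sibling requester here, so it is
            #             # supplied explicitly via $parameters:
            #             "$parameters": {"url_base": "https://api.example.com/"},
            #             "pagination_strategy": {"type": "PageIncrement", "page_size": 50},
            #         },
            #     }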
1830 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1831 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1832 model_parameters = model_value.get("$parameters", {}) 1833 matching_parameters = { 1834 kwarg: model_parameters[kwarg] 1835 for kwarg in constructor_kwargs 1836 if kwarg in model_parameters 1837 } 1838 return self._create_component_from_model( 1839 model=parsed_model, config=config, **matching_parameters 1840 ) 1841 except TypeError as error: 1842 missing_parameters = self._extract_missing_parameters(error) 1843 if missing_parameters: 1844 raise ValueError( 1845 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1846 + ", ".join( 1847 ( 1848 f"{type_name}.$parameters.{parameter}" 1849 for parameter in missing_parameters 1850 ) 1851 ) 1852 ) 1853 raise TypeError( 1854 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1855 ) 1856 else: 1857 raise ValueError( 1858 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1859 ) 1860 1861 @staticmethod 1862 def _is_component(model_value: Any) -> bool: 1863 return isinstance(model_value, dict) and model_value.get("type") is not None 1864 1865 def create_datetime_based_cursor( 1866 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1867 ) -> DatetimeBasedCursor: 1868 start_datetime: Union[str, MinMaxDatetime] = ( 1869 model.start_datetime 1870 if isinstance(model.start_datetime, str) 1871 else self.create_min_max_datetime(model.start_datetime, config) 1872 ) 1873 end_datetime: Union[str, MinMaxDatetime, None] = None 1874 if model.is_data_feed and model.end_datetime: 1875 raise ValueError("Data feed does not support end_datetime") 1876 if model.is_data_feed and model.is_client_side_incremental: 1877 raise ValueError( 1878 "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 
1879 ) 1880 if model.end_datetime: 1881 end_datetime = ( 1882 model.end_datetime 1883 if isinstance(model.end_datetime, str) 1884 else self.create_min_max_datetime(model.end_datetime, config) 1885 ) 1886 1887 end_time_option = ( 1888 self._create_component_from_model( 1889 model.end_time_option, config, parameters=model.parameters or {} 1890 ) 1891 if model.end_time_option 1892 else None 1893 ) 1894 start_time_option = ( 1895 self._create_component_from_model( 1896 model.start_time_option, config, parameters=model.parameters or {} 1897 ) 1898 if model.start_time_option 1899 else None 1900 ) 1901 1902 return DatetimeBasedCursor( 1903 cursor_field=model.cursor_field, 1904 cursor_datetime_formats=model.cursor_datetime_formats 1905 if model.cursor_datetime_formats 1906 else [], 1907 cursor_granularity=model.cursor_granularity, 1908 datetime_format=model.datetime_format, 1909 end_datetime=end_datetime, 1910 start_datetime=start_datetime, 1911 step=model.step, 1912 end_time_option=end_time_option, 1913 lookback_window=model.lookback_window, 1914 start_time_option=start_time_option, 1915 partition_field_end=model.partition_field_end, 1916 partition_field_start=model.partition_field_start, 1917 message_repository=self._message_repository, 1918 is_compare_strictly=model.is_compare_strictly, 1919 config=config, 1920 parameters=model.parameters or {}, 1921 ) 1922 1923 def create_declarative_stream( 1924 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1925 ) -> DeclarativeStream: 1926 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1927 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1928 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1929 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 
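        # Illustrative manifest shape for the merge described above (stream name and
        # values are hypothetical, unrelated fields abbreviated with "..."):
        #
        #     {
        #         "type": "DeclarativeStream",
        #         "name": "projects_issues",
        #         "retriever": {
        #             "type": "SimpleRetriever",
        #             "partition_router": {
        #                 "type": "ListPartitionRouter",
        #                 "cursor_field": "project_id",
        #                 "values": ["alpha", "beta"],
        #             },
        #             ...
        #         },
        #         "incremental_sync": {"type": "DatetimeBasedCursor", "cursor_field": "updated_at", ...},
        #     }
        #
        # The partition router and the incremental cursor are merged into a single slicer
        # below and handed to the retriever as its stream_slicer argument.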
1930 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1931 1932 primary_key = model.primary_key.__root__ if model.primary_key else None 1933 stop_condition_on_cursor = ( 1934 model.incremental_sync 1935 and hasattr(model.incremental_sync, "is_data_feed") 1936 and model.incremental_sync.is_data_feed 1937 ) 1938 client_side_filtering_enabled = ( 1939 model.incremental_sync 1940 and hasattr(model.incremental_sync, "is_client_side_incremental") 1941 and model.incremental_sync.is_client_side_incremental 1942 ) 1943 concurrent_cursor = None 1944 if stop_condition_on_cursor or client_side_filtering_enabled: 1945 stream_slicer = self._build_stream_slicer_from_partition_router( 1946 model.retriever, config, stream_name=model.name 1947 ) 1948 concurrent_cursor = self._build_concurrent_cursor(model, stream_slicer, config) 1949 1950 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1951 cursor_model = model.incremental_sync 1952 1953 end_time_option = ( 1954 self._create_component_from_model( 1955 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1956 ) 1957 if cursor_model.end_time_option 1958 else None 1959 ) 1960 start_time_option = ( 1961 self._create_component_from_model( 1962 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1963 ) 1964 if cursor_model.start_time_option 1965 else None 1966 ) 1967 1968 request_options_provider = DatetimeBasedRequestOptionsProvider( 1969 start_time_option=start_time_option, 1970 end_time_option=end_time_option, 1971 partition_field_start=cursor_model.partition_field_end, 1972 partition_field_end=cursor_model.partition_field_end, 1973 config=config, 1974 parameters=model.parameters or {}, 1975 ) 1976 elif model.incremental_sync and isinstance( 1977 model.incremental_sync, IncrementingCountCursorModel 1978 ): 1979 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1980 1981 start_time_option = ( 1982 self._create_component_from_model( 1983 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1984 config, 1985 parameters=cursor_model.parameters or {}, 1986 ) 1987 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1988 else None 1989 ) 1990 1991 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1992 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 1993 partition_field_start = "start" 1994 1995 request_options_provider = DatetimeBasedRequestOptionsProvider( 1996 start_time_option=start_time_option, 1997 partition_field_start=partition_field_start, 1998 config=config, 1999 parameters=model.parameters or {}, 2000 ) 2001 else: 2002 request_options_provider = None 2003 2004 transformations = [] 2005 if model.transformations: 2006 for transformation_model in model.transformations: 2007 transformations.append( 2008 self._create_component_from_model(model=transformation_model, config=config) 2009 ) 2010 file_uploader = None 2011 if model.file_uploader: 2012 file_uploader = self._create_component_from_model( 2013 model=model.file_uploader, config=config 2014 ) 2015 2016 retriever = self._create_component_from_model( 2017 model=model.retriever, 2018 config=config, 2019 name=model.name, 2020 primary_key=primary_key, 2021 stream_slicer=combined_slicers, 2022 request_options_provider=request_options_provider, 2023 
stop_condition_cursor=concurrent_cursor, 2024 client_side_incremental_sync={"cursor": concurrent_cursor} 2025 if client_side_filtering_enabled 2026 else None, 2027 transformations=transformations, 2028 file_uploader=file_uploader, 2029 incremental_sync=model.incremental_sync, 2030 ) 2031 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 2032 2033 if model.state_migrations: 2034 state_transformations = [ 2035 self._create_component_from_model(state_migration, config, declarative_stream=model) 2036 for state_migration in model.state_migrations 2037 ] 2038 else: 2039 state_transformations = [] 2040 2041 schema_loader: Union[ 2042 CompositeSchemaLoader, 2043 DefaultSchemaLoader, 2044 DynamicSchemaLoader, 2045 InlineSchemaLoader, 2046 JsonFileSchemaLoader, 2047 ] 2048 if model.schema_loader and isinstance(model.schema_loader, list): 2049 nested_schema_loaders = [ 2050 self._create_component_from_model(model=nested_schema_loader, config=config) 2051 for nested_schema_loader in model.schema_loader 2052 ] 2053 schema_loader = CompositeSchemaLoader( 2054 schema_loaders=nested_schema_loaders, parameters={} 2055 ) 2056 elif model.schema_loader: 2057 schema_loader = self._create_component_from_model( 2058 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2059 config=config, 2060 ) 2061 else: 2062 options = model.parameters or {} 2063 if "name" not in options: 2064 options["name"] = model.name 2065 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2066 2067 return DeclarativeStream( 2068 name=model.name or "", 2069 primary_key=primary_key, 2070 retriever=retriever, 2071 schema_loader=schema_loader, 2072 stream_cursor_field=cursor_field or "", 2073 state_migrations=state_transformations, 2074 config=config, 2075 parameters=model.parameters or {}, 2076 ) 2077 2078 def _build_stream_slicer_from_partition_router( 2079 self, 2080 model: Union[ 2081 AsyncRetrieverModel, 2082 CustomRetrieverModel, 2083 SimpleRetrieverModel, 2084 ], 2085 config: Config, 2086 stream_name: Optional[str] = None, 2087 ) -> Optional[PartitionRouter]: 2088 if ( 2089 hasattr(model, "partition_router") 2090 and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel) 2091 and model.partition_router 2092 ): 2093 stream_slicer_model = model.partition_router 2094 if isinstance(stream_slicer_model, list): 2095 return CartesianProductStreamSlicer( 2096 [ 2097 self._create_component_from_model( 2098 model=slicer, config=config, stream_name=stream_name or "" 2099 ) 2100 for slicer in stream_slicer_model 2101 ], 2102 parameters={}, 2103 ) 2104 else: 2105 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2106 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2107 ) 2108 return None 2109 2110 def _build_incremental_cursor( 2111 self, 2112 model: DeclarativeStreamModel, 2113 stream_slicer: Optional[PartitionRouter], 2114 config: Config, 2115 ) -> Optional[StreamSlicer]: 2116 state_transformations = ( 2117 [ 2118 self._create_component_from_model(state_migration, config, declarative_stream=model) 2119 for state_migration in model.state_migrations 2120 ] 2121 if model.state_migrations 2122 else [] 2123 ) 2124 2125 if model.incremental_sync and stream_slicer: 2126 if model.retriever.type == "AsyncRetriever": 2127 stream_name = model.name or "" 2128 stream_namespace = 
None 2129 stream_state = self._connector_state_manager.get_stream_state( 2130 stream_name, stream_namespace 2131 ) 2132 2133 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2134 state_manager=self._connector_state_manager, 2135 model_type=DatetimeBasedCursorModel, 2136 component_definition=model.incremental_sync.__dict__, 2137 stream_name=stream_name, 2138 stream_namespace=stream_namespace, 2139 config=config or {}, 2140 stream_state=stream_state, 2141 stream_state_migrations=state_transformations, 2142 partition_router=stream_slicer, 2143 ) 2144 2145 incremental_sync_model = model.incremental_sync 2146 cursor_component = self._create_component_from_model( 2147 model=incremental_sync_model, config=config 2148 ) 2149 is_global_cursor = ( 2150 hasattr(incremental_sync_model, "global_substream_cursor") 2151 and incremental_sync_model.global_substream_cursor 2152 ) 2153 2154 if is_global_cursor: 2155 return GlobalSubstreamCursor( 2156 stream_cursor=cursor_component, partition_router=stream_slicer 2157 ) 2158 return PerPartitionWithGlobalCursor( 2159 cursor_factory=CursorFactory( 2160 lambda: self._create_component_from_model( 2161 model=incremental_sync_model, config=config 2162 ), 2163 ), 2164 partition_router=stream_slicer, 2165 stream_cursor=cursor_component, 2166 ) 2167 elif model.incremental_sync: 2168 if model.retriever.type == "AsyncRetriever": 2169 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2170 model_type=DatetimeBasedCursorModel, 2171 component_definition=model.incremental_sync.__dict__, 2172 stream_name=model.name or "", 2173 stream_namespace=None, 2174 config=config or {}, 2175 stream_state_migrations=state_transformations, 2176 ) 2177 return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync 2178 return None 2179 2180 def _build_concurrent_cursor( 2181 self, 2182 model: DeclarativeStreamModel, 2183 stream_slicer: Optional[PartitionRouter], 2184 config: Config, 2185 ) -> Optional[StreamSlicer]: 2186 stream_state = self._connector_state_manager.get_stream_state( 2187 stream_name=model.name or "", namespace=None 2188 ) 2189 2190 if model.state_migrations: 2191 state_transformations = [ 2192 self._create_component_from_model(state_migration, config, declarative_stream=model) 2193 for state_migration in model.state_migrations 2194 ] 2195 else: 2196 state_transformations = [] 2197 2198 if model.incremental_sync and stream_slicer: 2199 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2200 state_manager=self._connector_state_manager, 2201 model_type=DatetimeBasedCursorModel, 2202 component_definition=model.incremental_sync.__dict__, 2203 stream_name=model.name or "", 2204 stream_namespace=None, 2205 config=config or {}, 2206 stream_state=stream_state, 2207 stream_state_migrations=state_transformations, 2208 partition_router=stream_slicer, 2209 attempt_to_create_cursor_if_not_provided=True, 2210 ) 2211 elif model.incremental_sync: 2212 if type(model.incremental_sync) == IncrementingCountCursorModel: 2213 return self.create_concurrent_cursor_from_incrementing_count_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2214 model_type=IncrementingCountCursorModel, 2215 component_definition=model.incremental_sync.__dict__, 2216 stream_name=model.name or "", 2217 stream_namespace=None, 2218 config=config or {}, 2219 stream_state_migrations=state_transformations, 2220 ) 2221 elif type(model.incremental_sync) == DatetimeBasedCursorModel: 2222 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2223 model_type=type(model.incremental_sync), 2224 component_definition=model.incremental_sync.__dict__, 2225 stream_name=model.name or "", 2226 stream_namespace=None, 2227 config=config or {}, 2228 stream_state_migrations=state_transformations, 2229 attempt_to_create_cursor_if_not_provided=True, 2230 ) 2231 else: 2232 raise ValueError( 2233 f"Incremental sync of type {type(model.incremental_sync)} is not supported" 2234 ) 2235 return None 2236 2237 def _build_resumable_cursor( 2238 self, 2239 model: Union[ 2240 AsyncRetrieverModel, 2241 CustomRetrieverModel, 2242 SimpleRetrieverModel, 2243 ], 2244 stream_slicer: Optional[PartitionRouter], 2245 ) -> Optional[StreamSlicer]: 2246 if hasattr(model, "paginator") and model.paginator and not stream_slicer: 2247 # For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor` 2248 return ResumableFullRefreshCursor(parameters={}) 2249 elif stream_slicer: 2250 # For the Full-Refresh sub-streams, we use the nested `ChildPartitionResumableFullRefreshCursor` 2251 return PerPartitionCursor( 2252 cursor_factory=CursorFactory( 2253 create_function=partial(ChildPartitionResumableFullRefreshCursor, {}) 2254 ), 2255 partition_router=stream_slicer, 2256 ) 2257 return None 2258 2259 def _merge_stream_slicers( 2260 self, model: DeclarativeStreamModel, config: Config 2261 ) -> Optional[StreamSlicer]: 2262 retriever_model = model.retriever 2263 2264 stream_slicer = self._build_stream_slicer_from_partition_router( 2265 retriever_model, config, stream_name=model.name 2266 ) 2267 2268 if retriever_model.type == "AsyncRetriever": 2269 is_not_datetime_cursor = ( 2270 model.incremental_sync.type != "DatetimeBasedCursor" 2271 if model.incremental_sync 2272 else None 2273 ) 2274 is_partition_router = ( 2275 
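            # Rough dispatch summary for the cursor builders above (restating the code, not
            # adding behaviour):
            #   _build_concurrent_cursor:
            #     incremental_sync + partition router -> create_concurrent_cursor_from_perpartition_cursor
            #     IncrementingCountCursor             -> create_concurrent_cursor_from_incrementing_count_cursor
            #     DatetimeBasedCursor                 -> create_concurrent_cursor_from_datetime_based_cursor
            #   _build_resumable_cursor:
            #     paginator and no partition router   -> ResumableFullRefreshCursor
            #     partition router present            -> PerPartitionCursor(ChildPartitionResumableFullRefreshCursor)
            #     otherwise                           -> None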
bool(retriever_model.partition_router) if model.incremental_sync else None 2276 ) 2277 2278 if is_not_datetime_cursor: 2279 # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the 2280 # support or unordered slices (for example, when we trigger reports for January and February, the report 2281 # in February can be completed first). Once we have support for custom concurrent cursor or have a new 2282 # implementation available in the CDK, we can enable more cursors here. 2283 raise ValueError( 2284 "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet." 2285 ) 2286 2287 if is_partition_router and not stream_slicer: 2288 # Note that this development is also done in parallel to the per partition development which once merged 2289 # we could support here by calling create_concurrent_cursor_from_perpartition_cursor 2290 raise ValueError("Per partition state is not supported yet for AsyncRetriever.") 2291 2292 if model.incremental_sync: 2293 return self._build_incremental_cursor(model, stream_slicer, config) 2294 2295 return ( 2296 stream_slicer 2297 if self._disable_resumable_full_refresh 2298 else self._build_resumable_cursor(retriever_model, stream_slicer) 2299 ) 2300 2301 def create_default_error_handler( 2302 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2303 ) -> DefaultErrorHandler: 2304 backoff_strategies = [] 2305 if model.backoff_strategies: 2306 for backoff_strategy_model in model.backoff_strategies: 2307 backoff_strategies.append( 2308 self._create_component_from_model(model=backoff_strategy_model, config=config) 2309 ) 2310 2311 response_filters = [] 2312 if model.response_filters: 2313 for response_filter_model in model.response_filters: 2314 response_filters.append( 2315 self._create_component_from_model(model=response_filter_model, config=config) 2316 ) 2317 response_filters.append( 2318 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2319 ) 2320 2321 return DefaultErrorHandler( 2322 backoff_strategies=backoff_strategies, 2323 max_retries=model.max_retries, 2324 response_filters=response_filters, 2325 config=config, 2326 parameters=model.parameters or {}, 2327 ) 2328 2329 def create_default_paginator( 2330 self, 2331 model: DefaultPaginatorModel, 2332 config: Config, 2333 *, 2334 url_base: str, 2335 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2336 decoder: Optional[Decoder] = None, 2337 cursor_used_for_stop_condition: Optional[Cursor] = None, 2338 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2339 if decoder: 2340 if self._is_supported_decoder_for_pagination(decoder): 2341 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2342 else: 2343 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2344 else: 2345 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2346 page_size_option = ( 2347 self._create_component_from_model(model=model.page_size_option, config=config) 2348 if model.page_size_option 2349 else None 2350 ) 2351 page_token_option = ( 2352 self._create_component_from_model(model=model.page_token_option, config=config) 2353 if model.page_token_option 2354 else None 2355 ) 2356 pagination_strategy = self._create_component_from_model( 2357 model=model.pagination_strategy, 2358 config=config, 2359 decoder=decoder_to_use, 2360 extractor_model=extractor_model, 2361 ) 2362 if cursor_used_for_stop_condition: 2363 pagination_strategy = 
StopConditionPaginationStrategyDecorator( 2364 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2365 ) 2366 paginator = DefaultPaginator( 2367 decoder=decoder_to_use, 2368 page_size_option=page_size_option, 2369 page_token_option=page_token_option, 2370 pagination_strategy=pagination_strategy, 2371 url_base=url_base, 2372 config=config, 2373 parameters=model.parameters or {}, 2374 ) 2375 if self._limit_pages_fetched_per_slice: 2376 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2377 return paginator 2378 2379 def create_dpath_extractor( 2380 self, 2381 model: DpathExtractorModel, 2382 config: Config, 2383 decoder: Optional[Decoder] = None, 2384 **kwargs: Any, 2385 ) -> DpathExtractor: 2386 if decoder: 2387 decoder_to_use = decoder 2388 else: 2389 decoder_to_use = JsonDecoder(parameters={}) 2390 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2391 return DpathExtractor( 2392 decoder=decoder_to_use, 2393 field_path=model_field_path, 2394 config=config, 2395 parameters=model.parameters or {}, 2396 ) 2397 2398 @staticmethod 2399 def create_response_to_file_extractor( 2400 model: ResponseToFileExtractorModel, 2401 **kwargs: Any, 2402 ) -> ResponseToFileExtractor: 2403 return ResponseToFileExtractor(parameters=model.parameters or {}) 2404 2405 @staticmethod 2406 def create_exponential_backoff_strategy( 2407 model: ExponentialBackoffStrategyModel, config: Config 2408 ) -> ExponentialBackoffStrategy: 2409 return ExponentialBackoffStrategy( 2410 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2411 ) 2412 2413 @staticmethod 2414 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2415 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2416 2417 def create_http_requester( 2418 self, 2419 model: HttpRequesterModel, 2420 config: Config, 2421 decoder: Decoder = JsonDecoder(parameters={}), 2422 query_properties_key: Optional[str] = None, 2423 use_cache: Optional[bool] = None, 2424 *, 2425 name: str, 2426 ) -> HttpRequester: 2427 authenticator = ( 2428 self._create_component_from_model( 2429 model=model.authenticator, 2430 config=config, 2431 url_base=model.url or model.url_base, 2432 name=name, 2433 decoder=decoder, 2434 ) 2435 if model.authenticator 2436 else None 2437 ) 2438 error_handler = ( 2439 self._create_component_from_model(model=model.error_handler, config=config) 2440 if model.error_handler 2441 else DefaultErrorHandler( 2442 backoff_strategies=[], 2443 response_filters=[], 2444 config=config, 2445 parameters=model.parameters or {}, 2446 ) 2447 ) 2448 2449 api_budget = self._api_budget 2450 2451 # Removes QueryProperties components from the interpolated mappings because it has been designed 2452 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2453 # instead of through jinja interpolation 2454 request_parameters: Optional[Union[str, Mapping[str, str]]] 2455 if isinstance(model.request_parameters, Mapping): 2456 request_parameters = self._remove_query_properties(model.request_parameters) 2457 else: 2458 request_parameters = model.request_parameters 2459 2460 request_options_provider = InterpolatedRequestOptionsProvider( 2461 request_body=model.request_body, 2462 request_body_data=model.request_body_data, 2463 request_body_json=model.request_body_json, 2464 request_headers=model.request_headers, 2465 request_parameters=request_parameters, 2466 
query_properties_key=query_properties_key, 2467 config=config, 2468 parameters=model.parameters or {}, 2469 ) 2470 2471 assert model.use_cache is not None # for mypy 2472 assert model.http_method is not None # for mypy 2473 2474 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2475 2476 return HttpRequester( 2477 name=name, 2478 url=model.url, 2479 url_base=model.url_base, 2480 path=model.path, 2481 authenticator=authenticator, 2482 error_handler=error_handler, 2483 api_budget=api_budget, 2484 http_method=HttpMethod[model.http_method.value], 2485 request_options_provider=request_options_provider, 2486 config=config, 2487 disable_retries=self._disable_retries, 2488 parameters=model.parameters or {}, 2489 message_repository=self._message_repository, 2490 use_cache=should_use_cache, 2491 decoder=decoder, 2492 stream_response=decoder.is_stream_response() if decoder else False, 2493 ) 2494 2495 @staticmethod 2496 def create_http_response_filter( 2497 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2498 ) -> HttpResponseFilter: 2499 if model.action: 2500 action = ResponseAction(model.action.value) 2501 else: 2502 action = None 2503 2504 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2505 2506 http_codes = ( 2507 set(model.http_codes) if model.http_codes else set() 2508 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2509 2510 return HttpResponseFilter( 2511 action=action, 2512 failure_type=failure_type, 2513 error_message=model.error_message or "", 2514 error_message_contains=model.error_message_contains or "", 2515 http_codes=http_codes, 2516 predicate=model.predicate or "", 2517 config=config, 2518 parameters=model.parameters or {}, 2519 ) 2520 2521 @staticmethod 2522 def create_inline_schema_loader( 2523 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2524 ) -> InlineSchemaLoader: 2525 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2526 2527 def create_complex_field_type( 2528 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2529 ) -> ComplexFieldType: 2530 items = ( 2531 self._create_component_from_model(model=model.items, config=config) 2532 if isinstance(model.items, ComplexFieldTypeModel) 2533 else model.items 2534 ) 2535 2536 return ComplexFieldType(field_type=model.field_type, items=items) 2537 2538 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2539 target_type = ( 2540 self._create_component_from_model(model=model.target_type, config=config) 2541 if isinstance(model.target_type, ComplexFieldTypeModel) 2542 else model.target_type 2543 ) 2544 2545 return TypesMap( 2546 target_type=target_type, 2547 current_type=model.current_type, 2548 condition=model.condition if model.condition is not None else "True", 2549 ) 2550 2551 def create_schema_type_identifier( 2552 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2553 ) -> SchemaTypeIdentifier: 2554 types_mapping = [] 2555 if model.types_mapping: 2556 types_mapping.extend( 2557 [ 2558 self._create_component_from_model(types_map, config=config) 2559 for types_map in model.types_mapping 2560 ] 2561 ) 2562 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2563 [x for x in model.schema_pointer] if model.schema_pointer else [] 2564 ) 2565 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2566 model_type_pointer: 
Optional[List[Union[InterpolatedString, str]]] = ( 2567 [x for x in model.type_pointer] if model.type_pointer else None 2568 ) 2569 2570 return SchemaTypeIdentifier( 2571 schema_pointer=model_schema_pointer, 2572 key_pointer=model_key_pointer, 2573 type_pointer=model_type_pointer, 2574 types_mapping=types_mapping, 2575 parameters=model.parameters or {}, 2576 ) 2577 2578 def create_dynamic_schema_loader( 2579 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2580 ) -> DynamicSchemaLoader: 2581 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2582 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2583 2584 schema_transformations = [] 2585 if model.schema_transformations: 2586 for transformation_model in model.schema_transformations: 2587 schema_transformations.append( 2588 self._create_component_from_model(model=transformation_model, config=config) 2589 ) 2590 name = "dynamic_properties" 2591 retriever = self._create_component_from_model( 2592 model=model.retriever, 2593 config=config, 2594 name=name, 2595 primary_key=None, 2596 stream_slicer=combined_slicers, 2597 transformations=[], 2598 use_cache=True, 2599 log_formatter=( 2600 lambda response: format_http_message( 2601 response, 2602 f"Schema loader '{name}' request", 2603 f"Request performed in order to extract schema.", 2604 name, 2605 is_auxiliary=True, 2606 ) 2607 ), 2608 ) 2609 schema_type_identifier = self._create_component_from_model( 2610 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2611 ) 2612 schema_filter = ( 2613 self._create_component_from_model( 2614 model.schema_filter, config=config, parameters=model.parameters or {} 2615 ) 2616 if model.schema_filter is not None 2617 else None 2618 ) 2619 2620 return DynamicSchemaLoader( 2621 retriever=retriever, 2622 config=config, 2623 schema_transformations=schema_transformations, 2624 schema_filter=schema_filter, 2625 schema_type_identifier=schema_type_identifier, 2626 parameters=model.parameters or {}, 2627 ) 2628 2629 @staticmethod 2630 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2631 return JsonDecoder(parameters={}) 2632 2633 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2634 return CompositeRawDecoder( 2635 parser=ModelToComponentFactory._get_parser(model, config), 2636 stream_response=False if self._emit_connector_builder_messages else True, 2637 ) 2638 2639 def create_jsonl_decoder( 2640 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2641 ) -> Decoder: 2642 return CompositeRawDecoder( 2643 parser=ModelToComponentFactory._get_parser(model, config), 2644 stream_response=False if self._emit_connector_builder_messages else True, 2645 ) 2646 2647 def create_gzip_decoder( 2648 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2649 ) -> Decoder: 2650 _compressed_response_types = { 2651 "gzip", 2652 "x-gzip", 2653 "gzip, deflate", 2654 "x-gzip, deflate", 2655 "application/zip", 2656 "application/gzip", 2657 "application/x-gzip", 2658 "application/x-zip-compressed", 2659 } 2660 2661 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2662 2663 if self._emit_connector_builder_messages: 2664 # This is very surprising but if the response is not streamed, 2665 # CompositeRawDecoder calls response.content and the requests library actually uncompress the 
data as opposed to response.raw,
            # which uses urllib3 directly and does not uncompress the data.
            return CompositeRawDecoder(gzip_parser.inner_parser, False)

        return CompositeRawDecoder.by_headers(
            [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)],
            stream_response=True,
            fallback_parser=gzip_parser.inner_parser,
        )

    @staticmethod
    def create_incrementing_count_cursor(
        model: IncrementingCountCursorModel, config: Config, **kwargs: Any
    ) -> DatetimeBasedCursor:
        # This should not actually get used anywhere at runtime, but it is needed to pass checks since
        # we still parse models into components. The issue is that there is no runtime implementation of an
        # IncrementingCountCursor.
        # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor,
        # because the check is run without the ConcurrentCursor.
        return DatetimeBasedCursor(
            cursor_field=model.cursor_field,
            datetime_format="%Y-%m-%d",
            start_datetime="2024-12-12",
            config=config,
            parameters={},
        )

    @staticmethod
    def create_iterable_decoder(
        model: IterableDecoderModel, config: Config, **kwargs: Any
    ) -> IterableDecoder:
        return IterableDecoder(parameters={})

    @staticmethod
    def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder:
        return XmlDecoder(parameters={})

    def create_zipfile_decoder(
        self, model: ZipfileDecoderModel, config: Config, **kwargs: Any
    ) -> ZipfileDecoder:
        return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config))

    @staticmethod
    def _get_parser(model: BaseModel, config: Config) -> Parser:
        if isinstance(model, JsonDecoderModel):
            # Note that the logic is a bit different from the JsonDecoder, as some legacy behavior
            # is maintained to return {} on error cases
            return JsonParser()
        elif isinstance(model, JsonlDecoderModel):
            return JsonLineParser()
        elif isinstance(model, CsvDecoderModel):
            return CsvParser(
                encoding=model.encoding,
                delimiter=model.delimiter,
                set_values_to_none=model.set_values_to_none,
            )
        elif isinstance(model, GzipDecoderModel):
            return GzipParser(
                inner_parser=ModelToComponentFactory._get_parser(model.decoder, config)
            )
        elif isinstance(
            model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel)
        ):
            raise ValueError(f"Decoder type {model} does not have a parser associated with it")

        raise ValueError(f"Unknown decoder type {model}")

    @staticmethod
    def create_json_file_schema_loader(
        model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any
    ) -> JsonFileSchemaLoader:
        return JsonFileSchemaLoader(
            file_path=model.file_path or "", config=config, parameters=model.parameters or {}
        )

    @staticmethod
    def create_jwt_authenticator(
        model: JwtAuthenticatorModel, config: Config, **kwargs: Any
    ) -> JwtAuthenticator:
        jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None)
        jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None)
        return JwtAuthenticator(
            config=config,
            parameters=model.parameters or {},
            algorithm=JwtAlgorithm(model.algorithm.value),
            secret_key=model.secret_key,
base64_encode_secret_key=model.base64_encode_secret_key, 2750 token_duration=model.token_duration, 2751 header_prefix=model.header_prefix, 2752 kid=jwt_headers.kid, 2753 typ=jwt_headers.typ, 2754 cty=jwt_headers.cty, 2755 iss=jwt_payload.iss, 2756 sub=jwt_payload.sub, 2757 aud=jwt_payload.aud, 2758 additional_jwt_headers=model.additional_jwt_headers, 2759 additional_jwt_payload=model.additional_jwt_payload, 2760 ) 2761 2762 def create_list_partition_router( 2763 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2764 ) -> ListPartitionRouter: 2765 request_option = ( 2766 self._create_component_from_model(model.request_option, config) 2767 if model.request_option 2768 else None 2769 ) 2770 return ListPartitionRouter( 2771 cursor_field=model.cursor_field, 2772 request_option=request_option, 2773 values=model.values, 2774 config=config, 2775 parameters=model.parameters or {}, 2776 ) 2777 2778 @staticmethod 2779 def create_min_max_datetime( 2780 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2781 ) -> MinMaxDatetime: 2782 return MinMaxDatetime( 2783 datetime=model.datetime, 2784 datetime_format=model.datetime_format or "", 2785 max_datetime=model.max_datetime or "", 2786 min_datetime=model.min_datetime or "", 2787 parameters=model.parameters or {}, 2788 ) 2789 2790 @staticmethod 2791 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2792 return NoAuth(parameters=model.parameters or {}) 2793 2794 @staticmethod 2795 def create_no_pagination( 2796 model: NoPaginationModel, config: Config, **kwargs: Any 2797 ) -> NoPagination: 2798 return NoPagination(parameters={}) 2799 2800 def create_oauth_authenticator( 2801 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2802 ) -> DeclarativeOauth2Authenticator: 2803 profile_assertion = ( 2804 self._create_component_from_model(model.profile_assertion, config=config) 2805 if model.profile_assertion 2806 else None 2807 ) 2808 2809 if model.refresh_token_updater: 2810 # ignore type error because fixing it would have a lot of dependencies, revisit later 2811 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2812 config, 2813 InterpolatedString.create( 2814 model.token_refresh_endpoint, # type: ignore 2815 parameters=model.parameters or {}, 2816 ).eval(config), 2817 access_token_name=InterpolatedString.create( 2818 model.access_token_name or "access_token", parameters=model.parameters or {} 2819 ).eval(config), 2820 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2821 expires_in_name=InterpolatedString.create( 2822 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2823 ).eval(config), 2824 client_id_name=InterpolatedString.create( 2825 model.client_id_name or "client_id", parameters=model.parameters or {} 2826 ).eval(config), 2827 client_id=InterpolatedString.create( 2828 model.client_id, parameters=model.parameters or {} 2829 ).eval(config) 2830 if model.client_id 2831 else model.client_id, 2832 client_secret_name=InterpolatedString.create( 2833 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2834 ).eval(config), 2835 client_secret=InterpolatedString.create( 2836 model.client_secret, parameters=model.parameters or {} 2837 ).eval(config) 2838 if model.client_secret 2839 else model.client_secret, 2840 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2841 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2842 
token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2843 grant_type_name=InterpolatedString.create( 2844 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2845 ).eval(config), 2846 grant_type=InterpolatedString.create( 2847 model.grant_type or "refresh_token", parameters=model.parameters or {} 2848 ).eval(config), 2849 refresh_request_body=InterpolatedMapping( 2850 model.refresh_request_body or {}, parameters=model.parameters or {} 2851 ).eval(config), 2852 refresh_request_headers=InterpolatedMapping( 2853 model.refresh_request_headers or {}, parameters=model.parameters or {} 2854 ).eval(config), 2855 scopes=model.scopes, 2856 token_expiry_date_format=model.token_expiry_date_format, 2857 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2858 message_repository=self._message_repository, 2859 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2860 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2861 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2862 ) 2863 # ignore type error because fixing it would have a lot of dependencies, revisit later 2864 return DeclarativeOauth2Authenticator( # type: ignore 2865 access_token_name=model.access_token_name or "access_token", 2866 access_token_value=model.access_token_value, 2867 client_id_name=model.client_id_name or "client_id", 2868 client_id=model.client_id, 2869 client_secret_name=model.client_secret_name or "client_secret", 2870 client_secret=model.client_secret, 2871 expires_in_name=model.expires_in_name or "expires_in", 2872 grant_type_name=model.grant_type_name or "grant_type", 2873 grant_type=model.grant_type or "refresh_token", 2874 refresh_request_body=model.refresh_request_body, 2875 refresh_request_headers=model.refresh_request_headers, 2876 refresh_token_name=model.refresh_token_name or "refresh_token", 2877 refresh_token=model.refresh_token, 2878 scopes=model.scopes, 2879 token_expiry_date=model.token_expiry_date, 2880 token_expiry_date_format=model.token_expiry_date_format, 2881 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2882 token_refresh_endpoint=model.token_refresh_endpoint, 2883 config=config, 2884 parameters=model.parameters or {}, 2885 message_repository=self._message_repository, 2886 profile_assertion=profile_assertion, 2887 use_profile_assertion=model.use_profile_assertion, 2888 ) 2889 2890 def create_offset_increment( 2891 self, 2892 model: OffsetIncrementModel, 2893 config: Config, 2894 decoder: Decoder, 2895 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2896 **kwargs: Any, 2897 ) -> OffsetIncrement: 2898 if isinstance(decoder, PaginationDecoderDecorator): 2899 inner_decoder = decoder.decoder 2900 else: 2901 inner_decoder = decoder 2902 decoder = PaginationDecoderDecorator(decoder=decoder) 2903 2904 if self._is_supported_decoder_for_pagination(inner_decoder): 2905 decoder_to_use = decoder 2906 else: 2907 raise ValueError( 2908 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2909 ) 2910 2911 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2912 # so that it can be shared by OffSetIncrement and RecordSelector. 
However, due to how we instantiate the 2913 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2914 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2915 # When we have more time to investigate we can look into reusing the same component. 2916 extractor = ( 2917 self._create_component_from_model( 2918 model=extractor_model, config=config, decoder=decoder_to_use 2919 ) 2920 if extractor_model 2921 else None 2922 ) 2923 2924 return OffsetIncrement( 2925 page_size=model.page_size, 2926 config=config, 2927 decoder=decoder_to_use, 2928 extractor=extractor, 2929 inject_on_first_request=model.inject_on_first_request or False, 2930 parameters=model.parameters or {}, 2931 ) 2932 2933 @staticmethod 2934 def create_page_increment( 2935 model: PageIncrementModel, config: Config, **kwargs: Any 2936 ) -> PageIncrement: 2937 return PageIncrement( 2938 page_size=model.page_size, 2939 config=config, 2940 start_from_page=model.start_from_page or 0, 2941 inject_on_first_request=model.inject_on_first_request or False, 2942 parameters=model.parameters or {}, 2943 ) 2944 2945 def create_parent_stream_config( 2946 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2947 ) -> ParentStreamConfig: 2948 declarative_stream = self._create_component_from_model( 2949 model.stream, config=config, **kwargs 2950 ) 2951 request_option = ( 2952 self._create_component_from_model(model.request_option, config=config) 2953 if model.request_option 2954 else None 2955 ) 2956 2957 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2958 raise ValueError( 2959 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 
2960 ) 2961 2962 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2963 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2964 ) 2965 2966 return ParentStreamConfig( 2967 parent_key=model.parent_key, 2968 request_option=request_option, 2969 stream=declarative_stream, 2970 partition_field=model.partition_field, 2971 config=config, 2972 incremental_dependency=model.incremental_dependency or False, 2973 parameters=model.parameters or {}, 2974 extra_fields=model.extra_fields, 2975 lazy_read_pointer=model_lazy_read_pointer, 2976 ) 2977 2978 def create_properties_from_endpoint( 2979 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2980 ) -> PropertiesFromEndpoint: 2981 retriever = self._create_component_from_model( 2982 model=model.retriever, 2983 config=config, 2984 name="dynamic_properties", 2985 primary_key=None, 2986 stream_slicer=None, 2987 transformations=[], 2988 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 2989 ) 2990 return PropertiesFromEndpoint( 2991 property_field_path=model.property_field_path, 2992 retriever=retriever, 2993 config=config, 2994 parameters=model.parameters or {}, 2995 ) 2996 2997 def create_property_chunking( 2998 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2999 ) -> PropertyChunking: 3000 record_merge_strategy = ( 3001 self._create_component_from_model( 3002 model=model.record_merge_strategy, config=config, **kwargs 3003 ) 3004 if model.record_merge_strategy 3005 else None 3006 ) 3007 3008 property_limit_type: PropertyLimitType 3009 match model.property_limit_type: 3010 case PropertyLimitTypeModel.property_count: 3011 property_limit_type = PropertyLimitType.property_count 3012 case PropertyLimitTypeModel.characters: 3013 property_limit_type = PropertyLimitType.characters 3014 case _: 3015 raise ValueError(f"Invalid PropertyLimitType {property_limit_type}") 3016 3017 return PropertyChunking( 3018 property_limit_type=property_limit_type, 3019 property_limit=model.property_limit, 3020 record_merge_strategy=record_merge_strategy, 3021 config=config, 3022 parameters=model.parameters or {}, 3023 ) 3024 3025 def create_query_properties( 3026 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 3027 ) -> QueryProperties: 3028 if isinstance(model.property_list, list): 3029 property_list = model.property_list 3030 else: 3031 property_list = self._create_component_from_model( 3032 model=model.property_list, config=config, **kwargs 3033 ) 3034 3035 property_chunking = ( 3036 self._create_component_from_model( 3037 model=model.property_chunking, config=config, **kwargs 3038 ) 3039 if model.property_chunking 3040 else None 3041 ) 3042 3043 return QueryProperties( 3044 property_list=property_list, 3045 always_include_properties=model.always_include_properties, 3046 property_chunking=property_chunking, 3047 config=config, 3048 parameters=model.parameters or {}, 3049 ) 3050 3051 @staticmethod 3052 def create_record_filter( 3053 model: RecordFilterModel, config: Config, **kwargs: Any 3054 ) -> RecordFilter: 3055 return RecordFilter( 3056 condition=model.condition or "", config=config, parameters=model.parameters or {} 3057 ) 3058 3059 @staticmethod 3060 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 3061 return RequestPath(parameters={}) 3062 3063 @staticmethod 3064 def 
create_request_option( 3065 model: RequestOptionModel, config: Config, **kwargs: Any 3066 ) -> RequestOption: 3067 inject_into = RequestOptionType(model.inject_into.value) 3068 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3069 [ 3070 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3071 for segment in model.field_path 3072 ] 3073 if model.field_path 3074 else None 3075 ) 3076 field_name = ( 3077 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3078 if model.field_name 3079 else None 3080 ) 3081 return RequestOption( 3082 field_name=field_name, 3083 field_path=field_path, 3084 inject_into=inject_into, 3085 parameters=kwargs.get("parameters", {}), 3086 ) 3087 3088 def create_record_selector( 3089 self, 3090 model: RecordSelectorModel, 3091 config: Config, 3092 *, 3093 name: str, 3094 transformations: List[RecordTransformation] | None = None, 3095 decoder: Decoder | None = None, 3096 client_side_incremental_sync: Dict[str, Any] | None = None, 3097 file_uploader: Optional[DefaultFileUploader] = None, 3098 **kwargs: Any, 3099 ) -> RecordSelector: 3100 extractor = self._create_component_from_model( 3101 model=model.extractor, decoder=decoder, config=config 3102 ) 3103 record_filter = ( 3104 self._create_component_from_model(model.record_filter, config=config) 3105 if model.record_filter 3106 else None 3107 ) 3108 3109 transform_before_filtering = ( 3110 False if model.transform_before_filtering is None else model.transform_before_filtering 3111 ) 3112 if client_side_incremental_sync: 3113 record_filter = ClientSideIncrementalRecordFilterDecorator( 3114 config=config, 3115 parameters=model.parameters, 3116 condition=model.record_filter.condition 3117 if (model.record_filter and hasattr(model.record_filter, "condition")) 3118 else None, 3119 **client_side_incremental_sync, 3120 ) 3121 transform_before_filtering = ( 3122 True 3123 if model.transform_before_filtering is None 3124 else model.transform_before_filtering 3125 ) 3126 3127 if model.schema_normalization is None: 3128 # default to no schema normalization if not set 3129 model.schema_normalization = SchemaNormalizationModel.None_ 3130 3131 schema_normalization = ( 3132 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3133 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3134 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3135 ) 3136 3137 return RecordSelector( 3138 extractor=extractor, 3139 name=name, 3140 config=config, 3141 record_filter=record_filter, 3142 transformations=transformations or [], 3143 file_uploader=file_uploader, 3144 schema_normalization=schema_normalization, 3145 parameters=model.parameters or {}, 3146 transform_before_filtering=transform_before_filtering, 3147 ) 3148 3149 @staticmethod 3150 def create_remove_fields( 3151 model: RemoveFieldsModel, config: Config, **kwargs: Any 3152 ) -> RemoveFields: 3153 return RemoveFields( 3154 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3155 ) 3156 3157 def create_selective_authenticator( 3158 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3159 ) -> DeclarativeAuthenticator: 3160 authenticators = { 3161 name: self._create_component_from_model(model=auth, config=config) 3162 for name, auth in model.authenticators.items() 3163 } 3164 # SelectiveAuthenticator will return instance of 
DeclarativeAuthenticator or raise ValueError error 3165 return SelectiveAuthenticator( # type: ignore[abstract] 3166 config=config, 3167 authenticators=authenticators, 3168 authenticator_selection_path=model.authenticator_selection_path, 3169 **kwargs, 3170 ) 3171 3172 @staticmethod 3173 def create_legacy_session_token_authenticator( 3174 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3175 ) -> LegacySessionTokenAuthenticator: 3176 return LegacySessionTokenAuthenticator( 3177 api_url=url_base, 3178 header=model.header, 3179 login_url=model.login_url, 3180 password=model.password or "", 3181 session_token=model.session_token or "", 3182 session_token_response_key=model.session_token_response_key or "", 3183 username=model.username or "", 3184 validate_session_url=model.validate_session_url, 3185 config=config, 3186 parameters=model.parameters or {}, 3187 ) 3188 3189 def create_simple_retriever( 3190 self, 3191 model: SimpleRetrieverModel, 3192 config: Config, 3193 *, 3194 name: str, 3195 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3196 stream_slicer: Optional[StreamSlicer], 3197 request_options_provider: Optional[RequestOptionsProvider] = None, 3198 stop_condition_cursor: Optional[Cursor] = None, 3199 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3200 transformations: List[RecordTransformation], 3201 file_uploader: Optional[DefaultFileUploader] = None, 3202 incremental_sync: Optional[ 3203 Union[ 3204 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 3205 ] 3206 ] = None, 3207 use_cache: Optional[bool] = None, 3208 log_formatter: Optional[Callable[[Response], Any]] = None, 3209 **kwargs: Any, 3210 ) -> SimpleRetriever: 3211 def _get_url() -> str: 3212 """ 3213 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3214 This is needed because the URL is not set until the requester is created. 
3215 """ 3216 3217 _url: str = ( 3218 model.requester.url 3219 if hasattr(model.requester, "url") and model.requester.url is not None 3220 else requester.get_url() 3221 ) 3222 _url_base: str = ( 3223 model.requester.url_base 3224 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3225 else requester.get_url_base() 3226 ) 3227 3228 return _url or _url_base 3229 3230 decoder = ( 3231 self._create_component_from_model(model=model.decoder, config=config) 3232 if model.decoder 3233 else JsonDecoder(parameters={}) 3234 ) 3235 record_selector = self._create_component_from_model( 3236 model=model.record_selector, 3237 name=name, 3238 config=config, 3239 decoder=decoder, 3240 transformations=transformations, 3241 client_side_incremental_sync=client_side_incremental_sync, 3242 file_uploader=file_uploader, 3243 ) 3244 3245 query_properties: Optional[QueryProperties] = None 3246 query_properties_key: Optional[str] = None 3247 if self._query_properties_in_request_parameters(model.requester): 3248 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3249 # places instead of default to request_parameters which isn't clearly documented 3250 if ( 3251 hasattr(model.requester, "fetch_properties_from_endpoint") 3252 and model.requester.fetch_properties_from_endpoint 3253 ): 3254 raise ValueError( 3255 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3256 ) 3257 3258 query_properties_definitions = [] 3259 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3260 if isinstance(request_parameter, QueryPropertiesModel): 3261 query_properties_key = key 3262 query_properties_definitions.append(request_parameter) 3263 3264 if len(query_properties_definitions) > 1: 3265 raise ValueError( 3266 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3267 ) 3268 3269 if len(query_properties_definitions) == 1: 3270 query_properties = self._create_component_from_model( 3271 model=query_properties_definitions[0], config=config 3272 ) 3273 elif ( 3274 hasattr(model.requester, "fetch_properties_from_endpoint") 3275 and model.requester.fetch_properties_from_endpoint 3276 ): 3277 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3278 query_properties_definition = QueryPropertiesModel( 3279 type="QueryProperties", 3280 property_list=model.requester.fetch_properties_from_endpoint, 3281 always_include_properties=None, 3282 property_chunking=None, 3283 ) # type: ignore # $parameters has a default value 3284 3285 query_properties = self.create_query_properties( 3286 model=query_properties_definition, 3287 config=config, 3288 ) 3289 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3290 query_properties = self.create_query_properties( 3291 model=model.requester.query_properties, 3292 config=config, 3293 ) 3294 3295 requester = self._create_component_from_model( 3296 model=model.requester, 3297 decoder=decoder, 3298 name=name, 3299 query_properties_key=query_properties_key, 3300 use_cache=use_cache, 3301 config=config, 3302 ) 3303 3304 # Define cursor only if per partition or common incremental support is needed 3305 cursor = stream_slicer if 
isinstance(stream_slicer, DeclarativeCursor) else None

        if (
            not isinstance(stream_slicer, DatetimeBasedCursor)
            or type(stream_slicer) is not DatetimeBasedCursor
        ):
            # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
            # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
            # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
            # request_options_provider
            request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={})
        elif not request_options_provider:
            request_options_provider = DefaultRequestOptionsProvider(parameters={})

        stream_slicer = stream_slicer or SinglePartitionRouter(parameters={})
        if self._should_limit_slices_fetched():
            stream_slicer = cast(
                StreamSlicer,
                StreamSlicerTestReadDecorator(
                    wrapped_slicer=stream_slicer,
                    maximum_number_of_slices=self._limit_slices_fetched or 5,
                ),
            )

        paginator = (
            self._create_component_from_model(
                model=model.paginator,
                config=config,
                url_base=_get_url(),
                extractor_model=model.record_selector.extractor,
                decoder=decoder,
                cursor_used_for_stop_condition=stop_condition_cursor or None,
            )
            if model.paginator
            else NoPagination(parameters={})
        )

        ignore_stream_slicer_parameters_on_paginated_requests = (
            model.ignore_stream_slicer_parameters_on_paginated_requests or False
        )

        if (
            model.partition_router
            and isinstance(model.partition_router, SubstreamPartitionRouterModel)
            and not bool(self._connector_state_manager.get_stream_state(name, None))
            and any(
                parent_stream_config.lazy_read_pointer
                for parent_stream_config in model.partition_router.parent_stream_configs
            )
        ):
            if incremental_sync:
                if incremental_sync.type != "DatetimeBasedCursor":
                    raise ValueError(
                        f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                    )

                elif incremental_sync.step or incremental_sync.cursor_granularity:
                    raise ValueError(
                        f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                    )

            if model.decoder and model.decoder.type != "JsonDecoder":
                raise ValueError(
                    f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3369 ) 3370 3371 return LazySimpleRetriever( 3372 name=name, 3373 paginator=paginator, 3374 primary_key=primary_key, 3375 requester=requester, 3376 record_selector=record_selector, 3377 stream_slicer=stream_slicer, 3378 request_option_provider=request_options_provider, 3379 cursor=cursor, 3380 config=config, 3381 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3382 parameters=model.parameters or {}, 3383 ) 3384 3385 return SimpleRetriever( 3386 name=name, 3387 paginator=paginator, 3388 primary_key=primary_key, 3389 requester=requester, 3390 record_selector=record_selector, 3391 stream_slicer=stream_slicer, 3392 request_option_provider=request_options_provider, 3393 cursor=cursor, 3394 config=config, 3395 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3396 additional_query_properties=query_properties, 3397 log_formatter=self._get_log_formatter(log_formatter, name), 3398 parameters=model.parameters or {}, 3399 ) 3400 3401 def _get_log_formatter( 3402 self, log_formatter: Callable[[Response], Any] | None, name: str 3403 ) -> Callable[[Response], Any] | None: 3404 if self._should_limit_slices_fetched(): 3405 return ( 3406 ( 3407 lambda response: format_http_message( 3408 response, 3409 f"Stream '{name}' request", 3410 f"Request performed in order to extract records for stream '{name}'", 3411 name, 3412 ) 3413 ) 3414 if not log_formatter 3415 else log_formatter 3416 ) 3417 return None 3418 3419 def _should_limit_slices_fetched(self) -> bool: 3420 """ 3421 Returns True if the number of slices fetched should be limited, False otherwise. 3422 This is used to limit the number of slices fetched during tests. 3423 """ 3424 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3425 3426 @staticmethod 3427 def _query_properties_in_request_parameters( 3428 requester: Union[HttpRequesterModel, CustomRequesterModel], 3429 ) -> bool: 3430 if not hasattr(requester, "request_parameters"): 3431 return False 3432 request_parameters = requester.request_parameters 3433 if request_parameters and isinstance(request_parameters, Mapping): 3434 for request_parameter in request_parameters.values(): 3435 if isinstance(request_parameter, QueryPropertiesModel): 3436 return True 3437 return False 3438 3439 @staticmethod 3440 def _remove_query_properties( 3441 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3442 ) -> Mapping[str, str]: 3443 return { 3444 parameter_field: request_parameter 3445 for parameter_field, request_parameter in request_parameters.items() 3446 if not isinstance(request_parameter, QueryPropertiesModel) 3447 } 3448 3449 def create_state_delegating_stream( 3450 self, 3451 model: StateDelegatingStreamModel, 3452 config: Config, 3453 has_parent_state: Optional[bool] = None, 3454 **kwargs: Any, 3455 ) -> DeclarativeStream: 3456 if ( 3457 model.full_refresh_stream.name != model.name 3458 or model.name != model.incremental_stream.name 3459 ): 3460 raise ValueError( 3461 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3462 ) 3463 3464 stream_model = ( 3465 model.incremental_stream 3466 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3467 else model.full_refresh_stream 3468 ) 3469 3470 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description 3471 3472 def _create_async_job_status_mapping( 3473 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3474 ) -> Mapping[str, AsyncJobStatus]: 3475 api_status_to_cdk_status = {} 3476 for cdk_status, api_statuses in model.dict().items(): 3477 if cdk_status == "type": 3478 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3479 continue 3480 3481 for status in api_statuses: 3482 if status in api_status_to_cdk_status: 3483 raise ValueError( 3484 f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once" 3485 ) 3486 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3487 return api_status_to_cdk_status 3488 3489 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3490 match status: 3491 case "running": 3492 return AsyncJobStatus.RUNNING 3493 case "completed": 3494 return AsyncJobStatus.COMPLETED 3495 case "failed": 3496 return AsyncJobStatus.FAILED 3497 case "timeout": 3498 return AsyncJobStatus.TIMED_OUT 3499 case _: 3500 raise ValueError(f"Unsupported CDK status {status}") 3501 3502 def create_async_retriever( 3503 self, 3504 model: AsyncRetrieverModel, 3505 config: Config, 3506 *, 3507 name: str, 3508 primary_key: Optional[ 3509 Union[str, List[str], List[List[str]]] 3510 ], # this seems to be needed to match create_simple_retriever 3511 stream_slicer: Optional[StreamSlicer], 3512 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3513 transformations: List[RecordTransformation], 3514 **kwargs: Any, 3515 ) -> AsyncRetriever: 3516 def _get_download_retriever() -> SimpleRetriever: 3517 # We create a record selector for the download retriever 3518 # with no schema normalization and no transformations, neither record filter 3519 # as all this occurs in the record_selector of the AsyncRetriever 3520 record_selector = RecordSelector( 3521 extractor=download_extractor, 3522 name=name, 3523 record_filter=None, 3524 transformations=[], 3525 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3526 config=config, 3527 parameters={}, 3528 ) 3529 paginator = ( 3530 self._create_component_from_model( 3531 model=model.download_paginator, 3532 decoder=decoder, 3533 config=config, 3534 url_base="", 3535 ) 3536 if model.download_paginator 3537 else NoPagination(parameters={}) 3538 ) 3539 3540 return SimpleRetriever( 3541 requester=download_requester, 3542 record_selector=record_selector, 3543 primary_key=None, 3544 name=name, 3545 paginator=paginator, 3546 config=config, 3547 parameters={}, 3548 log_formatter=self._get_log_formatter(None, name), 3549 ) 3550 3551 def _get_job_timeout() -> datetime.timedelta: 3552 user_defined_timeout: Optional[int] = ( 3553 int( 3554 InterpolatedString.create( 3555 str(model.polling_job_timeout), 3556 parameters={}, 3557 ).eval(config) 3558 ) 3559 if model.polling_job_timeout 3560 else None 3561 ) 3562 3563 # check for user defined timeout during the test read or 15 minutes 3564 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3565 # default value for non-connector builder is 60 minutes. 
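            # Illustrative example (hypothetical manifest values, not part of this module): with
            # polling_job_timeout: 30, both timeouts below evaluate to timedelta(minutes=30); with the
            # field unset, a Connector Builder test read falls back to 15 minutes and a regular sync to 60.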
3566 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3567 3568 return ( 3569 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3570 ) 3571 3572 decoder = ( 3573 self._create_component_from_model(model=model.decoder, config=config) 3574 if model.decoder 3575 else JsonDecoder(parameters={}) 3576 ) 3577 record_selector = self._create_component_from_model( 3578 model=model.record_selector, 3579 config=config, 3580 decoder=decoder, 3581 name=name, 3582 transformations=transformations, 3583 client_side_incremental_sync=client_side_incremental_sync, 3584 ) 3585 3586 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3587 if self._should_limit_slices_fetched(): 3588 stream_slicer = cast( 3589 StreamSlicer, 3590 StreamSlicerTestReadDecorator( 3591 wrapped_slicer=stream_slicer, 3592 maximum_number_of_slices=self._limit_slices_fetched or 5, 3593 ), 3594 ) 3595 3596 creation_requester = self._create_component_from_model( 3597 model=model.creation_requester, 3598 decoder=decoder, 3599 config=config, 3600 name=f"job creation - {name}", 3601 ) 3602 polling_requester = self._create_component_from_model( 3603 model=model.polling_requester, 3604 decoder=decoder, 3605 config=config, 3606 name=f"job polling - {name}", 3607 ) 3608 job_download_components_name = f"job download - {name}" 3609 download_decoder = ( 3610 self._create_component_from_model(model=model.download_decoder, config=config) 3611 if model.download_decoder 3612 else JsonDecoder(parameters={}) 3613 ) 3614 download_extractor = ( 3615 self._create_component_from_model( 3616 model=model.download_extractor, 3617 config=config, 3618 decoder=download_decoder, 3619 parameters=model.parameters, 3620 ) 3621 if model.download_extractor 3622 else DpathExtractor( 3623 [], 3624 config=config, 3625 decoder=download_decoder, 3626 parameters=model.parameters or {}, 3627 ) 3628 ) 3629 download_requester = self._create_component_from_model( 3630 model=model.download_requester, 3631 decoder=download_decoder, 3632 config=config, 3633 name=job_download_components_name, 3634 ) 3635 download_retriever = _get_download_retriever() 3636 abort_requester = ( 3637 self._create_component_from_model( 3638 model=model.abort_requester, 3639 decoder=decoder, 3640 config=config, 3641 name=f"job abort - {name}", 3642 ) 3643 if model.abort_requester 3644 else None 3645 ) 3646 delete_requester = ( 3647 self._create_component_from_model( 3648 model=model.delete_requester, 3649 decoder=decoder, 3650 config=config, 3651 name=f"job delete - {name}", 3652 ) 3653 if model.delete_requester 3654 else None 3655 ) 3656 download_target_requester = ( 3657 self._create_component_from_model( 3658 model=model.download_target_requester, 3659 decoder=decoder, 3660 config=config, 3661 name=f"job extract_url - {name}", 3662 ) 3663 if model.download_target_requester 3664 else None 3665 ) 3666 status_extractor = self._create_component_from_model( 3667 model=model.status_extractor, decoder=decoder, config=config, name=name 3668 ) 3669 download_target_extractor = self._create_component_from_model( 3670 model=model.download_target_extractor, 3671 decoder=decoder, 3672 config=config, 3673 name=name, 3674 ) 3675 3676 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3677 creation_requester=creation_requester, 3678 polling_requester=polling_requester, 3679 download_retriever=download_retriever, 3680 download_target_requester=download_target_requester, 3681 abort_requester=abort_requester, 3682 
delete_requester=delete_requester,
            status_extractor=status_extractor,
            status_mapping=self._create_async_job_status_mapping(model.status_mapping, config),
            download_target_extractor=download_target_extractor,
            job_timeout=_get_job_timeout(),
        )

        async_job_partition_router = AsyncJobPartitionRouter(
            job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
                job_repository,
                stream_slices,
                self._job_tracker,
                self._message_repository,
                # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk
                has_bulk_parent=False,
                # set the `job_max_retry` to 1 for the `Connector Builder` use-case.
                # `None` means the default retry of 3 attempts is used under the hood.
                job_max_retry=1 if self._emit_connector_builder_messages else None,
            ),
            stream_slicer=stream_slicer,
            config=config,
            parameters=model.parameters or {},
        )

        return AsyncRetriever(
            record_selector=record_selector,
            stream_slicer=async_job_partition_router,
            config=config,
            parameters=model.parameters or {},
        )

    def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec:
        config_migrations = [
            self._create_component_from_model(migration, config)
            for migration in (
                model.config_normalization_rules.config_migrations
                if (
                    model.config_normalization_rules
                    and model.config_normalization_rules.config_migrations
                )
                else []
            )
        ]
        config_transformations = [
            self._create_component_from_model(transformation, config)
            for transformation in (
                model.config_normalization_rules.transformations
                if (
                    model.config_normalization_rules
                    and model.config_normalization_rules.transformations
                )
                else []
            )
        ]
        config_validations = [
            self._create_component_from_model(validation, config)
            for validation in (
                model.config_normalization_rules.validations
                if (
                    model.config_normalization_rules
                    and model.config_normalization_rules.validations
                )
                else []
            )
        ]

        return Spec(
            connection_specification=model.connection_specification,
            documentation_url=model.documentation_url,
            advanced_auth=model.advanced_auth,
            parameters={},
            config_migrations=config_migrations,
            config_transformations=config_transformations,
            config_validations=config_validations,
        )

    def create_substream_partition_router(
        self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any
    ) -> SubstreamPartitionRouter:
        parent_stream_configs = []
        if model.parent_stream_configs:
            parent_stream_configs.extend(
                [
                    self._create_message_repository_substream_wrapper(
                        model=parent_stream_config, config=config, **kwargs
                    )
                    for parent_stream_config in model.parent_stream_configs
                ]
            )

        return SubstreamPartitionRouter(
            parent_stream_configs=parent_stream_configs,
            parameters=model.parameters or {},
            config=config,
        )

    def _create_message_repository_substream_wrapper(
        self, model: ParentStreamConfigModel, config: Config, **kwargs: Any
    ) -> Any:
        substream_factory = ModelToComponentFactory(
            limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice,
            limit_slices_fetched=self._limit_slices_fetched,
            emit_connector_builder_messages=self._emit_connector_builder_messages,
disable_retries=self._disable_retries, 3786 disable_cache=self._disable_cache, 3787 message_repository=LogAppenderMessageRepositoryDecorator( 3788 {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}}, 3789 self._message_repository, 3790 self._evaluate_log_level(self._emit_connector_builder_messages), 3791 ), 3792 ) 3793 3794 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3795 has_parent_state = bool( 3796 self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None) 3797 if model.incremental_dependency 3798 else False 3799 ) 3800 return substream_factory._create_component_from_model( 3801 model=model, config=config, has_parent_state=has_parent_state, **kwargs 3802 ) 3803 3804 @staticmethod 3805 def create_wait_time_from_header( 3806 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3807 ) -> WaitTimeFromHeaderBackoffStrategy: 3808 return WaitTimeFromHeaderBackoffStrategy( 3809 header=model.header, 3810 parameters=model.parameters or {}, 3811 config=config, 3812 regex=model.regex, 3813 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3814 if model.max_waiting_time_in_seconds is not None 3815 else None, 3816 ) 3817 3818 @staticmethod 3819 def create_wait_until_time_from_header( 3820 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3821 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3822 return WaitUntilTimeFromHeaderBackoffStrategy( 3823 header=model.header, 3824 parameters=model.parameters or {}, 3825 config=config, 3826 min_wait=model.min_wait, 3827 regex=model.regex, 3828 ) 3829 3830 def get_message_repository(self) -> MessageRepository: 3831 return self._message_repository 3832 3833 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3834 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3835 3836 @staticmethod 3837 def create_components_mapping_definition( 3838 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3839 ) -> ComponentMappingDefinition: 3840 interpolated_value = InterpolatedString.create( 3841 model.value, parameters=model.parameters or {} 3842 ) 3843 field_path = [ 3844 InterpolatedString.create(path, parameters=model.parameters or {}) 3845 for path in model.field_path 3846 ] 3847 return ComponentMappingDefinition( 3848 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3849 value=interpolated_value, 3850 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3851 create_or_update=model.create_or_update, 3852 condition=model.condition, 3853 parameters=model.parameters or {}, 3854 ) 3855 3856 def create_http_components_resolver( 3857 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 3858 ) -> Any: 3859 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3860 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3861 3862 retriever = self._create_component_from_model( 3863 model=model.retriever, 3864 config=config, 3865 name=f"{stream_name if stream_name else '__http_components_resolver'}", 3866 primary_key=None, 3867 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3868 transformations=[], 3869 ) 3870 3871 components_mapping = [] 3872 for component_mapping_definition_model in model.components_mapping: 3873 if component_mapping_definition_model.condition: 3874 raise 
ValueError("`condition` is only supported for `ConfigComponentsResolver`") 3875 components_mapping.append( 3876 self._create_component_from_model( 3877 model=component_mapping_definition_model, 3878 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3879 component_mapping_definition_model.value_type 3880 ), 3881 config=config, 3882 ) 3883 ) 3884 3885 return HttpComponentsResolver( 3886 retriever=retriever, 3887 config=config, 3888 components_mapping=components_mapping, 3889 parameters=model.parameters or {}, 3890 ) 3891 3892 @staticmethod 3893 def create_stream_config( 3894 model: StreamConfigModel, config: Config, **kwargs: Any 3895 ) -> StreamConfig: 3896 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3897 [x for x in model.configs_pointer] if model.configs_pointer else [] 3898 ) 3899 3900 return StreamConfig( 3901 configs_pointer=model_configs_pointer, 3902 default_values=model.default_values, 3903 parameters=model.parameters or {}, 3904 ) 3905 3906 def create_config_components_resolver( 3907 self, 3908 model: ConfigComponentsResolverModel, 3909 config: Config, 3910 ) -> Any: 3911 model_stream_configs = ( 3912 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3913 ) 3914 3915 stream_configs = [ 3916 self._create_component_from_model( 3917 stream_config, config=config, parameters=model.parameters or {} 3918 ) 3919 for stream_config in model_stream_configs 3920 ] 3921 3922 components_mapping = [ 3923 self._create_component_from_model( 3924 model=components_mapping_definition_model, 3925 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3926 components_mapping_definition_model.value_type 3927 ), 3928 config=config, 3929 parameters=model.parameters, 3930 ) 3931 for components_mapping_definition_model in model.components_mapping 3932 ] 3933 3934 return ConfigComponentsResolver( 3935 stream_configs=stream_configs, 3936 config=config, 3937 components_mapping=components_mapping, 3938 parameters=model.parameters or {}, 3939 ) 3940 3941 def create_parametrized_components_resolver( 3942 self, 3943 model: ParametrizedComponentsResolverModel, 3944 config: Config, 3945 ) -> ParametrizedComponentsResolver: 3946 stream_parameters = StreamParametersDefinition( 3947 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 3948 ) 3949 3950 components_mapping = [] 3951 for components_mapping_definition_model in model.components_mapping: 3952 if components_mapping_definition_model.condition: 3953 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 3954 components_mapping.append( 3955 self._create_component_from_model( 3956 model=components_mapping_definition_model, 3957 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3958 components_mapping_definition_model.value_type 3959 ), 3960 config=config, 3961 ) 3962 ) 3963 return ParametrizedComponentsResolver( 3964 stream_parameters=stream_parameters, 3965 config=config, 3966 components_mapping=components_mapping, 3967 parameters=model.parameters or {}, 3968 ) 3969 3970 _UNSUPPORTED_DECODER_ERROR = ( 3971 "Specified decoder of {decoder_type} is not supported for pagination." 3972 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 3973 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 
3974 ) 3975 3976 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 3977 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 3978 return True 3979 elif isinstance(decoder, CompositeRawDecoder): 3980 return self._is_supported_parser_for_pagination(decoder.parser) 3981 else: 3982 return False 3983 3984 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 3985 if isinstance(parser, JsonParser): 3986 return True 3987 elif isinstance(parser, GzipParser): 3988 return isinstance(parser.inner_parser, JsonParser) 3989 else: 3990 return False 3991 3992 def create_http_api_budget( 3993 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 3994 ) -> HttpAPIBudget: 3995 policies = [ 3996 self._create_component_from_model(model=policy, config=config) 3997 for policy in model.policies 3998 ] 3999 4000 return HttpAPIBudget( 4001 policies=policies, 4002 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4003 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4004 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4005 ) 4006 4007 def create_fixed_window_call_rate_policy( 4008 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4009 ) -> FixedWindowCallRatePolicy: 4010 matchers = [ 4011 self._create_component_from_model(model=matcher, config=config) 4012 for matcher in model.matchers 4013 ] 4014 4015 # Set the initial reset timestamp to 10 days from now. 4016 # This value will be updated by the first request. 4017 return FixedWindowCallRatePolicy( 4018 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4019 period=parse_duration(model.period), 4020 call_limit=model.call_limit, 4021 matchers=matchers, 4022 ) 4023 4024 def create_file_uploader( 4025 self, model: FileUploaderModel, config: Config, **kwargs: Any 4026 ) -> FileUploader: 4027 name = "File Uploader" 4028 requester = self._create_component_from_model( 4029 model=model.requester, 4030 config=config, 4031 name=name, 4032 **kwargs, 4033 ) 4034 download_target_extractor = self._create_component_from_model( 4035 model=model.download_target_extractor, 4036 config=config, 4037 name=name, 4038 **kwargs, 4039 ) 4040 emit_connector_builder_messages = self._emit_connector_builder_messages 4041 file_uploader = DefaultFileUploader( 4042 requester=requester, 4043 download_target_extractor=download_target_extractor, 4044 config=config, 4045 file_writer=NoopFileWriter() 4046 if emit_connector_builder_messages 4047 else LocalFileSystemFileWriter(), 4048 parameters=model.parameters or {}, 4049 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4050 ) 4051 4052 return ( 4053 ConnectorBuilderFileUploader(file_uploader) 4054 if emit_connector_builder_messages 4055 else file_uploader 4056 ) 4057 4058 def create_moving_window_call_rate_policy( 4059 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4060 ) -> MovingWindowCallRatePolicy: 4061 rates = [ 4062 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4063 ] 4064 matchers = [ 4065 self._create_component_from_model(model=matcher, config=config) 4066 for matcher in model.matchers 4067 ] 4068 return MovingWindowCallRatePolicy( 4069 rates=rates, 4070 matchers=matchers, 4071 ) 4072 4073 def create_unlimited_call_rate_policy( 4074 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4075 ) -> UnlimitedCallRatePolicy: 4076 matchers = [ 4077 
self._create_component_from_model(model=matcher, config=config) 4078 for matcher in model.matchers 4079 ] 4080 4081 return UnlimitedCallRatePolicy( 4082 matchers=matchers, 4083 ) 4084 4085 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4086 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4087 return Rate( 4088 limit=int(interpolated_limit.eval(config=config)), 4089 interval=parse_duration(model.interval), 4090 ) 4091 4092 def create_http_request_matcher( 4093 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4094 ) -> HttpRequestRegexMatcher: 4095 return HttpRequestRegexMatcher( 4096 method=model.method, 4097 url_base=model.url_base, 4098 url_path_pattern=model.url_path_pattern, 4099 params=model.params, 4100 headers=model.headers, 4101 ) 4102 4103 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4104 self._api_budget = self.create_component( 4105 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4106 ) 4107 4108 def create_grouping_partition_router( 4109 self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any 4110 ) -> GroupingPartitionRouter: 4111 underlying_router = self._create_component_from_model( 4112 model=model.underlying_partition_router, config=config 4113 ) 4114 if model.group_size < 1: 4115 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4116 4117 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4118 # because they are specific to individual partitions and cannot be aggregated or handled 4119 # when grouping, potentially leading to incorrect API calls. Any request customization 4120 # should be managed at the stream level through the requester's configuration. 4121 if isinstance(underlying_router, SubstreamPartitionRouter): 4122 if any( 4123 parent_config.request_option 4124 for parent_config in underlying_router.parent_stream_configs 4125 ): 4126 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4127 4128 if isinstance(underlying_router, ListPartitionRouter): 4129 if underlying_router.request_option: 4130 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4131 4132 return GroupingPartitionRouter( 4133 group_size=model.group_size, 4134 underlying_partition_router=underlying_router, 4135 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4136 config=config, 4137 )
630 def __init__( 631 self, 632 limit_pages_fetched_per_slice: Optional[int] = None, 633 limit_slices_fetched: Optional[int] = None, 634 emit_connector_builder_messages: bool = False, 635 disable_retries: bool = False, 636 disable_cache: bool = False, 637 disable_resumable_full_refresh: bool = False, 638 message_repository: Optional[MessageRepository] = None, 639 connector_state_manager: Optional[ConnectorStateManager] = None, 640 max_concurrent_async_job_count: Optional[int] = None, 641 ): 642 self._init_mappings() 643 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 644 self._limit_slices_fetched = limit_slices_fetched 645 self._emit_connector_builder_messages = emit_connector_builder_messages 646 self._disable_retries = disable_retries 647 self._disable_cache = disable_cache 648 self._disable_resumable_full_refresh = disable_resumable_full_refresh 649 self._message_repository = message_repository or InMemoryMessageRepository( 650 self._evaluate_log_level(emit_connector_builder_messages) 651 ) 652 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 653 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 654 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 655 # placeholder for deprecation warnings 656 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
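A minimal usage sketch: constructing the factory for a Connector Builder-style test read, where small page/slice limits, disabled retries, and disabled caching are typically wanted. The argument values below are illustrative only:

    from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
        ModelToComponentFactory,
    )

    factory = ModelToComponentFactory(
        limit_pages_fetched_per_slice=5,  # cap pages read per slice during test reads
        limit_slices_fetched=5,  # cap slices read during test reads
        emit_connector_builder_messages=True,  # also raises the message repository log level to DEBUG
        disable_retries=True,
        disable_cache=True,
    )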
    def create_component(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        config: Config,
        **kwargs: Any,
    ) -> Any:
        """
        Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and
        subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then
        creating declarative components from that model.

        :param model_type: The type of declarative component that is being initialized
        :param component_definition: The mapping that represents a declarative component
        :param config: The connector config that is provided by the customer
        :return: The declarative component to be used at runtime
        """

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        declarative_component_model = model_type.parse_obj(component_definition)

        if not isinstance(declarative_component_model, model_type):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}"
            )

        return self._create_component_from_model(
            model=declarative_component_model, config=config, **kwargs
        )
Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating declarative components from that model.
Parameters
- model_type: The type of declarative component that is being initialized
- component_definition: The mapping that represents a declarative component
- config: The connector config that is provided by the customer
Returns
The declarative component to be used at runtime
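For illustration, a minimal sketch of the call described above; the CheckStream definition and config are example values, not taken from any real connector manifest:

    from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
        CheckStream as CheckStreamModel,
    )

    factory = ModelToComponentFactory()
    # "type" must match the model class name; the mapping is first parsed into the
    # Pydantic model and then turned into the runtime CheckStream component.
    check_stream = factory.create_component(
        model_type=CheckStreamModel,
        component_definition={"type": "CheckStream", "stream_names": ["users"]},
        config={"api_key": "example"},
    )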
821 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 822 """ 823 Returns the deprecation warnings that were collected during the creation of components. 824 """ 825 return self._collected_deprecation_logs
Returns the deprecation warnings that were collected during the creation of components.
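Continuing the sketch above, collected deprecation warnings can be read back once components have been created; how they are surfaced is up to the caller:

    for deprecation_log in factory.get_model_deprecations():
        print(deprecation_log)  # e.g. forward to Connector Builder output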
842 def create_config_migration( 843 self, model: ConfigMigrationModel, config: Config 844 ) -> ConfigMigration: 845 transformations: List[ConfigTransformation] = [ 846 self._create_component_from_model(transformation, config) 847 for transformation in model.transformations 848 ] 849 850 return ConfigMigration( 851 description=model.description, 852 transformations=transformations, 853 )
855 def create_config_add_fields( 856 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 857 ) -> ConfigAddFields: 858 fields = [self._create_component_from_model(field, config) for field in model.fields] 859 return ConfigAddFields( 860 fields=fields, 861 condition=model.condition or "", 862 )
911 @staticmethod 912 def create_added_field_definition( 913 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 914 ) -> AddedFieldDefinition: 915 interpolated_value = InterpolatedString.create( 916 model.value, parameters=model.parameters or {} 917 ) 918 return AddedFieldDefinition( 919 path=model.path, 920 value=interpolated_value, 921 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 922 parameters=model.parameters or {}, 923 )
925 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 926 added_field_definitions = [ 927 self._create_component_from_model( 928 model=added_field_definition_model, 929 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 930 added_field_definition_model.value_type 931 ), 932 config=config, 933 ) 934 for added_field_definition_model in model.fields 935 ] 936 return AddFields( 937 fields=added_field_definitions, 938 condition=model.condition or "", 939 parameters=model.parameters or {}, 940 )
966 def create_dpath_flatten_fields( 967 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 968 ) -> DpathFlattenFields: 969 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 970 key_transformation = ( 971 KeyTransformation( 972 config=config, 973 prefix=model.key_transformation.prefix, 974 suffix=model.key_transformation.suffix, 975 parameters=model.parameters or {}, 976 ) 977 if model.key_transformation is not None 978 else None 979 ) 980 return DpathFlattenFields( 981 config=config, 982 field_path=model_field_path, 983 delete_origin_value=model.delete_origin_value 984 if model.delete_origin_value is not None 985 else False, 986 replace_record=model.replace_record if model.replace_record is not None else False, 987 key_transformation=key_transformation, 988 parameters=model.parameters or {}, 989 )
1003 def create_api_key_authenticator( 1004 self, 1005 model: ApiKeyAuthenticatorModel, 1006 config: Config, 1007 token_provider: Optional[TokenProvider] = None, 1008 **kwargs: Any, 1009 ) -> ApiKeyAuthenticator: 1010 if model.inject_into is None and model.header is None: 1011 raise ValueError( 1012 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1013 ) 1014 1015 if model.inject_into is not None and model.header is not None: 1016 raise ValueError( 1017 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1018 ) 1019 1020 if token_provider is not None and model.api_token != "": 1021 raise ValueError( 1022 "If token_provider is set, api_token is ignored and has to be set to empty string." 1023 ) 1024 1025 request_option = ( 1026 self._create_component_from_model( 1027 model.inject_into, config, parameters=model.parameters or {} 1028 ) 1029 if model.inject_into 1030 else RequestOption( 1031 inject_into=RequestOptionType.header, 1032 field_name=model.header or "", 1033 parameters=model.parameters or {}, 1034 ) 1035 ) 1036 1037 return ApiKeyAuthenticator( 1038 token_provider=( 1039 token_provider 1040 if token_provider is not None 1041 else InterpolatedStringTokenProvider( 1042 api_token=model.api_token or "", 1043 config=config, 1044 parameters=model.parameters or {}, 1045 ) 1046 ), 1047 request_option=request_option, 1048 config=config, 1049 parameters=model.parameters or {}, 1050 )
1052 def create_legacy_to_per_partition_state_migration( 1053 self, 1054 model: LegacyToPerPartitionStateMigrationModel, 1055 config: Mapping[str, Any], 1056 declarative_stream: DeclarativeStreamModel, 1057 ) -> LegacyToPerPartitionStateMigration: 1058 retriever = declarative_stream.retriever 1059 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1060 raise ValueError( 1061 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1062 ) 1063 partition_router = retriever.partition_router 1064 if not isinstance( 1065 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1066 ): 1067 raise ValueError( 1068 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1069 ) 1070 if not hasattr(partition_router, "parent_stream_configs"): 1071 raise ValueError( 1072 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1073 ) 1074 1075 if not hasattr(declarative_stream, "incremental_sync"): 1076 raise ValueError( 1077 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1078 ) 1079 1080 return LegacyToPerPartitionStateMigration( 1081 partition_router, # type: ignore # was already checked above 1082 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 1083 config, 1084 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1085 )
1087 def create_session_token_authenticator( 1088 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1089 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1090 decoder = ( 1091 self._create_component_from_model(model=model.decoder, config=config) 1092 if model.decoder 1093 else JsonDecoder(parameters={}) 1094 ) 1095 login_requester = self._create_component_from_model( 1096 model=model.login_requester, 1097 config=config, 1098 name=f"{name}_login_requester", 1099 decoder=decoder, 1100 ) 1101 token_provider = SessionTokenProvider( 1102 login_requester=login_requester, 1103 session_token_path=model.session_token_path, 1104 expiration_duration=parse_duration(model.expiration_duration) 1105 if model.expiration_duration 1106 else None, 1107 parameters=model.parameters or {}, 1108 message_repository=self._message_repository, 1109 decoder=decoder, 1110 ) 1111 if model.request_authentication.type == "Bearer": 1112 return ModelToComponentFactory.create_bearer_authenticator( 1113 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1114 config, 1115 token_provider=token_provider, 1116 ) 1117 else: 1118 return self.create_api_key_authenticator( 1119 ApiKeyAuthenticatorModel( 1120 type="ApiKeyAuthenticator", 1121 api_token="", 1122 inject_into=model.request_authentication.inject_into, 1123 ), # type: ignore # $parameters and headers default to None 1124 config=config, 1125 token_provider=token_provider, 1126 )
1128 @staticmethod 1129 def create_basic_http_authenticator( 1130 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1131 ) -> BasicHttpAuthenticator: 1132 return BasicHttpAuthenticator( 1133 password=model.password or "", 1134 username=model.username, 1135 config=config, 1136 parameters=model.parameters or {}, 1137 )
1139 @staticmethod 1140 def create_bearer_authenticator( 1141 model: BearerAuthenticatorModel, 1142 config: Config, 1143 token_provider: Optional[TokenProvider] = None, 1144 **kwargs: Any, 1145 ) -> BearerAuthenticator: 1146 if token_provider is not None and model.api_token != "": 1147 raise ValueError( 1148 "If token_provider is set, api_token is ignored and has to be set to empty string." 1149 ) 1150 return BearerAuthenticator( 1151 token_provider=( 1152 token_provider 1153 if token_provider is not None 1154 else InterpolatedStringTokenProvider( 1155 api_token=model.api_token or "", 1156 config=config, 1157 parameters=model.parameters or {}, 1158 ) 1159 ), 1160 config=config, 1161 parameters=model.parameters or {}, 1162 )
1164 @staticmethod 1165 def create_dynamic_stream_check_config( 1166 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1167 ) -> DynamicStreamCheckConfig: 1168 return DynamicStreamCheckConfig( 1169 dynamic_stream_name=model.dynamic_stream_name, 1170 stream_count=model.stream_count or 0, 1171 )
1173 def create_check_stream( 1174 self, model: CheckStreamModel, config: Config, **kwargs: Any 1175 ) -> CheckStream: 1176 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1177 raise ValueError( 1178 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1179 ) 1180 1181 dynamic_streams_check_configs = ( 1182 [ 1183 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1184 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1185 ] 1186 if model.dynamic_streams_check_configs 1187 else [] 1188 ) 1189 1190 return CheckStream( 1191 stream_names=model.stream_names or [], 1192 dynamic_streams_check_configs=dynamic_streams_check_configs, 1193 parameters={}, 1194 )
1196 @staticmethod 1197 def create_check_dynamic_stream( 1198 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1199 ) -> CheckDynamicStream: 1200 assert model.use_check_availability is not None # for mypy 1201 1202 use_check_availability = model.use_check_availability 1203 1204 return CheckDynamicStream( 1205 stream_count=model.stream_count, 1206 use_check_availability=use_check_availability, 1207 parameters={}, 1208 )
1210 def create_composite_error_handler( 1211 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1212 ) -> CompositeErrorHandler: 1213 error_handlers = [ 1214 self._create_component_from_model(model=error_handler_model, config=config) 1215 for error_handler_model in model.error_handlers 1216 ] 1217 return CompositeErrorHandler( 1218 error_handlers=error_handlers, parameters=model.parameters or {} 1219 )
1221 @staticmethod 1222 def create_concurrency_level( 1223 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1224 ) -> ConcurrencyLevel: 1225 return ConcurrencyLevel( 1226 default_concurrency=model.default_concurrency, 1227 max_concurrency=model.max_concurrency, 1228 config=config, 1229 parameters={}, 1230 )
1232 @staticmethod 1233 def apply_stream_state_migrations( 1234 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1235 ) -> MutableMapping[str, Any]: 1236 if stream_state_migrations: 1237 for state_migration in stream_state_migrations: 1238 if state_migration.should_migrate(stream_state): 1239 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 1240 stream_state = dict(state_migration.migrate(stream_state)) 1241 return stream_state
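A short sketch of the protocol this helper expects; the migration class below is hypothetical and exists only to illustrate the should_migrate/migrate calls shown above:

    class RenameLegacyCursorKey:
        """Hypothetical state migration used for illustration only."""

        def should_migrate(self, stream_state):
            return "updated" in stream_state

        def migrate(self, stream_state):
            return {"updated_at": stream_state["updated"]}

    migrated = ModelToComponentFactory.apply_stream_state_migrations(
        [RenameLegacyCursorKey()], {"updated": "2024-01-01T00:00:00Z"}
    )
    # migrated == {"updated_at": "2024-01-01T00:00:00Z"}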
1243 def create_concurrent_cursor_from_datetime_based_cursor( 1244 self, 1245 model_type: Type[BaseModel], 1246 component_definition: ComponentDefinition, 1247 stream_name: str, 1248 stream_namespace: Optional[str], 1249 config: Config, 1250 message_repository: Optional[MessageRepository] = None, 1251 runtime_lookback_window: Optional[datetime.timedelta] = None, 1252 stream_state_migrations: Optional[List[Any]] = None, 1253 **kwargs: Any, 1254 ) -> ConcurrentCursor: 1255 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1256 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1257 # incoming state and connector_state_manager that is initialized when the component factory is created 1258 stream_state = ( 1259 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1260 if "stream_state" not in kwargs 1261 else kwargs["stream_state"] 1262 ) 1263 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1264 1265 component_type = component_definition.get("type") 1266 if component_definition.get("type") != model_type.__name__: 1267 raise ValueError( 1268 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1269 ) 1270 1271 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1272 1273 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1274 raise ValueError( 1275 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1276 ) 1277 1278 interpolated_cursor_field = InterpolatedString.create( 1279 datetime_based_cursor_model.cursor_field, 1280 parameters=datetime_based_cursor_model.parameters or {}, 1281 ) 1282 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1283 1284 interpolated_partition_field_start = InterpolatedString.create( 1285 datetime_based_cursor_model.partition_field_start or "start_time", 1286 parameters=datetime_based_cursor_model.parameters or {}, 1287 ) 1288 interpolated_partition_field_end = InterpolatedString.create( 1289 datetime_based_cursor_model.partition_field_end or "end_time", 1290 parameters=datetime_based_cursor_model.parameters or {}, 1291 ) 1292 1293 slice_boundary_fields = ( 1294 interpolated_partition_field_start.eval(config=config), 1295 interpolated_partition_field_end.eval(config=config), 1296 ) 1297 1298 datetime_format = datetime_based_cursor_model.datetime_format 1299 1300 cursor_granularity = ( 1301 parse_duration(datetime_based_cursor_model.cursor_granularity) 1302 if datetime_based_cursor_model.cursor_granularity 1303 else None 1304 ) 1305 1306 lookback_window = None 1307 interpolated_lookback_window = ( 1308 InterpolatedString.create( 1309 datetime_based_cursor_model.lookback_window, 1310 parameters=datetime_based_cursor_model.parameters or {}, 1311 ) 1312 if datetime_based_cursor_model.lookback_window 1313 else None 1314 ) 1315 if interpolated_lookback_window: 1316 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1317 if evaluated_lookback_window: 1318 lookback_window = parse_duration(evaluated_lookback_window) 1319 1320 connector_state_converter: DateTimeStreamStateConverter 1321 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1322 datetime_format=datetime_format, 1323 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1324 is_sequential_state=True, # 
ConcurrentPerPartitionCursor only works with sequential state 1325 cursor_granularity=cursor_granularity, 1326 ) 1327 1328 # Adjusts the stream state by applying the runtime lookback window. 1329 # This is used to ensure correct state handling in case of failed partitions. 1330 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1331 if runtime_lookback_window and stream_state_value: 1332 new_stream_state = ( 1333 connector_state_converter.parse_timestamp(stream_state_value) 1334 - runtime_lookback_window 1335 ) 1336 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1337 new_stream_state 1338 ) 1339 1340 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1341 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1342 start_date_runtime_value = self.create_min_max_datetime( 1343 model=datetime_based_cursor_model.start_datetime, config=config 1344 ) 1345 else: 1346 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1347 1348 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1349 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1350 end_date_runtime_value = self.create_min_max_datetime( 1351 model=datetime_based_cursor_model.end_datetime, config=config 1352 ) 1353 else: 1354 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1355 1356 interpolated_start_date = MinMaxDatetime.create( 1357 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1358 parameters=datetime_based_cursor_model.parameters, 1359 ) 1360 interpolated_end_date = ( 1361 None 1362 if not end_date_runtime_value 1363 else MinMaxDatetime.create( 1364 end_date_runtime_value, datetime_based_cursor_model.parameters 1365 ) 1366 ) 1367 1368 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1369 if not interpolated_start_date.datetime_format: 1370 interpolated_start_date.datetime_format = datetime_format 1371 if interpolated_end_date and not interpolated_end_date.datetime_format: 1372 interpolated_end_date.datetime_format = datetime_format 1373 1374 start_date = interpolated_start_date.get_datetime(config=config) 1375 end_date_provider = ( 1376 partial(interpolated_end_date.get_datetime, config) 1377 if interpolated_end_date 1378 else connector_state_converter.get_end_provider() 1379 ) 1380 1381 if ( 1382 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1383 ) or ( 1384 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1385 ): 1386 raise ValueError( 1387 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1388 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1389 ) 1390 1391 # When step is not defined, default to a step size from the starting date to the present moment 1392 step_length = datetime.timedelta.max 1393 interpolated_step = ( 1394 InterpolatedString.create( 1395 datetime_based_cursor_model.step, 1396 parameters=datetime_based_cursor_model.parameters or {}, 1397 ) 1398 if datetime_based_cursor_model.step 1399 else None 1400 ) 1401 if interpolated_step: 1402 evaluated_step = interpolated_step.eval(config) 1403 if evaluated_step: 1404 step_length = parse_duration(evaluated_step) 1405 1406 clamping_strategy: ClampingStrategy = NoClamping() 1407 if datetime_based_cursor_model.clamping: 1408 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1409 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1410 # object which we want to keep agnostic of being low-code 1411 target = InterpolatedString( 1412 string=datetime_based_cursor_model.clamping.target, 1413 parameters=datetime_based_cursor_model.parameters or {}, 1414 ) 1415 evaluated_target = target.eval(config=config) 1416 match evaluated_target: 1417 case "DAY": 1418 clamping_strategy = DayClampingStrategy() 1419 end_date_provider = ClampingEndProvider( 1420 DayClampingStrategy(is_ceiling=False), 1421 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1422 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1423 ) 1424 case "WEEK": 1425 if ( 1426 not datetime_based_cursor_model.clamping.target_details 1427 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1428 ): 1429 raise ValueError( 1430 "Given WEEK clamping, weekday needs to be provided as target_details" 1431 ) 1432 weekday = self._assemble_weekday( 1433 datetime_based_cursor_model.clamping.target_details["weekday"] 1434 ) 1435 clamping_strategy = WeekClampingStrategy(weekday) 1436 end_date_provider = ClampingEndProvider( 1437 WeekClampingStrategy(weekday, is_ceiling=False), 1438 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1439 granularity=cursor_granularity or datetime.timedelta(days=1), 1440 ) 1441 case "MONTH": 1442 clamping_strategy = MonthClampingStrategy() 1443 end_date_provider = ClampingEndProvider( 1444 MonthClampingStrategy(is_ceiling=False), 1445 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1446 granularity=cursor_granularity or datetime.timedelta(days=1), 1447 ) 1448 case _: 1449 raise ValueError( 1450 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1451 ) 1452 1453 return ConcurrentCursor( 1454 stream_name=stream_name, 1455 stream_namespace=stream_namespace, 1456 stream_state=stream_state, 1457 message_repository=message_repository or self._message_repository, 1458 connector_state_manager=self._connector_state_manager, 1459 connector_state_converter=connector_state_converter, 1460 cursor_field=cursor_field, 1461 slice_boundary_fields=slice_boundary_fields, 1462 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1463 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1464 lookback_window=lookback_window, 1465 slice_range=step_length, 1466 cursor_granularity=cursor_granularity, 1467 clamping_strategy=clamping_strategy, 1468 )
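A hedged example of the expected inputs; the field values below are made up, and in normal operation the factory calls this method itself when a stream declares a DatetimeBasedCursor incremental_sync:

    from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
        DatetimeBasedCursor as DatetimeBasedCursorModel,
    )

    concurrent_cursor = factory.create_concurrent_cursor_from_datetime_based_cursor(
        model_type=DatetimeBasedCursorModel,
        component_definition={
            "type": "DatetimeBasedCursor",
            "cursor_field": "updated_at",
            "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
            "start_datetime": "{{ config['start_date'] }}",
            # step and cursor_granularity must be provided together (or both omitted)
            "step": "P30D",
            "cursor_granularity": "PT1S",
        },
        stream_name="orders",
        stream_namespace=None,
        config={"start_date": "2024-01-01T00:00:00Z"},
    )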
1470 def create_concurrent_cursor_from_incrementing_count_cursor( 1471 self, 1472 model_type: Type[BaseModel], 1473 component_definition: ComponentDefinition, 1474 stream_name: str, 1475 stream_namespace: Optional[str], 1476 config: Config, 1477 message_repository: Optional[MessageRepository] = None, 1478 stream_state_migrations: Optional[List[Any]] = None, 1479 **kwargs: Any, 1480 ) -> ConcurrentCursor: 1481 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1482 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1483 # incoming state and connector_state_manager that is initialized when the component factory is created 1484 stream_state = ( 1485 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1486 if "stream_state" not in kwargs 1487 else kwargs["stream_state"] 1488 ) 1489 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1490 1491 component_type = component_definition.get("type") 1492 if component_definition.get("type") != model_type.__name__: 1493 raise ValueError( 1494 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1495 ) 1496 1497 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1498 1499 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1500 raise ValueError( 1501 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1502 ) 1503 1504 interpolated_start_value = ( 1505 InterpolatedString.create( 1506 incrementing_count_cursor_model.start_value, # type: ignore 1507 parameters=incrementing_count_cursor_model.parameters or {}, 1508 ) 1509 if incrementing_count_cursor_model.start_value 1510 else 0 1511 ) 1512 1513 interpolated_cursor_field = InterpolatedString.create( 1514 incrementing_count_cursor_model.cursor_field, 1515 parameters=incrementing_count_cursor_model.parameters or {}, 1516 ) 1517 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1518 1519 connector_state_converter = IncrementingCountStreamStateConverter( 1520 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1521 ) 1522 1523 return ConcurrentCursor( 1524 stream_name=stream_name, 1525 stream_namespace=stream_namespace, 1526 stream_state=stream_state, 1527 message_repository=message_repository or self._message_repository, 1528 connector_state_manager=self._connector_state_manager, 1529 connector_state_converter=connector_state_converter, 1530 cursor_field=cursor_field, 1531 slice_boundary_fields=None, 1532 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1533 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1534 )
1555 def create_concurrent_cursor_from_perpartition_cursor( 1556 self, 1557 state_manager: ConnectorStateManager, 1558 model_type: Type[BaseModel], 1559 component_definition: ComponentDefinition, 1560 stream_name: str, 1561 stream_namespace: Optional[str], 1562 config: Config, 1563 stream_state: MutableMapping[str, Any], 1564 partition_router: PartitionRouter, 1565 stream_state_migrations: Optional[List[Any]] = None, 1566 attempt_to_create_cursor_if_not_provided: bool = False, 1567 **kwargs: Any, 1568 ) -> ConcurrentPerPartitionCursor: 1569 component_type = component_definition.get("type") 1570 if component_definition.get("type") != model_type.__name__: 1571 raise ValueError( 1572 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1573 ) 1574 1575 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1576 1577 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1578 raise ValueError( 1579 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1580 ) 1581 1582 interpolated_cursor_field = InterpolatedString.create( 1583 datetime_based_cursor_model.cursor_field, 1584 parameters=datetime_based_cursor_model.parameters or {}, 1585 ) 1586 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1587 1588 datetime_format = datetime_based_cursor_model.datetime_format 1589 1590 cursor_granularity = ( 1591 parse_duration(datetime_based_cursor_model.cursor_granularity) 1592 if datetime_based_cursor_model.cursor_granularity 1593 else None 1594 ) 1595 1596 connector_state_converter: DateTimeStreamStateConverter 1597 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1598 datetime_format=datetime_format, 1599 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1600 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1601 cursor_granularity=cursor_granularity, 1602 ) 1603 1604 # Create the cursor factory 1605 cursor_factory = ConcurrentCursorFactory( 1606 partial( 1607 self.create_concurrent_cursor_from_datetime_based_cursor, 1608 state_manager=state_manager, 1609 model_type=model_type, 1610 component_definition=component_definition, 1611 stream_name=stream_name, 1612 stream_namespace=stream_namespace, 1613 config=config, 1614 message_repository=NoopMessageRepository(), 1615 stream_state_migrations=stream_state_migrations, 1616 ) 1617 ) 1618 1619 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1620 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1621 use_global_cursor = isinstance( 1622 partition_router, GroupingPartitionRouter 1623 ) or component_definition.get("global_substream_cursor", False) 1624 1625 # Return the concurrent cursor and state converter 1626 return ConcurrentPerPartitionCursor( 1627 cursor_factory=cursor_factory, 1628 partition_router=partition_router, 1629 stream_name=stream_name, 1630 stream_namespace=stream_namespace, 1631 stream_state=stream_state, 1632 message_repository=self._message_repository, # type: ignore 1633 connector_state_manager=state_manager, 1634 connector_state_converter=connector_state_converter, 1635 cursor_field=cursor_field, 1636 use_global_cursor=use_global_cursor, 1637 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1638 )
1640 @staticmethod 1641 def create_constant_backoff_strategy( 1642 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1643 ) -> ConstantBackoffStrategy: 1644 return ConstantBackoffStrategy( 1645 backoff_time_in_seconds=model.backoff_time_in_seconds, 1646 config=config, 1647 parameters=model.parameters or {}, 1648 )
1650 def create_cursor_pagination( 1651 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1652 ) -> CursorPaginationStrategy: 1653 if isinstance(decoder, PaginationDecoderDecorator): 1654 inner_decoder = decoder.decoder 1655 else: 1656 inner_decoder = decoder 1657 decoder = PaginationDecoderDecorator(decoder=decoder) 1658 1659 if self._is_supported_decoder_for_pagination(inner_decoder): 1660 decoder_to_use = decoder 1661 else: 1662 raise ValueError( 1663 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1664 ) 1665 1666 return CursorPaginationStrategy( 1667 cursor_value=model.cursor_value, 1668 decoder=decoder_to_use, 1669 page_size=model.page_size, 1670 stop_condition=model.stop_condition, 1671 config=config, 1672 parameters=model.parameters or {}, 1673 )
1675 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1676 """ 1677 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1678 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1679 :param model: The Pydantic model of the custom component being created 1680 :param config: The custom defined connector config 1681 :return: The declarative component built from the Pydantic model to be used at runtime 1682 """ 1683 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1684 component_fields = get_type_hints(custom_component_class) 1685 model_args = model.dict() 1686 model_args["config"] = config 1687 1688 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1689 # we defer to these arguments over the component's definition 1690 for key, arg in kwargs.items(): 1691 model_args[key] = arg 1692 1693 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1694 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1695 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1696 for model_field, model_value in model_args.items(): 1697 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1698 if ( 1699 isinstance(model_value, dict) 1700 and "type" not in model_value 1701 and model_field in component_fields 1702 ): 1703 derived_type = self._derive_component_type_from_type_hints( 1704 component_fields.get(model_field) 1705 ) 1706 if derived_type: 1707 model_value["type"] = derived_type 1708 1709 if self._is_component(model_value): 1710 model_args[model_field] = self._create_nested_component( 1711 model, model_field, model_value, config 1712 ) 1713 elif isinstance(model_value, list): 1714 vals = [] 1715 for v in model_value: 1716 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1717 derived_type = self._derive_component_type_from_type_hints( 1718 component_fields.get(model_field) 1719 ) 1720 if derived_type: 1721 v["type"] = derived_type 1722 if self._is_component(v): 1723 vals.append(self._create_nested_component(model, model_field, v, config)) 1724 else: 1725 vals.append(v) 1726 model_args[model_field] = vals 1727 1728 kwargs = { 1729 class_field: model_args[class_field] 1730 for class_field in component_fields.keys() 1731 if class_field in model_args 1732 } 1733 return custom_component_class(**kwargs)
Generically creates a custom component based on the model type and a class_name reference to the custom Python class being instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor
Parameters
- model: The Pydantic model of the custom component being created
- config: The custom defined connector config
Returns
The declarative component built from the Pydantic model to be used at runtime
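A sketch of the behaviour described above; the class_name, the ScoreFilter class it points to, and the minimum_score field are all hypothetical. Only additional properties that match the custom class's type-hinted fields (plus config) are forwarded to its constructor:

    from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
        CustomRecordFilter as CustomRecordFilterModel,
    )

    definition = {
        "type": "CustomRecordFilter",
        "class_name": "source_example.components.ScoreFilter",  # hypothetical connector module
        "minimum_score": 10,  # matched against ScoreFilter's type-hinted fields
    }
    custom_filter = factory.create_component(
        model_type=CustomRecordFilterModel,
        component_definition=definition,
        config={"api_key": "example"},
    )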
1865 def create_datetime_based_cursor( 1866 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1867 ) -> DatetimeBasedCursor: 1868 start_datetime: Union[str, MinMaxDatetime] = ( 1869 model.start_datetime 1870 if isinstance(model.start_datetime, str) 1871 else self.create_min_max_datetime(model.start_datetime, config) 1872 ) 1873 end_datetime: Union[str, MinMaxDatetime, None] = None 1874 if model.is_data_feed and model.end_datetime: 1875 raise ValueError("Data feed does not support end_datetime") 1876 if model.is_data_feed and model.is_client_side_incremental: 1877 raise ValueError( 1878 "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 1879 ) 1880 if model.end_datetime: 1881 end_datetime = ( 1882 model.end_datetime 1883 if isinstance(model.end_datetime, str) 1884 else self.create_min_max_datetime(model.end_datetime, config) 1885 ) 1886 1887 end_time_option = ( 1888 self._create_component_from_model( 1889 model.end_time_option, config, parameters=model.parameters or {} 1890 ) 1891 if model.end_time_option 1892 else None 1893 ) 1894 start_time_option = ( 1895 self._create_component_from_model( 1896 model.start_time_option, config, parameters=model.parameters or {} 1897 ) 1898 if model.start_time_option 1899 else None 1900 ) 1901 1902 return DatetimeBasedCursor( 1903 cursor_field=model.cursor_field, 1904 cursor_datetime_formats=model.cursor_datetime_formats 1905 if model.cursor_datetime_formats 1906 else [], 1907 cursor_granularity=model.cursor_granularity, 1908 datetime_format=model.datetime_format, 1909 end_datetime=end_datetime, 1910 start_datetime=start_datetime, 1911 step=model.step, 1912 end_time_option=end_time_option, 1913 lookback_window=model.lookback_window, 1914 start_time_option=start_time_option, 1915 partition_field_end=model.partition_field_end, 1916 partition_field_start=model.partition_field_start, 1917 message_repository=self._message_repository, 1918 is_compare_strictly=model.is_compare_strictly, 1919 config=config, 1920 parameters=model.parameters or {}, 1921 )
1923 def create_declarative_stream( 1924 self, model: DeclarativeStreamModel, config: Config, **kwargs: Any 1925 ) -> DeclarativeStream: 1926 # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field 1927 # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the 1928 # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in 1929 # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one. 1930 combined_slicers = self._merge_stream_slicers(model=model, config=config) 1931 1932 primary_key = model.primary_key.__root__ if model.primary_key else None 1933 stop_condition_on_cursor = ( 1934 model.incremental_sync 1935 and hasattr(model.incremental_sync, "is_data_feed") 1936 and model.incremental_sync.is_data_feed 1937 ) 1938 client_side_filtering_enabled = ( 1939 model.incremental_sync 1940 and hasattr(model.incremental_sync, "is_client_side_incremental") 1941 and model.incremental_sync.is_client_side_incremental 1942 ) 1943 concurrent_cursor = None 1944 if stop_condition_on_cursor or client_side_filtering_enabled: 1945 stream_slicer = self._build_stream_slicer_from_partition_router( 1946 model.retriever, config, stream_name=model.name 1947 ) 1948 concurrent_cursor = self._build_concurrent_cursor(model, stream_slicer, config) 1949 1950 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1951 cursor_model = model.incremental_sync 1952 1953 end_time_option = ( 1954 self._create_component_from_model( 1955 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1956 ) 1957 if cursor_model.end_time_option 1958 else None 1959 ) 1960 start_time_option = ( 1961 self._create_component_from_model( 1962 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1963 ) 1964 if cursor_model.start_time_option 1965 else None 1966 ) 1967 1968 request_options_provider = DatetimeBasedRequestOptionsProvider( 1969 start_time_option=start_time_option, 1970 end_time_option=end_time_option, 1971 partition_field_start=cursor_model.partition_field_end, 1972 partition_field_end=cursor_model.partition_field_end, 1973 config=config, 1974 parameters=model.parameters or {}, 1975 ) 1976 elif model.incremental_sync and isinstance( 1977 model.incremental_sync, IncrementingCountCursorModel 1978 ): 1979 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 1980 1981 start_time_option = ( 1982 self._create_component_from_model( 1983 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1984 config, 1985 parameters=cursor_model.parameters or {}, 1986 ) 1987 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 1988 else None 1989 ) 1990 1991 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 1992 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 1993 partition_field_start = "start" 1994 1995 request_options_provider = DatetimeBasedRequestOptionsProvider( 1996 start_time_option=start_time_option, 1997 partition_field_start=partition_field_start, 1998 config=config, 1999 parameters=model.parameters or {}, 2000 ) 2001 else: 2002 request_options_provider = None 
2003 2004 transformations = [] 2005 if model.transformations: 2006 for transformation_model in model.transformations: 2007 transformations.append( 2008 self._create_component_from_model(model=transformation_model, config=config) 2009 ) 2010 file_uploader = None 2011 if model.file_uploader: 2012 file_uploader = self._create_component_from_model( 2013 model=model.file_uploader, config=config 2014 ) 2015 2016 retriever = self._create_component_from_model( 2017 model=model.retriever, 2018 config=config, 2019 name=model.name, 2020 primary_key=primary_key, 2021 stream_slicer=combined_slicers, 2022 request_options_provider=request_options_provider, 2023 stop_condition_cursor=concurrent_cursor, 2024 client_side_incremental_sync={"cursor": concurrent_cursor} 2025 if client_side_filtering_enabled 2026 else None, 2027 transformations=transformations, 2028 file_uploader=file_uploader, 2029 incremental_sync=model.incremental_sync, 2030 ) 2031 cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None 2032 2033 if model.state_migrations: 2034 state_transformations = [ 2035 self._create_component_from_model(state_migration, config, declarative_stream=model) 2036 for state_migration in model.state_migrations 2037 ] 2038 else: 2039 state_transformations = [] 2040 2041 schema_loader: Union[ 2042 CompositeSchemaLoader, 2043 DefaultSchemaLoader, 2044 DynamicSchemaLoader, 2045 InlineSchemaLoader, 2046 JsonFileSchemaLoader, 2047 ] 2048 if model.schema_loader and isinstance(model.schema_loader, list): 2049 nested_schema_loaders = [ 2050 self._create_component_from_model(model=nested_schema_loader, config=config) 2051 for nested_schema_loader in model.schema_loader 2052 ] 2053 schema_loader = CompositeSchemaLoader( 2054 schema_loaders=nested_schema_loaders, parameters={} 2055 ) 2056 elif model.schema_loader: 2057 schema_loader = self._create_component_from_model( 2058 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2059 config=config, 2060 ) 2061 else: 2062 options = model.parameters or {} 2063 if "name" not in options: 2064 options["name"] = model.name 2065 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2066 2067 return DeclarativeStream( 2068 name=model.name or "", 2069 primary_key=primary_key, 2070 retriever=retriever, 2071 schema_loader=schema_loader, 2072 stream_cursor_field=cursor_field or "", 2073 state_migrations=state_transformations, 2074 config=config, 2075 parameters=model.parameters or {}, 2076 )
2301 def create_default_error_handler( 2302 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2303 ) -> DefaultErrorHandler: 2304 backoff_strategies = [] 2305 if model.backoff_strategies: 2306 for backoff_strategy_model in model.backoff_strategies: 2307 backoff_strategies.append( 2308 self._create_component_from_model(model=backoff_strategy_model, config=config) 2309 ) 2310 2311 response_filters = [] 2312 if model.response_filters: 2313 for response_filter_model in model.response_filters: 2314 response_filters.append( 2315 self._create_component_from_model(model=response_filter_model, config=config) 2316 ) 2317 response_filters.append( 2318 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2319 ) 2320 2321 return DefaultErrorHandler( 2322 backoff_strategies=backoff_strategies, 2323 max_retries=model.max_retries, 2324 response_filters=response_filters, 2325 config=config, 2326 parameters=model.parameters or {}, 2327 )
2329 def create_default_paginator( 2330 self, 2331 model: DefaultPaginatorModel, 2332 config: Config, 2333 *, 2334 url_base: str, 2335 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2336 decoder: Optional[Decoder] = None, 2337 cursor_used_for_stop_condition: Optional[Cursor] = None, 2338 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2339 if decoder: 2340 if self._is_supported_decoder_for_pagination(decoder): 2341 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2342 else: 2343 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2344 else: 2345 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2346 page_size_option = ( 2347 self._create_component_from_model(model=model.page_size_option, config=config) 2348 if model.page_size_option 2349 else None 2350 ) 2351 page_token_option = ( 2352 self._create_component_from_model(model=model.page_token_option, config=config) 2353 if model.page_token_option 2354 else None 2355 ) 2356 pagination_strategy = self._create_component_from_model( 2357 model=model.pagination_strategy, 2358 config=config, 2359 decoder=decoder_to_use, 2360 extractor_model=extractor_model, 2361 ) 2362 if cursor_used_for_stop_condition: 2363 pagination_strategy = StopConditionPaginationStrategyDecorator( 2364 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2365 ) 2366 paginator = DefaultPaginator( 2367 decoder=decoder_to_use, 2368 page_size_option=page_size_option, 2369 page_token_option=page_token_option, 2370 pagination_strategy=pagination_strategy, 2371 url_base=url_base, 2372 config=config, 2373 parameters=model.parameters or {}, 2374 ) 2375 if self._limit_pages_fetched_per_slice: 2376 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2377 return paginator
2379 def create_dpath_extractor( 2380 self, 2381 model: DpathExtractorModel, 2382 config: Config, 2383 decoder: Optional[Decoder] = None, 2384 **kwargs: Any, 2385 ) -> DpathExtractor: 2386 if decoder: 2387 decoder_to_use = decoder 2388 else: 2389 decoder_to_use = JsonDecoder(parameters={}) 2390 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2391 return DpathExtractor( 2392 decoder=decoder_to_use, 2393 field_path=model_field_path, 2394 config=config, 2395 parameters=model.parameters or {}, 2396 )
2417 def create_http_requester( 2418 self, 2419 model: HttpRequesterModel, 2420 config: Config, 2421 decoder: Decoder = JsonDecoder(parameters={}), 2422 query_properties_key: Optional[str] = None, 2423 use_cache: Optional[bool] = None, 2424 *, 2425 name: str, 2426 ) -> HttpRequester: 2427 authenticator = ( 2428 self._create_component_from_model( 2429 model=model.authenticator, 2430 config=config, 2431 url_base=model.url or model.url_base, 2432 name=name, 2433 decoder=decoder, 2434 ) 2435 if model.authenticator 2436 else None 2437 ) 2438 error_handler = ( 2439 self._create_component_from_model(model=model.error_handler, config=config) 2440 if model.error_handler 2441 else DefaultErrorHandler( 2442 backoff_strategies=[], 2443 response_filters=[], 2444 config=config, 2445 parameters=model.parameters or {}, 2446 ) 2447 ) 2448 2449 api_budget = self._api_budget 2450 2451 # Removes QueryProperties components from the interpolated mappings because it has been designed 2452 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2453 # instead of through jinja interpolation 2454 request_parameters: Optional[Union[str, Mapping[str, str]]] 2455 if isinstance(model.request_parameters, Mapping): 2456 request_parameters = self._remove_query_properties(model.request_parameters) 2457 else: 2458 request_parameters = model.request_parameters 2459 2460 request_options_provider = InterpolatedRequestOptionsProvider( 2461 request_body=model.request_body, 2462 request_body_data=model.request_body_data, 2463 request_body_json=model.request_body_json, 2464 request_headers=model.request_headers, 2465 request_parameters=request_parameters, 2466 query_properties_key=query_properties_key, 2467 config=config, 2468 parameters=model.parameters or {}, 2469 ) 2470 2471 assert model.use_cache is not None # for mypy 2472 assert model.http_method is not None # for mypy 2473 2474 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2475 2476 return HttpRequester( 2477 name=name, 2478 url=model.url, 2479 url_base=model.url_base, 2480 path=model.path, 2481 authenticator=authenticator, 2482 error_handler=error_handler, 2483 api_budget=api_budget, 2484 http_method=HttpMethod[model.http_method.value], 2485 request_options_provider=request_options_provider, 2486 config=config, 2487 disable_retries=self._disable_retries, 2488 parameters=model.parameters or {}, 2489 message_repository=self._message_repository, 2490 use_cache=should_use_cache, 2491 decoder=decoder, 2492 stream_response=decoder.is_stream_response() if decoder else False, 2493 )
    @staticmethod
    def create_http_response_filter(
        model: HttpResponseFilterModel, config: Config, **kwargs: Any
    ) -> HttpResponseFilter:
        if model.action:
            action = ResponseAction(model.action.value)
        else:
            action = None

        failure_type = FailureType(model.failure_type.value) if model.failure_type else None

        http_codes = (
            set(model.http_codes) if model.http_codes else set()
        )  # JSON schema notation has no set data type. The schema enforces an array of unique elements

        return HttpResponseFilter(
            action=action,
            failure_type=failure_type,
            error_message=model.error_message or "",
            error_message_contains=model.error_message_contains or "",
            http_codes=http_codes,
            predicate=model.predicate or "",
            config=config,
            parameters=model.parameters or {},
        )
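    # Illustrative only: a response filter definition matching the model fields read above. The
    # action and failure_type values are assumptions based on the ResponseAction and FailureType
    # enums used in this method.
    #
    # example_response_filter = {
    #     "type": "HttpResponseFilter",
    #     "action": "FAIL",
    #     "failure_type": "config_error",
    #     "http_codes": [401, 403],
    #     "error_message_contains": "Invalid credentials",
    #     "error_message": "The API key is invalid or expired.",
    # }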
    def create_complex_field_type(
        self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any
    ) -> ComplexFieldType:
        items = (
            self._create_component_from_model(model=model.items, config=config)
            if isinstance(model.items, ComplexFieldTypeModel)
            else model.items
        )

        return ComplexFieldType(field_type=model.field_type, items=items)
    def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap:
        target_type = (
            self._create_component_from_model(model=model.target_type, config=config)
            if isinstance(model.target_type, ComplexFieldTypeModel)
            else model.target_type
        )

        return TypesMap(
            target_type=target_type,
            current_type=model.current_type,
            condition=model.condition if model.condition is not None else "True",
        )
2551 def create_schema_type_identifier( 2552 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2553 ) -> SchemaTypeIdentifier: 2554 types_mapping = [] 2555 if model.types_mapping: 2556 types_mapping.extend( 2557 [ 2558 self._create_component_from_model(types_map, config=config) 2559 for types_map in model.types_mapping 2560 ] 2561 ) 2562 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2563 [x for x in model.schema_pointer] if model.schema_pointer else [] 2564 ) 2565 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2566 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2567 [x for x in model.type_pointer] if model.type_pointer else None 2568 ) 2569 2570 return SchemaTypeIdentifier( 2571 schema_pointer=model_schema_pointer, 2572 key_pointer=model_key_pointer, 2573 type_pointer=model_type_pointer, 2574 types_mapping=types_mapping, 2575 parameters=model.parameters or {}, 2576 )
2578 def create_dynamic_schema_loader( 2579 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2580 ) -> DynamicSchemaLoader: 2581 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 2582 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 2583 2584 schema_transformations = [] 2585 if model.schema_transformations: 2586 for transformation_model in model.schema_transformations: 2587 schema_transformations.append( 2588 self._create_component_from_model(model=transformation_model, config=config) 2589 ) 2590 name = "dynamic_properties" 2591 retriever = self._create_component_from_model( 2592 model=model.retriever, 2593 config=config, 2594 name=name, 2595 primary_key=None, 2596 stream_slicer=combined_slicers, 2597 transformations=[], 2598 use_cache=True, 2599 log_formatter=( 2600 lambda response: format_http_message( 2601 response, 2602 f"Schema loader '{name}' request", 2603 f"Request performed in order to extract schema.", 2604 name, 2605 is_auxiliary=True, 2606 ) 2607 ), 2608 ) 2609 schema_type_identifier = self._create_component_from_model( 2610 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2611 ) 2612 schema_filter = ( 2613 self._create_component_from_model( 2614 model.schema_filter, config=config, parameters=model.parameters or {} 2615 ) 2616 if model.schema_filter is not None 2617 else None 2618 ) 2619 2620 return DynamicSchemaLoader( 2621 retriever=retriever, 2622 config=config, 2623 schema_transformations=schema_transformations, 2624 schema_filter=schema_filter, 2625 schema_type_identifier=schema_type_identifier, 2626 parameters=model.parameters or {}, 2627 )
    def create_gzip_decoder(
        self, model: GzipDecoderModel, config: Config, **kwargs: Any
    ) -> Decoder:
        _compressed_response_types = {
            "gzip",
            "x-gzip",
            "gzip, deflate",
            "x-gzip, deflate",
            "application/zip",
            "application/gzip",
            "application/x-gzip",
            "application/x-zip-compressed",
        }

        gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config)  # type: ignore  # based on the model, we know this will be a GzipParser

        if self._emit_connector_builder_messages:
            # This is surprising, but if the response is not streamed, CompositeRawDecoder calls
            # response.content, and the requests library decompresses the data automatically,
            # whereas response.raw uses urllib3 directly and does not decompress it.
            return CompositeRawDecoder(gzip_parser.inner_parser, False)

        return CompositeRawDecoder.by_headers(
            [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)],
            stream_response=True,
            fallback_parser=gzip_parser.inner_parser,
        )
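    # Illustrative only: the expected routing behavior of the decoder built above. The `factory`,
    # `gzip_decoder_model`, and `config` names are hypothetical placeholders.
    #
    # decoder = factory.create_gzip_decoder(model=gzip_decoder_model, config=config)
    # # Responses whose Content-Encoding or Content-Type header matches one of
    # # _compressed_response_types are parsed with the GzipParser; any other response falls back to
    # # gzip_parser.inner_parser (e.g. uncompressed JSON lines).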
    @staticmethod
    def create_incrementing_count_cursor(
        model: IncrementingCountCursorModel, config: Config, **kwargs: Any
    ) -> DatetimeBasedCursor:
        # This should not actually get used anywhere at runtime, but it is needed to pass checks
        # since we still parse models into components, and there is no runtime implementation of an
        # IncrementingCountCursor.
        # A known and expected issue with this stub occurs when running a check with a declared
        # IncrementingCountCursor, because the check runs without a ConcurrentCursor.
        return DatetimeBasedCursor(
            cursor_field=model.cursor_field,
            datetime_format="%Y-%m-%d",
            start_datetime="2024-12-12",
            config=config,
            parameters={},
        )
    @staticmethod
    def create_jwt_authenticator(
        model: JwtAuthenticatorModel, config: Config, **kwargs: Any
    ) -> JwtAuthenticator:
        jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None)
        jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None)
        return JwtAuthenticator(
            config=config,
            parameters=model.parameters or {},
            algorithm=JwtAlgorithm(model.algorithm.value),
            secret_key=model.secret_key,
            base64_encode_secret_key=model.base64_encode_secret_key,
            token_duration=model.token_duration,
            header_prefix=model.header_prefix,
            kid=jwt_headers.kid,
            typ=jwt_headers.typ,
            cty=jwt_headers.cty,
            iss=jwt_payload.iss,
            sub=jwt_payload.sub,
            aud=jwt_payload.aud,
            additional_jwt_headers=model.additional_jwt_headers,
            additional_jwt_payload=model.additional_jwt_payload,
        )
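    # Illustrative only: a JwtAuthenticator definition using the model fields referenced above. The
    # algorithm value and the interpolation placeholders are assumptions.
    #
    # example_jwt_authenticator = {
    #     "type": "JwtAuthenticator",
    #     "secret_key": "{{ config['secret_key'] }}",
    #     "algorithm": "HS256",
    #     "token_duration": 1200,
    #     "jwt_headers": {"kid": "{{ config['kid'] }}"},
    #     "jwt_payload": {"iss": "{{ config['issuer'] }}"},
    # }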
    def create_list_partition_router(
        self, model: ListPartitionRouterModel, config: Config, **kwargs: Any
    ) -> ListPartitionRouter:
        request_option = (
            self._create_component_from_model(model.request_option, config)
            if model.request_option
            else None
        )
        return ListPartitionRouter(
            cursor_field=model.cursor_field,
            request_option=request_option,
            values=model.values,
            config=config,
            parameters=model.parameters or {},
        )
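    # Illustrative only: a ListPartitionRouter definition mirroring the model fields above; the
    # concrete values and the request option placement are assumptions.
    #
    # example_list_partition_router = {
    #     "type": "ListPartitionRouter",
    #     "values": ["EUR", "USD", "GBP"],
    #     "cursor_field": "currency",
    #     "request_option": {
    #         "type": "RequestOption",
    #         "inject_into": "request_parameter",
    #         "field_name": "currency",
    #     },
    # }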
    @staticmethod
    def create_min_max_datetime(
        model: MinMaxDatetimeModel, config: Config, **kwargs: Any
    ) -> MinMaxDatetime:
        return MinMaxDatetime(
            datetime=model.datetime,
            datetime_format=model.datetime_format or "",
            max_datetime=model.max_datetime or "",
            min_datetime=model.min_datetime or "",
            parameters=model.parameters or {},
        )
2800 def create_oauth_authenticator( 2801 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2802 ) -> DeclarativeOauth2Authenticator: 2803 profile_assertion = ( 2804 self._create_component_from_model(model.profile_assertion, config=config) 2805 if model.profile_assertion 2806 else None 2807 ) 2808 2809 if model.refresh_token_updater: 2810 # ignore type error because fixing it would have a lot of dependencies, revisit later 2811 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2812 config, 2813 InterpolatedString.create( 2814 model.token_refresh_endpoint, # type: ignore 2815 parameters=model.parameters or {}, 2816 ).eval(config), 2817 access_token_name=InterpolatedString.create( 2818 model.access_token_name or "access_token", parameters=model.parameters or {} 2819 ).eval(config), 2820 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2821 expires_in_name=InterpolatedString.create( 2822 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2823 ).eval(config), 2824 client_id_name=InterpolatedString.create( 2825 model.client_id_name or "client_id", parameters=model.parameters or {} 2826 ).eval(config), 2827 client_id=InterpolatedString.create( 2828 model.client_id, parameters=model.parameters or {} 2829 ).eval(config) 2830 if model.client_id 2831 else model.client_id, 2832 client_secret_name=InterpolatedString.create( 2833 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2834 ).eval(config), 2835 client_secret=InterpolatedString.create( 2836 model.client_secret, parameters=model.parameters or {} 2837 ).eval(config) 2838 if model.client_secret 2839 else model.client_secret, 2840 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2841 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2842 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2843 grant_type_name=InterpolatedString.create( 2844 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2845 ).eval(config), 2846 grant_type=InterpolatedString.create( 2847 model.grant_type or "refresh_token", parameters=model.parameters or {} 2848 ).eval(config), 2849 refresh_request_body=InterpolatedMapping( 2850 model.refresh_request_body or {}, parameters=model.parameters or {} 2851 ).eval(config), 2852 refresh_request_headers=InterpolatedMapping( 2853 model.refresh_request_headers or {}, parameters=model.parameters or {} 2854 ).eval(config), 2855 scopes=model.scopes, 2856 token_expiry_date_format=model.token_expiry_date_format, 2857 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2858 message_repository=self._message_repository, 2859 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2860 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2861 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2862 ) 2863 # ignore type error because fixing it would have a lot of dependencies, revisit later 2864 return DeclarativeOauth2Authenticator( # type: ignore 2865 access_token_name=model.access_token_name or "access_token", 2866 access_token_value=model.access_token_value, 2867 client_id_name=model.client_id_name or "client_id", 2868 client_id=model.client_id, 2869 client_secret_name=model.client_secret_name or "client_secret", 2870 client_secret=model.client_secret, 2871 expires_in_name=model.expires_in_name or 
"expires_in", 2872 grant_type_name=model.grant_type_name or "grant_type", 2873 grant_type=model.grant_type or "refresh_token", 2874 refresh_request_body=model.refresh_request_body, 2875 refresh_request_headers=model.refresh_request_headers, 2876 refresh_token_name=model.refresh_token_name or "refresh_token", 2877 refresh_token=model.refresh_token, 2878 scopes=model.scopes, 2879 token_expiry_date=model.token_expiry_date, 2880 token_expiry_date_format=model.token_expiry_date_format, 2881 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2882 token_refresh_endpoint=model.token_refresh_endpoint, 2883 config=config, 2884 parameters=model.parameters or {}, 2885 message_repository=self._message_repository, 2886 profile_assertion=profile_assertion, 2887 use_profile_assertion=model.use_profile_assertion, 2888 )
    def create_offset_increment(
        self,
        model: OffsetIncrementModel,
        config: Config,
        decoder: Decoder,
        extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
        **kwargs: Any,
    ) -> OffsetIncrement:
        if isinstance(decoder, PaginationDecoderDecorator):
            inner_decoder = decoder.decoder
        else:
            inner_decoder = decoder
            decoder = PaginationDecoderDecorator(decoder=decoder)

        if self._is_supported_decoder_for_pagination(inner_decoder):
            decoder_to_use = decoder
        else:
            raise ValueError(
                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
            )

        # Ideally we would instantiate the runtime extractor at the highest level (in this case the
        # SimpleRetriever) so that it could be shared by OffsetIncrement and RecordSelector. However,
        # because we wrap the decoder in various decorators here, but not in create_record_selector,
        # it is simpler to retain the existing behavior of two separate extractors with identical
        # behavior, since they are built from the same extractor model. When we have more time to
        # investigate, we can look into reusing the same component.
        extractor = (
            self._create_component_from_model(
                model=extractor_model, config=config, decoder=decoder_to_use
            )
            if extractor_model
            else None
        )

        return OffsetIncrement(
            page_size=model.page_size,
            config=config,
            decoder=decoder_to_use,
            extractor=extractor,
            inject_on_first_request=model.inject_on_first_request or False,
            parameters=model.parameters or {},
        )
    @staticmethod
    def create_page_increment(
        model: PageIncrementModel, config: Config, **kwargs: Any
    ) -> PageIncrement:
        return PageIncrement(
            page_size=model.page_size,
            config=config,
            start_from_page=model.start_from_page or 0,
            inject_on_first_request=model.inject_on_first_request or False,
            parameters=model.parameters or {},
        )
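    # Illustrative only: a DefaultPaginator using a PageIncrement strategy, mirroring the fields
    # consumed by create_default_paginator and create_page_increment above. The request option
    # field names are assumptions.
    #
    # example_paginator = {
    #     "type": "DefaultPaginator",
    #     "pagination_strategy": {"type": "PageIncrement", "page_size": 100, "start_from_page": 1},
    #     "page_size_option": {
    #         "type": "RequestOption",
    #         "inject_into": "request_parameter",
    #         "field_name": "per_page",
    #     },
    #     "page_token_option": {
    #         "type": "RequestOption",
    #         "inject_into": "request_parameter",
    #         "field_name": "page",
    #     },
    # }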
2945 def create_parent_stream_config( 2946 self, model: ParentStreamConfigModel, config: Config, **kwargs: Any 2947 ) -> ParentStreamConfig: 2948 declarative_stream = self._create_component_from_model( 2949 model.stream, config=config, **kwargs 2950 ) 2951 request_option = ( 2952 self._create_component_from_model(model.request_option, config=config) 2953 if model.request_option 2954 else None 2955 ) 2956 2957 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2958 raise ValueError( 2959 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2960 ) 2961 2962 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2963 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2964 ) 2965 2966 return ParentStreamConfig( 2967 parent_key=model.parent_key, 2968 request_option=request_option, 2969 stream=declarative_stream, 2970 partition_field=model.partition_field, 2971 config=config, 2972 incremental_dependency=model.incremental_dependency or False, 2973 parameters=model.parameters or {}, 2974 extra_fields=model.extra_fields, 2975 lazy_read_pointer=model_lazy_read_pointer, 2976 )
    def create_properties_from_endpoint(
        self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any
    ) -> PropertiesFromEndpoint:
        retriever = self._create_component_from_model(
            model=model.retriever,
            config=config,
            name="dynamic_properties",
            primary_key=None,
            stream_slicer=None,
            transformations=[],
            # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be
            # called for every slice being processed, and it is highly unlikely for the response to
            # be different.
            use_cache=True,
        )
        return PropertiesFromEndpoint(
            property_field_path=model.property_field_path,
            retriever=retriever,
            config=config,
            parameters=model.parameters or {},
        )
    def create_property_chunking(
        self, model: PropertyChunkingModel, config: Config, **kwargs: Any
    ) -> PropertyChunking:
        record_merge_strategy = (
            self._create_component_from_model(
                model=model.record_merge_strategy, config=config, **kwargs
            )
            if model.record_merge_strategy
            else None
        )

        property_limit_type: PropertyLimitType
        match model.property_limit_type:
            case PropertyLimitTypeModel.property_count:
                property_limit_type = PropertyLimitType.property_count
            case PropertyLimitTypeModel.characters:
                property_limit_type = PropertyLimitType.characters
            case _:
                raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}")

        return PropertyChunking(
            property_limit_type=property_limit_type,
            property_limit=model.property_limit,
            record_merge_strategy=record_merge_strategy,
            config=config,
            parameters=model.parameters or {},
        )
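    # Illustrative only: a PropertyChunking definition matching the model fields above. The merge
    # strategy type name and its key field are assumptions.
    #
    # example_property_chunking = {
    #     "type": "PropertyChunking",
    #     "property_limit_type": "property_count",
    #     "property_limit": 100,
    #     "record_merge_strategy": {"type": "GroupByKeyMergeStrategy", "key": "id"},
    # }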
3025 def create_query_properties( 3026 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 3027 ) -> QueryProperties: 3028 if isinstance(model.property_list, list): 3029 property_list = model.property_list 3030 else: 3031 property_list = self._create_component_from_model( 3032 model=model.property_list, config=config, **kwargs 3033 ) 3034 3035 property_chunking = ( 3036 self._create_component_from_model( 3037 model=model.property_chunking, config=config, **kwargs 3038 ) 3039 if model.property_chunking 3040 else None 3041 ) 3042 3043 return QueryProperties( 3044 property_list=property_list, 3045 always_include_properties=model.always_include_properties, 3046 property_chunking=property_chunking, 3047 config=config, 3048 parameters=model.parameters or {}, 3049 )
    @staticmethod
    def create_request_option(
        model: RequestOptionModel, config: Config, **kwargs: Any
    ) -> RequestOption:
        inject_into = RequestOptionType(model.inject_into.value)
        field_path: Optional[List[Union[InterpolatedString, str]]] = (
            [
                InterpolatedString.create(segment, parameters=kwargs.get("parameters", {}))
                for segment in model.field_path
            ]
            if model.field_path
            else None
        )
        field_name = (
            InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {}))
            if model.field_name
            else None
        )
        return RequestOption(
            field_name=field_name,
            field_path=field_path,
            inject_into=inject_into,
            parameters=kwargs.get("parameters", {}),
        )
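    # Illustrative only: two RequestOption shapes handled above. The first injects a single query
    # parameter; the second uses field_path to write into a nested body property (each field_path
    # segment is interpolated, as in the list comprehension above). Field values are assumptions.
    #
    # example_query_param_option = {
    #     "type": "RequestOption",
    #     "inject_into": "request_parameter",
    #     "field_name": "since",
    # }
    # example_nested_body_option = {
    #     "type": "RequestOption",
    #     "inject_into": "body_json",
    #     "field_path": ["filters", "updated_at"],
    # }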
3088 def create_record_selector( 3089 self, 3090 model: RecordSelectorModel, 3091 config: Config, 3092 *, 3093 name: str, 3094 transformations: List[RecordTransformation] | None = None, 3095 decoder: Decoder | None = None, 3096 client_side_incremental_sync: Dict[str, Any] | None = None, 3097 file_uploader: Optional[DefaultFileUploader] = None, 3098 **kwargs: Any, 3099 ) -> RecordSelector: 3100 extractor = self._create_component_from_model( 3101 model=model.extractor, decoder=decoder, config=config 3102 ) 3103 record_filter = ( 3104 self._create_component_from_model(model.record_filter, config=config) 3105 if model.record_filter 3106 else None 3107 ) 3108 3109 transform_before_filtering = ( 3110 False if model.transform_before_filtering is None else model.transform_before_filtering 3111 ) 3112 if client_side_incremental_sync: 3113 record_filter = ClientSideIncrementalRecordFilterDecorator( 3114 config=config, 3115 parameters=model.parameters, 3116 condition=model.record_filter.condition 3117 if (model.record_filter and hasattr(model.record_filter, "condition")) 3118 else None, 3119 **client_side_incremental_sync, 3120 ) 3121 transform_before_filtering = ( 3122 True 3123 if model.transform_before_filtering is None 3124 else model.transform_before_filtering 3125 ) 3126 3127 if model.schema_normalization is None: 3128 # default to no schema normalization if not set 3129 model.schema_normalization = SchemaNormalizationModel.None_ 3130 3131 schema_normalization = ( 3132 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3133 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3134 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3135 ) 3136 3137 return RecordSelector( 3138 extractor=extractor, 3139 name=name, 3140 config=config, 3141 record_filter=record_filter, 3142 transformations=transformations or [], 3143 file_uploader=file_uploader, 3144 schema_normalization=schema_normalization, 3145 parameters=model.parameters or {}, 3146 transform_before_filtering=transform_before_filtering, 3147 )
    def create_selective_authenticator(
        self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any
    ) -> DeclarativeAuthenticator:
        authenticators = {
            name: self._create_component_from_model(model=auth, config=config)
            for name, auth in model.authenticators.items()
        }
        # SelectiveAuthenticator will return an instance of DeclarativeAuthenticator or raise a ValueError
        return SelectiveAuthenticator(  # type: ignore[abstract]
            config=config,
            authenticators=authenticators,
            authenticator_selection_path=model.authenticator_selection_path,
            **kwargs,
        )
3172 @staticmethod 3173 def create_legacy_session_token_authenticator( 3174 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3175 ) -> LegacySessionTokenAuthenticator: 3176 return LegacySessionTokenAuthenticator( 3177 api_url=url_base, 3178 header=model.header, 3179 login_url=model.login_url, 3180 password=model.password or "", 3181 session_token=model.session_token or "", 3182 session_token_response_key=model.session_token_response_key or "", 3183 username=model.username or "", 3184 validate_session_url=model.validate_session_url, 3185 config=config, 3186 parameters=model.parameters or {}, 3187 )
3189 def create_simple_retriever( 3190 self, 3191 model: SimpleRetrieverModel, 3192 config: Config, 3193 *, 3194 name: str, 3195 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3196 stream_slicer: Optional[StreamSlicer], 3197 request_options_provider: Optional[RequestOptionsProvider] = None, 3198 stop_condition_cursor: Optional[Cursor] = None, 3199 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3200 transformations: List[RecordTransformation], 3201 file_uploader: Optional[DefaultFileUploader] = None, 3202 incremental_sync: Optional[ 3203 Union[ 3204 IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel 3205 ] 3206 ] = None, 3207 use_cache: Optional[bool] = None, 3208 log_formatter: Optional[Callable[[Response], Any]] = None, 3209 **kwargs: Any, 3210 ) -> SimpleRetriever: 3211 def _get_url() -> str: 3212 """ 3213 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3214 This is needed because the URL is not set until the requester is created. 3215 """ 3216 3217 _url: str = ( 3218 model.requester.url 3219 if hasattr(model.requester, "url") and model.requester.url is not None 3220 else requester.get_url() 3221 ) 3222 _url_base: str = ( 3223 model.requester.url_base 3224 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3225 else requester.get_url_base() 3226 ) 3227 3228 return _url or _url_base 3229 3230 decoder = ( 3231 self._create_component_from_model(model=model.decoder, config=config) 3232 if model.decoder 3233 else JsonDecoder(parameters={}) 3234 ) 3235 record_selector = self._create_component_from_model( 3236 model=model.record_selector, 3237 name=name, 3238 config=config, 3239 decoder=decoder, 3240 transformations=transformations, 3241 client_side_incremental_sync=client_side_incremental_sync, 3242 file_uploader=file_uploader, 3243 ) 3244 3245 query_properties: Optional[QueryProperties] = None 3246 query_properties_key: Optional[str] = None 3247 if self._query_properties_in_request_parameters(model.requester): 3248 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3249 # places instead of default to request_parameters which isn't clearly documented 3250 if ( 3251 hasattr(model.requester, "fetch_properties_from_endpoint") 3252 and model.requester.fetch_properties_from_endpoint 3253 ): 3254 raise ValueError( 3255 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3256 ) 3257 3258 query_properties_definitions = [] 3259 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3260 if isinstance(request_parameter, QueryPropertiesModel): 3261 query_properties_key = key 3262 query_properties_definitions.append(request_parameter) 3263 3264 if len(query_properties_definitions) > 1: 3265 raise ValueError( 3266 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3267 ) 3268 3269 if len(query_properties_definitions) == 1: 3270 query_properties = self._create_component_from_model( 3271 model=query_properties_definitions[0], config=config 3272 ) 3273 elif ( 3274 hasattr(model.requester, "fetch_properties_from_endpoint") 3275 and model.requester.fetch_properties_from_endpoint 3276 
): 3277 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3278 query_properties_definition = QueryPropertiesModel( 3279 type="QueryProperties", 3280 property_list=model.requester.fetch_properties_from_endpoint, 3281 always_include_properties=None, 3282 property_chunking=None, 3283 ) # type: ignore # $parameters has a default value 3284 3285 query_properties = self.create_query_properties( 3286 model=query_properties_definition, 3287 config=config, 3288 ) 3289 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3290 query_properties = self.create_query_properties( 3291 model=model.requester.query_properties, 3292 config=config, 3293 ) 3294 3295 requester = self._create_component_from_model( 3296 model=model.requester, 3297 decoder=decoder, 3298 name=name, 3299 query_properties_key=query_properties_key, 3300 use_cache=use_cache, 3301 config=config, 3302 ) 3303 3304 # Define cursor only if per partition or common incremental support is needed 3305 cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None 3306 3307 if ( 3308 not isinstance(stream_slicer, DatetimeBasedCursor) 3309 or type(stream_slicer) is not DatetimeBasedCursor 3310 ): 3311 # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods). 3312 # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement 3313 # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's 3314 # request_options_provider 3315 request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={}) 3316 elif not request_options_provider: 3317 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3318 3319 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3320 if self._should_limit_slices_fetched(): 3321 stream_slicer = cast( 3322 StreamSlicer, 3323 StreamSlicerTestReadDecorator( 3324 wrapped_slicer=stream_slicer, 3325 maximum_number_of_slices=self._limit_slices_fetched or 5, 3326 ), 3327 ) 3328 3329 paginator = ( 3330 self._create_component_from_model( 3331 model=model.paginator, 3332 config=config, 3333 url_base=_get_url(), 3334 extractor_model=model.record_selector.extractor, 3335 decoder=decoder, 3336 cursor_used_for_stop_condition=stop_condition_cursor or None, 3337 ) 3338 if model.paginator 3339 else NoPagination(parameters={}) 3340 ) 3341 3342 ignore_stream_slicer_parameters_on_paginated_requests = ( 3343 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3344 ) 3345 3346 if ( 3347 model.partition_router 3348 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3349 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3350 and any( 3351 parent_stream_config.lazy_read_pointer 3352 for parent_stream_config in model.partition_router.parent_stream_configs 3353 ) 3354 ): 3355 if incremental_sync: 3356 if incremental_sync.type != "DatetimeBasedCursor": 3357 raise ValueError( 3358 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3359 ) 3360 3361 elif incremental_sync.step or incremental_sync.cursor_granularity: 3362 raise ValueError( 3363 f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 
3364 ) 3365 3366 if model.decoder and model.decoder.type != "JsonDecoder": 3367 raise ValueError( 3368 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}." 3369 ) 3370 3371 return LazySimpleRetriever( 3372 name=name, 3373 paginator=paginator, 3374 primary_key=primary_key, 3375 requester=requester, 3376 record_selector=record_selector, 3377 stream_slicer=stream_slicer, 3378 request_option_provider=request_options_provider, 3379 cursor=cursor, 3380 config=config, 3381 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3382 parameters=model.parameters or {}, 3383 ) 3384 3385 return SimpleRetriever( 3386 name=name, 3387 paginator=paginator, 3388 primary_key=primary_key, 3389 requester=requester, 3390 record_selector=record_selector, 3391 stream_slicer=stream_slicer, 3392 request_option_provider=request_options_provider, 3393 cursor=cursor, 3394 config=config, 3395 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3396 additional_query_properties=query_properties, 3397 log_formatter=self._get_log_formatter(log_formatter, name), 3398 parameters=model.parameters or {}, 3399 )
    def create_state_delegating_stream(
        self,
        model: StateDelegatingStreamModel,
        config: Config,
        has_parent_state: Optional[bool] = None,
        **kwargs: Any,
    ) -> DeclarativeStream:
        if (
            model.full_refresh_stream.name != model.name
            or model.name != model.incremental_stream.name
        ):
            raise ValueError(
                f"The state_delegating_stream, its full_refresh_stream and its incremental_stream must have equal names. Instead found {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
            )

        stream_model = (
            model.incremental_stream
            if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state
            else model.full_refresh_stream
        )

        return self._create_component_from_model(stream_model, config=config, **kwargs)  # type: ignore[no-any-return]  # A DeclarativeStream is created because stream_model is a stream description
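    # Illustrative only: a StateDelegatingStream definition matching the checks above. All three
    # names must be equal; the "$ref" pointers to stream definitions are assumptions.
    #
    # example_state_delegating_stream = {
    #     "type": "StateDelegatingStream",
    #     "name": "orders",
    #     "full_refresh_stream": {"$ref": "#/definitions/orders_full_refresh_stream"},
    #     "incremental_stream": {"$ref": "#/definitions/orders_incremental_stream"},
    # }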
3502 def create_async_retriever( 3503 self, 3504 model: AsyncRetrieverModel, 3505 config: Config, 3506 *, 3507 name: str, 3508 primary_key: Optional[ 3509 Union[str, List[str], List[List[str]]] 3510 ], # this seems to be needed to match create_simple_retriever 3511 stream_slicer: Optional[StreamSlicer], 3512 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3513 transformations: List[RecordTransformation], 3514 **kwargs: Any, 3515 ) -> AsyncRetriever: 3516 def _get_download_retriever() -> SimpleRetriever: 3517 # We create a record selector for the download retriever 3518 # with no schema normalization and no transformations, neither record filter 3519 # as all this occurs in the record_selector of the AsyncRetriever 3520 record_selector = RecordSelector( 3521 extractor=download_extractor, 3522 name=name, 3523 record_filter=None, 3524 transformations=[], 3525 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3526 config=config, 3527 parameters={}, 3528 ) 3529 paginator = ( 3530 self._create_component_from_model( 3531 model=model.download_paginator, 3532 decoder=decoder, 3533 config=config, 3534 url_base="", 3535 ) 3536 if model.download_paginator 3537 else NoPagination(parameters={}) 3538 ) 3539 3540 return SimpleRetriever( 3541 requester=download_requester, 3542 record_selector=record_selector, 3543 primary_key=None, 3544 name=name, 3545 paginator=paginator, 3546 config=config, 3547 parameters={}, 3548 log_formatter=self._get_log_formatter(None, name), 3549 ) 3550 3551 def _get_job_timeout() -> datetime.timedelta: 3552 user_defined_timeout: Optional[int] = ( 3553 int( 3554 InterpolatedString.create( 3555 str(model.polling_job_timeout), 3556 parameters={}, 3557 ).eval(config) 3558 ) 3559 if model.polling_job_timeout 3560 else None 3561 ) 3562 3563 # check for user defined timeout during the test read or 15 minutes 3564 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3565 # default value for non-connector builder is 60 minutes. 
3566 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3567 3568 return ( 3569 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3570 ) 3571 3572 decoder = ( 3573 self._create_component_from_model(model=model.decoder, config=config) 3574 if model.decoder 3575 else JsonDecoder(parameters={}) 3576 ) 3577 record_selector = self._create_component_from_model( 3578 model=model.record_selector, 3579 config=config, 3580 decoder=decoder, 3581 name=name, 3582 transformations=transformations, 3583 client_side_incremental_sync=client_side_incremental_sync, 3584 ) 3585 3586 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3587 if self._should_limit_slices_fetched(): 3588 stream_slicer = cast( 3589 StreamSlicer, 3590 StreamSlicerTestReadDecorator( 3591 wrapped_slicer=stream_slicer, 3592 maximum_number_of_slices=self._limit_slices_fetched or 5, 3593 ), 3594 ) 3595 3596 creation_requester = self._create_component_from_model( 3597 model=model.creation_requester, 3598 decoder=decoder, 3599 config=config, 3600 name=f"job creation - {name}", 3601 ) 3602 polling_requester = self._create_component_from_model( 3603 model=model.polling_requester, 3604 decoder=decoder, 3605 config=config, 3606 name=f"job polling - {name}", 3607 ) 3608 job_download_components_name = f"job download - {name}" 3609 download_decoder = ( 3610 self._create_component_from_model(model=model.download_decoder, config=config) 3611 if model.download_decoder 3612 else JsonDecoder(parameters={}) 3613 ) 3614 download_extractor = ( 3615 self._create_component_from_model( 3616 model=model.download_extractor, 3617 config=config, 3618 decoder=download_decoder, 3619 parameters=model.parameters, 3620 ) 3621 if model.download_extractor 3622 else DpathExtractor( 3623 [], 3624 config=config, 3625 decoder=download_decoder, 3626 parameters=model.parameters or {}, 3627 ) 3628 ) 3629 download_requester = self._create_component_from_model( 3630 model=model.download_requester, 3631 decoder=download_decoder, 3632 config=config, 3633 name=job_download_components_name, 3634 ) 3635 download_retriever = _get_download_retriever() 3636 abort_requester = ( 3637 self._create_component_from_model( 3638 model=model.abort_requester, 3639 decoder=decoder, 3640 config=config, 3641 name=f"job abort - {name}", 3642 ) 3643 if model.abort_requester 3644 else None 3645 ) 3646 delete_requester = ( 3647 self._create_component_from_model( 3648 model=model.delete_requester, 3649 decoder=decoder, 3650 config=config, 3651 name=f"job delete - {name}", 3652 ) 3653 if model.delete_requester 3654 else None 3655 ) 3656 download_target_requester = ( 3657 self._create_component_from_model( 3658 model=model.download_target_requester, 3659 decoder=decoder, 3660 config=config, 3661 name=f"job extract_url - {name}", 3662 ) 3663 if model.download_target_requester 3664 else None 3665 ) 3666 status_extractor = self._create_component_from_model( 3667 model=model.status_extractor, decoder=decoder, config=config, name=name 3668 ) 3669 download_target_extractor = self._create_component_from_model( 3670 model=model.download_target_extractor, 3671 decoder=decoder, 3672 config=config, 3673 name=name, 3674 ) 3675 3676 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3677 creation_requester=creation_requester, 3678 polling_requester=polling_requester, 3679 download_retriever=download_retriever, 3680 download_target_requester=download_target_requester, 3681 abort_requester=abort_requester, 3682 
delete_requester=delete_requester, 3683 status_extractor=status_extractor, 3684 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3685 download_target_extractor=download_target_extractor, 3686 job_timeout=_get_job_timeout(), 3687 ) 3688 3689 async_job_partition_router = AsyncJobPartitionRouter( 3690 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3691 job_repository, 3692 stream_slices, 3693 self._job_tracker, 3694 self._message_repository, 3695 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3696 has_bulk_parent=False, 3697 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3698 # `None` == default retry is set to 3 attempts, under the hood. 3699 job_max_retry=1 if self._emit_connector_builder_messages else None, 3700 ), 3701 stream_slicer=stream_slicer, 3702 config=config, 3703 parameters=model.parameters or {}, 3704 ) 3705 3706 return AsyncRetriever( 3707 record_selector=record_selector, 3708 stream_slicer=async_job_partition_router, 3709 config=config, 3710 parameters=model.parameters or {}, 3711 )
3713 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3714 config_migrations = [ 3715 self._create_component_from_model(migration, config) 3716 for migration in ( 3717 model.config_normalization_rules.config_migrations 3718 if ( 3719 model.config_normalization_rules 3720 and model.config_normalization_rules.config_migrations 3721 ) 3722 else [] 3723 ) 3724 ] 3725 config_transformations = [ 3726 self._create_component_from_model(transformation, config) 3727 for transformation in ( 3728 model.config_normalization_rules.transformations 3729 if ( 3730 model.config_normalization_rules 3731 and model.config_normalization_rules.transformations 3732 ) 3733 else [] 3734 ) 3735 ] 3736 config_validations = [ 3737 self._create_component_from_model(validation, config) 3738 for validation in ( 3739 model.config_normalization_rules.validations 3740 if ( 3741 model.config_normalization_rules 3742 and model.config_normalization_rules.validations 3743 ) 3744 else [] 3745 ) 3746 ] 3747 3748 return Spec( 3749 connection_specification=model.connection_specification, 3750 documentation_url=model.documentation_url, 3751 advanced_auth=model.advanced_auth, 3752 parameters={}, 3753 config_migrations=config_migrations, 3754 config_transformations=config_transformations, 3755 config_validations=config_validations, 3756 )
3758 def create_substream_partition_router( 3759 self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any 3760 ) -> SubstreamPartitionRouter: 3761 parent_stream_configs = [] 3762 if model.parent_stream_configs: 3763 parent_stream_configs.extend( 3764 [ 3765 self._create_message_repository_substream_wrapper( 3766 model=parent_stream_config, config=config, **kwargs 3767 ) 3768 for parent_stream_config in model.parent_stream_configs 3769 ] 3770 ) 3771 3772 return SubstreamPartitionRouter( 3773 parent_stream_configs=parent_stream_configs, 3774 parameters=model.parameters or {}, 3775 config=config, 3776 )
    @staticmethod
    def create_wait_time_from_header(
        model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any
    ) -> WaitTimeFromHeaderBackoffStrategy:
        return WaitTimeFromHeaderBackoffStrategy(
            header=model.header,
            parameters=model.parameters or {},
            config=config,
            regex=model.regex,
            max_waiting_time_in_seconds=model.max_waiting_time_in_seconds,
        )
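    # Illustrative only: a WaitTimeFromHeader backoff strategy mirroring the fields above; the
    # header name and cap are assumptions.
    #
    # example_wait_time_from_header = {
    #     "type": "WaitTimeFromHeader",
    #     "header": "Retry-After",
    #     "max_waiting_time_in_seconds": 600,
    # }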
    @staticmethod
    def create_wait_until_time_from_header(
        model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any
    ) -> WaitUntilTimeFromHeaderBackoffStrategy:
        return WaitUntilTimeFromHeaderBackoffStrategy(
            header=model.header,
            parameters=model.parameters or {},
            config=config,
            min_wait=model.min_wait,
            regex=model.regex,
        )
3836 @staticmethod 3837 def create_components_mapping_definition( 3838 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3839 ) -> ComponentMappingDefinition: 3840 interpolated_value = InterpolatedString.create( 3841 model.value, parameters=model.parameters or {} 3842 ) 3843 field_path = [ 3844 InterpolatedString.create(path, parameters=model.parameters or {}) 3845 for path in model.field_path 3846 ] 3847 return ComponentMappingDefinition( 3848 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3849 value=interpolated_value, 3850 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3851 create_or_update=model.create_or_update, 3852 condition=model.condition, 3853 parameters=model.parameters or {}, 3854 )
3856 def create_http_components_resolver( 3857 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 3858 ) -> Any: 3859 stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config) 3860 combined_slicers = self._build_resumable_cursor(model.retriever, stream_slicer) 3861 3862 retriever = self._create_component_from_model( 3863 model=model.retriever, 3864 config=config, 3865 name=f"{stream_name if stream_name else '__http_components_resolver'}", 3866 primary_key=None, 3867 stream_slicer=stream_slicer if stream_slicer else combined_slicers, 3868 transformations=[], 3869 ) 3870 3871 components_mapping = [] 3872 for component_mapping_definition_model in model.components_mapping: 3873 if component_mapping_definition_model.condition: 3874 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 3875 components_mapping.append( 3876 self._create_component_from_model( 3877 model=component_mapping_definition_model, 3878 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3879 component_mapping_definition_model.value_type 3880 ), 3881 config=config, 3882 ) 3883 ) 3884 3885 return HttpComponentsResolver( 3886 retriever=retriever, 3887 config=config, 3888 components_mapping=components_mapping, 3889 parameters=model.parameters or {}, 3890 )
    @staticmethod
    def create_stream_config(
        model: StreamConfigModel, config: Config, **kwargs: Any
    ) -> StreamConfig:
        model_configs_pointer: List[Union[InterpolatedString, str]] = (
            [x for x in model.configs_pointer] if model.configs_pointer else []
        )

        return StreamConfig(
            configs_pointer=model_configs_pointer,
            default_values=model.default_values,
            parameters=model.parameters or {},
        )
3906 def create_config_components_resolver( 3907 self, 3908 model: ConfigComponentsResolverModel, 3909 config: Config, 3910 ) -> Any: 3911 model_stream_configs = ( 3912 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3913 ) 3914 3915 stream_configs = [ 3916 self._create_component_from_model( 3917 stream_config, config=config, parameters=model.parameters or {} 3918 ) 3919 for stream_config in model_stream_configs 3920 ] 3921 3922 components_mapping = [ 3923 self._create_component_from_model( 3924 model=components_mapping_definition_model, 3925 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3926 components_mapping_definition_model.value_type 3927 ), 3928 config=config, 3929 parameters=model.parameters, 3930 ) 3931 for components_mapping_definition_model in model.components_mapping 3932 ] 3933 3934 return ConfigComponentsResolver( 3935 stream_configs=stream_configs, 3936 config=config, 3937 components_mapping=components_mapping, 3938 parameters=model.parameters or {}, 3939 )
3941 def create_parametrized_components_resolver( 3942 self, 3943 model: ParametrizedComponentsResolverModel, 3944 config: Config, 3945 ) -> ParametrizedComponentsResolver: 3946 stream_parameters = StreamParametersDefinition( 3947 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 3948 ) 3949 3950 components_mapping = [] 3951 for components_mapping_definition_model in model.components_mapping: 3952 if components_mapping_definition_model.condition: 3953 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 3954 components_mapping.append( 3955 self._create_component_from_model( 3956 model=components_mapping_definition_model, 3957 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3958 components_mapping_definition_model.value_type 3959 ), 3960 config=config, 3961 ) 3962 ) 3963 return ParametrizedComponentsResolver( 3964 stream_parameters=stream_parameters, 3965 config=config, 3966 components_mapping=components_mapping, 3967 parameters=model.parameters or {}, 3968 )
    def create_http_api_budget(
        self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any
    ) -> HttpAPIBudget:
        policies = [
            self._create_component_from_model(model=policy, config=config)
            for policy in model.policies
        ]

        return HttpAPIBudget(
            policies=policies,
            ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset",
            ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining",
            status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429],
        )
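    # Illustrative only: an HTTPAPIBudget with a single moving-window policy, mirroring the fields
    # consumed here and in create_moving_window_call_rate_policy / create_http_request_matcher
    # below. The rate interval format and the matcher values are assumptions.
    #
    # example_api_budget = {
    #     "type": "HTTPAPIBudget",
    #     "ratelimit_reset_header": "X-RateLimit-Reset",
    #     "ratelimit_remaining_header": "X-RateLimit-Remaining",
    #     "status_codes_for_ratelimit_hit": [429],
    #     "policies": [
    #         {
    #             "type": "MovingWindowCallRatePolicy",
    #             "rates": [{"type": "Rate", "limit": 100, "interval": "PT1M"}],
    #             "matchers": [{"type": "HttpRequestRegexMatcher", "url_path_pattern": "/v1/items"}],
    #         }
    #     ],
    # }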
4007 def create_fixed_window_call_rate_policy( 4008 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4009 ) -> FixedWindowCallRatePolicy: 4010 matchers = [ 4011 self._create_component_from_model(model=matcher, config=config) 4012 for matcher in model.matchers 4013 ] 4014 4015 # Set the initial reset timestamp to 10 days from now. 4016 # This value will be updated by the first request. 4017 return FixedWindowCallRatePolicy( 4018 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4019 period=parse_duration(model.period), 4020 call_limit=model.call_limit, 4021 matchers=matchers, 4022 )
4024 def create_file_uploader( 4025 self, model: FileUploaderModel, config: Config, **kwargs: Any 4026 ) -> FileUploader: 4027 name = "File Uploader" 4028 requester = self._create_component_from_model( 4029 model=model.requester, 4030 config=config, 4031 name=name, 4032 **kwargs, 4033 ) 4034 download_target_extractor = self._create_component_from_model( 4035 model=model.download_target_extractor, 4036 config=config, 4037 name=name, 4038 **kwargs, 4039 ) 4040 emit_connector_builder_messages = self._emit_connector_builder_messages 4041 file_uploader = DefaultFileUploader( 4042 requester=requester, 4043 download_target_extractor=download_target_extractor, 4044 config=config, 4045 file_writer=NoopFileWriter() 4046 if emit_connector_builder_messages 4047 else LocalFileSystemFileWriter(), 4048 parameters=model.parameters or {}, 4049 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4050 ) 4051 4052 return ( 4053 ConnectorBuilderFileUploader(file_uploader) 4054 if emit_connector_builder_messages 4055 else file_uploader 4056 )
4058 def create_moving_window_call_rate_policy( 4059 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4060 ) -> MovingWindowCallRatePolicy: 4061 rates = [ 4062 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4063 ] 4064 matchers = [ 4065 self._create_component_from_model(model=matcher, config=config) 4066 for matcher in model.matchers 4067 ] 4068 return MovingWindowCallRatePolicy( 4069 rates=rates, 4070 matchers=matchers, 4071 )
4073 def create_unlimited_call_rate_policy( 4074 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4075 ) -> UnlimitedCallRatePolicy: 4076 matchers = [ 4077 self._create_component_from_model(model=matcher, config=config) 4078 for matcher in model.matchers 4079 ] 4080 4081 return UnlimitedCallRatePolicy( 4082 matchers=matchers, 4083 )
    def create_http_request_matcher(
        self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any
    ) -> HttpRequestRegexMatcher:
        return HttpRequestRegexMatcher(
            method=model.method,
            url_base=model.url_base,
            url_path_pattern=model.url_path_pattern,
            params=model.params,
            headers=model.headers,
        )
    def create_grouping_partition_router(
        self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any
    ) -> GroupingPartitionRouter:
        underlying_router = self._create_component_from_model(
            model=model.underlying_partition_router, config=config
        )
        if model.group_size < 1:
            raise ValueError(f"Group size must be greater than 0, got {model.group_size}")

        # Request options in underlying partition routers are not supported for GroupingPartitionRouter
        # because they are specific to individual partitions and cannot be aggregated or handled
        # when grouping, potentially leading to incorrect API calls. Any request customization
        # should be managed at the stream level through the requester's configuration.
        if isinstance(underlying_router, SubstreamPartitionRouter):
            if any(
                parent_config.request_option
                for parent_config in underlying_router.parent_stream_configs
            ):
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        if isinstance(underlying_router, ListPartitionRouter):
            if underlying_router.request_option:
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        return GroupingPartitionRouter(
            group_size=model.group_size,
            underlying_partition_router=underlying_router,
            deduplicate=model.deduplicate if model.deduplicate is not None else True,
            config=config,
        )
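    # Illustrative only: a GroupingPartitionRouter wrapping a ListPartitionRouter, consistent with
    # the validation above (the underlying router must not define request options). Values are
    # assumptions.
    #
    # example_grouping_partition_router = {
    #     "type": "GroupingPartitionRouter",
    #     "group_size": 10,
    #     "deduplicate": True,
    #     "underlying_partition_router": {
    #         "type": "ListPartitionRouter",
    #         "values": "{{ config['account_ids'] }}",
    #         "cursor_field": "account_id",
    #     },
    # }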