airbyte_cdk.sources.declarative.parsers.model_to_component_factory
1# 2# Copyright (c) 2025 Airbyte, Inc., all rights reserved. 3# 4 5from __future__ import annotations 6 7import datetime 8import importlib 9import inspect 10import logging 11import re 12from functools import partial 13from typing import ( 14 TYPE_CHECKING, 15 Any, 16 Callable, 17 Dict, 18 List, 19 Mapping, 20 MutableMapping, 21 Optional, 22 Tuple, 23 Type, 24 Union, 25 cast, 26 get_args, 27 get_origin, 28 get_type_hints, 29) 30 31if TYPE_CHECKING: 32 from airbyte_cdk.legacy.sources.declarative.incremental.datetime_based_cursor import ( 33 DatetimeBasedCursor, 34 ) 35 36from airbyte_protocol_dataclasses.models import ConfiguredAirbyteStream 37from isodate import parse_duration 38from pydantic.v1 import BaseModel 39from requests import Response 40 41from airbyte_cdk.connector_builder.models import ( 42 LogMessage as ConnectorBuilderLogMessage, 43) 44from airbyte_cdk.models import ( 45 AirbyteStateBlob, 46 AirbyteStateMessage, 47 AirbyteStateType, 48 AirbyteStreamState, 49 ConfiguredAirbyteCatalog, 50 FailureType, 51 Level, 52 StreamDescriptor, 53) 54from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager 55from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator 56from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker 57from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository 58from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus 59from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator 60from airbyte_cdk.sources.declarative.auth.declarative_authenticator import ( 61 DeclarativeAuthenticator, 62 NoAuth, 63) 64from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm 65from airbyte_cdk.sources.declarative.auth.oauth import ( 66 DeclarativeSingleUseRefreshTokenOauth2Authenticator, 67) 68from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator 69from 
airbyte_cdk.sources.declarative.auth.token import ( 70 ApiKeyAuthenticator, 71 BasicHttpAuthenticator, 72 BearerAuthenticator, 73 LegacySessionTokenAuthenticator, 74) 75from airbyte_cdk.sources.declarative.auth.token_provider import ( 76 InterpolatedSessionTokenProvider, 77 InterpolatedStringTokenProvider, 78 SessionTokenProvider, 79 TokenProvider, 80) 81from airbyte_cdk.sources.declarative.checks import ( 82 CheckDynamicStream, 83 CheckStream, 84 DynamicStreamCheckConfig, 85) 86from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel 87from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime 88from airbyte_cdk.sources.declarative.decoders import ( 89 Decoder, 90 IterableDecoder, 91 JsonDecoder, 92 PaginationDecoderDecorator, 93 XmlDecoder, 94 ZipfileDecoder, 95) 96from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import ( 97 CompositeRawDecoder, 98 CsvParser, 99 GzipParser, 100 JsonLineParser, 101 JsonParser, 102 Parser, 103) 104from airbyte_cdk.sources.declarative.expanders.record_expander import ( 105 OnNoRecords, 106 RecordExpander, 107) 108from airbyte_cdk.sources.declarative.extractors import ( 109 DpathExtractor, 110 RecordFilter, 111 RecordSelector, 112 ResponseToFileExtractor, 113) 114from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor 115from airbyte_cdk.sources.declarative.extractors.record_filter import ( 116 ClientSideIncrementalRecordFilterDecorator, 117) 118from airbyte_cdk.sources.declarative.incremental import ( 119 ConcurrentCursorFactory, 120 ConcurrentPerPartitionCursor, 121) 122from airbyte_cdk.sources.declarative.interpolation import InterpolatedString 123from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping 124from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import ( 125 LegacyToPerPartitionStateMigration, 126) 127from airbyte_cdk.sources.declarative.models 
import ( 128 CustomStateMigration, 129 PaginationResetLimits, 130) 131from airbyte_cdk.sources.declarative.models.base_model_with_deprecations import ( 132 DEPRECATION_LOGS_TAG, 133 BaseModelWithDeprecations, 134) 135from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 136 Action1 as PaginationResetActionModel, 137) 138from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 139 AddedFieldDefinition as AddedFieldDefinitionModel, 140) 141from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 142 AddFields as AddFieldsModel, 143) 144from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 145 ApiKeyAuthenticator as ApiKeyAuthenticatorModel, 146) 147from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 148 AsyncJobStatusMap as AsyncJobStatusMapModel, 149) 150from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 151 AsyncRetriever as AsyncRetrieverModel, 152) 153from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 154 BasicHttpAuthenticator as BasicHttpAuthenticatorModel, 155) 156from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 157 BearerAuthenticator as BearerAuthenticatorModel, 158) 159from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 160 CheckDynamicStream as CheckDynamicStreamModel, 161) 162from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 163 CheckStream as CheckStreamModel, 164) 165from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 166 ComplexFieldType as ComplexFieldTypeModel, 167) 168from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 169 ComponentMappingDefinition as ComponentMappingDefinitionModel, 170) 171from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 172 CompositeErrorHandler as 
CompositeErrorHandlerModel, 173) 174from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 175 ConcurrencyLevel as ConcurrencyLevelModel, 176) 177from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 178 ConfigAddFields as ConfigAddFieldsModel, 179) 180from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 181 ConfigComponentsResolver as ConfigComponentsResolverModel, 182) 183from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 184 ConfigMigration as ConfigMigrationModel, 185) 186from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 187 ConfigRemapField as ConfigRemapFieldModel, 188) 189from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 190 ConfigRemoveFields as ConfigRemoveFieldsModel, 191) 192from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 193 ConstantBackoffStrategy as ConstantBackoffStrategyModel, 194) 195from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 196 CsvDecoder as CsvDecoderModel, 197) 198from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 199 CursorPagination as CursorPaginationModel, 200) 201from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 202 CustomAuthenticator as CustomAuthenticatorModel, 203) 204from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 205 CustomBackoffStrategy as CustomBackoffStrategyModel, 206) 207from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 208 CustomConfigTransformation as CustomConfigTransformationModel, 209) 210from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 211 CustomDecoder as CustomDecoderModel, 212) 213from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 214 CustomErrorHandler as CustomErrorHandlerModel, 
215) 216from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 217 CustomPaginationStrategy as CustomPaginationStrategyModel, 218) 219from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 220 CustomPartitionRouter as CustomPartitionRouterModel, 221) 222from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 223 CustomRecordExtractor as CustomRecordExtractorModel, 224) 225from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 226 CustomRecordFilter as CustomRecordFilterModel, 227) 228from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 229 CustomRequester as CustomRequesterModel, 230) 231from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 232 CustomRetriever as CustomRetrieverModel, 233) 234from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 235 CustomSchemaLoader as CustomSchemaLoader, 236) 237from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 238 CustomSchemaNormalization as CustomSchemaNormalizationModel, 239) 240from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 241 CustomTransformation as CustomTransformationModel, 242) 243from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 244 CustomValidationStrategy as CustomValidationStrategyModel, 245) 246from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 247 DatetimeBasedCursor as DatetimeBasedCursorModel, 248) 249from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 250 DeclarativeStream as DeclarativeStreamModel, 251) 252from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 253 DefaultErrorHandler as DefaultErrorHandlerModel, 254) 255from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 256 DefaultPaginator as 
DefaultPaginatorModel, 257) 258from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 259 DpathExtractor as DpathExtractorModel, 260) 261from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 262 DpathFlattenFields as DpathFlattenFieldsModel, 263) 264from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 265 DpathValidator as DpathValidatorModel, 266) 267from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 268 DynamicSchemaLoader as DynamicSchemaLoaderModel, 269) 270from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 271 DynamicStreamCheckConfig as DynamicStreamCheckConfigModel, 272) 273from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 274 ExponentialBackoffStrategy as ExponentialBackoffStrategyModel, 275) 276from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 277 FileUploader as FileUploaderModel, 278) 279from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 280 FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel, 281) 282from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 283 FlattenFields as FlattenFieldsModel, 284) 285from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 286 GroupByKeyMergeStrategy as GroupByKeyMergeStrategyModel, 287) 288from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 289 GroupingPartitionRouter as GroupingPartitionRouterModel, 290) 291from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 292 GzipDecoder as GzipDecoderModel, 293) 294from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 295 HTTPAPIBudget as HTTPAPIBudgetModel, 296) 297from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 298 HttpComponentsResolver as 
HttpComponentsResolverModel, 299) 300from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 301 HttpRequester as HttpRequesterModel, 302) 303from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 304 HttpRequestRegexMatcher as HttpRequestRegexMatcherModel, 305) 306from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 307 HttpResponseFilter as HttpResponseFilterModel, 308) 309from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 310 IncrementingCountCursor as IncrementingCountCursorModel, 311) 312from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 313 InlineSchemaLoader as InlineSchemaLoaderModel, 314) 315from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 316 IterableDecoder as IterableDecoderModel, 317) 318from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 319 JsonDecoder as JsonDecoderModel, 320) 321from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 322 JsonFileSchemaLoader as JsonFileSchemaLoaderModel, 323) 324from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 325 JsonlDecoder as JsonlDecoderModel, 326) 327from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 328 JsonSchemaPropertySelector as JsonSchemaPropertySelectorModel, 329) 330from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 331 JwtAuthenticator as JwtAuthenticatorModel, 332) 333from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 334 JwtHeaders as JwtHeadersModel, 335) 336from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 337 JwtPayload as JwtPayloadModel, 338) 339from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 340 KeysReplace as KeysReplaceModel, 341) 342from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 343 KeysToLower as KeysToLowerModel, 344) 345from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 346 KeysToSnakeCase as KeysToSnakeCaseModel, 347) 348from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 349 LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel, 350) 351from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 352 LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel, 353) 354from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 355 ListPartitionRouter as ListPartitionRouterModel, 356) 357from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 358 MinMaxDatetime as MinMaxDatetimeModel, 359) 360from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 361 MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel, 362) 363from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 364 NoAuth as NoAuthModel, 365) 366from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 367 NoPagination as NoPaginationModel, 368) 369from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 370 OAuthAuthenticator as OAuthAuthenticatorModel, 371) 372from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 373 OffsetIncrement as OffsetIncrementModel, 374) 375from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 376 PageIncrement as PageIncrementModel, 377) 378from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 379 PaginationReset as PaginationResetModel, 380) 381from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 382 ParametrizedComponentsResolver as ParametrizedComponentsResolverModel, 383) 384from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 385 ParentStreamConfig as ParentStreamConfigModel, 386) 387from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 388 PredicateValidator as PredicateValidatorModel, 389) 390from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 391 PropertiesFromEndpoint as PropertiesFromEndpointModel, 392) 393from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 394 PropertyChunking as PropertyChunkingModel, 395) 396from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 397 PropertyLimitType as PropertyLimitTypeModel, 398) 399from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 400 QueryProperties as QueryPropertiesModel, 401) 402from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 403 Rate as RateModel, 404) 405from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 406 RecordExpander as RecordExpanderModel, 407) 408from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 409 RecordFilter as RecordFilterModel, 410) 411from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 412 RecordSelector as RecordSelectorModel, 413) 414from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 415 RefreshTokenUpdater as RefreshTokenUpdaterModel, 416) 417from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 418 RemoveFields as RemoveFieldsModel, 419) 420from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 421 RequestOption as RequestOptionModel, 422) 423from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 424 RequestPath as RequestPathModel, 425) 426from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 427 ResponseToFileExtractor as 
ResponseToFileExtractorModel, 428) 429from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 430 SchemaNormalization as SchemaNormalizationModel, 431) 432from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 433 SchemaTypeIdentifier as SchemaTypeIdentifierModel, 434) 435from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 436 SelectiveAuthenticator as SelectiveAuthenticatorModel, 437) 438from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 439 SessionTokenAuthenticator as SessionTokenAuthenticatorModel, 440) 441from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 442 SimpleRetriever as SimpleRetrieverModel, 443) 444from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel 445from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 446 StateDelegatingStream as StateDelegatingStreamModel, 447) 448from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 449 StreamConfig as StreamConfigModel, 450) 451from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 452 SubstreamPartitionRouter as SubstreamPartitionRouterModel, 453) 454from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 455 TypesMap as TypesMapModel, 456) 457from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 458 UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel, 459) 460from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 461 ValidateAdheresToSchema as ValidateAdheresToSchemaModel, 462) 463from airbyte_cdk.sources.declarative.models.declarative_component_schema import ValueType 464from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 465 WaitTimeFromHeader as WaitTimeFromHeaderModel, 466) 467from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 468 WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel, 469) 470from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 471 XmlDecoder as XmlDecoderModel, 472) 473from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 474 ZipfileDecoder as ZipfileDecoderModel, 475) 476from airbyte_cdk.sources.declarative.partition_routers import ( 477 CartesianProductStreamSlicer, 478 GroupingPartitionRouter, 479 ListPartitionRouter, 480 PartitionRouter, 481 SinglePartitionRouter, 482 SubstreamPartitionRouter, 483) 484from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import ( 485 AsyncJobPartitionRouter, 486) 487from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( 488 ParentStreamConfig, 489) 490from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption 491from airbyte_cdk.sources.declarative.requesters.error_handlers import ( 492 CompositeErrorHandler, 493 DefaultErrorHandler, 494 HttpResponseFilter, 495) 496from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import ( 497 ConstantBackoffStrategy, 498 ExponentialBackoffStrategy, 499 WaitTimeFromHeaderBackoffStrategy, 500 WaitUntilTimeFromHeaderBackoffStrategy, 501) 502from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository 503from airbyte_cdk.sources.declarative.requesters.paginators import ( 504 DefaultPaginator, 505 NoPagination, 506 PaginatorTestReadDecorator, 507) 508from airbyte_cdk.sources.declarative.requesters.paginators.strategies import ( 509 CursorPaginationStrategy, 510 CursorStopCondition, 511 OffsetIncrement, 512 PageIncrement, 513 StopConditionPaginationStrategyDecorator, 514) 515from airbyte_cdk.sources.declarative.requesters.query_properties import ( 516 PropertiesFromEndpoint, 517 PropertyChunking, 518 
QueryProperties, 519) 520from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import ( 521 PropertyLimitType, 522) 523from airbyte_cdk.sources.declarative.requesters.query_properties.property_selector import ( 524 JsonSchemaPropertySelector, 525) 526from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import ( 527 GroupByKey, 528) 529from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType 530from airbyte_cdk.sources.declarative.requesters.request_options import ( 531 DatetimeBasedRequestOptionsProvider, 532 DefaultRequestOptionsProvider, 533 InterpolatedRequestOptionsProvider, 534 RequestOptionsProvider, 535) 536from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import ( 537 PerPartitionRequestOptionsProvider, 538) 539from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath 540from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester 541from airbyte_cdk.sources.declarative.resolvers import ( 542 ComponentMappingDefinition, 543 ConfigComponentsResolver, 544 HttpComponentsResolver, 545 ParametrizedComponentsResolver, 546 StreamConfig, 547 StreamParametersDefinition, 548) 549from airbyte_cdk.sources.declarative.retrievers import ( 550 AsyncRetriever, 551 LazySimpleRetriever, 552 SimpleRetriever, 553) 554from airbyte_cdk.sources.declarative.retrievers.file_uploader import ( 555 ConnectorBuilderFileUploader, 556 DefaultFileUploader, 557 FileUploader, 558 LocalFileSystemFileWriter, 559 NoopFileWriter, 560) 561from airbyte_cdk.sources.declarative.retrievers.pagination_tracker import PaginationTracker 562from airbyte_cdk.sources.declarative.schema import ( 563 ComplexFieldType, 564 DefaultSchemaLoader, 565 DynamicSchemaLoader, 566 InlineSchemaLoader, 567 JsonFileSchemaLoader, 568 SchemaLoader, 569 SchemaTypeIdentifier, 570 TypesMap, 571) 572from 
airbyte_cdk.sources.declarative.schema.caching_schema_loader_decorator import ( 573 CachingSchemaLoaderDecorator, 574) 575from airbyte_cdk.sources.declarative.schema.composite_schema_loader import CompositeSchemaLoader 576from airbyte_cdk.sources.declarative.spec import ConfigMigration, Spec 577from airbyte_cdk.sources.declarative.stream_slicers import ( 578 StreamSlicer, 579 StreamSlicerTestReadDecorator, 580) 581from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( 582 DeclarativePartitionFactory, 583 StreamSlicerPartitionGenerator, 584) 585from airbyte_cdk.sources.declarative.transformations import ( 586 AddFields, 587 RecordTransformation, 588 RemoveFields, 589) 590from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition 591from airbyte_cdk.sources.declarative.transformations.config_transformations import ( 592 ConfigAddFields, 593 ConfigRemapField, 594 ConfigRemoveFields, 595) 596from airbyte_cdk.sources.declarative.transformations.config_transformations.config_transformation import ( 597 ConfigTransformation, 598) 599from airbyte_cdk.sources.declarative.transformations.dpath_flatten_fields import ( 600 DpathFlattenFields, 601 KeyTransformation, 602) 603from airbyte_cdk.sources.declarative.transformations.flatten_fields import ( 604 FlattenFields, 605) 606from airbyte_cdk.sources.declarative.transformations.keys_replace_transformation import ( 607 KeysReplaceTransformation, 608) 609from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import ( 610 KeysToLowerTransformation, 611) 612from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import ( 613 KeysToSnakeCaseTransformation, 614) 615from airbyte_cdk.sources.declarative.validators import ( 616 DpathValidator, 617 PredicateValidator, 618 ValidateAdheresToSchema, 619) 620from airbyte_cdk.sources.http_logger import format_http_message 621from airbyte_cdk.sources.message import ( 622 
InMemoryMessageRepository, 623 LogAppenderMessageRepositoryDecorator, 624 MessageRepository, 625 NoopMessageRepository, 626) 627from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository 628from airbyte_cdk.sources.streams.call_rate import ( 629 APIBudget, 630 FixedWindowCallRatePolicy, 631 HttpAPIBudget, 632 HttpRequestRegexMatcher, 633 MovingWindowCallRatePolicy, 634 Rate, 635 UnlimitedCallRatePolicy, 636) 637from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream 638from airbyte_cdk.sources.streams.concurrent.clamping import ( 639 ClampingEndProvider, 640 ClampingStrategy, 641 DayClampingStrategy, 642 MonthClampingStrategy, 643 NoClamping, 644 WeekClampingStrategy, 645 Weekday, 646) 647from airbyte_cdk.sources.streams.concurrent.cursor import ( 648 ConcurrentCursor, 649 Cursor, 650 CursorField, 651 FinalStateCursor, 652) 653from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream 654from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream 655from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import ( 656 StreamSlicer as ConcurrentStreamSlicer, 657) 658from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( 659 CustomFormatConcurrentStreamStateConverter, 660 DateTimeStreamStateConverter, 661) 662from airbyte_cdk.sources.streams.concurrent.state_converters.incrementing_count_stream_state_converter import ( 663 IncrementingCountStreamStateConverter, 664) 665from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction 666from airbyte_cdk.sources.types import Config 667from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer 668 669ComponentDefinition = Mapping[str, Any] 670 671SCHEMA_TRANSFORMER_TYPE_MAPPING = { 672 SchemaNormalizationModel.None_: TransformConfig.NoTransform, 673 SchemaNormalizationModel.Default: 
def __init__(
    self,
    limit_pages_fetched_per_slice: Optional[int] = None,
    limit_slices_fetched: Optional[int] = None,
    emit_connector_builder_messages: bool = False,
    disable_retries: bool = False,
    disable_cache: bool = False,
    message_repository: Optional[MessageRepository] = None,
    connector_state_manager: Optional[ConnectorStateManager] = None,
    max_concurrent_async_job_count: Optional[int] = None,
    configured_catalog: Optional[ConfiguredAirbyteCatalog] = None,
    api_budget: Optional[APIBudget] = None,
):
    """
    Set up the factory: register the model-to-constructor tables and store the options.

    :param limit_pages_fetched_per_slice: stored unchanged on the instance
    :param limit_slices_fetched: stored unchanged on the instance
    :param emit_connector_builder_messages: stored on the instance; also passed to
        `_evaluate_log_level` when the default message repository has to be created
    :param disable_retries: stored unchanged on the instance
    :param disable_cache: stored unchanged on the instance
    :param message_repository: repository to use; when falsy (e.g. None) an
        `InMemoryMessageRepository` is created instead
    :param connector_state_manager: state manager to use; when falsy a
        `ConnectorStateManager()` built with no arguments is used
    :param max_concurrent_async_job_count: budget handed to `JobTracker`; a falsy value
        (None or 0) is replaced by 1
    :param configured_catalog: catalog whose streams are indexed by stream name; a falsy
        catalog yields an empty index
    :param api_budget: stored unchanged on the instance
    """
    # Populate PYDANTIC_MODEL_TO_CONSTRUCTOR / TYPE_NAME_TO_MODEL before anything else.
    self._init_mappings()

    # Options that are kept verbatim.
    self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
    self._limit_slices_fetched = limit_slices_fetched
    self._emit_connector_builder_messages = emit_connector_builder_messages
    self._disable_retries = disable_retries
    self._disable_cache = disable_cache

    # Only build the in-memory repository (and evaluate the log level) when none was given.
    if not message_repository:
        log_level = self._evaluate_log_level(emit_connector_builder_messages)
        message_repository = InMemoryMessageRepository(log_level)
    self._message_repository = message_repository

    configured_streams_by_name = self._create_stream_name_to_configured_stream(
        configured_catalog
    )
    self._stream_name_to_configured_stream = configured_streams_by_name

    if not connector_state_manager:
        connector_state_manager = ConnectorStateManager()
    self._connector_state_manager = connector_state_manager

    self._api_budget: Optional[Union[APIBudget]] = api_budget

    # A missing (or zero) job count falls back to a single concurrent async job.
    job_limit = max_concurrent_async_job_count if max_concurrent_async_job_count else 1
    self._job_tracker: JobTracker = JobTracker(job_limit)

    # placeholder for deprecation warnings
    self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
def _init_mappings(self) -> None:
    """
    Build the two lookup tables stored on the instance.

    `PYDANTIC_MODEL_TO_CONSTRUCTOR` maps each Pydantic model class to the bound factory
    method that builds its component. `TYPE_NAME_TO_MODEL` maps each of those model
    classes' `__name__` back to the class.
    """
    # Every custom model is handled by the same generic factory method.
    custom_factory = self.create_custom_component
    custom_component_models = (
        CustomAuthenticatorModel,
        CustomBackoffStrategyModel,
        CustomDecoderModel,
        CustomErrorHandlerModel,
        CustomRecordExtractorModel,
        CustomRecordFilterModel,
        CustomRequesterModel,
        CustomRetrieverModel,
        CustomSchemaLoader,
        CustomSchemaNormalizationModel,
        CustomStateMigration,
        CustomPaginationStrategyModel,
        CustomPartitionRouterModel,
        CustomTransformationModel,
        CustomValidationStrategyModel,
        CustomConfigTransformationModel,
    )

    # Insertion order is kept identical to the historical literal.
    constructors: Dict[Type[BaseModel], Callable[..., Any]] = {
        AddedFieldDefinitionModel: self.create_added_field_definition,
        AddFieldsModel: self.create_add_fields,
        ApiKeyAuthenticatorModel: self.create_api_key_authenticator,
        BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
        BearerAuthenticatorModel: self.create_bearer_authenticator,
        CheckStreamModel: self.create_check_stream,
        DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config,
        CheckDynamicStreamModel: self.create_check_dynamic_stream,
        CompositeErrorHandlerModel: self.create_composite_error_handler,
        ConcurrencyLevelModel: self.create_concurrency_level,
        ConfigMigrationModel: self.create_config_migration,
        ConfigAddFieldsModel: self.create_config_add_fields,
        ConfigRemapFieldModel: self.create_config_remap_field,
        ConfigRemoveFieldsModel: self.create_config_remove_fields,
        ConstantBackoffStrategyModel: self.create_constant_backoff_strategy,
        CsvDecoderModel: self.create_csv_decoder,
        CursorPaginationModel: self.create_cursor_pagination,
        **dict.fromkeys(custom_component_models, custom_factory),
        DeclarativeStreamModel: self.create_default_stream,
        DefaultErrorHandlerModel: self.create_default_error_handler,
        DefaultPaginatorModel: self.create_default_paginator,
        DpathExtractorModel: self.create_dpath_extractor,
        DpathValidatorModel: self.create_dpath_validator,
        ResponseToFileExtractorModel: self.create_response_to_file_extractor,
        ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy,
        SessionTokenAuthenticatorModel: self.create_session_token_authenticator,
        GroupByKeyMergeStrategyModel: self.create_group_by_key,
        HttpRequesterModel: self.create_http_requester,
        HttpResponseFilterModel: self.create_http_response_filter,
        InlineSchemaLoaderModel: self.create_inline_schema_loader,
        JsonDecoderModel: self.create_json_decoder,
        JsonlDecoderModel: self.create_jsonl_decoder,
        JsonSchemaPropertySelectorModel: self.create_json_schema_property_selector,
        GzipDecoderModel: self.create_gzip_decoder,
        KeysToLowerModel: self.create_keys_to_lower_transformation,
        KeysToSnakeCaseModel: self.create_keys_to_snake_transformation,
        KeysReplaceModel: self.create_keys_replace_transformation,
        FlattenFieldsModel: self.create_flatten_fields,
        DpathFlattenFieldsModel: self.create_dpath_flatten_fields,
        IterableDecoderModel: self.create_iterable_decoder,
        XmlDecoderModel: self.create_xml_decoder,
        JsonFileSchemaLoaderModel: self.create_json_file_schema_loader,
        DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
        SchemaTypeIdentifierModel: self.create_schema_type_identifier,
        TypesMapModel: self.create_types_map,
        ComplexFieldTypeModel: self.create_complex_field_type,
        JwtAuthenticatorModel: self.create_jwt_authenticator,
        LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
        ListPartitionRouterModel: self.create_list_partition_router,
        MinMaxDatetimeModel: self.create_min_max_datetime,
        NoAuthModel: self.create_no_auth,
        NoPaginationModel: self.create_no_pagination,
        OAuthAuthenticatorModel: self.create_oauth_authenticator,
        OffsetIncrementModel: self.create_offset_increment,
        PageIncrementModel: self.create_page_increment,
        ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper,
        PredicateValidatorModel: self.create_predicate_validator,
        PropertiesFromEndpointModel: self.create_properties_from_endpoint,
        PropertyChunkingModel: self.create_property_chunking,
        QueryPropertiesModel: self.create_query_properties,
        RecordExpanderModel: self.create_record_expander,
        RecordFilterModel: self.create_record_filter,
        RecordSelectorModel: self.create_record_selector,
        RemoveFieldsModel: self.create_remove_fields,
        RequestPathModel: self.create_request_path,
        RequestOptionModel: self.create_request_option,
        LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator,
        SelectiveAuthenticatorModel: self.create_selective_authenticator,
        SimpleRetrieverModel: self.create_simple_retriever,
        StateDelegatingStreamModel: self.create_state_delegating_stream,
        SpecModel: self.create_spec,
        SubstreamPartitionRouterModel: self.create_substream_partition_router,
        ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema,
        WaitTimeFromHeaderModel: self.create_wait_time_from_header,
        WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header,
        AsyncRetrieverModel: self.create_async_retriever,
        HttpComponentsResolverModel: self.create_http_components_resolver,
        ConfigComponentsResolverModel: self.create_config_components_resolver,
        ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver,
        StreamConfigModel: self.create_stream_config,
        ComponentMappingDefinitionModel: self.create_components_mapping_definition,
        ZipfileDecoderModel: self.create_zipfile_decoder,
        HTTPAPIBudgetModel: self.create_http_api_budget,
        FileUploaderModel: self.create_file_uploader,
        FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
        MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
        UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
        RateModel: self.create_rate,
        HttpRequestRegexMatcherModel: self.create_http_request_matcher,
        GroupingPartitionRouterModel: self.create_grouping_partition_router,
    }
    self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = constructors

    # Needed for the case where we need to perform a second parse on the fields of a custom component
    self.TYPE_NAME_TO_MODEL = {model.__name__: model for model in constructors}
def create_component(
    self,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    config: Config,
    **kwargs: Any,
) -> Any:
    """
    Build a runtime declarative component (and its subcomponents) from a manifest mapping.

    The mapping is first parsed into the given Pydantic model type; the parsed model is then
    handed to `_create_component_from_model`, which builds the actual component.

    :param model_type: The type of declarative component that is being initialized
    :param component_definition: The mapping that represents a declarative component
    :param config: The connector config that is provided by the customer
    :param kwargs: Extra keyword arguments forwarded unchanged to `_create_component_from_model`
    :return: The declarative component to be used at runtime
    :raises ValueError: If the mapping's "type" entry differs from the model type's name, or if
        parsing produces an object that is not an instance of `model_type`
    """
    component_type = component_definition.get("type")
    expected_type_name = model_type.__name__
    if component_type != expected_type_name:
        raise ValueError(
            f"Expected manifest component of type {expected_type_name}, but received {component_type} instead"
        )

    parsed_model = model_type.parse_obj(component_definition)
    if isinstance(parsed_model, model_type):
        return self._create_component_from_model(model=parsed_model, config=config, **kwargs)

    # parse_obj handed back something that is not the requested model class.
    raise ValueError(
        f"Expected {expected_type_name} component, but received {parsed_model.__class__.__name__}"
    )
def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None:
    """
    Merge the deprecation logs carried by a model into the factory's collected logs.

    Nothing is collected when the model does not expose the attribute named by
    DEPRECATION_LOGS_TAG, or when its `_deprecation_logs` is None. A log that is already present
    in `_collected_deprecation_logs` is not appended a second time.

    Args:
        model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs.
    """
    if not hasattr(model, DEPRECATION_LOGS_TAG):
        return

    model_logs = model._deprecation_logs
    if model_logs is None:
        return

    for deprecation_log in model_logs:
        # Membership check keeps the collection free of duplicates.
        already_collected = deprecation_log in self._collected_deprecation_logs
        if not already_collected:
            self._collected_deprecation_logs.append(deprecation_log)
@staticmethod
def create_added_field_definition(
    model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
) -> AddedFieldDefinition:
    """
    Build an AddedFieldDefinition from its model.

    The model's `value` is wrapped in an InterpolatedString and its `value_type` is resolved to a
    Python type through `_json_schema_type_name_to_type`. `config` and any extra keyword
    arguments are accepted but not read by this constructor.

    :param model: The added-field definition model to convert
    :param config: The connector config (unused here)
    :return: The AddedFieldDefinition carrying the path, interpolated value, resolved type and parameters
    """
    value_template = InterpolatedString.create(model.value, parameters=model.parameters or {})
    field_path = model.path
    python_value_type = ModelToComponentFactory._json_schema_type_name_to_type(model.value_type)
    definition_parameters = model.parameters or {}

    return AddedFieldDefinition(
        path=field_path,
        value=value_template,
        value_type=python_value_type,
        parameters=definition_parameters,
    )
def create_flatten_fields(
    self, model: FlattenFieldsModel, config: Config, **kwargs: Any
) -> FlattenFields:
    """
    Build a FlattenFields transformation from its model.

    `flatten_lists` defaults to True only when the model leaves it as None; an explicit False
    on the model is passed through as False.

    :param model: The flatten-fields model to convert
    :param config: The connector config (unused here)
    :return: The configured FlattenFields transformation
    """
    flatten_lists = model.flatten_lists
    if flatten_lists is None:
        flatten_lists = True
    return FlattenFields(flatten_lists=flatten_lists)
@staticmethod
def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
    """
    Translate a ValueType into the matching Python builtin type.

    :param value_type: The value type to translate; a falsy value (such as None) means "not specified"
    :return: `str`, `float`, `int` or `bool` for the string, number, integer and boolean members
        respectively, or None when no value type was given
    :raises KeyError: If `value_type` is truthy but is not one of the four mapped members
    """
    if value_type:
        python_type_by_value_type = {
            ValueType.string: str,
            ValueType.number: float,
            ValueType.integer: int,
            ValueType.boolean: bool,
        }
        return python_type_by_value_type[value_type]
    return None
def create_legacy_to_per_partition_state_migration(
    self,
    model: LegacyToPerPartitionStateMigrationModel,
    config: Mapping[str, Any],
    declarative_stream: DeclarativeStreamModel,
) -> LegacyToPerPartitionStateMigration:
    """
    Build the migration that converts legacy substream state into per-partition state.

    The stream model is validated before the migration is constructed: it must use a
    SimpleRetriever or AsyncRetriever, that retriever's partition router must be a substream or
    custom partition router exposing `parent_stream_configs`, and the stream must define an
    incremental sync.

    :param model: The migration model (only used to select this constructor)
    :param config: The connector config passed through to the migration
    :param declarative_stream: The stream model the migration is attached to
    :return: The migration built from the stream's partition router, incremental sync, the
        config and the stream's parameters
    :raises ValueError: If any of the validations described above fails
    """
    retriever = declarative_stream.retriever
    if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
        raise ValueError(
            f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
        )

    partition_router = retriever.partition_router
    if not isinstance(
        partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
    ):
        raise ValueError(
            f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
        )
    if not hasattr(partition_router, "parent_stream_configs"):
        raise ValueError(
            "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
        )

    # Test the value rather than the attribute's existence: hasattr() is also true for an
    # attribute that is present but set to None, so it could not catch a stream without an
    # incremental sync. NOTE(review): assumes `incremental_sync` is a declared optional field
    # on DeclarativeStreamModel - confirm against the model definition.
    incremental_sync = getattr(declarative_stream, "incremental_sync", None)
    if incremental_sync is None:
        raise ValueError(
            "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
        )

    return LegacyToPerPartitionStateMigration(
        partition_router,  # type: ignore # was already checked above
        incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
        config,
        declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
    )
@staticmethod
def create_basic_http_authenticator(
    model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
) -> BasicHttpAuthenticator:
    """
    Build a BasicHttpAuthenticator from its model.

    A missing (falsy) password becomes the empty string and missing parameters become an
    empty dict; the username is passed through as-is.

    :param model: The basic HTTP authenticator model to convert
    :param config: The connector config handed to the authenticator
    :return: The configured BasicHttpAuthenticator
    """
    password = model.password or ""
    authenticator_parameters = model.parameters or {}
    return BasicHttpAuthenticator(
        username=model.username,
        password=password,
        config=config,
        parameters=authenticator_parameters,
    )
def create_check_stream(
    self, model: CheckStreamModel, config: Config, **kwargs: Any
) -> CheckStream:
    """
    Build a CheckStream from its model.

    Each entry of the model's `dynamic_streams_check_configs` is turned into a component
    through `_create_component_from_model`; a missing or empty list yields no dynamic configs.

    :param model: The check-stream model to convert
    :param config: The connector config used to build the dynamic stream check configs
    :return: The CheckStream holding the stream names and dynamic stream check configs
    :raises ValueError: If both `stream_names` and `dynamic_streams_check_configs` are None
    """
    nothing_to_check = (
        model.dynamic_streams_check_configs is None and model.stream_names is None
    )
    if nothing_to_check:
        raise ValueError(
            "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
        )

    check_configs = [
        self._create_component_from_model(model=check_config_model, config=config)
        for check_config_model in (model.dynamic_streams_check_configs or [])
    ]
    stream_names = model.stream_names or []

    return CheckStream(
        stream_names=stream_names,
        dynamic_streams_check_configs=check_configs,
        parameters={},
    )
Any 1291 ) -> CompositeErrorHandler: 1292 error_handlers = [ 1293 self._create_component_from_model(model=error_handler_model, config=config) 1294 for error_handler_model in model.error_handlers 1295 ] 1296 return CompositeErrorHandler( 1297 error_handlers=error_handlers, parameters=model.parameters or {} 1298 ) 1299 1300 @staticmethod 1301 def create_concurrency_level( 1302 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1303 ) -> ConcurrencyLevel: 1304 return ConcurrencyLevel( 1305 default_concurrency=model.default_concurrency, 1306 max_concurrency=model.max_concurrency, 1307 config=config, 1308 parameters={}, 1309 ) 1310 1311 @staticmethod 1312 def apply_stream_state_migrations( 1313 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1314 ) -> MutableMapping[str, Any]: 1315 if stream_state_migrations: 1316 for state_migration in stream_state_migrations: 1317 if state_migration.should_migrate(stream_state): 1318 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
1319 stream_state = dict(state_migration.migrate(stream_state)) 1320 return stream_state 1321 1322 def create_concurrent_cursor_from_datetime_based_cursor( 1323 self, 1324 model_type: Type[BaseModel], 1325 component_definition: ComponentDefinition, 1326 stream_name: str, 1327 stream_namespace: Optional[str], 1328 stream_state: MutableMapping[str, Any], 1329 config: Config, 1330 message_repository: Optional[MessageRepository] = None, 1331 runtime_lookback_window: Optional[datetime.timedelta] = None, 1332 **kwargs: Any, 1333 ) -> ConcurrentCursor: 1334 component_type = component_definition.get("type") 1335 if component_definition.get("type") != model_type.__name__: 1336 raise ValueError( 1337 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1338 ) 1339 1340 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1341 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1342 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1343 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 
1344 if "$parameters" not in component_definition and "parameters" in component_definition: 1345 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1346 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1347 1348 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1349 raise ValueError( 1350 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1351 ) 1352 1353 model_parameters = datetime_based_cursor_model.parameters or {} 1354 1355 cursor_field = self._get_catalog_defined_cursor_field( 1356 stream_name=stream_name, 1357 allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1358 or False, 1359 ) 1360 1361 if not cursor_field: 1362 interpolated_cursor_field = InterpolatedString.create( 1363 datetime_based_cursor_model.cursor_field, 1364 parameters=model_parameters, 1365 ) 1366 cursor_field = CursorField( 1367 cursor_field_key=interpolated_cursor_field.eval(config=config), 1368 supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1369 or False, 1370 ) 1371 1372 interpolated_partition_field_start = InterpolatedString.create( 1373 datetime_based_cursor_model.partition_field_start or "start_time", 1374 parameters=model_parameters, 1375 ) 1376 interpolated_partition_field_end = InterpolatedString.create( 1377 datetime_based_cursor_model.partition_field_end or "end_time", 1378 parameters=model_parameters, 1379 ) 1380 1381 slice_boundary_fields = ( 1382 interpolated_partition_field_start.eval(config=config), 1383 interpolated_partition_field_end.eval(config=config), 1384 ) 1385 1386 datetime_format = datetime_based_cursor_model.datetime_format 1387 1388 cursor_granularity = ( 1389 parse_duration(datetime_based_cursor_model.cursor_granularity) 1390 if datetime_based_cursor_model.cursor_granularity 1391 else None 1392 ) 1393 1394 
lookback_window = None 1395 interpolated_lookback_window = ( 1396 InterpolatedString.create( 1397 datetime_based_cursor_model.lookback_window, 1398 parameters=model_parameters, 1399 ) 1400 if datetime_based_cursor_model.lookback_window 1401 else None 1402 ) 1403 if interpolated_lookback_window: 1404 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1405 if evaluated_lookback_window: 1406 lookback_window = parse_duration(evaluated_lookback_window) 1407 1408 connector_state_converter: DateTimeStreamStateConverter 1409 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1410 datetime_format=datetime_format, 1411 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1412 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1413 cursor_granularity=cursor_granularity, 1414 ) 1415 1416 # Adjusts the stream state by applying the runtime lookback window. 1417 # This is used to ensure correct state handling in case of failed partitions. 
1418 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1419 if runtime_lookback_window and stream_state_value: 1420 new_stream_state = ( 1421 connector_state_converter.parse_timestamp(stream_state_value) 1422 - runtime_lookback_window 1423 ) 1424 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1425 new_stream_state 1426 ) 1427 1428 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1429 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1430 start_date_runtime_value = self.create_min_max_datetime( 1431 model=datetime_based_cursor_model.start_datetime, config=config 1432 ) 1433 else: 1434 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1435 1436 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1437 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1438 end_date_runtime_value = self.create_min_max_datetime( 1439 model=datetime_based_cursor_model.end_datetime, config=config 1440 ) 1441 else: 1442 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1443 1444 interpolated_start_date = MinMaxDatetime.create( 1445 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1446 parameters=datetime_based_cursor_model.parameters, 1447 ) 1448 interpolated_end_date = ( 1449 None 1450 if not end_date_runtime_value 1451 else MinMaxDatetime.create( 1452 end_date_runtime_value, datetime_based_cursor_model.parameters 1453 ) 1454 ) 1455 1456 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1457 if not interpolated_start_date.datetime_format: 1458 interpolated_start_date.datetime_format = datetime_format 1459 if interpolated_end_date and not interpolated_end_date.datetime_format: 1460 interpolated_end_date.datetime_format = datetime_format 1461 1462 start_date = interpolated_start_date.get_datetime(config=config) 1463 
end_date_provider = ( 1464 partial(interpolated_end_date.get_datetime, config) 1465 if interpolated_end_date 1466 else connector_state_converter.get_end_provider() 1467 ) 1468 1469 if ( 1470 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1471 ) or ( 1472 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1473 ): 1474 raise ValueError( 1475 f"If step is defined, cursor_granularity should be as well and vice-versa. " 1476 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1477 ) 1478 1479 # When step is not defined, default to a step size from the starting date to the present moment 1480 step_length = datetime.timedelta.max 1481 interpolated_step = ( 1482 InterpolatedString.create( 1483 datetime_based_cursor_model.step, 1484 parameters=model_parameters, 1485 ) 1486 if datetime_based_cursor_model.step 1487 else None 1488 ) 1489 if interpolated_step: 1490 evaluated_step = interpolated_step.eval(config) 1491 if evaluated_step: 1492 step_length = parse_duration(evaluated_step) 1493 1494 clamping_strategy: ClampingStrategy = NoClamping() 1495 if datetime_based_cursor_model.clamping: 1496 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1497 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1498 # object which we want to keep agnostic of being low-code 1499 target = InterpolatedString( 1500 string=datetime_based_cursor_model.clamping.target, 1501 parameters=model_parameters, 1502 ) 1503 evaluated_target = target.eval(config=config) 1504 match evaluated_target: 1505 case "DAY": 1506 clamping_strategy = DayClampingStrategy() 1507 end_date_provider = ClampingEndProvider( 1508 DayClampingStrategy(is_ceiling=False), 1509 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown 
in existing tests. Confirmed functionality is working in practice 1510 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1511 ) 1512 case "WEEK": 1513 if ( 1514 not datetime_based_cursor_model.clamping.target_details 1515 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1516 ): 1517 raise ValueError( 1518 "Given WEEK clamping, weekday needs to be provided as target_details" 1519 ) 1520 weekday = self._assemble_weekday( 1521 datetime_based_cursor_model.clamping.target_details["weekday"] 1522 ) 1523 clamping_strategy = WeekClampingStrategy(weekday) 1524 end_date_provider = ClampingEndProvider( 1525 WeekClampingStrategy(weekday, is_ceiling=False), 1526 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1527 granularity=cursor_granularity or datetime.timedelta(days=1), 1528 ) 1529 case "MONTH": 1530 clamping_strategy = MonthClampingStrategy() 1531 end_date_provider = ClampingEndProvider( 1532 MonthClampingStrategy(is_ceiling=False), 1533 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
def create_concurrent_cursor_from_incrementing_count_cursor(
    self,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    stream_name: str,
    stream_namespace: Optional[str],
    stream_state: MutableMapping[str, Any],
    config: Config,
    message_repository: Optional[MessageRepository] = None,
    **kwargs: Any,
) -> ConcurrentCursor:
    """
    Builds a ConcurrentCursor from an IncrementingCountCursor component definition.

    :param model_type: The Pydantic model class used to parse the definition; its name must equal the definition's `type`
    :param component_definition: The raw component definition of the cursor
    :param stream_name: The name of the stream the cursor belongs to
    :param stream_namespace: The namespace of the stream, if any
    :param stream_state: The state handed to the cursor as-is
    :param config: The connector config used to evaluate interpolated values
    :param message_repository: Repository used by the cursor; the factory's own repository is used when omitted
    :param kwargs: Accepted for signature compatibility; not read by this method
    :return: The cursor, starting at the evaluated `start_value` (0 when no start value is defined)
    :raises ValueError: If the definition's `type` is not `model_type`, or the parsed model is not an IncrementingCountCursorModel
    """
    declared_type = component_definition.get("type")
    if declared_type != model_type.__name__:
        raise ValueError(
            f"Expected manifest component of type {model_type.__name__}, but received {declared_type} instead"
        )

    cursor_model = model_type.parse_obj(component_definition)
    if not isinstance(cursor_model, IncrementingCountCursorModel):
        raise ValueError(
            f"Expected {model_type.__name__} component, but received {cursor_model.__class__.__name__}"
        )

    # Pydantic Union coercion may hand the start value back as an int or as a str (e.g. 0 vs '0')
    # depending on the Union order, so it is always stringified, interpolated, and cast back to int.
    evaluated_start_value: int = 0
    raw_start_value: Union[int, str, None] = cursor_model.start_value
    if raw_start_value is not None:
        start_value_template = InterpolatedString.create(
            str(raw_start_value),  # InterpolatedString.create expects a string
            parameters=cursor_model.parameters or {},
        )
        evaluated_start_value = int(start_value_template.eval(config=config))

    allows_catalog_cursor = cursor_model.allow_catalog_defined_cursor_field or False
    cursor_field = self._get_catalog_defined_cursor_field(
        stream_name=stream_name,
        allow_catalog_defined_cursor_field=allows_catalog_cursor,
    )
    if not cursor_field:
        # No cursor field came from the catalog lookup: fall back to the one declared on the component
        cursor_field_template = InterpolatedString.create(
            cursor_model.cursor_field,
            parameters=cursor_model.parameters or {},
        )
        cursor_field = CursorField(
            cursor_field_key=cursor_field_template.eval(config=config),
            supports_catalog_defined_cursor_field=allows_catalog_cursor,
        )

    state_converter = IncrementingCountStreamStateConverter(
        is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
    )

    return ConcurrentCursor(
        stream_name=stream_name,
        stream_namespace=stream_namespace,
        stream_state=stream_state,
        message_repository=message_repository or self._message_repository,
        connector_state_manager=self._connector_state_manager,
        connector_state_converter=state_converter,
        cursor_field=cursor_field,
        slice_boundary_fields=None,
        start=evaluated_start_value,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        end_provider=state_converter.get_end_provider(),  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
    )
This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1668 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1669 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1670 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1671 if "$parameters" not in component_definition and "parameters" in component_definition: 1672 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1673 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1674 1675 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1676 raise ValueError( 1677 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1678 ) 1679 1680 cursor_field = self._get_catalog_defined_cursor_field( 1681 stream_name=stream_name, 1682 allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1683 or False, 1684 ) 1685 1686 if not cursor_field: 1687 interpolated_cursor_field = InterpolatedString.create( 1688 datetime_based_cursor_model.cursor_field, 1689 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). 
So now we have two cases: 1690 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1691 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1692 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1693 parameters=datetime_based_cursor_model.parameters or {}, 1694 ) 1695 cursor_field = CursorField( 1696 cursor_field_key=interpolated_cursor_field.eval(config=config), 1697 supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1698 or False, 1699 ) 1700 1701 datetime_format = datetime_based_cursor_model.datetime_format 1702 1703 cursor_granularity = ( 1704 parse_duration(datetime_based_cursor_model.cursor_granularity) 1705 if datetime_based_cursor_model.cursor_granularity 1706 else None 1707 ) 1708 1709 connector_state_converter: DateTimeStreamStateConverter 1710 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1711 datetime_format=datetime_format, 1712 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1713 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1714 cursor_granularity=cursor_granularity, 1715 ) 1716 1717 # Create the cursor factory 1718 cursor_factory = ConcurrentCursorFactory( 1719 partial( 1720 self.create_concurrent_cursor_from_datetime_based_cursor, 1721 state_manager=state_manager, 1722 model_type=model_type, 1723 component_definition=component_definition, 1724 stream_name=stream_name, 1725 stream_namespace=stream_namespace, 1726 config=config, 1727 message_repository=NoopMessageRepository(), 1728 ) 1729 ) 1730 1731 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1732 use_global_cursor = isinstance( 1733 partition_router, GroupingPartitionRouter 1734 ) or 
component_definition.get("global_substream_cursor", False) 1735 1736 # Return the concurrent cursor and state converter 1737 return ConcurrentPerPartitionCursor( 1738 cursor_factory=cursor_factory, 1739 partition_router=partition_router, 1740 stream_name=stream_name, 1741 stream_namespace=stream_namespace, 1742 stream_state=stream_state, 1743 message_repository=self._message_repository, # type: ignore 1744 connector_state_manager=state_manager, 1745 connector_state_converter=connector_state_converter, 1746 cursor_field=cursor_field, 1747 use_global_cursor=use_global_cursor, 1748 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1749 ) 1750 1751 @staticmethod 1752 def create_constant_backoff_strategy( 1753 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1754 ) -> ConstantBackoffStrategy: 1755 ModelToComponentFactory._validate_jitter_range(model.jitter_range_in_seconds) 1756 return ConstantBackoffStrategy( 1757 backoff_time_in_seconds=model.backoff_time_in_seconds, 1758 jitter_range_in_seconds=model.jitter_range_in_seconds, 1759 config=config, 1760 parameters=model.parameters or {}, 1761 ) 1762 1763 @staticmethod 1764 def _validate_jitter_range(jitter_range_in_seconds: Optional[float]) -> None: 1765 if jitter_range_in_seconds is not None and jitter_range_in_seconds < 0: 1766 raise ValueError("jitter_range_in_seconds must be greater than or equal to 0") 1767 1768 def create_cursor_pagination( 1769 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1770 ) -> CursorPaginationStrategy: 1771 if isinstance(decoder, PaginationDecoderDecorator): 1772 inner_decoder = decoder.decoder 1773 else: 1774 inner_decoder = decoder 1775 decoder = PaginationDecoderDecorator(decoder=decoder) 1776 1777 if self._is_supported_decoder_for_pagination(inner_decoder): 1778 decoder_to_use = decoder 1779 else: 1780 raise ValueError( 1781 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 
def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any:
    """
    Generically creates a custom component based on the model type and a class_name reference to the custom Python class being
    instantiated. Only the arguments that match a type-hinted field of the custom class are passed to its constructor.

    :param model: The Pydantic model of the custom component being created
    :param config: The custom defined connector config
    :param kwargs: Arguments handed down by a parent component; on a name collision they win over the model's own fields
    :return: The declarative component built from the Pydantic model to be used at runtime
    """
    custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name)
    component_fields = get_type_hints(custom_component_class)

    # Precedence (lowest to highest): model fields, the connector config, parent-provided kwargs
    model_args = model.dict()
    model_args["config"] = config
    model_args.update(kwargs)

    def infer_missing_type(field_name: str, candidate: Any) -> None:
        # A dict without an explicit "type" gets one derived from the custom class' type hint, when one can be derived
        if (
            isinstance(candidate, dict)
            and "type" not in candidate
            and field_name in component_fields
        ):
            inferred_type = self._derive_component_type_from_type_hints(
                component_fields.get(field_name)
            )
            if inferred_type:
                candidate["type"] = inferred_type

    def build_subcomponent(field_name: str, candidate: Any) -> Any:
        return self._create_nested_component(model, field_name, candidate, config, **kwargs)

    # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and
    # types are only defined within the Python class implementation. This second pass converts such sub-fields into
    # declarative components. Only values of existing keys are replaced, so the dict never changes size while iterated.
    # NOTE(review): "config" and the parent kwargs are part of model_args, so a dict among them carrying a "type" key
    # would also be treated as a subcomponent here - confirm this is intended.
    for model_field, model_value in model_args.items():
        infer_missing_type(model_field, model_value)

        if self._is_component(model_value):
            model_args[model_field] = build_subcomponent(model_field, model_value)
        elif isinstance(model_value, list):
            built_items = []
            for item in model_value:
                infer_missing_type(model_field, item)
                built_items.append(
                    build_subcomponent(model_field, item) if self._is_component(item) else item
                )
            model_args[model_field] = built_items

    constructor_kwargs = {
        field_name: model_args[field_name]
        for field_name in component_fields
        if field_name in model_args
    }

    # Custom components declaring an api_budget field share the factory's budget unless one was given explicitly
    if "api_budget" in component_fields and constructor_kwargs.get("api_budget") is None:
        constructor_kwargs["api_budget"] = self._api_budget

    return custom_component_class(**constructor_kwargs)
1880 1881 If a custom components module is needed, we assume it is already registered - probably 1882 as `source_declarative_manifest.components` or `components`. 1883 1884 Args: 1885 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1886 1887 Returns: 1888 Any: The class object. 1889 1890 Raises: 1891 ValueError: If the class cannot be loaded. 1892 """ 1893 split = full_qualified_class_name.split(".") 1894 module_name_full = ".".join(split[:-1]) 1895 class_name = split[-1] 1896 1897 try: 1898 module_ref = importlib.import_module(module_name_full) 1899 except ModuleNotFoundError as e: 1900 if split[0] == "source_declarative_manifest": 1901 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1902 try: 1903 import os 1904 1905 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1906 module_ref = importlib.import_module( 1907 module_name_with_source_declarative_manifest 1908 ) 1909 except ModuleNotFoundError: 1910 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1911 else: 1912 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1913 1914 try: 1915 return getattr(module_ref, class_name) 1916 except AttributeError as e: 1917 raise ValueError( 1918 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1919 ) from e 1920 1921 @staticmethod 1922 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1923 interface = field_type 1924 while True: 1925 origin = get_origin(interface) 1926 if origin: 1927 # Unnest types until we reach the raw type 1928 # List[T] -> T 1929 # Optional[List[T]] -> T 1930 args = get_args(interface) 1931 interface = args[0] 1932 else: 1933 break 1934 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 
1935 return interface.__name__ 1936 return None 1937 1938 @staticmethod 1939 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1940 if not cls: 1941 return False 1942 return cls.__module__ == "builtins" 1943 1944 @staticmethod 1945 def _extract_missing_parameters(error: TypeError) -> List[str]: 1946 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1947 if parameter_search: 1948 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1949 else: 1950 return [] 1951 1952 def _create_nested_component( 1953 self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any 1954 ) -> Any: 1955 type_name = model_value.get("type", None) 1956 if not type_name: 1957 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1958 return model_value 1959 1960 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1961 if model_type: 1962 parsed_model = model_type.parse_obj(model_value) 1963 try: 1964 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1965 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1966 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1967 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1968 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1969 # are needed by a component and could not be shared. 
1970 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1971 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1972 model_parameters = model_value.get("$parameters", {}) 1973 matching_parameters = { 1974 kwarg: model_parameters[kwarg] 1975 for kwarg in constructor_kwargs 1976 if kwarg in model_parameters 1977 } 1978 matching_kwargs = { 1979 kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs 1980 } 1981 return self._create_component_from_model( 1982 model=parsed_model, config=config, **(matching_parameters | matching_kwargs) 1983 ) 1984 except TypeError as error: 1985 missing_parameters = self._extract_missing_parameters(error) 1986 if missing_parameters: 1987 raise ValueError( 1988 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1989 + ", ".join( 1990 ( 1991 f"{type_name}.$parameters.{parameter}" 1992 for parameter in missing_parameters 1993 ) 1994 ) 1995 ) 1996 raise TypeError( 1997 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1998 ) 1999 else: 2000 raise ValueError( 2001 f"Error creating custom component {model.class_name}. 
Subcomponent creation has not been implemented for '{type_name}'" 2002 ) 2003 2004 @staticmethod 2005 def _is_component(model_value: Any) -> bool: 2006 return isinstance(model_value, dict) and model_value.get("type") is not None 2007 2008 def create_default_stream( 2009 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 2010 ) -> AbstractStream: 2011 primary_key = model.primary_key.__root__ if model.primary_key else None 2012 self._migrate_state(model, config) 2013 2014 partition_router = self._build_stream_slicer_from_partition_router( 2015 model.retriever, 2016 config, 2017 stream_name=model.name, 2018 **kwargs, 2019 ) 2020 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 2021 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 2022 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 2023 2024 end_time_option = ( 2025 self._create_component_from_model( 2026 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 2027 ) 2028 if cursor_model.end_time_option 2029 else None 2030 ) 2031 start_time_option = ( 2032 self._create_component_from_model( 2033 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 2034 ) 2035 if cursor_model.start_time_option 2036 else None 2037 ) 2038 2039 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 2040 start_time_option=start_time_option, 2041 end_time_option=end_time_option, 2042 partition_field_start=cursor_model.partition_field_start, 2043 partition_field_end=cursor_model.partition_field_end, 2044 config=config, 2045 parameters=model.parameters or {}, 2046 ) 2047 request_options_provider = ( 2048 datetime_request_options_provider 2049 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 2050 else PerPartitionRequestOptionsProvider( 2051 partition_router, datetime_request_options_provider 2052 ) 2053 ) 2054 elif 
model.incremental_sync and isinstance( 2055 model.incremental_sync, IncrementingCountCursorModel 2056 ): 2057 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 2058 raise ValueError( 2059 "PerPartition does not support per partition states because switching to global state is time based" 2060 ) 2061 2062 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 2063 2064 start_time_option = ( 2065 self._create_component_from_model( 2066 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2067 config, 2068 parameters=cursor_model.parameters or {}, 2069 ) 2070 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2071 else None 2072 ) 2073 2074 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2075 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2076 partition_field_start = "start" 2077 2078 request_options_provider = DatetimeBasedRequestOptionsProvider( 2079 start_time_option=start_time_option, 2080 partition_field_start=partition_field_start, 2081 config=config, 2082 parameters=model.parameters or {}, 2083 ) 2084 else: 2085 request_options_provider = None 2086 2087 transformations = [] 2088 if model.transformations: 2089 for transformation_model in model.transformations: 2090 transformations.append( 2091 self._create_component_from_model(model=transformation_model, config=config) 2092 ) 2093 file_uploader = None 2094 if model.file_uploader: 2095 file_uploader = self._create_component_from_model( 2096 model=model.file_uploader, config=config 2097 ) 2098 2099 stream_slicer: ConcurrentStreamSlicer = ( 2100 partition_router 2101 if isinstance(concurrent_cursor, FinalStateCursor) 2102 else concurrent_cursor 2103 ) 2104 2105 retriever = self._create_component_from_model( 2106 model=model.retriever, 2107 config=config, 2108 name=model.name, 
2109 primary_key=primary_key, 2110 request_options_provider=request_options_provider, 2111 stream_slicer=stream_slicer, 2112 partition_router=partition_router, 2113 has_stop_condition_cursor=self._is_stop_condition_on_cursor(model), 2114 is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model), 2115 cursor=concurrent_cursor, 2116 transformations=transformations, 2117 file_uploader=file_uploader, 2118 incremental_sync=model.incremental_sync, 2119 ) 2120 if isinstance(retriever, AsyncRetriever): 2121 stream_slicer = retriever.stream_slicer 2122 2123 schema_loader: SchemaLoader 2124 if model.schema_loader and isinstance(model.schema_loader, list): 2125 nested_schema_loaders = [ 2126 self._create_component_from_model(model=nested_schema_loader, config=config) 2127 for nested_schema_loader in model.schema_loader 2128 ] 2129 schema_loader = CompositeSchemaLoader( 2130 schema_loaders=nested_schema_loaders, parameters={} 2131 ) 2132 elif model.schema_loader: 2133 schema_loader = self._create_component_from_model( 2134 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2135 config=config, 2136 ) 2137 else: 2138 options = model.parameters or {} 2139 if "name" not in options: 2140 options["name"] = model.name 2141 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2142 schema_loader = CachingSchemaLoaderDecorator(schema_loader) 2143 2144 stream_name = model.name or "" 2145 return DefaultStream( 2146 partition_generator=StreamSlicerPartitionGenerator( 2147 DeclarativePartitionFactory( 2148 stream_name, 2149 schema_loader, 2150 retriever, 2151 self._message_repository, 2152 ), 2153 stream_slicer, 2154 slice_limit=self._limit_slices_fetched, 2155 ), 2156 name=stream_name, 2157 json_schema=schema_loader.get_json_schema, 2158 primary_key=get_primary_key_from_stream(primary_key), 2159 cursor_field=( 2160 concurrent_cursor.cursor_field 2161 if 
hasattr(concurrent_cursor, "cursor_field") 2162 else None 2163 ), 2164 logger=logging.getLogger(f"airbyte.{stream_name}"), 2165 cursor=concurrent_cursor, 2166 supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), 2167 ) 2168 2169 def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: 2170 stream_name = model.name or "" 2171 stream_state = self._connector_state_manager.get_stream_state( 2172 stream_name=stream_name, namespace=None 2173 ) 2174 if model.state_migrations: 2175 state_transformations = [ 2176 self._create_component_from_model(state_migration, config, declarative_stream=model) 2177 for state_migration in model.state_migrations 2178 ] 2179 else: 2180 state_transformations = [] 2181 stream_state = self.apply_stream_state_migrations(state_transformations, stream_state) 2182 self._connector_state_manager.update_state_for_stream( 2183 stream_name=stream_name, namespace=None, value=stream_state 2184 ) 2185 2186 def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool: 2187 return bool( 2188 model.incremental_sync 2189 and hasattr(model.incremental_sync, "is_data_feed") 2190 and model.incremental_sync.is_data_feed 2191 ) 2192 2193 def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool: 2194 return bool( 2195 model.incremental_sync 2196 and hasattr(model.incremental_sync, "is_client_side_incremental") 2197 and model.incremental_sync.is_client_side_incremental 2198 ) 2199 2200 def _build_stream_slicer_from_partition_router( 2201 self, 2202 model: Union[ 2203 AsyncRetrieverModel, 2204 CustomRetrieverModel, 2205 SimpleRetrieverModel, 2206 ], 2207 config: Config, 2208 stream_name: Optional[str] = None, 2209 **kwargs: Any, 2210 ) -> PartitionRouter: 2211 if ( 2212 hasattr(model, "partition_router") 2213 and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel)) 2214 and model.partition_router 2215 ): 2216 stream_slicer_model = 
def _build_concurrent_cursor(
    self,
    model: DeclarativeStreamModel,
    stream_slicer: Optional[PartitionRouter],
    config: Config,
) -> Cursor:
    """
    Picks and builds the concurrent cursor matching the stream's incremental sync and partition routing.

    * No incremental sync: a FinalStateCursor
    * Incremental sync with a partition router other than SinglePartitionRouter: a per-partition cursor
    * Incremental sync without partition routing: a cursor built from the exact incremental sync model type

    :param model: The stream model providing the stream name and its `incremental_sync`
    :param stream_slicer: The partition router of the stream, if any
    :param config: The connector config; an empty mapping is passed down when it is falsy
    :return: The cursor to use for the stream
    :raises ValueError: If an IncrementingCountCursor is combined with a partition router, or if the incremental sync
        type is neither IncrementingCountCursorModel nor DatetimeBasedCursorModel
    """
    stream_name = model.name or ""
    stream_state = self._connector_state_manager.get_stream_state(stream_name, None)
    incremental_sync = model.incremental_sync

    if not incremental_sync:
        return FinalStateCursor(stream_name, None, self._message_repository)

    if stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter):
        if isinstance(incremental_sync, IncrementingCountCursorModel):
            # We don't currently support usage of partition routing and IncrementingCountCursor at the
            # same time because we didn't solve for design questions like what the lookback window would
            # be as well as global cursor fall backs. We have not seen customers that have needed both
            # at the same time yet and are currently punting on this until we need to solve it.
            raise ValueError(
                f"The low-code framework does not currently support usage of a PartitionRouter and an IncrementingCountCursor at the same time. Please specify only one of these options for stream {stream_name}."
            )
        return self.create_concurrent_cursor_from_perpartition_cursor(  # type: ignore  # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
            state_manager=self._connector_state_manager,
            model_type=DatetimeBasedCursorModel,
            component_definition=incremental_sync.__dict__,
            stream_name=stream_name,
            stream_state=stream_state,
            stream_namespace=None,
            config=config or {},
            partition_router=stream_slicer,
            attempt_to_create_cursor_if_not_provided=True,  # FIXME can we remove that now?
        )

    # Without partition routing the exact model class decides the cursor; subclasses are deliberately not matched
    sync_type = type(incremental_sync)
    if sync_type is IncrementingCountCursorModel:
        return self.create_concurrent_cursor_from_incrementing_count_cursor(  # type: ignore  # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
            model_type=IncrementingCountCursorModel,
            component_definition=incremental_sync.__dict__,
            stream_name=stream_name,
            stream_namespace=None,
            stream_state=stream_state,
            config=config or {},
        )
    if sync_type is DatetimeBasedCursorModel:
        return self.create_concurrent_cursor_from_datetime_based_cursor(  # type: ignore  # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
            model_type=sync_type,
            component_definition=incremental_sync.__dict__,
            stream_name=stream_name,
            stream_namespace=None,
            stream_state=stream_state,
            config=config or {},
            attempt_to_create_cursor_if_not_provided=True,
        )
    raise ValueError(f"Incremental sync of type {sync_type} is not supported")
HttpResponseFilter(config=config, parameters=model.parameters or {}) 2327 ) 2328 2329 return DefaultErrorHandler( 2330 backoff_strategies=backoff_strategies, 2331 max_retries=model.max_retries, 2332 response_filters=response_filters, 2333 config=config, 2334 parameters=model.parameters or {}, 2335 ) 2336 2337 def create_default_paginator( 2338 self, 2339 model: DefaultPaginatorModel, 2340 config: Config, 2341 *, 2342 url_base: str, 2343 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2344 decoder: Optional[Decoder] = None, 2345 cursor_used_for_stop_condition: Optional[Cursor] = None, 2346 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2347 if decoder: 2348 if self._is_supported_decoder_for_pagination(decoder): 2349 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2350 else: 2351 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2352 else: 2353 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2354 page_size_option = ( 2355 self._create_component_from_model(model=model.page_size_option, config=config) 2356 if model.page_size_option 2357 else None 2358 ) 2359 page_token_option = ( 2360 self._create_component_from_model(model=model.page_token_option, config=config) 2361 if model.page_token_option 2362 else None 2363 ) 2364 pagination_strategy = self._create_component_from_model( 2365 model=model.pagination_strategy, 2366 config=config, 2367 decoder=decoder_to_use, 2368 extractor_model=extractor_model, 2369 ) 2370 if cursor_used_for_stop_condition: 2371 pagination_strategy = StopConditionPaginationStrategyDecorator( 2372 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2373 ) 2374 paginator = DefaultPaginator( 2375 decoder=decoder_to_use, 2376 page_size_option=page_size_option, 2377 page_token_option=page_token_option, 2378 pagination_strategy=pagination_strategy, 2379 url_base=url_base, 2380 config=config, 2381 
def create_dpath_extractor(
    self,
    model: DpathExtractorModel,
    config: Config,
    decoder: Optional[Decoder] = None,
    **kwargs: Any,
) -> DpathExtractor:
    """Build a ``DpathExtractor`` from its declarative model.

    :param model: parsed extractor definition (field path, optional record expander)
    :param config: connector configuration
    :param decoder: decoder to use; a ``JsonDecoder`` is created when none (or a
        falsy one) is given
    :param kwargs: accepted for interface compatibility; not used here
    :return: the configured extractor
    """
    effective_decoder = decoder or JsonDecoder(parameters={})

    # Copy the path into a fresh list typed the way DpathExtractor expects.
    path_segments: List[Union[InterpolatedString, str]] = list(model.field_path)

    expander = None
    if model.record_expander:
        expander = self._create_component_from_model(
            model=model.record_expander,
            config=config,
        )

    return DpathExtractor(
        decoder=effective_decoder,
        field_path=path_segments,
        config=config,
        parameters=model.parameters or {},
        record_expander=expander,
    )
def create_http_requester(
    self,
    model: HttpRequesterModel,
    config: Config,
    decoder: Decoder = JsonDecoder(parameters={}),
    query_properties_key: Optional[str] = None,
    use_cache: Optional[bool] = None,
    *,
    name: str,
) -> HttpRequester:
    """Build an ``HttpRequester`` from its declarative model.

    :param model: parsed requester definition
    :param config: connector configuration
    :param decoder: decoder handed to the authenticator and the requester
    :param query_properties_key: forwarded to the request options provider
    :param use_cache: caller-side request to enable caching; combined with
        ``model.use_cache`` and overridden by the factory's ``_disable_cache`` flag
    :param name: name given to the requester (and to the authenticator factory)
    :return: the configured requester
    """
    authenticator = None
    if model.authenticator:
        authenticator = self._create_component_from_model(
            model=model.authenticator,
            config=config,
            url_base=model.url or model.url_base,
            name=name,
            decoder=decoder,
        )

    if model.error_handler:
        error_handler = self._create_component_from_model(
            model=model.error_handler, config=config
        )
    else:
        # No handler declared: use one with no backoff strategies and no filters.
        error_handler = DefaultErrorHandler(
            backoff_strategies=[],
            response_filters=[],
            config=config,
            parameters=model.parameters or {},
        )

    options_provider = InterpolatedRequestOptionsProvider(
        request_body=model.request_body,
        request_body_data=model.request_body_data,
        request_body_json=model.request_body_json,
        request_headers=model.request_headers,
        request_parameters=model.request_parameters,  # type: ignore  # QueryProperties have been removed in `create_simple_retriever`
        query_properties_key=query_properties_key,
        config=config,
        parameters=model.parameters or {},
    )

    assert model.use_cache is not None  # for mypy
    assert model.http_method is not None  # for mypy

    # Caching is on when either the model or the caller asks for it, unless the
    # factory was configured to disable caching altogether.
    cache_requested = model.use_cache or bool(use_cache)
    should_use_cache = cache_requested and not self._disable_cache

    return HttpRequester(
        name=name,
        url=model.url,
        url_base=model.url_base,
        path=model.path,
        authenticator=authenticator,
        error_handler=error_handler,
        api_budget=self._api_budget,
        http_method=HttpMethod[model.http_method.value],
        request_options_provider=options_provider,
        config=config,
        disable_retries=self._disable_retries,
        parameters=model.parameters or {},
        message_repository=self._message_repository,
        use_cache=should_use_cache,
        decoder=decoder,
        stream_response=decoder.is_stream_response() if decoder else False,
    )
def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap:
    """Build a ``TypesMap`` from its declarative model.

    A ``ComplexFieldTypeModel`` target type is converted into its runtime
    component; any other target type is passed through unchanged. A missing
    condition defaults to the string ``"True"``.

    :param model: parsed types-map definition
    :param config: connector configuration
    :param kwargs: accepted for interface compatibility; not used here
    :return: the configured types map
    """
    target = model.target_type
    if isinstance(target, ComplexFieldTypeModel):
        target = self._create_component_from_model(model=target, config=config)

    condition = "True" if model.condition is None else model.condition

    return TypesMap(
        target_type=target,
        current_type=model.current_type,
        condition=condition,
    )
def create_dynamic_schema_loader(
    self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any
) -> DynamicSchemaLoader:
    """Build a ``DynamicSchemaLoader`` from its declarative model.

    The retriever is created under the fixed name ``"dynamic_properties"`` with
    caching enabled, no primary key, no transformations, and a log formatter
    that marks its HTTP messages as auxiliary.

    :param model: parsed schema loader definition
    :param config: connector configuration
    :param kwargs: accepted for interface compatibility; not used here
    :return: the configured schema loader
    """
    build = self._create_component_from_model

    schema_transformations = [
        build(model=transformation_model, config=config)
        for transformation_model in model.schema_transformations or []
    ]

    name = "dynamic_properties"

    def _format_schema_request(response: Response) -> Any:
        # Formats the schema-discovery request/response as an auxiliary message.
        return format_http_message(
            response,
            f"Schema loader '{name}' request",
            "Request performed in order to extract schema.",
            name,
            is_auxiliary=True,
        )

    partition_router = self._build_stream_slicer_from_partition_router(model.retriever, config)
    retriever = build(
        model=model.retriever,
        config=config,
        name=name,
        primary_key=None,
        partition_router=partition_router,
        transformations=[],
        use_cache=True,
        log_formatter=_format_schema_request,
    )

    schema_type_identifier = build(
        model.schema_type_identifier, config=config, parameters=model.parameters or {}
    )

    schema_filter = None
    if model.schema_filter is not None:
        schema_filter = build(
            model.schema_filter, config=config, parameters=model.parameters or {}
        )

    return DynamicSchemaLoader(
        retriever=retriever,
        config=config,
        schema_transformations=schema_transformations,
        schema_filter=schema_filter,
        schema_type_identifier=schema_type_identifier,
        parameters=model.parameters or {},
    )
@staticmethod
def _get_parser(model: BaseModel, config: Config) -> Parser:
    """Return the parser that corresponds to a decoder model.

    :param model: a decoder model; JSON, JSONL, CSV and gzip models have parsers
    :param config: connector configuration, passed along when resolving the
        inner decoder of a gzip model
    :return: the matching parser instance
    :raises ValueError: if the decoder model has no associated parser, or the
        model type is not recognised at all
    """
    if isinstance(model, JsonDecoderModel):
        # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases
        return JsonParser()

    if isinstance(model, JsonlDecoderModel):
        return JsonLineParser()

    if isinstance(model, CsvDecoderModel):
        return CsvParser(
            encoding=model.encoding,
            delimiter=model.delimiter,
            set_values_to_none=model.set_values_to_none,
        )

    if isinstance(model, GzipDecoderModel):
        # The gzip parser wraps whichever parser the nested decoder resolves to.
        wrapped = ModelToComponentFactory._get_parser(model.decoder, config)
        return GzipParser(inner_parser=wrapped)

    parserless_models = (
        CustomDecoderModel,
        IterableDecoderModel,
        XmlDecoderModel,
        ZipfileDecoderModel,
    )
    if isinstance(model, parserless_models):
        raise ValueError(f"Decoder type {model} does not have parser associated to it")

    raise ValueError(f"Unknown decoder type {model}")
def create_jwt_authenticator(
    self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any
) -> JwtAuthenticator:
    """Build a ``JwtAuthenticator`` from its declarative model.

    When the model omits the JWT headers or payload sections, default models are
    substituted (``typ="JWT"`` and every other header/payload field ``None``).

    :param model: parsed JWT authenticator definition
    :param config: connector configuration
    :param kwargs: accepted for interface compatibility; not used here
    :return: the configured authenticator
    """
    headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None)
    payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None)

    request_option = None
    if model.request_option:
        request_option = self._create_component_from_model(model.request_option, config)

    return JwtAuthenticator(
        config=config,
        parameters=model.parameters or {},
        algorithm=JwtAlgorithm(model.algorithm.value),
        secret_key=model.secret_key,
        base64_encode_secret_key=model.base64_encode_secret_key,
        token_duration=model.token_duration,
        header_prefix=model.header_prefix,
        kid=headers.kid,
        typ=headers.typ,
        cty=headers.cty,
        iss=payload.iss,
        sub=payload.sub,
        aud=payload.aud,
        additional_jwt_headers=model.additional_jwt_headers,
        additional_jwt_payload=model.additional_jwt_payload,
        passphrase=model.passphrase,
        request_option=request_option,
    )
@staticmethod
def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth:
    """Build a ``NoAuth`` authenticator from its declarative model.

    :param model: parsed definition; only its ``parameters`` are read
    :param config: connector configuration (unused)
    :param kwargs: accepted for interface compatibility; not used here
    :return: a ``NoAuth`` instance carrying the model parameters, or an empty
        mapping when none were declared
    """
    auth_parameters = model.parameters or {}
    return NoAuth(parameters=auth_parameters)
2850 if model.client_id 2851 else model.client_id, 2852 client_secret_name=InterpolatedString.create( 2853 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2854 ).eval(config), 2855 client_secret=InterpolatedString.create( 2856 model.client_secret, parameters=model.parameters or {} 2857 ).eval(config) 2858 if model.client_secret 2859 else model.client_secret, 2860 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2861 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2862 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2863 grant_type_name=InterpolatedString.create( 2864 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2865 ).eval(config), 2866 grant_type=InterpolatedString.create( 2867 model.grant_type or "refresh_token", parameters=model.parameters or {} 2868 ).eval(config), 2869 refresh_request_body=InterpolatedMapping( 2870 model.refresh_request_body or {}, parameters=model.parameters or {} 2871 ).eval(config), 2872 refresh_request_headers=InterpolatedMapping( 2873 model.refresh_request_headers or {}, parameters=model.parameters or {} 2874 ).eval(config), 2875 scopes=model.scopes, 2876 token_expiry_date_format=model.token_expiry_date_format, 2877 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2878 message_repository=self._message_repository, 2879 refresh_token_error_status_codes=refresh_token_error_status_codes, 2880 refresh_token_error_key=refresh_token_error_key, 2881 refresh_token_error_values=refresh_token_error_values, 2882 ) 2883 # ignore type error because fixing it would have a lot of dependencies, revisit later 2884 return DeclarativeOauth2Authenticator( # type: ignore 2885 access_token_name=model.access_token_name or "access_token", 2886 access_token_value=model.access_token_value, 2887 client_id_name=model.client_id_name or "client_id", 2888 client_id=model.client_id, 2889 
client_secret_name=model.client_secret_name or "client_secret", 2890 client_secret=model.client_secret, 2891 expires_in_name=model.expires_in_name or "expires_in", 2892 grant_type_name=model.grant_type_name or "grant_type", 2893 grant_type=model.grant_type or "refresh_token", 2894 refresh_request_body=model.refresh_request_body, 2895 refresh_request_headers=model.refresh_request_headers, 2896 refresh_token_name=model.refresh_token_name or "refresh_token", 2897 refresh_token=model.refresh_token, 2898 scopes=model.scopes, 2899 token_expiry_date=model.token_expiry_date, 2900 token_expiry_date_format=model.token_expiry_date_format, 2901 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2902 token_refresh_endpoint=model.token_refresh_endpoint, 2903 config=config, 2904 parameters=model.parameters or {}, 2905 message_repository=self._message_repository, 2906 profile_assertion=profile_assertion, 2907 use_profile_assertion=model.use_profile_assertion, 2908 refresh_token_error_status_codes=refresh_token_error_status_codes, 2909 refresh_token_error_key=refresh_token_error_key, 2910 refresh_token_error_values=refresh_token_error_values, 2911 ) 2912 2913 @staticmethod 2914 def _get_refresh_token_error_information( 2915 model: OAuthAuthenticatorModel, 2916 ) -> Tuple[Tuple[int, ...], str, Tuple[str, ...]]: 2917 """ 2918 In a previous version of the CDK, the auth error as config_error was only done if a refresh token updater was 2919 defined. As a transition, we added those fields on the OAuthAuthenticatorModel. This method ensures that the 2920 information is defined only once and return the right fields. 
2921 """ 2922 refresh_token_updater = model.refresh_token_updater 2923 is_defined_on_refresh_token_updated = refresh_token_updater and ( 2924 refresh_token_updater.refresh_token_error_status_codes 2925 or refresh_token_updater.refresh_token_error_key 2926 or refresh_token_updater.refresh_token_error_values 2927 ) 2928 is_defined_on_oauth_authenticator = ( 2929 model.refresh_token_error_status_codes 2930 or model.refresh_token_error_key 2931 or model.refresh_token_error_values 2932 ) 2933 if is_defined_on_refresh_token_updated and is_defined_on_oauth_authenticator: 2934 raise ValueError( 2935 "refresh_token_error should either be defined on the OAuthAuthenticatorModel or the RefreshTokenUpdaterModel, not both" 2936 ) 2937 2938 if is_defined_on_refresh_token_updated: 2939 not_optional_refresh_token_updater: RefreshTokenUpdaterModel = refresh_token_updater # type: ignore # we know from the condition that this is not None 2940 return ( 2941 tuple(not_optional_refresh_token_updater.refresh_token_error_status_codes) 2942 if not_optional_refresh_token_updater.refresh_token_error_status_codes 2943 else (), 2944 not_optional_refresh_token_updater.refresh_token_error_key or "", 2945 tuple(not_optional_refresh_token_updater.refresh_token_error_values) 2946 if not_optional_refresh_token_updater.refresh_token_error_values 2947 else (), 2948 ) 2949 elif is_defined_on_oauth_authenticator: 2950 return ( 2951 tuple(model.refresh_token_error_status_codes) 2952 if model.refresh_token_error_status_codes 2953 else (), 2954 model.refresh_token_error_key or "", 2955 tuple(model.refresh_token_error_values) if model.refresh_token_error_values else (), 2956 ) 2957 2958 # returning default values we think cover most cases 2959 return (400,), "error", ("invalid_grant", "invalid_permissions") 2960 2961 def create_offset_increment( 2962 self, 2963 model: OffsetIncrementModel, 2964 config: Config, 2965 decoder: Decoder, 2966 extractor_model: Optional[Union[CustomRecordExtractorModel, 
DpathExtractorModel]] = None, 2967 **kwargs: Any, 2968 ) -> OffsetIncrement: 2969 if isinstance(decoder, PaginationDecoderDecorator): 2970 inner_decoder = decoder.decoder 2971 else: 2972 inner_decoder = decoder 2973 decoder = PaginationDecoderDecorator(decoder=decoder) 2974 2975 if self._is_supported_decoder_for_pagination(inner_decoder): 2976 decoder_to_use = decoder 2977 else: 2978 raise ValueError( 2979 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2980 ) 2981 2982 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2983 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2984 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2985 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2986 # When we have more time to investigate we can look into reusing the same component. 2987 extractor = ( 2988 self._create_component_from_model( 2989 model=extractor_model, config=config, decoder=decoder_to_use 2990 ) 2991 if extractor_model 2992 else None 2993 ) 2994 2995 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 2996 # If page_size is a string that represents an integer (not an interpolation), convert it back. 
@staticmethod
def create_page_increment(
    model: PageIncrementModel, config: Config, **kwargs: Any
) -> PageIncrement:
    """Build a ``PageIncrement`` pagination strategy from its declarative model.

    :param model: parsed page-increment definition
    :param config: connector configuration
    :param kwargs: accepted for interface compatibility; not used here
    :return: the configured strategy; ``start_from_page`` defaults to ``0`` and
        ``inject_on_first_request`` to ``False`` when unset
    """
    # Pydantic v1 Union type coercion can convert int to string depending on Union order.
    # A string made only of digits is therefore turned back into an int; any other
    # string (e.g. an interpolation) is passed through untouched.
    declared_page_size = model.page_size
    is_numeric_string = isinstance(declared_page_size, str) and declared_page_size.isdigit()
    page_size = int(declared_page_size) if is_numeric_string else declared_page_size

    return PageIncrement(
        page_size=page_size,
        config=config,
        start_from_page=model.start_from_page or 0,
        inject_on_first_request=model.inject_on_first_request or False,
        parameters=model.parameters or {},
    )
3046 ) 3047 3048 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 3049 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 3050 ) 3051 3052 return ParentStreamConfig( 3053 parent_key=model.parent_key, 3054 request_option=request_option, 3055 stream=declarative_stream, 3056 partition_field=model.partition_field, 3057 config=config, 3058 incremental_dependency=model.incremental_dependency or False, 3059 parameters=model.parameters or {}, 3060 extra_fields=model.extra_fields, 3061 lazy_read_pointer=model_lazy_read_pointer, 3062 ) 3063 3064 def create_properties_from_endpoint( 3065 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 3066 ) -> PropertiesFromEndpoint: 3067 retriever = self._create_component_from_model( 3068 model=model.retriever, 3069 config=config, 3070 name="dynamic_properties", 3071 primary_key=None, 3072 stream_slicer=None, 3073 transformations=[], 3074 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 3075 ) 3076 return PropertiesFromEndpoint( 3077 property_field_path=model.property_field_path, 3078 retriever=retriever, 3079 config=config, 3080 parameters=model.parameters or {}, 3081 ) 3082 3083 def create_property_chunking( 3084 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 3085 ) -> PropertyChunking: 3086 record_merge_strategy = ( 3087 self._create_component_from_model( 3088 model=model.record_merge_strategy, config=config, **kwargs 3089 ) 3090 if model.record_merge_strategy 3091 else None 3092 ) 3093 3094 property_limit_type: PropertyLimitType 3095 match model.property_limit_type: 3096 case PropertyLimitTypeModel.property_count: 3097 property_limit_type = PropertyLimitType.property_count 3098 case PropertyLimitTypeModel.characters: 3099 property_limit_type = PropertyLimitType.characters 3100 case _: 3101 raise 
ValueError(f"Invalid PropertyLimitType {property_limit_type}") 3102 3103 return PropertyChunking( 3104 property_limit_type=property_limit_type, 3105 property_limit=model.property_limit, 3106 record_merge_strategy=record_merge_strategy, 3107 config=config, 3108 parameters=model.parameters or {}, 3109 ) 3110 3111 def create_query_properties( 3112 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 3113 ) -> QueryProperties: 3114 if isinstance(model.property_list, list): 3115 property_list = model.property_list 3116 else: 3117 property_list = self._create_component_from_model( 3118 model=model.property_list, config=config, **kwargs 3119 ) 3120 3121 property_chunking = ( 3122 self._create_component_from_model( 3123 model=model.property_chunking, config=config, **kwargs 3124 ) 3125 if model.property_chunking 3126 else None 3127 ) 3128 3129 property_selector = ( 3130 self._create_component_from_model( 3131 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3132 ) 3133 if model.property_selector 3134 else None 3135 ) 3136 3137 return QueryProperties( 3138 property_list=property_list, 3139 always_include_properties=model.always_include_properties, 3140 property_chunking=property_chunking, 3141 property_selector=property_selector, 3142 config=config, 3143 parameters=model.parameters or {}, 3144 ) 3145 3146 def create_json_schema_property_selector( 3147 self, 3148 model: JsonSchemaPropertySelectorModel, 3149 config: Config, 3150 *, 3151 stream_name: str, 3152 **kwargs: Any, 3153 ) -> JsonSchemaPropertySelector: 3154 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3155 3156 transformations = [] 3157 if model.transformations: 3158 for transformation_model in model.transformations: 3159 transformations.append( 3160 self._create_component_from_model(model=transformation_model, config=config) 3161 ) 3162 3163 return JsonSchemaPropertySelector( 3164 configured_stream=configured_stream, 3165 
@staticmethod
def create_request_option(
    model: RequestOptionModel, config: Config, **kwargs: Any
) -> RequestOption:
    """Build a ``RequestOption`` from its declarative model.

    ``field_name`` and each segment of ``field_path`` are wrapped in
    ``InterpolatedString`` objects using the ``parameters`` keyword argument
    (an empty mapping when the caller did not pass one).

    :param model: parsed request option definition
    :param config: connector configuration (unused)
    :param kwargs: may carry ``parameters`` used for interpolation
    :return: the configured request option
    """
    injection_target = RequestOptionType(model.inject_into.value)

    path_segments: Optional[List[Union[InterpolatedString, str]]] = None
    if model.field_path:
        path_segments = [
            InterpolatedString.create(segment, parameters=kwargs.get("parameters", {}))
            for segment in model.field_path
        ]

    name_template = None
    if model.field_name:
        name_template = InterpolatedString.create(
            model.field_name, parameters=kwargs.get("parameters", {})
        )

    return RequestOption(
        field_name=name_template,
        field_path=path_segments,
        inject_into=injection_target,
        parameters=kwargs.get("parameters", {}),
    )
transform_before_filtering = ( 3229 False if model.transform_before_filtering is None else model.transform_before_filtering 3230 ) 3231 if client_side_incremental_sync_cursor: 3232 record_filter = ClientSideIncrementalRecordFilterDecorator( 3233 config=config, 3234 parameters=model.parameters, 3235 condition=model.record_filter.condition 3236 if (model.record_filter and hasattr(model.record_filter, "condition")) 3237 else None, 3238 cursor=client_side_incremental_sync_cursor, 3239 ) 3240 transform_before_filtering = ( 3241 True 3242 if model.transform_before_filtering is None 3243 else model.transform_before_filtering 3244 ) 3245 3246 if model.schema_normalization is None: 3247 # default to no schema normalization if not set 3248 model.schema_normalization = SchemaNormalizationModel.None_ 3249 3250 schema_normalization = ( 3251 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3252 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3253 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3254 ) 3255 3256 return RecordSelector( 3257 extractor=extractor, 3258 name=name, 3259 config=config, 3260 record_filter=record_filter, 3261 transformations=transformations or [], 3262 file_uploader=file_uploader, 3263 schema_normalization=schema_normalization, 3264 parameters=model.parameters or {}, 3265 transform_before_filtering=transform_before_filtering, 3266 ) 3267 3268 @staticmethod 3269 def create_remove_fields( 3270 model: RemoveFieldsModel, config: Config, **kwargs: Any 3271 ) -> RemoveFields: 3272 return RemoveFields( 3273 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3274 ) 3275 3276 def create_selective_authenticator( 3277 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3278 ) -> DeclarativeAuthenticator: 3279 authenticators = { 3280 name: 
self._create_component_from_model(model=auth, config=config) 3281 for name, auth in model.authenticators.items() 3282 } 3283 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 3284 return SelectiveAuthenticator( # type: ignore[abstract] 3285 config=config, 3286 authenticators=authenticators, 3287 authenticator_selection_path=model.authenticator_selection_path, 3288 **kwargs, 3289 ) 3290 3291 @staticmethod 3292 def create_legacy_session_token_authenticator( 3293 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3294 ) -> LegacySessionTokenAuthenticator: 3295 return LegacySessionTokenAuthenticator( 3296 api_url=url_base, 3297 header=model.header, 3298 login_url=model.login_url, 3299 password=model.password or "", 3300 session_token=model.session_token or "", 3301 session_token_response_key=model.session_token_response_key or "", 3302 username=model.username or "", 3303 validate_session_url=model.validate_session_url, 3304 config=config, 3305 parameters=model.parameters or {}, 3306 ) 3307 3308 def create_simple_retriever( 3309 self, 3310 model: SimpleRetrieverModel, 3311 config: Config, 3312 *, 3313 name: str, 3314 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3315 request_options_provider: Optional[RequestOptionsProvider] = None, 3316 cursor: Optional[Cursor] = None, 3317 has_stop_condition_cursor: bool = False, 3318 is_client_side_incremental_sync: bool = False, 3319 transformations: List[RecordTransformation], 3320 file_uploader: Optional[DefaultFileUploader] = None, 3321 incremental_sync: Optional[ 3322 Union[IncrementingCountCursorModel, DatetimeBasedCursorModel] 3323 ] = None, 3324 use_cache: Optional[bool] = None, 3325 log_formatter: Optional[Callable[[Response], Any]] = None, 3326 partition_router: Optional[PartitionRouter] = None, 3327 **kwargs: Any, 3328 ) -> SimpleRetriever: 3329 def _get_url(req: Requester) -> str: 3330 """ 3331 Closure to get the 
URL from the requester. This is used to get the URL in the case of a lazy retriever. 3332 This is needed because the URL is not set until the requester is created. 3333 """ 3334 3335 _url: str = ( 3336 model.requester.url 3337 if hasattr(model.requester, "url") and model.requester.url is not None 3338 else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) 3339 ) 3340 _url_base: str = ( 3341 model.requester.url_base 3342 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3343 else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None) 3344 ) 3345 3346 return _url or _url_base 3347 3348 if cursor is None: 3349 cursor = FinalStateCursor(name, None, self._message_repository) 3350 3351 decoder = ( 3352 self._create_component_from_model(model=model.decoder, config=config) 3353 if model.decoder 3354 else JsonDecoder(parameters={}) 3355 ) 3356 record_selector = self._create_component_from_model( 3357 model=model.record_selector, 3358 name=name, 3359 config=config, 3360 decoder=decoder, 3361 transformations=transformations, 3362 client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None, 3363 file_uploader=file_uploader, 3364 ) 3365 3366 query_properties: Optional[QueryProperties] = None 3367 query_properties_key: Optional[str] = None 3368 self._ensure_query_properties_to_model(model.requester) 3369 if self._has_query_properties_in_request_parameters(model.requester): 3370 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3371 # places instead of default to request_parameters which isn't clearly documented 3372 if ( 3373 hasattr(model.requester, "fetch_properties_from_endpoint") 3374 and model.requester.fetch_properties_from_endpoint 3375 ): 3376 raise ValueError( 3377 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and 
{model.requester.type}.request_parameters" 3378 ) 3379 3380 query_properties_definitions = [] 3381 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters() 3382 if isinstance(request_parameter, QueryPropertiesModel): 3383 query_properties_key = key 3384 query_properties_definitions.append(request_parameter) 3385 3386 if len(query_properties_definitions) > 1: 3387 raise ValueError( 3388 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3389 ) 3390 3391 if len(query_properties_definitions) == 1: 3392 query_properties = self._create_component_from_model( 3393 model=query_properties_definitions[0], stream_name=name, config=config 3394 ) 3395 3396 # Removes QueryProperties components from the interpolated mappings because it has been designed 3397 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 3398 # instead of through jinja interpolation 3399 if hasattr(model.requester, "request_parameters") and isinstance( 3400 model.requester.request_parameters, Mapping 3401 ): 3402 model.requester.request_parameters = self._remove_query_properties( 3403 model.requester.request_parameters 3404 ) 3405 elif ( 3406 hasattr(model.requester, "fetch_properties_from_endpoint") 3407 and model.requester.fetch_properties_from_endpoint 3408 ): 3409 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3410 query_properties_definition = QueryPropertiesModel( 3411 type="QueryProperties", 3412 property_list=model.requester.fetch_properties_from_endpoint, 3413 always_include_properties=None, 3414 property_chunking=None, 3415 ) # type: ignore # $parameters has a default value 3416 3417 query_properties = self.create_query_properties( 3418 model=query_properties_definition, 3419 stream_name=name, 3420 
config=config, 3421 ) 3422 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3423 query_properties = self.create_query_properties( 3424 model=model.requester.query_properties, 3425 stream_name=name, 3426 config=config, 3427 ) 3428 3429 requester = self._create_component_from_model( 3430 model=model.requester, 3431 decoder=decoder, 3432 name=name, 3433 query_properties_key=query_properties_key, 3434 use_cache=use_cache, 3435 config=config, 3436 ) 3437 3438 if not request_options_provider: 3439 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3440 if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance( 3441 partition_router, PartitionRouter 3442 ): 3443 request_options_provider = partition_router 3444 3445 paginator = ( 3446 self._create_component_from_model( 3447 model=model.paginator, 3448 config=config, 3449 url_base=_get_url(requester), 3450 extractor_model=model.record_selector.extractor, 3451 decoder=decoder, 3452 cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None, 3453 ) 3454 if model.paginator 3455 else NoPagination(parameters={}) 3456 ) 3457 3458 ignore_stream_slicer_parameters_on_paginated_requests = ( 3459 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3460 ) 3461 3462 if ( 3463 model.partition_router 3464 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3465 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3466 and any( 3467 parent_stream_config.lazy_read_pointer 3468 for parent_stream_config in model.partition_router.parent_stream_configs 3469 ) 3470 ): 3471 if incremental_sync: 3472 if incremental_sync.type != "DatetimeBasedCursor": 3473 raise ValueError( 3474 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 
3475 ) 3476 3477 elif incremental_sync.step or incremental_sync.cursor_granularity: 3478 raise ValueError( 3479 f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3480 ) 3481 3482 if model.decoder and model.decoder.type != "JsonDecoder": 3483 raise ValueError( 3484 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}." 3485 ) 3486 3487 return LazySimpleRetriever( 3488 name=name, 3489 paginator=paginator, 3490 primary_key=primary_key, 3491 requester=requester, 3492 record_selector=record_selector, 3493 stream_slicer=_NO_STREAM_SLICING, 3494 request_option_provider=request_options_provider, 3495 config=config, 3496 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3497 parameters=model.parameters or {}, 3498 ) 3499 3500 if ( 3501 model.record_selector.record_filter 3502 and model.pagination_reset 3503 and model.pagination_reset.limits 3504 ): 3505 raise ValueError("PaginationResetLimits are not supported while having record filter.") 3506 3507 return SimpleRetriever( 3508 name=name, 3509 paginator=paginator, 3510 primary_key=primary_key, 3511 requester=requester, 3512 record_selector=record_selector, 3513 stream_slicer=_NO_STREAM_SLICING, 3514 request_option_provider=request_options_provider, 3515 config=config, 3516 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3517 additional_query_properties=query_properties, 3518 log_formatter=self._get_log_formatter(log_formatter, name), 3519 pagination_tracker_factory=self._create_pagination_tracker_factory( 3520 model.pagination_reset, cursor 3521 ), 3522 parameters=model.parameters or {}, 3523 ) 3524 3525 def _create_pagination_tracker_factory( 3526 self, model: Optional[PaginationResetModel], cursor: Cursor 3527 ) -> Callable[[], PaginationTracker]: 3528 if model is None: 3529 return lambda: PaginationTracker() 3530 
3531 # Until we figure out a way to use any cursor for PaginationTracker, we will have to have this cursor selector logic 3532 cursor_factory: Callable[[], Optional[ConcurrentCursor]] = lambda: None 3533 if model.action == PaginationResetActionModel.RESET: 3534 # in that case, we will let cursor_factory to return None even if the stream has a cursor 3535 pass 3536 elif model.action == PaginationResetActionModel.SPLIT_USING_CURSOR: 3537 if isinstance(cursor, ConcurrentCursor): 3538 cursor_factory = lambda: cursor.copy_without_state() # type: ignore # the if condition validates that it is a ConcurrentCursor 3539 elif isinstance(cursor, ConcurrentPerPartitionCursor): 3540 cursor_factory = lambda: cursor._cursor_factory.create( # type: ignore # if this becomes a problem, we would need to extract the cursor_factory instantiation logic and make it accessible here 3541 {}, datetime.timedelta(0) 3542 ) 3543 elif not isinstance(cursor, FinalStateCursor): 3544 LOGGER.warning( 3545 "Unknown cursor for PaginationTracker. Pagination resets might not work properly" 3546 ) 3547 else: 3548 raise ValueError(f"Unknown PaginationReset action: {model.action}") 3549 3550 limit = model.limits.number_of_records if model and model.limits else None 3551 return lambda: PaginationTracker(cursor_factory(), limit) 3552 3553 def _get_log_formatter( 3554 self, log_formatter: Callable[[Response], Any] | None, name: str 3555 ) -> Callable[[Response], Any] | None: 3556 if self._should_limit_slices_fetched(): 3557 return ( 3558 ( 3559 lambda response: format_http_message( 3560 response, 3561 f"Stream '{name}' request", 3562 f"Request performed in order to extract records for stream '{name}'", 3563 name, 3564 ) 3565 ) 3566 if not log_formatter 3567 else log_formatter 3568 ) 3569 return None 3570 3571 def _should_limit_slices_fetched(self) -> bool: 3572 """ 3573 Returns True if the number of slices fetched should be limited, False otherwise. 
3574 This is used to limit the number of slices fetched during tests. 3575 """ 3576 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3577 3578 @staticmethod 3579 def _has_query_properties_in_request_parameters( 3580 requester: Union[HttpRequesterModel, CustomRequesterModel], 3581 ) -> bool: 3582 if not hasattr(requester, "request_parameters"): 3583 return False 3584 request_parameters = requester.request_parameters 3585 if request_parameters and isinstance(request_parameters, Mapping): 3586 for request_parameter in request_parameters.values(): 3587 if isinstance(request_parameter, QueryPropertiesModel): 3588 return True 3589 return False 3590 3591 @staticmethod 3592 def _remove_query_properties( 3593 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3594 ) -> Mapping[str, str]: 3595 return { 3596 parameter_field: request_parameter 3597 for parameter_field, request_parameter in request_parameters.items() 3598 if not isinstance(request_parameter, QueryPropertiesModel) 3599 } 3600 3601 def create_state_delegating_stream( 3602 self, 3603 model: StateDelegatingStreamModel, 3604 config: Config, 3605 **kwargs: Any, 3606 ) -> DefaultStream: 3607 if ( 3608 model.full_refresh_stream.name != model.name 3609 or model.name != model.incremental_stream.name 3610 ): 3611 raise ValueError( 3612 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3613 ) 3614 3615 # Resolve api_retention_period with config context (supports Jinja2 interpolation) 3616 resolved_retention_period: Optional[str] = None 3617 if model.api_retention_period: 3618 interpolated_retention = InterpolatedString.create( 3619 model.api_retention_period, parameters=model.parameters or {} 3620 ) 3621 resolved_value = interpolated_retention.eval(config=config) 3622 if resolved_value: 3623 resolved_retention_period = str(resolved_value) 3624 3625 if resolved_retention_period: 3626 for stream_model in (model.full_refresh_stream, model.incremental_stream): 3627 if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel): 3628 raise ValueError( 3629 f"Stream '{model.name}' uses IncrementingCountCursor which is not supported " 3630 f"with api_retention_period. IncrementingCountCursor does not use datetime-based " 3631 f"cursors, so cursor age validation cannot be performed." 3632 ) 3633 3634 stream_state = self._connector_state_manager.get_stream_state(model.name, None) 3635 3636 if not stream_state: 3637 return self._create_component_from_model( # type: ignore[no-any-return] 3638 model.full_refresh_stream, config=config, **kwargs 3639 ) 3640 3641 incremental_stream: DefaultStream = self._create_component_from_model( 3642 model.incremental_stream, config=config, **kwargs 3643 ) # type: ignore[assignment] 3644 3645 # Only run cursor age validation for streams that are in the configured 3646 # catalog (or when no catalog was provided, e.g. during discover / connector 3647 # builder). Streams not selected by the user but instantiated as parent-stream 3648 # dependencies must not go through this path because it emits state messages 3649 # that the destination does not know about, causing "Stream not found" crashes. 
3650 stream_is_in_catalog = ( 3651 not self._stream_name_to_configured_stream # no catalog → validate by default 3652 or model.name in self._stream_name_to_configured_stream 3653 ) 3654 if resolved_retention_period and stream_is_in_catalog: 3655 full_refresh_stream: DefaultStream = self._create_component_from_model( 3656 model.full_refresh_stream, config=config, **kwargs 3657 ) # type: ignore[assignment] 3658 if self._is_cursor_older_than_retention_period( 3659 stream_state, 3660 full_refresh_stream.cursor, 3661 incremental_stream.cursor, 3662 resolved_retention_period, 3663 model.name, 3664 ): 3665 # Clear state BEFORE constructing the full_refresh_stream so that 3666 # its cursor starts from start_date instead of the stale cursor. 3667 self._connector_state_manager.update_state_for_stream(model.name, None, {}) 3668 state_message = self._connector_state_manager.create_state_message(model.name, None) 3669 self._message_repository.emit_message(state_message) 3670 return self._create_component_from_model( # type: ignore[no-any-return] 3671 model.full_refresh_stream, config=config, **kwargs 3672 ) 3673 3674 return incremental_stream 3675 3676 @staticmethod 3677 def _is_cursor_older_than_retention_period( 3678 stream_state: Mapping[str, Any], 3679 full_refresh_cursor: Cursor, 3680 incremental_cursor: Cursor, 3681 api_retention_period: str, 3682 stream_name: str, 3683 ) -> bool: 3684 """Check if the cursor value in the state is older than the API's retention period. 3685 3686 Checks cursors in sequence: full refresh cursor first, then incremental cursor. 3687 FinalStateCursor returns now() for completed full refresh state (NO_CURSOR_STATE_KEY), 3688 which is always within retention, so we use incremental. For other states, it returns 3689 None and we fall back to checking the incremental cursor. 3690 3691 Returns True if the cursor is older than the retention period (should use full refresh). 
3692 Returns False if the cursor is within the retention period (safe to use incremental). 3693 """ 3694 retention_duration = parse_duration(api_retention_period) 3695 retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration 3696 3697 # Check full refresh cursor first 3698 cursor_datetime = full_refresh_cursor.get_cursor_datetime_from_state(stream_state) 3699 3700 # If full refresh cursor returns None, check incremental cursor 3701 if cursor_datetime is None: 3702 cursor_datetime = incremental_cursor.get_cursor_datetime_from_state(stream_state) 3703 3704 if cursor_datetime is None: 3705 # Neither cursor could parse the state - fall back to full refresh to be safe 3706 return True 3707 3708 if cursor_datetime < retention_cutoff: 3709 logging.warning( 3710 f"Stream '{stream_name}' has a cursor value older than " 3711 f"the API's retention period of {api_retention_period} " 3712 f"(cutoff: {retention_cutoff.isoformat()}). " 3713 f"Falling back to full refresh to avoid data loss." 
3714 ) 3715 return True 3716 3717 return False 3718 3719 def _get_state_delegating_stream_model( 3720 self, 3721 model: StateDelegatingStreamModel, 3722 parent_state: Optional[Mapping[str, Any]] = None, 3723 ) -> DeclarativeStreamModel: 3724 """Return the appropriate underlying stream model based on state.""" 3725 return ( 3726 model.incremental_stream 3727 if self._connector_state_manager.get_stream_state(model.name, None) or parent_state 3728 else model.full_refresh_stream 3729 ) 3730 3731 _OPTIONAL_ASYNC_STATUS_FIELDS = {"skipped"} 3732 3733 def _create_async_job_status_mapping( 3734 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3735 ) -> Mapping[str, AsyncJobStatus]: 3736 api_status_to_cdk_status = {} 3737 for cdk_status, api_statuses in model.dict().items(): 3738 if cdk_status == "type": 3739 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3740 continue 3741 3742 if api_statuses is None: 3743 if cdk_status in self._OPTIONAL_ASYNC_STATUS_FIELDS: 3744 continue 3745 raise ValueError( 3746 f"Required CDK status '{cdk_status}' has no API statuses mapped. " 3747 f"Please provide at least an empty list for required status fields." 3748 ) 3749 3750 for status in api_statuses: 3751 if status in api_status_to_cdk_status: 3752 raise ValueError( 3753 f"API status {status} is already set for CDK status {cdk_status}. 
Please ensure API statuses are only provided once" 3754 ) 3755 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3756 return api_status_to_cdk_status 3757 3758 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3759 match status: 3760 case "running": 3761 return AsyncJobStatus.RUNNING 3762 case "completed": 3763 return AsyncJobStatus.COMPLETED 3764 case "failed": 3765 return AsyncJobStatus.FAILED 3766 case "timeout": 3767 return AsyncJobStatus.TIMED_OUT 3768 case "skipped": 3769 return AsyncJobStatus.SKIPPED 3770 case _: 3771 raise ValueError(f"Unsupported CDK status {status}") 3772 3773 def create_async_retriever( 3774 self, 3775 model: AsyncRetrieverModel, 3776 config: Config, 3777 *, 3778 name: str, 3779 primary_key: Optional[ 3780 Union[str, List[str], List[List[str]]] 3781 ], # this seems to be needed to match create_simple_retriever 3782 stream_slicer: Optional[StreamSlicer], 3783 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3784 transformations: List[RecordTransformation], 3785 **kwargs: Any, 3786 ) -> AsyncRetriever: 3787 if model.download_target_requester and not model.download_target_extractor: 3788 raise ValueError( 3789 f"`download_target_extractor` required if using a `download_target_requester`" 3790 ) 3791 3792 def _get_download_retriever( 3793 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3794 ) -> SimpleRetriever: 3795 # We create a record selector for the download retriever 3796 # with no schema normalization and no transformations, neither record filter 3797 # as all this occurs in the record_selector of the AsyncRetriever 3798 record_selector = RecordSelector( 3799 extractor=extractor, 3800 name=name, 3801 record_filter=None, 3802 transformations=[], 3803 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3804 config=config, 3805 parameters={}, 3806 ) 3807 paginator = ( 3808 self._create_component_from_model( 3809 model=model.download_paginator, 3810 
decoder=_decoder, 3811 config=config, 3812 url_base="", 3813 ) 3814 if model.download_paginator 3815 else NoPagination(parameters={}) 3816 ) 3817 3818 return SimpleRetriever( 3819 requester=requester, 3820 record_selector=record_selector, 3821 primary_key=None, 3822 name=name, 3823 paginator=paginator, 3824 config=config, 3825 parameters={}, 3826 log_formatter=self._get_log_formatter(None, name), 3827 ) 3828 3829 def _get_job_timeout() -> datetime.timedelta: 3830 user_defined_timeout: Optional[int] = ( 3831 int( 3832 InterpolatedString.create( 3833 str(model.polling_job_timeout), 3834 parameters={}, 3835 ).eval(config) 3836 ) 3837 if model.polling_job_timeout 3838 else None 3839 ) 3840 3841 # check for user defined timeout during the test read or 15 minutes 3842 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3843 # default value for non-connector builder is 60 minutes. 3844 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3845 3846 return ( 3847 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3848 ) 3849 3850 decoder = ( 3851 self._create_component_from_model(model=model.decoder, config=config) 3852 if model.decoder 3853 else JsonDecoder(parameters={}) 3854 ) 3855 record_selector = self._create_component_from_model( 3856 model=model.record_selector, 3857 config=config, 3858 decoder=decoder, 3859 name=name, 3860 transformations=transformations, 3861 client_side_incremental_sync=client_side_incremental_sync, 3862 ) 3863 3864 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3865 if self._should_limit_slices_fetched(): 3866 stream_slicer = cast( 3867 StreamSlicer, 3868 StreamSlicerTestReadDecorator( 3869 wrapped_slicer=stream_slicer, 3870 maximum_number_of_slices=self._limit_slices_fetched or 5, 3871 ), 3872 ) 3873 3874 creation_requester = self._create_component_from_model( 3875 model=model.creation_requester, 3876 decoder=decoder, 3877 config=config, 
3878 name=f"job creation - {name}", 3879 ) 3880 polling_requester = self._create_component_from_model( 3881 model=model.polling_requester, 3882 decoder=decoder, 3883 config=config, 3884 name=f"job polling - {name}", 3885 ) 3886 job_download_components_name = f"job download - {name}" 3887 download_decoder = ( 3888 self._create_component_from_model(model=model.download_decoder, config=config) 3889 if model.download_decoder 3890 else JsonDecoder(parameters={}) 3891 ) 3892 download_extractor = ( 3893 self._create_component_from_model( 3894 model=model.download_extractor, 3895 config=config, 3896 decoder=download_decoder, 3897 parameters=model.parameters, 3898 ) 3899 if model.download_extractor 3900 else DpathExtractor( 3901 [], 3902 config=config, 3903 decoder=download_decoder, 3904 parameters=model.parameters or {}, 3905 ) 3906 ) 3907 download_requester = self._create_component_from_model( 3908 model=model.download_requester, 3909 decoder=download_decoder, 3910 config=config, 3911 name=job_download_components_name, 3912 ) 3913 download_retriever = _get_download_retriever( 3914 download_requester, download_extractor, download_decoder 3915 ) 3916 abort_requester = ( 3917 self._create_component_from_model( 3918 model=model.abort_requester, 3919 decoder=decoder, 3920 config=config, 3921 name=f"job abort - {name}", 3922 ) 3923 if model.abort_requester 3924 else None 3925 ) 3926 delete_requester = ( 3927 self._create_component_from_model( 3928 model=model.delete_requester, 3929 decoder=decoder, 3930 config=config, 3931 name=f"job delete - {name}", 3932 ) 3933 if model.delete_requester 3934 else None 3935 ) 3936 download_target_requester = ( 3937 self._create_component_from_model( 3938 model=model.download_target_requester, 3939 decoder=decoder, 3940 config=config, 3941 name=f"job extract_url - {name}", 3942 ) 3943 if model.download_target_requester 3944 else None 3945 ) 3946 status_extractor = self._create_component_from_model( 3947 model=model.status_extractor, 
decoder=decoder, config=config, name=name 3948 ) 3949 download_target_extractor = ( 3950 self._create_component_from_model( 3951 model=model.download_target_extractor, 3952 decoder=decoder, 3953 config=config, 3954 name=name, 3955 ) 3956 if model.download_target_extractor 3957 else None 3958 ) 3959 3960 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3961 creation_requester=creation_requester, 3962 polling_requester=polling_requester, 3963 download_retriever=download_retriever, 3964 download_target_requester=download_target_requester, 3965 abort_requester=abort_requester, 3966 delete_requester=delete_requester, 3967 status_extractor=status_extractor, 3968 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3969 download_target_extractor=download_target_extractor, 3970 job_timeout=_get_job_timeout(), 3971 ) 3972 3973 failed_retry_wait_time_in_seconds: Optional[int] = ( 3974 int( 3975 InterpolatedString.create( 3976 str(model.failed_retry_wait_time_in_seconds), 3977 parameters={}, 3978 ).eval(config) 3979 ) 3980 if model.failed_retry_wait_time_in_seconds 3981 else None 3982 ) 3983 3984 async_job_partition_router = AsyncJobPartitionRouter( 3985 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3986 job_repository, 3987 stream_slices, 3988 self._job_tracker, 3989 self._message_repository, 3990 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3991 has_bulk_parent=False, 3992 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3993 # `None` == default retry is set to 3 attempts, under the hood. 
3994 job_max_retry=1 if self._emit_connector_builder_messages else None, 3995 failed_retry_wait_time_in_seconds=failed_retry_wait_time_in_seconds, 3996 ), 3997 stream_slicer=stream_slicer, 3998 config=config, 3999 parameters=model.parameters or {}, 4000 ) 4001 4002 return AsyncRetriever( 4003 record_selector=record_selector, 4004 stream_slicer=async_job_partition_router, 4005 config=config, 4006 parameters=model.parameters or {}, 4007 ) 4008 4009 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 4010 config_migrations = [ 4011 self._create_component_from_model(migration, config) 4012 for migration in ( 4013 model.config_normalization_rules.config_migrations 4014 if ( 4015 model.config_normalization_rules 4016 and model.config_normalization_rules.config_migrations 4017 ) 4018 else [] 4019 ) 4020 ] 4021 config_transformations = [ 4022 self._create_component_from_model(transformation, config) 4023 for transformation in ( 4024 model.config_normalization_rules.transformations 4025 if ( 4026 model.config_normalization_rules 4027 and model.config_normalization_rules.transformations 4028 ) 4029 else [] 4030 ) 4031 ] 4032 config_validations = [ 4033 self._create_component_from_model(validation, config) 4034 for validation in ( 4035 model.config_normalization_rules.validations 4036 if ( 4037 model.config_normalization_rules 4038 and model.config_normalization_rules.validations 4039 ) 4040 else [] 4041 ) 4042 ] 4043 4044 return Spec( 4045 connection_specification=model.connection_specification, 4046 documentation_url=model.documentation_url, 4047 advanced_auth=model.advanced_auth, 4048 parameters={}, 4049 config_migrations=config_migrations, 4050 config_transformations=config_transformations, 4051 config_validations=config_validations, 4052 ) 4053 4054 def create_substream_partition_router( 4055 self, 4056 model: SubstreamPartitionRouterModel, 4057 config: Config, 4058 *, 4059 stream_name: str, 4060 **kwargs: Any, 4061 ) -> 
SubstreamPartitionRouter: 4062 parent_stream_configs = [] 4063 if model.parent_stream_configs: 4064 parent_stream_configs.extend( 4065 [ 4066 self.create_parent_stream_config_with_substream_wrapper( 4067 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 4068 ) 4069 for parent_stream_config in model.parent_stream_configs 4070 ] 4071 ) 4072 4073 return SubstreamPartitionRouter( 4074 parent_stream_configs=parent_stream_configs, 4075 parameters=model.parameters or {}, 4076 config=config, 4077 ) 4078 4079 def create_parent_stream_config_with_substream_wrapper( 4080 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 4081 ) -> Any: 4082 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 4083 4084 parent_state: Optional[Mapping[str, Any]] = ( 4085 child_state if model.incremental_dependency and child_state else None 4086 ) 4087 connector_state_manager = self._instantiate_parent_stream_state_manager( 4088 child_state, config, model, parent_state 4089 ) 4090 4091 substream_factory = ModelToComponentFactory( 4092 connector_state_manager=connector_state_manager, 4093 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 4094 limit_slices_fetched=self._limit_slices_fetched, 4095 emit_connector_builder_messages=self._emit_connector_builder_messages, 4096 disable_retries=self._disable_retries, 4097 disable_cache=self._disable_cache, 4098 message_repository=StateFilteringMessageRepository( 4099 LogAppenderMessageRepositoryDecorator( 4100 { 4101 "airbyte_cdk": {"stream": {"is_substream": True}}, 4102 "http": {"is_auxiliary": True}, 4103 }, 4104 self._message_repository, 4105 self._evaluate_log_level(self._emit_connector_builder_messages), 4106 ), 4107 ), 4108 api_budget=self._api_budget, 4109 ) 4110 4111 return substream_factory.create_parent_stream_config( 4112 model=model, config=config, stream_name=stream_name, **kwargs 4113 ) 4114 4115 def 
_instantiate_parent_stream_state_manager( 4116 self, 4117 child_state: MutableMapping[str, Any], 4118 config: Config, 4119 model: ParentStreamConfigModel, 4120 parent_state: Optional[Mapping[str, Any]] = None, 4121 ) -> ConnectorStateManager: 4122 """ 4123 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 4124 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 4125 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 4126 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 4127 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 4128 incremental_dependency is set. 4129 """ 4130 if model.incremental_dependency and child_state: 4131 parent_stream_name = model.stream.name or "" 4132 extracted_parent_state = ConcurrentPerPartitionCursor.get_parent_state( 4133 child_state, parent_stream_name 4134 ) 4135 4136 if not extracted_parent_state: 4137 extracted_parent_state = ConcurrentPerPartitionCursor.get_global_state( 4138 child_state, parent_stream_name 4139 ) 4140 4141 if not extracted_parent_state and not isinstance(extracted_parent_state, dict): 4142 cursor_values = child_state.values() 4143 if cursor_values and len(cursor_values) == 1: 4144 incremental_sync_model: Union[ 4145 DatetimeBasedCursorModel, 4146 IncrementingCountCursorModel, 4147 ] = ( 4148 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 4149 if isinstance(model.stream, DeclarativeStreamModel) 4150 else self._get_state_delegating_stream_model( 4151 model.stream, parent_state=parent_state 4152 ).incremental_sync 4153 ) 4154 cursor_field = InterpolatedString.create( 4155 incremental_sync_model.cursor_field, 4156 
parameters=incremental_sync_model.parameters or {}, 4157 ).eval(config) 4158 extracted_parent_state = AirbyteStateMessage( 4159 type=AirbyteStateType.STREAM, 4160 stream=AirbyteStreamState( 4161 stream_descriptor=StreamDescriptor( 4162 name=parent_stream_name, namespace=None 4163 ), 4164 stream_state=AirbyteStateBlob( 4165 {cursor_field: list(cursor_values)[0]} 4166 ), 4167 ), 4168 ) 4169 return ConnectorStateManager([extracted_parent_state] if extracted_parent_state else []) 4170 4171 return ConnectorStateManager([]) 4172 4173 @staticmethod 4174 def create_wait_time_from_header( 4175 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 4176 ) -> WaitTimeFromHeaderBackoffStrategy: 4177 return WaitTimeFromHeaderBackoffStrategy( 4178 header=model.header, 4179 parameters=model.parameters or {}, 4180 config=config, 4181 regex=model.regex, 4182 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 4183 if model.max_waiting_time_in_seconds is not None 4184 else None, 4185 ) 4186 4187 @staticmethod 4188 def create_wait_until_time_from_header( 4189 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 4190 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 4191 return WaitUntilTimeFromHeaderBackoffStrategy( 4192 header=model.header, 4193 parameters=model.parameters or {}, 4194 config=config, 4195 min_wait=model.min_wait, 4196 regex=model.regex, 4197 ) 4198 4199 def get_message_repository(self) -> MessageRepository: 4200 return self._message_repository 4201 4202 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 4203 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 4204 4205 @staticmethod 4206 def create_components_mapping_definition( 4207 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 4208 ) -> ComponentMappingDefinition: 4209 interpolated_value = InterpolatedString.create( 4210 model.value, parameters=model.parameters or {} 4211 ) 4212 field_path = [ 4213 
InterpolatedString.create(path, parameters=model.parameters or {}) 4214 for path in model.field_path 4215 ] 4216 return ComponentMappingDefinition( 4217 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 4218 value=interpolated_value, 4219 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 4220 create_or_update=model.create_or_update, 4221 condition=model.condition, 4222 parameters=model.parameters or {}, 4223 ) 4224 4225 def create_http_components_resolver( 4226 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 4227 ) -> Any: 4228 retriever = self._create_component_from_model( 4229 model=model.retriever, 4230 config=config, 4231 name=f"{stream_name if stream_name else '__http_components_resolver'}", 4232 primary_key=None, 4233 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4234 transformations=[], 4235 ) 4236 4237 components_mapping = [] 4238 for component_mapping_definition_model in model.components_mapping: 4239 if component_mapping_definition_model.condition: 4240 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4241 components_mapping.append( 4242 self._create_component_from_model( 4243 model=component_mapping_definition_model, 4244 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4245 component_mapping_definition_model.value_type 4246 ), 4247 config=config, 4248 ) 4249 ) 4250 4251 return HttpComponentsResolver( 4252 retriever=retriever, 4253 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4254 config=config, 4255 components_mapping=components_mapping, 4256 parameters=model.parameters or {}, 4257 ) 4258 4259 @staticmethod 4260 def create_stream_config( 4261 model: StreamConfigModel, config: Config, **kwargs: Any 4262 ) -> StreamConfig: 4263 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 4264 [x 
for x in model.configs_pointer] if model.configs_pointer else [] 4265 ) 4266 4267 return StreamConfig( 4268 configs_pointer=model_configs_pointer, 4269 default_values=model.default_values, 4270 parameters=model.parameters or {}, 4271 ) 4272 4273 def create_config_components_resolver( 4274 self, 4275 model: ConfigComponentsResolverModel, 4276 config: Config, 4277 ) -> Any: 4278 model_stream_configs = ( 4279 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 4280 ) 4281 4282 stream_configs = [ 4283 self._create_component_from_model( 4284 stream_config, config=config, parameters=model.parameters or {} 4285 ) 4286 for stream_config in model_stream_configs 4287 ] 4288 4289 components_mapping = [ 4290 self._create_component_from_model( 4291 model=components_mapping_definition_model, 4292 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4293 components_mapping_definition_model.value_type 4294 ), 4295 config=config, 4296 parameters=model.parameters, 4297 ) 4298 for components_mapping_definition_model in model.components_mapping 4299 ] 4300 4301 return ConfigComponentsResolver( 4302 stream_configs=stream_configs, 4303 config=config, 4304 components_mapping=components_mapping, 4305 parameters=model.parameters or {}, 4306 ) 4307 4308 def create_parametrized_components_resolver( 4309 self, 4310 model: ParametrizedComponentsResolverModel, 4311 config: Config, 4312 ) -> ParametrizedComponentsResolver: 4313 stream_parameters = StreamParametersDefinition( 4314 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 4315 ) 4316 4317 components_mapping = [] 4318 for components_mapping_definition_model in model.components_mapping: 4319 if components_mapping_definition_model.condition: 4320 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4321 components_mapping.append( 4322 self._create_component_from_model( 4323 model=components_mapping_definition_model, 4324 
value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4325 components_mapping_definition_model.value_type 4326 ), 4327 config=config, 4328 ) 4329 ) 4330 return ParametrizedComponentsResolver( 4331 stream_parameters=stream_parameters, 4332 config=config, 4333 components_mapping=components_mapping, 4334 parameters=model.parameters or {}, 4335 ) 4336 4337 _UNSUPPORTED_DECODER_ERROR = ( 4338 "Specified decoder of {decoder_type} is not supported for pagination." 4339 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4340 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4341 ) 4342 4343 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4344 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4345 return True 4346 elif isinstance(decoder, CompositeRawDecoder): 4347 return self._is_supported_parser_for_pagination(decoder.parser) 4348 else: 4349 return False 4350 4351 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4352 if isinstance(parser, JsonParser): 4353 return True 4354 elif isinstance(parser, GzipParser): 4355 return isinstance(parser.inner_parser, JsonParser) 4356 else: 4357 return False 4358 4359 def create_http_api_budget( 4360 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4361 ) -> HttpAPIBudget: 4362 policies = [ 4363 self._create_component_from_model(model=policy, config=config) 4364 for policy in model.policies 4365 ] 4366 4367 return HttpAPIBudget( 4368 policies=policies, 4369 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4370 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4371 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4372 ) 4373 4374 def create_fixed_window_call_rate_policy( 4375 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 
4376 ) -> FixedWindowCallRatePolicy: 4377 matchers = [ 4378 self._create_component_from_model(model=matcher, config=config) 4379 for matcher in model.matchers 4380 ] 4381 4382 # Set the initial reset timestamp to 10 days from now. 4383 # This value will be updated by the first request. 4384 return FixedWindowCallRatePolicy( 4385 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4386 period=parse_duration(model.period), 4387 call_limit=model.call_limit, 4388 matchers=matchers, 4389 ) 4390 4391 def create_file_uploader( 4392 self, model: FileUploaderModel, config: Config, **kwargs: Any 4393 ) -> FileUploader: 4394 name = "File Uploader" 4395 requester = self._create_component_from_model( 4396 model=model.requester, 4397 config=config, 4398 name=name, 4399 **kwargs, 4400 ) 4401 download_target_extractor = self._create_component_from_model( 4402 model=model.download_target_extractor, 4403 config=config, 4404 name=name, 4405 **kwargs, 4406 ) 4407 emit_connector_builder_messages = self._emit_connector_builder_messages 4408 file_uploader = DefaultFileUploader( 4409 requester=requester, 4410 download_target_extractor=download_target_extractor, 4411 config=config, 4412 file_writer=NoopFileWriter() 4413 if emit_connector_builder_messages 4414 else LocalFileSystemFileWriter(), 4415 parameters=model.parameters or {}, 4416 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4417 ) 4418 4419 return ( 4420 ConnectorBuilderFileUploader(file_uploader) 4421 if emit_connector_builder_messages 4422 else file_uploader 4423 ) 4424 4425 def create_moving_window_call_rate_policy( 4426 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4427 ) -> MovingWindowCallRatePolicy: 4428 rates = [ 4429 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4430 ] 4431 matchers = [ 4432 self._create_component_from_model(model=matcher, config=config) 4433 for matcher in model.matchers 4434 ] 4435 
return MovingWindowCallRatePolicy( 4436 rates=rates, 4437 matchers=matchers, 4438 ) 4439 4440 def create_unlimited_call_rate_policy( 4441 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4442 ) -> UnlimitedCallRatePolicy: 4443 matchers = [ 4444 self._create_component_from_model(model=matcher, config=config) 4445 for matcher in model.matchers 4446 ] 4447 4448 return UnlimitedCallRatePolicy( 4449 matchers=matchers, 4450 ) 4451 4452 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4453 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4454 return Rate( 4455 limit=int(interpolated_limit.eval(config=config)), 4456 interval=parse_duration(model.interval), 4457 ) 4458 4459 def create_http_request_matcher( 4460 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4461 ) -> HttpRequestRegexMatcher: 4462 weight = model.weight 4463 if weight is not None: 4464 if isinstance(weight, str): 4465 weight = int(InterpolatedString.create(weight, parameters={}).eval(config)) 4466 else: 4467 weight = int(weight) 4468 if weight < 1: 4469 raise ValueError(f"weight must be >= 1, got {weight}") 4470 return HttpRequestRegexMatcher( 4471 method=model.method, 4472 url_base=model.url_base, 4473 url_path_pattern=model.url_path_pattern, 4474 params=model.params, 4475 headers=model.headers, 4476 weight=weight, 4477 ) 4478 4479 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4480 self._api_budget = self.create_component( 4481 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4482 ) 4483 4484 def create_grouping_partition_router( 4485 self, 4486 model: GroupingPartitionRouterModel, 4487 config: Config, 4488 *, 4489 stream_name: str, 4490 **kwargs: Any, 4491 ) -> GroupingPartitionRouter: 4492 underlying_router = self._create_component_from_model( 4493 model=model.underlying_partition_router, 4494 config=config, 
4495 stream_name=stream_name, 4496 **kwargs, 4497 ) 4498 if model.group_size < 1: 4499 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4500 4501 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4502 # because they are specific to individual partitions and cannot be aggregated or handled 4503 # when grouping, potentially leading to incorrect API calls. Any request customization 4504 # should be managed at the stream level through the requester's configuration. 4505 if isinstance(underlying_router, SubstreamPartitionRouter): 4506 if any( 4507 parent_config.request_option 4508 for parent_config in underlying_router.parent_stream_configs 4509 ): 4510 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4511 4512 if isinstance(underlying_router, ListPartitionRouter): 4513 if underlying_router.request_option: 4514 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4515 4516 return GroupingPartitionRouter( 4517 group_size=model.group_size, 4518 underlying_partition_router=underlying_router, 4519 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4520 config=config, 4521 ) 4522 4523 def _ensure_query_properties_to_model( 4524 self, requester: Union[HttpRequesterModel, CustomRequesterModel] 4525 ) -> None: 4526 """ 4527 For some reason, it seems like CustomRequesterModel request_parameters stays as dictionaries which means that 4528 the other conditions relying on it being QueryPropertiesModel instead of a dict fail. Here, we migrate them to 4529 proper model. 
4530 """ 4531 if not hasattr(requester, "request_parameters"): 4532 return 4533 4534 request_parameters = requester.request_parameters 4535 if request_parameters and isinstance(request_parameters, Dict): 4536 for request_parameter_key in request_parameters.keys(): 4537 request_parameter = request_parameters[request_parameter_key] 4538 if ( 4539 isinstance(request_parameter, Dict) 4540 and request_parameter.get("type") == "QueryProperties" 4541 ): 4542 request_parameters[request_parameter_key] = QueryPropertiesModel.parse_obj( 4543 request_parameter 4544 ) 4545 4546 def _get_catalog_defined_cursor_field( 4547 self, stream_name: str, allow_catalog_defined_cursor_field: bool 4548 ) -> Optional[CursorField]: 4549 if not allow_catalog_defined_cursor_field: 4550 return None 4551 4552 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 4553 4554 # Depending on the operation is being performed, there may not be a configured stream yet. In this 4555 # case we return None which will then use the default cursor field defined on the cursor model. 4556 # We also treat cursor_field: [""] (list with empty string) as no cursor field, since this can 4557 # occur when the platform serializes "no cursor configured" streams incorrectly. 4558 if ( 4559 not configured_stream 4560 or not configured_stream.cursor_field 4561 or not configured_stream.cursor_field[0] 4562 ): 4563 return None 4564 elif len(configured_stream.cursor_field) > 1: 4565 raise ValueError( 4566 f"The `{stream_name}` stream does not support nested cursor_field. Please specify only a single cursor_field for the stream in the configured catalog." 4567 ) 4568 else: 4569 return CursorField( 4570 cursor_field_key=configured_stream.cursor_field[0], 4571 supports_catalog_defined_cursor_field=allow_catalog_defined_cursor_field, 4572 )
685class ModelToComponentFactory: 686 EPOCH_DATETIME_FORMAT = "%s" 687 688 def __init__( 689 self, 690 limit_pages_fetched_per_slice: Optional[int] = None, 691 limit_slices_fetched: Optional[int] = None, 692 emit_connector_builder_messages: bool = False, 693 disable_retries: bool = False, 694 disable_cache: bool = False, 695 message_repository: Optional[MessageRepository] = None, 696 connector_state_manager: Optional[ConnectorStateManager] = None, 697 max_concurrent_async_job_count: Optional[int] = None, 698 configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, 699 api_budget: Optional[APIBudget] = None, 700 ): 701 self._init_mappings() 702 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 703 self._limit_slices_fetched = limit_slices_fetched 704 self._emit_connector_builder_messages = emit_connector_builder_messages 705 self._disable_retries = disable_retries 706 self._disable_cache = disable_cache 707 self._message_repository = message_repository or InMemoryMessageRepository( 708 self._evaluate_log_level(emit_connector_builder_messages) 709 ) 710 self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream( 711 configured_catalog 712 ) 713 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 714 self._api_budget: Optional[Union[APIBudget]] = api_budget 715 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 716 # placeholder for deprecation warnings 717 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 718 719 def _init_mappings(self) -> None: 720 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 721 AddedFieldDefinitionModel: self.create_added_field_definition, 722 AddFieldsModel: self.create_add_fields, 723 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 724 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 725 BearerAuthenticatorModel: 
self.create_bearer_authenticator, 726 CheckStreamModel: self.create_check_stream, 727 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 728 CheckDynamicStreamModel: self.create_check_dynamic_stream, 729 CompositeErrorHandlerModel: self.create_composite_error_handler, 730 ConcurrencyLevelModel: self.create_concurrency_level, 731 ConfigMigrationModel: self.create_config_migration, 732 ConfigAddFieldsModel: self.create_config_add_fields, 733 ConfigRemapFieldModel: self.create_config_remap_field, 734 ConfigRemoveFieldsModel: self.create_config_remove_fields, 735 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 736 CsvDecoderModel: self.create_csv_decoder, 737 CursorPaginationModel: self.create_cursor_pagination, 738 CustomAuthenticatorModel: self.create_custom_component, 739 CustomBackoffStrategyModel: self.create_custom_component, 740 CustomDecoderModel: self.create_custom_component, 741 CustomErrorHandlerModel: self.create_custom_component, 742 CustomRecordExtractorModel: self.create_custom_component, 743 CustomRecordFilterModel: self.create_custom_component, 744 CustomRequesterModel: self.create_custom_component, 745 CustomRetrieverModel: self.create_custom_component, 746 CustomSchemaLoader: self.create_custom_component, 747 CustomSchemaNormalizationModel: self.create_custom_component, 748 CustomStateMigration: self.create_custom_component, 749 CustomPaginationStrategyModel: self.create_custom_component, 750 CustomPartitionRouterModel: self.create_custom_component, 751 CustomTransformationModel: self.create_custom_component, 752 CustomValidationStrategyModel: self.create_custom_component, 753 CustomConfigTransformationModel: self.create_custom_component, 754 DeclarativeStreamModel: self.create_default_stream, 755 DefaultErrorHandlerModel: self.create_default_error_handler, 756 DefaultPaginatorModel: self.create_default_paginator, 757 DpathExtractorModel: self.create_dpath_extractor, 758 DpathValidatorModel: 
self.create_dpath_validator, 759 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 760 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 761 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 762 GroupByKeyMergeStrategyModel: self.create_group_by_key, 763 HttpRequesterModel: self.create_http_requester, 764 HttpResponseFilterModel: self.create_http_response_filter, 765 InlineSchemaLoaderModel: self.create_inline_schema_loader, 766 JsonDecoderModel: self.create_json_decoder, 767 JsonlDecoderModel: self.create_jsonl_decoder, 768 JsonSchemaPropertySelectorModel: self.create_json_schema_property_selector, 769 GzipDecoderModel: self.create_gzip_decoder, 770 KeysToLowerModel: self.create_keys_to_lower_transformation, 771 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 772 KeysReplaceModel: self.create_keys_replace_transformation, 773 FlattenFieldsModel: self.create_flatten_fields, 774 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 775 IterableDecoderModel: self.create_iterable_decoder, 776 XmlDecoderModel: self.create_xml_decoder, 777 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 778 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 779 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 780 TypesMapModel: self.create_types_map, 781 ComplexFieldTypeModel: self.create_complex_field_type, 782 JwtAuthenticatorModel: self.create_jwt_authenticator, 783 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 784 ListPartitionRouterModel: self.create_list_partition_router, 785 MinMaxDatetimeModel: self.create_min_max_datetime, 786 NoAuthModel: self.create_no_auth, 787 NoPaginationModel: self.create_no_pagination, 788 OAuthAuthenticatorModel: self.create_oauth_authenticator, 789 OffsetIncrementModel: self.create_offset_increment, 790 PageIncrementModel: self.create_page_increment, 791 ParentStreamConfigModel: 
self.create_parent_stream_config_with_substream_wrapper, 792 PredicateValidatorModel: self.create_predicate_validator, 793 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 794 PropertyChunkingModel: self.create_property_chunking, 795 QueryPropertiesModel: self.create_query_properties, 796 RecordExpanderModel: self.create_record_expander, 797 RecordFilterModel: self.create_record_filter, 798 RecordSelectorModel: self.create_record_selector, 799 RemoveFieldsModel: self.create_remove_fields, 800 RequestPathModel: self.create_request_path, 801 RequestOptionModel: self.create_request_option, 802 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 803 SelectiveAuthenticatorModel: self.create_selective_authenticator, 804 SimpleRetrieverModel: self.create_simple_retriever, 805 StateDelegatingStreamModel: self.create_state_delegating_stream, 806 SpecModel: self.create_spec, 807 SubstreamPartitionRouterModel: self.create_substream_partition_router, 808 ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema, 809 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 810 WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, 811 AsyncRetrieverModel: self.create_async_retriever, 812 HttpComponentsResolverModel: self.create_http_components_resolver, 813 ConfigComponentsResolverModel: self.create_config_components_resolver, 814 ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver, 815 StreamConfigModel: self.create_stream_config, 816 ComponentMappingDefinitionModel: self.create_components_mapping_definition, 817 ZipfileDecoderModel: self.create_zipfile_decoder, 818 HTTPAPIBudgetModel: self.create_http_api_budget, 819 FileUploaderModel: self.create_file_uploader, 820 FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, 821 MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, 822 UnlimitedCallRatePolicyModel: 
self.create_unlimited_call_rate_policy, 823 RateModel: self.create_rate, 824 HttpRequestRegexMatcherModel: self.create_http_request_matcher, 825 GroupingPartitionRouterModel: self.create_grouping_partition_router, 826 } 827 828 # Needed for the case where we need to perform a second parse on the fields of a custom component 829 self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR} 830 831 @staticmethod 832 def _create_stream_name_to_configured_stream( 833 configured_catalog: Optional[ConfiguredAirbyteCatalog], 834 ) -> Mapping[str, ConfiguredAirbyteStream]: 835 return ( 836 {stream.stream.name: stream for stream in configured_catalog.streams} 837 if configured_catalog 838 else {} 839 ) 840 841 def create_component( 842 self, 843 model_type: Type[BaseModel], 844 component_definition: ComponentDefinition, 845 config: Config, 846 **kwargs: Any, 847 ) -> Any: 848 """ 849 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 850 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 851 creating declarative components from that model. 
852 853 :param model_type: The type of declarative component that is being initialized 854 :param component_definition: The mapping that represents a declarative component 855 :param config: The connector config that is provided by the customer 856 :return: The declarative component to be used at runtime 857 """ 858 859 component_type = component_definition.get("type") 860 if component_definition.get("type") != model_type.__name__: 861 raise ValueError( 862 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 863 ) 864 865 declarative_component_model = model_type.parse_obj(component_definition) 866 867 if not isinstance(declarative_component_model, model_type): 868 raise ValueError( 869 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 870 ) 871 872 return self._create_component_from_model( 873 model=declarative_component_model, config=config, **kwargs 874 ) 875 876 def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: 877 if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: 878 raise ValueError( 879 f"{model.__class__} with attributes {model} is not a valid component type" 880 ) 881 component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) 882 if not component_constructor: 883 raise ValueError(f"Could not find constructor for {model.__class__}") 884 885 # collect deprecation warnings for supported models. 886 if isinstance(model, BaseModelWithDeprecations): 887 self._collect_model_deprecations(model) 888 889 return component_constructor(model=model, config=config, **kwargs) 890 891 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 892 """ 893 Returns the deprecation warnings that were collected during the creation of components. 
894 """ 895 return self._collected_deprecation_logs 896 897 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 898 """ 899 Collects deprecation logs from the given model and appends any new logs to the internal collection. 900 901 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 902 903 Args: 904 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 905 """ 906 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 907 for log in model._deprecation_logs: 908 # avoid duplicates for deprecation logs observed. 909 if log not in self._collected_deprecation_logs: 910 self._collected_deprecation_logs.append(log) 911 912 def create_config_migration( 913 self, model: ConfigMigrationModel, config: Config 914 ) -> ConfigMigration: 915 transformations: List[ConfigTransformation] = [ 916 self._create_component_from_model(transformation, config) 917 for transformation in model.transformations 918 ] 919 920 return ConfigMigration( 921 description=model.description, 922 transformations=transformations, 923 ) 924 925 def create_config_add_fields( 926 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 927 ) -> ConfigAddFields: 928 fields = [self._create_component_from_model(field, config) for field in model.fields] 929 return ConfigAddFields( 930 fields=fields, 931 condition=model.condition or "", 932 ) 933 934 @staticmethod 935 def create_config_remove_fields( 936 model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any 937 ) -> ConfigRemoveFields: 938 return ConfigRemoveFields( 939 field_pointers=model.field_pointers, 940 condition=model.condition or "", 
941 ) 942 943 @staticmethod 944 def create_config_remap_field( 945 model: ConfigRemapFieldModel, config: Config, **kwargs: Any 946 ) -> ConfigRemapField: 947 mapping = cast(Mapping[str, Any], model.map) 948 return ConfigRemapField( 949 map=mapping, 950 field_path=model.field_path, 951 config=config, 952 ) 953 954 def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator: 955 strategy = self._create_component_from_model(model.validation_strategy, config) 956 957 return DpathValidator( 958 field_path=model.field_path, 959 strategy=strategy, 960 ) 961 962 def create_predicate_validator( 963 self, model: PredicateValidatorModel, config: Config 964 ) -> PredicateValidator: 965 strategy = self._create_component_from_model(model.validation_strategy, config) 966 967 return PredicateValidator( 968 value=model.value, 969 strategy=strategy, 970 ) 971 972 @staticmethod 973 def create_validate_adheres_to_schema( 974 model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any 975 ) -> ValidateAdheresToSchema: 976 base_schema = cast(Mapping[str, Any], model.base_schema) 977 return ValidateAdheresToSchema( 978 schema=base_schema, 979 ) 980 981 @staticmethod 982 def create_added_field_definition( 983 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 984 ) -> AddedFieldDefinition: 985 interpolated_value = InterpolatedString.create( 986 model.value, parameters=model.parameters or {} 987 ) 988 return AddedFieldDefinition( 989 path=model.path, 990 value=interpolated_value, 991 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 992 parameters=model.parameters or {}, 993 ) 994 995 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 996 added_field_definitions = [ 997 self._create_component_from_model( 998 model=added_field_definition_model, 999 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 1000 added_field_definition_model.value_type 
1001 ), 1002 config=config, 1003 ) 1004 for added_field_definition_model in model.fields 1005 ] 1006 return AddFields( 1007 fields=added_field_definitions, 1008 condition=model.condition or "", 1009 parameters=model.parameters or {}, 1010 ) 1011 1012 def create_keys_to_lower_transformation( 1013 self, model: KeysToLowerModel, config: Config, **kwargs: Any 1014 ) -> KeysToLowerTransformation: 1015 return KeysToLowerTransformation() 1016 1017 def create_keys_to_snake_transformation( 1018 self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any 1019 ) -> KeysToSnakeCaseTransformation: 1020 return KeysToSnakeCaseTransformation() 1021 1022 def create_keys_replace_transformation( 1023 self, model: KeysReplaceModel, config: Config, **kwargs: Any 1024 ) -> KeysReplaceTransformation: 1025 return KeysReplaceTransformation( 1026 old=model.old, new=model.new, parameters=model.parameters or {} 1027 ) 1028 1029 def create_flatten_fields( 1030 self, model: FlattenFieldsModel, config: Config, **kwargs: Any 1031 ) -> FlattenFields: 1032 return FlattenFields( 1033 flatten_lists=model.flatten_lists if model.flatten_lists is not None else True 1034 ) 1035 1036 def create_dpath_flatten_fields( 1037 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 1038 ) -> DpathFlattenFields: 1039 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 1040 key_transformation = ( 1041 KeyTransformation( 1042 config=config, 1043 prefix=model.key_transformation.prefix, 1044 suffix=model.key_transformation.suffix, 1045 parameters=model.parameters or {}, 1046 ) 1047 if model.key_transformation is not None 1048 else None 1049 ) 1050 return DpathFlattenFields( 1051 config=config, 1052 field_path=model_field_path, 1053 delete_origin_value=model.delete_origin_value 1054 if model.delete_origin_value is not None 1055 else False, 1056 replace_record=model.replace_record if model.replace_record is not None else False, 1057 
key_transformation=key_transformation, 1058 parameters=model.parameters or {}, 1059 ) 1060 1061 @staticmethod 1062 def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]: 1063 if not value_type: 1064 return None 1065 names_to_types = { 1066 ValueType.string: str, 1067 ValueType.number: float, 1068 ValueType.integer: int, 1069 ValueType.boolean: bool, 1070 } 1071 return names_to_types[value_type] 1072 1073 def create_api_key_authenticator( 1074 self, 1075 model: ApiKeyAuthenticatorModel, 1076 config: Config, 1077 token_provider: Optional[TokenProvider] = None, 1078 **kwargs: Any, 1079 ) -> ApiKeyAuthenticator: 1080 if model.inject_into is None and model.header is None: 1081 raise ValueError( 1082 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1083 ) 1084 1085 if model.inject_into is not None and model.header is not None: 1086 raise ValueError( 1087 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1088 ) 1089 1090 if token_provider is not None and model.api_token != "": 1091 raise ValueError( 1092 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1093 ) 1094 1095 request_option = ( 1096 self._create_component_from_model( 1097 model.inject_into, config, parameters=model.parameters or {} 1098 ) 1099 if model.inject_into 1100 else RequestOption( 1101 inject_into=RequestOptionType.header, 1102 field_name=model.header or "", 1103 parameters=model.parameters or {}, 1104 ) 1105 ) 1106 1107 return ApiKeyAuthenticator( 1108 token_provider=( 1109 token_provider 1110 if token_provider is not None 1111 else InterpolatedStringTokenProvider( 1112 api_token=model.api_token or "", 1113 config=config, 1114 parameters=model.parameters or {}, 1115 ) 1116 ), 1117 request_option=request_option, 1118 config=config, 1119 parameters=model.parameters or {}, 1120 ) 1121 1122 def create_legacy_to_per_partition_state_migration( 1123 self, 1124 model: LegacyToPerPartitionStateMigrationModel, 1125 config: Mapping[str, Any], 1126 declarative_stream: DeclarativeStreamModel, 1127 ) -> LegacyToPerPartitionStateMigration: 1128 retriever = declarative_stream.retriever 1129 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1130 raise ValueError( 1131 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1132 ) 1133 partition_router = retriever.partition_router 1134 if not isinstance( 1135 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1136 ): 1137 raise ValueError( 1138 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1139 ) 1140 if not hasattr(partition_router, "parent_stream_configs"): 1141 raise ValueError( 1142 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1143 ) 1144 1145 if not hasattr(declarative_stream, "incremental_sync"): 1146 raise ValueError( 1147 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 
1148 ) 1149 1150 return LegacyToPerPartitionStateMigration( 1151 partition_router, # type: ignore # was already checked above 1152 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 1153 config, 1154 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1155 ) 1156 1157 def create_session_token_authenticator( 1158 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1159 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1160 decoder = ( 1161 self._create_component_from_model(model=model.decoder, config=config) 1162 if model.decoder 1163 else JsonDecoder(parameters={}) 1164 ) 1165 login_requester = self._create_component_from_model( 1166 model=model.login_requester, 1167 config=config, 1168 name=f"{name}_login_requester", 1169 decoder=decoder, 1170 ) 1171 token_provider = SessionTokenProvider( 1172 login_requester=login_requester, 1173 session_token_path=model.session_token_path, 1174 expiration_duration=parse_duration(model.expiration_duration) 1175 if model.expiration_duration 1176 else None, 1177 parameters=model.parameters or {}, 1178 message_repository=self._message_repository, 1179 decoder=decoder, 1180 ) 1181 if model.request_authentication.type == "Bearer": 1182 return ModelToComponentFactory.create_bearer_authenticator( 1183 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1184 config, 1185 token_provider=token_provider, 1186 ) 1187 else: 1188 # Get the api_token template if specified, default to just the session token 1189 api_token_template = ( 1190 getattr(model.request_authentication, "api_token", None) or "{{ session_token }}" 1191 ) 1192 final_token_provider: TokenProvider = InterpolatedSessionTokenProvider( 1193 config=config, 1194 api_token=api_token_template, 1195 session_token_provider=token_provider, 
1196 parameters=model.parameters or {}, 1197 ) 1198 return self.create_api_key_authenticator( 1199 ApiKeyAuthenticatorModel( 1200 type="ApiKeyAuthenticator", 1201 api_token="", 1202 inject_into=model.request_authentication.inject_into, 1203 ), # type: ignore # $parameters and headers default to None 1204 config=config, 1205 token_provider=final_token_provider, 1206 ) 1207 1208 @staticmethod 1209 def create_basic_http_authenticator( 1210 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1211 ) -> BasicHttpAuthenticator: 1212 return BasicHttpAuthenticator( 1213 password=model.password or "", 1214 username=model.username, 1215 config=config, 1216 parameters=model.parameters or {}, 1217 ) 1218 1219 @staticmethod 1220 def create_bearer_authenticator( 1221 model: BearerAuthenticatorModel, 1222 config: Config, 1223 token_provider: Optional[TokenProvider] = None, 1224 **kwargs: Any, 1225 ) -> BearerAuthenticator: 1226 if token_provider is not None and model.api_token != "": 1227 raise ValueError( 1228 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1229 ) 1230 return BearerAuthenticator( 1231 token_provider=( 1232 token_provider 1233 if token_provider is not None 1234 else InterpolatedStringTokenProvider( 1235 api_token=model.api_token or "", 1236 config=config, 1237 parameters=model.parameters or {}, 1238 ) 1239 ), 1240 config=config, 1241 parameters=model.parameters or {}, 1242 ) 1243 1244 @staticmethod 1245 def create_dynamic_stream_check_config( 1246 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1247 ) -> DynamicStreamCheckConfig: 1248 return DynamicStreamCheckConfig( 1249 dynamic_stream_name=model.dynamic_stream_name, 1250 stream_count=model.stream_count, 1251 ) 1252 1253 def create_check_stream( 1254 self, model: CheckStreamModel, config: Config, **kwargs: Any 1255 ) -> CheckStream: 1256 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1257 raise ValueError( 1258 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1259 ) 1260 1261 dynamic_streams_check_configs = ( 1262 [ 1263 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1264 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1265 ] 1266 if model.dynamic_streams_check_configs 1267 else [] 1268 ) 1269 1270 return CheckStream( 1271 stream_names=model.stream_names or [], 1272 dynamic_streams_check_configs=dynamic_streams_check_configs, 1273 parameters={}, 1274 ) 1275 1276 @staticmethod 1277 def create_check_dynamic_stream( 1278 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1279 ) -> CheckDynamicStream: 1280 assert model.use_check_availability is not None # for mypy 1281 1282 use_check_availability = model.use_check_availability 1283 1284 return CheckDynamicStream( 1285 stream_count=model.stream_count, 1286 use_check_availability=use_check_availability, 1287 parameters={}, 1288 ) 1289 1290 def create_composite_error_handler( 1291 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: 
Any 1292 ) -> CompositeErrorHandler: 1293 error_handlers = [ 1294 self._create_component_from_model(model=error_handler_model, config=config) 1295 for error_handler_model in model.error_handlers 1296 ] 1297 return CompositeErrorHandler( 1298 error_handlers=error_handlers, parameters=model.parameters or {} 1299 ) 1300 1301 @staticmethod 1302 def create_concurrency_level( 1303 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1304 ) -> ConcurrencyLevel: 1305 return ConcurrencyLevel( 1306 default_concurrency=model.default_concurrency, 1307 max_concurrency=model.max_concurrency, 1308 config=config, 1309 parameters={}, 1310 ) 1311 1312 @staticmethod 1313 def apply_stream_state_migrations( 1314 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1315 ) -> MutableMapping[str, Any]: 1316 if stream_state_migrations: 1317 for state_migration in stream_state_migrations: 1318 if state_migration.should_migrate(stream_state): 1319 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
1320 stream_state = dict(state_migration.migrate(stream_state)) 1321 return stream_state 1322 1323 def create_concurrent_cursor_from_datetime_based_cursor( 1324 self, 1325 model_type: Type[BaseModel], 1326 component_definition: ComponentDefinition, 1327 stream_name: str, 1328 stream_namespace: Optional[str], 1329 stream_state: MutableMapping[str, Any], 1330 config: Config, 1331 message_repository: Optional[MessageRepository] = None, 1332 runtime_lookback_window: Optional[datetime.timedelta] = None, 1333 **kwargs: Any, 1334 ) -> ConcurrentCursor: 1335 component_type = component_definition.get("type") 1336 if component_definition.get("type") != model_type.__name__: 1337 raise ValueError( 1338 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1339 ) 1340 1341 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1342 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1343 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1344 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 
1345 if "$parameters" not in component_definition and "parameters" in component_definition: 1346 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1347 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1348 1349 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1350 raise ValueError( 1351 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1352 ) 1353 1354 model_parameters = datetime_based_cursor_model.parameters or {} 1355 1356 cursor_field = self._get_catalog_defined_cursor_field( 1357 stream_name=stream_name, 1358 allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1359 or False, 1360 ) 1361 1362 if not cursor_field: 1363 interpolated_cursor_field = InterpolatedString.create( 1364 datetime_based_cursor_model.cursor_field, 1365 parameters=model_parameters, 1366 ) 1367 cursor_field = CursorField( 1368 cursor_field_key=interpolated_cursor_field.eval(config=config), 1369 supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1370 or False, 1371 ) 1372 1373 interpolated_partition_field_start = InterpolatedString.create( 1374 datetime_based_cursor_model.partition_field_start or "start_time", 1375 parameters=model_parameters, 1376 ) 1377 interpolated_partition_field_end = InterpolatedString.create( 1378 datetime_based_cursor_model.partition_field_end or "end_time", 1379 parameters=model_parameters, 1380 ) 1381 1382 slice_boundary_fields = ( 1383 interpolated_partition_field_start.eval(config=config), 1384 interpolated_partition_field_end.eval(config=config), 1385 ) 1386 1387 datetime_format = datetime_based_cursor_model.datetime_format 1388 1389 cursor_granularity = ( 1390 parse_duration(datetime_based_cursor_model.cursor_granularity) 1391 if datetime_based_cursor_model.cursor_granularity 1392 else None 1393 ) 1394 1395 
lookback_window = None 1396 interpolated_lookback_window = ( 1397 InterpolatedString.create( 1398 datetime_based_cursor_model.lookback_window, 1399 parameters=model_parameters, 1400 ) 1401 if datetime_based_cursor_model.lookback_window 1402 else None 1403 ) 1404 if interpolated_lookback_window: 1405 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1406 if evaluated_lookback_window: 1407 lookback_window = parse_duration(evaluated_lookback_window) 1408 1409 connector_state_converter: DateTimeStreamStateConverter 1410 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1411 datetime_format=datetime_format, 1412 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1413 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1414 cursor_granularity=cursor_granularity, 1415 ) 1416 1417 # Adjusts the stream state by applying the runtime lookback window. 1418 # This is used to ensure correct state handling in case of failed partitions. 
1419 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1420 if runtime_lookback_window and stream_state_value: 1421 new_stream_state = ( 1422 connector_state_converter.parse_timestamp(stream_state_value) 1423 - runtime_lookback_window 1424 ) 1425 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1426 new_stream_state 1427 ) 1428 1429 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1430 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1431 start_date_runtime_value = self.create_min_max_datetime( 1432 model=datetime_based_cursor_model.start_datetime, config=config 1433 ) 1434 else: 1435 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1436 1437 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1438 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1439 end_date_runtime_value = self.create_min_max_datetime( 1440 model=datetime_based_cursor_model.end_datetime, config=config 1441 ) 1442 else: 1443 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1444 1445 interpolated_start_date = MinMaxDatetime.create( 1446 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1447 parameters=datetime_based_cursor_model.parameters, 1448 ) 1449 interpolated_end_date = ( 1450 None 1451 if not end_date_runtime_value 1452 else MinMaxDatetime.create( 1453 end_date_runtime_value, datetime_based_cursor_model.parameters 1454 ) 1455 ) 1456 1457 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1458 if not interpolated_start_date.datetime_format: 1459 interpolated_start_date.datetime_format = datetime_format 1460 if interpolated_end_date and not interpolated_end_date.datetime_format: 1461 interpolated_end_date.datetime_format = datetime_format 1462 1463 start_date = interpolated_start_date.get_datetime(config=config) 1464 
end_date_provider = ( 1465 partial(interpolated_end_date.get_datetime, config) 1466 if interpolated_end_date 1467 else connector_state_converter.get_end_provider() 1468 ) 1469 1470 if ( 1471 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1472 ) or ( 1473 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1474 ): 1475 raise ValueError( 1476 f"If step is defined, cursor_granularity should be as well and vice-versa. " 1477 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1478 ) 1479 1480 # When step is not defined, default to a step size from the starting date to the present moment 1481 step_length = datetime.timedelta.max 1482 interpolated_step = ( 1483 InterpolatedString.create( 1484 datetime_based_cursor_model.step, 1485 parameters=model_parameters, 1486 ) 1487 if datetime_based_cursor_model.step 1488 else None 1489 ) 1490 if interpolated_step: 1491 evaluated_step = interpolated_step.eval(config) 1492 if evaluated_step: 1493 step_length = parse_duration(evaluated_step) 1494 1495 clamping_strategy: ClampingStrategy = NoClamping() 1496 if datetime_based_cursor_model.clamping: 1497 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1498 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1499 # object which we want to keep agnostic of being low-code 1500 target = InterpolatedString( 1501 string=datetime_based_cursor_model.clamping.target, 1502 parameters=model_parameters, 1503 ) 1504 evaluated_target = target.eval(config=config) 1505 match evaluated_target: 1506 case "DAY": 1507 clamping_strategy = DayClampingStrategy() 1508 end_date_provider = ClampingEndProvider( 1509 DayClampingStrategy(is_ceiling=False), 1510 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown 
in existing tests. Confirmed functionality is working in practice 1511 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1512 ) 1513 case "WEEK": 1514 if ( 1515 not datetime_based_cursor_model.clamping.target_details 1516 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1517 ): 1518 raise ValueError( 1519 "Given WEEK clamping, weekday needs to be provided as target_details" 1520 ) 1521 weekday = self._assemble_weekday( 1522 datetime_based_cursor_model.clamping.target_details["weekday"] 1523 ) 1524 clamping_strategy = WeekClampingStrategy(weekday) 1525 end_date_provider = ClampingEndProvider( 1526 WeekClampingStrategy(weekday, is_ceiling=False), 1527 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1528 granularity=cursor_granularity or datetime.timedelta(days=1), 1529 ) 1530 case "MONTH": 1531 clamping_strategy = MonthClampingStrategy() 1532 end_date_provider = ClampingEndProvider( 1533 MonthClampingStrategy(is_ceiling=False), 1534 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1535 granularity=cursor_granularity or datetime.timedelta(days=1), 1536 ) 1537 case _: 1538 raise ValueError( 1539 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1540 ) 1541 1542 return ConcurrentCursor( 1543 stream_name=stream_name, 1544 stream_namespace=stream_namespace, 1545 stream_state=stream_state, 1546 message_repository=message_repository or self._message_repository, 1547 connector_state_manager=self._connector_state_manager, 1548 connector_state_converter=connector_state_converter, 1549 cursor_field=cursor_field, 1550 slice_boundary_fields=slice_boundary_fields, 1551 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1552 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1553 lookback_window=lookback_window, 1554 slice_range=step_length, 1555 cursor_granularity=cursor_granularity, 1556 clamping_strategy=clamping_strategy, 1557 ) 1558 1559 def create_concurrent_cursor_from_incrementing_count_cursor( 1560 self, 1561 model_type: Type[BaseModel], 1562 component_definition: ComponentDefinition, 1563 stream_name: str, 1564 stream_namespace: Optional[str], 1565 stream_state: MutableMapping[str, Any], 1566 config: Config, 1567 message_repository: Optional[MessageRepository] = None, 1568 **kwargs: Any, 1569 ) -> ConcurrentCursor: 1570 component_type = component_definition.get("type") 1571 if component_definition.get("type") != model_type.__name__: 1572 raise ValueError( 1573 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1574 ) 1575 1576 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1577 1578 if not isinstance(incrementing_count_cursor_model, 
IncrementingCountCursorModel): 1579 raise ValueError( 1580 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1581 ) 1582 1583 start_value: Union[int, str, None] = incrementing_count_cursor_model.start_value 1584 # Pydantic Union type coercion can convert int 0 to string '0' depending on Union order. 1585 # We need to handle both int and str representations of numeric values. 1586 # Evaluate the InterpolatedString and convert to int for the ConcurrentCursor. 1587 if start_value is not None: 1588 interpolated_start_value = InterpolatedString.create( 1589 str(start_value), # Ensure we pass a string to InterpolatedString.create 1590 parameters=incrementing_count_cursor_model.parameters or {}, 1591 ) 1592 evaluated_start_value: int = int(interpolated_start_value.eval(config=config)) 1593 else: 1594 evaluated_start_value = 0 1595 1596 cursor_field = self._get_catalog_defined_cursor_field( 1597 stream_name=stream_name, 1598 allow_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field 1599 or False, 1600 ) 1601 1602 if not cursor_field: 1603 interpolated_cursor_field = InterpolatedString.create( 1604 incrementing_count_cursor_model.cursor_field, 1605 parameters=incrementing_count_cursor_model.parameters or {}, 1606 ) 1607 cursor_field = CursorField( 1608 cursor_field_key=interpolated_cursor_field.eval(config=config), 1609 supports_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field 1610 or False, 1611 ) 1612 1613 connector_state_converter = IncrementingCountStreamStateConverter( 1614 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1615 ) 1616 1617 return ConcurrentCursor( 1618 stream_name=stream_name, 1619 stream_namespace=stream_namespace, 1620 stream_state=stream_state, 1621 message_repository=message_repository or self._message_repository, 1622 
connector_state_manager=self._connector_state_manager, 1623 connector_state_converter=connector_state_converter, 1624 cursor_field=cursor_field, 1625 slice_boundary_fields=None, 1626 start=evaluated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1627 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1628 ) 1629 1630 def _assemble_weekday(self, weekday: str) -> Weekday: 1631 match weekday: 1632 case "MONDAY": 1633 return Weekday.MONDAY 1634 case "TUESDAY": 1635 return Weekday.TUESDAY 1636 case "WEDNESDAY": 1637 return Weekday.WEDNESDAY 1638 case "THURSDAY": 1639 return Weekday.THURSDAY 1640 case "FRIDAY": 1641 return Weekday.FRIDAY 1642 case "SATURDAY": 1643 return Weekday.SATURDAY 1644 case "SUNDAY": 1645 return Weekday.SUNDAY 1646 case _: 1647 raise ValueError(f"Unknown weekday {weekday}") 1648 1649 def create_concurrent_cursor_from_perpartition_cursor( 1650 self, 1651 state_manager: ConnectorStateManager, 1652 model_type: Type[BaseModel], 1653 component_definition: ComponentDefinition, 1654 stream_name: str, 1655 stream_namespace: Optional[str], 1656 config: Config, 1657 stream_state: MutableMapping[str, Any], 1658 partition_router: PartitionRouter, 1659 attempt_to_create_cursor_if_not_provided: bool = False, 1660 **kwargs: Any, 1661 ) -> ConcurrentPerPartitionCursor: 1662 component_type = component_definition.get("type") 1663 if component_definition.get("type") != model_type.__name__: 1664 raise ValueError( 1665 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1666 ) 1667 1668 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. 
This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1669 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1670 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1671 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1672 if "$parameters" not in component_definition and "parameters" in component_definition: 1673 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1674 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1675 1676 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1677 raise ValueError( 1678 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1679 ) 1680 1681 cursor_field = self._get_catalog_defined_cursor_field( 1682 stream_name=stream_name, 1683 allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1684 or False, 1685 ) 1686 1687 if not cursor_field: 1688 interpolated_cursor_field = InterpolatedString.create( 1689 datetime_based_cursor_model.cursor_field, 1690 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). 
So now we have two cases: 1691 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1692 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1693 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1694 parameters=datetime_based_cursor_model.parameters or {}, 1695 ) 1696 cursor_field = CursorField( 1697 cursor_field_key=interpolated_cursor_field.eval(config=config), 1698 supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1699 or False, 1700 ) 1701 1702 datetime_format = datetime_based_cursor_model.datetime_format 1703 1704 cursor_granularity = ( 1705 parse_duration(datetime_based_cursor_model.cursor_granularity) 1706 if datetime_based_cursor_model.cursor_granularity 1707 else None 1708 ) 1709 1710 connector_state_converter: DateTimeStreamStateConverter 1711 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1712 datetime_format=datetime_format, 1713 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1714 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1715 cursor_granularity=cursor_granularity, 1716 ) 1717 1718 # Create the cursor factory 1719 cursor_factory = ConcurrentCursorFactory( 1720 partial( 1721 self.create_concurrent_cursor_from_datetime_based_cursor, 1722 state_manager=state_manager, 1723 model_type=model_type, 1724 component_definition=component_definition, 1725 stream_name=stream_name, 1726 stream_namespace=stream_namespace, 1727 config=config, 1728 message_repository=NoopMessageRepository(), 1729 ) 1730 ) 1731 1732 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1733 use_global_cursor = isinstance( 1734 partition_router, GroupingPartitionRouter 1735 ) or 
component_definition.get("global_substream_cursor", False) 1736 1737 # Return the concurrent cursor and state converter 1738 return ConcurrentPerPartitionCursor( 1739 cursor_factory=cursor_factory, 1740 partition_router=partition_router, 1741 stream_name=stream_name, 1742 stream_namespace=stream_namespace, 1743 stream_state=stream_state, 1744 message_repository=self._message_repository, # type: ignore 1745 connector_state_manager=state_manager, 1746 connector_state_converter=connector_state_converter, 1747 cursor_field=cursor_field, 1748 use_global_cursor=use_global_cursor, 1749 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1750 ) 1751 1752 @staticmethod 1753 def create_constant_backoff_strategy( 1754 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1755 ) -> ConstantBackoffStrategy: 1756 ModelToComponentFactory._validate_jitter_range(model.jitter_range_in_seconds) 1757 return ConstantBackoffStrategy( 1758 backoff_time_in_seconds=model.backoff_time_in_seconds, 1759 jitter_range_in_seconds=model.jitter_range_in_seconds, 1760 config=config, 1761 parameters=model.parameters or {}, 1762 ) 1763 1764 @staticmethod 1765 def _validate_jitter_range(jitter_range_in_seconds: Optional[float]) -> None: 1766 if jitter_range_in_seconds is not None and jitter_range_in_seconds < 0: 1767 raise ValueError("jitter_range_in_seconds must be greater than or equal to 0") 1768 1769 def create_cursor_pagination( 1770 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1771 ) -> CursorPaginationStrategy: 1772 if isinstance(decoder, PaginationDecoderDecorator): 1773 inner_decoder = decoder.decoder 1774 else: 1775 inner_decoder = decoder 1776 decoder = PaginationDecoderDecorator(decoder=decoder) 1777 1778 if self._is_supported_decoder_for_pagination(inner_decoder): 1779 decoder_to_use = decoder 1780 else: 1781 raise ValueError( 1782 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 
def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any:
    """
    Generically creates a custom component from the model and the Python class referenced by its class_name.
    Only the arguments that match an annotated field of the custom class are passed to its constructor.

    :param model: The Pydantic model of the custom component being created
    :param config: The custom defined connector config
    :param kwargs: Arguments handed down by a parent component; they win over the model's own fields on collision
    :return: The declarative component built from the Pydantic model to be used at runtime
    """
    custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name)
    component_fields = get_type_hints(custom_component_class)

    # Later entries override earlier ones: model fields, then the config, then the parent's kwargs.
    model_args = {**model.dict(), "config": config, **kwargs}

    def _infer_missing_type(field_name: str, candidate: Any) -> None:
        # A dict without an explicit "type" gets one derived from the custom class type hints, when possible.
        if (
            not isinstance(candidate, dict)
            or "type" in candidate
            or field_name not in component_fields
        ):
            return
        derived_type = self._derive_component_type_from_type_hints(
            component_fields.get(field_name)
        )
        if derived_type:
            candidate["type"] = derived_type

    # Pydantic cannot parse a custom component's sub-components because their fields and types live in the
    # Python class rather than in the schema. This second pass turns such sub-fields into models and then
    # into declarative components. Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for model_field, model_value in model_args.items():
        _infer_missing_type(model_field, model_value)

        if self._is_component(model_value):
            model_args[model_field] = self._create_nested_component(
                model,
                model_field,
                model_value,
                config,
                **kwargs,
            )
        elif isinstance(model_value, list):
            built_items = []
            for item in model_value:
                _infer_missing_type(model_field, item)
                if self._is_component(item):
                    item = self._create_nested_component(
                        model,
                        model_field,
                        item,
                        config,
                        **kwargs,
                    )
                built_items.append(item)
            model_args[model_field] = built_items

    constructor_args = {
        class_field: model_args[class_field]
        for class_field in component_fields
        if class_field in model_args
    }

    # Fall back to the factory-wide API budget when the class accepts one and none was supplied.
    if "api_budget" in component_fields and constructor_args.get("api_budget") is None:
        constructor_args["api_budget"] = self._api_budget

    return custom_component_class(**constructor_args)
Pydantic can only parse down to 1820 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1821 for model_field, model_value in model_args.items(): 1822 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1823 if ( 1824 isinstance(model_value, dict) 1825 and "type" not in model_value 1826 and model_field in component_fields 1827 ): 1828 derived_type = self._derive_component_type_from_type_hints( 1829 component_fields.get(model_field) 1830 ) 1831 if derived_type: 1832 model_value["type"] = derived_type 1833 1834 if self._is_component(model_value): 1835 model_args[model_field] = self._create_nested_component( 1836 model, 1837 model_field, 1838 model_value, 1839 config, 1840 **kwargs, 1841 ) 1842 elif isinstance(model_value, list): 1843 vals = [] 1844 for v in model_value: 1845 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1846 derived_type = self._derive_component_type_from_type_hints( 1847 component_fields.get(model_field) 1848 ) 1849 if derived_type: 1850 v["type"] = derived_type 1851 if self._is_component(v): 1852 vals.append( 1853 self._create_nested_component( 1854 model, 1855 model_field, 1856 v, 1857 config, 1858 **kwargs, 1859 ) 1860 ) 1861 else: 1862 vals.append(v) 1863 model_args[model_field] = vals 1864 1865 kwargs = { 1866 class_field: model_args[class_field] 1867 for class_field in component_fields.keys() 1868 if class_field in model_args 1869 } 1870 1871 if "api_budget" in component_fields and kwargs.get("api_budget") is None: 1872 kwargs["api_budget"] = self._api_budget 1873 1874 return custom_component_class(**kwargs) 1875 1876 @staticmethod 1877 def _get_class_from_fully_qualified_class_name( 1878 full_qualified_class_name: str, 1879 ) -> Any: 1880 """Get a class from its fully qualified name. 
1881 1882 If a custom components module is needed, we assume it is already registered - probably 1883 as `source_declarative_manifest.components` or `components`. 1884 1885 Args: 1886 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1887 1888 Returns: 1889 Any: The class object. 1890 1891 Raises: 1892 ValueError: If the class cannot be loaded. 1893 """ 1894 split = full_qualified_class_name.split(".") 1895 module_name_full = ".".join(split[:-1]) 1896 class_name = split[-1] 1897 1898 try: 1899 module_ref = importlib.import_module(module_name_full) 1900 except ModuleNotFoundError as e: 1901 if split[0] == "source_declarative_manifest": 1902 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1903 try: 1904 import os 1905 1906 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1907 module_ref = importlib.import_module( 1908 module_name_with_source_declarative_manifest 1909 ) 1910 except ModuleNotFoundError: 1911 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1912 else: 1913 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1914 1915 try: 1916 return getattr(module_ref, class_name) 1917 except AttributeError as e: 1918 raise ValueError( 1919 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1920 ) from e 1921 1922 @staticmethod 1923 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1924 interface = field_type 1925 while True: 1926 origin = get_origin(interface) 1927 if origin: 1928 # Unnest types until we reach the raw type 1929 # List[T] -> T 1930 # Optional[List[T]] -> T 1931 args = get_args(interface) 1932 interface = args[0] 1933 else: 1934 break 1935 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 
1936 return interface.__name__ 1937 return None 1938 1939 @staticmethod 1940 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1941 if not cls: 1942 return False 1943 return cls.__module__ == "builtins" 1944 1945 @staticmethod 1946 def _extract_missing_parameters(error: TypeError) -> List[str]: 1947 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1948 if parameter_search: 1949 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1950 else: 1951 return [] 1952 1953 def _create_nested_component( 1954 self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any 1955 ) -> Any: 1956 type_name = model_value.get("type", None) 1957 if not type_name: 1958 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1959 return model_value 1960 1961 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1962 if model_type: 1963 parsed_model = model_type.parse_obj(model_value) 1964 try: 1965 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1966 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1967 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1968 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1969 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1970 # are needed by a component and could not be shared. 
1971 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1972 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1973 model_parameters = model_value.get("$parameters", {}) 1974 matching_parameters = { 1975 kwarg: model_parameters[kwarg] 1976 for kwarg in constructor_kwargs 1977 if kwarg in model_parameters 1978 } 1979 matching_kwargs = { 1980 kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs 1981 } 1982 return self._create_component_from_model( 1983 model=parsed_model, config=config, **(matching_parameters | matching_kwargs) 1984 ) 1985 except TypeError as error: 1986 missing_parameters = self._extract_missing_parameters(error) 1987 if missing_parameters: 1988 raise ValueError( 1989 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1990 + ", ".join( 1991 ( 1992 f"{type_name}.$parameters.{parameter}" 1993 for parameter in missing_parameters 1994 ) 1995 ) 1996 ) 1997 raise TypeError( 1998 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1999 ) 2000 else: 2001 raise ValueError( 2002 f"Error creating custom component {model.class_name}. 
Subcomponent creation has not been implemented for '{type_name}'" 2003 ) 2004 2005 @staticmethod 2006 def _is_component(model_value: Any) -> bool: 2007 return isinstance(model_value, dict) and model_value.get("type") is not None 2008 2009 def create_default_stream( 2010 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 2011 ) -> AbstractStream: 2012 primary_key = model.primary_key.__root__ if model.primary_key else None 2013 self._migrate_state(model, config) 2014 2015 partition_router = self._build_stream_slicer_from_partition_router( 2016 model.retriever, 2017 config, 2018 stream_name=model.name, 2019 **kwargs, 2020 ) 2021 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 2022 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 2023 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 2024 2025 end_time_option = ( 2026 self._create_component_from_model( 2027 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 2028 ) 2029 if cursor_model.end_time_option 2030 else None 2031 ) 2032 start_time_option = ( 2033 self._create_component_from_model( 2034 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 2035 ) 2036 if cursor_model.start_time_option 2037 else None 2038 ) 2039 2040 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 2041 start_time_option=start_time_option, 2042 end_time_option=end_time_option, 2043 partition_field_start=cursor_model.partition_field_start, 2044 partition_field_end=cursor_model.partition_field_end, 2045 config=config, 2046 parameters=model.parameters or {}, 2047 ) 2048 request_options_provider = ( 2049 datetime_request_options_provider 2050 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 2051 else PerPartitionRequestOptionsProvider( 2052 partition_router, datetime_request_options_provider 2053 ) 2054 ) 2055 elif 
model.incremental_sync and isinstance( 2056 model.incremental_sync, IncrementingCountCursorModel 2057 ): 2058 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 2059 raise ValueError( 2060 "PerPartition does not support per partition states because switching to global state is time based" 2061 ) 2062 2063 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 2064 2065 start_time_option = ( 2066 self._create_component_from_model( 2067 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2068 config, 2069 parameters=cursor_model.parameters or {}, 2070 ) 2071 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2072 else None 2073 ) 2074 2075 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2076 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2077 partition_field_start = "start" 2078 2079 request_options_provider = DatetimeBasedRequestOptionsProvider( 2080 start_time_option=start_time_option, 2081 partition_field_start=partition_field_start, 2082 config=config, 2083 parameters=model.parameters or {}, 2084 ) 2085 else: 2086 request_options_provider = None 2087 2088 transformations = [] 2089 if model.transformations: 2090 for transformation_model in model.transformations: 2091 transformations.append( 2092 self._create_component_from_model(model=transformation_model, config=config) 2093 ) 2094 file_uploader = None 2095 if model.file_uploader: 2096 file_uploader = self._create_component_from_model( 2097 model=model.file_uploader, config=config 2098 ) 2099 2100 stream_slicer: ConcurrentStreamSlicer = ( 2101 partition_router 2102 if isinstance(concurrent_cursor, FinalStateCursor) 2103 else concurrent_cursor 2104 ) 2105 2106 retriever = self._create_component_from_model( 2107 model=model.retriever, 2108 config=config, 2109 name=model.name, 
2110 primary_key=primary_key, 2111 request_options_provider=request_options_provider, 2112 stream_slicer=stream_slicer, 2113 partition_router=partition_router, 2114 has_stop_condition_cursor=self._is_stop_condition_on_cursor(model), 2115 is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model), 2116 cursor=concurrent_cursor, 2117 transformations=transformations, 2118 file_uploader=file_uploader, 2119 incremental_sync=model.incremental_sync, 2120 ) 2121 if isinstance(retriever, AsyncRetriever): 2122 stream_slicer = retriever.stream_slicer 2123 2124 schema_loader: SchemaLoader 2125 if model.schema_loader and isinstance(model.schema_loader, list): 2126 nested_schema_loaders = [ 2127 self._create_component_from_model(model=nested_schema_loader, config=config) 2128 for nested_schema_loader in model.schema_loader 2129 ] 2130 schema_loader = CompositeSchemaLoader( 2131 schema_loaders=nested_schema_loaders, parameters={} 2132 ) 2133 elif model.schema_loader: 2134 schema_loader = self._create_component_from_model( 2135 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2136 config=config, 2137 ) 2138 else: 2139 options = model.parameters or {} 2140 if "name" not in options: 2141 options["name"] = model.name 2142 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2143 schema_loader = CachingSchemaLoaderDecorator(schema_loader) 2144 2145 stream_name = model.name or "" 2146 return DefaultStream( 2147 partition_generator=StreamSlicerPartitionGenerator( 2148 DeclarativePartitionFactory( 2149 stream_name, 2150 schema_loader, 2151 retriever, 2152 self._message_repository, 2153 ), 2154 stream_slicer, 2155 slice_limit=self._limit_slices_fetched, 2156 ), 2157 name=stream_name, 2158 json_schema=schema_loader.get_json_schema, 2159 primary_key=get_primary_key_from_stream(primary_key), 2160 cursor_field=( 2161 concurrent_cursor.cursor_field 2162 if 
def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None:
    """Apply the stream's state migrations and store the migrated state in the state manager.

    The state is read and written with ``namespace=None``; when the model declares no migrations,
    an empty migration list is applied.
    """
    stream_name = model.name or ""
    current_state = self._connector_state_manager.get_stream_state(
        stream_name=stream_name, namespace=None
    )
    migrations = (
        [
            self._create_component_from_model(state_migration, config, declarative_stream=model)
            for state_migration in model.state_migrations
        ]
        if model.state_migrations
        else []
    )
    migrated_state = self.apply_stream_state_migrations(migrations, current_state)
    self._connector_state_manager.update_state_for_stream(
        stream_name=stream_name, namespace=None, value=migrated_state
    )


def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool:
    """Return True when the stream's incremental sync has a truthy ``is_data_feed`` attribute."""
    incremental_sync = model.incremental_sync
    return bool(incremental_sync and getattr(incremental_sync, "is_data_feed", None))


def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool:
    """Return True when the stream's incremental sync has a truthy ``is_client_side_incremental`` attribute."""
    incremental_sync = model.incremental_sync
    return bool(
        incremental_sync and getattr(incremental_sync, "is_client_side_incremental", None)
    )


def _build_stream_slicer_from_partition_router(
    self,
    model: Union[
        AsyncRetrieverModel,
        CustomRetrieverModel,
        SimpleRetrieverModel,
    ],
    config: Config,
    stream_name: Optional[str] = None,
    **kwargs: Any,
) -> PartitionRouter:
    """Build the partition router declared on a retriever model.

    :param model: The retriever model that may carry a ``partition_router``
    :param config: The connector config
    :param stream_name: The stream name handed to the created router(s)
    :param kwargs: Forwarded to nested component creation for unparsed (dict) routers
    :return: A CartesianProductStreamSlicer for a list of routers, the single created router,
        or a SinglePartitionRouter when no router is declared
    """
    router_model = (
        model.partition_router
        if hasattr(model, "partition_router")
        and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel))
        else None
    )
    if not router_model:
        return SinglePartitionRouter(parameters={})

    if isinstance(router_model, list):
        return CartesianProductStreamSlicer(
            [
                self._create_component_from_model(
                    model=slicer, config=config, stream_name=stream_name or ""
                )
                for slicer in router_model
            ],
            parameters={},
        )

    if isinstance(router_model, dict):
        # partition router comes from CustomRetrieverModel therefore has not been parsed as a model
        params = router_model.get("$parameters")
        if not isinstance(params, dict):
            params = {}
            router_model["$parameters"] = params

        if stream_name is not None:
            params["stream_name"] = stream_name

        return self._create_nested_component(  # type: ignore[no-any-return] # There is no guarantee that this will return a stream slicer. If not, we expect an AttributeError during the call to `stream_slices`
            model,
            "partition_router",
            router_model,
            config,
            **kwargs,
        )

    return self._create_component_from_model(  # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router
        model=router_model, config=config, stream_name=stream_name or ""
    )
def _build_concurrent_cursor(
    self,
    model: DeclarativeStreamModel,
    stream_slicer: Optional[PartitionRouter],
    config: Config,
) -> Cursor:
    """Select and build the concurrent cursor for a stream.

    :param model: The declarative stream model
    :param stream_slicer: The stream's partition router, if any
    :param config: The connector config
    :return: A per-partition cursor when incremental sync is combined with a partition router other than
        SinglePartitionRouter, an incrementing-count or datetime-based cursor otherwise, or a
        FinalStateCursor when the stream has no incremental sync
    :raises ValueError: If IncrementingCountCursor is combined with a partition router, or if the
        incremental sync type is not supported
    """
    stream_name = model.name or ""
    stream_state = self._connector_state_manager.get_stream_state(stream_name, None)

    if not model.incremental_sync:
        return FinalStateCursor(stream_name, None, self._message_repository)

    if stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter):
        if isinstance(model.incremental_sync, IncrementingCountCursorModel):
            # We don't currently support usage of partition routing and IncrementingCountCursor at the
            # same time because we didn't solve for design questions like what the lookback window would
            # be as well as global cursor fall backs. We have not seen customers that have needed both
            # at the same time yet and are currently punting on this until we need to solve it.
            raise ValueError(
                f"The low-code framework does not currently support usage of a PartitionRouter and an IncrementingCountCursor at the same time. Please specify only one of these options for stream {stream_name}."
            )
        return self.create_concurrent_cursor_from_perpartition_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
            state_manager=self._connector_state_manager,
            model_type=DatetimeBasedCursorModel,
            component_definition=model.incremental_sync.__dict__,
            stream_name=stream_name,
            stream_state=stream_state,
            stream_namespace=None,
            config=config or {},
            partition_router=stream_slicer,
            attempt_to_create_cursor_if_not_provided=True,  # FIXME can we remove that now?
        )

    # Exact type comparison: subclasses of the cursor models are not accepted here.
    incremental_sync_type = type(model.incremental_sync)
    if incremental_sync_type == IncrementingCountCursorModel:
        return self.create_concurrent_cursor_from_incrementing_count_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
            model_type=IncrementingCountCursorModel,
            component_definition=model.incremental_sync.__dict__,
            stream_name=stream_name,
            stream_namespace=None,
            stream_state=stream_state,
            config=config or {},
        )
    if incremental_sync_type == DatetimeBasedCursorModel:
        return self.create_concurrent_cursor_from_datetime_based_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
            model_type=type(model.incremental_sync),
            component_definition=model.incremental_sync.__dict__,
            stream_name=stream_name,
            stream_namespace=None,
            stream_state=stream_state,
            config=config or {},
            attempt_to_create_cursor_if_not_provided=True,
        )
    raise ValueError(
        f"Incremental sync of type {type(model.incremental_sync)} is not supported"
    )
def create_default_error_handler(
    self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any
) -> DefaultErrorHandler:
    """Build a DefaultErrorHandler with its backoff strategies and response filters.

    A default HttpResponseFilter is always appended after the filters declared on the model.
    """
    backoff_strategies = [
        self._create_component_from_model(model=backoff_strategy_model, config=config)
        for backoff_strategy_model in (model.backoff_strategies or [])
    ]

    response_filters = [
        self._create_component_from_model(model=response_filter_model, config=config)
        for response_filter_model in (model.response_filters or [])
    ]
    response_filters.append(
        HttpResponseFilter(config=config, parameters=model.parameters or {})
    )

    return DefaultErrorHandler(
        backoff_strategies=backoff_strategies,
        max_retries=model.max_retries,
        response_filters=response_filters,
        config=config,
        parameters=model.parameters or {},
    )
def create_default_paginator(
    self,
    model: DefaultPaginatorModel,
    config: Config,
    *,
    url_base: str,
    extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
    decoder: Optional[Decoder] = None,
    cursor_used_for_stop_condition: Optional[Cursor] = None,
) -> Union[DefaultPaginator, PaginatorTestReadDecorator]:
    """Build a DefaultPaginator, wrapped in a PaginatorTestReadDecorator when a page limit is configured.

    :param model: The paginator model
    :param config: The connector config
    :param url_base: The base URL handed to the paginator
    :param extractor_model: Forwarded to the pagination strategy creation
    :param decoder: The response decoder; a JsonDecoder is used when none is given
    :param cursor_used_for_stop_condition: When set, the pagination strategy is decorated with a cursor-based stop condition
    :raises ValueError: If the given decoder is not supported for pagination
    """
    if not decoder:
        decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
    elif self._is_supported_decoder_for_pagination(decoder):
        decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
    else:
        raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder)))

    page_size_option = None
    if model.page_size_option:
        page_size_option = self._create_component_from_model(
            model=model.page_size_option, config=config
        )

    page_token_option = None
    if model.page_token_option:
        page_token_option = self._create_component_from_model(
            model=model.page_token_option, config=config
        )

    pagination_strategy = self._create_component_from_model(
        model=model.pagination_strategy,
        config=config,
        decoder=decoder_to_use,
        extractor_model=extractor_model,
    )
    if cursor_used_for_stop_condition:
        pagination_strategy = StopConditionPaginationStrategyDecorator(
            pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition)
        )

    paginator = DefaultPaginator(
        decoder=decoder_to_use,
        page_size_option=page_size_option,
        page_token_option=page_token_option,
        pagination_strategy=pagination_strategy,
        url_base=url_base,
        config=config,
        parameters=model.parameters or {},
    )
    if not self._limit_pages_fetched_per_slice:
        return paginator
    return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice)
def create_dpath_extractor(
    self,
    model: DpathExtractorModel,
    config: Config,
    decoder: Optional[Decoder] = None,
    **kwargs: Any,
) -> DpathExtractor:
    """Build a DpathExtractor; a JsonDecoder is used when no decoder is supplied."""
    decoder_to_use = decoder or JsonDecoder(parameters={})
    model_field_path: List[Union[InterpolatedString, str]] = list(model.field_path)

    record_expander = (
        self._create_component_from_model(
            model=model.record_expander,
            config=config,
        )
        if model.record_expander
        else None
    )

    return DpathExtractor(
        decoder=decoder_to_use,
        field_path=model_field_path,
        config=config,
        parameters=model.parameters or {},
        record_expander=record_expander,
    )


def create_record_expander(
    self,
    model: RecordExpanderModel,
    config: Config,
    **kwargs: Any,
) -> RecordExpander:
    """Build a RecordExpander; ``on_no_records`` falls back to ``OnNoRecords.skip`` when unset."""
    if model.on_no_records:
        on_no_records = OnNoRecords(model.on_no_records.value)
    else:
        on_no_records = OnNoRecords.skip

    return RecordExpander(
        expand_records_from_field=model.expand_records_from_field,
        config=config,
        parameters=model.parameters or {},
        remain_original_record=model.remain_original_record or False,
        on_no_records=on_no_records,
    )


@staticmethod
def create_response_to_file_extractor(
    model: ResponseToFileExtractorModel,
    **kwargs: Any,
) -> ResponseToFileExtractor:
    """Build a ResponseToFileExtractor carrying the model's parameters."""
    parameters = model.parameters or {}
    return ResponseToFileExtractor(parameters=parameters)


@staticmethod
def create_exponential_backoff_strategy(
    model: ExponentialBackoffStrategyModel, config: Config
) -> ExponentialBackoffStrategy:
    """Build an ExponentialBackoffStrategy from its model.

    The jitter range is validated first; a falsy ``factor`` is replaced by 5.
    """
    jitter_range = model.jitter_range_in_seconds
    ModelToComponentFactory._validate_jitter_range(jitter_range)
    return ExponentialBackoffStrategy(
        factor=model.factor or 5,
        jitter_range_in_seconds=jitter_range,
        parameters=model.parameters or {},
        config=config,
    )
@staticmethod
def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey:
    """Build a GroupByKey merge strategy from its model."""
    parameters = model.parameters or {}
    return GroupByKey(model.key, config=config, parameters=parameters)


def create_http_requester(
    self,
    model: HttpRequesterModel,
    config: Config,
    decoder: Decoder = JsonDecoder(parameters={}),
    query_properties_key: Optional[str] = None,
    use_cache: Optional[bool] = None,
    *,
    name: str,
) -> HttpRequester:
    """Build an HttpRequester with its authenticator, error handler and request options provider.

    :param model: The HTTP requester model
    :param config: The connector config
    :param decoder: The response decoder, also handed to the authenticator
    :param query_properties_key: Forwarded to the request options provider
    :param use_cache: Enables caching in addition to the model's own ``use_cache`` flag
    :param name: The requester name
    """
    authenticator = None
    if model.authenticator:
        authenticator = self._create_component_from_model(
            model=model.authenticator,
            config=config,
            url_base=model.url or model.url_base,
            name=name,
            decoder=decoder,
        )

    if model.error_handler:
        error_handler = self._create_component_from_model(
            model=model.error_handler, config=config
        )
    else:
        # No handler declared: use a default one without backoff strategies or response filters.
        error_handler = DefaultErrorHandler(
            backoff_strategies=[],
            response_filters=[],
            config=config,
            parameters=model.parameters or {},
        )

    request_options_provider = InterpolatedRequestOptionsProvider(
        request_body=model.request_body,
        request_body_data=model.request_body_data,
        request_body_json=model.request_body_json,
        request_headers=model.request_headers,
        request_parameters=model.request_parameters,  # type: ignore # QueryProperties have been removed in `create_simple_retriever`
        query_properties_key=query_properties_key,
        config=config,
        parameters=model.parameters or {},
    )

    assert model.use_cache is not None  # for mypy
    assert model.http_method is not None  # for mypy

    # Caching is on when either the model or the caller asks for it, unless disabled factory-wide.
    should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache
    stream_response = decoder.is_stream_response() if decoder else False

    return HttpRequester(
        name=name,
        url=model.url,
        url_base=model.url_base,
        path=model.path,
        authenticator=authenticator,
        error_handler=error_handler,
        api_budget=self._api_budget,
        http_method=HttpMethod[model.http_method.value],
        request_options_provider=request_options_provider,
        config=config,
        disable_retries=self._disable_retries,
        parameters=model.parameters or {},
        message_repository=self._message_repository,
        use_cache=should_use_cache,
        decoder=decoder,
        stream_response=stream_response,
    )
@staticmethod
def create_http_response_filter(
    model: HttpResponseFilterModel, config: Config, **kwargs: Any
) -> HttpResponseFilter:
    """Build an HttpResponseFilter from its model.

    Unset ``action`` and ``failure_type`` stay None; unset string fields become empty strings.
    """
    action = ResponseAction(model.action.value) if model.action else None
    failure_type = FailureType(model.failure_type.value) if model.failure_type else None

    # JSON schema notation has no set data type. The schema enforces an array of unique elements
    http_codes = set(model.http_codes or ())

    return HttpResponseFilter(
        action=action,
        failure_type=failure_type,
        error_message=model.error_message or "",
        error_message_contains=model.error_message_contains or "",
        http_codes=http_codes,
        predicate=model.predicate or "",
        config=config,
        parameters=model.parameters or {},
    )
@staticmethod
def create_inline_schema_loader(
    model: InlineSchemaLoaderModel, config: Config, **kwargs: Any
) -> InlineSchemaLoader:
    """Build an InlineSchemaLoader; a missing schema becomes an empty dict."""
    schema = model.schema_ or {}
    return InlineSchemaLoader(schema=schema, parameters={})


def create_complex_field_type(
    self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any
) -> ComplexFieldType:
    """Build a ComplexFieldType, recursing when ``items`` is itself a complex field type model."""
    items = model.items
    if isinstance(items, ComplexFieldTypeModel):
        items = self._create_component_from_model(model=items, config=config)

    return ComplexFieldType(field_type=model.field_type, items=items)


def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap:
    """Build a TypesMap; the condition defaults to the string "True" when unset."""
    target_type = model.target_type
    if isinstance(target_type, ComplexFieldTypeModel):
        target_type = self._create_component_from_model(model=target_type, config=config)

    condition = "True" if model.condition is None else model.condition

    return TypesMap(
        target_type=target_type,
        current_type=model.current_type,
        condition=condition,
    )


def create_schema_type_identifier(
    self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any
) -> SchemaTypeIdentifier:
    """Build a SchemaTypeIdentifier with its type mappings and pointers.

    An unset schema pointer becomes an empty list, while an unset type pointer stays None.
    """
    types_mapping = [
        self._create_component_from_model(types_map, config=config)
        for types_map in (model.types_mapping or [])
    ]

    model_schema_pointer: List[Union[InterpolatedString, str]] = (
        list(model.schema_pointer) if model.schema_pointer else []
    )
    model_key_pointer: List[Union[InterpolatedString, str]] = list(model.key_pointer)
    model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = (
        list(model.type_pointer) if model.type_pointer else None
    )

    return SchemaTypeIdentifier(
        schema_pointer=model_schema_pointer,
        key_pointer=model_key_pointer,
        type_pointer=model_type_pointer,
        types_mapping=types_mapping,
        parameters=model.parameters or {},
    )
def create_dynamic_schema_loader(
    self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any
) -> DynamicSchemaLoader:
    """Build a DynamicSchemaLoader with its retriever, schema transformations, filter and type identifier.

    The retriever is created under the fixed name "dynamic_properties", with caching enabled and a
    log formatter that marks its requests as auxiliary.
    """
    schema_transformations = [
        self._create_component_from_model(model=transformation_model, config=config)
        for transformation_model in (model.schema_transformations or [])
    ]

    name = "dynamic_properties"

    def _format_schema_request(response: Any) -> Any:
        # Schema discovery requests are logged as auxiliary requests.
        return format_http_message(
            response,
            f"Schema loader '{name}' request",
            "Request performed in order to extract schema.",
            name,
            is_auxiliary=True,
        )

    partition_router = self._build_stream_slicer_from_partition_router(model.retriever, config)
    retriever = self._create_component_from_model(
        model=model.retriever,
        config=config,
        name=name,
        primary_key=None,
        partition_router=partition_router,
        transformations=[],
        use_cache=True,
        log_formatter=_format_schema_request,
    )

    schema_type_identifier = self._create_component_from_model(
        model.schema_type_identifier, config=config, parameters=model.parameters or {}
    )

    schema_filter = None
    if model.schema_filter is not None:
        schema_filter = self._create_component_from_model(
            model.schema_filter, config=config, parameters=model.parameters or {}
        )

    return DynamicSchemaLoader(
        retriever=retriever,
        config=config,
        schema_transformations=schema_transformations,
        schema_filter=schema_filter,
        schema_type_identifier=schema_type_identifier,
        parameters=model.parameters or {},
    )
schema_type_identifier=schema_type_identifier, 2654 parameters=model.parameters or {}, 2655 ) 2656 2657 @staticmethod 2658 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2659 return JsonDecoder(parameters={}) 2660 2661 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2662 return CompositeRawDecoder( 2663 parser=ModelToComponentFactory._get_parser(model, config), 2664 stream_response=False if self._emit_connector_builder_messages else True, 2665 ) 2666 2667 def create_jsonl_decoder( 2668 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2669 ) -> Decoder: 2670 return CompositeRawDecoder( 2671 parser=ModelToComponentFactory._get_parser(model, config), 2672 stream_response=False if self._emit_connector_builder_messages else True, 2673 ) 2674 2675 def create_gzip_decoder( 2676 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2677 ) -> Decoder: 2678 _compressed_response_types = { 2679 "gzip", 2680 "x-gzip", 2681 "gzip, deflate", 2682 "x-gzip, deflate", 2683 "application/zip", 2684 "application/gzip", 2685 "application/x-gzip", 2686 "application/x-zip-compressed", 2687 } 2688 2689 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2690 2691 if self._emit_connector_builder_messages: 2692 # This is very surprising but if the response is not streamed, 2693 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2694 # which uses urllib3 directly and does not uncompress the data. 
2695 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2696 2697 return CompositeRawDecoder.by_headers( 2698 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2699 stream_response=True, 2700 fallback_parser=gzip_parser.inner_parser, 2701 ) 2702 2703 @staticmethod 2704 def create_iterable_decoder( 2705 model: IterableDecoderModel, config: Config, **kwargs: Any 2706 ) -> IterableDecoder: 2707 return IterableDecoder(parameters={}) 2708 2709 @staticmethod 2710 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2711 return XmlDecoder(parameters={}) 2712 2713 def create_zipfile_decoder( 2714 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2715 ) -> ZipfileDecoder: 2716 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2717 2718 @staticmethod 2719 def _get_parser(model: BaseModel, config: Config) -> Parser: 2720 if isinstance(model, JsonDecoderModel): 2721 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2722 return JsonParser() 2723 elif isinstance(model, JsonlDecoderModel): 2724 return JsonLineParser() 2725 elif isinstance(model, CsvDecoderModel): 2726 return CsvParser( 2727 encoding=model.encoding, 2728 delimiter=model.delimiter, 2729 set_values_to_none=model.set_values_to_none, 2730 ) 2731 elif isinstance(model, GzipDecoderModel): 2732 return GzipParser( 2733 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2734 ) 2735 elif isinstance( 2736 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2737 ): 2738 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2739 2740 raise ValueError(f"Unknown decoder type {model}") 2741 2742 @staticmethod 2743 def create_json_file_schema_loader( 2744 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2745 ) -> 
def create_jwt_authenticator(
    self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any
) -> JwtAuthenticator:
    """Build a ``JwtAuthenticator`` from its declarative model.

    Missing ``jwt_headers`` fall back to a headers model with ``typ="JWT"``
    and no ``kid``/``cty``; missing ``jwt_payload`` falls back to a payload
    model with no ``iss``/``sub``/``aud``. The request option component is
    only created when the model defines one.
    """
    headers = model.jwt_headers
    if not headers:
        headers = JwtHeadersModel(kid=None, typ="JWT", cty=None)

    payload = model.jwt_payload
    if not payload:
        payload = JwtPayloadModel(iss=None, sub=None, aud=None)

    request_option = None
    if model.request_option:
        request_option = self._create_component_from_model(model.request_option, config)

    return JwtAuthenticator(
        config=config,
        parameters=model.parameters or {},
        algorithm=JwtAlgorithm(model.algorithm.value),
        secret_key=model.secret_key,
        base64_encode_secret_key=model.base64_encode_secret_key,
        token_duration=model.token_duration,
        header_prefix=model.header_prefix,
        kid=headers.kid,
        typ=headers.typ,
        cty=headers.cty,
        iss=payload.iss,
        sub=payload.sub,
        aud=payload.aud,
        additional_jwt_headers=model.additional_jwt_headers,
        additional_jwt_payload=model.additional_jwt_payload,
        passphrase=model.passphrase,
        request_option=request_option,
    )
datetime_format=model.datetime_format or "", 2803 max_datetime=model.max_datetime or "", 2804 min_datetime=model.min_datetime or "", 2805 parameters=model.parameters or {}, 2806 ) 2807 2808 @staticmethod 2809 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2810 return NoAuth(parameters=model.parameters or {}) 2811 2812 @staticmethod 2813 def create_no_pagination( 2814 model: NoPaginationModel, config: Config, **kwargs: Any 2815 ) -> NoPagination: 2816 return NoPagination(parameters={}) 2817 2818 def create_oauth_authenticator( 2819 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2820 ) -> DeclarativeOauth2Authenticator: 2821 profile_assertion = ( 2822 self._create_component_from_model(model.profile_assertion, config=config) 2823 if model.profile_assertion 2824 else None 2825 ) 2826 2827 refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values = ( 2828 self._get_refresh_token_error_information(model) 2829 ) 2830 if model.refresh_token_updater: 2831 # ignore type error because fixing it would have a lot of dependencies, revisit later 2832 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2833 config, 2834 InterpolatedString.create( 2835 model.token_refresh_endpoint, # type: ignore 2836 parameters=model.parameters or {}, 2837 ).eval(config), 2838 access_token_name=InterpolatedString.create( 2839 model.access_token_name or "access_token", parameters=model.parameters or {} 2840 ).eval(config), 2841 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2842 expires_in_name=InterpolatedString.create( 2843 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2844 ).eval(config), 2845 client_id_name=InterpolatedString.create( 2846 model.client_id_name or "client_id", parameters=model.parameters or {} 2847 ).eval(config), 2848 client_id=InterpolatedString.create( 2849 model.client_id, parameters=model.parameters or {} 2850 ).eval(config) 
2851 if model.client_id 2852 else model.client_id, 2853 client_secret_name=InterpolatedString.create( 2854 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2855 ).eval(config), 2856 client_secret=InterpolatedString.create( 2857 model.client_secret, parameters=model.parameters or {} 2858 ).eval(config) 2859 if model.client_secret 2860 else model.client_secret, 2861 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2862 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2863 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2864 grant_type_name=InterpolatedString.create( 2865 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2866 ).eval(config), 2867 grant_type=InterpolatedString.create( 2868 model.grant_type or "refresh_token", parameters=model.parameters or {} 2869 ).eval(config), 2870 refresh_request_body=InterpolatedMapping( 2871 model.refresh_request_body or {}, parameters=model.parameters or {} 2872 ).eval(config), 2873 refresh_request_headers=InterpolatedMapping( 2874 model.refresh_request_headers or {}, parameters=model.parameters or {} 2875 ).eval(config), 2876 scopes=model.scopes, 2877 token_expiry_date_format=model.token_expiry_date_format, 2878 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2879 message_repository=self._message_repository, 2880 refresh_token_error_status_codes=refresh_token_error_status_codes, 2881 refresh_token_error_key=refresh_token_error_key, 2882 refresh_token_error_values=refresh_token_error_values, 2883 ) 2884 # ignore type error because fixing it would have a lot of dependencies, revisit later 2885 return DeclarativeOauth2Authenticator( # type: ignore 2886 access_token_name=model.access_token_name or "access_token", 2887 access_token_value=model.access_token_value, 2888 client_id_name=model.client_id_name or "client_id", 2889 client_id=model.client_id, 2890 
@staticmethod
def _get_refresh_token_error_information(
    model: OAuthAuthenticatorModel,
) -> Tuple[Tuple[int, ...], str, Tuple[str, ...]]:
    """Resolve the refresh-token error settings for an OAuth authenticator.

    In a previous version of the CDK, reporting the auth error as a config
    error was only done when a refresh token updater was defined. As a
    transition, the same fields were added on the OAuthAuthenticatorModel.
    This method makes sure the information is defined in only one of the two
    places and returns it from whichever one declares it.

    Returns:
        A ``(status_codes, error_key, error_values)`` triple. Unset status
        codes/values become empty tuples and an unset key becomes ``""``.
        When neither place declares anything, the defaults
        ``(400,), "error", ("invalid_grant", "invalid_permissions")`` are
        returned.

    Raises:
        ValueError: if both the authenticator and its refresh token updater
            declare refresh-token error fields.
    """

    def _declares_error_fields(source: Any) -> bool:
        return bool(
            source.refresh_token_error_status_codes
            or source.refresh_token_error_key
            or source.refresh_token_error_values
        )

    def _error_information(source: Any) -> Tuple[Tuple[int, ...], str, Tuple[str, ...]]:
        status_codes = source.refresh_token_error_status_codes
        error_values = source.refresh_token_error_values
        return (
            tuple(status_codes) if status_codes else (),
            source.refresh_token_error_key or "",
            tuple(error_values) if error_values else (),
        )

    updater = model.refresh_token_updater
    declared_on_updater = bool(updater) and _declares_error_fields(updater)
    declared_on_authenticator = _declares_error_fields(model)

    if declared_on_updater and declared_on_authenticator:
        raise ValueError(
            "refresh_token_error should either be defined on the OAuthAuthenticatorModel or the RefreshTokenUpdaterModel, not both"
        )

    if declared_on_updater:
        return _error_information(updater)
    if declared_on_authenticator:
        return _error_information(model)

    # returning default values we think cover most cases
    return (400,), "error", ("invalid_grant", "invalid_permissions")
DpathExtractorModel]] = None, 2968 **kwargs: Any, 2969 ) -> OffsetIncrement: 2970 if isinstance(decoder, PaginationDecoderDecorator): 2971 inner_decoder = decoder.decoder 2972 else: 2973 inner_decoder = decoder 2974 decoder = PaginationDecoderDecorator(decoder=decoder) 2975 2976 if self._is_supported_decoder_for_pagination(inner_decoder): 2977 decoder_to_use = decoder 2978 else: 2979 raise ValueError( 2980 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2981 ) 2982 2983 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2984 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2985 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2986 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2987 # When we have more time to investigate we can look into reusing the same component. 2988 extractor = ( 2989 self._create_component_from_model( 2990 model=extractor_model, config=config, decoder=decoder_to_use 2991 ) 2992 if extractor_model 2993 else None 2994 ) 2995 2996 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 2997 # If page_size is a string that represents an integer (not an interpolation), convert it back. 
@staticmethod
def create_page_increment(
    model: PageIncrementModel, config: Config, **kwargs: Any
) -> PageIncrement:
    """Build a ``PageIncrement`` pagination strategy from its model.

    ``start_from_page`` defaults to ``0`` and ``inject_on_first_request`` to
    ``False`` when the model leaves them unset.
    """
    raw_page_size = model.page_size
    # Pydantic v1 Union type coercion can convert int to string depending on Union order.
    # A string made only of digits is therefore an integer (not an interpolation): convert it back.
    is_numeric_string = isinstance(raw_page_size, str) and raw_page_size.isdigit()

    return PageIncrement(
        page_size=int(raw_page_size) if is_numeric_string else raw_page_size,
        config=config,
        start_from_page=model.start_from_page or 0,
        inject_on_first_request=model.inject_on_first_request or False,
        parameters=model.parameters or {},
    )
3047 ) 3048 3049 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 3050 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 3051 ) 3052 3053 return ParentStreamConfig( 3054 parent_key=model.parent_key, 3055 request_option=request_option, 3056 stream=declarative_stream, 3057 partition_field=model.partition_field, 3058 config=config, 3059 incremental_dependency=model.incremental_dependency or False, 3060 parameters=model.parameters or {}, 3061 extra_fields=model.extra_fields, 3062 lazy_read_pointer=model_lazy_read_pointer, 3063 ) 3064 3065 def create_properties_from_endpoint( 3066 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 3067 ) -> PropertiesFromEndpoint: 3068 retriever = self._create_component_from_model( 3069 model=model.retriever, 3070 config=config, 3071 name="dynamic_properties", 3072 primary_key=None, 3073 stream_slicer=None, 3074 transformations=[], 3075 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 3076 ) 3077 return PropertiesFromEndpoint( 3078 property_field_path=model.property_field_path, 3079 retriever=retriever, 3080 config=config, 3081 parameters=model.parameters or {}, 3082 ) 3083 3084 def create_property_chunking( 3085 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 3086 ) -> PropertyChunking: 3087 record_merge_strategy = ( 3088 self._create_component_from_model( 3089 model=model.record_merge_strategy, config=config, **kwargs 3090 ) 3091 if model.record_merge_strategy 3092 else None 3093 ) 3094 3095 property_limit_type: PropertyLimitType 3096 match model.property_limit_type: 3097 case PropertyLimitTypeModel.property_count: 3098 property_limit_type = PropertyLimitType.property_count 3099 case PropertyLimitTypeModel.characters: 3100 property_limit_type = PropertyLimitType.characters 3101 case _: 3102 raise 
ValueError(f"Invalid PropertyLimitType {property_limit_type}") 3103 3104 return PropertyChunking( 3105 property_limit_type=property_limit_type, 3106 property_limit=model.property_limit, 3107 record_merge_strategy=record_merge_strategy, 3108 config=config, 3109 parameters=model.parameters or {}, 3110 ) 3111 3112 def create_query_properties( 3113 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 3114 ) -> QueryProperties: 3115 if isinstance(model.property_list, list): 3116 property_list = model.property_list 3117 else: 3118 property_list = self._create_component_from_model( 3119 model=model.property_list, config=config, **kwargs 3120 ) 3121 3122 property_chunking = ( 3123 self._create_component_from_model( 3124 model=model.property_chunking, config=config, **kwargs 3125 ) 3126 if model.property_chunking 3127 else None 3128 ) 3129 3130 property_selector = ( 3131 self._create_component_from_model( 3132 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3133 ) 3134 if model.property_selector 3135 else None 3136 ) 3137 3138 return QueryProperties( 3139 property_list=property_list, 3140 always_include_properties=model.always_include_properties, 3141 property_chunking=property_chunking, 3142 property_selector=property_selector, 3143 config=config, 3144 parameters=model.parameters or {}, 3145 ) 3146 3147 def create_json_schema_property_selector( 3148 self, 3149 model: JsonSchemaPropertySelectorModel, 3150 config: Config, 3151 *, 3152 stream_name: str, 3153 **kwargs: Any, 3154 ) -> JsonSchemaPropertySelector: 3155 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3156 3157 transformations = [] 3158 if model.transformations: 3159 for transformation_model in model.transformations: 3160 transformations.append( 3161 self._create_component_from_model(model=transformation_model, config=config) 3162 ) 3163 3164 return JsonSchemaPropertySelector( 3165 configured_stream=configured_stream, 3166 
def create_record_selector(
    self,
    model: RecordSelectorModel,
    config: Config,
    *,
    name: str,
    transformations: List[RecordTransformation] | None = None,
    decoder: Decoder | None = None,
    client_side_incremental_sync_cursor: Optional[Cursor] = None,
    file_uploader: Optional[DefaultFileUploader] = None,
    **kwargs: Any,
) -> RecordSelector:
    """Build the ``RecordSelector`` for stream ``name``.

    Args:
        model: Declarative record selector model. It is only read, never modified.
        config: Connector configuration passed to every created component.
        name: Stream name given to the selector.
        transformations: Record transformations; ``None`` is treated as an empty list.
        decoder: Decoder handed to the extractor component.
        client_side_incremental_sync_cursor: When provided, the record filter is replaced
            by a ``ClientSideIncrementalRecordFilterDecorator`` that uses this cursor and
            reuses the condition of the model's record filter, if it has one.
        file_uploader: Passed through to the selector.
        **kwargs: Accepted for factory-signature compatibility; not used here.

    Returns:
        The configured ``RecordSelector``.
    """
    extractor = self._create_component_from_model(
        model=model.extractor, decoder=decoder, config=config
    )
    record_filter = (
        self._create_component_from_model(model.record_filter, config=config)
        if model.record_filter
        else None
    )

    if client_side_incremental_sync_cursor:
        record_filter = ClientSideIncrementalRecordFilterDecorator(
            config=config,
            # Normalise missing parameters to an empty mapping, as done for every other
            # component created from this model.
            parameters=model.parameters or {},
            condition=model.record_filter.condition
            if (model.record_filter and hasattr(model.record_filter, "condition"))
            else None,
            cursor=client_side_incremental_sync_cursor,
        )

    # An explicit setting always wins. Otherwise records are transformed before filtering
    # only when client-side incremental filtering is in use.
    if model.transform_before_filtering is None:
        transform_before_filtering = bool(client_side_incremental_sync_cursor)
    else:
        transform_before_filtering = model.transform_before_filtering

    # Default to no schema normalization if not set. The default is resolved in a local
    # variable so the caller's model is left untouched.
    schema_normalization_model = (
        SchemaNormalizationModel.None_
        if model.schema_normalization is None
        else model.schema_normalization
    )
    schema_normalization = (
        TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[schema_normalization_model])
        if isinstance(schema_normalization_model, SchemaNormalizationModel)
        else self._create_component_from_model(schema_normalization_model, config=config)  # type: ignore[arg-type] # custom normalization model expected here
    )

    return RecordSelector(
        extractor=extractor,
        name=name,
        config=config,
        record_filter=record_filter,
        transformations=transformations or [],
        file_uploader=file_uploader,
        schema_normalization=schema_normalization,
        parameters=model.parameters or {},
        transform_before_filtering=transform_before_filtering,
    )
def create_simple_retriever(
    self,
    model: SimpleRetrieverModel,
    config: Config,
    *,
    name: str,
    primary_key: Optional[Union[str, List[str], List[List[str]]]],
    request_options_provider: Optional[RequestOptionsProvider] = None,
    cursor: Optional[Cursor] = None,
    has_stop_condition_cursor: bool = False,
    is_client_side_incremental_sync: bool = False,
    transformations: List[RecordTransformation],
    file_uploader: Optional[DefaultFileUploader] = None,
    incremental_sync: Optional[
        Union[IncrementingCountCursorModel, DatetimeBasedCursorModel]
    ] = None,
    use_cache: Optional[bool] = None,
    log_formatter: Optional[Callable[[Response], Any]] = None,
    partition_router: Optional[PartitionRouter] = None,
    **kwargs: Any,
) -> SimpleRetriever:
    """Build the retriever for stream ``name`` from a SimpleRetriever model.

    The decoder, record selector, requester and paginator are created from their
    sub-models, an optional QueryProperties definition is resolved, and either a
    ``LazySimpleRetriever`` or a ``SimpleRetriever`` is returned.

    Note that ``model.requester.request_parameters`` is reassigned on the model when
    it contains a QueryProperties definition (see the comment in the body).

    Args:
        model: Declarative retriever model.
        config: Connector configuration passed to every created component.
        name: Stream name; forwarded to the record selector, requester and retriever.
        primary_key: Passed through to the retriever.
        request_options_provider: Defaults to a ``DefaultRequestOptionsProvider``. A
            ``DefaultRequestOptionsProvider`` is replaced by ``partition_router`` when
            that is a ``PartitionRouter``.
        cursor: Defaults to a ``FinalStateCursor`` for ``name`` when ``None``.
        has_stop_condition_cursor: When true, the cursor is given to the paginator as
            ``cursor_used_for_stop_condition``.
        is_client_side_incremental_sync: When true, the cursor is given to the record
            selector as ``client_side_incremental_sync_cursor``.
        transformations: Forwarded to the record selector.
        file_uploader: Forwarded to the record selector.
        incremental_sync: Only inspected to validate the lazy-retriever case.
        use_cache: Forwarded to the requester.
        log_formatter: Resolved through ``self._get_log_formatter`` for the
            ``SimpleRetriever``; not passed to the ``LazySimpleRetriever``.
        partition_router: See ``request_options_provider``.
        **kwargs: Not used in this method.

    Raises:
        ValueError: if PropertiesFromEndpoint is declared both in
            ``fetch_properties_from_endpoint`` and ``request_parameters``; if
            ``request_parameters`` holds more than one QueryProperties definition; if
            the lazy retriever is selected with an unsupported cursor, with a cursor
            ``step``/``cursor_granularity``, or with a non-JSON decoder; or if
            pagination reset limits are combined with a record filter.
    """

    def _get_url(req: Requester) -> str:
        """
        Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever.
        This is needed because the URL is not set until the requester is created.
        """

        # Prefer the value declared on the requester model; otherwise ask the built requester.
        _url: str = (
            model.requester.url
            if hasattr(model.requester, "url") and model.requester.url is not None
            else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
        )
        _url_base: str = (
            model.requester.url_base
            if hasattr(model.requester, "url_base") and model.requester.url_base is not None
            else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None)
        )

        return _url or _url_base

    if cursor is None:
        cursor = FinalStateCursor(name, None, self._message_repository)

    # Fall back to a JSON decoder when the model does not declare one.
    decoder = (
        self._create_component_from_model(model=model.decoder, config=config)
        if model.decoder
        else JsonDecoder(parameters={})
    )
    record_selector = self._create_component_from_model(
        model=model.record_selector,
        name=name,
        config=config,
        decoder=decoder,
        transformations=transformations,
        client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None,
        file_uploader=file_uploader,
    )

    # QueryProperties can come from three places, checked in this order:
    # request_parameters, fetch_properties_from_endpoint, then query_properties.
    query_properties: Optional[QueryProperties] = None
    query_properties_key: Optional[str] = None
    self._ensure_query_properties_to_model(model.requester)
    if self._has_query_properties_in_request_parameters(model.requester):
        # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple
        # places instead of default to request_parameters which isn't clearly documented
        if (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            raise ValueError(
                f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
            )

        # Collect every QueryProperties entry, remembering the request parameter key it was declared under.
        query_properties_definitions = []
        for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters()
            if isinstance(request_parameter, QueryPropertiesModel):
                query_properties_key = key
                query_properties_definitions.append(request_parameter)

        if len(query_properties_definitions) > 1:
            raise ValueError(
                f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
            )

        if len(query_properties_definitions) == 1:
            query_properties = self._create_component_from_model(
                model=query_properties_definitions[0], stream_name=name, config=config
            )

        # Removes QueryProperties components from the interpolated mappings because it has been designed
        # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly
        # instead of through jinja interpolation
        if hasattr(model.requester, "request_parameters") and isinstance(
            model.requester.request_parameters, Mapping
        ):
            model.requester.request_parameters = self._remove_query_properties(
                model.requester.request_parameters
            )
    elif (
        hasattr(model.requester, "fetch_properties_from_endpoint")
        and model.requester.fetch_properties_from_endpoint
    ):
        # todo: Deprecate this condition once dependent connectors migrate to query_properties
        # Wrap the endpoint definition in a QueryProperties model so both paths build the same component.
        query_properties_definition = QueryPropertiesModel(
            type="QueryProperties",
            property_list=model.requester.fetch_properties_from_endpoint,
            always_include_properties=None,
            property_chunking=None,
        )  # type: ignore # $parameters has a default value

        query_properties = self.create_query_properties(
            model=query_properties_definition,
            stream_name=name,
            config=config,
        )
    elif hasattr(model.requester, "query_properties") and model.requester.query_properties:
        query_properties = self.create_query_properties(
            model=model.requester.query_properties,
            stream_name=name,
            config=config,
        )

    requester = self._create_component_from_model(
        model=model.requester,
        decoder=decoder,
        name=name,
        query_properties_key=query_properties_key,
        use_cache=use_cache,
        config=config,
    )

    if not request_options_provider:
        request_options_provider = DefaultRequestOptionsProvider(parameters={})
    # A default provider is swapped for the partition router when one is supplied.
    if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(
        partition_router, PartitionRouter
    ):
        request_options_provider = partition_router

    paginator = (
        self._create_component_from_model(
            model=model.paginator,
            config=config,
            url_base=_get_url(requester),
            extractor_model=model.record_selector.extractor,
            decoder=decoder,
            cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None,
        )
        if model.paginator
        else NoPagination(parameters={})
    )

    ignore_stream_slicer_parameters_on_paginated_requests = (
        model.ignore_stream_slicer_parameters_on_paginated_requests or False
    )

    # The lazy retriever is selected only for a substream partition router that declares at
    # least one lazy_read_pointer, and only when no state is stored for this stream.
    if (
        model.partition_router
        and isinstance(model.partition_router, SubstreamPartitionRouterModel)
        and not bool(self._connector_state_manager.get_stream_state(name, None))
        and any(
            parent_stream_config.lazy_read_pointer
            for parent_stream_config in model.partition_router.parent_stream_configs
        )
    ):
        if incremental_sync:
            if incremental_sync.type != "DatetimeBasedCursor":
                raise ValueError(
                    f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                )

            elif incremental_sync.step or incremental_sync.cursor_granularity:
                raise ValueError(
                    f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                )

        if model.decoder and model.decoder.type != "JsonDecoder":
            raise ValueError(
                f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
            )

        # NOTE(review): query_properties and log_formatter are not forwarded on this path — confirm this is intended.
        return LazySimpleRetriever(
            name=name,
            paginator=paginator,
            primary_key=primary_key,
            requester=requester,
            record_selector=record_selector,
            stream_slicer=_NO_STREAM_SLICING,
            request_option_provider=request_options_provider,
            config=config,
            ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
            parameters=model.parameters or {},
        )

    if (
        model.record_selector.record_filter
        and model.pagination_reset
        and model.pagination_reset.limits
    ):
        raise ValueError("PaginationResetLimits are not supported while having record filter.")

    return SimpleRetriever(
        name=name,
        paginator=paginator,
        primary_key=primary_key,
        requester=requester,
        record_selector=record_selector,
        stream_slicer=_NO_STREAM_SLICING,
        request_option_provider=request_options_provider,
        config=config,
        ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
        additional_query_properties=query_properties,
        log_formatter=self._get_log_formatter(log_formatter, name),
        pagination_tracker_factory=self._create_pagination_tracker_factory(
            model.pagination_reset, cursor
        ),
        parameters=model.parameters or {},
    )
3532 # Until we figure out a way to use any cursor for PaginationTracker, we will have to have this cursor selector logic 3533 cursor_factory: Callable[[], Optional[ConcurrentCursor]] = lambda: None 3534 if model.action == PaginationResetActionModel.RESET: 3535 # in that case, we will let cursor_factory to return None even if the stream has a cursor 3536 pass 3537 elif model.action == PaginationResetActionModel.SPLIT_USING_CURSOR: 3538 if isinstance(cursor, ConcurrentCursor): 3539 cursor_factory = lambda: cursor.copy_without_state() # type: ignore # the if condition validates that it is a ConcurrentCursor 3540 elif isinstance(cursor, ConcurrentPerPartitionCursor): 3541 cursor_factory = lambda: cursor._cursor_factory.create( # type: ignore # if this becomes a problem, we would need to extract the cursor_factory instantiation logic and make it accessible here 3542 {}, datetime.timedelta(0) 3543 ) 3544 elif not isinstance(cursor, FinalStateCursor): 3545 LOGGER.warning( 3546 "Unknown cursor for PaginationTracker. Pagination resets might not work properly" 3547 ) 3548 else: 3549 raise ValueError(f"Unknown PaginationReset action: {model.action}") 3550 3551 limit = model.limits.number_of_records if model and model.limits else None 3552 return lambda: PaginationTracker(cursor_factory(), limit) 3553 3554 def _get_log_formatter( 3555 self, log_formatter: Callable[[Response], Any] | None, name: str 3556 ) -> Callable[[Response], Any] | None: 3557 if self._should_limit_slices_fetched(): 3558 return ( 3559 ( 3560 lambda response: format_http_message( 3561 response, 3562 f"Stream '{name}' request", 3563 f"Request performed in order to extract records for stream '{name}'", 3564 name, 3565 ) 3566 ) 3567 if not log_formatter 3568 else log_formatter 3569 ) 3570 return None 3571 3572 def _should_limit_slices_fetched(self) -> bool: 3573 """ 3574 Returns True if the number of slices fetched should be limited, False otherwise. 
3575 This is used to limit the number of slices fetched during tests. 3576 """ 3577 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3578 3579 @staticmethod 3580 def _has_query_properties_in_request_parameters( 3581 requester: Union[HttpRequesterModel, CustomRequesterModel], 3582 ) -> bool: 3583 if not hasattr(requester, "request_parameters"): 3584 return False 3585 request_parameters = requester.request_parameters 3586 if request_parameters and isinstance(request_parameters, Mapping): 3587 for request_parameter in request_parameters.values(): 3588 if isinstance(request_parameter, QueryPropertiesModel): 3589 return True 3590 return False 3591 3592 @staticmethod 3593 def _remove_query_properties( 3594 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3595 ) -> Mapping[str, str]: 3596 return { 3597 parameter_field: request_parameter 3598 for parameter_field, request_parameter in request_parameters.items() 3599 if not isinstance(request_parameter, QueryPropertiesModel) 3600 } 3601 3602 def create_state_delegating_stream( 3603 self, 3604 model: StateDelegatingStreamModel, 3605 config: Config, 3606 **kwargs: Any, 3607 ) -> DefaultStream: 3608 if ( 3609 model.full_refresh_stream.name != model.name 3610 or model.name != model.incremental_stream.name 3611 ): 3612 raise ValueError( 3613 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3614 ) 3615 3616 # Resolve api_retention_period with config context (supports Jinja2 interpolation) 3617 resolved_retention_period: Optional[str] = None 3618 if model.api_retention_period: 3619 interpolated_retention = InterpolatedString.create( 3620 model.api_retention_period, parameters=model.parameters or {} 3621 ) 3622 resolved_value = interpolated_retention.eval(config=config) 3623 if resolved_value: 3624 resolved_retention_period = str(resolved_value) 3625 3626 if resolved_retention_period: 3627 for stream_model in (model.full_refresh_stream, model.incremental_stream): 3628 if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel): 3629 raise ValueError( 3630 f"Stream '{model.name}' uses IncrementingCountCursor which is not supported " 3631 f"with api_retention_period. IncrementingCountCursor does not use datetime-based " 3632 f"cursors, so cursor age validation cannot be performed." 3633 ) 3634 3635 stream_state = self._connector_state_manager.get_stream_state(model.name, None) 3636 3637 if not stream_state: 3638 return self._create_component_from_model( # type: ignore[no-any-return] 3639 model.full_refresh_stream, config=config, **kwargs 3640 ) 3641 3642 incremental_stream: DefaultStream = self._create_component_from_model( 3643 model.incremental_stream, config=config, **kwargs 3644 ) # type: ignore[assignment] 3645 3646 # Only run cursor age validation for streams that are in the configured 3647 # catalog (or when no catalog was provided, e.g. during discover / connector 3648 # builder). Streams not selected by the user but instantiated as parent-stream 3649 # dependencies must not go through this path because it emits state messages 3650 # that the destination does not know about, causing "Stream not found" crashes. 
3651 stream_is_in_catalog = ( 3652 not self._stream_name_to_configured_stream # no catalog → validate by default 3653 or model.name in self._stream_name_to_configured_stream 3654 ) 3655 if resolved_retention_period and stream_is_in_catalog: 3656 full_refresh_stream: DefaultStream = self._create_component_from_model( 3657 model.full_refresh_stream, config=config, **kwargs 3658 ) # type: ignore[assignment] 3659 if self._is_cursor_older_than_retention_period( 3660 stream_state, 3661 full_refresh_stream.cursor, 3662 incremental_stream.cursor, 3663 resolved_retention_period, 3664 model.name, 3665 ): 3666 # Clear state BEFORE constructing the full_refresh_stream so that 3667 # its cursor starts from start_date instead of the stale cursor. 3668 self._connector_state_manager.update_state_for_stream(model.name, None, {}) 3669 state_message = self._connector_state_manager.create_state_message(model.name, None) 3670 self._message_repository.emit_message(state_message) 3671 return self._create_component_from_model( # type: ignore[no-any-return] 3672 model.full_refresh_stream, config=config, **kwargs 3673 ) 3674 3675 return incremental_stream 3676 3677 @staticmethod 3678 def _is_cursor_older_than_retention_period( 3679 stream_state: Mapping[str, Any], 3680 full_refresh_cursor: Cursor, 3681 incremental_cursor: Cursor, 3682 api_retention_period: str, 3683 stream_name: str, 3684 ) -> bool: 3685 """Check if the cursor value in the state is older than the API's retention period. 3686 3687 Checks cursors in sequence: full refresh cursor first, then incremental cursor. 3688 FinalStateCursor returns now() for completed full refresh state (NO_CURSOR_STATE_KEY), 3689 which is always within retention, so we use incremental. For other states, it returns 3690 None and we fall back to checking the incremental cursor. 3691 3692 Returns True if the cursor is older than the retention period (should use full refresh). 
3693 Returns False if the cursor is within the retention period (safe to use incremental). 3694 """ 3695 retention_duration = parse_duration(api_retention_period) 3696 retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration 3697 3698 # Check full refresh cursor first 3699 cursor_datetime = full_refresh_cursor.get_cursor_datetime_from_state(stream_state) 3700 3701 # If full refresh cursor returns None, check incremental cursor 3702 if cursor_datetime is None: 3703 cursor_datetime = incremental_cursor.get_cursor_datetime_from_state(stream_state) 3704 3705 if cursor_datetime is None: 3706 # Neither cursor could parse the state - fall back to full refresh to be safe 3707 return True 3708 3709 if cursor_datetime < retention_cutoff: 3710 logging.warning( 3711 f"Stream '{stream_name}' has a cursor value older than " 3712 f"the API's retention period of {api_retention_period} " 3713 f"(cutoff: {retention_cutoff.isoformat()}). " 3714 f"Falling back to full refresh to avoid data loss." 
3715 ) 3716 return True 3717 3718 return False 3719 3720 def _get_state_delegating_stream_model( 3721 self, 3722 model: StateDelegatingStreamModel, 3723 parent_state: Optional[Mapping[str, Any]] = None, 3724 ) -> DeclarativeStreamModel: 3725 """Return the appropriate underlying stream model based on state.""" 3726 return ( 3727 model.incremental_stream 3728 if self._connector_state_manager.get_stream_state(model.name, None) or parent_state 3729 else model.full_refresh_stream 3730 ) 3731 3732 _OPTIONAL_ASYNC_STATUS_FIELDS = {"skipped"} 3733 3734 def _create_async_job_status_mapping( 3735 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3736 ) -> Mapping[str, AsyncJobStatus]: 3737 api_status_to_cdk_status = {} 3738 for cdk_status, api_statuses in model.dict().items(): 3739 if cdk_status == "type": 3740 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3741 continue 3742 3743 if api_statuses is None: 3744 if cdk_status in self._OPTIONAL_ASYNC_STATUS_FIELDS: 3745 continue 3746 raise ValueError( 3747 f"Required CDK status '{cdk_status}' has no API statuses mapped. " 3748 f"Please provide at least an empty list for required status fields." 3749 ) 3750 3751 for status in api_statuses: 3752 if status in api_status_to_cdk_status: 3753 raise ValueError( 3754 f"API status {status} is already set for CDK status {cdk_status}. 
    def create_async_retriever(
        self,
        model: AsyncRetrieverModel,
        config: Config,
        *,
        name: str,
        primary_key: Optional[
            Union[str, List[str], List[List[str]]]
        ],  # this seems to be needed to match create_simple_retriever
        stream_slicer: Optional[StreamSlicer],
        client_side_incremental_sync: Optional[Dict[str, Any]] = None,
        transformations: List[RecordTransformation],
        **kwargs: Any,
    ) -> AsyncRetriever:
        """Assemble an AsyncRetriever from its declarative model.

        Builds the creation/polling/download requesters (plus the optional abort, delete and
        download-target requesters), wires them into an AsyncHttpJobRepository and exposes the
        jobs through an AsyncJobPartitionRouter wrapped around `stream_slicer`.

        Args:
            model: declarative definition of the async retriever.
            config: connector configuration, also used to evaluate interpolated values.
            name: stream name; reused to label the job requesters.
            primary_key: not referenced in this method's body.
            stream_slicer: slicer producing the slices jobs are created for; a
                SinglePartitionRouter is used when it is falsy.
            client_side_incremental_sync: forwarded to the record selector.
            transformations: forwarded to the record selector.
            **kwargs: not referenced in this method's body.

        Raises:
            ValueError: if `download_target_requester` is set without `download_target_extractor`.
        """
        if model.download_target_requester and not model.download_target_extractor:
            # NOTE(review): the f-prefix below is unnecessary, the message has no placeholders.
            raise ValueError(
                f"`download_target_extractor` required if using a `download_target_requester`"
            )

        def _get_download_retriever(
            requester: Requester, extractor: RecordExtractor, _decoder: Decoder
        ) -> SimpleRetriever:
            # We create a record selector for the download retriever
            # with no schema normalization and no transformations, neither record filter
            # as all this occurs in the record_selector of the AsyncRetriever
            record_selector = RecordSelector(
                extractor=extractor,
                name=name,
                record_filter=None,
                transformations=[],
                schema_normalization=TypeTransformer(TransformConfig.NoTransform),
                config=config,
                parameters={},
            )
            # Paginate the download only when the model defines a download paginator.
            paginator = (
                self._create_component_from_model(
                    model=model.download_paginator,
                    decoder=_decoder,
                    config=config,
                    url_base="",
                )
                if model.download_paginator
                else NoPagination(parameters={})
            )

            return SimpleRetriever(
                requester=requester,
                record_selector=record_selector,
                primary_key=None,
                name=name,
                paginator=paginator,
                config=config,
                parameters={},
                log_formatter=self._get_log_formatter(None, name),
            )

        def _get_job_timeout() -> datetime.timedelta:
            # `polling_job_timeout` is stringified, interpolated against the config and cast to
            # int; a falsy value (including 0) means "no user-defined timeout".
            user_defined_timeout: Optional[int] = (
                int(
                    InterpolatedString.create(
                        str(model.polling_job_timeout),
                        parameters={},
                    ).eval(config)
                )
                if model.polling_job_timeout
                else None
            )

            # check for user defined timeout during the test read or 15 minutes
            test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15)
            # default value for non-connector builder is 60 minutes.
            default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60)

            return (
                test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout
            )

        # Decoder shared by the creation, polling, abort, delete and download-target requesters
        # and by the status / download-target extractors; JSON unless the model says otherwise.
        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        record_selector = self._create_component_from_model(
            model=model.record_selector,
            config=config,
            decoder=decoder,
            name=name,
            transformations=transformations,
            client_side_incremental_sync=client_side_incremental_sync,
        )

        stream_slicer = stream_slicer or SinglePartitionRouter(parameters={})
        if self._should_limit_slices_fetched():
            # Cap the number of slices, defaulting to 5 when no explicit limit is configured.
            stream_slicer = cast(
                StreamSlicer,
                StreamSlicerTestReadDecorator(
                    wrapped_slicer=stream_slicer,
                    maximum_number_of_slices=self._limit_slices_fetched or 5,
                ),
            )

        creation_requester = self._create_component_from_model(
            model=model.creation_requester,
            decoder=decoder,
            config=config,
            name=f"job creation - {name}",
        )
        polling_requester = self._create_component_from_model(
            model=model.polling_requester,
            decoder=decoder,
            config=config,
            name=f"job polling - {name}",
        )
        job_download_components_name = f"job download - {name}"
        # The download step has its own decoder, independent from `decoder` above.
        download_decoder = (
            self._create_component_from_model(model=model.download_decoder, config=config)
            if model.download_decoder
            else JsonDecoder(parameters={})
        )
        # NOTE(review): the first branch forwards `model.parameters` as-is (it may be None),
        # whereas the DpathExtractor fallback uses `model.parameters or {}`.
        download_extractor = (
            self._create_component_from_model(
                model=model.download_extractor,
                config=config,
                decoder=download_decoder,
                parameters=model.parameters,
            )
            if model.download_extractor
            else DpathExtractor(
                [],
                config=config,
                decoder=download_decoder,
                parameters=model.parameters or {},
            )
        )
        download_requester = self._create_component_from_model(
            model=model.download_requester,
            decoder=download_decoder,
            config=config,
            name=job_download_components_name,
        )
        download_retriever = _get_download_retriever(
            download_requester, download_extractor, download_decoder
        )
        # The abort, delete and download-target requesters are optional: None when not modeled.
        abort_requester = (
            self._create_component_from_model(
                model=model.abort_requester,
                decoder=decoder,
                config=config,
                name=f"job abort - {name}",
            )
            if model.abort_requester
            else None
        )
        delete_requester = (
            self._create_component_from_model(
                model=model.delete_requester,
                decoder=decoder,
                config=config,
                name=f"job delete - {name}",
            )
            if model.delete_requester
            else None
        )
        download_target_requester = (
            self._create_component_from_model(
                model=model.download_target_requester,
                decoder=decoder,
                config=config,
                name=f"job extract_url - {name}",
            )
            if model.download_target_requester
            else None
        )
        status_extractor = self._create_component_from_model(
            model=model.status_extractor, decoder=decoder, config=config, name=name
        )
        download_target_extractor = (
            self._create_component_from_model(
                model=model.download_target_extractor,
                decoder=decoder,
                config=config,
                name=name,
            )
            if model.download_target_extractor
            else None
        )

        job_repository: AsyncJobRepository = AsyncHttpJobRepository(
            creation_requester=creation_requester,
            polling_requester=polling_requester,
            download_retriever=download_retriever,
            download_target_requester=download_target_requester,
            abort_requester=abort_requester,
            delete_requester=delete_requester,
            status_extractor=status_extractor,
            status_mapping=self._create_async_job_status_mapping(model.status_mapping, config),
            download_target_extractor=download_target_extractor,
            job_timeout=_get_job_timeout(),
        )

        # Same resolution scheme as the job timeout: interpolate, cast to int, falsy means None.
        failed_retry_wait_time_in_seconds: Optional[int] = (
            int(
                InterpolatedString.create(
                    str(model.failed_retry_wait_time_in_seconds),
                    parameters={},
                ).eval(config)
            )
            if model.failed_retry_wait_time_in_seconds
            else None
        )

        async_job_partition_router = AsyncJobPartitionRouter(
            # The factory is a lambda, so an orchestrator is only built when the router calls it
            # with the stream slices.
            job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
                job_repository,
                stream_slices,
                self._job_tracker,
                self._message_repository,
                # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk
                has_bulk_parent=False,
                # set the `job_max_retry` to 1 for the `Connector Builder` use-case.
                # `None` == default retry is set to 3 attempts, under the hood.
                job_max_retry=1 if self._emit_connector_builder_messages else None,
                failed_retry_wait_time_in_seconds=failed_retry_wait_time_in_seconds,
            ),
            stream_slicer=stream_slicer,
            config=config,
            parameters=model.parameters or {},
        )

        return AsyncRetriever(
            record_selector=record_selector,
            stream_slicer=async_job_partition_router,
            config=config,
            parameters=model.parameters or {},
        )
SubstreamPartitionRouter: 4063 parent_stream_configs = [] 4064 if model.parent_stream_configs: 4065 parent_stream_configs.extend( 4066 [ 4067 self.create_parent_stream_config_with_substream_wrapper( 4068 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 4069 ) 4070 for parent_stream_config in model.parent_stream_configs 4071 ] 4072 ) 4073 4074 return SubstreamPartitionRouter( 4075 parent_stream_configs=parent_stream_configs, 4076 parameters=model.parameters or {}, 4077 config=config, 4078 ) 4079 4080 def create_parent_stream_config_with_substream_wrapper( 4081 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 4082 ) -> Any: 4083 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 4084 4085 parent_state: Optional[Mapping[str, Any]] = ( 4086 child_state if model.incremental_dependency and child_state else None 4087 ) 4088 connector_state_manager = self._instantiate_parent_stream_state_manager( 4089 child_state, config, model, parent_state 4090 ) 4091 4092 substream_factory = ModelToComponentFactory( 4093 connector_state_manager=connector_state_manager, 4094 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 4095 limit_slices_fetched=self._limit_slices_fetched, 4096 emit_connector_builder_messages=self._emit_connector_builder_messages, 4097 disable_retries=self._disable_retries, 4098 disable_cache=self._disable_cache, 4099 message_repository=StateFilteringMessageRepository( 4100 LogAppenderMessageRepositoryDecorator( 4101 { 4102 "airbyte_cdk": {"stream": {"is_substream": True}}, 4103 "http": {"is_auxiliary": True}, 4104 }, 4105 self._message_repository, 4106 self._evaluate_log_level(self._emit_connector_builder_messages), 4107 ), 4108 ), 4109 api_budget=self._api_budget, 4110 ) 4111 4112 return substream_factory.create_parent_stream_config( 4113 model=model, config=config, stream_name=stream_name, **kwargs 4114 ) 4115 4116 def 
_instantiate_parent_stream_state_manager( 4117 self, 4118 child_state: MutableMapping[str, Any], 4119 config: Config, 4120 model: ParentStreamConfigModel, 4121 parent_state: Optional[Mapping[str, Any]] = None, 4122 ) -> ConnectorStateManager: 4123 """ 4124 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 4125 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 4126 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 4127 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 4128 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 4129 incremental_dependency is set. 4130 """ 4131 if model.incremental_dependency and child_state: 4132 parent_stream_name = model.stream.name or "" 4133 extracted_parent_state = ConcurrentPerPartitionCursor.get_parent_state( 4134 child_state, parent_stream_name 4135 ) 4136 4137 if not extracted_parent_state: 4138 extracted_parent_state = ConcurrentPerPartitionCursor.get_global_state( 4139 child_state, parent_stream_name 4140 ) 4141 4142 if not extracted_parent_state and not isinstance(extracted_parent_state, dict): 4143 cursor_values = child_state.values() 4144 if cursor_values and len(cursor_values) == 1: 4145 incremental_sync_model: Union[ 4146 DatetimeBasedCursorModel, 4147 IncrementingCountCursorModel, 4148 ] = ( 4149 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 4150 if isinstance(model.stream, DeclarativeStreamModel) 4151 else self._get_state_delegating_stream_model( 4152 model.stream, parent_state=parent_state 4153 ).incremental_sync 4154 ) 4155 cursor_field = InterpolatedString.create( 4156 incremental_sync_model.cursor_field, 4157 
parameters=incremental_sync_model.parameters or {}, 4158 ).eval(config) 4159 extracted_parent_state = AirbyteStateMessage( 4160 type=AirbyteStateType.STREAM, 4161 stream=AirbyteStreamState( 4162 stream_descriptor=StreamDescriptor( 4163 name=parent_stream_name, namespace=None 4164 ), 4165 stream_state=AirbyteStateBlob( 4166 {cursor_field: list(cursor_values)[0]} 4167 ), 4168 ), 4169 ) 4170 return ConnectorStateManager([extracted_parent_state] if extracted_parent_state else []) 4171 4172 return ConnectorStateManager([]) 4173 4174 @staticmethod 4175 def create_wait_time_from_header( 4176 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 4177 ) -> WaitTimeFromHeaderBackoffStrategy: 4178 return WaitTimeFromHeaderBackoffStrategy( 4179 header=model.header, 4180 parameters=model.parameters or {}, 4181 config=config, 4182 regex=model.regex, 4183 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 4184 if model.max_waiting_time_in_seconds is not None 4185 else None, 4186 ) 4187 4188 @staticmethod 4189 def create_wait_until_time_from_header( 4190 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 4191 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 4192 return WaitUntilTimeFromHeaderBackoffStrategy( 4193 header=model.header, 4194 parameters=model.parameters or {}, 4195 config=config, 4196 min_wait=model.min_wait, 4197 regex=model.regex, 4198 ) 4199 4200 def get_message_repository(self) -> MessageRepository: 4201 return self._message_repository 4202 4203 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 4204 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 4205 4206 @staticmethod 4207 def create_components_mapping_definition( 4208 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 4209 ) -> ComponentMappingDefinition: 4210 interpolated_value = InterpolatedString.create( 4211 model.value, parameters=model.parameters or {} 4212 ) 4213 field_path = [ 4214 
InterpolatedString.create(path, parameters=model.parameters or {}) 4215 for path in model.field_path 4216 ] 4217 return ComponentMappingDefinition( 4218 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 4219 value=interpolated_value, 4220 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 4221 create_or_update=model.create_or_update, 4222 condition=model.condition, 4223 parameters=model.parameters or {}, 4224 ) 4225 4226 def create_http_components_resolver( 4227 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 4228 ) -> Any: 4229 retriever = self._create_component_from_model( 4230 model=model.retriever, 4231 config=config, 4232 name=f"{stream_name if stream_name else '__http_components_resolver'}", 4233 primary_key=None, 4234 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4235 transformations=[], 4236 ) 4237 4238 components_mapping = [] 4239 for component_mapping_definition_model in model.components_mapping: 4240 if component_mapping_definition_model.condition: 4241 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4242 components_mapping.append( 4243 self._create_component_from_model( 4244 model=component_mapping_definition_model, 4245 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4246 component_mapping_definition_model.value_type 4247 ), 4248 config=config, 4249 ) 4250 ) 4251 4252 return HttpComponentsResolver( 4253 retriever=retriever, 4254 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4255 config=config, 4256 components_mapping=components_mapping, 4257 parameters=model.parameters or {}, 4258 ) 4259 4260 @staticmethod 4261 def create_stream_config( 4262 model: StreamConfigModel, config: Config, **kwargs: Any 4263 ) -> StreamConfig: 4264 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 4265 [x 
for x in model.configs_pointer] if model.configs_pointer else [] 4266 ) 4267 4268 return StreamConfig( 4269 configs_pointer=model_configs_pointer, 4270 default_values=model.default_values, 4271 parameters=model.parameters or {}, 4272 ) 4273 4274 def create_config_components_resolver( 4275 self, 4276 model: ConfigComponentsResolverModel, 4277 config: Config, 4278 ) -> Any: 4279 model_stream_configs = ( 4280 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 4281 ) 4282 4283 stream_configs = [ 4284 self._create_component_from_model( 4285 stream_config, config=config, parameters=model.parameters or {} 4286 ) 4287 for stream_config in model_stream_configs 4288 ] 4289 4290 components_mapping = [ 4291 self._create_component_from_model( 4292 model=components_mapping_definition_model, 4293 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4294 components_mapping_definition_model.value_type 4295 ), 4296 config=config, 4297 parameters=model.parameters, 4298 ) 4299 for components_mapping_definition_model in model.components_mapping 4300 ] 4301 4302 return ConfigComponentsResolver( 4303 stream_configs=stream_configs, 4304 config=config, 4305 components_mapping=components_mapping, 4306 parameters=model.parameters or {}, 4307 ) 4308 4309 def create_parametrized_components_resolver( 4310 self, 4311 model: ParametrizedComponentsResolverModel, 4312 config: Config, 4313 ) -> ParametrizedComponentsResolver: 4314 stream_parameters = StreamParametersDefinition( 4315 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 4316 ) 4317 4318 components_mapping = [] 4319 for components_mapping_definition_model in model.components_mapping: 4320 if components_mapping_definition_model.condition: 4321 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4322 components_mapping.append( 4323 self._create_component_from_model( 4324 model=components_mapping_definition_model, 4325 
value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4326 components_mapping_definition_model.value_type 4327 ), 4328 config=config, 4329 ) 4330 ) 4331 return ParametrizedComponentsResolver( 4332 stream_parameters=stream_parameters, 4333 config=config, 4334 components_mapping=components_mapping, 4335 parameters=model.parameters or {}, 4336 ) 4337 4338 _UNSUPPORTED_DECODER_ERROR = ( 4339 "Specified decoder of {decoder_type} is not supported for pagination." 4340 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4341 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4342 ) 4343 4344 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4345 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4346 return True 4347 elif isinstance(decoder, CompositeRawDecoder): 4348 return self._is_supported_parser_for_pagination(decoder.parser) 4349 else: 4350 return False 4351 4352 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4353 if isinstance(parser, JsonParser): 4354 return True 4355 elif isinstance(parser, GzipParser): 4356 return isinstance(parser.inner_parser, JsonParser) 4357 else: 4358 return False 4359 4360 def create_http_api_budget( 4361 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4362 ) -> HttpAPIBudget: 4363 policies = [ 4364 self._create_component_from_model(model=policy, config=config) 4365 for policy in model.policies 4366 ] 4367 4368 return HttpAPIBudget( 4369 policies=policies, 4370 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4371 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4372 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4373 ) 4374 4375 def create_fixed_window_call_rate_policy( 4376 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 
4377 ) -> FixedWindowCallRatePolicy: 4378 matchers = [ 4379 self._create_component_from_model(model=matcher, config=config) 4380 for matcher in model.matchers 4381 ] 4382 4383 # Set the initial reset timestamp to 10 days from now. 4384 # This value will be updated by the first request. 4385 return FixedWindowCallRatePolicy( 4386 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4387 period=parse_duration(model.period), 4388 call_limit=model.call_limit, 4389 matchers=matchers, 4390 ) 4391 4392 def create_file_uploader( 4393 self, model: FileUploaderModel, config: Config, **kwargs: Any 4394 ) -> FileUploader: 4395 name = "File Uploader" 4396 requester = self._create_component_from_model( 4397 model=model.requester, 4398 config=config, 4399 name=name, 4400 **kwargs, 4401 ) 4402 download_target_extractor = self._create_component_from_model( 4403 model=model.download_target_extractor, 4404 config=config, 4405 name=name, 4406 **kwargs, 4407 ) 4408 emit_connector_builder_messages = self._emit_connector_builder_messages 4409 file_uploader = DefaultFileUploader( 4410 requester=requester, 4411 download_target_extractor=download_target_extractor, 4412 config=config, 4413 file_writer=NoopFileWriter() 4414 if emit_connector_builder_messages 4415 else LocalFileSystemFileWriter(), 4416 parameters=model.parameters or {}, 4417 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4418 ) 4419 4420 return ( 4421 ConnectorBuilderFileUploader(file_uploader) 4422 if emit_connector_builder_messages 4423 else file_uploader 4424 ) 4425 4426 def create_moving_window_call_rate_policy( 4427 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4428 ) -> MovingWindowCallRatePolicy: 4429 rates = [ 4430 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4431 ] 4432 matchers = [ 4433 self._create_component_from_model(model=matcher, config=config) 4434 for matcher in model.matchers 4435 ] 4436 
return MovingWindowCallRatePolicy( 4437 rates=rates, 4438 matchers=matchers, 4439 ) 4440 4441 def create_unlimited_call_rate_policy( 4442 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4443 ) -> UnlimitedCallRatePolicy: 4444 matchers = [ 4445 self._create_component_from_model(model=matcher, config=config) 4446 for matcher in model.matchers 4447 ] 4448 4449 return UnlimitedCallRatePolicy( 4450 matchers=matchers, 4451 ) 4452 4453 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4454 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4455 return Rate( 4456 limit=int(interpolated_limit.eval(config=config)), 4457 interval=parse_duration(model.interval), 4458 ) 4459 4460 def create_http_request_matcher( 4461 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4462 ) -> HttpRequestRegexMatcher: 4463 weight = model.weight 4464 if weight is not None: 4465 if isinstance(weight, str): 4466 weight = int(InterpolatedString.create(weight, parameters={}).eval(config)) 4467 else: 4468 weight = int(weight) 4469 if weight < 1: 4470 raise ValueError(f"weight must be >= 1, got {weight}") 4471 return HttpRequestRegexMatcher( 4472 method=model.method, 4473 url_base=model.url_base, 4474 url_path_pattern=model.url_path_pattern, 4475 params=model.params, 4476 headers=model.headers, 4477 weight=weight, 4478 ) 4479 4480 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4481 self._api_budget = self.create_component( 4482 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4483 ) 4484 4485 def create_grouping_partition_router( 4486 self, 4487 model: GroupingPartitionRouterModel, 4488 config: Config, 4489 *, 4490 stream_name: str, 4491 **kwargs: Any, 4492 ) -> GroupingPartitionRouter: 4493 underlying_router = self._create_component_from_model( 4494 model=model.underlying_partition_router, 4495 config=config, 
4496 stream_name=stream_name, 4497 **kwargs, 4498 ) 4499 if model.group_size < 1: 4500 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4501 4502 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4503 # because they are specific to individual partitions and cannot be aggregated or handled 4504 # when grouping, potentially leading to incorrect API calls. Any request customization 4505 # should be managed at the stream level through the requester's configuration. 4506 if isinstance(underlying_router, SubstreamPartitionRouter): 4507 if any( 4508 parent_config.request_option 4509 for parent_config in underlying_router.parent_stream_configs 4510 ): 4511 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4512 4513 if isinstance(underlying_router, ListPartitionRouter): 4514 if underlying_router.request_option: 4515 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4516 4517 return GroupingPartitionRouter( 4518 group_size=model.group_size, 4519 underlying_partition_router=underlying_router, 4520 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4521 config=config, 4522 ) 4523 4524 def _ensure_query_properties_to_model( 4525 self, requester: Union[HttpRequesterModel, CustomRequesterModel] 4526 ) -> None: 4527 """ 4528 For some reason, it seems like CustomRequesterModel request_parameters stays as dictionaries which means that 4529 the other conditions relying on it being QueryPropertiesModel instead of a dict fail. Here, we migrate them to 4530 proper model. 
4531 """ 4532 if not hasattr(requester, "request_parameters"): 4533 return 4534 4535 request_parameters = requester.request_parameters 4536 if request_parameters and isinstance(request_parameters, Dict): 4537 for request_parameter_key in request_parameters.keys(): 4538 request_parameter = request_parameters[request_parameter_key] 4539 if ( 4540 isinstance(request_parameter, Dict) 4541 and request_parameter.get("type") == "QueryProperties" 4542 ): 4543 request_parameters[request_parameter_key] = QueryPropertiesModel.parse_obj( 4544 request_parameter 4545 ) 4546 4547 def _get_catalog_defined_cursor_field( 4548 self, stream_name: str, allow_catalog_defined_cursor_field: bool 4549 ) -> Optional[CursorField]: 4550 if not allow_catalog_defined_cursor_field: 4551 return None 4552 4553 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 4554 4555 # Depending on the operation is being performed, there may not be a configured stream yet. In this 4556 # case we return None which will then use the default cursor field defined on the cursor model. 4557 # We also treat cursor_field: [""] (list with empty string) as no cursor field, since this can 4558 # occur when the platform serializes "no cursor configured" streams incorrectly. 4559 if ( 4560 not configured_stream 4561 or not configured_stream.cursor_field 4562 or not configured_stream.cursor_field[0] 4563 ): 4564 return None 4565 elif len(configured_stream.cursor_field) > 1: 4566 raise ValueError( 4567 f"The `{stream_name}` stream does not support nested cursor_field. Please specify only a single cursor_field for the stream in the configured catalog." 4568 ) 4569 else: 4570 return CursorField( 4571 cursor_field_key=configured_stream.cursor_field[0], 4572 supports_catalog_defined_cursor_field=allow_catalog_defined_cursor_field, 4573 )
def __init__(
    self,
    limit_pages_fetched_per_slice: Optional[int] = None,
    limit_slices_fetched: Optional[int] = None,
    emit_connector_builder_messages: bool = False,
    disable_retries: bool = False,
    disable_cache: bool = False,
    message_repository: Optional[MessageRepository] = None,
    connector_state_manager: Optional[ConnectorStateManager] = None,
    max_concurrent_async_job_count: Optional[int] = None,
    configured_catalog: Optional[ConfiguredAirbyteCatalog] = None,
    api_budget: Optional[APIBudget] = None,
):
    """
    Store the factory options and set up the collaborators shared by created components.

    A missing message repository is replaced by an InMemoryMessageRepository, a missing
    connector state manager by a fresh ConnectorStateManager, and a missing (or zero) maximum
    concurrent async job count by 1.
    """
    self._init_mappings()

    self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
    self._limit_slices_fetched = limit_slices_fetched
    self._emit_connector_builder_messages = emit_connector_builder_messages
    self._disable_retries = disable_retries
    self._disable_cache = disable_cache

    if not message_repository:
        message_repository = InMemoryMessageRepository(
            self._evaluate_log_level(emit_connector_builder_messages)
        )
    self._message_repository = message_repository

    self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream(
        configured_catalog
    )

    if not connector_state_manager:
        connector_state_manager = ConnectorStateManager()
    self._connector_state_manager = connector_state_manager

    self._api_budget: Optional[Union[APIBudget]] = api_budget

    job_limit = max_concurrent_async_job_count or 1
    self._job_tracker: JobTracker = JobTracker(job_limit)

    # placeholder for deprecation warnings
    self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
def create_component(
    self,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    config: Config,
    **kwargs: Any,
) -> Any:
    """
    Parse a component definition into the given Pydantic model type and build the matching
    declarative component (and its subcomponents) from that model.

    :param model_type: The type of declarative component that is being initialized
    :param component_definition: The mapping that represents a declarative component
    :param config: The connector config that is provided by the customer
    :return: The declarative component to be used at runtime
    :raises ValueError: If the definition's `type` differs from the model type's name, or if
        the parsed model is not an instance of `model_type`
    """
    declared_type = component_definition.get("type")
    if declared_type != model_type.__name__:
        raise ValueError(
            f"Expected manifest component of type {model_type.__name__}, but received {declared_type} instead"
        )

    parsed_model = model_type.parse_obj(component_definition)
    if isinstance(parsed_model, model_type):
        return self._create_component_from_model(model=parsed_model, config=config, **kwargs)

    raise ValueError(
        f"Expected {model_type.__name__} component, but received {parsed_model.__class__.__name__}"
    )
Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating declarative components from that model.
Parameters
- model_type: The type of declarative component that is being initialized
- component_definition: The mapping that represents a declarative component
- config: The connector config that is provided by the customer
Returns
The declarative component to be used at runtime
def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]:
    """
    Give access to the deprecation log messages stored on this factory.
    """
    collected_logs = self._collected_deprecation_logs
    return collected_logs
Returns the deprecation warnings that were collected during the creation of components.
def create_config_migration(
    self, model: ConfigMigrationModel, config: Config
) -> ConfigMigration:
    """
    Build a ConfigMigration whose transformations are created, in order, from
    `model.transformations`.
    """
    transformations: List[ConfigTransformation] = []
    for transformation_model in model.transformations:
        transformations.append(self._create_component_from_model(transformation_model, config))

    return ConfigMigration(description=model.description, transformations=transformations)
def create_config_add_fields(
    self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any
) -> ConfigAddFields:
    """
    Build a ConfigAddFields transformation. Each entry of `model.fields` becomes a component
    and an unset condition is passed on as an empty string.
    """
    built_fields = []
    for field_model in model.fields:
        built_fields.append(self._create_component_from_model(field_model, config))

    condition = model.condition or ""
    return ConfigAddFields(fields=built_fields, condition=condition)
@staticmethod
def create_added_field_definition(
    model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
) -> AddedFieldDefinition:
    """
    Build an AddedFieldDefinition. The model's value is wrapped in an interpolated string and
    its `value_type` name is resolved to a Python type.
    """
    value = InterpolatedString.create(model.value, parameters=model.parameters or {})
    resolved_value_type = ModelToComponentFactory._json_schema_type_name_to_type(
        model.value_type
    )
    return AddedFieldDefinition(
        path=model.path,
        value=value,
        value_type=resolved_value_type,
        parameters=model.parameters or {},
    )
def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields:
    """
    Build an AddFields transformation. Every field definition model is turned into a component,
    with its resolved `value_type` forwarded as a keyword argument.
    """
    field_definitions = []
    for field_model in model.fields:
        resolved_value_type = ModelToComponentFactory._json_schema_type_name_to_type(
            field_model.value_type
        )
        field_definitions.append(
            self._create_component_from_model(
                model=field_model,
                value_type=resolved_value_type,
                config=config,
            )
        )

    return AddFields(
        fields=field_definitions,
        condition=model.condition or "",
        parameters=model.parameters or {},
    )
def create_dpath_flatten_fields(
    self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
) -> DpathFlattenFields:
    """
    Build a DpathFlattenFields transformation. `delete_origin_value` and `replace_record`
    default to False when unset; a KeyTransformation is built only when the model defines one.
    """
    field_path: List[Union[InterpolatedString, str]] = list(model.field_path)

    key_transformation = None
    if model.key_transformation is not None:
        key_transformation = KeyTransformation(
            config=config,
            prefix=model.key_transformation.prefix,
            suffix=model.key_transformation.suffix,
            parameters=model.parameters or {},
        )

    delete_origin_value = model.delete_origin_value
    if delete_origin_value is None:
        delete_origin_value = False

    replace_record = model.replace_record
    if replace_record is None:
        replace_record = False

    return DpathFlattenFields(
        config=config,
        field_path=field_path,
        delete_origin_value=delete_origin_value,
        replace_record=replace_record,
        key_transformation=key_transformation,
        parameters=model.parameters or {},
    )
def create_api_key_authenticator(
    self,
    model: ApiKeyAuthenticatorModel,
    config: Config,
    token_provider: Optional[TokenProvider] = None,
    **kwargs: Any,
) -> ApiKeyAuthenticator:
    """
    Build an ApiKeyAuthenticator.

    The token is injected according to `inject_into` when set, otherwise into the header named
    by the deprecated `header` option. A supplied `token_provider` is used as-is; without one
    the model's `api_token` is wrapped in an InterpolatedStringTokenProvider.

    :raises ValueError: If neither or both of `inject_into` and `header` are set, or if a
        token provider is given together with a non-empty `api_token`
    """
    has_inject_into = model.inject_into is not None
    has_header = model.header is not None

    if not has_inject_into and not has_header:
        raise ValueError(
            "Expected either inject_into or header to be set for ApiKeyAuthenticator"
        )
    if has_inject_into and has_header:
        raise ValueError(
            "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option"
        )
    if token_provider is not None and model.api_token != "":
        raise ValueError(
            "If token_provider is set, api_token is ignored and has to be set to empty string."
        )

    if model.inject_into:
        request_option = self._create_component_from_model(
            model.inject_into, config, parameters=model.parameters or {}
        )
    else:
        request_option = RequestOption(
            inject_into=RequestOptionType.header,
            field_name=model.header or "",
            parameters=model.parameters or {},
        )

    resolved_token_provider = token_provider
    if resolved_token_provider is None:
        resolved_token_provider = InterpolatedStringTokenProvider(
            api_token=model.api_token or "",
            config=config,
            parameters=model.parameters or {},
        )

    return ApiKeyAuthenticator(
        token_provider=resolved_token_provider,
        request_option=request_option,
        config=config,
        parameters=model.parameters or {},
    )
def create_legacy_to_per_partition_state_migration(
    self,
    model: LegacyToPerPartitionStateMigrationModel,
    config: Mapping[str, Any],
    declarative_stream: DeclarativeStreamModel,
) -> LegacyToPerPartitionStateMigration:
    """
    Build the migration that converts legacy state into per-partition state for a stream.

    :param model: The migration model (not read here)
    :param config: The connector config
    :param declarative_stream: The stream model the migration applies to; its retriever,
        partition router, incremental_sync and parameters are used
    :return: The LegacyToPerPartitionStateMigration for the stream
    :raises ValueError: If the retriever is not a SimpleRetriever/AsyncRetriever model, if the
        partition router is not a substream or custom partition router model, if the router has
        no `parent_stream_configs` attribute, or if the stream has no incremental_sync
    """
    retriever = declarative_stream.retriever
    if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
        raise ValueError(
            f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
        )
    partition_router = retriever.partition_router
    if not isinstance(
        partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
    ):
        raise ValueError(
            f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
        )
    if not hasattr(partition_router, "parent_stream_configs"):
        raise ValueError(
            "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
        )

    # An attribute-presence check alone lets a stream whose incremental_sync is None through,
    # so the value itself is checked: the migration is only meaningful with a cursor definition.
    incremental_sync = getattr(declarative_stream, "incremental_sync", None)
    if incremental_sync is None:
        raise ValueError(
            "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
        )

    return LegacyToPerPartitionStateMigration(
        partition_router,  # type: ignore # was already checked above
        incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
        config,
        declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
    )
def create_session_token_authenticator(
    self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
    """
    Build the authenticator that sends a session token obtained through a login request.

    A SessionTokenProvider is built around the login requester. For "Bearer" request
    authentication it backs a BearerAuthenticator; otherwise it is wrapped in an
    InterpolatedSessionTokenProvider and backs an ApiKeyAuthenticator.
    """
    if model.decoder:
        decoder = self._create_component_from_model(model=model.decoder, config=config)
    else:
        decoder = JsonDecoder(parameters={})

    login_requester = self._create_component_from_model(
        model=model.login_requester,
        config=config,
        name=f"{name}_login_requester",
        decoder=decoder,
    )

    expiration_duration = None
    if model.expiration_duration:
        expiration_duration = parse_duration(model.expiration_duration)

    token_provider = SessionTokenProvider(
        login_requester=login_requester,
        session_token_path=model.session_token_path,
        expiration_duration=expiration_duration,
        parameters=model.parameters or {},
        message_repository=self._message_repository,
        decoder=decoder,
    )

    request_authentication = model.request_authentication
    if request_authentication.type == "Bearer":
        return ModelToComponentFactory.create_bearer_authenticator(
            BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
            config,
            token_provider=token_provider,
        )

    # Get the api_token template if specified, default to just the session token
    api_token_template = (
        getattr(model.request_authentication, "api_token", None) or "{{ session_token }}"
    )
    final_token_provider: TokenProvider = InterpolatedSessionTokenProvider(
        config=config,
        api_token=api_token_template,
        session_token_provider=token_provider,
        parameters=model.parameters or {},
    )
    return self.create_api_key_authenticator(
        ApiKeyAuthenticatorModel(
            type="ApiKeyAuthenticator",
            api_token="",
            inject_into=model.request_authentication.inject_into,
        ),  # type: ignore # $parameters and headers default to None
        config=config,
        token_provider=final_token_provider,
    )
@staticmethod
def create_basic_http_authenticator(
    model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
) -> BasicHttpAuthenticator:
    """
    Build a BasicHttpAuthenticator; an unset password is passed on as an empty string.
    """
    password = model.password or ""
    parameters = model.parameters or {}
    return BasicHttpAuthenticator(
        password=password,
        username=model.username,
        config=config,
        parameters=parameters,
    )
@staticmethod
def create_bearer_authenticator(
    model: BearerAuthenticatorModel,
    config: Config,
    token_provider: Optional[TokenProvider] = None,
    **kwargs: Any,
) -> BearerAuthenticator:
    """
    Build a BearerAuthenticator. A supplied `token_provider` is used as-is; without one the
    model's `api_token` is wrapped in an InterpolatedStringTokenProvider.

    :raises ValueError: If a token provider is given together with a non-empty `api_token`
    """
    if token_provider is not None and model.api_token != "":
        raise ValueError(
            "If token_provider is set, api_token is ignored and has to be set to empty string."
        )

    resolved_token_provider = token_provider
    if resolved_token_provider is None:
        resolved_token_provider = InterpolatedStringTokenProvider(
            api_token=model.api_token or "",
            config=config,
            parameters=model.parameters or {},
        )

    return BearerAuthenticator(
        token_provider=resolved_token_provider,
        config=config,
        parameters=model.parameters or {},
    )
@staticmethod
def create_dynamic_stream_check_config(
    model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any
) -> DynamicStreamCheckConfig:
    """
    Build a DynamicStreamCheckConfig carrying the model's dynamic stream name and stream count.
    """
    stream_name = model.dynamic_stream_name
    stream_count = model.stream_count
    return DynamicStreamCheckConfig(dynamic_stream_name=stream_name, stream_count=stream_count)
def create_check_stream(
    self, model: CheckStreamModel, config: Config, **kwargs: Any
) -> CheckStream:
    """
    Build a CheckStream from static stream names and/or dynamic stream check configs.

    :raises ValueError: If both `stream_names` and `dynamic_streams_check_configs` are None
    """
    if model.dynamic_streams_check_configs is None and model.stream_names is None:
        raise ValueError(
            "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
        )

    dynamic_streams_check_configs = []
    if model.dynamic_streams_check_configs:
        for check_config_model in model.dynamic_streams_check_configs:
            dynamic_streams_check_configs.append(
                self._create_component_from_model(model=check_config_model, config=config)
            )

    return CheckStream(
        stream_names=model.stream_names or [],
        dynamic_streams_check_configs=dynamic_streams_check_configs,
        parameters={},
    )
@staticmethod
def create_check_dynamic_stream(
    model: CheckDynamicStreamModel, config: Config, **kwargs: Any
) -> CheckDynamicStream:
    """
    Build a CheckDynamicStream from the model's stream count and availability-check flag.
    """
    assert model.use_check_availability is not None  # for mypy

    return CheckDynamicStream(
        stream_count=model.stream_count,
        use_check_availability=model.use_check_availability,
        parameters={},
    )
def create_composite_error_handler(
    self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any
) -> CompositeErrorHandler:
    """
    Build a CompositeErrorHandler with one handler component per entry of
    `model.error_handlers`, in order.
    """
    handlers = []
    for handler_model in model.error_handlers:
        handlers.append(self._create_component_from_model(model=handler_model, config=config))

    return CompositeErrorHandler(error_handlers=handlers, parameters=model.parameters or {})
@staticmethod
def create_concurrency_level(
    model: ConcurrencyLevelModel, config: Config, **kwargs: Any
) -> ConcurrencyLevel:
    """
    Build a ConcurrencyLevel from the model's default and maximum concurrency values.
    """
    default_concurrency = model.default_concurrency
    max_concurrency = model.max_concurrency
    return ConcurrencyLevel(
        default_concurrency=default_concurrency,
        max_concurrency=max_concurrency,
        config=config,
        parameters={},
    )
1312 @staticmethod 1313 def apply_stream_state_migrations( 1314 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1315 ) -> MutableMapping[str, Any]: 1316 if stream_state_migrations: 1317 for state_migration in stream_state_migrations: 1318 if state_migration.should_migrate(stream_state): 1319 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 1320 stream_state = dict(state_migration.migrate(stream_state)) 1321 return stream_state
def create_concurrent_cursor_from_datetime_based_cursor(
    self,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    stream_name: str,
    stream_namespace: Optional[str],
    stream_state: MutableMapping[str, Any],
    config: Config,
    message_repository: Optional[MessageRepository] = None,
    runtime_lookback_window: Optional[datetime.timedelta] = None,
    **kwargs: Any,
) -> ConcurrentCursor:
    """
    Build a ConcurrentCursor from a datetime-based cursor component definition.

    Note that this method mutates two of its arguments: `component_definition` gains a
    `$parameters` key when it only carries `parameters`, and `stream_state` has its cursor
    value shifted back when `runtime_lookback_window` is given and a cursor value is present.

    :param model_type: The Pydantic model type the definition is parsed into
    :param component_definition: The mapping that represents the cursor component
    :param stream_name: Name of the stream the cursor belongs to
    :param stream_namespace: Namespace of the stream, if any
    :param stream_state: Current state of the stream
    :param config: The connector config that is provided by the customer
    :param message_repository: Repository handed to the cursor; the factory's own repository is used when omitted
    :param runtime_lookback_window: Duration subtracted from the state's cursor value before the cursor is built
    :return: The ConcurrentCursor to be used at runtime
    :raises ValueError: If the definition's type does not match `model_type`, if the parsed
        model is not a DatetimeBasedCursorModel, if only one of `step` and `cursor_granularity`
        is defined, if the clamping target is not DAY, WEEK or MONTH, or if WEEK clamping is
        requested without a `weekday` in `target_details`
    """
    component_type = component_definition.get("type")
    if component_definition.get("type") != model_type.__name__:
        raise ValueError(
            f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
        )

    # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
    # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
    # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
    # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
    if "$parameters" not in component_definition and "parameters" in component_definition:
        component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore  # This is a dict
    datetime_based_cursor_model = model_type.parse_obj(component_definition)

    if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
        raise ValueError(
            f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
        )

    model_parameters = datetime_based_cursor_model.parameters or {}

    # A cursor field taken from the configured catalog wins over the one declared on the model.
    cursor_field = self._get_catalog_defined_cursor_field(
        stream_name=stream_name,
        allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
        or False,
    )

    if not cursor_field:
        interpolated_cursor_field = InterpolatedString.create(
            datetime_based_cursor_model.cursor_field,
            parameters=model_parameters,
        )
        cursor_field = CursorField(
            cursor_field_key=interpolated_cursor_field.eval(config=config),
            supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

    # Slice boundary field names default to "start_time" / "end_time" when not set on the model.
    interpolated_partition_field_start = InterpolatedString.create(
        datetime_based_cursor_model.partition_field_start or "start_time",
        parameters=model_parameters,
    )
    interpolated_partition_field_end = InterpolatedString.create(
        datetime_based_cursor_model.partition_field_end or "end_time",
        parameters=model_parameters,
    )

    slice_boundary_fields = (
        interpolated_partition_field_start.eval(config=config),
        interpolated_partition_field_end.eval(config=config),
    )

    datetime_format = datetime_based_cursor_model.datetime_format

    cursor_granularity = (
        parse_duration(datetime_based_cursor_model.cursor_granularity)
        if datetime_based_cursor_model.cursor_granularity
        else None
    )

    # The lookback window stays None unless the model defines one that evaluates to a non-empty value.
    lookback_window = None
    interpolated_lookback_window = (
        InterpolatedString.create(
            datetime_based_cursor_model.lookback_window,
            parameters=model_parameters,
        )
        if datetime_based_cursor_model.lookback_window
        else None
    )
    if interpolated_lookback_window:
        evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
        if evaluated_lookback_window:
            lookback_window = parse_duration(evaluated_lookback_window)

    connector_state_converter: DateTimeStreamStateConverter
    connector_state_converter = CustomFormatConcurrentStreamStateConverter(
        datetime_format=datetime_format,
        input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
        is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
        cursor_granularity=cursor_granularity,
    )

    # Adjusts the stream state by applying the runtime lookback window.
    # This is used to ensure correct state handling in case of failed partitions.
    stream_state_value = stream_state.get(cursor_field.cursor_field_key)
    if runtime_lookback_window and stream_state_value:
        new_stream_state = (
            connector_state_converter.parse_timestamp(stream_state_value)
            - runtime_lookback_window
        )
        stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
            new_stream_state
        )

    start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
    if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
        start_date_runtime_value = self.create_min_max_datetime(
            model=datetime_based_cursor_model.start_datetime, config=config
        )
    else:
        start_date_runtime_value = datetime_based_cursor_model.start_datetime

    end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]]
    if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel):
        end_date_runtime_value = self.create_min_max_datetime(
            model=datetime_based_cursor_model.end_datetime, config=config
        )
    else:
        end_date_runtime_value = datetime_based_cursor_model.end_datetime

    # NOTE(review): the raw model parameters (possibly None) are passed here rather than
    # `model_parameters` — presumably MinMaxDatetime.create accepts None; confirm.
    interpolated_start_date = MinMaxDatetime.create(
        interpolated_string_or_min_max_datetime=start_date_runtime_value,
        parameters=datetime_based_cursor_model.parameters,
    )
    interpolated_end_date = (
        None
        if not end_date_runtime_value
        else MinMaxDatetime.create(
            end_date_runtime_value, datetime_based_cursor_model.parameters
        )
    )

    # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
    if not interpolated_start_date.datetime_format:
        interpolated_start_date.datetime_format = datetime_format
    if interpolated_end_date and not interpolated_end_date.datetime_format:
        interpolated_end_date.datetime_format = datetime_format

    start_date = interpolated_start_date.get_datetime(config=config)
    # The start date is evaluated once here, while the end date is wrapped in a callable.
    end_date_provider = (
        partial(interpolated_end_date.get_datetime, config)
        if interpolated_end_date
        else connector_state_converter.get_end_provider()
    )

    # `step` and `cursor_granularity` must be defined together or not at all.
    if (
        datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity
    ) or (
        not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity
    ):
        raise ValueError(
            f"If step is defined, cursor_granularity should be as well and vice-versa. "
            f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`"
        )

    # When step is not defined, default to a step size from the starting date to the present moment
    step_length = datetime.timedelta.max
    interpolated_step = (
        InterpolatedString.create(
            datetime_based_cursor_model.step,
            parameters=model_parameters,
        )
        if datetime_based_cursor_model.step
        else None
    )
    if interpolated_step:
        evaluated_step = interpolated_step.eval(config)
        if evaluated_step:
            step_length = parse_duration(evaluated_step)

    clamping_strategy: ClampingStrategy = NoClamping()
    if datetime_based_cursor_model.clamping:
        # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
        # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime
        # object which we want to keep agnostic of being low-code
        target = InterpolatedString(
            string=datetime_based_cursor_model.clamping.target,
            parameters=model_parameters,
        )
        evaluated_target = target.eval(config=config)
        # Each supported target sets the clamping strategy and wraps the end provider in a
        # ClampingEndProvider built with the non-ceiling variant of the same strategy.
        match evaluated_target:
            case "DAY":
                clamping_strategy = DayClampingStrategy()
                end_date_provider = ClampingEndProvider(
                    DayClampingStrategy(is_ceiling=False),
                    end_date_provider,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                    granularity=cursor_granularity or datetime.timedelta(seconds=1),
                )
            case "WEEK":
                if (
                    not datetime_based_cursor_model.clamping.target_details
                    or "weekday" not in datetime_based_cursor_model.clamping.target_details
                ):
                    raise ValueError(
                        "Given WEEK clamping, weekday needs to be provided as target_details"
                    )
                weekday = self._assemble_weekday(
                    datetime_based_cursor_model.clamping.target_details["weekday"]
                )
                clamping_strategy = WeekClampingStrategy(weekday)
                end_date_provider = ClampingEndProvider(
                    WeekClampingStrategy(weekday, is_ceiling=False),
                    end_date_provider,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                    granularity=cursor_granularity or datetime.timedelta(days=1),
                )
            case "MONTH":
                clamping_strategy = MonthClampingStrategy()
                end_date_provider = ClampingEndProvider(
                    MonthClampingStrategy(is_ceiling=False),
                    end_date_provider,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                    granularity=cursor_granularity or datetime.timedelta(days=1),
                )
            case _:
                raise ValueError(
                    f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
                )

    return ConcurrentCursor(
        stream_name=stream_name,
        stream_namespace=stream_namespace,
        stream_state=stream_state,
        message_repository=message_repository or self._message_repository,
        connector_state_manager=self._connector_state_manager,
        connector_state_converter=connector_state_converter,
        cursor_field=cursor_field,
        slice_boundary_fields=slice_boundary_fields,
        start=start_date,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        end_provider=end_date_provider,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        lookback_window=lookback_window,
        slice_range=step_length,
        cursor_granularity=cursor_granularity,
        clamping_strategy=clamping_strategy,
    )
def create_concurrent_cursor_from_incrementing_count_cursor(
    self,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    stream_name: str,
    stream_namespace: Optional[str],
    stream_state: MutableMapping[str, Any],
    config: Config,
    message_repository: Optional[MessageRepository] = None,
    **kwargs: Any,
) -> ConcurrentCursor:
    """
    Builds a ConcurrentCursor from an IncrementingCountCursor component definition.

    :param model_type: The Pydantic model class used to parse the definition; its name must equal the definition's `type`
    :param component_definition: The raw component definition of the cursor
    :param stream_name: The name of the stream the cursor is created for
    :param stream_namespace: The optional namespace of the stream
    :param stream_state: The state passed through to the cursor
    :param config: The connector config used to evaluate interpolated values
    :param message_repository: Optional repository used instead of the factory's own message repository
    :return: A ConcurrentCursor using an IncrementingCountStreamStateConverter
    :raises ValueError: If the definition's `type` or the parsed model is not the expected one
    """
    expected_type_name = model_type.__name__
    declared_type = component_definition.get("type")
    if declared_type != expected_type_name:
        raise ValueError(
            f"Expected manifest component of type {expected_type_name}, but received {declared_type} instead"
        )

    cursor_model = model_type.parse_obj(component_definition)
    if not isinstance(cursor_model, IncrementingCountCursorModel):
        raise ValueError(
            f"Expected {expected_type_name} component, but received {cursor_model.__class__.__name__}"
        )

    # Pydantic Union type coercion can convert int 0 to string '0' depending on Union order, so the
    # value is always stringified, interpolated, and converted back to int for the ConcurrentCursor.
    raw_start_value: Union[int, str, None] = cursor_model.start_value
    if raw_start_value is None:
        initial_value: int = 0
    else:
        initial_value = int(
            InterpolatedString.create(
                str(raw_start_value),  # InterpolatedString.create is always given a string
                parameters=cursor_model.parameters or {},
            ).eval(config=config)
        )

    catalog_cursor_allowed = cursor_model.allow_catalog_defined_cursor_field or False
    cursor_field = self._get_catalog_defined_cursor_field(
        stream_name=stream_name,
        allow_catalog_defined_cursor_field=catalog_cursor_allowed,
    )
    if not cursor_field:
        # No cursor field came back from the catalog lookup: use the one declared on the model
        cursor_field_key = InterpolatedString.create(
            cursor_model.cursor_field,
            parameters=cursor_model.parameters or {},
        ).eval(config=config)
        cursor_field = CursorField(
            cursor_field_key=cursor_field_key,
            supports_catalog_defined_cursor_field=catalog_cursor_allowed,
        )

    state_converter = IncrementingCountStreamStateConverter(
        is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
    )

    return ConcurrentCursor(
        stream_name=stream_name,
        stream_namespace=stream_namespace,
        stream_state=stream_state,
        message_repository=message_repository or self._message_repository,
        connector_state_manager=self._connector_state_manager,
        connector_state_converter=state_converter,
        cursor_field=cursor_field,
        slice_boundary_fields=None,
        start=initial_value,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        end_provider=state_converter.get_end_provider(),  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
    )
def create_concurrent_cursor_from_perpartition_cursor(
    self,
    state_manager: ConnectorStateManager,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    stream_name: str,
    stream_namespace: Optional[str],
    config: Config,
    stream_state: MutableMapping[str, Any],
    partition_router: PartitionRouter,
    attempt_to_create_cursor_if_not_provided: bool = False,
    **kwargs: Any,
) -> ConcurrentPerPartitionCursor:
    """
    Builds a ConcurrentPerPartitionCursor whose per-partition cursors are created from a DatetimeBasedCursor definition.

    Note that `component_definition` is modified in place: when it has `parameters` but no `$parameters`,
    the former is copied under the `$parameters` key before parsing.

    :param state_manager: The state manager given to the per-partition cursor and to the cursors it creates
    :param model_type: The Pydantic model class used to parse the definition; its name must equal the definition's `type`
    :param component_definition: The raw component definition of the datetime based cursor
    :param stream_name: The name of the stream the cursor is created for
    :param stream_namespace: The optional namespace of the stream
    :param config: The connector config used to evaluate interpolated values
    :param stream_state: The state passed through to the per-partition cursor
    :param partition_router: The router producing partitions; a GroupingPartitionRouter forces the global cursor
    :param attempt_to_create_cursor_if_not_provided: Forwarded unchanged to ConcurrentPerPartitionCursor
    :return: The per-partition cursor
    :raises ValueError: If the definition's `type` or the parsed model is not the expected one
    """
    expected_type_name = model_type.__name__
    declared_type = component_definition.get("type")
    if declared_type != expected_type_name:
        raise ValueError(
            f"Expected manifest component of type {expected_type_name}, but received {declared_type} instead"
        )

    # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
    # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
    # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
    # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
    has_dollar_parameters = "$parameters" in component_definition
    if not has_dollar_parameters and "parameters" in component_definition:
        component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore  # This is a dict

    cursor_model = model_type.parse_obj(component_definition)
    if not isinstance(cursor_model, DatetimeBasedCursorModel):
        raise ValueError(
            f"Expected {expected_type_name} component, but received {cursor_model.__class__.__name__}"
        )

    catalog_cursor_allowed = cursor_model.allow_catalog_defined_cursor_field or False
    cursor_field = self._get_catalog_defined_cursor_field(
        stream_name=stream_name,
        allow_catalog_defined_cursor_field=catalog_cursor_allowed,
    )
    if not cursor_field:
        # No cursor field came back from the catalog lookup: use the one declared on the model.
        # The parameters used here are the ones normalized by the `$parameters` handling above.
        cursor_field_key = InterpolatedString.create(
            cursor_model.cursor_field,
            parameters=cursor_model.parameters or {},
        ).eval(config=config)
        cursor_field = CursorField(
            cursor_field_key=cursor_field_key,
            supports_catalog_defined_cursor_field=catalog_cursor_allowed,
        )

    datetime_format = cursor_model.datetime_format

    if cursor_model.cursor_granularity:
        cursor_granularity = parse_duration(cursor_model.cursor_granularity)
    else:
        cursor_granularity = None

    state_converter: DateTimeStreamStateConverter = CustomFormatConcurrentStreamStateConverter(
        datetime_format=datetime_format,
        input_datetime_formats=cursor_model.cursor_datetime_formats,
        is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
        cursor_granularity=cursor_granularity,
    )

    # Factory creating one datetime based cursor per partition; it is bound to a NoopMessageRepository
    create_partition_cursor = partial(
        self.create_concurrent_cursor_from_datetime_based_cursor,
        state_manager=state_manager,
        model_type=model_type,
        component_definition=component_definition,
        stream_name=stream_name,
        stream_namespace=stream_namespace,
        config=config,
        message_repository=NoopMessageRepository(),
    )
    cursor_factory = ConcurrentCursorFactory(create_partition_cursor)

    # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
    use_global_cursor = isinstance(
        partition_router, GroupingPartitionRouter
    ) or component_definition.get("global_substream_cursor", False)

    return ConcurrentPerPartitionCursor(
        cursor_factory=cursor_factory,
        partition_router=partition_router,
        stream_name=stream_name,
        stream_namespace=stream_namespace,
        stream_state=stream_state,
        message_repository=self._message_repository,  # type: ignore
        connector_state_manager=state_manager,
        connector_state_converter=state_converter,
        cursor_field=cursor_field,
        use_global_cursor=use_global_cursor,
        attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided,
    )
@staticmethod
def create_constant_backoff_strategy(
    model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
) -> ConstantBackoffStrategy:
    """
    Creates a ConstantBackoffStrategy from its model.

    The model's jitter range is first handed to `_validate_jitter_range` (defined elsewhere in this class).

    :param model: The Pydantic model of the constant backoff strategy
    :param config: The custom defined connector config
    :return: The backoff strategy to be used at runtime
    """
    jitter_range = model.jitter_range_in_seconds
    ModelToComponentFactory._validate_jitter_range(jitter_range)

    strategy_parameters = model.parameters or {}
    return ConstantBackoffStrategy(
        backoff_time_in_seconds=model.backoff_time_in_seconds,
        jitter_range_in_seconds=jitter_range,
        config=config,
        parameters=strategy_parameters,
    )
def create_cursor_pagination(
    self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any
) -> CursorPaginationStrategy:
    """
    Creates a CursorPaginationStrategy from its model.

    :param model: The Pydantic model of the cursor pagination strategy
    :param config: The custom defined connector config
    :param decoder: The decoder for paginated responses; it is wrapped in a PaginationDecoderDecorator unless it already is one
    :return: The pagination strategy to be used at runtime
    :raises ValueError: If the (unwrapped) decoder is not supported for pagination
    """
    if isinstance(decoder, PaginationDecoderDecorator):
        pagination_decoder = decoder
        unwrapped_decoder = decoder.decoder
    else:
        unwrapped_decoder = decoder
        pagination_decoder = PaginationDecoderDecorator(decoder=decoder)

    # The support check is performed on the innermost decoder, not on the decorator
    if not self._is_supported_decoder_for_pagination(unwrapped_decoder):
        raise ValueError(
            self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(unwrapped_decoder))
        )

    # Pydantic v1 Union type coercion can convert int to string depending on Union order.
    # If page_size is a string that represents an integer (not an interpolation), convert it back.
    page_size = model.page_size
    looks_like_plain_integer = isinstance(page_size, str) and page_size.isdigit()
    if looks_like_plain_integer:
        page_size = int(page_size)

    return CursorPaginationStrategy(
        cursor_value=model.cursor_value,
        decoder=pagination_decoder,
        page_size=page_size,
        stop_condition=model.stop_condition,
        config=config,
        parameters=model.parameters or {},
    )
def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any:
    """
    Generically creates a custom component based on the model type and a class_name reference to the custom Python class being
    instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor.

    :param model: The Pydantic model of the custom component being created
    :param config: The custom defined connector config
    :param kwargs: Arguments passed down by a parent component; on a name collision they win over the model's own fields
    :return: The declarative component built from the Pydantic model to be used at runtime
    """
    component_class = self._get_class_from_fully_qualified_class_name(model.class_name)
    type_hints = get_type_hints(component_class)

    model_args = model.dict()
    model_args["config"] = config
    # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field
    # collisions we defer to these arguments over the component's definition
    model_args.update(kwargs)

    def _fill_in_missing_type(field_name: str, candidate: Any) -> None:
        # If a custom component field doesn't have a type set, we try to use the type hints to infer the type
        if not isinstance(candidate, dict):
            return
        if "type" in candidate or field_name not in type_hints:
            return
        inferred_type = self._derive_component_type_from_type_hints(type_hints.get(field_name))
        if inferred_type:
            candidate["type"] = inferred_type

    # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and
    # types are not defined in the schema. The fields and types are defined within the Python class implementation.
    # Pydantic can only parse down to the custom component and this code performs a second parse to convert the
    # sub-fields first into models, then declarative components
    for field_name, field_value in model_args.items():
        _fill_in_missing_type(field_name, field_value)

        if self._is_component(field_value):
            model_args[field_name] = self._create_nested_component(
                model,
                field_name,
                field_value,
                config,
                **kwargs,
            )
            continue

        if not isinstance(field_value, list):
            continue

        # Lists are rebuilt element by element: components are created, other values are kept as they are
        built_items = []
        for item in field_value:
            _fill_in_missing_type(field_name, item)
            if self._is_component(item):
                item = self._create_nested_component(
                    model,
                    field_name,
                    item,
                    config,
                    **kwargs,
                )
            built_items.append(item)
        model_args[field_name] = built_items

    # Only the arguments that the custom class declares (through its type hints) are given to its constructor
    constructor_args = {
        hinted_field: model_args[hinted_field]
        for hinted_field in type_hints
        if hinted_field in model_args
    }

    if "api_budget" in type_hints and constructor_args.get("api_budget") is None:
        constructor_args["api_budget"] = self._api_budget

    return component_class(**constructor_args)
Generically creates a custom component based on the model type and a class_name reference to the custom Python class being instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor.
Parameters
- model: The Pydantic model of the custom component being created
- config: The custom defined connector config
Returns
The declarative component built from the Pydantic model to be used at runtime
def create_default_stream(
    self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any
) -> AbstractStream:
    """
    Creates a DefaultStream from a DeclarativeStream model.

    The method builds, in this order: the partition router, the concurrent cursor, the request options
    provider matching the type of `model.incremental_sync`, the transformations and file uploader, the
    retriever, and the schema loader. `_migrate_state` is invoked before any of these are built.

    :param model: The Pydantic model of the declarative stream
    :param config: The custom defined connector config
    :param is_parent: Accepted by the signature but not read in this method body
    :param kwargs: Forwarded to `_build_stream_slicer_from_partition_router`
    :return: The stream to be used at runtime
    :raises ValueError: If an IncrementingCountCursor is combined with a ConcurrentPerPartitionCursor
    """
    primary_key = model.primary_key.__root__ if model.primary_key else None
    self._migrate_state(model, config)

    partition_router = self._build_stream_slicer_from_partition_router(
        model.retriever,
        config,
        stream_name=model.name,
        **kwargs,
    )
    concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
    if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
        cursor_model: DatetimeBasedCursorModel = model.incremental_sync

        # The time options are built with the cursor model's parameters
        end_time_option = (
            self._create_component_from_model(
                cursor_model.end_time_option, config, parameters=cursor_model.parameters or {}
            )
            if cursor_model.end_time_option
            else None
        )
        start_time_option = (
            self._create_component_from_model(
                cursor_model.start_time_option, config, parameters=cursor_model.parameters or {}
            )
            if cursor_model.start_time_option
            else None
        )

        # Note: the provider itself receives the stream model's parameters, not the cursor model's
        datetime_request_options_provider = DatetimeBasedRequestOptionsProvider(
            start_time_option=start_time_option,
            end_time_option=end_time_option,
            partition_field_start=cursor_model.partition_field_start,
            partition_field_end=cursor_model.partition_field_end,
            config=config,
            parameters=model.parameters or {},
        )
        # With a per-partition cursor, the datetime provider is wrapped together with the partition router
        request_options_provider = (
            datetime_request_options_provider
            if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor)
            else PerPartitionRequestOptionsProvider(
                partition_router, datetime_request_options_provider
            )
        )
    elif model.incremental_sync and isinstance(
        model.incremental_sync, IncrementingCountCursorModel
    ):
        if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor):
            raise ValueError(
                "PerPartition does not support per partition states because switching to global state is time based"
            )

        cursor_model: IncrementingCountCursorModel = model.incremental_sync  # type: ignore

        start_time_option = (
            self._create_component_from_model(
                cursor_model.start_value_option,  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
                config,
                parameters=cursor_model.parameters or {},
            )
            if cursor_model.start_value_option  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
            else None
        )

        # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but
        # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time
        partition_field_start = "start"

        request_options_provider = DatetimeBasedRequestOptionsProvider(
            start_time_option=start_time_option,
            partition_field_start=partition_field_start,
            config=config,
            parameters=model.parameters or {},
        )
    else:
        # No incremental sync of a handled type: the retriever is built without a request options provider
        request_options_provider = None

    transformations = []
    if model.transformations:
        for transformation_model in model.transformations:
            transformations.append(
                self._create_component_from_model(model=transformation_model, config=config)
            )
    file_uploader = None
    if model.file_uploader:
        file_uploader = self._create_component_from_model(
            model=model.file_uploader, config=config
        )

    # A FinalStateCursor is not used as the slicer; the partition router is used in its place
    stream_slicer: ConcurrentStreamSlicer = (
        partition_router
        if isinstance(concurrent_cursor, FinalStateCursor)
        else concurrent_cursor
    )

    retriever = self._create_component_from_model(
        model=model.retriever,
        config=config,
        name=model.name,
        primary_key=primary_key,
        request_options_provider=request_options_provider,
        stream_slicer=stream_slicer,
        partition_router=partition_router,
        has_stop_condition_cursor=self._is_stop_condition_on_cursor(model),
        is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model),
        cursor=concurrent_cursor,
        transformations=transformations,
        file_uploader=file_uploader,
        incremental_sync=model.incremental_sync,
    )
    # An AsyncRetriever exposes its own stream slicer, which replaces the one chosen above
    if isinstance(retriever, AsyncRetriever):
        stream_slicer = retriever.stream_slicer

    schema_loader: SchemaLoader
    if model.schema_loader and isinstance(model.schema_loader, list):
        nested_schema_loaders = [
            self._create_component_from_model(model=nested_schema_loader, config=config)
            for nested_schema_loader in model.schema_loader
        ]
        schema_loader = CompositeSchemaLoader(
            schema_loaders=nested_schema_loaders, parameters={}
        )
    elif model.schema_loader:
        schema_loader = self._create_component_from_model(
            model=model.schema_loader,  # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models
            config=config,
        )
    else:
        # NOTE(review): when model.parameters is a non-empty dict, `options` is that same object, so the
        # "name" key added below is also written onto the model's parameters
        options = model.parameters or {}
        if "name" not in options:
            options["name"] = model.name
        schema_loader = DefaultSchemaLoader(config=config, parameters=options)
    # Whichever loader was selected is wrapped in the caching decorator
    schema_loader = CachingSchemaLoaderDecorator(schema_loader)

    stream_name = model.name or ""
    return DefaultStream(
        partition_generator=StreamSlicerPartitionGenerator(
            DeclarativePartitionFactory(
                stream_name,
                schema_loader,
                retriever,
                self._message_repository,
            ),
            stream_slicer,
            slice_limit=self._limit_slices_fetched,
        ),
        name=stream_name,
        json_schema=schema_loader.get_json_schema,
        primary_key=get_primary_key_from_stream(primary_key),
        cursor_field=(
            concurrent_cursor.cursor_field
            if hasattr(concurrent_cursor, "cursor_field")
            else None
        ),
        logger=logging.getLogger(f"airbyte.{stream_name}"),
        cursor=concurrent_cursor,
        supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader),
    )
def create_default_error_handler(
    self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any
) -> DefaultErrorHandler:
    """
    Creates a DefaultErrorHandler from its model.

    A parameter-only HttpResponseFilter is always appended after the response filters declared on the model.

    :param model: The Pydantic model of the default error handler
    :param config: The custom defined connector config
    :return: The error handler to be used at runtime
    """
    backoff_strategies = [
        self._create_component_from_model(model=strategy_model, config=config)
        for strategy_model in (model.backoff_strategies or [])
    ]

    response_filters = [
        self._create_component_from_model(model=filter_model, config=config)
        for filter_model in (model.response_filters or [])
    ]
    # Trailing filter, added whether or not the model declares any filters of its own
    default_filter = HttpResponseFilter(config=config, parameters=model.parameters or {})
    response_filters.append(default_filter)

    return DefaultErrorHandler(
        backoff_strategies=backoff_strategies,
        max_retries=model.max_retries,
        response_filters=response_filters,
        config=config,
        parameters=model.parameters or {},
    )
def create_default_paginator(
    self,
    model: DefaultPaginatorModel,
    config: Config,
    *,
    url_base: str,
    extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
    decoder: Optional[Decoder] = None,
    cursor_used_for_stop_condition: Optional[Cursor] = None,
) -> Union[DefaultPaginator, PaginatorTestReadDecorator]:
    """
    Creates a DefaultPaginator from its model.

    :param model: The Pydantic model of the default paginator
    :param config: The custom defined connector config
    :param url_base: The base URL given to the paginator
    :param extractor_model: Optional extractor model forwarded to the pagination strategy
    :param decoder: Optional response decoder; a JsonDecoder is used when none is given
    :param cursor_used_for_stop_condition: When given, the pagination strategy is wrapped so that this cursor acts as a stop condition
    :return: The paginator, wrapped in a PaginatorTestReadDecorator when a per-slice page limit is set on the factory
    :raises ValueError: If a decoder is given but is not supported for pagination
    """
    if not decoder:
        pagination_decoder = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
    elif self._is_supported_decoder_for_pagination(decoder):
        pagination_decoder = PaginationDecoderDecorator(decoder=decoder)
    else:
        raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder)))

    page_size_option = None
    if model.page_size_option:
        page_size_option = self._create_component_from_model(
            model=model.page_size_option, config=config
        )

    page_token_option = None
    if model.page_token_option:
        page_token_option = self._create_component_from_model(
            model=model.page_token_option, config=config
        )

    strategy = self._create_component_from_model(
        model=model.pagination_strategy,
        config=config,
        decoder=pagination_decoder,
        extractor_model=extractor_model,
    )
    if cursor_used_for_stop_condition:
        stop_condition = CursorStopCondition(cursor_used_for_stop_condition)
        strategy = StopConditionPaginationStrategyDecorator(strategy, stop_condition)

    default_paginator = DefaultPaginator(
        decoder=pagination_decoder,
        page_size_option=page_size_option,
        page_token_option=page_token_option,
        pagination_strategy=strategy,
        url_base=url_base,
        config=config,
        parameters=model.parameters or {},
    )

    page_limit = self._limit_pages_fetched_per_slice
    if page_limit:
        return PaginatorTestReadDecorator(default_paginator, page_limit)
    return default_paginator
def create_dpath_extractor(
    self,
    model: DpathExtractorModel,
    config: Config,
    decoder: Optional[Decoder] = None,
    **kwargs: Any,
) -> DpathExtractor:
    """
    Creates a DpathExtractor from its model.

    :param model: The Pydantic model of the dpath extractor
    :param config: The custom defined connector config
    :param decoder: Optional response decoder; a JsonDecoder is used when none is given
    :return: The extractor to be used at runtime
    """
    extractor_decoder = decoder if decoder else JsonDecoder(parameters={})
    field_path: List[Union[InterpolatedString, str]] = list(model.field_path)

    expander = None
    if model.record_expander:
        expander = self._create_component_from_model(
            model=model.record_expander,
            config=config,
        )

    return DpathExtractor(
        decoder=extractor_decoder,
        field_path=field_path,
        config=config,
        parameters=model.parameters or {},
        record_expander=expander,
    )
def create_record_expander(
    self,
    model: RecordExpanderModel,
    config: Config,
    **kwargs: Any,
) -> RecordExpander:
    """
    Creates a RecordExpander from its model.

    :param model: The Pydantic model of the record expander
    :param config: The custom defined connector config
    :return: The record expander; `remain_original_record` defaults to False and `on_no_records` to OnNoRecords.skip when unset on the model
    """
    if model.on_no_records:
        on_no_records = OnNoRecords(model.on_no_records.value)
    else:
        on_no_records = OnNoRecords.skip

    keep_original_record = model.remain_original_record or False

    return RecordExpander(
        expand_records_from_field=model.expand_records_from_field,
        config=config,
        parameters=model.parameters or {},
        remain_original_record=keep_original_record,
        on_no_records=on_no_records,
    )
@staticmethod
def create_exponential_backoff_strategy(
    model: ExponentialBackoffStrategyModel, config: Config
) -> ExponentialBackoffStrategy:
    """
    Creates an ExponentialBackoffStrategy from its model.

    The model's jitter range is first handed to `_validate_jitter_range` (defined elsewhere in this class).

    :param model: The Pydantic model of the exponential backoff strategy
    :param config: The custom defined connector config
    :return: The backoff strategy; a falsy `factor` on the model (unset, but also 0) is replaced by 5
    """
    jitter_range = model.jitter_range_in_seconds
    ModelToComponentFactory._validate_jitter_range(jitter_range)

    backoff_factor = model.factor or 5
    return ExponentialBackoffStrategy(
        factor=backoff_factor,
        jitter_range_in_seconds=jitter_range,
        parameters=model.parameters or {},
        config=config,
    )
def create_http_requester(
    self,
    model: HttpRequesterModel,
    config: Config,
    # NOTE(review): this default is a single JsonDecoder instance created when the function is defined
    # and shared by every call that omits `decoder`
    decoder: Decoder = JsonDecoder(parameters={}),
    query_properties_key: Optional[str] = None,
    use_cache: Optional[bool] = None,
    *,
    name: str,
) -> HttpRequester:
    """
    Creates an HttpRequester from its model.

    :param model: The Pydantic model of the HTTP requester
    :param config: The custom defined connector config
    :param decoder: The response decoder, also forwarded to the authenticator being created
    :param query_properties_key: Forwarded to the InterpolatedRequestOptionsProvider
    :param use_cache: Enables caching in addition to `model.use_cache`; the factory's `_disable_cache` flag overrides both
    :param name: The name given to the requester and to the authenticator being created
    :return: The requester to be used at runtime
    """
    authenticator = None
    if model.authenticator:
        authenticator = self._create_component_from_model(
            model=model.authenticator,
            config=config,
            url_base=model.url or model.url_base,
            name=name,
            decoder=decoder,
        )

    if model.error_handler:
        error_handler = self._create_component_from_model(
            model=model.error_handler, config=config
        )
    else:
        # Without a declared error handler, use a DefaultErrorHandler with no strategies and no filters
        error_handler = DefaultErrorHandler(
            backoff_strategies=[],
            response_filters=[],
            config=config,
            parameters=model.parameters or {},
        )

    options_provider = InterpolatedRequestOptionsProvider(
        request_body=model.request_body,
        request_body_data=model.request_body_data,
        request_body_json=model.request_body_json,
        request_headers=model.request_headers,
        request_parameters=model.request_parameters,  # type: ignore  # QueryProperties have been removed in `create_simple_retriever`
        query_properties_key=query_properties_key,
        config=config,
        parameters=model.parameters or {},
    )

    assert model.use_cache is not None  # for mypy
    assert model.http_method is not None  # for mypy

    cache_requested = model.use_cache or bool(use_cache)
    should_use_cache = cache_requested and not self._disable_cache

    stream_response = decoder.is_stream_response() if decoder else False

    return HttpRequester(
        name=name,
        url=model.url,
        url_base=model.url_base,
        path=model.path,
        authenticator=authenticator,
        error_handler=error_handler,
        api_budget=self._api_budget,
        http_method=HttpMethod[model.http_method.value],
        request_options_provider=options_provider,
        config=config,
        disable_retries=self._disable_retries,
        parameters=model.parameters or {},
        message_repository=self._message_repository,
        use_cache=should_use_cache,
        decoder=decoder,
        stream_response=stream_response,
    )
@staticmethod
def create_http_response_filter(
    model: HttpResponseFilterModel, config: Config, **kwargs: Any
) -> HttpResponseFilter:
    """
    Creates an HttpResponseFilter from its model.

    :param model: The Pydantic model of the HTTP response filter
    :param config: The custom defined connector config
    :return: The response filter; unset message and predicate fields become empty strings and unset `http_codes` an empty set
    """
    action = ResponseAction(model.action.value) if model.action else None

    if model.failure_type:
        failure_type = FailureType(model.failure_type.value)
    else:
        failure_type = None

    # JSON schema notation has no set data type. The schema enforces an array of unique elements
    http_codes = set(model.http_codes) if model.http_codes else set()

    return HttpResponseFilter(
        action=action,
        failure_type=failure_type,
        error_message=model.error_message or "",
        error_message_contains=model.error_message_contains or "",
        http_codes=http_codes,
        predicate=model.predicate or "",
        config=config,
        parameters=model.parameters or {},
    )
def create_complex_field_type(
    self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any
) -> ComplexFieldType:
    """
    Creates a ComplexFieldType from its model.

    :param model: The Pydantic model of the complex field type
    :param config: The custom defined connector config
    :return: The complex field type; `items` is built as a component when it is itself a ComplexFieldTypeModel, otherwise passed as is
    """
    items = model.items
    if isinstance(items, ComplexFieldTypeModel):
        items = self._create_component_from_model(model=items, config=config)

    return ComplexFieldType(field_type=model.field_type, items=items)
def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap:
    """
    Creates a TypesMap from its model.

    :param model: The Pydantic model of the types map
    :param config: The custom defined connector config
    :return: The types map; `target_type` is built as a component when it is a ComplexFieldTypeModel, and a missing `condition` becomes the string "True"
    """
    target_type = model.target_type
    if isinstance(target_type, ComplexFieldTypeModel):
        target_type = self._create_component_from_model(model=target_type, config=config)

    condition = "True" if model.condition is None else model.condition

    return TypesMap(
        target_type=target_type,
        current_type=model.current_type,
        condition=condition,
    )
def create_schema_type_identifier(
    self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any
) -> SchemaTypeIdentifier:
    """
    Creates a SchemaTypeIdentifier from its model.

    :param model: The Pydantic model of the schema type identifier
    :param config: The custom defined connector config
    :return: The schema type identifier; an unset `schema_pointer` becomes an empty list and an unset `type_pointer` stays None
    """
    types_mapping = [
        self._create_component_from_model(types_map, config=config)
        for types_map in (model.types_mapping or [])
    ]

    schema_pointer: List[Union[InterpolatedString, str]] = (
        list(model.schema_pointer) if model.schema_pointer else []
    )
    key_pointer: List[Union[InterpolatedString, str]] = list(model.key_pointer)
    type_pointer: Optional[List[Union[InterpolatedString, str]]] = None
    if model.type_pointer:
        type_pointer = list(model.type_pointer)

    return SchemaTypeIdentifier(
        schema_pointer=schema_pointer,
        key_pointer=key_pointer,
        type_pointer=type_pointer,
        types_mapping=types_mapping,
        parameters=model.parameters or {},
    )
def create_dynamic_schema_loader(
    self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any
) -> DynamicSchemaLoader:
    """
    Creates a DynamicSchemaLoader from its model.

    The retriever used to fetch the schema is always named "dynamic_properties", is created with
    `use_cache=True`, and logs its requests as auxiliary requests.

    :param model: The Pydantic model of the dynamic schema loader
    :param config: The custom defined connector config
    :return: The schema loader to be used at runtime
    """
    schema_transformations = [
        self._create_component_from_model(model=transformation_model, config=config)
        for transformation_model in (model.schema_transformations or [])
    ]

    name = "dynamic_properties"

    def _format_schema_request(response: Any) -> Any:
        return format_http_message(
            response,
            f"Schema loader '{name}' request",
            "Request performed in order to extract schema.",
            name,
            is_auxiliary=True,
        )

    retriever = self._create_component_from_model(
        model=model.retriever,
        config=config,
        name=name,
        primary_key=None,
        partition_router=self._build_stream_slicer_from_partition_router(
            model.retriever, config
        ),
        transformations=[],
        use_cache=True,
        log_formatter=_format_schema_request,
    )

    schema_type_identifier = self._create_component_from_model(
        model.schema_type_identifier, config=config, parameters=model.parameters or {}
    )

    schema_filter = None
    if model.schema_filter is not None:
        schema_filter = self._create_component_from_model(
            model.schema_filter, config=config, parameters=model.parameters or {}
        )

    return DynamicSchemaLoader(
        retriever=retriever,
        config=config,
        schema_transformations=schema_transformations,
        schema_filter=schema_filter,
        schema_type_identifier=schema_type_identifier,
        parameters=model.parameters or {},
    )
def create_gzip_decoder(
    self, model: GzipDecoderModel, config: Config, **kwargs: Any
) -> Decoder:
    """Build the decoder used for gzip-compressed responses.

    When connector builder messages are emitted, only the inner parser is used
    on a non-streamed response. Otherwise the gzip parser is selected by
    response headers, with the inner parser as the fallback.
    """
    gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config)  # type: ignore  # based on the model, we know this will be a GzipParser

    if self._emit_connector_builder_messages:
        # Surprisingly, when the response is not streamed CompositeRawDecoder reads
        # response.content, and the requests library decompresses that payload itself
        # (unlike response.raw, which goes through urllib3 and stays compressed).
        # The gzip layer is therefore skipped and only the inner parser is used.
        return CompositeRawDecoder(gzip_parser.inner_parser, False)

    # Header values that mark a response as compressed.
    compressed_header_values = {
        "gzip",
        "x-gzip",
        "gzip, deflate",
        "x-gzip, deflate",
        "application/zip",
        "application/gzip",
        "application/x-gzip",
        "application/x-zip-compressed",
    }
    header_rules = [
        ({"Content-Encoding", "Content-Type"}, compressed_header_values, gzip_parser)
    ]
    return CompositeRawDecoder.by_headers(
        header_rules,
        stream_response=True,
        fallback_parser=gzip_parser.inner_parser,
    )
def create_jwt_authenticator(
    self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any
) -> JwtAuthenticator:
    """Build a ``JwtAuthenticator`` from its declarative model.

    Missing ``jwt_headers`` default to a header model with ``typ="JWT"`` and
    no ``kid``/``cty``; missing ``jwt_payload`` defaults to a payload model
    with no ``iss``/``sub``/``aud``.
    """
    headers = model.jwt_headers
    if not headers:
        headers = JwtHeadersModel(kid=None, typ="JWT", cty=None)

    payload = model.jwt_payload
    if not payload:
        payload = JwtPayloadModel(iss=None, sub=None, aud=None)

    request_option = None
    if model.request_option:
        request_option = self._create_component_from_model(model.request_option, config)

    return JwtAuthenticator(
        config=config,
        parameters=model.parameters or {},
        algorithm=JwtAlgorithm(model.algorithm.value),
        secret_key=model.secret_key,
        base64_encode_secret_key=model.base64_encode_secret_key,
        token_duration=model.token_duration,
        header_prefix=model.header_prefix,
        kid=headers.kid,
        typ=headers.typ,
        cty=headers.cty,
        iss=payload.iss,
        sub=payload.sub,
        aud=payload.aud,
        additional_jwt_headers=model.additional_jwt_headers,
        additional_jwt_payload=model.additional_jwt_payload,
        passphrase=model.passphrase,
        request_option=request_option,
    )
def create_list_partition_router(
    self, model: ListPartitionRouterModel, config: Config, **kwargs: Any
) -> ListPartitionRouter:
    """Build a ``ListPartitionRouter``, creating its request option when one is declared."""
    if model.request_option:
        request_option = self._create_component_from_model(model.request_option, config)
    else:
        request_option = None

    return ListPartitionRouter(
        cursor_field=model.cursor_field,
        request_option=request_option,
        values=model.values,
        config=config,
        parameters=model.parameters or {},
    )
@staticmethod
def create_min_max_datetime(
    model: MinMaxDatetimeModel, config: Config, **kwargs: Any
) -> MinMaxDatetime:
    """Build a ``MinMaxDatetime``; unset format and bounds are passed as empty strings."""
    datetime_format = model.datetime_format or ""
    upper_bound = model.max_datetime or ""
    lower_bound = model.min_datetime or ""
    return MinMaxDatetime(
        datetime=model.datetime,
        datetime_format=datetime_format,
        max_datetime=upper_bound,
        min_datetime=lower_bound,
        parameters=model.parameters or {},
    )
def create_oauth_authenticator(
    self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any
) -> DeclarativeOauth2Authenticator:
    """Build the OAuth2 authenticator described by ``model``.

    When ``model.refresh_token_updater`` is set, a
    ``DeclarativeSingleUseRefreshTokenOauth2Authenticator`` is returned and the
    string and mapping settings are interpolated against ``config`` here,
    before being handed to it. Otherwise a ``DeclarativeOauth2Authenticator``
    is returned and receives the raw model values together with ``config``
    and ``parameters``.
    """

    def _render(template: Any) -> Any:
        # Interpolate a single string setting against the connector config.
        return InterpolatedString.create(template, parameters=model.parameters or {}).eval(
            config
        )

    def _render_mapping(mapping: Any) -> Any:
        # Interpolate every entry of a mapping setting; a missing mapping is treated as empty.
        return InterpolatedMapping(mapping or {}, parameters=model.parameters or {}).eval(
            config
        )

    profile_assertion = None
    if model.profile_assertion:
        profile_assertion = self._create_component_from_model(
            model.profile_assertion, config=config
        )

    (
        refresh_token_error_status_codes,
        refresh_token_error_key,
        refresh_token_error_values,
    ) = self._get_refresh_token_error_information(model)

    updater = model.refresh_token_updater
    if not updater:
        # ignore type error because fixing it would have a lot of dependencies, revisit later
        return DeclarativeOauth2Authenticator(  # type: ignore
            access_token_name=model.access_token_name or "access_token",
            access_token_value=model.access_token_value,
            client_id_name=model.client_id_name or "client_id",
            client_id=model.client_id,
            client_secret_name=model.client_secret_name or "client_secret",
            client_secret=model.client_secret,
            expires_in_name=model.expires_in_name or "expires_in",
            grant_type_name=model.grant_type_name or "grant_type",
            grant_type=model.grant_type or "refresh_token",
            refresh_request_body=model.refresh_request_body,
            refresh_request_headers=model.refresh_request_headers,
            refresh_token_name=model.refresh_token_name or "refresh_token",
            refresh_token=model.refresh_token,
            scopes=model.scopes,
            token_expiry_date=model.token_expiry_date,
            token_expiry_date_format=model.token_expiry_date_format,
            token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
            token_refresh_endpoint=model.token_refresh_endpoint,
            config=config,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            profile_assertion=profile_assertion,
            use_profile_assertion=model.use_profile_assertion,
            refresh_token_error_status_codes=refresh_token_error_status_codes,
            refresh_token_error_key=refresh_token_error_key,
            refresh_token_error_values=refresh_token_error_values,
        )

    # ignore type error because fixing it would have a lot of dependencies, revisit later
    return DeclarativeSingleUseRefreshTokenOauth2Authenticator(  # type: ignore
        config,
        _render(model.token_refresh_endpoint),
        access_token_name=_render(model.access_token_name or "access_token"),
        refresh_token_name=updater.refresh_token_name,
        expires_in_name=_render(model.expires_in_name or "expires_in"),
        client_id_name=_render(model.client_id_name or "client_id"),
        # Unset credentials are forwarded untouched rather than interpolated.
        client_id=_render(model.client_id) if model.client_id else model.client_id,
        client_secret_name=_render(model.client_secret_name or "client_secret"),
        client_secret=(
            _render(model.client_secret) if model.client_secret else model.client_secret
        ),
        access_token_config_path=updater.access_token_config_path,
        refresh_token_config_path=updater.refresh_token_config_path,
        token_expiry_date_config_path=updater.token_expiry_date_config_path,
        grant_type_name=_render(model.grant_type_name or "grant_type"),
        grant_type=_render(model.grant_type or "refresh_token"),
        refresh_request_body=_render_mapping(model.refresh_request_body),
        refresh_request_headers=_render_mapping(model.refresh_request_headers),
        scopes=model.scopes,
        token_expiry_date_format=model.token_expiry_date_format,
        token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
        message_repository=self._message_repository,
        refresh_token_error_status_codes=refresh_token_error_status_codes,
        refresh_token_error_key=refresh_token_error_key,
        refresh_token_error_values=refresh_token_error_values,
    )
def create_offset_increment(
    self,
    model: OffsetIncrementModel,
    config: Config,
    decoder: Decoder,
    extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
    **kwargs: Any,
) -> OffsetIncrement:
    """Build an ``OffsetIncrement`` pagination strategy.

    Args:
        model: Declarative model of the strategy.
        config: Connector configuration.
        decoder: Response decoder. It is wrapped in a ``PaginationDecoderDecorator``
            unless it already is one.
        extractor_model: Optional record extractor model. When given, a dedicated
            extractor is created for the strategy using the decorated decoder.

    Raises:
        ValueError: If the (undecorated) decoder is not supported for pagination.
    """
    if isinstance(decoder, PaginationDecoderDecorator):
        inner_decoder = decoder.decoder
    else:
        inner_decoder = decoder
        decoder = PaginationDecoderDecorator(decoder=decoder)

    if not self._is_supported_decoder_for_pagination(inner_decoder):
        raise ValueError(
            self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
        )

    # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever)
    # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the
    # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing
    # behavior by having two separate extractors with identical behavior since they use the same extractor model.
    # When we have more time to investigate we can look into reusing the same component.
    extractor = (
        self._create_component_from_model(
            model=extractor_model, config=config, decoder=decoder
        )
        if extractor_model
        else None
    )

    # Pydantic v1 Union type coercion can convert int to string depending on Union order.
    # If page_size is a string that represents an integer (not an interpolation), convert it back.
    # isdecimal() is used rather than isdigit(): isdigit() also accepts characters such as
    # superscript digits ("²") that int() cannot parse, which would raise ValueError here.
    page_size = model.page_size
    if isinstance(page_size, str) and page_size.isdecimal():
        page_size = int(page_size)

    return OffsetIncrement(
        page_size=page_size,
        config=config,
        decoder=decoder,
        extractor=extractor,
        inject_on_first_request=model.inject_on_first_request or False,
        parameters=model.parameters or {},
    )
@staticmethod
def create_page_increment(
    model: PageIncrementModel, config: Config, **kwargs: Any
) -> PageIncrement:
    """Build a ``PageIncrement`` pagination strategy.

    ``start_from_page`` defaults to ``0`` and ``inject_on_first_request`` to
    ``False`` when the model leaves them unset.
    """
    # Pydantic v1 Union type coercion can convert int to string depending on Union order.
    # If page_size is a string that represents an integer (not an interpolation), convert it back.
    # isdecimal() is used rather than isdigit(): isdigit() also accepts characters such as
    # superscript digits ("²") that int() cannot parse, which would raise ValueError here.
    page_size = model.page_size
    if isinstance(page_size, str) and page_size.isdecimal():
        page_size = int(page_size)

    return PageIncrement(
        page_size=page_size,
        config=config,
        start_from_page=model.start_from_page or 0,
        inject_on_first_request=model.inject_on_first_request or False,
        parameters=model.parameters or {},
    )
def create_parent_stream_config(
    self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any
) -> ParentStreamConfig:
    """Build a ``ParentStreamConfig`` together with its parent stream.

    The parent stream is created with ``is_parent=True`` and receives the
    remaining keyword arguments; ``stream_name`` itself is not forwarded.

    Raises:
        ValueError: If any segment of ``lazy_read_pointer`` contains ``*``.
    """
    parent_stream = self._create_component_from_model(
        model.stream,
        config=config,
        is_parent=True,
        **kwargs,
    )

    request_option = None
    if model.request_option:
        request_option = self._create_component_from_model(model.request_option, config=config)

    # Only direct paths are accepted for lazy reads.
    pointer_segments = model.lazy_read_pointer or []
    if any("*" in segment for segment in pointer_segments):
        raise ValueError(
            "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed."
        )

    lazy_read_pointer: List[Union[InterpolatedString, str]] = list(pointer_segments)

    return ParentStreamConfig(
        parent_key=model.parent_key,
        request_option=request_option,
        stream=parent_stream,
        partition_field=model.partition_field,
        config=config,
        incremental_dependency=model.incremental_dependency or False,
        parameters=model.parameters or {},
        extra_fields=model.extra_fields,
        lazy_read_pointer=lazy_read_pointer,
    )
def create_properties_from_endpoint(
    self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any
) -> PropertiesFromEndpoint:
    """Build a ``PropertiesFromEndpoint`` backed by a cached retriever."""
    # Caching is enabled on the HttpRequester/HttpClient because the properties endpoint
    # is called for every slice being processed and its response is highly unlikely to
    # differ between calls.
    properties_retriever = self._create_component_from_model(
        model=model.retriever,
        config=config,
        name="dynamic_properties",
        primary_key=None,
        stream_slicer=None,
        transformations=[],
        use_cache=True,
    )
    return PropertiesFromEndpoint(
        property_field_path=model.property_field_path,
        retriever=properties_retriever,
        config=config,
        parameters=model.parameters or {},
    )
3084 def create_property_chunking( 3085 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 3086 ) -> PropertyChunking: 3087 record_merge_strategy = ( 3088 self._create_component_from_model( 3089 model=model.record_merge_strategy, config=config, **kwargs 3090 ) 3091 if model.record_merge_strategy 3092 else None 3093 ) 3094 3095 property_limit_type: PropertyLimitType 3096 match model.property_limit_type: 3097 case PropertyLimitTypeModel.property_count: 3098 property_limit_type = PropertyLimitType.property_count 3099 case PropertyLimitTypeModel.characters: 3100 property_limit_type = PropertyLimitType.characters 3101 case _: 3102 raise ValueError(f"Invalid PropertyLimitType {property_limit_type}") 3103 3104 return PropertyChunking( 3105 property_limit_type=property_limit_type, 3106 property_limit=model.property_limit, 3107 record_merge_strategy=record_merge_strategy, 3108 config=config, 3109 parameters=model.parameters or {}, 3110 )
def create_query_properties(
    self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any
) -> QueryProperties:
    """Build a ``QueryProperties`` component.

    ``model.property_list`` is used as-is when it is a plain list; otherwise it
    is treated as a component model and instantiated. Only the property
    selector receives ``stream_name``.
    """
    declared_properties = model.property_list
    if isinstance(declared_properties, list):
        property_list = declared_properties
    else:
        property_list = self._create_component_from_model(
            model=declared_properties, config=config, **kwargs
        )

    property_chunking = None
    if model.property_chunking:
        property_chunking = self._create_component_from_model(
            model=model.property_chunking, config=config, **kwargs
        )

    property_selector = None
    if model.property_selector:
        property_selector = self._create_component_from_model(
            model=model.property_selector, config=config, stream_name=stream_name, **kwargs
        )

    return QueryProperties(
        property_list=property_list,
        always_include_properties=model.always_include_properties,
        property_chunking=property_chunking,
        property_selector=property_selector,
        config=config,
        parameters=model.parameters or {},
    )
def create_json_schema_property_selector(
    self,
    model: JsonSchemaPropertySelectorModel,
    config: Config,
    *,
    stream_name: str,
    **kwargs: Any,
) -> JsonSchemaPropertySelector:
    """Build a ``JsonSchemaPropertySelector`` for the stream named ``stream_name``.

    The configured stream is looked up by name with ``.get`` and is ``None``
    when the name has no entry.
    """
    configured_stream = self._stream_name_to_configured_stream.get(stream_name)

    properties_transformations = [
        self._create_component_from_model(model=transformation_model, config=config)
        for transformation_model in (model.transformations or [])
    ]

    return JsonSchemaPropertySelector(
        configured_stream=configured_stream,
        properties_transformations=properties_transformations,
        config=config,
        parameters=model.parameters or {},
    )
@staticmethod
def create_request_option(
    model: RequestOptionModel, config: Config, **kwargs: Any
) -> RequestOption:
    """Build a ``RequestOption`` from its declarative model.

    ``field_path`` segments and ``field_name`` are wrapped in interpolated
    strings using the ``parameters`` keyword argument (an empty dict when it
    is absent). Either one is ``None`` when the model leaves it unset.
    """
    field_path: Optional[List[Union[InterpolatedString, str]]] = None
    if model.field_path:
        field_path = [
            InterpolatedString.create(segment, parameters=kwargs.get("parameters", {}))
            for segment in model.field_path
        ]

    field_name = None
    if model.field_name:
        field_name = InterpolatedString.create(
            model.field_name, parameters=kwargs.get("parameters", {})
        )

    return RequestOption(
        field_name=field_name,
        field_path=field_path,
        inject_into=RequestOptionType(model.inject_into.value),
        parameters=kwargs.get("parameters", {}),
    )
def create_record_selector(
    self,
    model: RecordSelectorModel,
    config: Config,
    *,
    name: str,
    transformations: List[RecordTransformation] | None = None,
    decoder: Decoder | None = None,
    client_side_incremental_sync_cursor: Optional[Cursor] = None,
    file_uploader: Optional[DefaultFileUploader] = None,
    **kwargs: Any,
) -> RecordSelector:
    """Build a ``RecordSelector`` with its extractor, filter and schema normalization.

    When a client-side incremental cursor is supplied, the record filter is
    replaced by a ``ClientSideIncrementalRecordFilterDecorator`` built from the
    declared filter's ``condition`` (if any), and ``transform_before_filtering``
    defaults to ``True`` instead of ``False``. An explicit model value always
    wins over either default.

    Note that an unset ``model.schema_normalization`` is written back onto the
    model as ``SchemaNormalizationModel.None_``.
    """
    extractor = self._create_component_from_model(
        model=model.extractor, decoder=decoder, config=config
    )

    if model.record_filter:
        record_filter = self._create_component_from_model(model.record_filter, config=config)
    else:
        record_filter = None

    if model.transform_before_filtering is None:
        transform_before_filtering = False
    else:
        transform_before_filtering = model.transform_before_filtering

    if client_side_incremental_sync_cursor:
        declared_condition = None
        if model.record_filter and hasattr(model.record_filter, "condition"):
            declared_condition = model.record_filter.condition
        record_filter = ClientSideIncrementalRecordFilterDecorator(
            config=config,
            parameters=model.parameters,
            condition=declared_condition,
            cursor=client_side_incremental_sync_cursor,
        )
        if model.transform_before_filtering is None:
            transform_before_filtering = True
        else:
            transform_before_filtering = model.transform_before_filtering

    if model.schema_normalization is None:
        # default to no schema normalization if not set
        model.schema_normalization = SchemaNormalizationModel.None_

    if isinstance(model.schema_normalization, SchemaNormalizationModel):
        schema_normalization = TypeTransformer(
            SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]
        )
    else:
        schema_normalization = self._create_component_from_model(model.schema_normalization, config=config)  # type: ignore[arg-type] # custom normalization model expected here

    return RecordSelector(
        extractor=extractor,
        name=name,
        config=config,
        record_filter=record_filter,
        transformations=transformations or [],
        file_uploader=file_uploader,
        schema_normalization=schema_normalization,
        parameters=model.parameters or {},
        transform_before_filtering=transform_before_filtering,
    )
def create_selective_authenticator(
    self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any
) -> DeclarativeAuthenticator:
    """Build every declared authenticator and let ``SelectiveAuthenticator`` choose one."""
    authenticators_by_name = {}
    for auth_name, auth_model in model.authenticators.items():
        authenticators_by_name[auth_name] = self._create_component_from_model(
            model=auth_model, config=config
        )

    # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error
    return SelectiveAuthenticator(  # type: ignore[abstract]
        config=config,
        authenticators=authenticators_by_name,
        authenticator_selection_path=model.authenticator_selection_path,
        **kwargs,
    )
@staticmethod
def create_legacy_session_token_authenticator(
    model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any
) -> LegacySessionTokenAuthenticator:
    """Build a ``LegacySessionTokenAuthenticator`` rooted at ``url_base``.

    Unset credentials and the unset response key are passed as empty strings.
    """
    username = model.username or ""
    password = model.password or ""
    session_token = model.session_token or ""
    response_key = model.session_token_response_key or ""

    return LegacySessionTokenAuthenticator(
        api_url=url_base,
        header=model.header,
        login_url=model.login_url,
        password=password,
        session_token=session_token,
        session_token_response_key=response_key,
        username=username,
        validate_session_url=model.validate_session_url,
        config=config,
        parameters=model.parameters or {},
    )
def create_simple_retriever(
    self,
    model: SimpleRetrieverModel,
    config: Config,
    *,
    name: str,
    primary_key: Optional[Union[str, List[str], List[List[str]]]],
    request_options_provider: Optional[RequestOptionsProvider] = None,
    cursor: Optional[Cursor] = None,
    has_stop_condition_cursor: bool = False,
    is_client_side_incremental_sync: bool = False,
    transformations: List[RecordTransformation],
    file_uploader: Optional[DefaultFileUploader] = None,
    incremental_sync: Optional[
        Union[IncrementingCountCursorModel, DatetimeBasedCursorModel]
    ] = None,
    use_cache: Optional[bool] = None,
    log_formatter: Optional[Callable[[Response], Any]] = None,
    partition_router: Optional[PartitionRouter] = None,
    **kwargs: Any,
) -> SimpleRetriever:
    """Build a ``SimpleRetriever`` (or a ``LazySimpleRetriever``) from its declarative model.

    The method creates, in order: the decoder, the record selector, the optional
    ``QueryProperties``, the requester and the paginator, then assembles the retriever.

    ``QueryProperties`` may come from exactly one of three places on the requester model:
    a ``QueryPropertiesModel`` inside ``request_parameters``, the legacy
    ``fetch_properties_from_endpoint`` field, or the ``query_properties`` field.

    A ``LazySimpleRetriever`` is returned when the model's partition router is a
    ``SubstreamPartitionRouterModel``, the stream has no state yet, and at least one parent
    stream config declares a ``lazy_read_pointer``.

    Note: ``model.requester.request_parameters`` is rewritten in place to strip
    ``QueryProperties`` entries before the requester is created.

    Raises:
        ValueError: If properties are declared both in ``fetch_properties_from_endpoint``
            and ``request_parameters``; if more than one ``QueryProperties`` is found in
            ``request_parameters``; if the lazy retriever is requested with an unsupported
            cursor, with a stepped/granular datetime cursor, or with a non-JSON decoder; or
            if pagination reset limits are combined with a record filter.
    """

    def _get_url(req: Requester) -> str:
        """
        Resolve the URL that is handed to the paginator as ``url_base``.

        Values declared on the requester model (``url``, then ``url_base``) take precedence;
        otherwise the instantiated requester is asked for them, which is why this can only
        run once the requester has been created.
        """

        _url: str = (
            model.requester.url
            if hasattr(model.requester, "url") and model.requester.url is not None
            else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
        )
        _url_base: str = (
            model.requester.url_base
            if hasattr(model.requester, "url_base") and model.requester.url_base is not None
            else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None)
        )

        return _url or _url_base

    # Without an explicit cursor, fall back to a FinalStateCursor for this stream.
    if cursor is None:
        cursor = FinalStateCursor(name, None, self._message_repository)

    # JSON is the default decoder when the model does not declare one.
    decoder = (
        self._create_component_from_model(model=model.decoder, config=config)
        if model.decoder
        else JsonDecoder(parameters={})
    )
    # The cursor is only handed to the record selector for client-side incremental syncs.
    record_selector = self._create_component_from_model(
        model=model.record_selector,
        name=name,
        config=config,
        decoder=decoder,
        transformations=transformations,
        client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None,
        file_uploader=file_uploader,
    )

    query_properties: Optional[QueryProperties] = None
    query_properties_key: Optional[str] = None
    self._ensure_query_properties_to_model(model.requester)
    if self._has_query_properties_in_request_parameters(model.requester):
        # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple
        # places instead of default to request_parameters which isn't clearly documented
        if (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            raise ValueError(
                f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
            )

        # Collect every QueryProperties entry and remember the request parameter key it was
        # declared under; that key is passed on to the requester below.
        query_properties_definitions = []
        for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters()
            if isinstance(request_parameter, QueryPropertiesModel):
                query_properties_key = key
                query_properties_definitions.append(request_parameter)

        if len(query_properties_definitions) > 1:
            raise ValueError(
                f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
            )

        if len(query_properties_definitions) == 1:
            query_properties = self._create_component_from_model(
                model=query_properties_definitions[0], stream_name=name, config=config
            )

        # Removes QueryProperties components from the interpolated mappings because it has been designed
        # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly
        # instead of through jinja interpolation
        if hasattr(model.requester, "request_parameters") and isinstance(
            model.requester.request_parameters, Mapping
        ):
            model.requester.request_parameters = self._remove_query_properties(
                model.requester.request_parameters
            )
    elif (
        hasattr(model.requester, "fetch_properties_from_endpoint")
        and model.requester.fetch_properties_from_endpoint
    ):
        # todo: Deprecate this condition once dependent connectors migrate to query_properties
        # The legacy field is wrapped in a QueryPropertiesModel so that both paths share
        # create_query_properties().
        query_properties_definition = QueryPropertiesModel(
            type="QueryProperties",
            property_list=model.requester.fetch_properties_from_endpoint,
            always_include_properties=None,
            property_chunking=None,
        )  # type: ignore # $parameters has a default value

        query_properties = self.create_query_properties(
            model=query_properties_definition,
            stream_name=name,
            config=config,
        )
    elif hasattr(model.requester, "query_properties") and model.requester.query_properties:
        query_properties = self.create_query_properties(
            model=model.requester.query_properties,
            stream_name=name,
            config=config,
        )

    # Created after the request_parameters rewrite above, so the requester never sees
    # QueryProperties entries in its request parameters.
    requester = self._create_component_from_model(
        model=model.requester,
        decoder=decoder,
        name=name,
        query_properties_key=query_properties_key,
        use_cache=use_cache,
        config=config,
    )

    # A partition router replaces the request options provider only when the provider is
    # a DefaultRequestOptionsProvider (including the one created here as a default).
    if not request_options_provider:
        request_options_provider = DefaultRequestOptionsProvider(parameters={})
    if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(
        partition_router, PartitionRouter
    ):
        request_options_provider = partition_router

    # The paginator shares the record selector's extractor model and the decoder; the cursor
    # is forwarded only when it is meant to act as a stop condition.
    paginator = (
        self._create_component_from_model(
            model=model.paginator,
            config=config,
            url_base=_get_url(requester),
            extractor_model=model.record_selector.extractor,
            decoder=decoder,
            cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None,
        )
        if model.paginator
        else NoPagination(parameters={})
    )

    ignore_stream_slicer_parameters_on_paginated_requests = (
        model.ignore_stream_slicer_parameters_on_paginated_requests or False
    )

    # Lazy read path: substream router, no existing stream state, and at least one parent
    # stream config with a lazy_read_pointer.
    if (
        model.partition_router
        and isinstance(model.partition_router, SubstreamPartitionRouterModel)
        and not bool(self._connector_state_manager.get_stream_state(name, None))
        and any(
            parent_stream_config.lazy_read_pointer
            for parent_stream_config in model.partition_router.parent_stream_configs
        )
    ):
        if incremental_sync:
            if incremental_sync.type != "DatetimeBasedCursor":
                raise ValueError(
                    f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                )

            # A step or cursor granularity is rejected with the "single slice" error below.
            elif incremental_sync.step or incremental_sync.cursor_granularity:
                raise ValueError(
                    f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                )

        if model.decoder and model.decoder.type != "JsonDecoder":
            raise ValueError(
                f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
            )

        # NOTE(review): unlike the SimpleRetriever below, the lazy retriever is not given
        # additional_query_properties, log_formatter or pagination_tracker_factory —
        # confirm this is intentional.
        return LazySimpleRetriever(
            name=name,
            paginator=paginator,
            primary_key=primary_key,
            requester=requester,
            record_selector=record_selector,
            stream_slicer=_NO_STREAM_SLICING,
            request_option_provider=request_options_provider,
            config=config,
            ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
            parameters=model.parameters or {},
        )

    if (
        model.record_selector.record_filter
        and model.pagination_reset
        and model.pagination_reset.limits
    ):
        raise ValueError("PaginationResetLimits are not supported while having record filter.")

    return SimpleRetriever(
        name=name,
        paginator=paginator,
        primary_key=primary_key,
        requester=requester,
        record_selector=record_selector,
        stream_slicer=_NO_STREAM_SLICING,
        request_option_provider=request_options_provider,
        config=config,
        ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
        additional_query_properties=query_properties,
        log_formatter=self._get_log_formatter(log_formatter, name),
        pagination_tracker_factory=self._create_pagination_tracker_factory(
            model.pagination_reset, cursor
        ),
        parameters=model.parameters or {},
    )
def create_state_delegating_stream(
    self,
    model: StateDelegatingStreamModel,
    config: Config,
    **kwargs: Any,
) -> DefaultStream:
    """Pick between the full-refresh and the incremental variant of a stream.

    Without stream state, the full-refresh stream is returned. With state, the
    incremental stream is returned unless ``api_retention_period`` is set, the stream is
    eligible for validation, and ``_is_cursor_older_than_retention_period`` reports the
    cursor as too old; in that case the stream state is cleared, a state message is
    emitted and a freshly built full-refresh stream is returned.

    Raises:
        ValueError: If the three stream names differ, or if ``api_retention_period``
            resolves to a value and either sub-stream uses ``IncrementingCountCursor``.
    """
    if (
        model.full_refresh_stream.name != model.name
        or model.name != model.incremental_stream.name
    ):
        raise ValueError(
            f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
        )

    # Resolve api_retention_period with config context (supports Jinja2 interpolation)
    # An unset period, or one that evaluates to a falsy value, leaves this as None.
    resolved_retention_period: Optional[str] = None
    if model.api_retention_period:
        interpolated_retention = InterpolatedString.create(
            model.api_retention_period, parameters=model.parameters or {}
        )
        resolved_value = interpolated_retention.eval(config=config)
        if resolved_value:
            resolved_retention_period = str(resolved_value)

    if resolved_retention_period:
        for stream_model in (model.full_refresh_stream, model.incremental_stream):
            if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel):
                raise ValueError(
                    f"Stream '{model.name}' uses IncrementingCountCursor which is not supported "
                    f"with api_retention_period. IncrementingCountCursor does not use datetime-based "
                    f"cursors, so cursor age validation cannot be performed."
                )

    stream_state = self._connector_state_manager.get_stream_state(model.name, None)

    # No state yet: nothing to resume from, so read with the full-refresh stream.
    if not stream_state:
        return self._create_component_from_model(  # type: ignore[no-any-return]
            model.full_refresh_stream, config=config, **kwargs
        )

    incremental_stream: DefaultStream = self._create_component_from_model(
        model.incremental_stream, config=config, **kwargs
    )  # type: ignore[assignment]

    # Only run cursor age validation for streams that are in the configured
    # catalog (or when no catalog was provided, e.g. during discover / connector
    # builder). Streams not selected by the user but instantiated as parent-stream
    # dependencies must not go through this path because it emits state messages
    # that the destination does not know about, causing "Stream not found" crashes.
    stream_is_in_catalog = (
        not self._stream_name_to_configured_stream  # no catalog → validate by default
        or model.name in self._stream_name_to_configured_stream
    )
    if resolved_retention_period and stream_is_in_catalog:
        # This instance is built only so that its cursor can be passed to the age check
        # below; it is not the stream that gets returned.
        full_refresh_stream: DefaultStream = self._create_component_from_model(
            model.full_refresh_stream, config=config, **kwargs
        )  # type: ignore[assignment]
        if self._is_cursor_older_than_retention_period(
            stream_state,
            full_refresh_stream.cursor,
            incremental_stream.cursor,
            resolved_retention_period,
            model.name,
        ):
            # Clear state BEFORE constructing the full_refresh_stream that is returned,
            # so that its cursor starts from start_date instead of the stale cursor.
            self._connector_state_manager.update_state_for_stream(model.name, None, {})
            state_message = self._connector_state_manager.create_state_message(model.name, None)
            self._message_repository.emit_message(state_message)
            return self._create_component_from_model(  # type: ignore[no-any-return]
                model.full_refresh_stream, config=config, **kwargs
            )

    return incremental_stream
def create_async_retriever(
    self,
    model: AsyncRetrieverModel,
    config: Config,
    *,
    name: str,
    primary_key: Optional[
        Union[str, List[str], List[List[str]]]
    ],  # kept so the signature lines up with create_simple_retriever
    stream_slicer: Optional[StreamSlicer],
    client_side_incremental_sync: Optional[Dict[str, Any]] = None,
    transformations: List[RecordTransformation],
    **kwargs: Any,
) -> AsyncRetriever:
    """Assemble an ``AsyncRetriever`` from its declarative model.

    The method builds the record selector, the requesters used for job
    creation / polling / download (plus the optional abort, delete and
    download-target requesters), wires them into an ``AsyncHttpJobRepository``
    and wraps the stream slicer in an ``AsyncJobPartitionRouter``.

    Args:
        model: Declarative description of the async retriever.
        config: Connector configuration used for interpolation.
        name: Stream name; also used to label the job requesters.
        primary_key: Not read by this method.
        stream_slicer: Slicer to wrap; a ``SinglePartitionRouter`` is used when falsy.
        client_side_incremental_sync: Forwarded to the record selector.
        transformations: Forwarded to the record selector.
        **kwargs: Not read by this method.

    Raises:
        ValueError: If ``download_target_requester`` is set without a
            ``download_target_extractor``.
    """
    if model.download_target_requester and not model.download_target_extractor:
        raise ValueError(
            "`download_target_extractor` required if using a `download_target_requester`"
        )

    def _build_download_retriever(
        requester: Requester, extractor: RecordExtractor, _decoder: Decoder
    ) -> SimpleRetriever:
        # The download retriever gets a bare record selector: no schema
        # normalization, no transformations and no record filter, because all
        # of that happens in the record_selector of the AsyncRetriever itself.
        bare_selector = RecordSelector(
            extractor=extractor,
            name=name,
            record_filter=None,
            transformations=[],
            schema_normalization=TypeTransformer(TransformConfig.NoTransform),
            config=config,
            parameters={},
        )
        if model.download_paginator:
            download_paginator = self._create_component_from_model(
                model=model.download_paginator,
                decoder=_decoder,
                config=config,
                url_base="",
            )
        else:
            download_paginator = NoPagination(parameters={})

        return SimpleRetriever(
            requester=requester,
            record_selector=bare_selector,
            primary_key=None,
            name=name,
            paginator=download_paginator,
            config=config,
            parameters={},
            log_formatter=self._get_log_formatter(None, name),
        )

    def _resolve_job_timeout() -> datetime.timedelta:
        configured_minutes: Optional[int] = None
        if model.polling_job_timeout:
            configured_minutes = int(
                InterpolatedString.create(
                    str(model.polling_job_timeout),
                    parameters={},
                ).eval(config)
            )

        if self._emit_connector_builder_messages:
            # Test reads: user defined timeout, otherwise 15 minutes.
            return datetime.timedelta(minutes=configured_minutes or 15)
        # Regular syncs: user defined timeout, otherwise 60 minutes.
        return datetime.timedelta(minutes=configured_minutes or 60)

    def _requester_or_none(requester_model: Any, requester_name: str) -> Any:
        # Optional requesters share the main decoder and are skipped when not declared.
        if not requester_model:
            return None
        return self._create_component_from_model(
            model=requester_model,
            decoder=decoder,
            config=config,
            name=requester_name,
        )

    if model.decoder:
        decoder = self._create_component_from_model(model=model.decoder, config=config)
    else:
        decoder = JsonDecoder(parameters={})

    record_selector = self._create_component_from_model(
        model=model.record_selector,
        config=config,
        decoder=decoder,
        name=name,
        transformations=transformations,
        client_side_incremental_sync=client_side_incremental_sync,
    )

    effective_slicer = stream_slicer or SinglePartitionRouter(parameters={})
    if self._should_limit_slices_fetched():
        effective_slicer = cast(
            StreamSlicer,
            StreamSlicerTestReadDecorator(
                wrapped_slicer=effective_slicer,
                maximum_number_of_slices=self._limit_slices_fetched or 5,
            ),
        )

    creation_requester = self._create_component_from_model(
        model=model.creation_requester,
        decoder=decoder,
        config=config,
        name=f"job creation - {name}",
    )
    polling_requester = self._create_component_from_model(
        model=model.polling_requester,
        decoder=decoder,
        config=config,
        name=f"job polling - {name}",
    )

    download_component_name = f"job download - {name}"
    if model.download_decoder:
        download_decoder = self._create_component_from_model(
            model=model.download_decoder, config=config
        )
    else:
        download_decoder = JsonDecoder(parameters={})

    if model.download_extractor:
        download_extractor = self._create_component_from_model(
            model=model.download_extractor,
            config=config,
            decoder=download_decoder,
            parameters=model.parameters,
        )
    else:
        download_extractor = DpathExtractor(
            [],
            config=config,
            decoder=download_decoder,
            parameters=model.parameters or {},
        )

    download_requester = self._create_component_from_model(
        model=model.download_requester,
        decoder=download_decoder,
        config=config,
        name=download_component_name,
    )
    download_retriever = _build_download_retriever(
        download_requester, download_extractor, download_decoder
    )

    abort_requester = _requester_or_none(model.abort_requester, f"job abort - {name}")
    delete_requester = _requester_or_none(model.delete_requester, f"job delete - {name}")
    download_target_requester = _requester_or_none(
        model.download_target_requester, f"job extract_url - {name}"
    )

    status_extractor = self._create_component_from_model(
        model=model.status_extractor, decoder=decoder, config=config, name=name
    )
    if model.download_target_extractor:
        download_target_extractor = self._create_component_from_model(
            model=model.download_target_extractor,
            decoder=decoder,
            config=config,
            name=name,
        )
    else:
        download_target_extractor = None

    job_repository: AsyncJobRepository = AsyncHttpJobRepository(
        creation_requester=creation_requester,
        polling_requester=polling_requester,
        download_retriever=download_retriever,
        download_target_requester=download_target_requester,
        abort_requester=abort_requester,
        delete_requester=delete_requester,
        status_extractor=status_extractor,
        status_mapping=self._create_async_job_status_mapping(model.status_mapping, config),
        download_target_extractor=download_target_extractor,
        job_timeout=_resolve_job_timeout(),
    )

    retry_wait_seconds: Optional[int] = None
    if model.failed_retry_wait_time_in_seconds:
        retry_wait_seconds = int(
            InterpolatedString.create(
                str(model.failed_retry_wait_time_in_seconds),
                parameters={},
            ).eval(config)
        )

    def _orchestrator_factory(stream_slices: Any) -> AsyncJobOrchestrator:
        return AsyncJobOrchestrator(
            job_repository,
            stream_slices,
            self._job_tracker,
            self._message_repository,
            # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk
            has_bulk_parent=False,
            # The Connector Builder use-case gets a single attempt (`job_max_retry=1`).
            # `None` == default retry is set to 3 attempts, under the hood.
            job_max_retry=1 if self._emit_connector_builder_messages else None,
            failed_retry_wait_time_in_seconds=retry_wait_seconds,
        )

    async_job_partition_router = AsyncJobPartitionRouter(
        job_orchestrator_factory=_orchestrator_factory,
        stream_slicer=effective_slicer,
        config=config,
        parameters=model.parameters or {},
    )

    return AsyncRetriever(
        record_selector=record_selector,
        stream_slicer=async_job_partition_router,
        config=config,
        parameters=model.parameters or {},
    )
def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec:
    """Build a ``Spec`` from its declarative model.

    Config migrations, transformations and validations are read from
    ``model.config_normalization_rules`` when present; each missing or empty
    group yields an empty list.
    """
    rules = model.config_normalization_rules

    def _build_each(rule_models: Any) -> List[Any]:
        # A missing/empty group is treated as "nothing to build".
        return [
            self._create_component_from_model(rule_model, config)
            for rule_model in (rule_models or [])
        ]

    migrations = _build_each(rules.config_migrations if rules else None)
    transformations = _build_each(rules.transformations if rules else None)
    validations = _build_each(rules.validations if rules else None)

    return Spec(
        connection_specification=model.connection_specification,
        documentation_url=model.documentation_url,
        advanced_auth=model.advanced_auth,
        parameters={},
        config_migrations=migrations,
        config_transformations=transformations,
        config_validations=validations,
    )
def create_substream_partition_router(
    self,
    model: SubstreamPartitionRouterModel,
    config: Config,
    *,
    stream_name: str,
    **kwargs: Any,
) -> SubstreamPartitionRouter:
    """Build a ``SubstreamPartitionRouter`` from its declarative model.

    Each entry of ``model.parent_stream_configs`` is built through
    ``create_parent_stream_config_with_substream_wrapper``; a missing or
    empty list yields a router without parent stream configs.
    """
    built_parent_configs = []
    for parent_model in model.parent_stream_configs or []:
        built_parent_configs.append(
            self.create_parent_stream_config_with_substream_wrapper(
                model=parent_model, config=config, stream_name=stream_name, **kwargs
            )
        )

    return SubstreamPartitionRouter(
        parent_stream_configs=built_parent_configs,
        parameters=model.parameters or {},
        config=config,
    )
def create_parent_stream_config_with_substream_wrapper(
    self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any
) -> Any:
    """Build a parent stream config through a dedicated child factory.

    A new ``ModelToComponentFactory`` is created that reuses this factory's
    limits, flags and API budget, but has its own connector state manager and
    a message repository that tags logs as substream / auxiliary and is
    wrapped in ``StateFilteringMessageRepository``. The parent stream config
    is then created by that child factory.
    """
    state_of_child = self._connector_state_manager.get_stream_state(stream_name, None)

    # The child state is only handed to the parent when the parent is an
    # incremental dependency and there actually is some state.
    state_for_parent: Optional[Mapping[str, Any]] = None
    if model.incremental_dependency and state_of_child:
        state_for_parent = state_of_child

    parent_state_manager = self._instantiate_parent_stream_state_manager(
        state_of_child, config, model, state_for_parent
    )

    log_decorations = {
        "airbyte_cdk": {"stream": {"is_substream": True}},
        "http": {"is_auxiliary": True},
    }
    decorated_repository = LogAppenderMessageRepositoryDecorator(
        log_decorations,
        self._message_repository,
        self._evaluate_log_level(self._emit_connector_builder_messages),
    )
    parent_message_repository = StateFilteringMessageRepository(decorated_repository)

    parent_factory = ModelToComponentFactory(
        connector_state_manager=parent_state_manager,
        limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice,
        limit_slices_fetched=self._limit_slices_fetched,
        emit_connector_builder_messages=self._emit_connector_builder_messages,
        disable_retries=self._disable_retries,
        disable_cache=self._disable_cache,
        message_repository=parent_message_repository,
        api_budget=self._api_budget,
    )

    return parent_factory.create_parent_stream_config(
        model=model, config=config, stream_name=stream_name, **kwargs
    )
@staticmethod
def create_wait_time_from_header(
    model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any
) -> WaitTimeFromHeaderBackoffStrategy:
    """Build a ``WaitTimeFromHeaderBackoffStrategy`` from its declarative model.

    ``max_waiting_time_in_seconds`` is forwarded as-is (``None`` stays ``None``).
    """
    strategy_parameters = model.parameters or {}
    max_wait = model.max_waiting_time_in_seconds
    return WaitTimeFromHeaderBackoffStrategy(
        header=model.header,
        parameters=strategy_parameters,
        config=config,
        regex=model.regex,
        max_waiting_time_in_seconds=max_wait,
    )
@staticmethod
def create_wait_until_time_from_header(
    model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any
) -> WaitUntilTimeFromHeaderBackoffStrategy:
    """Build a ``WaitUntilTimeFromHeaderBackoffStrategy`` from its declarative model."""
    strategy_parameters = model.parameters or {}
    backoff_strategy = WaitUntilTimeFromHeaderBackoffStrategy(
        header=model.header,
        parameters=strategy_parameters,
        config=config,
        min_wait=model.min_wait,
        regex=model.regex,
    )
    return backoff_strategy
@staticmethod
def create_components_mapping_definition(
    model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any
) -> ComponentMappingDefinition:
    """Build a ``ComponentMappingDefinition`` from its declarative model.

    The mapped value and every segment of ``field_path`` are wrapped in
    ``InterpolatedString``; ``value_type`` is resolved from its JSON schema
    type name.
    """
    path_segments = []
    value_template = InterpolatedString.create(model.value, parameters=model.parameters or {})
    for segment in model.field_path:
        path_segments.append(
            InterpolatedString.create(segment, parameters=model.parameters or {})
        )

    resolved_value_type = ModelToComponentFactory._json_schema_type_name_to_type(
        model.value_type
    )
    return ComponentMappingDefinition(
        field_path=path_segments,  # type: ignore[arg-type] # field_path can be str and InterpolatedString
        value=value_template,
        value_type=resolved_value_type,
        create_or_update=model.create_or_update,
        condition=model.condition,
        parameters=model.parameters or {},
    )
def create_http_components_resolver(
    self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None
) -> Any:
    """Build an ``HttpComponentsResolver`` from its declarative model.

    The retriever is named after ``stream_name`` when given, otherwise
    ``__http_components_resolver``. The stream slicer is built from the
    retriever model twice: once for the retriever and once for the resolver.

    Raises:
        ValueError: If any components mapping definition declares a ``condition``.
    """
    retriever_name = stream_name or "__http_components_resolver"
    retriever = self._create_component_from_model(
        model=model.retriever,
        config=config,
        name=retriever_name,
        primary_key=None,
        stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config),
        transformations=[],
    )

    built_mappings = []
    for mapping_model in model.components_mapping:
        if mapping_model.condition:
            raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
        resolved_type = ModelToComponentFactory._json_schema_type_name_to_type(
            mapping_model.value_type
        )
        mapping_component = self._create_component_from_model(
            model=mapping_model,
            value_type=resolved_type,
            config=config,
        )
        built_mappings.append(mapping_component)

    resolver_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config)
    return HttpComponentsResolver(
        retriever=retriever,
        stream_slicer=resolver_slicer,
        config=config,
        components_mapping=built_mappings,
        parameters=model.parameters or {},
    )
@staticmethod
def create_stream_config(
    model: StreamConfigModel, config: Config, **kwargs: Any
) -> StreamConfig:
    """Build a ``StreamConfig`` from its declarative model.

    ``configs_pointer`` is copied into a fresh list; a missing or empty
    pointer becomes an empty list.
    """
    pointer: List[Union[InterpolatedString, str]] = []
    if model.configs_pointer:
        pointer = list(model.configs_pointer)

    return StreamConfig(
        configs_pointer=pointer,
        default_values=model.default_values,
        parameters=model.parameters or {},
    )
def create_config_components_resolver(
    self,
    model: ConfigComponentsResolverModel,
    config: Config,
) -> Any:
    """Build a ``ConfigComponentsResolver`` from its declarative model.

    ``model.stream_config`` may be a single model or a list of models; a
    single model is treated as a one-element list.
    """
    stream_config_models = model.stream_config
    if not isinstance(stream_config_models, list):
        stream_config_models = [stream_config_models]

    built_stream_configs = []
    for stream_config_model in stream_config_models:
        built_stream_configs.append(
            self._create_component_from_model(
                stream_config_model, config=config, parameters=model.parameters or {}
            )
        )

    built_mappings = []
    for mapping_model in model.components_mapping:
        resolved_type = ModelToComponentFactory._json_schema_type_name_to_type(
            mapping_model.value_type
        )
        built_mappings.append(
            self._create_component_from_model(
                model=mapping_model,
                value_type=resolved_type,
                config=config,
                # Forwarded unchanged (may be None), unlike the stream configs above.
                parameters=model.parameters,
            )
        )

    return ConfigComponentsResolver(
        stream_configs=built_stream_configs,
        config=config,
        components_mapping=built_mappings,
        parameters=model.parameters or {},
    )
def create_parametrized_components_resolver(
    self,
    model: ParametrizedComponentsResolverModel,
    config: Config,
) -> ParametrizedComponentsResolver:
    """Build a ``ParametrizedComponentsResolver`` from its declarative model.

    Raises:
        ValueError: If any components mapping definition declares a ``condition``.
    """
    parameters_definition = StreamParametersDefinition(
        list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream
    )

    built_mappings = []
    for mapping_model in model.components_mapping:
        if mapping_model.condition:
            raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
        resolved_type = ModelToComponentFactory._json_schema_type_name_to_type(
            mapping_model.value_type
        )
        mapping_component = self._create_component_from_model(
            model=mapping_model,
            value_type=resolved_type,
            config=config,
        )
        built_mappings.append(mapping_component)

    return ParametrizedComponentsResolver(
        stream_parameters=parameters_definition,
        config=config,
        components_mapping=built_mappings,
        parameters=model.parameters or {},
    )
def create_http_api_budget(
    self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any
) -> HttpAPIBudget:
    """Build an ``HttpAPIBudget`` from its declarative model.

    Falsy header names / status codes on the model fall back to
    ``"ratelimit-reset"``, ``"ratelimit-remaining"`` and ``[429]``.
    """
    built_policies = []
    for policy_model in model.policies:
        built_policies.append(self._create_component_from_model(model=policy_model, config=config))

    reset_header = model.ratelimit_reset_header or "ratelimit-reset"
    remaining_header = model.ratelimit_remaining_header or "ratelimit-remaining"
    rate_limit_status_codes = model.status_codes_for_ratelimit_hit or [429]

    return HttpAPIBudget(
        policies=built_policies,
        ratelimit_reset_header=reset_header,
        ratelimit_remaining_header=remaining_header,
        status_codes_for_ratelimit_hit=rate_limit_status_codes,
    )
def create_fixed_window_call_rate_policy(
    self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any
) -> FixedWindowCallRatePolicy:
    """Build a ``FixedWindowCallRatePolicy`` from its declarative model.

    ``model.period`` is parsed with ``parse_duration``.
    """
    request_matchers = []
    for matcher_model in model.matchers:
        request_matchers.append(
            self._create_component_from_model(model=matcher_model, config=config)
        )

    # The initial reset timestamp is a placeholder 10 days from now.
    # This value will be updated by the first request.
    placeholder_reset_ts = datetime.datetime.now() + datetime.timedelta(days=10)

    return FixedWindowCallRatePolicy(
        next_reset_ts=placeholder_reset_ts,
        period=parse_duration(model.period),
        call_limit=model.call_limit,
        matchers=request_matchers,
    )
def create_file_uploader(
    self, model: FileUploaderModel, config: Config, **kwargs: Any
) -> FileUploader:
    """Build a file uploader from its declarative model.

    When connector builder messages are emitted, files are written with a
    ``NoopFileWriter`` and the uploader is wrapped in
    ``ConnectorBuilderFileUploader``; otherwise a ``LocalFileSystemFileWriter``
    is used and the ``DefaultFileUploader`` is returned directly.
    """
    component_name = "File Uploader"
    requester = self._create_component_from_model(
        model=model.requester,
        config=config,
        name=component_name,
        **kwargs,
    )
    target_extractor = self._create_component_from_model(
        model=model.download_target_extractor,
        config=config,
        name=component_name,
        **kwargs,
    )

    in_builder_mode = self._emit_connector_builder_messages
    if in_builder_mode:
        writer = NoopFileWriter()
    else:
        writer = LocalFileSystemFileWriter()

    uploader = DefaultFileUploader(
        requester=requester,
        download_target_extractor=target_extractor,
        config=config,
        file_writer=writer,
        parameters=model.parameters or {},
        filename_extractor=model.filename_extractor or None,
    )

    if in_builder_mode:
        return ConnectorBuilderFileUploader(uploader)
    return uploader
def create_moving_window_call_rate_policy(
    self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any
) -> MovingWindowCallRatePolicy:
    """Build a ``MovingWindowCallRatePolicy`` from its declarative model.

    All rates are built first, then all matchers.
    """
    built_rates = []
    for rate_model in model.rates:
        built_rates.append(self._create_component_from_model(model=rate_model, config=config))

    built_matchers = []
    for matcher_model in model.matchers:
        built_matchers.append(
            self._create_component_from_model(model=matcher_model, config=config)
        )

    return MovingWindowCallRatePolicy(rates=built_rates, matchers=built_matchers)
def create_unlimited_call_rate_policy(
    self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any
) -> UnlimitedCallRatePolicy:
    """Build an ``UnlimitedCallRatePolicy`` from its declarative model."""
    built_matchers = []
    for matcher_model in model.matchers:
        built_matchers.append(
            self._create_component_from_model(model=matcher_model, config=config)
        )
    return UnlimitedCallRatePolicy(matchers=built_matchers)
def create_http_request_matcher(
    self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any
) -> HttpRequestRegexMatcher:
    """Build an ``HttpRequestRegexMatcher`` from its declarative model.

    A string ``weight`` is interpolated against ``config`` before being
    converted to ``int``; any other non-``None`` weight is converted directly.
    A ``None`` weight is passed through untouched.

    Raises:
        ValueError: If the resolved weight is lower than 1.
    """
    declared_weight = model.weight
    resolved_weight = None
    if declared_weight is not None:
        if isinstance(declared_weight, str):
            evaluated = InterpolatedString.create(declared_weight, parameters={}).eval(config)
            resolved_weight = int(evaluated)
        else:
            resolved_weight = int(declared_weight)
        if resolved_weight < 1:
            raise ValueError(f"weight must be >= 1, got {resolved_weight}")

    return HttpRequestRegexMatcher(
        method=model.method,
        url_base=model.url_base,
        url_path_pattern=model.url_path_pattern,
        params=model.params,
        headers=model.headers,
        weight=resolved_weight,
    )
def create_grouping_partition_router(
    self,
    model: GroupingPartitionRouterModel,
    config: Config,
    *,
    stream_name: str,
    **kwargs: Any,
) -> GroupingPartitionRouter:
    """Build a ``GroupingPartitionRouter`` around the model's underlying router.

    ``deduplicate`` defaults to ``True`` when the model leaves it unset.

    Raises:
        ValueError: If ``group_size`` is lower than 1, or if the underlying
            router (substream or list) declares request options.
    """
    wrapped_router = self._create_component_from_model(
        model=model.underlying_partition_router,
        config=config,
        stream_name=stream_name,
        **kwargs,
    )
    if model.group_size < 1:
        raise ValueError(f"Group size must be greater than 0, got {model.group_size}")

    # Request options in underlying partition routers are not supported for GroupingPartitionRouter
    # because they are specific to individual partitions and cannot be aggregated or handled
    # when grouping, potentially leading to incorrect API calls. Any request customization
    # should be managed at the stream level through the requester's configuration.
    if isinstance(wrapped_router, SubstreamPartitionRouter):
        for parent_config in wrapped_router.parent_stream_configs:
            if parent_config.request_option:
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

    if isinstance(wrapped_router, ListPartitionRouter) and wrapped_router.request_option:
        raise ValueError("Request options are not supported for GroupingPartitionRouter.")

    should_deduplicate = True if model.deduplicate is None else model.deduplicate
    return GroupingPartitionRouter(
        group_size=model.group_size,
        underlying_partition_router=wrapped_router,
        deduplicate=should_deduplicate,
        config=config,
    )