# airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
1# 2# Copyright (c) 2025 Airbyte, Inc., all rights reserved. 3# 4 5from __future__ import annotations 6 7import datetime 8import importlib 9import inspect 10import logging 11import re 12from functools import partial 13from typing import ( 14 TYPE_CHECKING, 15 Any, 16 Callable, 17 Dict, 18 List, 19 Mapping, 20 MutableMapping, 21 Optional, 22 Tuple, 23 Type, 24 Union, 25 cast, 26 get_args, 27 get_origin, 28 get_type_hints, 29) 30 31if TYPE_CHECKING: 32 from airbyte_cdk.legacy.sources.declarative.incremental.datetime_based_cursor import ( 33 DatetimeBasedCursor, 34 ) 35 36from airbyte_protocol_dataclasses.models import ConfiguredAirbyteStream 37from isodate import parse_duration 38from pydantic.v1 import BaseModel 39from requests import Response 40 41from airbyte_cdk.connector_builder.models import ( 42 LogMessage as ConnectorBuilderLogMessage, 43) 44from airbyte_cdk.models import ( 45 AirbyteStateBlob, 46 AirbyteStateMessage, 47 AirbyteStateType, 48 AirbyteStreamState, 49 ConfiguredAirbyteCatalog, 50 FailureType, 51 Level, 52 StreamDescriptor, 53) 54from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager 55from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator 56from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker 57from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository 58from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus 59from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator 60from airbyte_cdk.sources.declarative.auth.declarative_authenticator import ( 61 DeclarativeAuthenticator, 62 NoAuth, 63) 64from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm 65from airbyte_cdk.sources.declarative.auth.oauth import ( 66 DeclarativeSingleUseRefreshTokenOauth2Authenticator, 67) 68from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator 69from 
airbyte_cdk.sources.declarative.auth.token import ( 70 ApiKeyAuthenticator, 71 BasicHttpAuthenticator, 72 BearerAuthenticator, 73 LegacySessionTokenAuthenticator, 74) 75from airbyte_cdk.sources.declarative.auth.token_provider import ( 76 InterpolatedSessionTokenProvider, 77 InterpolatedStringTokenProvider, 78 SessionTokenProvider, 79 TokenProvider, 80) 81from airbyte_cdk.sources.declarative.checks import ( 82 CheckDynamicStream, 83 CheckStream, 84 DynamicStreamCheckConfig, 85) 86from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel 87from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime 88from airbyte_cdk.sources.declarative.decoders import ( 89 Decoder, 90 IterableDecoder, 91 JsonDecoder, 92 PaginationDecoderDecorator, 93 XmlDecoder, 94 ZipfileDecoder, 95) 96from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import ( 97 CompositeRawDecoder, 98 CsvParser, 99 GzipParser, 100 JsonLineParser, 101 JsonParser, 102 Parser, 103) 104from airbyte_cdk.sources.declarative.expanders.record_expander import ( 105 OnNoRecords, 106 RecordExpander, 107) 108from airbyte_cdk.sources.declarative.extractors import ( 109 DpathExtractor, 110 RecordFilter, 111 RecordSelector, 112 ResponseToFileExtractor, 113) 114from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor 115from airbyte_cdk.sources.declarative.extractors.record_filter import ( 116 ClientSideIncrementalRecordFilterDecorator, 117) 118from airbyte_cdk.sources.declarative.incremental import ( 119 ConcurrentCursorFactory, 120 ConcurrentPerPartitionCursor, 121) 122from airbyte_cdk.sources.declarative.interpolation import InterpolatedString 123from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping 124from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import ( 125 LegacyToPerPartitionStateMigration, 126) 127from airbyte_cdk.sources.declarative.models 
import ( 128 CustomStateMigration, 129 PaginationResetLimits, 130) 131from airbyte_cdk.sources.declarative.models.base_model_with_deprecations import ( 132 DEPRECATION_LOGS_TAG, 133 BaseModelWithDeprecations, 134) 135from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 136 Action1 as PaginationResetActionModel, 137) 138from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 139 AddedFieldDefinition as AddedFieldDefinitionModel, 140) 141from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 142 AddFields as AddFieldsModel, 143) 144from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 145 ApiKeyAuthenticator as ApiKeyAuthenticatorModel, 146) 147from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 148 AsyncJobStatusMap as AsyncJobStatusMapModel, 149) 150from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 151 AsyncRetriever as AsyncRetrieverModel, 152) 153from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 154 BasicHttpAuthenticator as BasicHttpAuthenticatorModel, 155) 156from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 157 BearerAuthenticator as BearerAuthenticatorModel, 158) 159from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 160 CheckDynamicStream as CheckDynamicStreamModel, 161) 162from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 163 CheckStream as CheckStreamModel, 164) 165from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 166 ComplexFieldType as ComplexFieldTypeModel, 167) 168from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 169 ComponentMappingDefinition as ComponentMappingDefinitionModel, 170) 171from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 172 CompositeErrorHandler as 
CompositeErrorHandlerModel, 173) 174from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 175 ConcurrencyLevel as ConcurrencyLevelModel, 176) 177from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 178 ConfigAddFields as ConfigAddFieldsModel, 179) 180from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 181 ConfigComponentsResolver as ConfigComponentsResolverModel, 182) 183from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 184 ConfigMigration as ConfigMigrationModel, 185) 186from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 187 ConfigRemapField as ConfigRemapFieldModel, 188) 189from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 190 ConfigRemoveFields as ConfigRemoveFieldsModel, 191) 192from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 193 ConstantBackoffStrategy as ConstantBackoffStrategyModel, 194) 195from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 196 CsvDecoder as CsvDecoderModel, 197) 198from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 199 CursorPagination as CursorPaginationModel, 200) 201from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 202 CustomAuthenticator as CustomAuthenticatorModel, 203) 204from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 205 CustomBackoffStrategy as CustomBackoffStrategyModel, 206) 207from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 208 CustomConfigTransformation as CustomConfigTransformationModel, 209) 210from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 211 CustomDecoder as CustomDecoderModel, 212) 213from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 214 CustomErrorHandler as CustomErrorHandlerModel, 
215) 216from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 217 CustomPaginationStrategy as CustomPaginationStrategyModel, 218) 219from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 220 CustomPartitionRouter as CustomPartitionRouterModel, 221) 222from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 223 CustomRecordExtractor as CustomRecordExtractorModel, 224) 225from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 226 CustomRecordFilter as CustomRecordFilterModel, 227) 228from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 229 CustomRequester as CustomRequesterModel, 230) 231from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 232 CustomRetriever as CustomRetrieverModel, 233) 234from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 235 CustomSchemaLoader as CustomSchemaLoader, 236) 237from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 238 CustomSchemaNormalization as CustomSchemaNormalizationModel, 239) 240from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 241 CustomTransformation as CustomTransformationModel, 242) 243from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 244 CustomValidationStrategy as CustomValidationStrategyModel, 245) 246from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 247 DatetimeBasedCursor as DatetimeBasedCursorModel, 248) 249from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 250 DeclarativeStream as DeclarativeStreamModel, 251) 252from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 253 DefaultErrorHandler as DefaultErrorHandlerModel, 254) 255from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 256 DefaultPaginator as 
DefaultPaginatorModel, 257) 258from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 259 DpathExtractor as DpathExtractorModel, 260) 261from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 262 DpathFlattenFields as DpathFlattenFieldsModel, 263) 264from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 265 DpathValidator as DpathValidatorModel, 266) 267from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 268 DynamicSchemaLoader as DynamicSchemaLoaderModel, 269) 270from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 271 DynamicStreamCheckConfig as DynamicStreamCheckConfigModel, 272) 273from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 274 ExponentialBackoffStrategy as ExponentialBackoffStrategyModel, 275) 276from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 277 FileUploader as FileUploaderModel, 278) 279from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 280 FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel, 281) 282from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 283 FlattenFields as FlattenFieldsModel, 284) 285from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 286 GroupByKeyMergeStrategy as GroupByKeyMergeStrategyModel, 287) 288from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 289 GroupingPartitionRouter as GroupingPartitionRouterModel, 290) 291from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 292 GzipDecoder as GzipDecoderModel, 293) 294from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 295 HTTPAPIBudget as HTTPAPIBudgetModel, 296) 297from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 298 HttpComponentsResolver as 
HttpComponentsResolverModel, 299) 300from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 301 HttpRequester as HttpRequesterModel, 302) 303from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 304 HttpRequestRegexMatcher as HttpRequestRegexMatcherModel, 305) 306from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 307 HttpResponseFilter as HttpResponseFilterModel, 308) 309from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 310 IncrementingCountCursor as IncrementingCountCursorModel, 311) 312from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 313 InlineSchemaLoader as InlineSchemaLoaderModel, 314) 315from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 316 IterableDecoder as IterableDecoderModel, 317) 318from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 319 JsonDecoder as JsonDecoderModel, 320) 321from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 322 JsonFileSchemaLoader as JsonFileSchemaLoaderModel, 323) 324from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 325 JsonlDecoder as JsonlDecoderModel, 326) 327from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 328 JsonSchemaPropertySelector as JsonSchemaPropertySelectorModel, 329) 330from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 331 JwtAuthenticator as JwtAuthenticatorModel, 332) 333from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 334 JwtHeaders as JwtHeadersModel, 335) 336from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 337 JwtPayload as JwtPayloadModel, 338) 339from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 340 KeysReplace as KeysReplaceModel, 341) 342from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 343 KeysToLower as KeysToLowerModel, 344) 345from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 346 KeysToSnakeCase as KeysToSnakeCaseModel, 347) 348from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 349 LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel, 350) 351from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 352 LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel, 353) 354from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 355 ListPartitionRouter as ListPartitionRouterModel, 356) 357from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 358 MinMaxDatetime as MinMaxDatetimeModel, 359) 360from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 361 MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel, 362) 363from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 364 NoAuth as NoAuthModel, 365) 366from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 367 NoPagination as NoPaginationModel, 368) 369from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 370 OAuthAuthenticator as OAuthAuthenticatorModel, 371) 372from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 373 OffsetIncrement as OffsetIncrementModel, 374) 375from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 376 PageIncrement as PageIncrementModel, 377) 378from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 379 PaginationReset as PaginationResetModel, 380) 381from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 382 ParametrizedComponentsResolver as ParametrizedComponentsResolverModel, 383) 384from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 385 ParentStreamConfig as ParentStreamConfigModel, 386) 387from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 388 PredicateValidator as PredicateValidatorModel, 389) 390from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 391 PropertiesFromEndpoint as PropertiesFromEndpointModel, 392) 393from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 394 PropertyChunking as PropertyChunkingModel, 395) 396from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 397 PropertyLimitType as PropertyLimitTypeModel, 398) 399from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 400 QueryProperties as QueryPropertiesModel, 401) 402from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 403 Rate as RateModel, 404) 405from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 406 RecordExpander as RecordExpanderModel, 407) 408from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 409 RecordFilter as RecordFilterModel, 410) 411from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 412 RecordSelector as RecordSelectorModel, 413) 414from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 415 RefreshTokenUpdater as RefreshTokenUpdaterModel, 416) 417from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 418 RemoveFields as RemoveFieldsModel, 419) 420from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 421 RequestOption as RequestOptionModel, 422) 423from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 424 RequestPath as RequestPathModel, 425) 426from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 427 ResponseToFileExtractor as 
ResponseToFileExtractorModel, 428) 429from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 430 SchemaNormalization as SchemaNormalizationModel, 431) 432from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 433 SchemaTypeIdentifier as SchemaTypeIdentifierModel, 434) 435from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 436 SelectiveAuthenticator as SelectiveAuthenticatorModel, 437) 438from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 439 SessionTokenAuthenticator as SessionTokenAuthenticatorModel, 440) 441from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 442 SimpleRetriever as SimpleRetrieverModel, 443) 444from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel 445from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 446 StateDelegatingStream as StateDelegatingStreamModel, 447) 448from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 449 StreamConfig as StreamConfigModel, 450) 451from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 452 SubstreamPartitionRouter as SubstreamPartitionRouterModel, 453) 454from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 455 TypesMap as TypesMapModel, 456) 457from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 458 UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel, 459) 460from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 461 ValidateAdheresToSchema as ValidateAdheresToSchemaModel, 462) 463from airbyte_cdk.sources.declarative.models.declarative_component_schema import ValueType 464from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 465 WaitTimeFromHeader as WaitTimeFromHeaderModel, 466) 467from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 468 WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel, 469) 470from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 471 XmlDecoder as XmlDecoderModel, 472) 473from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 474 ZipfileDecoder as ZipfileDecoderModel, 475) 476from airbyte_cdk.sources.declarative.partition_routers import ( 477 CartesianProductStreamSlicer, 478 GroupingPartitionRouter, 479 ListPartitionRouter, 480 PartitionRouter, 481 SinglePartitionRouter, 482 SubstreamPartitionRouter, 483) 484from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import ( 485 AsyncJobPartitionRouter, 486) 487from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( 488 ParentStreamConfig, 489) 490from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption 491from airbyte_cdk.sources.declarative.requesters.error_handlers import ( 492 CompositeErrorHandler, 493 DefaultErrorHandler, 494 HttpResponseFilter, 495) 496from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import ( 497 ConstantBackoffStrategy, 498 ExponentialBackoffStrategy, 499 WaitTimeFromHeaderBackoffStrategy, 500 WaitUntilTimeFromHeaderBackoffStrategy, 501) 502from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository 503from airbyte_cdk.sources.declarative.requesters.paginators import ( 504 DefaultPaginator, 505 NoPagination, 506 PaginatorTestReadDecorator, 507) 508from airbyte_cdk.sources.declarative.requesters.paginators.strategies import ( 509 CursorPaginationStrategy, 510 CursorStopCondition, 511 OffsetIncrement, 512 PageIncrement, 513 StopConditionPaginationStrategyDecorator, 514) 515from airbyte_cdk.sources.declarative.requesters.query_properties import ( 516 PropertiesFromEndpoint, 517 PropertyChunking, 518 
QueryProperties, 519) 520from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import ( 521 PropertyLimitType, 522) 523from airbyte_cdk.sources.declarative.requesters.query_properties.property_selector import ( 524 JsonSchemaPropertySelector, 525) 526from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import ( 527 GroupByKey, 528) 529from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType 530from airbyte_cdk.sources.declarative.requesters.request_options import ( 531 DatetimeBasedRequestOptionsProvider, 532 DefaultRequestOptionsProvider, 533 InterpolatedRequestOptionsProvider, 534 RequestOptionsProvider, 535) 536from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import ( 537 PerPartitionRequestOptionsProvider, 538) 539from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath 540from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester 541from airbyte_cdk.sources.declarative.resolvers import ( 542 ComponentMappingDefinition, 543 ConfigComponentsResolver, 544 HttpComponentsResolver, 545 ParametrizedComponentsResolver, 546 StreamConfig, 547 StreamParametersDefinition, 548) 549from airbyte_cdk.sources.declarative.retrievers import ( 550 AsyncRetriever, 551 LazySimpleRetriever, 552 SimpleRetriever, 553) 554from airbyte_cdk.sources.declarative.retrievers.file_uploader import ( 555 ConnectorBuilderFileUploader, 556 DefaultFileUploader, 557 FileUploader, 558 LocalFileSystemFileWriter, 559 NoopFileWriter, 560) 561from airbyte_cdk.sources.declarative.retrievers.pagination_tracker import PaginationTracker 562from airbyte_cdk.sources.declarative.schema import ( 563 ComplexFieldType, 564 DefaultSchemaLoader, 565 DynamicSchemaLoader, 566 InlineSchemaLoader, 567 JsonFileSchemaLoader, 568 SchemaLoader, 569 SchemaTypeIdentifier, 570 TypesMap, 571) 572from 
airbyte_cdk.sources.declarative.schema.caching_schema_loader_decorator import ( 573 CachingSchemaLoaderDecorator, 574) 575from airbyte_cdk.sources.declarative.schema.composite_schema_loader import CompositeSchemaLoader 576from airbyte_cdk.sources.declarative.spec import ConfigMigration, Spec 577from airbyte_cdk.sources.declarative.stream_slicers import ( 578 StreamSlicer, 579 StreamSlicerTestReadDecorator, 580) 581from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( 582 DeclarativePartitionFactory, 583 StreamSlicerPartitionGenerator, 584) 585from airbyte_cdk.sources.declarative.transformations import ( 586 AddFields, 587 RecordTransformation, 588 RemoveFields, 589) 590from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition 591from airbyte_cdk.sources.declarative.transformations.config_transformations import ( 592 ConfigAddFields, 593 ConfigRemapField, 594 ConfigRemoveFields, 595) 596from airbyte_cdk.sources.declarative.transformations.config_transformations.config_transformation import ( 597 ConfigTransformation, 598) 599from airbyte_cdk.sources.declarative.transformations.dpath_flatten_fields import ( 600 DpathFlattenFields, 601 KeyTransformation, 602) 603from airbyte_cdk.sources.declarative.transformations.flatten_fields import ( 604 FlattenFields, 605) 606from airbyte_cdk.sources.declarative.transformations.keys_replace_transformation import ( 607 KeysReplaceTransformation, 608) 609from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import ( 610 KeysToLowerTransformation, 611) 612from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import ( 613 KeysToSnakeCaseTransformation, 614) 615from airbyte_cdk.sources.declarative.validators import ( 616 DpathValidator, 617 PredicateValidator, 618 ValidateAdheresToSchema, 619) 620from airbyte_cdk.sources.http_logger import format_http_message 621from airbyte_cdk.sources.message import ( 622 
# Type alias for the raw mapping form of a component definition parsed from a manifest.
ComponentDefinition = Mapping[str, Any]

# Maps manifest schema-normalization options onto the TypeTransformer configurations
# used when emitting records.
SCHEMA_TRANSFORMER_TYPE_MAPPING = {
    SchemaNormalizationModel.None_: TransformConfig.NoTransform,
    SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
}

# Sentinel partition router used when a stream performs no slicing.
_NO_STREAM_SLICING = SinglePartitionRouter(parameters={})

# Ideally this should use the value defined in ConcurrentDeclarativeSource, but
# this would be a circular import
MAX_SLICES = 5

# NOTE: the original used an f-string with no placeholders here (ruff F541);
# a plain string literal is equivalent.
LOGGER = logging.getLogger("airbyte.model_to_component_factory")
class ModelToComponentFactory:
    """Builds runtime declarative components from parsed Pydantic manifest models.

    Maintains a dispatch table from Pydantic model types to `create_*` factory
    methods (see `_init_mappings`) and carries the cross-cutting options
    (test-read limits, caching/retry toggles, message repository, state
    manager, API budget) that those factories need.
    """

    # strftime-style format token used to render datetimes as epoch seconds.
    EPOCH_DATETIME_FORMAT = "%s"

    def __init__(
        self,
        limit_pages_fetched_per_slice: Optional[int] = None,
        limit_slices_fetched: Optional[int] = None,
        emit_connector_builder_messages: bool = False,
        disable_retries: bool = False,
        disable_cache: bool = False,
        message_repository: Optional[MessageRepository] = None,
        connector_state_manager: Optional[ConnectorStateManager] = None,
        max_concurrent_async_job_count: Optional[int] = None,
        configured_catalog: Optional[ConfiguredAirbyteCatalog] = None,
        api_budget: Optional[APIBudget] = None,
    ):
        """
        :param limit_pages_fetched_per_slice: cap on pages fetched per stream slice (test reads)
        :param limit_slices_fetched: cap on slices fetched per stream (test reads)
        :param emit_connector_builder_messages: when True, components emit extra messages for the
            Connector Builder; also drives the log level passed to the default message repository
        :param disable_retries: disable request retry behavior
        :param disable_cache: disable HTTP response caching
        :param message_repository: repository used to emit messages; defaults to an in-memory one
        :param connector_state_manager: state manager; a fresh ConnectorStateManager by default
        :param max_concurrent_async_job_count: max concurrent async jobs (defaults to 1)
        :param configured_catalog: catalog used to index configured streams by stream name
        :param api_budget: optional API call-rate budget shared by requesters
        """
        self._init_mappings()
        self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
        self._limit_slices_fetched = limit_slices_fetched
        self._emit_connector_builder_messages = emit_connector_builder_messages
        self._disable_retries = disable_retries
        self._disable_cache = disable_cache
        self._message_repository = message_repository or InMemoryMessageRepository(
            self._evaluate_log_level(emit_connector_builder_messages)
        )
        self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream(
            configured_catalog
        )
        self._connector_state_manager = connector_state_manager or ConnectorStateManager()
        # Original annotation was `Optional[Union[APIBudget]]`; a one-member Union is a
        # no-op, so this collapses to `Optional[APIBudget]`.
        self._api_budget: Optional[APIBudget] = api_budget
        self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1)
        # placeholder for deprecation warnings
        self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
    def _init_mappings(self) -> None:
        """Build the dispatch table from Pydantic manifest model types to factory methods.

        Every entry maps a declarative-component-schema model class to the
        `create_*` method that turns an instance of that model into its runtime
        component. All `Custom*` models route to the generic
        `create_custom_component`, which instantiates the user-provided class.
        Also builds `TYPE_NAME_TO_MODEL` so model classes can be looked up by name.
        """
        self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = {
            AddedFieldDefinitionModel: self.create_added_field_definition,
            AddFieldsModel: self.create_add_fields,
            ApiKeyAuthenticatorModel: self.create_api_key_authenticator,
            BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
            BearerAuthenticatorModel: self.create_bearer_authenticator,
            CheckStreamModel: self.create_check_stream,
            DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config,
            CheckDynamicStreamModel: self.create_check_dynamic_stream,
            CompositeErrorHandlerModel: self.create_composite_error_handler,
            ConcurrencyLevelModel: self.create_concurrency_level,
            ConfigMigrationModel: self.create_config_migration,
            ConfigAddFieldsModel: self.create_config_add_fields,
            ConfigRemapFieldModel: self.create_config_remap_field,
            ConfigRemoveFieldsModel: self.create_config_remove_fields,
            ConstantBackoffStrategyModel: self.create_constant_backoff_strategy,
            CsvDecoderModel: self.create_csv_decoder,
            CursorPaginationModel: self.create_cursor_pagination,
            # All Custom* models share a single generic factory that imports and
            # instantiates the user-supplied class_name.
            CustomAuthenticatorModel: self.create_custom_component,
            CustomBackoffStrategyModel: self.create_custom_component,
            CustomDecoderModel: self.create_custom_component,
            CustomErrorHandlerModel: self.create_custom_component,
            CustomRecordExtractorModel: self.create_custom_component,
            CustomRecordFilterModel: self.create_custom_component,
            CustomRequesterModel: self.create_custom_component,
            CustomRetrieverModel: self.create_custom_component,
            CustomSchemaLoader: self.create_custom_component,
            CustomSchemaNormalizationModel: self.create_custom_component,
            CustomStateMigration: self.create_custom_component,
            CustomPaginationStrategyModel: self.create_custom_component,
            CustomPartitionRouterModel: self.create_custom_component,
            CustomTransformationModel: self.create_custom_component,
            CustomValidationStrategyModel: self.create_custom_component,
            CustomConfigTransformationModel: self.create_custom_component,
            DeclarativeStreamModel: self.create_default_stream,
            DefaultErrorHandlerModel: self.create_default_error_handler,
            DefaultPaginatorModel: self.create_default_paginator,
            DpathExtractorModel: self.create_dpath_extractor,
            DpathValidatorModel: self.create_dpath_validator,
            ResponseToFileExtractorModel: self.create_response_to_file_extractor,
            ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy,
            SessionTokenAuthenticatorModel: self.create_session_token_authenticator,
            GroupByKeyMergeStrategyModel: self.create_group_by_key,
            HttpRequesterModel: self.create_http_requester,
            HttpResponseFilterModel: self.create_http_response_filter,
            InlineSchemaLoaderModel: self.create_inline_schema_loader,
            JsonDecoderModel: self.create_json_decoder,
            JsonlDecoderModel: self.create_jsonl_decoder,
            JsonSchemaPropertySelectorModel: self.create_json_schema_property_selector,
            GzipDecoderModel: self.create_gzip_decoder,
            KeysToLowerModel: self.create_keys_to_lower_transformation,
            KeysToSnakeCaseModel: self.create_keys_to_snake_transformation,
            KeysReplaceModel: self.create_keys_replace_transformation,
            FlattenFieldsModel: self.create_flatten_fields,
            DpathFlattenFieldsModel: self.create_dpath_flatten_fields,
            IterableDecoderModel: self.create_iterable_decoder,
            XmlDecoderModel: self.create_xml_decoder,
            JsonFileSchemaLoaderModel: self.create_json_file_schema_loader,
            DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
            SchemaTypeIdentifierModel: self.create_schema_type_identifier,
            TypesMapModel: self.create_types_map,
            ComplexFieldTypeModel: self.create_complex_field_type,
            JwtAuthenticatorModel: self.create_jwt_authenticator,
            LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
            ListPartitionRouterModel: self.create_list_partition_router,
            MinMaxDatetimeModel: self.create_min_max_datetime,
            NoAuthModel: self.create_no_auth,
            NoPaginationModel: self.create_no_pagination,
            OAuthAuthenticatorModel: self.create_oauth_authenticator,
            OffsetIncrementModel: self.create_offset_increment,
            PageIncrementModel: self.create_page_increment,
            ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper,
            PredicateValidatorModel: self.create_predicate_validator,
            PropertiesFromEndpointModel: self.create_properties_from_endpoint,
            PropertyChunkingModel: self.create_property_chunking,
            QueryPropertiesModel: self.create_query_properties,
            RecordExpanderModel: self.create_record_expander,
            RecordFilterModel: self.create_record_filter,
            RecordSelectorModel: self.create_record_selector,
            RemoveFieldsModel: self.create_remove_fields,
            RequestPathModel: self.create_request_path,
            RequestOptionModel: self.create_request_option,
            LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator,
            SelectiveAuthenticatorModel: self.create_selective_authenticator,
            SimpleRetrieverModel: self.create_simple_retriever,
            StateDelegatingStreamModel: self.create_state_delegating_stream,
            SpecModel: self.create_spec,
            SubstreamPartitionRouterModel: self.create_substream_partition_router,
            ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema,
            WaitTimeFromHeaderModel: self.create_wait_time_from_header,
            WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header,
            AsyncRetrieverModel: self.create_async_retriever,
            HttpComponentsResolverModel: self.create_http_components_resolver,
            ConfigComponentsResolverModel: self.create_config_components_resolver,
            ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver,
            StreamConfigModel: self.create_stream_config,
            ComponentMappingDefinitionModel: self.create_components_mapping_definition,
            ZipfileDecoderModel: self.create_zipfile_decoder,
            HTTPAPIBudgetModel: self.create_http_api_budget,
            FileUploaderModel: self.create_file_uploader,
            FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
            MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
            UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}
self.create_zipfile_decoder,
            HTTPAPIBudgetModel: self.create_http_api_budget,
            FileUploaderModel: self.create_file_uploader,
            FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
            MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
            UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}

@staticmethod
def _create_stream_name_to_configured_stream(
    configured_catalog: Optional[ConfiguredAirbyteCatalog],
) -> Mapping[str, ConfiguredAirbyteStream]:
    """Index the configured catalog's streams by stream name; empty mapping when no catalog is provided."""
    return (
        {stream.stream.name: stream for stream in configured_catalog.streams}
        if configured_catalog
        else {}
    )

def create_component(
    self,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    config: Config,
    **kwargs: Any,
) -> Any:
    """
    Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and
    subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then
    creating declarative components from that model.

    :param model_type: The type of declarative component that is being initialized
    :param component_definition: The mapping that represents a declarative component
    :param config: The connector config that is provided by the customer
    :return: The declarative component to be used at runtime
    """

    # The manifest "type" discriminator must match the model we were asked to build.
    component_type = component_definition.get("type")
    if component_definition.get("type") != model_type.__name__:
        raise ValueError(
            f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
        )

    declarative_component_model = model_type.parse_obj(component_definition)

    # parse_obj should return the requested model type; guard against subclass surprises.
    if not isinstance(declarative_component_model, model_type):
        raise ValueError(
            f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}"
        )

    return self._create_component_from_model(
        model=declarative_component_model, config=config, **kwargs
    )

def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any:
    """Dispatch a parsed Pydantic model to its registered constructor; raises ValueError for unknown model classes."""
    if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR:
        raise ValueError(
            f"{model.__class__} with attributes {model} is not a valid component type"
        )
    component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__)
    if not component_constructor:
        raise ValueError(f"Could not find constructor for {model.__class__}")

    # collect deprecation warnings for supported models.
    if isinstance(model, BaseModelWithDeprecations):
        self._collect_model_deprecations(model)

    return component_constructor(model=model, config=config, **kwargs)

def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]:
    """
    Returns the deprecation warnings that were collected during the creation of components.
    """
    return self._collected_deprecation_logs
893 """ 894 return self._collected_deprecation_logs 895 896 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 897 """ 898 Collects deprecation logs from the given model and appends any new logs to the internal collection. 899 900 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 901 902 Args: 903 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 904 """ 905 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 906 for log in model._deprecation_logs: 907 # avoid duplicates for deprecation logs observed. 908 if log not in self._collected_deprecation_logs: 909 self._collected_deprecation_logs.append(log) 910 911 def create_config_migration( 912 self, model: ConfigMigrationModel, config: Config 913 ) -> ConfigMigration: 914 transformations: List[ConfigTransformation] = [ 915 self._create_component_from_model(transformation, config) 916 for transformation in model.transformations 917 ] 918 919 return ConfigMigration( 920 description=model.description, 921 transformations=transformations, 922 ) 923 924 def create_config_add_fields( 925 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 926 ) -> ConfigAddFields: 927 fields = [self._create_component_from_model(field, config) for field in model.fields] 928 return ConfigAddFields( 929 fields=fields, 930 condition=model.condition or "", 931 ) 932 933 @staticmethod 934 def create_config_remove_fields( 935 model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any 936 ) -> ConfigRemoveFields: 937 return ConfigRemoveFields( 938 field_pointers=model.field_pointers, 939 condition=model.condition or "", 
@staticmethod
def create_config_remap_field(
    model: ConfigRemapFieldModel, config: Config, **kwargs: Any
) -> ConfigRemapField:
    """Build a ConfigRemapField that rewrites the value at ``field_path`` using the model's map."""
    remap_table = cast(Mapping[str, Any], model.map)
    return ConfigRemapField(
        map=remap_table,
        field_path=model.field_path,
        config=config,
    )

def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator:
    """Build a DpathValidator with its nested validation strategy component."""
    validation_strategy = self._create_component_from_model(model.validation_strategy, config)
    return DpathValidator(
        field_path=model.field_path,
        strategy=validation_strategy,
    )

def create_predicate_validator(
    self, model: PredicateValidatorModel, config: Config
) -> PredicateValidator:
    """Build a PredicateValidator with its nested validation strategy component."""
    validation_strategy = self._create_component_from_model(model.validation_strategy, config)
    return PredicateValidator(
        value=model.value,
        strategy=validation_strategy,
    )

@staticmethod
def create_validate_adheres_to_schema(
    model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any
) -> ValidateAdheresToSchema:
    """Build a ValidateAdheresToSchema validator around the model's base schema."""
    return ValidateAdheresToSchema(
        schema=cast(Mapping[str, Any], model.base_schema),
    )

@staticmethod
def create_added_field_definition(
    model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
) -> AddedFieldDefinition:
    """Build an AddedFieldDefinition, interpolating its value template and resolving its value type."""
    field_parameters = model.parameters or {}
    return AddedFieldDefinition(
        path=model.path,
        value=InterpolatedString.create(model.value, parameters=field_parameters),
        value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
        parameters=field_parameters,
    )

def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields:
    """Build an AddFields transformation, creating each field definition with its resolved value type."""
    definitions: List[Any] = []
    for field_model in model.fields:
        definitions.append(
            self._create_component_from_model(
                model=field_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    field_model.value_type
                ),
                config=config,
            )
        )
    return AddFields(
        fields=definitions,
        condition=model.condition or "",
        parameters=model.parameters or {},
    )
def create_keys_to_lower_transformation(
    self, model: KeysToLowerModel, config: Config, **kwargs: Any
) -> KeysToLowerTransformation:
    """Build the stateless transformation that lower-cases record keys."""
    return KeysToLowerTransformation()

def create_keys_to_snake_transformation(
    self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any
) -> KeysToSnakeCaseTransformation:
    """Build the stateless transformation that snake_cases record keys."""
    return KeysToSnakeCaseTransformation()

def create_keys_replace_transformation(
    self, model: KeysReplaceModel, config: Config, **kwargs: Any
) -> KeysReplaceTransformation:
    """Build the transformation that replaces ``old`` with ``new`` inside record keys."""
    return KeysReplaceTransformation(
        old=model.old,
        new=model.new,
        parameters=model.parameters or {},
    )

def create_flatten_fields(
    self, model: FlattenFieldsModel, config: Config, **kwargs: Any
) -> FlattenFields:
    """Build a FlattenFields transformation; ``flatten_lists`` defaults to True when unset."""
    flatten_lists = True if model.flatten_lists is None else model.flatten_lists
    return FlattenFields(flatten_lists=flatten_lists)

def create_dpath_flatten_fields(
    self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
) -> DpathFlattenFields:
    """Build a DpathFlattenFields transformation, including its optional key transformation."""
    parameters = model.parameters or {}
    field_path: List[Union[InterpolatedString, str]] = list(model.field_path)

    key_transformation = None
    if model.key_transformation is not None:
        key_transformation = KeyTransformation(
            config=config,
            prefix=model.key_transformation.prefix,
            suffix=model.key_transformation.suffix,
            parameters=parameters,
        )

    # Unset optional booleans fall back to False.
    delete_origin_value = (
        False if model.delete_origin_value is None else model.delete_origin_value
    )
    replace_record = False if model.replace_record is None else model.replace_record

    return DpathFlattenFields(
        config=config,
        field_path=field_path,
        delete_origin_value=delete_origin_value,
        replace_record=replace_record,
        key_transformation=key_transformation,
        parameters=parameters,
    )
@staticmethod
def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
    """Map a declarative ValueType enum member to the corresponding Python type; None passes through."""
    if not value_type:
        return None
    names_to_types = {
        ValueType.string: str,
        ValueType.number: float,
        ValueType.integer: int,
        ValueType.boolean: bool,
    }
    return names_to_types[value_type]

def create_api_key_authenticator(
    self,
    model: ApiKeyAuthenticatorModel,
    config: Config,
    token_provider: Optional[TokenProvider] = None,
    **kwargs: Any,
) -> ApiKeyAuthenticator:
    """
    Build an ApiKeyAuthenticator.

    Exactly one of ``inject_into`` (preferred) or the deprecated ``header`` option must be
    set on the model. When an external ``token_provider`` is supplied it takes precedence,
    and the model's ``api_token`` must then be the empty string.
    """
    if model.inject_into is None and model.header is None:
        raise ValueError(
            "Expected either inject_into or header to be set for ApiKeyAuthenticator"
        )

    if model.inject_into is not None and model.header is not None:
        raise ValueError(
            "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option"
        )

    if token_provider is not None and model.api_token != "":
        raise ValueError(
            "If token_provider is set, api_token is ignored and has to be set to empty string."
        )

    # The deprecated `header` option is translated into an equivalent header-style RequestOption.
    request_option = (
        self._create_component_from_model(
            model.inject_into, config, parameters=model.parameters or {}
        )
        if model.inject_into
        else RequestOption(
            inject_into=RequestOptionType.header,
            field_name=model.header or "",
            parameters=model.parameters or {},
        )
    )

    return ApiKeyAuthenticator(
        token_provider=(
            token_provider
            if token_provider is not None
            else InterpolatedStringTokenProvider(
                api_token=model.api_token or "",
                config=config,
                parameters=model.parameters or {},
            )
        ),
        request_option=request_option,
        config=config,
        parameters=model.parameters or {},
    )
def create_legacy_to_per_partition_state_migration(
    self,
    model: LegacyToPerPartitionStateMigrationModel,
    config: Mapping[str, Any],
    declarative_stream: DeclarativeStreamModel,
) -> LegacyToPerPartitionStateMigration:
    """
    Build a LegacyToPerPartitionStateMigration for the given stream.

    The migration only applies to a stream whose retriever is a SimpleRetriever or
    AsyncRetriever with a Substream (or custom) partition router exposing parent stream
    configs, and which carries an incremental_sync configuration; each precondition is
    validated here and a ValueError is raised otherwise.
    """
    retriever = declarative_stream.retriever
    if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
        raise ValueError(
            f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
        )
    partition_router = retriever.partition_router
    if not isinstance(
        partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
    ):
        raise ValueError(
            f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
        )
    if not hasattr(partition_router, "parent_stream_configs"):
        raise ValueError(
            "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
        )

    if not hasattr(declarative_stream, "incremental_sync"):
        raise ValueError(
            "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
        )

    return LegacyToPerPartitionStateMigration(
        partition_router,  # type: ignore # was already checked above
        declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
        config,
        declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
    )
def create_session_token_authenticator(
    self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
    """
    Build an authenticator that logs in through a dedicated login requester and injects the
    obtained session token into outgoing requests.

    The token is wrapped in a SessionTokenProvider and then delegated either to a
    BearerAuthenticator (when ``request_authentication.type`` is "Bearer") or to an
    ApiKeyAuthenticator driven by the configured request option.
    """
    decoder = (
        self._create_component_from_model(model=model.decoder, config=config)
        if model.decoder
        else JsonDecoder(parameters={})
    )
    login_requester = self._create_component_from_model(
        model=model.login_requester,
        config=config,
        name=f"{name}_login_requester",
        decoder=decoder,
    )
    token_provider = SessionTokenProvider(
        login_requester=login_requester,
        session_token_path=model.session_token_path,
        expiration_duration=parse_duration(model.expiration_duration)
        if model.expiration_duration
        else None,
        parameters=model.parameters or {},
        message_repository=self._message_repository,
        decoder=decoder,
    )
    if model.request_authentication.type == "Bearer":
        return ModelToComponentFactory.create_bearer_authenticator(
            BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
            config,
            token_provider=token_provider,
        )
    else:
        # Get the api_token template if specified, default to just the session token
        api_token_template = (
            getattr(model.request_authentication, "api_token", None) or "{{ session_token }}"
        )
        final_token_provider: TokenProvider = InterpolatedSessionTokenProvider(
            config=config,
            api_token=api_token_template,
            session_token_provider=token_provider,
            parameters=model.parameters or {},
        )
        return self.create_api_key_authenticator(
            ApiKeyAuthenticatorModel(
                type="ApiKeyAuthenticator",
                api_token="",
                inject_into=model.request_authentication.inject_into,
            ),  # type: ignore # $parameters and headers default to None
            config=config,
            token_provider=final_token_provider,
        )
@staticmethod
def create_basic_http_authenticator(
    model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
) -> BasicHttpAuthenticator:
    """Build a BasicHttpAuthenticator; a missing password defaults to the empty string."""
    return BasicHttpAuthenticator(
        username=model.username,
        password=model.password or "",
        config=config,
        parameters=model.parameters or {},
    )

@staticmethod
def create_bearer_authenticator(
    model: BearerAuthenticatorModel,
    config: Config,
    token_provider: Optional[TokenProvider] = None,
    **kwargs: Any,
) -> BearerAuthenticator:
    """
    Build a BearerAuthenticator.

    When an external ``token_provider`` is supplied it takes precedence, and the model's
    ``api_token`` must then be the empty string.
    """
    parameters = model.parameters or {}
    if token_provider is not None:
        if model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )
        provider: TokenProvider = token_provider
    else:
        provider = InterpolatedStringTokenProvider(
            api_token=model.api_token or "",
            config=config,
            parameters=parameters,
        )
    return BearerAuthenticator(
        token_provider=provider,
        config=config,
        parameters=parameters,
    )
1228 ) 1229 return BearerAuthenticator( 1230 token_provider=( 1231 token_provider 1232 if token_provider is not None 1233 else InterpolatedStringTokenProvider( 1234 api_token=model.api_token or "", 1235 config=config, 1236 parameters=model.parameters or {}, 1237 ) 1238 ), 1239 config=config, 1240 parameters=model.parameters or {}, 1241 ) 1242 1243 @staticmethod 1244 def create_dynamic_stream_check_config( 1245 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1246 ) -> DynamicStreamCheckConfig: 1247 return DynamicStreamCheckConfig( 1248 dynamic_stream_name=model.dynamic_stream_name, 1249 stream_count=model.stream_count, 1250 ) 1251 1252 def create_check_stream( 1253 self, model: CheckStreamModel, config: Config, **kwargs: Any 1254 ) -> CheckStream: 1255 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1256 raise ValueError( 1257 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1258 ) 1259 1260 dynamic_streams_check_configs = ( 1261 [ 1262 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1263 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1264 ] 1265 if model.dynamic_streams_check_configs 1266 else [] 1267 ) 1268 1269 return CheckStream( 1270 stream_names=model.stream_names or [], 1271 dynamic_streams_check_configs=dynamic_streams_check_configs, 1272 parameters={}, 1273 ) 1274 1275 @staticmethod 1276 def create_check_dynamic_stream( 1277 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1278 ) -> CheckDynamicStream: 1279 assert model.use_check_availability is not None # for mypy 1280 1281 use_check_availability = model.use_check_availability 1282 1283 return CheckDynamicStream( 1284 stream_count=model.stream_count, 1285 use_check_availability=use_check_availability, 1286 parameters={}, 1287 ) 1288 1289 def create_composite_error_handler( 1290 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: 
Any 1291 ) -> CompositeErrorHandler: 1292 error_handlers = [ 1293 self._create_component_from_model(model=error_handler_model, config=config) 1294 for error_handler_model in model.error_handlers 1295 ] 1296 return CompositeErrorHandler( 1297 error_handlers=error_handlers, parameters=model.parameters or {} 1298 ) 1299 1300 @staticmethod 1301 def create_concurrency_level( 1302 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1303 ) -> ConcurrencyLevel: 1304 return ConcurrencyLevel( 1305 default_concurrency=model.default_concurrency, 1306 max_concurrency=model.max_concurrency, 1307 config=config, 1308 parameters={}, 1309 ) 1310 1311 @staticmethod 1312 def apply_stream_state_migrations( 1313 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1314 ) -> MutableMapping[str, Any]: 1315 if stream_state_migrations: 1316 for state_migration in stream_state_migrations: 1317 if state_migration.should_migrate(stream_state): 1318 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
def create_concurrent_cursor_from_datetime_based_cursor(
    self,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    stream_name: str,
    stream_namespace: Optional[str],
    stream_state: MutableMapping[str, Any],
    config: Config,
    message_repository: Optional[MessageRepository] = None,
    runtime_lookback_window: Optional[datetime.timedelta] = None,
    **kwargs: Any,
) -> ConcurrentCursor:
    """
    Build a ConcurrentCursor from a DatetimeBasedCursor component definition.

    The definition is parsed into a DatetimeBasedCursorModel, and its datetime boundaries,
    step/cursor_granularity pair, lookback window and optional clamping strategy are
    translated into the concurrent cursor's primitives.

    :param model_type: Expected model type; must match component_definition["type"]
    :param component_definition: The mapping that describes the cursor component
    :param stream_name: Name of the stream the cursor belongs to
    :param stream_namespace: Optional namespace of the stream
    :param stream_state: Mutable stream state; may be shifted back in place by runtime_lookback_window
    :param config: The connector config that is provided by the customer
    :param message_repository: Overrides the factory-level message repository when provided
    :param runtime_lookback_window: Extra lookback applied to the existing state (used for failed partitions)
    :return: The configured ConcurrentCursor
    """
    component_type = component_definition.get("type")
    if component_definition.get("type") != model_type.__name__:
        raise ValueError(
            f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
        )

    # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
    # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
    # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
    # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
    if "$parameters" not in component_definition and "parameters" in component_definition:
        component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
    datetime_based_cursor_model = model_type.parse_obj(component_definition)

    if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
        raise ValueError(
            f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
        )

    model_parameters = datetime_based_cursor_model.parameters or {}

    # A catalog-defined cursor field (when allowed) takes precedence over the model's cursor_field.
    cursor_field = self._get_catalog_defined_cursor_field(
        stream_name=stream_name,
        allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
        or False,
    )

    if not cursor_field:
        interpolated_cursor_field = InterpolatedString.create(
            datetime_based_cursor_model.cursor_field,
            parameters=model_parameters,
        )
        cursor_field = CursorField(
            cursor_field_key=interpolated_cursor_field.eval(config=config),
            supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

    interpolated_partition_field_start = InterpolatedString.create(
        datetime_based_cursor_model.partition_field_start or "start_time",
        parameters=model_parameters,
    )
    interpolated_partition_field_end = InterpolatedString.create(
        datetime_based_cursor_model.partition_field_end or "end_time",
        parameters=model_parameters,
    )

    slice_boundary_fields = (
        interpolated_partition_field_start.eval(config=config),
        interpolated_partition_field_end.eval(config=config),
    )

    datetime_format = datetime_based_cursor_model.datetime_format

    cursor_granularity = (
        parse_duration(datetime_based_cursor_model.cursor_granularity)
        if datetime_based_cursor_model.cursor_granularity
        else None
    )

    # The lookback window is an interpolated ISO-8601 duration string; it only becomes a
    # timedelta when it evaluates to a non-empty value.
    lookback_window = None
    interpolated_lookback_window = (
        InterpolatedString.create(
            datetime_based_cursor_model.lookback_window,
            parameters=model_parameters,
        )
        if datetime_based_cursor_model.lookback_window
        else None
    )
    if interpolated_lookback_window:
        evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
        if evaluated_lookback_window:
            lookback_window = parse_duration(evaluated_lookback_window)

    connector_state_converter: DateTimeStreamStateConverter
    connector_state_converter = CustomFormatConcurrentStreamStateConverter(
        datetime_format=datetime_format,
        input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
        is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
        cursor_granularity=cursor_granularity,
    )

    # Adjusts the stream state by applying the runtime lookback window.
    # This is used to ensure correct state handling in case of failed partitions.
    stream_state_value = stream_state.get(cursor_field.cursor_field_key)
    if runtime_lookback_window and stream_state_value:
        new_stream_state = (
            connector_state_converter.parse_timestamp(stream_state_value)
            - runtime_lookback_window
        )
        stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
            new_stream_state
        )

    start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
    if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
        start_date_runtime_value = self.create_min_max_datetime(
            model=datetime_based_cursor_model.start_datetime, config=config
        )
    else:
        start_date_runtime_value = datetime_based_cursor_model.start_datetime

    end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]]
    if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel):
        end_date_runtime_value = self.create_min_max_datetime(
            model=datetime_based_cursor_model.end_datetime, config=config
        )
    else:
        end_date_runtime_value = datetime_based_cursor_model.end_datetime

    interpolated_start_date = MinMaxDatetime.create(
        interpolated_string_or_min_max_datetime=start_date_runtime_value,
        parameters=datetime_based_cursor_model.parameters,
    )
    interpolated_end_date = (
        None
        if not end_date_runtime_value
        else MinMaxDatetime.create(
            end_date_runtime_value, datetime_based_cursor_model.parameters
        )
    )

    # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
    if not interpolated_start_date.datetime_format:
        interpolated_start_date.datetime_format = datetime_format
    if interpolated_end_date and not interpolated_end_date.datetime_format:
        interpolated_end_date.datetime_format = datetime_format

    start_date = interpolated_start_date.get_datetime(config=config)
    end_date_provider = (
        partial(interpolated_end_date.get_datetime, config)
        if interpolated_end_date
        else connector_state_converter.get_end_provider()
    )

    # step and cursor_granularity must be provided together (XOR is invalid).
    if (
        datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity
    ) or (
        not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity
    ):
        raise ValueError(
            f"If step is defined, cursor_granularity should be as well and vice-versa. "
            f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`"
        )

    # When step is not defined, default to a step size from the starting date to the present moment
    step_length = datetime.timedelta.max
    interpolated_step = (
        InterpolatedString.create(
            datetime_based_cursor_model.step,
            parameters=model_parameters,
        )
        if datetime_based_cursor_model.step
        else None
    )
    if interpolated_step:
        evaluated_step = interpolated_step.eval(config)
        if evaluated_step:
            step_length = parse_duration(evaluated_step)

    clamping_strategy: ClampingStrategy = NoClamping()
    if datetime_based_cursor_model.clamping:
        # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
        # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime
        # object which we want to keep agnostic of being low-code
        target = InterpolatedString(
            string=datetime_based_cursor_model.clamping.target,
            parameters=model_parameters,
        )
        evaluated_target = target.eval(config=config)
        match evaluated_target:
            case "DAY":
                clamping_strategy = DayClampingStrategy()
                end_date_provider = ClampingEndProvider(
                    DayClampingStrategy(is_ceiling=False),
                    end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                    granularity=cursor_granularity or datetime.timedelta(seconds=1),
                )
            case "WEEK":
                if (
                    not datetime_based_cursor_model.clamping.target_details
                    or "weekday" not in datetime_based_cursor_model.clamping.target_details
                ):
                    raise ValueError(
                        "Given WEEK clamping, weekday needs to be provided as target_details"
                    )
                weekday = self._assemble_weekday(
                    datetime_based_cursor_model.clamping.target_details["weekday"]
                )
                clamping_strategy = WeekClampingStrategy(weekday)
                end_date_provider = ClampingEndProvider(
                    WeekClampingStrategy(weekday, is_ceiling=False),
                    end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                    granularity=cursor_granularity or datetime.timedelta(days=1),
                )
            case "MONTH":
                clamping_strategy = MonthClampingStrategy()
                end_date_provider = ClampingEndProvider(
                    MonthClampingStrategy(is_ceiling=False),
                    end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                    granularity=cursor_granularity or datetime.timedelta(days=1),
                )
            case _:
                raise ValueError(
                    f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
                )

    return ConcurrentCursor(
        stream_name=stream_name,
        stream_namespace=stream_namespace,
        stream_state=stream_state,
        message_repository=message_repository or self._message_repository,
        connector_state_manager=self._connector_state_manager,
        connector_state_converter=connector_state_converter,
        cursor_field=cursor_field,
        slice_boundary_fields=slice_boundary_fields,
        start=start_date,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        end_provider=end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        lookback_window=lookback_window,
        slice_range=step_length,
        cursor_granularity=cursor_granularity,
        clamping_strategy=clamping_strategy,
    )
IncrementingCountCursorModel): 1578 raise ValueError( 1579 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1580 ) 1581 1582 start_value: Union[int, str, None] = incrementing_count_cursor_model.start_value 1583 # Pydantic Union type coercion can convert int 0 to string '0' depending on Union order. 1584 # We need to handle both int and str representations of numeric values. 1585 # Evaluate the InterpolatedString and convert to int for the ConcurrentCursor. 1586 if start_value is not None: 1587 interpolated_start_value = InterpolatedString.create( 1588 str(start_value), # Ensure we pass a string to InterpolatedString.create 1589 parameters=incrementing_count_cursor_model.parameters or {}, 1590 ) 1591 evaluated_start_value: int = int(interpolated_start_value.eval(config=config)) 1592 else: 1593 evaluated_start_value = 0 1594 1595 cursor_field = self._get_catalog_defined_cursor_field( 1596 stream_name=stream_name, 1597 allow_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field 1598 or False, 1599 ) 1600 1601 if not cursor_field: 1602 interpolated_cursor_field = InterpolatedString.create( 1603 incrementing_count_cursor_model.cursor_field, 1604 parameters=incrementing_count_cursor_model.parameters or {}, 1605 ) 1606 cursor_field = CursorField( 1607 cursor_field_key=interpolated_cursor_field.eval(config=config), 1608 supports_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field 1609 or False, 1610 ) 1611 1612 connector_state_converter = IncrementingCountStreamStateConverter( 1613 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1614 ) 1615 1616 return ConcurrentCursor( 1617 stream_name=stream_name, 1618 stream_namespace=stream_namespace, 1619 stream_state=stream_state, 1620 message_repository=message_repository or self._message_repository, 1621 
    def _assemble_weekday(self, weekday: str) -> Weekday:
        """Map an upper-case weekday name ("MONDAY".."SUNDAY") to the Weekday enum.

        :raises ValueError: for any string that is not one of the seven supported names.
        """
        match weekday:
            case "MONDAY":
                return Weekday.MONDAY
            case "TUESDAY":
                return Weekday.TUESDAY
            case "WEDNESDAY":
                return Weekday.WEDNESDAY
            case "THURSDAY":
                return Weekday.THURSDAY
            case "FRIDAY":
                return Weekday.FRIDAY
            case "SATURDAY":
                return Weekday.SATURDAY
            case "SUNDAY":
                return Weekday.SUNDAY
            case _:
                raise ValueError(f"Unknown weekday {weekday}")

    def create_concurrent_cursor_from_perpartition_cursor(
        self,
        state_manager: ConnectorStateManager,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        config: Config,
        stream_state: MutableMapping[str, Any],
        partition_router: PartitionRouter,
        attempt_to_create_cursor_if_not_provided: bool = False,
        **kwargs: Any,
    ) -> ConcurrentPerPartitionCursor:
        """Build a ConcurrentPerPartitionCursor for a DatetimeBasedCursor component used
        together with a partition router.

        The returned cursor holds a factory that creates one datetime-based cursor per
        partition (each with a NoopMessageRepository so per-partition cursors do not emit
        state themselves).

        :raises ValueError: if the component definition's type does not match
            `model_type`, or the parsed model is not a DatetimeBasedCursorModel.
        """
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
        # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
        # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
        # NOTE(review): this mutates the caller's `component_definition` in place — confirm no caller relies on it staying unmodified.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        # A cursor field configured on the catalog wins over the manifest-defined one.
        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                datetime_based_cursor_model.cursor_field,
                # See the `$parameters` vs `parameters` FIXME above: the same dual-source issue
                # applies to the parameters passed to this interpolation.
                parameters=datetime_based_cursor_model.parameters or {},
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Create the cursor factory: each per-partition cursor gets a NoopMessageRepository
        # so individual partitions do not emit state messages on their own.
        cursor_factory = ConcurrentCursorFactory(
            partial(
                self.create_concurrent_cursor_from_datetime_based_cursor,
                state_manager=state_manager,
                model_type=model_type,
                component_definition=component_definition,
                stream_name=stream_name,
                stream_namespace=stream_namespace,
                config=config,
                message_repository=NoopMessageRepository(),
            )
        )

        # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
        use_global_cursor = isinstance(
            partition_router, GroupingPartitionRouter
        ) or component_definition.get("global_substream_cursor", False)

        # Return the concurrent cursor and state converter
        return ConcurrentPerPartitionCursor(
            cursor_factory=cursor_factory,
            partition_router=partition_router,
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=self._message_repository,  # type: ignore
            connector_state_manager=state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            use_global_cursor=use_global_cursor,
            attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided,
        )
    @staticmethod
    def create_constant_backoff_strategy(
        model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
    ) -> ConstantBackoffStrategy:
        """Build a ConstantBackoffStrategy (fixed wait between retries) from its model."""
        return ConstantBackoffStrategy(
            backoff_time_in_seconds=model.backoff_time_in_seconds,
            config=config,
            parameters=model.parameters or {},
        )

    def create_cursor_pagination(
        self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any
    ) -> CursorPaginationStrategy:
        """Build a CursorPaginationStrategy, wrapping `decoder` for pagination use.

        :raises ValueError: if the underlying decoder type is not supported for pagination.
        """
        # Unwrap (or wrap) so we can validate the innermost decoder while always handing a
        # PaginationDecoderDecorator to the strategy.
        if isinstance(decoder, PaginationDecoderDecorator):
            inner_decoder = decoder.decoder
        else:
            inner_decoder = decoder
            decoder = PaginationDecoderDecorator(decoder=decoder)

        if self._is_supported_decoder_for_pagination(inner_decoder):
            decoder_to_use = decoder
        else:
            raise ValueError(
                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
            )

        # Pydantic v1 Union type coercion can convert int to string depending on Union order.
        # If page_size is a string that represents an integer (not an interpolation), convert it back.
        page_size = model.page_size
        if isinstance(page_size, str) and page_size.isdigit():
            page_size = int(page_size)

        return CursorPaginationStrategy(
            cursor_value=model.cursor_value,
            decoder=decoder_to_use,
            page_size=page_size,
            stop_condition=model.stop_condition,
            config=config,
            parameters=model.parameters or {},
        )
    def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any:
        """
        Generically creates a custom component based on the model type and a class_name reference to the custom Python class being
        instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor
        :param model: The Pydantic model of the custom component being created
        :param config: The custom defined connector config
        :return: The declarative component built from the Pydantic model to be used at runtime
        """
        custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name)
        component_fields = get_type_hints(custom_component_class)
        model_args = model.dict()
        model_args["config"] = config

        # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions
        # we defer to these arguments over the component's definition
        for key, arg in kwargs.items():
            model_args[key] = arg

        # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not
        # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to
        # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components
        for model_field, model_value in model_args.items():
            # If a custom component field doesn't have a type set, we try to use the type hints to infer the type
            if (
                isinstance(model_value, dict)
                and "type" not in model_value
                and model_field in component_fields
            ):
                derived_type = self._derive_component_type_from_type_hints(
                    component_fields.get(model_field)
                )
                if derived_type:
                    model_value["type"] = derived_type

            if self._is_component(model_value):
                model_args[model_field] = self._create_nested_component(
                    model,
                    model_field,
                    model_value,
                    config,
                    **kwargs,
                )
            elif isinstance(model_value, list):
                # Lists get the same treatment element-by-element: infer a missing "type"
                # from the field's type hint, then build each element that is a component.
                vals = []
                for v in model_value:
                    if isinstance(v, dict) and "type" not in v and model_field in component_fields:
                        derived_type = self._derive_component_type_from_type_hints(
                            component_fields.get(model_field)
                        )
                        if derived_type:
                            v["type"] = derived_type
                    if self._is_component(v):
                        vals.append(
                            self._create_nested_component(
                                model,
                                model_field,
                                v,
                                config,
                                **kwargs,
                            )
                        )
                    else:
                        vals.append(v)
                model_args[model_field] = vals

        # Only pass through the arguments the custom class actually declares.
        kwargs = {
            class_field: model_args[class_field]
            for class_field in component_fields.keys()
            if class_field in model_args
        }
        return custom_component_class(**kwargs)
1872 1873 Args: 1874 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1875 1876 Returns: 1877 Any: The class object. 1878 1879 Raises: 1880 ValueError: If the class cannot be loaded. 1881 """ 1882 split = full_qualified_class_name.split(".") 1883 module_name_full = ".".join(split[:-1]) 1884 class_name = split[-1] 1885 1886 try: 1887 module_ref = importlib.import_module(module_name_full) 1888 except ModuleNotFoundError as e: 1889 if split[0] == "source_declarative_manifest": 1890 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1891 try: 1892 import os 1893 1894 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1895 module_ref = importlib.import_module( 1896 module_name_with_source_declarative_manifest 1897 ) 1898 except ModuleNotFoundError: 1899 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1900 else: 1901 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1902 1903 try: 1904 return getattr(module_ref, class_name) 1905 except AttributeError as e: 1906 raise ValueError( 1907 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1908 ) from e 1909 1910 @staticmethod 1911 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1912 interface = field_type 1913 while True: 1914 origin = get_origin(interface) 1915 if origin: 1916 # Unnest types until we reach the raw type 1917 # List[T] -> T 1918 # Optional[List[T]] -> T 1919 args = get_args(interface) 1920 interface = args[0] 1921 else: 1922 break 1923 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1924 return interface.__name__ 1925 return None 1926 1927 @staticmethod 1928 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1929 if not cls: 1930 return 
False 1931 return cls.__module__ == "builtins" 1932 1933 @staticmethod 1934 def _extract_missing_parameters(error: TypeError) -> List[str]: 1935 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1936 if parameter_search: 1937 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1938 else: 1939 return [] 1940 1941 def _create_nested_component( 1942 self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any 1943 ) -> Any: 1944 type_name = model_value.get("type", None) 1945 if not type_name: 1946 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1947 return model_value 1948 1949 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1950 if model_type: 1951 parsed_model = model_type.parse_obj(model_value) 1952 try: 1953 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1954 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1955 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1956 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1957 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1958 # are needed by a component and could not be shared. 
    def _create_nested_component(
        self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any
    ) -> Any:
        """Build a declarative component for a sub-field of a custom component.

        `model_value` is the raw dict for the sub-field; if it carries no "type" it is
        returned unchanged as a plain dictionary.

        :raises ValueError: when the type has no registered model, or when required
            keyword-only parameters are missing from `$parameters`.
        :raises TypeError: for any other constructor failure, re-raised with context.
        """
        type_name = model_value.get("type", None)
        if not type_name:
            # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent
            return model_value

        model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None)
        if model_type:
            parsed_model = model_type.parse_obj(model_value)
            try:
                # To improve usability of the language, certain fields are shared between components. This can come in the form of
                # a parent component passing some of its fields to a child component or the parent extracting fields from other child
                # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base
                # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created
                # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that
                # are needed by a component and could not be shared.
                model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__)
                constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs
                model_parameters = model_value.get("$parameters", {})
                matching_parameters = {
                    kwarg: model_parameters[kwarg]
                    for kwarg in constructor_kwargs
                    if kwarg in model_parameters
                }
                # Explicit kwargs from the caller win over $parameters (merged second below).
                matching_kwargs = {
                    kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs
                }
                return self._create_component_from_model(
                    model=parsed_model, config=config, **(matching_parameters | matching_kwargs)
                )
            except TypeError as error:
                missing_parameters = self._extract_missing_parameters(error)
                if missing_parameters:
                    raise ValueError(
                        f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide "
                        + ", ".join(
                            (
                                f"{type_name}.$parameters.{parameter}"
                                for parameter in missing_parameters
                            )
                        )
                    )
                raise TypeError(
                    f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}"
                )
        else:
            raise ValueError(
                f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'"
            )

    @staticmethod
    def _is_component(model_value: Any) -> bool:
        """Return True when `model_value` is a dict carrying a non-None "type" key."""
        return isinstance(model_value, dict) and model_value.get("type") is not None
    def create_default_stream(
        self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any
    ) -> AbstractStream:
        """Assemble a concurrent DefaultStream from a DeclarativeStream model.

        Orchestrates: state migration, partition-router construction, concurrent cursor
        construction, request-options provider selection (datetime vs incrementing-count
        incremental sync), transformations, optional file uploader, retriever, and
        schema loader (wrapped in a caching decorator).

        :raises ValueError: when a ConcurrentPerPartitionCursor is combined with an
            IncrementingCountCursor (unsupported combination).
        """
        primary_key = model.primary_key.__root__ if model.primary_key else None
        self._migrate_state(model, config)

        partition_router = self._build_stream_slicer_from_partition_router(
            model.retriever,
            config,
            stream_name=model.name,
            **kwargs,
        )
        concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
        if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
            cursor_model: DatetimeBasedCursorModel = model.incremental_sync

            end_time_option = (
                self._create_component_from_model(
                    cursor_model.end_time_option, config, parameters=cursor_model.parameters or {}
                )
                if cursor_model.end_time_option
                else None
            )
            start_time_option = (
                self._create_component_from_model(
                    cursor_model.start_time_option, config, parameters=cursor_model.parameters or {}
                )
                if cursor_model.start_time_option
                else None
            )

            datetime_request_options_provider = DatetimeBasedRequestOptionsProvider(
                start_time_option=start_time_option,
                end_time_option=end_time_option,
                partition_field_start=cursor_model.partition_field_start,
                partition_field_end=cursor_model.partition_field_end,
                config=config,
                parameters=model.parameters or {},
            )
            # Per-partition cursors need the partition-aware wrapper around the provider.
            request_options_provider = (
                datetime_request_options_provider
                if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor)
                else PerPartitionRequestOptionsProvider(
                    partition_router, datetime_request_options_provider
                )
            )
        elif model.incremental_sync and isinstance(
            model.incremental_sync, IncrementingCountCursorModel
        ):
            if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor):
                raise ValueError(
                    "PerPartition does not support per partition states because switching to global state is time based"
                )

            cursor_model: IncrementingCountCursorModel = model.incremental_sync  # type: ignore

            start_time_option = (
                self._create_component_from_model(
                    cursor_model.start_value_option,  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
                    config,
                    parameters=cursor_model.parameters or {},
                )
                if cursor_model.start_value_option  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
                else None
            )

            # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but
            # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time
            partition_field_start = "start"

            request_options_provider = DatetimeBasedRequestOptionsProvider(
                start_time_option=start_time_option,
                partition_field_start=partition_field_start,
                config=config,
                parameters=model.parameters or {},
            )
        else:
            # No incremental sync: the retriever gets no extra request options provider.
            request_options_provider = None

        transformations = []
        if model.transformations:
            for transformation_model in model.transformations:
                transformations.append(
                    self._create_component_from_model(model=transformation_model, config=config)
                )
        file_uploader = None
        if model.file_uploader:
            file_uploader = self._create_component_from_model(
                model=model.file_uploader, config=config
            )

        # A FinalStateCursor means "no real cursor": slice by the partition router alone.
        stream_slicer: ConcurrentStreamSlicer = (
            partition_router
            if isinstance(concurrent_cursor, FinalStateCursor)
            else concurrent_cursor
        )

        retriever = self._create_component_from_model(
            model=model.retriever,
            config=config,
            name=model.name,
            primary_key=primary_key,
            request_options_provider=request_options_provider,
            stream_slicer=stream_slicer,
            partition_router=partition_router,
            has_stop_condition_cursor=self._is_stop_condition_on_cursor(model),
            is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model),
            cursor=concurrent_cursor,
            transformations=transformations,
            file_uploader=file_uploader,
            incremental_sync=model.incremental_sync,
        )
        # Async retrievers own their slicer; defer to it.
        if isinstance(retriever, AsyncRetriever):
            stream_slicer = retriever.stream_slicer

        schema_loader: SchemaLoader
        if model.schema_loader and isinstance(model.schema_loader, list):
            nested_schema_loaders = [
                self._create_component_from_model(model=nested_schema_loader, config=config)
                for nested_schema_loader in model.schema_loader
            ]
            schema_loader = CompositeSchemaLoader(
                schema_loaders=nested_schema_loaders, parameters={}
            )
        elif model.schema_loader:
            schema_loader = self._create_component_from_model(
                model=model.schema_loader,  # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models
                config=config,
            )
        else:
            options = model.parameters or {}
            if "name" not in options:
                options["name"] = model.name
            schema_loader = DefaultSchemaLoader(config=config, parameters=options)
        schema_loader = CachingSchemaLoaderDecorator(schema_loader)

        stream_name = model.name or ""
        return DefaultStream(
            partition_generator=StreamSlicerPartitionGenerator(
                DeclarativePartitionFactory(
                    stream_name,
                    schema_loader,
                    retriever,
                    self._message_repository,
                ),
                stream_slicer,
                slice_limit=self._limit_slices_fetched,
            ),
            name=stream_name,
            json_schema=schema_loader.get_json_schema,
            primary_key=get_primary_key_from_stream(primary_key),
            cursor_field=(
                concurrent_cursor.cursor_field
                if hasattr(concurrent_cursor, "cursor_field")
                else None
            ),
            logger=logging.getLogger(f"airbyte.{stream_name}"),
            cursor=concurrent_cursor,
            supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader),
        )
    def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None:
        """Apply the stream's configured state migrations to its persisted state.

        Reads the current state from the connector state manager, runs each
        `state_migrations` transformation in order, and writes the result back.
        """
        stream_name = model.name or ""
        stream_state = self._connector_state_manager.get_stream_state(
            stream_name=stream_name, namespace=None
        )
        if model.state_migrations:
            state_transformations = [
                self._create_component_from_model(state_migration, config, declarative_stream=model)
                for state_migration in model.state_migrations
            ]
        else:
            state_transformations = []
        stream_state = self.apply_stream_state_migrations(state_transformations, stream_state)
        self._connector_state_manager.update_state_for_stream(
            stream_name=stream_name, namespace=None, value=stream_state
        )

    def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool:
        """True when the incremental sync declares `is_data_feed` (cursor-based stop condition)."""
        return bool(
            model.incremental_sync
            and hasattr(model.incremental_sync, "is_data_feed")
            and model.incremental_sync.is_data_feed
        )

    def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool:
        """True when the incremental sync declares `is_client_side_incremental`."""
        return bool(
            model.incremental_sync
            and hasattr(model.incremental_sync, "is_client_side_incremental")
            and model.incremental_sync.is_client_side_incremental
        )
    def _build_stream_slicer_from_partition_router(
        self,
        model: Union[
            AsyncRetrieverModel,
            CustomRetrieverModel,
            SimpleRetrieverModel,
        ],
        config: Config,
        stream_name: Optional[str] = None,
        **kwargs: Any,
    ) -> PartitionRouter:
        """Build the PartitionRouter for a retriever model.

        Handles three shapes of `partition_router`: a list (combined into a
        CartesianProductStreamSlicer), a raw dict (from a CustomRetrieverModel, built via
        nested-component creation), or a parsed model. Falls back to a
        SinglePartitionRouter when none is configured.
        """
        if (
            hasattr(model, "partition_router")
            and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel))
            and model.partition_router
        ):
            stream_slicer_model = model.partition_router
            if isinstance(stream_slicer_model, list):
                return CartesianProductStreamSlicer(
                    [
                        self._create_component_from_model(
                            model=slicer, config=config, stream_name=stream_name or ""
                        )
                        for slicer in stream_slicer_model
                    ],
                    parameters={},
                )
            elif isinstance(stream_slicer_model, dict):
                # partition router comes from CustomRetrieverModel therefore has not been parsed as a model
                params = stream_slicer_model.get("$parameters")
                if not isinstance(params, dict):
                    params = {}
                    stream_slicer_model["$parameters"] = params

                if stream_name is not None:
                    params["stream_name"] = stream_name

                return self._create_nested_component(  # type: ignore[no-any-return] # There is no guarantee that this will return a stream slicer. If not, we expect an AttributeError during the call to `stream_slices`
                    model,
                    "partition_router",
                    stream_slicer_model,
                    config,
                    **kwargs,
                )
            else:
                return self._create_component_from_model(  # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router
                    model=stream_slicer_model, config=config, stream_name=stream_name or ""
                )
        return SinglePartitionRouter(parameters={})
2256 # be as well as global cursor fall backs. We have not seen customers that have needed both 2257 # at the same time yet and are currently punting on this until we need to solve it. 2258 raise ValueError( 2259 f"The low-code framework does not currently support usage of a PartitionRouter and an IncrementingCountCursor at the same time. Please specify only one of these options for stream {stream_name}." 2260 ) 2261 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2262 state_manager=self._connector_state_manager, 2263 model_type=DatetimeBasedCursorModel, 2264 component_definition=model.incremental_sync.__dict__, 2265 stream_name=stream_name, 2266 stream_state=stream_state, 2267 stream_namespace=None, 2268 config=config or {}, 2269 partition_router=stream_slicer, 2270 attempt_to_create_cursor_if_not_provided=True, # FIXME can we remove that now? 2271 ) 2272 elif model.incremental_sync: 2273 if type(model.incremental_sync) == IncrementingCountCursorModel: 2274 return self.create_concurrent_cursor_from_incrementing_count_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2275 model_type=IncrementingCountCursorModel, 2276 component_definition=model.incremental_sync.__dict__, 2277 stream_name=stream_name, 2278 stream_namespace=None, 2279 stream_state=stream_state, 2280 config=config or {}, 2281 ) 2282 elif type(model.incremental_sync) == DatetimeBasedCursorModel: 2283 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2284 model_type=type(model.incremental_sync), 2285 component_definition=model.incremental_sync.__dict__, 2286 stream_name=stream_name, 2287 stream_namespace=None, 2288 stream_state=stream_state, 2289 config=config or {}, 2290 attempt_to_create_cursor_if_not_provided=True, 2291 ) 2292 else: 2293 raise ValueError( 2294 f"Incremental sync of type {type(model.incremental_sync)} is not supported" 2295 ) 2296 return FinalStateCursor(stream_name, None, self._message_repository) 2297 2298 def create_default_error_handler( 2299 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2300 ) -> DefaultErrorHandler: 2301 backoff_strategies = [] 2302 if model.backoff_strategies: 2303 for backoff_strategy_model in model.backoff_strategies: 2304 backoff_strategies.append( 2305 self._create_component_from_model(model=backoff_strategy_model, config=config) 2306 ) 2307 2308 response_filters = [] 2309 if model.response_filters: 2310 for response_filter_model in model.response_filters: 2311 response_filters.append( 2312 self._create_component_from_model(model=response_filter_model, config=config) 2313 ) 2314 response_filters.append( 2315 
    def create_default_paginator(
        self,
        model: DefaultPaginatorModel,
        config: Config,
        *,
        url_base: str,
        extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
        decoder: Optional[Decoder] = None,
        cursor_used_for_stop_condition: Optional[Cursor] = None,
    ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]:
        """Build a DefaultPaginator from its model.

        When `cursor_used_for_stop_condition` is supplied, the pagination strategy is
        wrapped so pagination stops once the cursor's stop condition is met. When a
        page-fetch limit is configured on the factory, the paginator is wrapped in a
        PaginatorTestReadDecorator.

        :raises ValueError: if `decoder` is not supported for pagination.
        """
        if decoder:
            if self._is_supported_decoder_for_pagination(decoder):
                decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
            else:
                raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder)))
        else:
            decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
        page_size_option = (
            self._create_component_from_model(model=model.page_size_option, config=config)
            if model.page_size_option
            else None
        )
        page_token_option = (
            self._create_component_from_model(model=model.page_token_option, config=config)
            if model.page_token_option
            else None
        )
        pagination_strategy = self._create_component_from_model(
            model=model.pagination_strategy,
            config=config,
            decoder=decoder_to_use,
            extractor_model=extractor_model,
        )
        if cursor_used_for_stop_condition:
            pagination_strategy = StopConditionPaginationStrategyDecorator(
                pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition)
            )
        paginator = DefaultPaginator(
            decoder=decoder_to_use,
            page_size_option=page_size_option,
            page_token_option=page_token_option,
            pagination_strategy=pagination_strategy,
            url_base=url_base,
            config=config,
            parameters=model.parameters or {},
        )
        if self._limit_pages_fetched_per_slice:
            return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice)
        return paginator
    def create_dpath_extractor(
        self,
        model: DpathExtractorModel,
        config: Config,
        decoder: Optional[Decoder] = None,
        **kwargs: Any,
    ) -> DpathExtractor:
        """Build a DpathExtractor; defaults to a JsonDecoder when no decoder is supplied."""
        if decoder:
            decoder_to_use = decoder
        else:
            decoder_to_use = JsonDecoder(parameters={})
        model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path]

        record_expander = None
        if model.record_expander:
            record_expander = self._create_component_from_model(
                model=model.record_expander,
                config=config,
            )

        return DpathExtractor(
            decoder=decoder_to_use,
            field_path=model_field_path,
            config=config,
            parameters=model.parameters or {},
            record_expander=record_expander,
        )

    def create_record_expander(
        self,
        model: RecordExpanderModel,
        config: Config,
        **kwargs: Any,
    ) -> RecordExpander:
        """Build a RecordExpander; `on_no_records` defaults to `skip` when unset."""
        return RecordExpander(
            expand_records_from_field=model.expand_records_from_field,
            config=config,
            parameters=model.parameters or {},
            remain_original_record=model.remain_original_record or False,
            on_no_records=OnNoRecords(model.on_no_records.value)
            if model.on_no_records
            else OnNoRecords.skip,
        )

    @staticmethod
    def create_response_to_file_extractor(
        model: ResponseToFileExtractorModel,
        **kwargs: Any,
    ) -> ResponseToFileExtractor:
        """Build a ResponseToFileExtractor from its model."""
        return ResponseToFileExtractor(parameters=model.parameters or {})

    @staticmethod
    def create_exponential_backoff_strategy(
        model: ExponentialBackoffStrategyModel, config: Config
    ) -> ExponentialBackoffStrategy:
        """Build an ExponentialBackoffStrategy; the factor defaults to 5 when unset."""
        return ExponentialBackoffStrategy(
            factor=model.factor or 5, parameters=model.parameters or {}, config=config
        )
    @staticmethod
    def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey:
        """Build a GroupByKey record-merge strategy from its declarative model."""
        return GroupByKey(model.key, config=config, parameters=model.parameters or {})

    def create_http_requester(
        self,
        model: HttpRequesterModel,
        config: Config,
        decoder: Decoder = JsonDecoder(parameters={}),
        query_properties_key: Optional[str] = None,
        use_cache: Optional[bool] = None,
        *,
        name: str,
    ) -> HttpRequester:
        """
        Build an HttpRequester, wiring its authenticator, error handler, API
        budget, and interpolated request options provider.

        Caching is enabled when either the model or the `use_cache` override
        requests it, unless caching is globally disabled on the factory.

        NOTE(review): the `decoder` default is a single JsonDecoder instance
        created at import time and shared by all calls that omit the argument —
        presumed safe because the decoder appears stateless, but confirm before
        adding state to JsonDecoder.
        """
        authenticator = (
            self._create_component_from_model(
                model=model.authenticator,
                config=config,
                url_base=model.url or model.url_base,
                name=name,
                decoder=decoder,
            )
            if model.authenticator
            else None
        )
        # Fall back to a bare DefaultErrorHandler (no retries/filters) when the
        # model declares none.
        error_handler = (
            self._create_component_from_model(model=model.error_handler, config=config)
            if model.error_handler
            else DefaultErrorHandler(
                backoff_strategies=[],
                response_filters=[],
                config=config,
                parameters=model.parameters or {},
            )
        )

        api_budget = self._api_budget

        request_options_provider = InterpolatedRequestOptionsProvider(
            request_body=model.request_body,
            request_body_data=model.request_body_data,
            request_body_json=model.request_body_json,
            request_headers=model.request_headers,
            request_parameters=model.request_parameters,  # type: ignore  # QueryProperties have been removed in `create_simple_retriever`
            query_properties_key=query_properties_key,
            config=config,
            parameters=model.parameters or {},
        )

        assert model.use_cache is not None  # for mypy
        assert model.http_method is not None  # for mypy

        # Either the model or the caller may opt in to caching; the factory-wide
        # kill switch always wins.
        should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache

        return HttpRequester(
            name=name,
            url=model.url,
            url_base=model.url_base,
            path=model.path,
            authenticator=authenticator,
            error_handler=error_handler,
            api_budget=api_budget,
            http_method=HttpMethod[model.http_method.value],
            request_options_provider=request_options_provider,
            config=config,
            disable_retries=self._disable_retries,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            use_cache=should_use_cache,
            decoder=decoder,
            stream_response=decoder.is_stream_response() if decoder else False,
        )

    @staticmethod
    def create_http_response_filter(
        model: HttpResponseFilterModel, config: Config, **kwargs: Any
    ) -> HttpResponseFilter:
        """Build an HttpResponseFilter, translating model enums to runtime enums."""
        if model.action:
            action = ResponseAction(model.action.value)
        else:
            action = None

        failure_type = FailureType(model.failure_type.value) if model.failure_type else None

        http_codes = (
            set(model.http_codes) if model.http_codes else set()
        )  # JSON schema notation has no set data type. The schema enforces an array of unique elements

        return HttpResponseFilter(
            action=action,
            failure_type=failure_type,
            error_message=model.error_message or "",
            error_message_contains=model.error_message_contains or "",
            http_codes=http_codes,
            predicate=model.predicate or "",
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_inline_schema_loader(
        model: InlineSchemaLoaderModel, config: Config, **kwargs: Any
    ) -> InlineSchemaLoader:
        """Build an InlineSchemaLoader carrying the schema embedded in the manifest."""
        return InlineSchemaLoader(schema=model.schema_ or {}, parameters={})

    def create_complex_field_type(
        self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any
    ) -> ComplexFieldType:
        """Build a ComplexFieldType, recursing when `items` is itself a complex type."""
        items = (
            self._create_component_from_model(model=model.items, config=config)
            if isinstance(model.items, ComplexFieldTypeModel)
            else model.items
        )

        return ComplexFieldType(field_type=model.field_type, items=items)

    def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap:
        """Build a TypesMap; an absent condition defaults to the always-true "True"."""
        target_type = (
            self._create_component_from_model(model=model.target_type, config=config)
            if isinstance(model.target_type, ComplexFieldTypeModel)
            else model.target_type
        )

        return TypesMap(
            target_type=target_type,
            current_type=model.current_type,
            condition=model.condition if model.condition is not None else "True",
        )
target_type = ( 2553 self._create_component_from_model(model=model.target_type, config=config) 2554 if isinstance(model.target_type, ComplexFieldTypeModel) 2555 else model.target_type 2556 ) 2557 2558 return TypesMap( 2559 target_type=target_type, 2560 current_type=model.current_type, 2561 condition=model.condition if model.condition is not None else "True", 2562 ) 2563 2564 def create_schema_type_identifier( 2565 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2566 ) -> SchemaTypeIdentifier: 2567 types_mapping = [] 2568 if model.types_mapping: 2569 types_mapping.extend( 2570 [ 2571 self._create_component_from_model(types_map, config=config) 2572 for types_map in model.types_mapping 2573 ] 2574 ) 2575 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2576 [x for x in model.schema_pointer] if model.schema_pointer else [] 2577 ) 2578 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2579 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2580 [x for x in model.type_pointer] if model.type_pointer else None 2581 ) 2582 2583 return SchemaTypeIdentifier( 2584 schema_pointer=model_schema_pointer, 2585 key_pointer=model_key_pointer, 2586 type_pointer=model_type_pointer, 2587 types_mapping=types_mapping, 2588 parameters=model.parameters or {}, 2589 ) 2590 2591 def create_dynamic_schema_loader( 2592 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2593 ) -> DynamicSchemaLoader: 2594 schema_transformations = [] 2595 if model.schema_transformations: 2596 for transformation_model in model.schema_transformations: 2597 schema_transformations.append( 2598 self._create_component_from_model(model=transformation_model, config=config) 2599 ) 2600 name = "dynamic_properties" 2601 retriever = self._create_component_from_model( 2602 model=model.retriever, 2603 config=config, 2604 name=name, 2605 primary_key=None, 2606 
partition_router=self._build_stream_slicer_from_partition_router( 2607 model.retriever, config 2608 ), 2609 transformations=[], 2610 use_cache=True, 2611 log_formatter=( 2612 lambda response: format_http_message( 2613 response, 2614 f"Schema loader '{name}' request", 2615 f"Request performed in order to extract schema.", 2616 name, 2617 is_auxiliary=True, 2618 ) 2619 ), 2620 ) 2621 schema_type_identifier = self._create_component_from_model( 2622 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2623 ) 2624 schema_filter = ( 2625 self._create_component_from_model( 2626 model.schema_filter, config=config, parameters=model.parameters or {} 2627 ) 2628 if model.schema_filter is not None 2629 else None 2630 ) 2631 2632 return DynamicSchemaLoader( 2633 retriever=retriever, 2634 config=config, 2635 schema_transformations=schema_transformations, 2636 schema_filter=schema_filter, 2637 schema_type_identifier=schema_type_identifier, 2638 parameters=model.parameters or {}, 2639 ) 2640 2641 @staticmethod 2642 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2643 return JsonDecoder(parameters={}) 2644 2645 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2646 return CompositeRawDecoder( 2647 parser=ModelToComponentFactory._get_parser(model, config), 2648 stream_response=False if self._emit_connector_builder_messages else True, 2649 ) 2650 2651 def create_jsonl_decoder( 2652 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2653 ) -> Decoder: 2654 return CompositeRawDecoder( 2655 parser=ModelToComponentFactory._get_parser(model, config), 2656 stream_response=False if self._emit_connector_builder_messages else True, 2657 ) 2658 2659 def create_gzip_decoder( 2660 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2661 ) -> Decoder: 2662 _compressed_response_types = { 2663 "gzip", 2664 "x-gzip", 2665 "gzip, deflate", 2666 "x-gzip, deflate", 
2667 "application/zip", 2668 "application/gzip", 2669 "application/x-gzip", 2670 "application/x-zip-compressed", 2671 } 2672 2673 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2674 2675 if self._emit_connector_builder_messages: 2676 # This is very surprising but if the response is not streamed, 2677 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2678 # which uses urllib3 directly and does not uncompress the data. 2679 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2680 2681 return CompositeRawDecoder.by_headers( 2682 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2683 stream_response=True, 2684 fallback_parser=gzip_parser.inner_parser, 2685 ) 2686 2687 @staticmethod 2688 def create_iterable_decoder( 2689 model: IterableDecoderModel, config: Config, **kwargs: Any 2690 ) -> IterableDecoder: 2691 return IterableDecoder(parameters={}) 2692 2693 @staticmethod 2694 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2695 return XmlDecoder(parameters={}) 2696 2697 def create_zipfile_decoder( 2698 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2699 ) -> ZipfileDecoder: 2700 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2701 2702 @staticmethod 2703 def _get_parser(model: BaseModel, config: Config) -> Parser: 2704 if isinstance(model, JsonDecoderModel): 2705 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2706 return JsonParser() 2707 elif isinstance(model, JsonlDecoderModel): 2708 return JsonLineParser() 2709 elif isinstance(model, CsvDecoderModel): 2710 return CsvParser( 2711 encoding=model.encoding, 2712 delimiter=model.delimiter, 2713 
set_values_to_none=model.set_values_to_none, 2714 ) 2715 elif isinstance(model, GzipDecoderModel): 2716 return GzipParser( 2717 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2718 ) 2719 elif isinstance( 2720 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2721 ): 2722 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2723 2724 raise ValueError(f"Unknown decoder type {model}") 2725 2726 @staticmethod 2727 def create_json_file_schema_loader( 2728 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2729 ) -> JsonFileSchemaLoader: 2730 return JsonFileSchemaLoader( 2731 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2732 ) 2733 2734 def create_jwt_authenticator( 2735 self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2736 ) -> JwtAuthenticator: 2737 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2738 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2739 request_option = ( 2740 self._create_component_from_model(model.request_option, config) 2741 if model.request_option 2742 else None 2743 ) 2744 return JwtAuthenticator( 2745 config=config, 2746 parameters=model.parameters or {}, 2747 algorithm=JwtAlgorithm(model.algorithm.value), 2748 secret_key=model.secret_key, 2749 base64_encode_secret_key=model.base64_encode_secret_key, 2750 token_duration=model.token_duration, 2751 header_prefix=model.header_prefix, 2752 kid=jwt_headers.kid, 2753 typ=jwt_headers.typ, 2754 cty=jwt_headers.cty, 2755 iss=jwt_payload.iss, 2756 sub=jwt_payload.sub, 2757 aud=jwt_payload.aud, 2758 additional_jwt_headers=model.additional_jwt_headers, 2759 additional_jwt_payload=model.additional_jwt_payload, 2760 passphrase=model.passphrase, 2761 request_option=request_option, 2762 ) 2763 2764 def create_list_partition_router( 2765 self, model: ListPartitionRouterModel, config: Config, 
    @staticmethod
    def create_min_max_datetime(
        model: MinMaxDatetimeModel, config: Config, **kwargs: Any
    ) -> MinMaxDatetime:
        """Build a MinMaxDatetime; unset bounds/format default to empty strings."""
        return MinMaxDatetime(
            datetime=model.datetime,
            datetime_format=model.datetime_format or "",
            max_datetime=model.max_datetime or "",
            min_datetime=model.min_datetime or "",
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth:
        """Build a NoAuth authenticator (no credentials attached to requests)."""
        return NoAuth(parameters=model.parameters or {})

    @staticmethod
    def create_no_pagination(
        model: NoPaginationModel, config: Config, **kwargs: Any
    ) -> NoPagination:
        """Build a NoPagination paginator (single-page responses)."""
        return NoPagination(parameters={})

    def create_oauth_authenticator(
        self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any
    ) -> DeclarativeOauth2Authenticator:
        """
        Build an OAuth2 authenticator.

        Two shapes are produced:
        * with a `refresh_token_updater`, a single-use refresh-token
          authenticator whose string fields are interpolated and eagerly
          evaluated against the config (token state lives in config paths);
        * otherwise, a standard DeclarativeOauth2Authenticator that keeps the
          raw (lazily interpolated) model values.

        Refresh-token error matching fields are resolved once via
        _get_refresh_token_error_information and used by both shapes.
        """
        profile_assertion = (
            self._create_component_from_model(model.profile_assertion, config=config)
            if model.profile_assertion
            else None
        )

        refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values = (
            self._get_refresh_token_error_information(model)
        )
        if model.refresh_token_updater:
            # ignore type error because fixing it would have a lot of dependencies, revisit later
            return DeclarativeSingleUseRefreshTokenOauth2Authenticator(  # type: ignore
                config,
                InterpolatedString.create(
                    model.token_refresh_endpoint,  # type: ignore
                    parameters=model.parameters or {},
                ).eval(config),
                access_token_name=InterpolatedString.create(
                    model.access_token_name or "access_token", parameters=model.parameters or {}
                ).eval(config),
                refresh_token_name=model.refresh_token_updater.refresh_token_name,
                expires_in_name=InterpolatedString.create(
                    model.expires_in_name or "expires_in", parameters=model.parameters or {}
                ).eval(config),
                client_id_name=InterpolatedString.create(
                    model.client_id_name or "client_id", parameters=model.parameters or {}
                ).eval(config),
                client_id=InterpolatedString.create(
                    model.client_id, parameters=model.parameters or {}
                ).eval(config)
                if model.client_id
                else model.client_id,
                client_secret_name=InterpolatedString.create(
                    model.client_secret_name or "client_secret", parameters=model.parameters or {}
                ).eval(config),
                client_secret=InterpolatedString.create(
                    model.client_secret, parameters=model.parameters or {}
                ).eval(config)
                if model.client_secret
                else model.client_secret,
                access_token_config_path=model.refresh_token_updater.access_token_config_path,
                refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path,
                token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path,
                grant_type_name=InterpolatedString.create(
                    model.grant_type_name or "grant_type", parameters=model.parameters or {}
                ).eval(config),
                grant_type=InterpolatedString.create(
                    model.grant_type or "refresh_token", parameters=model.parameters or {}
                ).eval(config),
                refresh_request_body=InterpolatedMapping(
                    model.refresh_request_body or {}, parameters=model.parameters or {}
                ).eval(config),
                refresh_request_headers=InterpolatedMapping(
                    model.refresh_request_headers or {}, parameters=model.parameters or {}
                ).eval(config),
                scopes=model.scopes,
                token_expiry_date_format=model.token_expiry_date_format,
                token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
                message_repository=self._message_repository,
                refresh_token_error_status_codes=refresh_token_error_status_codes,
                refresh_token_error_key=refresh_token_error_key,
                refresh_token_error_values=refresh_token_error_values,
            )
        # ignore type error because fixing it would have a lot of dependencies, revisit later
        return DeclarativeOauth2Authenticator(  # type: ignore
            access_token_name=model.access_token_name or "access_token",
            access_token_value=model.access_token_value,
            client_id_name=model.client_id_name or "client_id",
            client_id=model.client_id,
            client_secret_name=model.client_secret_name or "client_secret",
            client_secret=model.client_secret,
            expires_in_name=model.expires_in_name or "expires_in",
            grant_type_name=model.grant_type_name or "grant_type",
            grant_type=model.grant_type or "refresh_token",
            refresh_request_body=model.refresh_request_body,
            refresh_request_headers=model.refresh_request_headers,
            refresh_token_name=model.refresh_token_name or "refresh_token",
            refresh_token=model.refresh_token,
            scopes=model.scopes,
            token_expiry_date=model.token_expiry_date,
            token_expiry_date_format=model.token_expiry_date_format,
            token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
            token_refresh_endpoint=model.token_refresh_endpoint,
            config=config,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            profile_assertion=profile_assertion,
            use_profile_assertion=model.use_profile_assertion,
            refresh_token_error_status_codes=refresh_token_error_status_codes,
            refresh_token_error_key=refresh_token_error_key,
            refresh_token_error_values=refresh_token_error_values,
        )

    @staticmethod
    def _get_refresh_token_error_information(
        model: OAuthAuthenticatorModel,
    ) -> Tuple[Tuple[int, ...], str, Tuple[str, ...]]:
        """
        In a previous version of the CDK, the auth error as config_error was only done if a refresh token updater was
        defined. As a transition, we added those fields on the OAuthAuthenticatorModel. This method ensures that the
        information is defined only once and return the right fields.

        Returns (status_codes, error_key, error_values); when neither location
        defines them, defaults presumed to cover most providers are returned.
        """
        refresh_token_updater = model.refresh_token_updater
        is_defined_on_refresh_token_updated = refresh_token_updater and (
            refresh_token_updater.refresh_token_error_status_codes
            or refresh_token_updater.refresh_token_error_key
            or refresh_token_updater.refresh_token_error_values
        )
        is_defined_on_oauth_authenticator = (
            model.refresh_token_error_status_codes
            or model.refresh_token_error_key
            or model.refresh_token_error_values
        )
        if is_defined_on_refresh_token_updated and is_defined_on_oauth_authenticator:
            raise ValueError(
                "refresh_token_error should either be defined on the OAuthAuthenticatorModel or the RefreshTokenUpdaterModel, not both"
            )

        if is_defined_on_refresh_token_updated:
            not_optional_refresh_token_updater: RefreshTokenUpdaterModel = refresh_token_updater  # type: ignore  # we know from the condition that this is not None
            return (
                tuple(not_optional_refresh_token_updater.refresh_token_error_status_codes)
                if not_optional_refresh_token_updater.refresh_token_error_status_codes
                else (),
                not_optional_refresh_token_updater.refresh_token_error_key or "",
                tuple(not_optional_refresh_token_updater.refresh_token_error_values)
                if not_optional_refresh_token_updater.refresh_token_error_values
                else (),
            )
        elif is_defined_on_oauth_authenticator:
            return (
                tuple(model.refresh_token_error_status_codes)
                if model.refresh_token_error_status_codes
                else (),
                model.refresh_token_error_key or "",
                tuple(model.refresh_token_error_values) if model.refresh_token_error_values else (),
            )

        # returning default values we think cover most cases
        return (400,), "error", ("invalid_grant", "invalid_permissions")
returning default values we think cover most cases 2944 return (400,), "error", ("invalid_grant", "invalid_permissions") 2945 2946 def create_offset_increment( 2947 self, 2948 model: OffsetIncrementModel, 2949 config: Config, 2950 decoder: Decoder, 2951 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2952 **kwargs: Any, 2953 ) -> OffsetIncrement: 2954 if isinstance(decoder, PaginationDecoderDecorator): 2955 inner_decoder = decoder.decoder 2956 else: 2957 inner_decoder = decoder 2958 decoder = PaginationDecoderDecorator(decoder=decoder) 2959 2960 if self._is_supported_decoder_for_pagination(inner_decoder): 2961 decoder_to_use = decoder 2962 else: 2963 raise ValueError( 2964 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2965 ) 2966 2967 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2968 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2969 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2970 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2971 # When we have more time to investigate we can look into reusing the same component. 2972 extractor = ( 2973 self._create_component_from_model( 2974 model=extractor_model, config=config, decoder=decoder_to_use 2975 ) 2976 if extractor_model 2977 else None 2978 ) 2979 2980 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 2981 # If page_size is a string that represents an integer (not an interpolation), convert it back. 
2982 page_size = model.page_size 2983 if isinstance(page_size, str) and page_size.isdigit(): 2984 page_size = int(page_size) 2985 2986 return OffsetIncrement( 2987 page_size=page_size, 2988 config=config, 2989 decoder=decoder_to_use, 2990 extractor=extractor, 2991 inject_on_first_request=model.inject_on_first_request or False, 2992 parameters=model.parameters or {}, 2993 ) 2994 2995 @staticmethod 2996 def create_page_increment( 2997 model: PageIncrementModel, config: Config, **kwargs: Any 2998 ) -> PageIncrement: 2999 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 3000 # If page_size is a string that represents an integer (not an interpolation), convert it back. 3001 page_size = model.page_size 3002 if isinstance(page_size, str) and page_size.isdigit(): 3003 page_size = int(page_size) 3004 3005 return PageIncrement( 3006 page_size=page_size, 3007 config=config, 3008 start_from_page=model.start_from_page or 0, 3009 inject_on_first_request=model.inject_on_first_request or False, 3010 parameters=model.parameters or {}, 3011 ) 3012 3013 def create_parent_stream_config( 3014 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 3015 ) -> ParentStreamConfig: 3016 declarative_stream = self._create_component_from_model( 3017 model.stream, 3018 config=config, 3019 is_parent=True, 3020 **kwargs, 3021 ) 3022 request_option = ( 3023 self._create_component_from_model(model.request_option, config=config) 3024 if model.request_option 3025 else None 3026 ) 3027 3028 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 3029 raise ValueError( 3030 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 
3031 ) 3032 3033 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 3034 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 3035 ) 3036 3037 return ParentStreamConfig( 3038 parent_key=model.parent_key, 3039 request_option=request_option, 3040 stream=declarative_stream, 3041 partition_field=model.partition_field, 3042 config=config, 3043 incremental_dependency=model.incremental_dependency or False, 3044 parameters=model.parameters or {}, 3045 extra_fields=model.extra_fields, 3046 lazy_read_pointer=model_lazy_read_pointer, 3047 ) 3048 3049 def create_properties_from_endpoint( 3050 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 3051 ) -> PropertiesFromEndpoint: 3052 retriever = self._create_component_from_model( 3053 model=model.retriever, 3054 config=config, 3055 name="dynamic_properties", 3056 primary_key=None, 3057 stream_slicer=None, 3058 transformations=[], 3059 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 3060 ) 3061 return PropertiesFromEndpoint( 3062 property_field_path=model.property_field_path, 3063 retriever=retriever, 3064 config=config, 3065 parameters=model.parameters or {}, 3066 ) 3067 3068 def create_property_chunking( 3069 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 3070 ) -> PropertyChunking: 3071 record_merge_strategy = ( 3072 self._create_component_from_model( 3073 model=model.record_merge_strategy, config=config, **kwargs 3074 ) 3075 if model.record_merge_strategy 3076 else None 3077 ) 3078 3079 property_limit_type: PropertyLimitType 3080 match model.property_limit_type: 3081 case PropertyLimitTypeModel.property_count: 3082 property_limit_type = PropertyLimitType.property_count 3083 case PropertyLimitTypeModel.characters: 3084 property_limit_type = PropertyLimitType.characters 3085 case _: 3086 raise 
ValueError(f"Invalid PropertyLimitType {property_limit_type}") 3087 3088 return PropertyChunking( 3089 property_limit_type=property_limit_type, 3090 property_limit=model.property_limit, 3091 record_merge_strategy=record_merge_strategy, 3092 config=config, 3093 parameters=model.parameters or {}, 3094 ) 3095 3096 def create_query_properties( 3097 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 3098 ) -> QueryProperties: 3099 if isinstance(model.property_list, list): 3100 property_list = model.property_list 3101 else: 3102 property_list = self._create_component_from_model( 3103 model=model.property_list, config=config, **kwargs 3104 ) 3105 3106 property_chunking = ( 3107 self._create_component_from_model( 3108 model=model.property_chunking, config=config, **kwargs 3109 ) 3110 if model.property_chunking 3111 else None 3112 ) 3113 3114 property_selector = ( 3115 self._create_component_from_model( 3116 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3117 ) 3118 if model.property_selector 3119 else None 3120 ) 3121 3122 return QueryProperties( 3123 property_list=property_list, 3124 always_include_properties=model.always_include_properties, 3125 property_chunking=property_chunking, 3126 property_selector=property_selector, 3127 config=config, 3128 parameters=model.parameters or {}, 3129 ) 3130 3131 def create_json_schema_property_selector( 3132 self, 3133 model: JsonSchemaPropertySelectorModel, 3134 config: Config, 3135 *, 3136 stream_name: str, 3137 **kwargs: Any, 3138 ) -> JsonSchemaPropertySelector: 3139 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3140 3141 transformations = [] 3142 if model.transformations: 3143 for transformation_model in model.transformations: 3144 transformations.append( 3145 self._create_component_from_model(model=transformation_model, config=config) 3146 ) 3147 3148 return JsonSchemaPropertySelector( 3149 configured_stream=configured_stream, 3150 
    @staticmethod
    def create_record_filter(
        model: RecordFilterModel, config: Config, **kwargs: Any
    ) -> RecordFilter:
        """Build a RecordFilter evaluating the model's interpolated condition."""
        return RecordFilter(
            condition=model.condition or "", config=config, parameters=model.parameters or {}
        )

    @staticmethod
    def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath:
        """Build a RequestPath marker (token injected into the request path)."""
        return RequestPath(parameters={})

    @staticmethod
    def create_request_option(
        model: RequestOptionModel, config: Config, **kwargs: Any
    ) -> RequestOption:
        """
        Build a RequestOption, interpolating field_name/field_path with the
        caller-provided `parameters` kwarg (empty mapping when absent).
        """
        inject_into = RequestOptionType(model.inject_into.value)
        field_path: Optional[List[Union[InterpolatedString, str]]] = (
            [
                InterpolatedString.create(segment, parameters=kwargs.get("parameters", {}))
                for segment in model.field_path
            ]
            if model.field_path
            else None
        )
        field_name = (
            InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {}))
            if model.field_name
            else None
        )
        return RequestOption(
            field_name=field_name,
            field_path=field_path,
            inject_into=inject_into,
            parameters=kwargs.get("parameters", {}),
        )

    def create_record_selector(
        self,
        model: RecordSelectorModel,
        config: Config,
        *,
        name: str,
        transformations: List[RecordTransformation] | None = None,
        decoder: Decoder | None = None,
        client_side_incremental_sync_cursor: Optional[Cursor] = None,
        file_uploader: Optional[DefaultFileUploader] = None,
        **kwargs: Any,
    ) -> RecordSelector:
        """
        Build a RecordSelector (extractor + optional filter + transformations +
        schema normalization).

        When a client-side incremental cursor is given, the declared record
        filter is replaced by a decorator that also drops records outside the
        cursor window, and transform-before-filtering flips its default to True
        so cursor fields produced by transformations are available to the filter.

        NOTE: mutates `model.schema_normalization` in place to default it.
        """
        extractor = self._create_component_from_model(
            model=model.extractor, decoder=decoder, config=config
        )
        record_filter = (
            self._create_component_from_model(model.record_filter, config=config)
            if model.record_filter
            else None
        )

        transform_before_filtering = (
            False if model.transform_before_filtering is None else model.transform_before_filtering
        )
        if client_side_incremental_sync_cursor:
            record_filter = ClientSideIncrementalRecordFilterDecorator(
                config=config,
                parameters=model.parameters,
                condition=model.record_filter.condition
                if (model.record_filter and hasattr(model.record_filter, "condition"))
                else None,
                cursor=client_side_incremental_sync_cursor,
            )
            transform_before_filtering = (
                True
                if model.transform_before_filtering is None
                else model.transform_before_filtering
            )

        if model.schema_normalization is None:
            # default to no schema normalization if not set
            model.schema_normalization = SchemaNormalizationModel.None_

        schema_normalization = (
            TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization])
            if isinstance(model.schema_normalization, SchemaNormalizationModel)
            else self._create_component_from_model(model.schema_normalization, config=config)  # type: ignore[arg-type]  # custom normalization model expected here
        )

        return RecordSelector(
            extractor=extractor,
            name=name,
            config=config,
            record_filter=record_filter,
            transformations=transformations or [],
            file_uploader=file_uploader,
            schema_normalization=schema_normalization,
            parameters=model.parameters or {},
            transform_before_filtering=transform_before_filtering,
        )

    @staticmethod
    def create_remove_fields(
        model: RemoveFieldsModel, config: Config, **kwargs: Any
    ) -> RemoveFields:
        """Build a RemoveFields transformation from its declarative model."""
        return RemoveFields(
            field_pointers=model.field_pointers, condition=model.condition or "", parameters={}
        )

    def create_selective_authenticator(
        self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any
    ) -> DeclarativeAuthenticator:
        """Build a SelectiveAuthenticator that picks one of several named authenticators via a config path."""
        authenticators = {
            name: self._create_component_from_model(model=auth, config=config)
            for name, auth in model.authenticators.items()
        }
        # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error
        return SelectiveAuthenticator(  # type: ignore[abstract]
            config=config,
            authenticators=authenticators,
            authenticator_selection_path=model.authenticator_selection_path,
            **kwargs,
        )
self._create_component_from_model(model=auth, config=config) 3266 for name, auth in model.authenticators.items() 3267 } 3268 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 3269 return SelectiveAuthenticator( # type: ignore[abstract] 3270 config=config, 3271 authenticators=authenticators, 3272 authenticator_selection_path=model.authenticator_selection_path, 3273 **kwargs, 3274 ) 3275 3276 @staticmethod 3277 def create_legacy_session_token_authenticator( 3278 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3279 ) -> LegacySessionTokenAuthenticator: 3280 return LegacySessionTokenAuthenticator( 3281 api_url=url_base, 3282 header=model.header, 3283 login_url=model.login_url, 3284 password=model.password or "", 3285 session_token=model.session_token or "", 3286 session_token_response_key=model.session_token_response_key or "", 3287 username=model.username or "", 3288 validate_session_url=model.validate_session_url, 3289 config=config, 3290 parameters=model.parameters or {}, 3291 ) 3292 3293 def create_simple_retriever( 3294 self, 3295 model: SimpleRetrieverModel, 3296 config: Config, 3297 *, 3298 name: str, 3299 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3300 request_options_provider: Optional[RequestOptionsProvider] = None, 3301 cursor: Optional[Cursor] = None, 3302 has_stop_condition_cursor: bool = False, 3303 is_client_side_incremental_sync: bool = False, 3304 transformations: List[RecordTransformation], 3305 file_uploader: Optional[DefaultFileUploader] = None, 3306 incremental_sync: Optional[ 3307 Union[IncrementingCountCursorModel, DatetimeBasedCursorModel] 3308 ] = None, 3309 use_cache: Optional[bool] = None, 3310 log_formatter: Optional[Callable[[Response], Any]] = None, 3311 partition_router: Optional[PartitionRouter] = None, 3312 **kwargs: Any, 3313 ) -> SimpleRetriever: 3314 def _get_url(req: Requester) -> str: 3315 """ 3316 Closure to get the 
URL from the requester. This is used to get the URL in the case of a lazy retriever.
            This is needed because the URL is not set until the requester is created.
            """

            # Prefer the value declared on the model; fall back to the instantiated requester.
            _url: str = (
                model.requester.url
                if hasattr(model.requester, "url") and model.requester.url is not None
                else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
            )
            _url_base: str = (
                model.requester.url_base
                if hasattr(model.requester, "url_base") and model.requester.url_base is not None
                else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None)
            )

            return _url or _url_base

        if cursor is None:
            cursor = FinalStateCursor(name, None, self._message_repository)

        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        record_selector = self._create_component_from_model(
            model=model.record_selector,
            name=name,
            config=config,
            decoder=decoder,
            transformations=transformations,
            client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None,
            file_uploader=file_uploader,
        )

        query_properties: Optional[QueryProperties] = None
        query_properties_key: Optional[str] = None
        self._ensure_query_properties_to_model(model.requester)
        if self._has_query_properties_in_request_parameters(model.requester):
            # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple
            # places instead of default to request_parameters which isn't clearly documented
            if (
                hasattr(model.requester, "fetch_properties_from_endpoint")
                and model.requester.fetch_properties_from_endpoint
            ):
                raise ValueError(
                    f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
                )

            query_properties_definitions = []
            for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters()
                if isinstance(request_parameter, QueryPropertiesModel):
                    query_properties_key = key
                    query_properties_definitions.append(request_parameter)

            if len(query_properties_definitions) > 1:
                raise ValueError(
                    f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
                )

            if len(query_properties_definitions) == 1:
                query_properties = self._create_component_from_model(
                    model=query_properties_definitions[0], stream_name=name, config=config
                )

            # Removes QueryProperties components from the interpolated mappings because it has been designed
            # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly
            # instead of through jinja interpolation
            if hasattr(model.requester, "request_parameters") and isinstance(
                model.requester.request_parameters, Mapping
            ):
                model.requester.request_parameters = self._remove_query_properties(
                    model.requester.request_parameters
                )
        elif (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            # todo: Deprecate this condition once dependent connectors migrate to query_properties
            query_properties_definition = QueryPropertiesModel(
                type="QueryProperties",
                property_list=model.requester.fetch_properties_from_endpoint,
                always_include_properties=None,
                property_chunking=None,
            )  # type: ignore # $parameters has a default value

            query_properties = self.create_query_properties(
                model=query_properties_definition,
                stream_name=name,
                config=config,
            )
        elif hasattr(model.requester, "query_properties") and model.requester.query_properties:
            query_properties = self.create_query_properties(
                model=model.requester.query_properties,
                stream_name=name,
                config=config,
            )

        requester = self._create_component_from_model(
            model=model.requester,
            decoder=decoder,
            name=name,
            query_properties_key=query_properties_key,
            use_cache=use_cache,
            config=config,
        )

        if not request_options_provider:
            request_options_provider = DefaultRequestOptionsProvider(parameters={})
        # A partition router that provides request options takes precedence over the default provider.
        if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(
            partition_router, PartitionRouter
        ):
            request_options_provider = partition_router

        paginator = (
            self._create_component_from_model(
                model=model.paginator,
                config=config,
                url_base=_get_url(requester),
                extractor_model=model.record_selector.extractor,
                decoder=decoder,
                cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None,
            )
            if model.paginator
            else NoPagination(parameters={})
        )

        ignore_stream_slicer_parameters_on_paginated_requests = (
            model.ignore_stream_slicer_parameters_on_paginated_requests or False
        )

        # Lazy-read path: a substream with lazy_read_pointer configured and no prior state
        # is served by a LazySimpleRetriever, which only supports a restricted configuration.
        if (
            model.partition_router
            and isinstance(model.partition_router, SubstreamPartitionRouterModel)
            and not bool(self._connector_state_manager.get_stream_state(name, None))
            and any(
                parent_stream_config.lazy_read_pointer
                for parent_stream_config in model.partition_router.parent_stream_configs
            )
        ):
            if incremental_sync:
                if incremental_sync.type != "DatetimeBasedCursor":
                    raise ValueError(
                        f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                    )

                elif incremental_sync.step or incremental_sync.cursor_granularity:
                    raise ValueError(
                        f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                    )

            if model.decoder and model.decoder.type != "JsonDecoder":
                raise ValueError(
                    f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
                )

            return LazySimpleRetriever(
                name=name,
                paginator=paginator,
                primary_key=primary_key,
                requester=requester,
                record_selector=record_selector,
                stream_slicer=_NO_STREAM_SLICING,
                request_option_provider=request_options_provider,
                config=config,
                ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
                parameters=model.parameters or {},
            )

        if (
            model.record_selector.record_filter
            and model.pagination_reset
            and model.pagination_reset.limits
        ):
            raise ValueError("PaginationResetLimits are not supported while having record filter.")

        return SimpleRetriever(
            name=name,
            paginator=paginator,
            primary_key=primary_key,
            requester=requester,
            record_selector=record_selector,
            stream_slicer=_NO_STREAM_SLICING,
            request_option_provider=request_options_provider,
            config=config,
            ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
            additional_query_properties=query_properties,
            log_formatter=self._get_log_formatter(log_formatter, name),
            pagination_tracker_factory=self._create_pagination_tracker_factory(
                model.pagination_reset, cursor
            ),
            parameters=model.parameters or {},
        )

    def _create_pagination_tracker_factory(
        self, model: Optional[PaginationResetModel], cursor: Cursor
    ) -> Callable[[], PaginationTracker]:
        """Return a factory producing PaginationTracker instances configured from the PaginationReset model."""
        if model is None:
            return lambda: PaginationTracker()
3516 # Until we figure out a way to use any cursor for PaginationTracker, we will have to have this cursor selector logic 3517 cursor_factory: Callable[[], Optional[ConcurrentCursor]] = lambda: None 3518 if model.action == PaginationResetActionModel.RESET: 3519 # in that case, we will let cursor_factory to return None even if the stream has a cursor 3520 pass 3521 elif model.action == PaginationResetActionModel.SPLIT_USING_CURSOR: 3522 if isinstance(cursor, ConcurrentCursor): 3523 cursor_factory = lambda: cursor.copy_without_state() # type: ignore # the if condition validates that it is a ConcurrentCursor 3524 elif isinstance(cursor, ConcurrentPerPartitionCursor): 3525 cursor_factory = lambda: cursor._cursor_factory.create( # type: ignore # if this becomes a problem, we would need to extract the cursor_factory instantiation logic and make it accessible here 3526 {}, datetime.timedelta(0) 3527 ) 3528 elif not isinstance(cursor, FinalStateCursor): 3529 LOGGER.warning( 3530 "Unknown cursor for PaginationTracker. Pagination resets might not work properly" 3531 ) 3532 else: 3533 raise ValueError(f"Unknown PaginationReset action: {model.action}") 3534 3535 limit = model.limits.number_of_records if model and model.limits else None 3536 return lambda: PaginationTracker(cursor_factory(), limit) 3537 3538 def _get_log_formatter( 3539 self, log_formatter: Callable[[Response], Any] | None, name: str 3540 ) -> Callable[[Response], Any] | None: 3541 if self._should_limit_slices_fetched(): 3542 return ( 3543 ( 3544 lambda response: format_http_message( 3545 response, 3546 f"Stream '{name}' request", 3547 f"Request performed in order to extract records for stream '{name}'", 3548 name, 3549 ) 3550 ) 3551 if not log_formatter 3552 else log_formatter 3553 ) 3554 return None 3555 3556 def _should_limit_slices_fetched(self) -> bool: 3557 """ 3558 Returns True if the number of slices fetched should be limited, False otherwise. 
        This is used to limit the number of slices fetched during tests.
        """
        return bool(self._limit_slices_fetched or self._emit_connector_builder_messages)

    @staticmethod
    def _has_query_properties_in_request_parameters(
        requester: Union[HttpRequesterModel, CustomRequesterModel],
    ) -> bool:
        """Return True if any value in the requester's request_parameters mapping is a QueryPropertiesModel."""
        if not hasattr(requester, "request_parameters"):
            return False
        request_parameters = requester.request_parameters
        if request_parameters and isinstance(request_parameters, Mapping):
            for request_parameter in request_parameters.values():
                if isinstance(request_parameter, QueryPropertiesModel):
                    return True
        return False

    @staticmethod
    def _remove_query_properties(
        request_parameters: Mapping[str, Union[str, QueryPropertiesModel]],
    ) -> Mapping[str, str]:
        """Return request_parameters with all QueryPropertiesModel entries filtered out."""
        return {
            parameter_field: request_parameter
            for parameter_field, request_parameter in request_parameters.items()
            if not isinstance(request_parameter, QueryPropertiesModel)
        }

    def create_state_delegating_stream(
        self,
        model: StateDelegatingStreamModel,
        config: Config,
        **kwargs: Any,
    ) -> DefaultStream:
        """Build either the full-refresh or the incremental underlying stream, selected by existing state
        (and, when api_retention_period is set, by whether the stored cursor is still within retention)."""
        if (
            model.full_refresh_stream.name != model.name
            or model.name != model.incremental_stream.name
        ):
            raise ValueError(
                f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
3598 ) 3599 3600 # Resolve api_retention_period with config context (supports Jinja2 interpolation) 3601 resolved_retention_period: Optional[str] = None 3602 if model.api_retention_period: 3603 interpolated_retention = InterpolatedString.create( 3604 model.api_retention_period, parameters=model.parameters or {} 3605 ) 3606 resolved_value = interpolated_retention.eval(config=config) 3607 if resolved_value: 3608 resolved_retention_period = str(resolved_value) 3609 3610 if resolved_retention_period: 3611 for stream_model in (model.full_refresh_stream, model.incremental_stream): 3612 if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel): 3613 raise ValueError( 3614 f"Stream '{model.name}' uses IncrementingCountCursor which is not supported " 3615 f"with api_retention_period. IncrementingCountCursor does not use datetime-based " 3616 f"cursors, so cursor age validation cannot be performed." 3617 ) 3618 3619 stream_state = self._connector_state_manager.get_stream_state(model.name, None) 3620 3621 if not stream_state: 3622 return self._create_component_from_model( # type: ignore[no-any-return] 3623 model.full_refresh_stream, config=config, **kwargs 3624 ) 3625 3626 incremental_stream: DefaultStream = self._create_component_from_model( 3627 model.incremental_stream, config=config, **kwargs 3628 ) # type: ignore[assignment] 3629 3630 # Only run cursor age validation for streams that are in the configured 3631 # catalog (or when no catalog was provided, e.g. during discover / connector 3632 # builder). Streams not selected by the user but instantiated as parent-stream 3633 # dependencies must not go through this path because it emits state messages 3634 # that the destination does not know about, causing "Stream not found" crashes. 
3635 stream_is_in_catalog = ( 3636 not self._stream_name_to_configured_stream # no catalog → validate by default 3637 or model.name in self._stream_name_to_configured_stream 3638 ) 3639 if resolved_retention_period and stream_is_in_catalog: 3640 full_refresh_stream: DefaultStream = self._create_component_from_model( 3641 model.full_refresh_stream, config=config, **kwargs 3642 ) # type: ignore[assignment] 3643 if self._is_cursor_older_than_retention_period( 3644 stream_state, 3645 full_refresh_stream.cursor, 3646 incremental_stream.cursor, 3647 resolved_retention_period, 3648 model.name, 3649 ): 3650 # Clear state BEFORE constructing the full_refresh_stream so that 3651 # its cursor starts from start_date instead of the stale cursor. 3652 self._connector_state_manager.update_state_for_stream(model.name, None, {}) 3653 state_message = self._connector_state_manager.create_state_message(model.name, None) 3654 self._message_repository.emit_message(state_message) 3655 return self._create_component_from_model( # type: ignore[no-any-return] 3656 model.full_refresh_stream, config=config, **kwargs 3657 ) 3658 3659 return incremental_stream 3660 3661 @staticmethod 3662 def _is_cursor_older_than_retention_period( 3663 stream_state: Mapping[str, Any], 3664 full_refresh_cursor: Cursor, 3665 incremental_cursor: Cursor, 3666 api_retention_period: str, 3667 stream_name: str, 3668 ) -> bool: 3669 """Check if the cursor value in the state is older than the API's retention period. 3670 3671 Checks cursors in sequence: full refresh cursor first, then incremental cursor. 3672 FinalStateCursor returns now() for completed full refresh state (NO_CURSOR_STATE_KEY), 3673 which is always within retention, so we use incremental. For other states, it returns 3674 None and we fall back to checking the incremental cursor. 3675 3676 Returns True if the cursor is older than the retention period (should use full refresh). 
3677 Returns False if the cursor is within the retention period (safe to use incremental). 3678 """ 3679 retention_duration = parse_duration(api_retention_period) 3680 retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration 3681 3682 # Check full refresh cursor first 3683 cursor_datetime = full_refresh_cursor.get_cursor_datetime_from_state(stream_state) 3684 3685 # If full refresh cursor returns None, check incremental cursor 3686 if cursor_datetime is None: 3687 cursor_datetime = incremental_cursor.get_cursor_datetime_from_state(stream_state) 3688 3689 if cursor_datetime is None: 3690 # Neither cursor could parse the state - fall back to full refresh to be safe 3691 return True 3692 3693 if cursor_datetime < retention_cutoff: 3694 logging.warning( 3695 f"Stream '{stream_name}' has a cursor value older than " 3696 f"the API's retention period of {api_retention_period} " 3697 f"(cutoff: {retention_cutoff.isoformat()}). " 3698 f"Falling back to full refresh to avoid data loss." 
3699 ) 3700 return True 3701 3702 return False 3703 3704 def _get_state_delegating_stream_model( 3705 self, 3706 model: StateDelegatingStreamModel, 3707 parent_state: Optional[Mapping[str, Any]] = None, 3708 ) -> DeclarativeStreamModel: 3709 """Return the appropriate underlying stream model based on state.""" 3710 return ( 3711 model.incremental_stream 3712 if self._connector_state_manager.get_stream_state(model.name, None) or parent_state 3713 else model.full_refresh_stream 3714 ) 3715 3716 def _create_async_job_status_mapping( 3717 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3718 ) -> Mapping[str, AsyncJobStatus]: 3719 api_status_to_cdk_status = {} 3720 for cdk_status, api_statuses in model.dict().items(): 3721 if cdk_status == "type": 3722 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3723 continue 3724 3725 for status in api_statuses: 3726 if status in api_status_to_cdk_status: 3727 raise ValueError( 3728 f"API status {status} is already set for CDK status {cdk_status}. 
Please ensure API statuses are only provided once" 3729 ) 3730 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3731 return api_status_to_cdk_status 3732 3733 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3734 match status: 3735 case "running": 3736 return AsyncJobStatus.RUNNING 3737 case "completed": 3738 return AsyncJobStatus.COMPLETED 3739 case "failed": 3740 return AsyncJobStatus.FAILED 3741 case "timeout": 3742 return AsyncJobStatus.TIMED_OUT 3743 case _: 3744 raise ValueError(f"Unsupported CDK status {status}") 3745 3746 def create_async_retriever( 3747 self, 3748 model: AsyncRetrieverModel, 3749 config: Config, 3750 *, 3751 name: str, 3752 primary_key: Optional[ 3753 Union[str, List[str], List[List[str]]] 3754 ], # this seems to be needed to match create_simple_retriever 3755 stream_slicer: Optional[StreamSlicer], 3756 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3757 transformations: List[RecordTransformation], 3758 **kwargs: Any, 3759 ) -> AsyncRetriever: 3760 if model.download_target_requester and not model.download_target_extractor: 3761 raise ValueError( 3762 f"`download_target_extractor` required if using a `download_target_requester`" 3763 ) 3764 3765 def _get_download_retriever( 3766 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3767 ) -> SimpleRetriever: 3768 # We create a record selector for the download retriever 3769 # with no schema normalization and no transformations, neither record filter 3770 # as all this occurs in the record_selector of the AsyncRetriever 3771 record_selector = RecordSelector( 3772 extractor=extractor, 3773 name=name, 3774 record_filter=None, 3775 transformations=[], 3776 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3777 config=config, 3778 parameters={}, 3779 ) 3780 paginator = ( 3781 self._create_component_from_model( 3782 model=model.download_paginator, 3783 decoder=_decoder, 3784 config=config, 3785 url_base="", 
3786 ) 3787 if model.download_paginator 3788 else NoPagination(parameters={}) 3789 ) 3790 3791 return SimpleRetriever( 3792 requester=requester, 3793 record_selector=record_selector, 3794 primary_key=None, 3795 name=name, 3796 paginator=paginator, 3797 config=config, 3798 parameters={}, 3799 log_formatter=self._get_log_formatter(None, name), 3800 ) 3801 3802 def _get_job_timeout() -> datetime.timedelta: 3803 user_defined_timeout: Optional[int] = ( 3804 int( 3805 InterpolatedString.create( 3806 str(model.polling_job_timeout), 3807 parameters={}, 3808 ).eval(config) 3809 ) 3810 if model.polling_job_timeout 3811 else None 3812 ) 3813 3814 # check for user defined timeout during the test read or 15 minutes 3815 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3816 # default value for non-connector builder is 60 minutes. 3817 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3818 3819 return ( 3820 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3821 ) 3822 3823 decoder = ( 3824 self._create_component_from_model(model=model.decoder, config=config) 3825 if model.decoder 3826 else JsonDecoder(parameters={}) 3827 ) 3828 record_selector = self._create_component_from_model( 3829 model=model.record_selector, 3830 config=config, 3831 decoder=decoder, 3832 name=name, 3833 transformations=transformations, 3834 client_side_incremental_sync=client_side_incremental_sync, 3835 ) 3836 3837 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3838 if self._should_limit_slices_fetched(): 3839 stream_slicer = cast( 3840 StreamSlicer, 3841 StreamSlicerTestReadDecorator( 3842 wrapped_slicer=stream_slicer, 3843 maximum_number_of_slices=self._limit_slices_fetched or 5, 3844 ), 3845 ) 3846 3847 creation_requester = self._create_component_from_model( 3848 model=model.creation_requester, 3849 decoder=decoder, 3850 config=config, 3851 name=f"job creation - {name}", 3852 ) 3853 
polling_requester = self._create_component_from_model( 3854 model=model.polling_requester, 3855 decoder=decoder, 3856 config=config, 3857 name=f"job polling - {name}", 3858 ) 3859 job_download_components_name = f"job download - {name}" 3860 download_decoder = ( 3861 self._create_component_from_model(model=model.download_decoder, config=config) 3862 if model.download_decoder 3863 else JsonDecoder(parameters={}) 3864 ) 3865 download_extractor = ( 3866 self._create_component_from_model( 3867 model=model.download_extractor, 3868 config=config, 3869 decoder=download_decoder, 3870 parameters=model.parameters, 3871 ) 3872 if model.download_extractor 3873 else DpathExtractor( 3874 [], 3875 config=config, 3876 decoder=download_decoder, 3877 parameters=model.parameters or {}, 3878 ) 3879 ) 3880 download_requester = self._create_component_from_model( 3881 model=model.download_requester, 3882 decoder=download_decoder, 3883 config=config, 3884 name=job_download_components_name, 3885 ) 3886 download_retriever = _get_download_retriever( 3887 download_requester, download_extractor, download_decoder 3888 ) 3889 abort_requester = ( 3890 self._create_component_from_model( 3891 model=model.abort_requester, 3892 decoder=decoder, 3893 config=config, 3894 name=f"job abort - {name}", 3895 ) 3896 if model.abort_requester 3897 else None 3898 ) 3899 delete_requester = ( 3900 self._create_component_from_model( 3901 model=model.delete_requester, 3902 decoder=decoder, 3903 config=config, 3904 name=f"job delete - {name}", 3905 ) 3906 if model.delete_requester 3907 else None 3908 ) 3909 download_target_requester = ( 3910 self._create_component_from_model( 3911 model=model.download_target_requester, 3912 decoder=decoder, 3913 config=config, 3914 name=f"job extract_url - {name}", 3915 ) 3916 if model.download_target_requester 3917 else None 3918 ) 3919 status_extractor = self._create_component_from_model( 3920 model=model.status_extractor, decoder=decoder, config=config, name=name 3921 ) 3922 
download_target_extractor = ( 3923 self._create_component_from_model( 3924 model=model.download_target_extractor, 3925 decoder=decoder, 3926 config=config, 3927 name=name, 3928 ) 3929 if model.download_target_extractor 3930 else None 3931 ) 3932 3933 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3934 creation_requester=creation_requester, 3935 polling_requester=polling_requester, 3936 download_retriever=download_retriever, 3937 download_target_requester=download_target_requester, 3938 abort_requester=abort_requester, 3939 delete_requester=delete_requester, 3940 status_extractor=status_extractor, 3941 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3942 download_target_extractor=download_target_extractor, 3943 job_timeout=_get_job_timeout(), 3944 ) 3945 3946 async_job_partition_router = AsyncJobPartitionRouter( 3947 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3948 job_repository, 3949 stream_slices, 3950 self._job_tracker, 3951 self._message_repository, 3952 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3953 has_bulk_parent=False, 3954 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3955 # `None` == default retry is set to 3 attempts, under the hood. 
3956 job_max_retry=1 if self._emit_connector_builder_messages else None, 3957 ), 3958 stream_slicer=stream_slicer, 3959 config=config, 3960 parameters=model.parameters or {}, 3961 ) 3962 3963 return AsyncRetriever( 3964 record_selector=record_selector, 3965 stream_slicer=async_job_partition_router, 3966 config=config, 3967 parameters=model.parameters or {}, 3968 ) 3969 3970 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3971 config_migrations = [ 3972 self._create_component_from_model(migration, config) 3973 for migration in ( 3974 model.config_normalization_rules.config_migrations 3975 if ( 3976 model.config_normalization_rules 3977 and model.config_normalization_rules.config_migrations 3978 ) 3979 else [] 3980 ) 3981 ] 3982 config_transformations = [ 3983 self._create_component_from_model(transformation, config) 3984 for transformation in ( 3985 model.config_normalization_rules.transformations 3986 if ( 3987 model.config_normalization_rules 3988 and model.config_normalization_rules.transformations 3989 ) 3990 else [] 3991 ) 3992 ] 3993 config_validations = [ 3994 self._create_component_from_model(validation, config) 3995 for validation in ( 3996 model.config_normalization_rules.validations 3997 if ( 3998 model.config_normalization_rules 3999 and model.config_normalization_rules.validations 4000 ) 4001 else [] 4002 ) 4003 ] 4004 4005 return Spec( 4006 connection_specification=model.connection_specification, 4007 documentation_url=model.documentation_url, 4008 advanced_auth=model.advanced_auth, 4009 parameters={}, 4010 config_migrations=config_migrations, 4011 config_transformations=config_transformations, 4012 config_validations=config_validations, 4013 ) 4014 4015 def create_substream_partition_router( 4016 self, 4017 model: SubstreamPartitionRouterModel, 4018 config: Config, 4019 *, 4020 stream_name: str, 4021 **kwargs: Any, 4022 ) -> SubstreamPartitionRouter: 4023 parent_stream_configs = [] 4024 if model.parent_stream_configs: 
            parent_stream_configs.extend(
                [
                    self.create_parent_stream_config_with_substream_wrapper(
                        model=parent_stream_config, config=config, stream_name=stream_name, **kwargs
                    )
                    for parent_stream_config in model.parent_stream_configs
                ]
            )

        return SubstreamPartitionRouter(
            parent_stream_configs=parent_stream_configs,
            parameters=model.parameters or {},
            config=config,
        )

    def create_parent_stream_config_with_substream_wrapper(
        self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any
    ) -> Any:
        """Build a ParentStreamConfig via a dedicated child factory whose state manager and
        message repository are tailored for the parent (substream) stream."""
        child_state = self._connector_state_manager.get_stream_state(stream_name, None)

        parent_state: Optional[Mapping[str, Any]] = (
            child_state if model.incremental_dependency and child_state else None
        )
        connector_state_manager = self._instantiate_parent_stream_state_manager(
            child_state, config, model, parent_state
        )

        # The nested factory tags parent-stream log/HTTP messages as substream/auxiliary and
        # filters state messages so only the child stream emits state.
        substream_factory = ModelToComponentFactory(
            connector_state_manager=connector_state_manager,
            limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice,
            limit_slices_fetched=self._limit_slices_fetched,
            emit_connector_builder_messages=self._emit_connector_builder_messages,
            disable_retries=self._disable_retries,
            disable_cache=self._disable_cache,
            message_repository=StateFilteringMessageRepository(
                LogAppenderMessageRepositoryDecorator(
                    {
                        "airbyte_cdk": {"stream": {"is_substream": True}},
                        "http": {"is_auxiliary": True},
                    },
                    self._message_repository,
                    self._evaluate_log_level(self._emit_connector_builder_messages),
                ),
            ),
            api_budget=self._api_budget,
        )

        return substream_factory.create_parent_stream_config(
            model=model, config=config, stream_name=stream_name, **kwargs
        )

    def _instantiate_parent_stream_state_manager(
        self,
        child_state: MutableMapping[str, Any],
        config: Config,
model: ParentStreamConfigModel, 4081 parent_state: Optional[Mapping[str, Any]] = None, 4082 ) -> ConnectorStateManager: 4083 """ 4084 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 4085 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 4086 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 4087 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 4088 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 4089 incremental_dependency is set. 4090 """ 4091 if model.incremental_dependency and child_state: 4092 parent_stream_name = model.stream.name or "" 4093 extracted_parent_state = ConcurrentPerPartitionCursor.get_parent_state( 4094 child_state, parent_stream_name 4095 ) 4096 4097 if not extracted_parent_state: 4098 extracted_parent_state = ConcurrentPerPartitionCursor.get_global_state( 4099 child_state, parent_stream_name 4100 ) 4101 4102 if not extracted_parent_state and not isinstance(extracted_parent_state, dict): 4103 cursor_values = child_state.values() 4104 if cursor_values and len(cursor_values) == 1: 4105 incremental_sync_model: Union[ 4106 DatetimeBasedCursorModel, 4107 IncrementingCountCursorModel, 4108 ] = ( 4109 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 4110 if isinstance(model.stream, DeclarativeStreamModel) 4111 else self._get_state_delegating_stream_model( 4112 model.stream, parent_state=parent_state 4113 ).incremental_sync 4114 ) 4115 cursor_field = InterpolatedString.create( 4116 incremental_sync_model.cursor_field, 4117 parameters=incremental_sync_model.parameters or {}, 4118 ).eval(config) 4119 extracted_parent_state = AirbyteStateMessage( 4120 
type=AirbyteStateType.STREAM, 4121 stream=AirbyteStreamState( 4122 stream_descriptor=StreamDescriptor( 4123 name=parent_stream_name, namespace=None 4124 ), 4125 stream_state=AirbyteStateBlob( 4126 {cursor_field: list(cursor_values)[0]} 4127 ), 4128 ), 4129 ) 4130 return ConnectorStateManager([extracted_parent_state] if extracted_parent_state else []) 4131 4132 return ConnectorStateManager([]) 4133 4134 @staticmethod 4135 def create_wait_time_from_header( 4136 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 4137 ) -> WaitTimeFromHeaderBackoffStrategy: 4138 return WaitTimeFromHeaderBackoffStrategy( 4139 header=model.header, 4140 parameters=model.parameters or {}, 4141 config=config, 4142 regex=model.regex, 4143 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 4144 if model.max_waiting_time_in_seconds is not None 4145 else None, 4146 ) 4147 4148 @staticmethod 4149 def create_wait_until_time_from_header( 4150 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 4151 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 4152 return WaitUntilTimeFromHeaderBackoffStrategy( 4153 header=model.header, 4154 parameters=model.parameters or {}, 4155 config=config, 4156 min_wait=model.min_wait, 4157 regex=model.regex, 4158 ) 4159 4160 def get_message_repository(self) -> MessageRepository: 4161 return self._message_repository 4162 4163 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 4164 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 4165 4166 @staticmethod 4167 def create_components_mapping_definition( 4168 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 4169 ) -> ComponentMappingDefinition: 4170 interpolated_value = InterpolatedString.create( 4171 model.value, parameters=model.parameters or {} 4172 ) 4173 field_path = [ 4174 InterpolatedString.create(path, parameters=model.parameters or {}) 4175 for path in model.field_path 4176 ] 4177 return ComponentMappingDefinition( 
4178 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 4179 value=interpolated_value, 4180 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 4181 create_or_update=model.create_or_update, 4182 condition=model.condition, 4183 parameters=model.parameters or {}, 4184 ) 4185 4186 def create_http_components_resolver( 4187 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 4188 ) -> Any: 4189 retriever = self._create_component_from_model( 4190 model=model.retriever, 4191 config=config, 4192 name=f"{stream_name if stream_name else '__http_components_resolver'}", 4193 primary_key=None, 4194 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4195 transformations=[], 4196 ) 4197 4198 components_mapping = [] 4199 for component_mapping_definition_model in model.components_mapping: 4200 if component_mapping_definition_model.condition: 4201 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4202 components_mapping.append( 4203 self._create_component_from_model( 4204 model=component_mapping_definition_model, 4205 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4206 component_mapping_definition_model.value_type 4207 ), 4208 config=config, 4209 ) 4210 ) 4211 4212 return HttpComponentsResolver( 4213 retriever=retriever, 4214 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4215 config=config, 4216 components_mapping=components_mapping, 4217 parameters=model.parameters or {}, 4218 ) 4219 4220 @staticmethod 4221 def create_stream_config( 4222 model: StreamConfigModel, config: Config, **kwargs: Any 4223 ) -> StreamConfig: 4224 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 4225 [x for x in model.configs_pointer] if model.configs_pointer else [] 4226 ) 4227 4228 return StreamConfig( 4229 configs_pointer=model_configs_pointer, 
4230 default_values=model.default_values, 4231 parameters=model.parameters or {}, 4232 ) 4233 4234 def create_config_components_resolver( 4235 self, 4236 model: ConfigComponentsResolverModel, 4237 config: Config, 4238 ) -> Any: 4239 model_stream_configs = ( 4240 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 4241 ) 4242 4243 stream_configs = [ 4244 self._create_component_from_model( 4245 stream_config, config=config, parameters=model.parameters or {} 4246 ) 4247 for stream_config in model_stream_configs 4248 ] 4249 4250 components_mapping = [ 4251 self._create_component_from_model( 4252 model=components_mapping_definition_model, 4253 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4254 components_mapping_definition_model.value_type 4255 ), 4256 config=config, 4257 parameters=model.parameters, 4258 ) 4259 for components_mapping_definition_model in model.components_mapping 4260 ] 4261 4262 return ConfigComponentsResolver( 4263 stream_configs=stream_configs, 4264 config=config, 4265 components_mapping=components_mapping, 4266 parameters=model.parameters or {}, 4267 ) 4268 4269 def create_parametrized_components_resolver( 4270 self, 4271 model: ParametrizedComponentsResolverModel, 4272 config: Config, 4273 ) -> ParametrizedComponentsResolver: 4274 stream_parameters = StreamParametersDefinition( 4275 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 4276 ) 4277 4278 components_mapping = [] 4279 for components_mapping_definition_model in model.components_mapping: 4280 if components_mapping_definition_model.condition: 4281 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4282 components_mapping.append( 4283 self._create_component_from_model( 4284 model=components_mapping_definition_model, 4285 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4286 components_mapping_definition_model.value_type 4287 ), 4288 config=config, 4289 ) 
4290 ) 4291 return ParametrizedComponentsResolver( 4292 stream_parameters=stream_parameters, 4293 config=config, 4294 components_mapping=components_mapping, 4295 parameters=model.parameters or {}, 4296 ) 4297 4298 _UNSUPPORTED_DECODER_ERROR = ( 4299 "Specified decoder of {decoder_type} is not supported for pagination." 4300 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4301 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4302 ) 4303 4304 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4305 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4306 return True 4307 elif isinstance(decoder, CompositeRawDecoder): 4308 return self._is_supported_parser_for_pagination(decoder.parser) 4309 else: 4310 return False 4311 4312 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4313 if isinstance(parser, JsonParser): 4314 return True 4315 elif isinstance(parser, GzipParser): 4316 return isinstance(parser.inner_parser, JsonParser) 4317 else: 4318 return False 4319 4320 def create_http_api_budget( 4321 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4322 ) -> HttpAPIBudget: 4323 policies = [ 4324 self._create_component_from_model(model=policy, config=config) 4325 for policy in model.policies 4326 ] 4327 4328 return HttpAPIBudget( 4329 policies=policies, 4330 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4331 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4332 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4333 ) 4334 4335 def create_fixed_window_call_rate_policy( 4336 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4337 ) -> FixedWindowCallRatePolicy: 4338 matchers = [ 4339 self._create_component_from_model(model=matcher, config=config) 4340 for matcher in 
model.matchers 4341 ] 4342 4343 # Set the initial reset timestamp to 10 days from now. 4344 # This value will be updated by the first request. 4345 return FixedWindowCallRatePolicy( 4346 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4347 period=parse_duration(model.period), 4348 call_limit=model.call_limit, 4349 matchers=matchers, 4350 ) 4351 4352 def create_file_uploader( 4353 self, model: FileUploaderModel, config: Config, **kwargs: Any 4354 ) -> FileUploader: 4355 name = "File Uploader" 4356 requester = self._create_component_from_model( 4357 model=model.requester, 4358 config=config, 4359 name=name, 4360 **kwargs, 4361 ) 4362 download_target_extractor = self._create_component_from_model( 4363 model=model.download_target_extractor, 4364 config=config, 4365 name=name, 4366 **kwargs, 4367 ) 4368 emit_connector_builder_messages = self._emit_connector_builder_messages 4369 file_uploader = DefaultFileUploader( 4370 requester=requester, 4371 download_target_extractor=download_target_extractor, 4372 config=config, 4373 file_writer=NoopFileWriter() 4374 if emit_connector_builder_messages 4375 else LocalFileSystemFileWriter(), 4376 parameters=model.parameters or {}, 4377 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4378 ) 4379 4380 return ( 4381 ConnectorBuilderFileUploader(file_uploader) 4382 if emit_connector_builder_messages 4383 else file_uploader 4384 ) 4385 4386 def create_moving_window_call_rate_policy( 4387 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4388 ) -> MovingWindowCallRatePolicy: 4389 rates = [ 4390 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4391 ] 4392 matchers = [ 4393 self._create_component_from_model(model=matcher, config=config) 4394 for matcher in model.matchers 4395 ] 4396 return MovingWindowCallRatePolicy( 4397 rates=rates, 4398 matchers=matchers, 4399 ) 4400 4401 def create_unlimited_call_rate_policy( 4402 self, 
model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4403 ) -> UnlimitedCallRatePolicy: 4404 matchers = [ 4405 self._create_component_from_model(model=matcher, config=config) 4406 for matcher in model.matchers 4407 ] 4408 4409 return UnlimitedCallRatePolicy( 4410 matchers=matchers, 4411 ) 4412 4413 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4414 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4415 return Rate( 4416 limit=int(interpolated_limit.eval(config=config)), 4417 interval=parse_duration(model.interval), 4418 ) 4419 4420 def create_http_request_matcher( 4421 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4422 ) -> HttpRequestRegexMatcher: 4423 weight = model.weight 4424 if weight is not None: 4425 if isinstance(weight, str): 4426 weight = int(InterpolatedString.create(weight, parameters={}).eval(config)) 4427 else: 4428 weight = int(weight) 4429 if weight < 1: 4430 raise ValueError(f"weight must be >= 1, got {weight}") 4431 return HttpRequestRegexMatcher( 4432 method=model.method, 4433 url_base=model.url_base, 4434 url_path_pattern=model.url_path_pattern, 4435 params=model.params, 4436 headers=model.headers, 4437 weight=weight, 4438 ) 4439 4440 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4441 self._api_budget = self.create_component( 4442 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4443 ) 4444 4445 def create_grouping_partition_router( 4446 self, 4447 model: GroupingPartitionRouterModel, 4448 config: Config, 4449 *, 4450 stream_name: str, 4451 **kwargs: Any, 4452 ) -> GroupingPartitionRouter: 4453 underlying_router = self._create_component_from_model( 4454 model=model.underlying_partition_router, 4455 config=config, 4456 stream_name=stream_name, 4457 **kwargs, 4458 ) 4459 if model.group_size < 1: 4460 raise ValueError(f"Group size must be greater than 0, got 
{model.group_size}") 4461 4462 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4463 # because they are specific to individual partitions and cannot be aggregated or handled 4464 # when grouping, potentially leading to incorrect API calls. Any request customization 4465 # should be managed at the stream level through the requester's configuration. 4466 if isinstance(underlying_router, SubstreamPartitionRouter): 4467 if any( 4468 parent_config.request_option 4469 for parent_config in underlying_router.parent_stream_configs 4470 ): 4471 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4472 4473 if isinstance(underlying_router, ListPartitionRouter): 4474 if underlying_router.request_option: 4475 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4476 4477 return GroupingPartitionRouter( 4478 group_size=model.group_size, 4479 underlying_partition_router=underlying_router, 4480 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4481 config=config, 4482 ) 4483 4484 def _ensure_query_properties_to_model( 4485 self, requester: Union[HttpRequesterModel, CustomRequesterModel] 4486 ) -> None: 4487 """ 4488 For some reason, it seems like CustomRequesterModel request_parameters stays as dictionaries which means that 4489 the other conditions relying on it being QueryPropertiesModel instead of a dict fail. Here, we migrate them to 4490 proper model. 
4491 """ 4492 if not hasattr(requester, "request_parameters"): 4493 return 4494 4495 request_parameters = requester.request_parameters 4496 if request_parameters and isinstance(request_parameters, Dict): 4497 for request_parameter_key in request_parameters.keys(): 4498 request_parameter = request_parameters[request_parameter_key] 4499 if ( 4500 isinstance(request_parameter, Dict) 4501 and request_parameter.get("type") == "QueryProperties" 4502 ): 4503 request_parameters[request_parameter_key] = QueryPropertiesModel.parse_obj( 4504 request_parameter 4505 ) 4506 4507 def _get_catalog_defined_cursor_field( 4508 self, stream_name: str, allow_catalog_defined_cursor_field: bool 4509 ) -> Optional[CursorField]: 4510 if not allow_catalog_defined_cursor_field: 4511 return None 4512 4513 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 4514 4515 # Depending on the operation is being performed, there may not be a configured stream yet. In this 4516 # case we return None which will then use the default cursor field defined on the cursor model. 4517 # We also treat cursor_field: [""] (list with empty string) as no cursor field, since this can 4518 # occur when the platform serializes "no cursor configured" streams incorrectly. 4519 if ( 4520 not configured_stream 4521 or not configured_stream.cursor_field 4522 or not configured_stream.cursor_field[0] 4523 ): 4524 return None 4525 elif len(configured_stream.cursor_field) > 1: 4526 raise ValueError( 4527 f"The `{stream_name}` stream does not support nested cursor_field. Please specify only a single cursor_field for the stream in the configured catalog." 4528 ) 4529 else: 4530 return CursorField( 4531 cursor_field_key=configured_stream.cursor_field[0], 4532 supports_catalog_defined_cursor_field=allow_catalog_defined_cursor_field, 4533 )
685class ModelToComponentFactory: 686 EPOCH_DATETIME_FORMAT = "%s" 687 688 def __init__( 689 self, 690 limit_pages_fetched_per_slice: Optional[int] = None, 691 limit_slices_fetched: Optional[int] = None, 692 emit_connector_builder_messages: bool = False, 693 disable_retries: bool = False, 694 disable_cache: bool = False, 695 message_repository: Optional[MessageRepository] = None, 696 connector_state_manager: Optional[ConnectorStateManager] = None, 697 max_concurrent_async_job_count: Optional[int] = None, 698 configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, 699 api_budget: Optional[APIBudget] = None, 700 ): 701 self._init_mappings() 702 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 703 self._limit_slices_fetched = limit_slices_fetched 704 self._emit_connector_builder_messages = emit_connector_builder_messages 705 self._disable_retries = disable_retries 706 self._disable_cache = disable_cache 707 self._message_repository = message_repository or InMemoryMessageRepository( 708 self._evaluate_log_level(emit_connector_builder_messages) 709 ) 710 self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream( 711 configured_catalog 712 ) 713 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 714 self._api_budget: Optional[Union[APIBudget]] = api_budget 715 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 716 # placeholder for deprecation warnings 717 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 718 719 def _init_mappings(self) -> None: 720 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 721 AddedFieldDefinitionModel: self.create_added_field_definition, 722 AddFieldsModel: self.create_add_fields, 723 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 724 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 725 BearerAuthenticatorModel: 
self.create_bearer_authenticator, 726 CheckStreamModel: self.create_check_stream, 727 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 728 CheckDynamicStreamModel: self.create_check_dynamic_stream, 729 CompositeErrorHandlerModel: self.create_composite_error_handler, 730 ConcurrencyLevelModel: self.create_concurrency_level, 731 ConfigMigrationModel: self.create_config_migration, 732 ConfigAddFieldsModel: self.create_config_add_fields, 733 ConfigRemapFieldModel: self.create_config_remap_field, 734 ConfigRemoveFieldsModel: self.create_config_remove_fields, 735 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 736 CsvDecoderModel: self.create_csv_decoder, 737 CursorPaginationModel: self.create_cursor_pagination, 738 CustomAuthenticatorModel: self.create_custom_component, 739 CustomBackoffStrategyModel: self.create_custom_component, 740 CustomDecoderModel: self.create_custom_component, 741 CustomErrorHandlerModel: self.create_custom_component, 742 CustomRecordExtractorModel: self.create_custom_component, 743 CustomRecordFilterModel: self.create_custom_component, 744 CustomRequesterModel: self.create_custom_component, 745 CustomRetrieverModel: self.create_custom_component, 746 CustomSchemaLoader: self.create_custom_component, 747 CustomSchemaNormalizationModel: self.create_custom_component, 748 CustomStateMigration: self.create_custom_component, 749 CustomPaginationStrategyModel: self.create_custom_component, 750 CustomPartitionRouterModel: self.create_custom_component, 751 CustomTransformationModel: self.create_custom_component, 752 CustomValidationStrategyModel: self.create_custom_component, 753 CustomConfigTransformationModel: self.create_custom_component, 754 DeclarativeStreamModel: self.create_default_stream, 755 DefaultErrorHandlerModel: self.create_default_error_handler, 756 DefaultPaginatorModel: self.create_default_paginator, 757 DpathExtractorModel: self.create_dpath_extractor, 758 DpathValidatorModel: 
self.create_dpath_validator, 759 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 760 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 761 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 762 GroupByKeyMergeStrategyModel: self.create_group_by_key, 763 HttpRequesterModel: self.create_http_requester, 764 HttpResponseFilterModel: self.create_http_response_filter, 765 InlineSchemaLoaderModel: self.create_inline_schema_loader, 766 JsonDecoderModel: self.create_json_decoder, 767 JsonlDecoderModel: self.create_jsonl_decoder, 768 JsonSchemaPropertySelectorModel: self.create_json_schema_property_selector, 769 GzipDecoderModel: self.create_gzip_decoder, 770 KeysToLowerModel: self.create_keys_to_lower_transformation, 771 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 772 KeysReplaceModel: self.create_keys_replace_transformation, 773 FlattenFieldsModel: self.create_flatten_fields, 774 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 775 IterableDecoderModel: self.create_iterable_decoder, 776 XmlDecoderModel: self.create_xml_decoder, 777 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 778 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 779 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 780 TypesMapModel: self.create_types_map, 781 ComplexFieldTypeModel: self.create_complex_field_type, 782 JwtAuthenticatorModel: self.create_jwt_authenticator, 783 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 784 ListPartitionRouterModel: self.create_list_partition_router, 785 MinMaxDatetimeModel: self.create_min_max_datetime, 786 NoAuthModel: self.create_no_auth, 787 NoPaginationModel: self.create_no_pagination, 788 OAuthAuthenticatorModel: self.create_oauth_authenticator, 789 OffsetIncrementModel: self.create_offset_increment, 790 PageIncrementModel: self.create_page_increment, 791 ParentStreamConfigModel: 
self.create_parent_stream_config_with_substream_wrapper, 792 PredicateValidatorModel: self.create_predicate_validator, 793 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 794 PropertyChunkingModel: self.create_property_chunking, 795 QueryPropertiesModel: self.create_query_properties, 796 RecordExpanderModel: self.create_record_expander, 797 RecordFilterModel: self.create_record_filter, 798 RecordSelectorModel: self.create_record_selector, 799 RemoveFieldsModel: self.create_remove_fields, 800 RequestPathModel: self.create_request_path, 801 RequestOptionModel: self.create_request_option, 802 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 803 SelectiveAuthenticatorModel: self.create_selective_authenticator, 804 SimpleRetrieverModel: self.create_simple_retriever, 805 StateDelegatingStreamModel: self.create_state_delegating_stream, 806 SpecModel: self.create_spec, 807 SubstreamPartitionRouterModel: self.create_substream_partition_router, 808 ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema, 809 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 810 WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, 811 AsyncRetrieverModel: self.create_async_retriever, 812 HttpComponentsResolverModel: self.create_http_components_resolver, 813 ConfigComponentsResolverModel: self.create_config_components_resolver, 814 ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver, 815 StreamConfigModel: self.create_stream_config, 816 ComponentMappingDefinitionModel: self.create_components_mapping_definition, 817 ZipfileDecoderModel: self.create_zipfile_decoder, 818 HTTPAPIBudgetModel: self.create_http_api_budget, 819 FileUploaderModel: self.create_file_uploader, 820 FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, 821 MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, 822 UnlimitedCallRatePolicyModel: 
self.create_unlimited_call_rate_policy, 823 RateModel: self.create_rate, 824 HttpRequestRegexMatcherModel: self.create_http_request_matcher, 825 GroupingPartitionRouterModel: self.create_grouping_partition_router, 826 } 827 828 # Needed for the case where we need to perform a second parse on the fields of a custom component 829 self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR} 830 831 @staticmethod 832 def _create_stream_name_to_configured_stream( 833 configured_catalog: Optional[ConfiguredAirbyteCatalog], 834 ) -> Mapping[str, ConfiguredAirbyteStream]: 835 return ( 836 {stream.stream.name: stream for stream in configured_catalog.streams} 837 if configured_catalog 838 else {} 839 ) 840 841 def create_component( 842 self, 843 model_type: Type[BaseModel], 844 component_definition: ComponentDefinition, 845 config: Config, 846 **kwargs: Any, 847 ) -> Any: 848 """ 849 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 850 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 851 creating declarative components from that model. 
852 853 :param model_type: The type of declarative component that is being initialized 854 :param component_definition: The mapping that represents a declarative component 855 :param config: The connector config that is provided by the customer 856 :return: The declarative component to be used at runtime 857 """ 858 859 component_type = component_definition.get("type") 860 if component_definition.get("type") != model_type.__name__: 861 raise ValueError( 862 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 863 ) 864 865 declarative_component_model = model_type.parse_obj(component_definition) 866 867 if not isinstance(declarative_component_model, model_type): 868 raise ValueError( 869 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 870 ) 871 872 return self._create_component_from_model( 873 model=declarative_component_model, config=config, **kwargs 874 ) 875 876 def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: 877 if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: 878 raise ValueError( 879 f"{model.__class__} with attributes {model} is not a valid component type" 880 ) 881 component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) 882 if not component_constructor: 883 raise ValueError(f"Could not find constructor for {model.__class__}") 884 885 # collect deprecation warnings for supported models. 886 if isinstance(model, BaseModelWithDeprecations): 887 self._collect_model_deprecations(model) 888 889 return component_constructor(model=model, config=config, **kwargs) 890 891 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 892 """ 893 Returns the deprecation warnings that were collected during the creation of components. 
894 """ 895 return self._collected_deprecation_logs 896 897 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 898 """ 899 Collects deprecation logs from the given model and appends any new logs to the internal collection. 900 901 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 902 903 Args: 904 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 905 """ 906 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 907 for log in model._deprecation_logs: 908 # avoid duplicates for deprecation logs observed. 909 if log not in self._collected_deprecation_logs: 910 self._collected_deprecation_logs.append(log) 911 912 def create_config_migration( 913 self, model: ConfigMigrationModel, config: Config 914 ) -> ConfigMigration: 915 transformations: List[ConfigTransformation] = [ 916 self._create_component_from_model(transformation, config) 917 for transformation in model.transformations 918 ] 919 920 return ConfigMigration( 921 description=model.description, 922 transformations=transformations, 923 ) 924 925 def create_config_add_fields( 926 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 927 ) -> ConfigAddFields: 928 fields = [self._create_component_from_model(field, config) for field in model.fields] 929 return ConfigAddFields( 930 fields=fields, 931 condition=model.condition or "", 932 ) 933 934 @staticmethod 935 def create_config_remove_fields( 936 model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any 937 ) -> ConfigRemoveFields: 938 return ConfigRemoveFields( 939 field_pointers=model.field_pointers, 940 condition=model.condition or "", 
941 ) 942 943 @staticmethod 944 def create_config_remap_field( 945 model: ConfigRemapFieldModel, config: Config, **kwargs: Any 946 ) -> ConfigRemapField: 947 mapping = cast(Mapping[str, Any], model.map) 948 return ConfigRemapField( 949 map=mapping, 950 field_path=model.field_path, 951 config=config, 952 ) 953 954 def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator: 955 strategy = self._create_component_from_model(model.validation_strategy, config) 956 957 return DpathValidator( 958 field_path=model.field_path, 959 strategy=strategy, 960 ) 961 962 def create_predicate_validator( 963 self, model: PredicateValidatorModel, config: Config 964 ) -> PredicateValidator: 965 strategy = self._create_component_from_model(model.validation_strategy, config) 966 967 return PredicateValidator( 968 value=model.value, 969 strategy=strategy, 970 ) 971 972 @staticmethod 973 def create_validate_adheres_to_schema( 974 model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any 975 ) -> ValidateAdheresToSchema: 976 base_schema = cast(Mapping[str, Any], model.base_schema) 977 return ValidateAdheresToSchema( 978 schema=base_schema, 979 ) 980 981 @staticmethod 982 def create_added_field_definition( 983 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 984 ) -> AddedFieldDefinition: 985 interpolated_value = InterpolatedString.create( 986 model.value, parameters=model.parameters or {} 987 ) 988 return AddedFieldDefinition( 989 path=model.path, 990 value=interpolated_value, 991 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 992 parameters=model.parameters or {}, 993 ) 994 995 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 996 added_field_definitions = [ 997 self._create_component_from_model( 998 model=added_field_definition_model, 999 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 1000 added_field_definition_model.value_type 
1001 ), 1002 config=config, 1003 ) 1004 for added_field_definition_model in model.fields 1005 ] 1006 return AddFields( 1007 fields=added_field_definitions, 1008 condition=model.condition or "", 1009 parameters=model.parameters or {}, 1010 ) 1011 1012 def create_keys_to_lower_transformation( 1013 self, model: KeysToLowerModel, config: Config, **kwargs: Any 1014 ) -> KeysToLowerTransformation: 1015 return KeysToLowerTransformation() 1016 1017 def create_keys_to_snake_transformation( 1018 self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any 1019 ) -> KeysToSnakeCaseTransformation: 1020 return KeysToSnakeCaseTransformation() 1021 1022 def create_keys_replace_transformation( 1023 self, model: KeysReplaceModel, config: Config, **kwargs: Any 1024 ) -> KeysReplaceTransformation: 1025 return KeysReplaceTransformation( 1026 old=model.old, new=model.new, parameters=model.parameters or {} 1027 ) 1028 1029 def create_flatten_fields( 1030 self, model: FlattenFieldsModel, config: Config, **kwargs: Any 1031 ) -> FlattenFields: 1032 return FlattenFields( 1033 flatten_lists=model.flatten_lists if model.flatten_lists is not None else True 1034 ) 1035 1036 def create_dpath_flatten_fields( 1037 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 1038 ) -> DpathFlattenFields: 1039 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 1040 key_transformation = ( 1041 KeyTransformation( 1042 config=config, 1043 prefix=model.key_transformation.prefix, 1044 suffix=model.key_transformation.suffix, 1045 parameters=model.parameters or {}, 1046 ) 1047 if model.key_transformation is not None 1048 else None 1049 ) 1050 return DpathFlattenFields( 1051 config=config, 1052 field_path=model_field_path, 1053 delete_origin_value=model.delete_origin_value 1054 if model.delete_origin_value is not None 1055 else False, 1056 replace_record=model.replace_record if model.replace_record is not None else False, 1057 
            key_transformation=key_transformation,
            parameters=model.parameters or {},
        )

    @staticmethod
    def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
        """Map a declarative ``ValueType`` enum member to its Python builtin type.

        Returns ``None`` when no value type is given. Raises ``KeyError`` for a
        member missing from the mapping (all currently-known members are covered).
        """
        if not value_type:
            return None
        names_to_types = {
            ValueType.string: str,
            ValueType.number: float,
            ValueType.integer: int,
            ValueType.boolean: bool,
        }
        return names_to_types[value_type]

    def create_api_key_authenticator(
        self,
        model: ApiKeyAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> ApiKeyAuthenticator:
        """Build an ``ApiKeyAuthenticator`` from its declarative model.

        Exactly one of ``inject_into`` (preferred) or the deprecated ``header``
        option must be set; a bare ``header`` is translated into a header-type
        ``RequestOption``. When an explicit ``token_provider`` is supplied the
        model's ``api_token`` must be empty, since it would otherwise be ignored.

        Raises:
            ValueError: if both or neither of ``inject_into``/``header`` are set,
                or if ``api_token`` is non-empty while ``token_provider`` is given.
        """
        if model.inject_into is None and model.header is None:
            raise ValueError(
                "Expected either inject_into or header to be set for ApiKeyAuthenticator"
            )

        if model.inject_into is not None and model.header is not None:
            raise ValueError(
                "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option"
            )

        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )

        # Prefer the explicit inject_into component; fall back to a header-type
        # RequestOption built from the deprecated `header` field.
        request_option = (
            self._create_component_from_model(
                model.inject_into, config, parameters=model.parameters or {}
            )
            if model.inject_into
            else RequestOption(
                inject_into=RequestOptionType.header,
                field_name=model.header or "",
                parameters=model.parameters or {},
            )
        )

        return ApiKeyAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            request_option=request_option,
            config=config,
            parameters=model.parameters or {},
        )

    def create_legacy_to_per_partition_state_migration(
        self,
        model: LegacyToPerPartitionStateMigrationModel,
        config: Mapping[str, Any],
        declarative_stream: DeclarativeStreamModel,
    ) -> LegacyToPerPartitionStateMigration:
        """Create a ``LegacyToPerPartitionStateMigration`` for a declarative stream.

        Validates that the stream uses a Simple/Async retriever with a substream
        (or custom) partition router exposing ``parent_stream_configs``, and that
        the stream has an ``incremental_sync`` configuration.

        Raises:
            ValueError: if any of the structural preconditions above fail.
        """
        retriever = declarative_stream.retriever
        if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
            )
        partition_router = retriever.partition_router
        if not isinstance(
            partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
        ):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
            )
        if not hasattr(partition_router, "parent_stream_configs"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
            )

        if not hasattr(declarative_stream, "incremental_sync"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
            )

        return LegacyToPerPartitionStateMigration(
            partition_router,  # type: ignore # was already checked above
            declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
            config,
            declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
        )

    def create_session_token_authenticator(
        self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
    ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
        """Build an authenticator that first logs in to obtain a session token.

        The token is fetched through ``login_requester`` and cached by a
        ``SessionTokenProvider`` until ``expiration_duration`` elapses (no expiry
        when unset). Depending on ``request_authentication.type`` the session
        token is then sent as a Bearer header, or injected like an API key with
        an optional ``api_token`` template wrapping ``{{ session_token }}``.
        """
        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        login_requester = self._create_component_from_model(
            model=model.login_requester,
            config=config,
            name=f"{name}_login_requester",
            decoder=decoder,
        )
        token_provider = SessionTokenProvider(
            login_requester=login_requester,
            session_token_path=model.session_token_path,
            expiration_duration=parse_duration(model.expiration_duration)
            if model.expiration_duration
            else None,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            decoder=decoder,
        )
        if model.request_authentication.type == "Bearer":
            return ModelToComponentFactory.create_bearer_authenticator(
                BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
                config,
                token_provider=token_provider,
            )
        else:
            # Get the api_token template if specified, default to just the session token
            api_token_template = (
                getattr(model.request_authentication, "api_token", None) or "{{ session_token }}"
            )
            final_token_provider: TokenProvider = InterpolatedSessionTokenProvider(
                config=config,
                api_token=api_token_template,
                session_token_provider=token_provider,
                parameters=model.parameters or {},
            )
            return self.create_api_key_authenticator(
                ApiKeyAuthenticatorModel(
                    type="ApiKeyAuthenticator",
                    api_token="",
                    inject_into=model.request_authentication.inject_into,
                ),  # type: ignore # $parameters and headers default to None
                config=config,
                token_provider=final_token_provider,
            )

    @staticmethod
    def create_basic_http_authenticator(
        model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
    ) -> BasicHttpAuthenticator:
        """Build a ``BasicHttpAuthenticator``; the password defaults to an empty string."""
        return BasicHttpAuthenticator(
            password=model.password or "",
            username=model.username,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_bearer_authenticator(
        model: BearerAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> BearerAuthenticator:
        """Build a ``BearerAuthenticator``, optionally backed by an external token provider.

        Raises:
            ValueError: if ``token_provider`` is given while the model's
                ``api_token`` is non-empty (the token would be ignored).
        """
        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
1229 ) 1230 return BearerAuthenticator( 1231 token_provider=( 1232 token_provider 1233 if token_provider is not None 1234 else InterpolatedStringTokenProvider( 1235 api_token=model.api_token or "", 1236 config=config, 1237 parameters=model.parameters or {}, 1238 ) 1239 ), 1240 config=config, 1241 parameters=model.parameters or {}, 1242 ) 1243 1244 @staticmethod 1245 def create_dynamic_stream_check_config( 1246 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1247 ) -> DynamicStreamCheckConfig: 1248 return DynamicStreamCheckConfig( 1249 dynamic_stream_name=model.dynamic_stream_name, 1250 stream_count=model.stream_count, 1251 ) 1252 1253 def create_check_stream( 1254 self, model: CheckStreamModel, config: Config, **kwargs: Any 1255 ) -> CheckStream: 1256 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1257 raise ValueError( 1258 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1259 ) 1260 1261 dynamic_streams_check_configs = ( 1262 [ 1263 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1264 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1265 ] 1266 if model.dynamic_streams_check_configs 1267 else [] 1268 ) 1269 1270 return CheckStream( 1271 stream_names=model.stream_names or [], 1272 dynamic_streams_check_configs=dynamic_streams_check_configs, 1273 parameters={}, 1274 ) 1275 1276 @staticmethod 1277 def create_check_dynamic_stream( 1278 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1279 ) -> CheckDynamicStream: 1280 assert model.use_check_availability is not None # for mypy 1281 1282 use_check_availability = model.use_check_availability 1283 1284 return CheckDynamicStream( 1285 stream_count=model.stream_count, 1286 use_check_availability=use_check_availability, 1287 parameters={}, 1288 ) 1289 1290 def create_composite_error_handler( 1291 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: 
Any 1292 ) -> CompositeErrorHandler: 1293 error_handlers = [ 1294 self._create_component_from_model(model=error_handler_model, config=config) 1295 for error_handler_model in model.error_handlers 1296 ] 1297 return CompositeErrorHandler( 1298 error_handlers=error_handlers, parameters=model.parameters or {} 1299 ) 1300 1301 @staticmethod 1302 def create_concurrency_level( 1303 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1304 ) -> ConcurrencyLevel: 1305 return ConcurrencyLevel( 1306 default_concurrency=model.default_concurrency, 1307 max_concurrency=model.max_concurrency, 1308 config=config, 1309 parameters={}, 1310 ) 1311 1312 @staticmethod 1313 def apply_stream_state_migrations( 1314 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1315 ) -> MutableMapping[str, Any]: 1316 if stream_state_migrations: 1317 for state_migration in stream_state_migrations: 1318 if state_migration.should_migrate(stream_state): 1319 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
                    stream_state = dict(state_migration.migrate(stream_state))
        return stream_state

    def create_concurrent_cursor_from_datetime_based_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: MutableMapping[str, Any],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        runtime_lookback_window: Optional[datetime.timedelta] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        """Translate a ``DatetimeBasedCursor`` component definition into a ``ConcurrentCursor``.

        Interpolates the cursor field, slice boundary fields, lookback window,
        step and clamping target against ``config``, and optionally shifts the
        incoming ``stream_state`` back by ``runtime_lookback_window`` (used to
        re-cover failed partitions). NOTE: the incoming ``component_definition``
        and ``stream_state`` mappings are mutated in place.

        Raises:
            ValueError: if the definition's type does not match ``model_type``,
                if exactly one of step/cursor_granularity is set, or if the
                clamping target is not DAY/WEEK/MONTH.
        """
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
        # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
        # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        model_parameters = datetime_based_cursor_model.parameters or {}

        # A catalog-defined cursor field (when allowed) takes precedence over the
        # cursor field declared in the manifest.
        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                datetime_based_cursor_model.cursor_field,
                parameters=model_parameters,
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        interpolated_partition_field_start = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_start or "start_time",
            parameters=model_parameters,
        )
        interpolated_partition_field_end = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_end or "end_time",
            parameters=model_parameters,
        )

        slice_boundary_fields = (
            interpolated_partition_field_start.eval(config=config),
            interpolated_partition_field_end.eval(config=config),
        )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        # lookback_window may be an interpolated ISO-8601 duration string; it only
        # applies when it evaluates to a non-empty value.
        lookback_window = None
        interpolated_lookback_window = (
            InterpolatedString.create(
                datetime_based_cursor_model.lookback_window,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.lookback_window
            else None
        )
        if interpolated_lookback_window:
            evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
            if evaluated_lookback_window:
                lookback_window = parse_duration(evaluated_lookback_window)

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Adjusts the stream state by applying the runtime lookback window.
        # This is used to ensure correct state handling in case of failed partitions.
        stream_state_value = stream_state.get(cursor_field.cursor_field_key)
        if runtime_lookback_window and stream_state_value:
            new_stream_state = (
                connector_state_converter.parse_timestamp(stream_state_value)
                - runtime_lookback_window
            )
            stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
                new_stream_state
            )

        start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
        if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
            start_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.start_datetime, config=config
            )
        else:
            start_date_runtime_value = datetime_based_cursor_model.start_datetime

        end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]]
        if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel):
            end_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.end_datetime, config=config
            )
        else:
            end_date_runtime_value = datetime_based_cursor_model.end_datetime

        interpolated_start_date = MinMaxDatetime.create(
            interpolated_string_or_min_max_datetime=start_date_runtime_value,
            parameters=datetime_based_cursor_model.parameters,
        )
        interpolated_end_date = (
            None
            if not end_date_runtime_value
            else MinMaxDatetime.create(
                end_date_runtime_value, datetime_based_cursor_model.parameters
            )
        )

        # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
        if not interpolated_start_date.datetime_format:
            interpolated_start_date.datetime_format = datetime_format
        if interpolated_end_date and not interpolated_end_date.datetime_format:
            interpolated_end_date.datetime_format = datetime_format

        start_date = interpolated_start_date.get_datetime(config=config)
        # Without an explicit end datetime, the state converter supplies the end
        # provider (evaluated lazily at sync time).
        end_date_provider = (
            partial(interpolated_end_date.get_datetime, config)
            if interpolated_end_date
            else connector_state_converter.get_end_provider()
        )

        if (
            datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity
        ) or (
            not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity
        ):
            raise ValueError(
                f"If step is defined, cursor_granularity should be as well and vice-versa. "
                f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`"
            )

        # When step is not defined, default to a step size from the starting date to the present moment
        step_length = datetime.timedelta.max
        interpolated_step = (
            InterpolatedString.create(
                datetime_based_cursor_model.step,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.step
            else None
        )
        if interpolated_step:
            evaluated_step = interpolated_step.eval(config)
            if evaluated_step:
                step_length = parse_duration(evaluated_step)

        clamping_strategy: ClampingStrategy = NoClamping()
        if datetime_based_cursor_model.clamping:
            # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
            # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime
            # object which we want to keep agnostic of being low-code
            target = InterpolatedString(
                string=datetime_based_cursor_model.clamping.target,
                parameters=model_parameters,
            )
            evaluated_target = target.eval(config=config)
            match evaluated_target:
                case "DAY":
                    clamping_strategy = DayClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        DayClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(seconds=1),
                    )
                case "WEEK":
                    if (
                        not datetime_based_cursor_model.clamping.target_details
                        or "weekday" not in datetime_based_cursor_model.clamping.target_details
                    ):
                        raise ValueError(
                            "Given WEEK clamping, weekday needs to be provided as target_details"
                        )
                    weekday = self._assemble_weekday(
                        datetime_based_cursor_model.clamping.target_details["weekday"]
                    )
                    clamping_strategy = WeekClampingStrategy(weekday)
                    end_date_provider = ClampingEndProvider(
                        WeekClampingStrategy(weekday, is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case "MONTH":
                    clamping_strategy = MonthClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        MonthClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case _:
                    raise ValueError(
                        f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
                    )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=slice_boundary_fields,
            start=start_date,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            end_provider=end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            lookback_window=lookback_window,
            slice_range=step_length,
            cursor_granularity=cursor_granularity,
            clamping_strategy=clamping_strategy,
        )

    def create_concurrent_cursor_from_incrementing_count_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: MutableMapping[str, Any],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        """Translate an ``IncrementingCountCursor`` definition into a ``ConcurrentCursor``.

        Raises:
            ValueError: if the definition's type does not match ``model_type``.
        """
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        incrementing_count_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(incrementing_count_cursor_model,
                          IncrementingCountCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}"
            )

        start_value: Union[int, str, None] = incrementing_count_cursor_model.start_value
        # Pydantic Union type coercion can convert int 0 to string '0' depending on Union order.
        # We need to handle both int and str representations of numeric values.
        # Evaluate the InterpolatedString and convert to int for the ConcurrentCursor.
        if start_value is not None:
            interpolated_start_value = InterpolatedString.create(
                str(start_value),  # Ensure we pass a string to InterpolatedString.create
                parameters=incrementing_count_cursor_model.parameters or {},
            )
            evaluated_start_value: int = int(interpolated_start_value.eval(config=config))
        else:
            evaluated_start_value = 0

        # A catalog-defined cursor field (when allowed) takes precedence over the
        # cursor field declared in the manifest.
        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                incrementing_count_cursor_model.cursor_field,
                parameters=incrementing_count_cursor_model.parameters or {},
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        connector_state_converter = IncrementingCountStreamStateConverter(
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
        )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=None,
            start=evaluated_start_value,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            end_provider=connector_state_converter.get_end_provider(),  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        )

    def _assemble_weekday(self, weekday: str) -> Weekday:
        """Translate an uppercase weekday name into a ``Weekday`` enum member.

        Raises:
            ValueError: for any name other than MONDAY..SUNDAY.
        """
        match weekday:
            case "MONDAY":
                return Weekday.MONDAY
            case "TUESDAY":
                return Weekday.TUESDAY
            case "WEDNESDAY":
                return Weekday.WEDNESDAY
            case "THURSDAY":
                return Weekday.THURSDAY
            case "FRIDAY":
                return Weekday.FRIDAY
            case "SATURDAY":
                return Weekday.SATURDAY
            case "SUNDAY":
                return Weekday.SUNDAY
            case _:
                raise ValueError(f"Unknown weekday {weekday}")

    def create_concurrent_cursor_from_perpartition_cursor(
        self,
        state_manager: ConnectorStateManager,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        config: Config,
        stream_state: MutableMapping[str, Any],
        partition_router: PartitionRouter,
        attempt_to_create_cursor_if_not_provided: bool = False,
        **kwargs: Any,
    ) -> ConcurrentPerPartitionCursor:
        """Build a ``ConcurrentPerPartitionCursor`` from a ``DatetimeBasedCursor`` definition.

        Wraps ``create_concurrent_cursor_from_datetime_based_cursor`` in a
        ``ConcurrentCursorFactory`` so a datetime cursor can be instantiated per
        partition produced by ``partition_router``.

        Raises:
            ValueError: if the definition's type does not match ``model_type``.
        """
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model.
        # This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
        # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
        # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        # A catalog-defined cursor field (when allowed) takes precedence over the
        # cursor field declared in the manifest.
        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                datetime_based_cursor_model.cursor_field,
                # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
                # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
                # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
                # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
                parameters=datetime_based_cursor_model.parameters or {},
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Create the cursor factory
        cursor_factory = ConcurrentCursorFactory(
            partial(
                self.create_concurrent_cursor_from_datetime_based_cursor,
                state_manager=state_manager,
                model_type=model_type,
                component_definition=component_definition,
                stream_name=stream_name,
                stream_namespace=stream_namespace,
                config=config,
                message_repository=NoopMessageRepository(),
            )
        )

        # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
        use_global_cursor = isinstance(
            partition_router, GroupingPartitionRouter
        ) or component_definition.get("global_substream_cursor", False)

        # Return the concurrent cursor and state converter
        return ConcurrentPerPartitionCursor(
            cursor_factory=cursor_factory,
            partition_router=partition_router,
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=self._message_repository,  # type: ignore
            connector_state_manager=state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            use_global_cursor=use_global_cursor,
            attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided,
        )

    @staticmethod
    def create_constant_backoff_strategy(
        model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
    ) -> ConstantBackoffStrategy:
        """Build a ``ConstantBackoffStrategy`` with a fixed wait between retries."""
        return ConstantBackoffStrategy(
            backoff_time_in_seconds=model.backoff_time_in_seconds,
            config=config,
            parameters=model.parameters or {},
        )

    def create_cursor_pagination(
        self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any
    ) -> CursorPaginationStrategy:
        """Build a ``CursorPaginationStrategy`` using a pagination-wrapped decoder.

        Raises:
            ValueError: if the underlying decoder type is unsupported for pagination.
        """
        # Unwrap an already-decorated decoder for the support check; otherwise
        # wrap the raw decoder for pagination use.
        if isinstance(decoder, PaginationDecoderDecorator):
            inner_decoder = decoder.decoder
        else:
            inner_decoder = decoder
            decoder = PaginationDecoderDecorator(decoder=decoder)

        if self._is_supported_decoder_for_pagination(inner_decoder):
            decoder_to_use = decoder
        else:
            raise ValueError(
                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
            )

        # Pydantic v1 Union type coercion can convert int to string depending on Union order.
        # If page_size is a string that represents an integer (not an interpolation), convert it back.
        page_size = model.page_size
        if isinstance(page_size, str) and page_size.isdigit():
            page_size = int(page_size)

        return CursorPaginationStrategy(
            cursor_value=model.cursor_value,
            decoder=decoder_to_use,
            page_size=page_size,
            stop_condition=model.stop_condition,
            config=config,
            parameters=model.parameters or {},
        )

    def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any:
        """
        Generically creates a custom component based on the model type and a class_name reference to the custom Python class being
        instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor

        :param model: The Pydantic model of the custom component being created
        :param config: The custom defined connector config
        :return: The declarative component built from the Pydantic model to be used at runtime
        """
        custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name)
        component_fields = get_type_hints(custom_component_class)
        model_args = model.dict()
        model_args["config"] = config

        # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions
        # we defer to these arguments over the component's definition
        for key, arg in kwargs.items():
            model_args[key] = arg

        # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not
        # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to
        # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components
        for model_field, model_value in model_args.items():
            # If a custom component field doesn't have a type set, we try to use the type hints to infer the type
            if (
                isinstance(model_value, dict)
                and "type" not in model_value
                and model_field in component_fields
            ):
                derived_type = self._derive_component_type_from_type_hints(
                    component_fields.get(model_field)
                )
                if derived_type:
                    model_value["type"] = derived_type

            if self._is_component(model_value):
                model_args[model_field] = self._create_nested_component(
                    model,
                    model_field,
                    model_value,
                    config,
                    **kwargs,
                )
            elif isinstance(model_value, list):
                # Same inference + nested-component conversion, applied element-wise.
                vals = []
                for v in model_value:
                    if isinstance(v, dict) and "type" not in v and model_field in component_fields:
                        derived_type = self._derive_component_type_from_type_hints(
                            component_fields.get(model_field)
                        )
                        if derived_type:
                            v["type"] = derived_type
                    if self._is_component(v):
                        vals.append(
                            self._create_nested_component(
                                model,
                                model_field,
                                v,
                                config,
                                **kwargs,
                            )
                        )
                    else:
                        vals.append(v)
                model_args[model_field] = vals

        # Only pass arguments that the custom class actually declares.
        kwargs = {
            class_field: model_args[class_field]
            for class_field in component_fields.keys()
            if class_field in model_args
        }
        return custom_component_class(**kwargs)

    @staticmethod
    def _get_class_from_fully_qualified_class_name(
        full_qualified_class_name: str,
    ) -> Any:
        """Get a class from its fully qualified name.

        If a custom components module is needed, we assume it is already registered - probably
        as `source_declarative_manifest.components` or `components`.
1873 1874 Args: 1875 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1876 1877 Returns: 1878 Any: The class object. 1879 1880 Raises: 1881 ValueError: If the class cannot be loaded. 1882 """ 1883 split = full_qualified_class_name.split(".") 1884 module_name_full = ".".join(split[:-1]) 1885 class_name = split[-1] 1886 1887 try: 1888 module_ref = importlib.import_module(module_name_full) 1889 except ModuleNotFoundError as e: 1890 if split[0] == "source_declarative_manifest": 1891 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1892 try: 1893 import os 1894 1895 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1896 module_ref = importlib.import_module( 1897 module_name_with_source_declarative_manifest 1898 ) 1899 except ModuleNotFoundError: 1900 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1901 else: 1902 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1903 1904 try: 1905 return getattr(module_ref, class_name) 1906 except AttributeError as e: 1907 raise ValueError( 1908 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1909 ) from e 1910 1911 @staticmethod 1912 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1913 interface = field_type 1914 while True: 1915 origin = get_origin(interface) 1916 if origin: 1917 # Unnest types until we reach the raw type 1918 # List[T] -> T 1919 # Optional[List[T]] -> T 1920 args = get_args(interface) 1921 interface = args[0] 1922 else: 1923 break 1924 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1925 return interface.__name__ 1926 return None 1927 1928 @staticmethod 1929 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1930 if not cls: 1931 return 
False 1932 return cls.__module__ == "builtins" 1933 1934 @staticmethod 1935 def _extract_missing_parameters(error: TypeError) -> List[str]: 1936 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1937 if parameter_search: 1938 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1939 else: 1940 return [] 1941 1942 def _create_nested_component( 1943 self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any 1944 ) -> Any: 1945 type_name = model_value.get("type", None) 1946 if not type_name: 1947 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1948 return model_value 1949 1950 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1951 if model_type: 1952 parsed_model = model_type.parse_obj(model_value) 1953 try: 1954 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1955 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1956 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1957 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1958 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1959 # are needed by a component and could not be shared. 
                model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__)
                constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs
                model_parameters = model_value.get("$parameters", {})
                # Only forward $parameters / kwargs that the constructor declares as keyword-only.
                matching_parameters = {
                    kwarg: model_parameters[kwarg]
                    for kwarg in constructor_kwargs
                    if kwarg in model_parameters
                }
                matching_kwargs = {
                    kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs
                }
                return self._create_component_from_model(
                    model=parsed_model, config=config, **(matching_parameters | matching_kwargs)
                )
            except TypeError as error:
                missing_parameters = self._extract_missing_parameters(error)
                if missing_parameters:
                    raise ValueError(
                        f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide "
                        + ", ".join(
                            (
                                f"{type_name}.$parameters.{parameter}"
                                for parameter in missing_parameters
                            )
                        )
                    )
                raise TypeError(
                    f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}"
                )
        else:
            raise ValueError(
                f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'"
            )

    @staticmethod
    def _is_component(model_value: Any) -> bool:
        # A component is any dict carrying an explicit "type" discriminator.
        return isinstance(model_value, dict) and model_value.get("type") is not None

    def create_default_stream(
        self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any
    ) -> AbstractStream:
        """Assemble a concurrent DefaultStream from a declarative stream model.

        Builds, in order: state migration, partition router, concurrent cursor,
        request-options provider (datetime- or incrementing-count-based),
        transformations, retriever, and schema loader.
        """
        primary_key = model.primary_key.__root__ if model.primary_key else None
        self._migrate_state(model, config)

        partition_router = self._build_stream_slicer_from_partition_router(
            model.retriever,
            config,
            stream_name=model.name,
            **kwargs,
        )
        concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
        if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
            cursor_model: DatetimeBasedCursorModel = model.incremental_sync

            end_time_option = (
                self._create_component_from_model(
                    cursor_model.end_time_option, config, parameters=cursor_model.parameters or {}
                )
                if cursor_model.end_time_option
                else None
            )
            start_time_option = (
                self._create_component_from_model(
                    cursor_model.start_time_option, config, parameters=cursor_model.parameters or {}
                )
                if cursor_model.start_time_option
                else None
            )

            datetime_request_options_provider = DatetimeBasedRequestOptionsProvider(
                start_time_option=start_time_option,
                end_time_option=end_time_option,
                partition_field_start=cursor_model.partition_field_start,
                partition_field_end=cursor_model.partition_field_end,
                config=config,
                parameters=model.parameters or {},
            )
            # Per-partition cursors need per-partition request options; otherwise use the datetime provider directly.
            request_options_provider = (
                datetime_request_options_provider
                if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor)
                else PerPartitionRequestOptionsProvider(
                    partition_router, datetime_request_options_provider
                )
            )
        elif model.incremental_sync and isinstance(
            model.incremental_sync, IncrementingCountCursorModel
        ):
            if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor):
                raise ValueError(
                    "PerPartition does not support per partition states because switching to global state is time based"
                )

            cursor_model: IncrementingCountCursorModel = model.incremental_sync  # type: ignore

            start_time_option = (
                self._create_component_from_model(
                    cursor_model.start_value_option,  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
                    config,
                    parameters=cursor_model.parameters or {},
                )
                if cursor_model.start_value_option  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
                else None
            )

            # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but
            # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time
            partition_field_start = "start"

            request_options_provider = DatetimeBasedRequestOptionsProvider(
                start_time_option=start_time_option,
                partition_field_start=partition_field_start,
                config=config,
                parameters=model.parameters or {},
            )
        else:
            request_options_provider = None

        transformations = []
        if model.transformations:
            for transformation_model in model.transformations:
                transformations.append(
                    self._create_component_from_model(model=transformation_model, config=config)
                )
        file_uploader = None
        if model.file_uploader:
            file_uploader = self._create_component_from_model(
                model=model.file_uploader, config=config
            )

        # With no real cursor (FinalStateCursor), slice purely by partitions.
        stream_slicer: ConcurrentStreamSlicer = (
            partition_router
            if isinstance(concurrent_cursor, FinalStateCursor)
            else concurrent_cursor
        )

        retriever = self._create_component_from_model(
            model=model.retriever,
            config=config,
            name=model.name,
            primary_key=primary_key,
            request_options_provider=request_options_provider,
            stream_slicer=stream_slicer,
            partition_router=partition_router,
            has_stop_condition_cursor=self._is_stop_condition_on_cursor(model),
            is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model),
            cursor=concurrent_cursor,
            transformations=transformations,
            file_uploader=file_uploader,
            incremental_sync=model.incremental_sync,
        )
        if isinstance(retriever, AsyncRetriever):
            stream_slicer = retriever.stream_slicer

        schema_loader: SchemaLoader
        if model.schema_loader and isinstance(model.schema_loader, list):
            nested_schema_loaders = [
                self._create_component_from_model(model=nested_schema_loader, config=config)
                for nested_schema_loader in model.schema_loader
            ]
            schema_loader = CompositeSchemaLoader(
                schema_loaders=nested_schema_loaders, parameters={}
            )
        elif model.schema_loader:
            schema_loader = self._create_component_from_model(
                model=model.schema_loader,  # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models
                config=config,
            )
        else:
            options = model.parameters or {}
            if "name" not in options:
                options["name"] = model.name
            schema_loader = DefaultSchemaLoader(config=config, parameters=options)
        # Cache the resolved schema so repeated get_json_schema calls are cheap.
        schema_loader = CachingSchemaLoaderDecorator(schema_loader)

        stream_name = model.name or ""
        return DefaultStream(
            partition_generator=StreamSlicerPartitionGenerator(
                DeclarativePartitionFactory(
                    stream_name,
                    schema_loader,
                    retriever,
                    self._message_repository,
                ),
                stream_slicer,
                slice_limit=self._limit_slices_fetched,
            ),
            name=stream_name,
            json_schema=schema_loader.get_json_schema,
            primary_key=get_primary_key_from_stream(primary_key),
            cursor_field=(
                concurrent_cursor.cursor_field
                if hasattr(concurrent_cursor, "cursor_field")
                else None
            ),
            logger=logging.getLogger(f"airbyte.{stream_name}"),
            cursor=concurrent_cursor,
            supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader),
        )

    def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None:
        """Apply configured state migrations and write the result back to the state manager."""
        stream_name = model.name or ""
        stream_state = self._connector_state_manager.get_stream_state(
            stream_name=stream_name, namespace=None
        )
        if model.state_migrations:
            state_transformations = [
                self._create_component_from_model(state_migration, config, declarative_stream=model)
                for state_migration in model.state_migrations
            ]
        else:
            state_transformations = []
        stream_state = self.apply_stream_state_migrations(state_transformations, stream_state)
        self._connector_state_manager.update_state_for_stream(
            stream_name=stream_name, namespace=None, value=stream_state
        )

    def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool:
        """True when the incremental sync is flagged as a data feed (cursor stops pagination)."""
        return bool(
            model.incremental_sync
            and hasattr(model.incremental_sync, "is_data_feed")
            and model.incremental_sync.is_data_feed
        )

    def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool:
        """True when records must be filtered client-side against the cursor."""
        return bool(
            model.incremental_sync
            and hasattr(model.incremental_sync, "is_client_side_incremental")
            and model.incremental_sync.is_client_side_incremental
        )

    def _build_stream_slicer_from_partition_router(
        self,
        model: Union[
            AsyncRetrieverModel,
            CustomRetrieverModel,
            SimpleRetrieverModel,
        ],
        config: Config,
        stream_name: Optional[str] = None,
        **kwargs: Any,
    ) -> PartitionRouter:
        """Build the PartitionRouter declared on a retriever model.

        Handles a list of routers (cartesian product), a raw dict (custom retriever,
        not yet parsed into a model), or a single parsed model. Falls back to
        SinglePartitionRouter when none is declared.
        """
        if (
            hasattr(model, "partition_router")
            and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel))
            and model.partition_router
        ):
            stream_slicer_model = model.partition_router
            if isinstance(stream_slicer_model, list):
                return CartesianProductStreamSlicer(
                    [
                        self._create_component_from_model(
                            model=slicer, config=config, stream_name=stream_name or ""
                        )
                        for slicer in stream_slicer_model
                    ],
                    parameters={},
                )
            elif isinstance(stream_slicer_model, dict):
                # partition router comes from CustomRetrieverModel therefore has not been parsed as a model
                params = stream_slicer_model.get("$parameters")
                if not isinstance(params, dict):
                    params = {}
                    stream_slicer_model["$parameters"] = params

                if stream_name is not None:
                    params["stream_name"] = stream_name

                return self._create_nested_component(  # type: ignore[no-any-return] # There is no guarantee that this will return a stream slicer. If not, we expect an AttributeError during the call to `stream_slices`
                    model,
                    "partition_router",
                    stream_slicer_model,
                    config,
                    **kwargs,
                )
            else:
                return self._create_component_from_model(  # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router
                    model=stream_slicer_model, config=config, stream_name=stream_name or ""
                )
        return SinglePartitionRouter(parameters={})

    def _build_concurrent_cursor(
        self,
        model: DeclarativeStreamModel,
        stream_slicer: Optional[PartitionRouter],
        config: Config,
    ) -> Cursor:
        """Build the concurrent cursor for a stream, per-partition when a real router exists."""
        stream_name = model.name or ""
        stream_state = self._connector_state_manager.get_stream_state(stream_name, None)

        if (
            model.incremental_sync
            and stream_slicer
            and not isinstance(stream_slicer, SinglePartitionRouter)
        ):
            if isinstance(model.incremental_sync, IncrementingCountCursorModel):
                # We don't currently support usage of partition routing and IncrementingCountCursor at the
                # same time because we didn't solve for design questions like what the lookback window would
                # be as well as global cursor fall backs. We have not seen customers that have needed both
                # at the same time yet and are currently punting on this until we need to solve it.
                raise ValueError(
                    f"The low-code framework does not currently support usage of a PartitionRouter and an IncrementingCountCursor at the same time. Please specify only one of these options for stream {stream_name}."
                )
            return self.create_concurrent_cursor_from_perpartition_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
                state_manager=self._connector_state_manager,
                model_type=DatetimeBasedCursorModel,
                component_definition=model.incremental_sync.__dict__,
                stream_name=stream_name,
                stream_state=stream_state,
                stream_namespace=None,
                config=config or {},
                partition_router=stream_slicer,
                attempt_to_create_cursor_if_not_provided=True,  # FIXME can we remove that now?
            )
        elif model.incremental_sync:
            # NOTE(review): exact-type checks (`type(...) ==`) rather than isinstance —
            # presumably intentional to discriminate the concrete model classes; confirm before changing.
            if type(model.incremental_sync) == IncrementingCountCursorModel:
                return self.create_concurrent_cursor_from_incrementing_count_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
                    model_type=IncrementingCountCursorModel,
                    component_definition=model.incremental_sync.__dict__,
                    stream_name=stream_name,
                    stream_namespace=None,
                    stream_state=stream_state,
                    config=config or {},
                )
            elif type(model.incremental_sync) == DatetimeBasedCursorModel:
                return self.create_concurrent_cursor_from_datetime_based_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
                    model_type=type(model.incremental_sync),
                    component_definition=model.incremental_sync.__dict__,
                    stream_name=stream_name,
                    stream_namespace=None,
                    stream_state=stream_state,
                    config=config or {},
                    attempt_to_create_cursor_if_not_provided=True,
                )
            else:
                raise ValueError(
                    f"Incremental sync of type {type(model.incremental_sync)} is not supported"
                )
        # Full-refresh streams get a terminal no-op cursor.
        return FinalStateCursor(stream_name, None, self._message_repository)

    def create_default_error_handler(
        self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any
    ) -> DefaultErrorHandler:
        """Build a DefaultErrorHandler; a catch-all HttpResponseFilter is always appended."""
        backoff_strategies = []
        if model.backoff_strategies:
            for backoff_strategy_model in model.backoff_strategies:
                backoff_strategies.append(
                    self._create_component_from_model(model=backoff_strategy_model, config=config)
                )

        response_filters = []
        if model.response_filters:
            for response_filter_model in model.response_filters:
                response_filters.append(
                    self._create_component_from_model(model=response_filter_model, config=config)
                )
        response_filters.append(
            HttpResponseFilter(config=config, parameters=model.parameters or {})
        )

        return DefaultErrorHandler(
            backoff_strategies=backoff_strategies,
            max_retries=model.max_retries,
            response_filters=response_filters,
            config=config,
            parameters=model.parameters or {},
        )

    def create_default_paginator(
        self,
        model: DefaultPaginatorModel,
        config: Config,
        *,
        url_base: str,
        extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
        decoder: Optional[Decoder] = None,
        cursor_used_for_stop_condition: Optional[Cursor] = None,
    ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]:
        """Build a DefaultPaginator, optionally wrapped for test-read page limiting.

        Raises:
            ValueError: If the supplied decoder is not supported for pagination.
        """
        if decoder:
            if self._is_supported_decoder_for_pagination(decoder):
                decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
            else:
                raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder)))
        else:
            decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
        page_size_option = (
            self._create_component_from_model(model=model.page_size_option, config=config)
            if model.page_size_option
            else None
        )
        page_token_option = (
            self._create_component_from_model(model=model.page_token_option, config=config)
            if model.page_token_option
            else None
        )
        pagination_strategy = self._create_component_from_model(
            model=model.pagination_strategy,
            config=config,
            decoder=decoder_to_use,
            extractor_model=extractor_model,
        )
        if cursor_used_for_stop_condition:
            # Stop paginating once the cursor says records are out of range.
            pagination_strategy = StopConditionPaginationStrategyDecorator(
                pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition)
            )
        paginator = DefaultPaginator(
            decoder=decoder_to_use,
            page_size_option=page_size_option,
            page_token_option=page_token_option,
            pagination_strategy=pagination_strategy,
            url_base=url_base,
            config=config,
            parameters=model.parameters or {},
        )
        if self._limit_pages_fetched_per_slice:
            return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice)
        return paginator

    def create_dpath_extractor(
        self,
        model: DpathExtractorModel,
        config: Config,
        decoder: Optional[Decoder] = None,
        **kwargs: Any,
    ) -> DpathExtractor:
        """Build a DpathExtractor; defaults to a JSON decoder when none is supplied."""
        if decoder:
            decoder_to_use = decoder
        else:
            decoder_to_use = JsonDecoder(parameters={})
        model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path]

        record_expander = None
        if model.record_expander:
            record_expander = self._create_component_from_model(
                model=model.record_expander,
                config=config,
            )

        return DpathExtractor(
            decoder=decoder_to_use,
            field_path=model_field_path,
            config=config,
            parameters=model.parameters or {},
            record_expander=record_expander,
        )

    def create_record_expander(
        self,
        model: RecordExpanderModel,
        config: Config,
        **kwargs: Any,
    ) -> RecordExpander:
        """Build a RecordExpander; `on_no_records` defaults to skip."""
        return RecordExpander(
            expand_records_from_field=model.expand_records_from_field,
            config=config,
            parameters=model.parameters or {},
            remain_original_record=model.remain_original_record or False,
            on_no_records=OnNoRecords(model.on_no_records.value)
            if model.on_no_records
            else OnNoRecords.skip,
        )

    @staticmethod
    def create_response_to_file_extractor(
        model: ResponseToFileExtractorModel,
        **kwargs: Any,
    ) -> ResponseToFileExtractor:
        """Build a ResponseToFileExtractor from its model."""
        return ResponseToFileExtractor(parameters=model.parameters or {})

    @staticmethod
    def create_exponential_backoff_strategy(
        model: ExponentialBackoffStrategyModel, config: Config
    ) -> ExponentialBackoffStrategy:
        """Build an ExponentialBackoffStrategy; factor defaults to 5."""
        return ExponentialBackoffStrategy(
            factor=model.factor or 5, parameters=model.parameters or {}, config=config
        )
    @staticmethod
    def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey:
        """Build a GroupByKey merge strategy from its model."""
        return GroupByKey(model.key, config=config, parameters=model.parameters or {})

    def create_http_requester(
        self,
        model: HttpRequesterModel,
        config: Config,
        decoder: Decoder = JsonDecoder(parameters={}),
        query_properties_key: Optional[str] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Any,
        *,
        name: str,
    ) -> HttpRequester:
        """Build an HttpRequester with its authenticator, error handler and request options.

        Caching is enabled when either the model or the `use_cache` argument asks for it,
        unless caching is globally disabled on the factory.
        """
        authenticator = (
            self._create_component_from_model(
                model=model.authenticator,
                config=config,
                url_base=model.url or model.url_base,
                name=name,
                decoder=decoder,
            )
            if model.authenticator
            else None
        )
        error_handler = (
            self._create_component_from_model(model=model.error_handler, config=config)
            if model.error_handler
            else DefaultErrorHandler(
                backoff_strategies=[],
                response_filters=[],
                config=config,
                parameters=model.parameters or {},
            )
        )

        api_budget = self._api_budget

        request_options_provider = InterpolatedRequestOptionsProvider(
            request_body=model.request_body,
            request_body_data=model.request_body_data,
            request_body_json=model.request_body_json,
            request_headers=model.request_headers,
            request_parameters=model.request_parameters,  # type: ignore # QueryProperties have been removed in `create_simple_retriever`
            query_properties_key=query_properties_key,
            config=config,
            parameters=model.parameters or {},
        )

        assert model.use_cache is not None  # for mypy
        assert model.http_method is not None  # for mypy

        should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache

        return HttpRequester(
            name=name,
            url=model.url,
            url_base=model.url_base,
            path=model.path,
            authenticator=authenticator,
            error_handler=error_handler,
            api_budget=api_budget,
            http_method=HttpMethod[model.http_method.value],
            request_options_provider=request_options_provider,
            config=config,
            disable_retries=self._disable_retries,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            use_cache=should_use_cache,
            decoder=decoder,
            stream_response=decoder.is_stream_response() if decoder else False,
        )

    @staticmethod
    def create_http_response_filter(
        model: HttpResponseFilterModel, config: Config, **kwargs: Any
    ) -> HttpResponseFilter:
        """Build an HttpResponseFilter mapping status codes/messages to a response action."""
        if model.action:
            action = ResponseAction(model.action.value)
        else:
            action = None

        failure_type = FailureType(model.failure_type.value) if model.failure_type else None

        http_codes = (
            set(model.http_codes) if model.http_codes else set()
        )  # JSON schema notation has no set data type. The schema enforces an array of unique elements

        return HttpResponseFilter(
            action=action,
            failure_type=failure_type,
            error_message=model.error_message or "",
            error_message_contains=model.error_message_contains or "",
            http_codes=http_codes,
            predicate=model.predicate or "",
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_inline_schema_loader(
        model: InlineSchemaLoaderModel, config: Config, **kwargs: Any
    ) -> InlineSchemaLoader:
        """Build an InlineSchemaLoader from the schema embedded in the manifest."""
        return InlineSchemaLoader(schema=model.schema_ or {}, parameters={})

    def create_complex_field_type(
        self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any
    ) -> ComplexFieldType:
        """Build a ComplexFieldType, recursively resolving nested item types."""
        items = (
            self._create_component_from_model(model=model.items, config=config)
            if isinstance(model.items, ComplexFieldTypeModel)
            else model.items
        )

        return ComplexFieldType(field_type=model.field_type, items=items)

    def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap:
target_type = ( 2554 self._create_component_from_model(model=model.target_type, config=config) 2555 if isinstance(model.target_type, ComplexFieldTypeModel) 2556 else model.target_type 2557 ) 2558 2559 return TypesMap( 2560 target_type=target_type, 2561 current_type=model.current_type, 2562 condition=model.condition if model.condition is not None else "True", 2563 ) 2564 2565 def create_schema_type_identifier( 2566 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2567 ) -> SchemaTypeIdentifier: 2568 types_mapping = [] 2569 if model.types_mapping: 2570 types_mapping.extend( 2571 [ 2572 self._create_component_from_model(types_map, config=config) 2573 for types_map in model.types_mapping 2574 ] 2575 ) 2576 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2577 [x for x in model.schema_pointer] if model.schema_pointer else [] 2578 ) 2579 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2580 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2581 [x for x in model.type_pointer] if model.type_pointer else None 2582 ) 2583 2584 return SchemaTypeIdentifier( 2585 schema_pointer=model_schema_pointer, 2586 key_pointer=model_key_pointer, 2587 type_pointer=model_type_pointer, 2588 types_mapping=types_mapping, 2589 parameters=model.parameters or {}, 2590 ) 2591 2592 def create_dynamic_schema_loader( 2593 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2594 ) -> DynamicSchemaLoader: 2595 schema_transformations = [] 2596 if model.schema_transformations: 2597 for transformation_model in model.schema_transformations: 2598 schema_transformations.append( 2599 self._create_component_from_model(model=transformation_model, config=config) 2600 ) 2601 name = "dynamic_properties" 2602 retriever = self._create_component_from_model( 2603 model=model.retriever, 2604 config=config, 2605 name=name, 2606 primary_key=None, 2607 
partition_router=self._build_stream_slicer_from_partition_router( 2608 model.retriever, config 2609 ), 2610 transformations=[], 2611 use_cache=True, 2612 log_formatter=( 2613 lambda response: format_http_message( 2614 response, 2615 f"Schema loader '{name}' request", 2616 f"Request performed in order to extract schema.", 2617 name, 2618 is_auxiliary=True, 2619 ) 2620 ), 2621 ) 2622 schema_type_identifier = self._create_component_from_model( 2623 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2624 ) 2625 schema_filter = ( 2626 self._create_component_from_model( 2627 model.schema_filter, config=config, parameters=model.parameters or {} 2628 ) 2629 if model.schema_filter is not None 2630 else None 2631 ) 2632 2633 return DynamicSchemaLoader( 2634 retriever=retriever, 2635 config=config, 2636 schema_transformations=schema_transformations, 2637 schema_filter=schema_filter, 2638 schema_type_identifier=schema_type_identifier, 2639 parameters=model.parameters or {}, 2640 ) 2641 2642 @staticmethod 2643 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2644 return JsonDecoder(parameters={}) 2645 2646 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2647 return CompositeRawDecoder( 2648 parser=ModelToComponentFactory._get_parser(model, config), 2649 stream_response=False if self._emit_connector_builder_messages else True, 2650 ) 2651 2652 def create_jsonl_decoder( 2653 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2654 ) -> Decoder: 2655 return CompositeRawDecoder( 2656 parser=ModelToComponentFactory._get_parser(model, config), 2657 stream_response=False if self._emit_connector_builder_messages else True, 2658 ) 2659 2660 def create_gzip_decoder( 2661 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2662 ) -> Decoder: 2663 _compressed_response_types = { 2664 "gzip", 2665 "x-gzip", 2666 "gzip, deflate", 2667 "x-gzip, deflate", 
2668 "application/zip", 2669 "application/gzip", 2670 "application/x-gzip", 2671 "application/x-zip-compressed", 2672 } 2673 2674 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2675 2676 if self._emit_connector_builder_messages: 2677 # This is very surprising but if the response is not streamed, 2678 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2679 # which uses urllib3 directly and does not uncompress the data. 2680 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2681 2682 return CompositeRawDecoder.by_headers( 2683 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2684 stream_response=True, 2685 fallback_parser=gzip_parser.inner_parser, 2686 ) 2687 2688 @staticmethod 2689 def create_iterable_decoder( 2690 model: IterableDecoderModel, config: Config, **kwargs: Any 2691 ) -> IterableDecoder: 2692 return IterableDecoder(parameters={}) 2693 2694 @staticmethod 2695 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2696 return XmlDecoder(parameters={}) 2697 2698 def create_zipfile_decoder( 2699 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2700 ) -> ZipfileDecoder: 2701 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2702 2703 @staticmethod 2704 def _get_parser(model: BaseModel, config: Config) -> Parser: 2705 if isinstance(model, JsonDecoderModel): 2706 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2707 return JsonParser() 2708 elif isinstance(model, JsonlDecoderModel): 2709 return JsonLineParser() 2710 elif isinstance(model, CsvDecoderModel): 2711 return CsvParser( 2712 encoding=model.encoding, 2713 delimiter=model.delimiter, 2714 
set_values_to_none=model.set_values_to_none, 2715 ) 2716 elif isinstance(model, GzipDecoderModel): 2717 return GzipParser( 2718 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2719 ) 2720 elif isinstance( 2721 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2722 ): 2723 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2724 2725 raise ValueError(f"Unknown decoder type {model}") 2726 2727 @staticmethod 2728 def create_json_file_schema_loader( 2729 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2730 ) -> JsonFileSchemaLoader: 2731 return JsonFileSchemaLoader( 2732 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2733 ) 2734 2735 def create_jwt_authenticator( 2736 self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2737 ) -> JwtAuthenticator: 2738 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2739 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2740 request_option = ( 2741 self._create_component_from_model(model.request_option, config) 2742 if model.request_option 2743 else None 2744 ) 2745 return JwtAuthenticator( 2746 config=config, 2747 parameters=model.parameters or {}, 2748 algorithm=JwtAlgorithm(model.algorithm.value), 2749 secret_key=model.secret_key, 2750 base64_encode_secret_key=model.base64_encode_secret_key, 2751 token_duration=model.token_duration, 2752 header_prefix=model.header_prefix, 2753 kid=jwt_headers.kid, 2754 typ=jwt_headers.typ, 2755 cty=jwt_headers.cty, 2756 iss=jwt_payload.iss, 2757 sub=jwt_payload.sub, 2758 aud=jwt_payload.aud, 2759 additional_jwt_headers=model.additional_jwt_headers, 2760 additional_jwt_payload=model.additional_jwt_payload, 2761 passphrase=model.passphrase, 2762 request_option=request_option, 2763 ) 2764 2765 def create_list_partition_router( 2766 self, model: ListPartitionRouterModel, config: Config, 
        **kwargs: Any
    ) -> ListPartitionRouter:
        """Build a ListPartitionRouter that slices over a static list of values."""
        request_option = (
            self._create_component_from_model(model.request_option, config)
            if model.request_option
            else None
        )
        return ListPartitionRouter(
            cursor_field=model.cursor_field,
            request_option=request_option,
            values=model.values,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_min_max_datetime(
        model: MinMaxDatetimeModel, config: Config, **kwargs: Any
    ) -> MinMaxDatetime:
        """Build a MinMaxDatetime clamping helper from its model."""
        return MinMaxDatetime(
            datetime=model.datetime,
            datetime_format=model.datetime_format or "",
            max_datetime=model.max_datetime or "",
            min_datetime=model.min_datetime or "",
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth:
        """Build the no-op authenticator."""
        return NoAuth(parameters=model.parameters or {})

    @staticmethod
    def create_no_pagination(
        model: NoPaginationModel, config: Config, **kwargs: Any
    ) -> NoPagination:
        """Build the no-op paginator."""
        return NoPagination(parameters={})

    def create_oauth_authenticator(
        self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any
    ) -> DeclarativeOauth2Authenticator:
        """Build an OAuth2 authenticator.

        Returns the single-use refresh-token variant when a refresh_token_updater is
        configured on the model, otherwise the standard declarative OAuth2 authenticator.
        """
        profile_assertion = (
            self._create_component_from_model(model.profile_assertion, config=config)
            if model.profile_assertion
            else None
        )

        refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values = (
            self._get_refresh_token_error_information(model)
        )
        if model.refresh_token_updater:
            # ignore type error because fixing it would have a lot of dependencies, revisit later
            return DeclarativeSingleUseRefreshTokenOauth2Authenticator(  # type: ignore
                config,
                InterpolatedString.create(
                    model.token_refresh_endpoint,  # type: ignore
                    parameters=model.parameters or {},
                ).eval(config),
                access_token_name=InterpolatedString.create(
                    model.access_token_name or "access_token", parameters=model.parameters or {}
                ).eval(config),
                refresh_token_name=model.refresh_token_updater.refresh_token_name,
                expires_in_name=InterpolatedString.create(
                    model.expires_in_name or "expires_in", parameters=model.parameters or {}
                ).eval(config),
                client_id_name=InterpolatedString.create(
                    model.client_id_name or "client_id", parameters=model.parameters or {}
                ).eval(config),
                client_id=InterpolatedString.create(
                    model.client_id, parameters=model.parameters or {}
                ).eval(config)
                if model.client_id
                else model.client_id,
                client_secret_name=InterpolatedString.create(
                    model.client_secret_name or "client_secret", parameters=model.parameters or {}
                ).eval(config),
                client_secret=InterpolatedString.create(
                    model.client_secret, parameters=model.parameters or {}
                ).eval(config)
                if model.client_secret
                else model.client_secret,
                access_token_config_path=model.refresh_token_updater.access_token_config_path,
                refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path,
                token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path,
                grant_type_name=InterpolatedString.create(
                    model.grant_type_name or "grant_type", parameters=model.parameters or {}
                ).eval(config),
                grant_type=InterpolatedString.create(
                    model.grant_type or "refresh_token", parameters=model.parameters or {}
                ).eval(config),
                refresh_request_body=InterpolatedMapping(
                    model.refresh_request_body or {}, parameters=model.parameters or {}
                ).eval(config),
                refresh_request_headers=InterpolatedMapping(
                    model.refresh_request_headers or {}, parameters=model.parameters or {}
                ).eval(config),
                scopes=model.scopes,
                token_expiry_date_format=model.token_expiry_date_format,
                token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
                message_repository=self._message_repository,
                refresh_token_error_status_codes=refresh_token_error_status_codes,
                refresh_token_error_key=refresh_token_error_key,
                refresh_token_error_values=refresh_token_error_values,
            )
        # ignore type error because fixing it would have a lot of dependencies, revisit later
        return DeclarativeOauth2Authenticator(  # type: ignore
            access_token_name=model.access_token_name or "access_token",
            access_token_value=model.access_token_value,
            client_id_name=model.client_id_name or "client_id",
            client_id=model.client_id,
            client_secret_name=model.client_secret_name or "client_secret",
            client_secret=model.client_secret,
            expires_in_name=model.expires_in_name or "expires_in",
            grant_type_name=model.grant_type_name or "grant_type",
            grant_type=model.grant_type or "refresh_token",
            refresh_request_body=model.refresh_request_body,
            refresh_request_headers=model.refresh_request_headers,
            refresh_token_name=model.refresh_token_name or "refresh_token",
            refresh_token=model.refresh_token,
            scopes=model.scopes,
            token_expiry_date=model.token_expiry_date,
            token_expiry_date_format=model.token_expiry_date_format,
            token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
            token_refresh_endpoint=model.token_refresh_endpoint,
            config=config,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            profile_assertion=profile_assertion,
            use_profile_assertion=model.use_profile_assertion,
            refresh_token_error_status_codes=refresh_token_error_status_codes,
            refresh_token_error_key=refresh_token_error_key,
            refresh_token_error_values=refresh_token_error_values,
        )

    @staticmethod
    def _get_refresh_token_error_information(
        model: OAuthAuthenticatorModel,
    ) -> Tuple[Tuple[int, ...], str,
               Tuple[str, ...]]:
        """
        In a previous version of the CDK, the auth error as config_error was only done if a refresh token updater was
        defined. As a transition, we added those fields on the OAuthAuthenticatorModel. This method ensures that the
        information is defined only once and return the right fields.
        """
        refresh_token_updater = model.refresh_token_updater
        is_defined_on_refresh_token_updated = refresh_token_updater and (
            refresh_token_updater.refresh_token_error_status_codes
            or refresh_token_updater.refresh_token_error_key
            or refresh_token_updater.refresh_token_error_values
        )
        is_defined_on_oauth_authenticator = (
            model.refresh_token_error_status_codes
            or model.refresh_token_error_key
            or model.refresh_token_error_values
        )
        # Defining the error configuration in both places is ambiguous -- fail loudly.
        if is_defined_on_refresh_token_updated and is_defined_on_oauth_authenticator:
            raise ValueError(
                "refresh_token_error should either be defined on the OAuthAuthenticatorModel or the RefreshTokenUpdaterModel, not both"
            )

        if is_defined_on_refresh_token_updated:
            not_optional_refresh_token_updater: RefreshTokenUpdaterModel = refresh_token_updater  # type: ignore  # we know from the condition that this is not None
            return (
                tuple(not_optional_refresh_token_updater.refresh_token_error_status_codes)
                if not_optional_refresh_token_updater.refresh_token_error_status_codes
                else (),
                not_optional_refresh_token_updater.refresh_token_error_key or "",
                tuple(not_optional_refresh_token_updater.refresh_token_error_values)
                if not_optional_refresh_token_updater.refresh_token_error_values
                else (),
            )
        elif is_defined_on_oauth_authenticator:
            return (
                tuple(model.refresh_token_error_status_codes)
                if model.refresh_token_error_status_codes
                else (),
                model.refresh_token_error_key or "",
                tuple(model.refresh_token_error_values) if model.refresh_token_error_values else (),
            )

        # returning default values we think cover most cases
        return (400,), "error", ("invalid_grant", "invalid_permissions")

    def create_offset_increment(
        self,
        model: OffsetIncrementModel,
        config: Config,
        decoder: Decoder,
        extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
        **kwargs: Any,
    ) -> OffsetIncrement:
        """Build an OffsetIncrement pagination strategy, wrapping the decoder for pagination use.

        Raises ValueError when the underlying decoder type is not supported for pagination.
        """
        if isinstance(decoder, PaginationDecoderDecorator):
            inner_decoder = decoder.decoder
        else:
            inner_decoder = decoder
            decoder = PaginationDecoderDecorator(decoder=decoder)

        if self._is_supported_decoder_for_pagination(inner_decoder):
            decoder_to_use = decoder
        else:
            raise ValueError(
                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
            )

        # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever)
        # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the
        # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing
        # behavior by having two separate extractors with identical behavior since they use the same extractor model.
        # When we have more time to investigate we can look into reusing the same component.
        extractor = (
            self._create_component_from_model(
                model=extractor_model, config=config, decoder=decoder_to_use
            )
            if extractor_model
            else None
        )

        # Pydantic v1 Union type coercion can convert int to string depending on Union order.
        # If page_size is a string that represents an integer (not an interpolation), convert it back.
        page_size = model.page_size
        if isinstance(page_size, str) and page_size.isdigit():
            page_size = int(page_size)

        return OffsetIncrement(
            page_size=page_size,
            config=config,
            decoder=decoder_to_use,
            extractor=extractor,
            inject_on_first_request=model.inject_on_first_request or False,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_page_increment(
        model: PageIncrementModel, config: Config, **kwargs: Any
    ) -> PageIncrement:
        """Build a PageIncrement pagination strategy."""
        # Pydantic v1 Union type coercion can convert int to string depending on Union order.
        # If page_size is a string that represents an integer (not an interpolation), convert it back.
        page_size = model.page_size
        if isinstance(page_size, str) and page_size.isdigit():
            page_size = int(page_size)

        return PageIncrement(
            page_size=page_size,
            config=config,
            start_from_page=model.start_from_page or 0,
            inject_on_first_request=model.inject_on_first_request or False,
            parameters=model.parameters or {},
        )

    def create_parent_stream_config(
        self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any
    ) -> ParentStreamConfig:
        """Build a ParentStreamConfig, instantiating the parent stream itself.

        Raises ValueError when a lazy_read_pointer contains a '*' wildcard.
        """
        declarative_stream = self._create_component_from_model(
            model.stream,
            config=config,
            is_parent=True,
            **kwargs,
        )
        request_option = (
            self._create_component_from_model(model.request_option, config=config)
            if model.request_option
            else None
        )

        if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer):
            raise ValueError(
                "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed."
3032 ) 3033 3034 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 3035 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 3036 ) 3037 3038 return ParentStreamConfig( 3039 parent_key=model.parent_key, 3040 request_option=request_option, 3041 stream=declarative_stream, 3042 partition_field=model.partition_field, 3043 config=config, 3044 incremental_dependency=model.incremental_dependency or False, 3045 parameters=model.parameters or {}, 3046 extra_fields=model.extra_fields, 3047 lazy_read_pointer=model_lazy_read_pointer, 3048 ) 3049 3050 def create_properties_from_endpoint( 3051 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 3052 ) -> PropertiesFromEndpoint: 3053 retriever = self._create_component_from_model( 3054 model=model.retriever, 3055 config=config, 3056 name="dynamic_properties", 3057 primary_key=None, 3058 stream_slicer=None, 3059 transformations=[], 3060 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 3061 ) 3062 return PropertiesFromEndpoint( 3063 property_field_path=model.property_field_path, 3064 retriever=retriever, 3065 config=config, 3066 parameters=model.parameters or {}, 3067 ) 3068 3069 def create_property_chunking( 3070 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 3071 ) -> PropertyChunking: 3072 record_merge_strategy = ( 3073 self._create_component_from_model( 3074 model=model.record_merge_strategy, config=config, **kwargs 3075 ) 3076 if model.record_merge_strategy 3077 else None 3078 ) 3079 3080 property_limit_type: PropertyLimitType 3081 match model.property_limit_type: 3082 case PropertyLimitTypeModel.property_count: 3083 property_limit_type = PropertyLimitType.property_count 3084 case PropertyLimitTypeModel.characters: 3085 property_limit_type = PropertyLimitType.characters 3086 case _: 3087 raise 
ValueError(f"Invalid PropertyLimitType {property_limit_type}") 3088 3089 return PropertyChunking( 3090 property_limit_type=property_limit_type, 3091 property_limit=model.property_limit, 3092 record_merge_strategy=record_merge_strategy, 3093 config=config, 3094 parameters=model.parameters or {}, 3095 ) 3096 3097 def create_query_properties( 3098 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 3099 ) -> QueryProperties: 3100 if isinstance(model.property_list, list): 3101 property_list = model.property_list 3102 else: 3103 property_list = self._create_component_from_model( 3104 model=model.property_list, config=config, **kwargs 3105 ) 3106 3107 property_chunking = ( 3108 self._create_component_from_model( 3109 model=model.property_chunking, config=config, **kwargs 3110 ) 3111 if model.property_chunking 3112 else None 3113 ) 3114 3115 property_selector = ( 3116 self._create_component_from_model( 3117 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3118 ) 3119 if model.property_selector 3120 else None 3121 ) 3122 3123 return QueryProperties( 3124 property_list=property_list, 3125 always_include_properties=model.always_include_properties, 3126 property_chunking=property_chunking, 3127 property_selector=property_selector, 3128 config=config, 3129 parameters=model.parameters or {}, 3130 ) 3131 3132 def create_json_schema_property_selector( 3133 self, 3134 model: JsonSchemaPropertySelectorModel, 3135 config: Config, 3136 *, 3137 stream_name: str, 3138 **kwargs: Any, 3139 ) -> JsonSchemaPropertySelector: 3140 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3141 3142 transformations = [] 3143 if model.transformations: 3144 for transformation_model in model.transformations: 3145 transformations.append( 3146 self._create_component_from_model(model=transformation_model, config=config) 3147 ) 3148 3149 return JsonSchemaPropertySelector( 3150 configured_stream=configured_stream, 3151 
            properties_transformations=transformations,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_record_filter(
        model: RecordFilterModel, config: Config, **kwargs: Any
    ) -> RecordFilter:
        """Build a RecordFilter evaluating an interpolated condition per record."""
        return RecordFilter(
            condition=model.condition or "", config=config, parameters=model.parameters or {}
        )

    @staticmethod
    def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath:
        """Build a RequestPath marker (no configurable fields)."""
        return RequestPath(parameters={})

    @staticmethod
    def create_request_option(
        model: RequestOptionModel, config: Config, **kwargs: Any
    ) -> RequestOption:
        """Build a RequestOption, interpolating field_name/field_path with `$parameters` from kwargs."""
        inject_into = RequestOptionType(model.inject_into.value)
        field_path: Optional[List[Union[InterpolatedString, str]]] = (
            [
                InterpolatedString.create(segment, parameters=kwargs.get("parameters", {}))
                for segment in model.field_path
            ]
            if model.field_path
            else None
        )
        field_name = (
            InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {}))
            if model.field_name
            else None
        )
        return RequestOption(
            field_name=field_name,
            field_path=field_path,
            inject_into=inject_into,
            parameters=kwargs.get("parameters", {}),
        )

    def create_record_selector(
        self,
        model: RecordSelectorModel,
        config: Config,
        *,
        name: str,
        transformations: List[RecordTransformation] | None = None,
        decoder: Decoder | None = None,
        client_side_incremental_sync_cursor: Optional[Cursor] = None,
        file_uploader: Optional[DefaultFileUploader] = None,
        **kwargs: Any,
    ) -> RecordSelector:
        """Build the RecordSelector: extractor, optional record filter (wrapped for
        client-side incremental sync when a cursor is provided), transformations and
        schema normalization.
        """
        extractor = self._create_component_from_model(
            model=model.extractor, decoder=decoder, config=config
        )
        record_filter = (
            self._create_component_from_model(model.record_filter, config=config)
            if model.record_filter
            else None
        )

        transform_before_filtering = (
            False if model.transform_before_filtering is None else model.transform_before_filtering
        )
        if client_side_incremental_sync_cursor:
            # Client-side incremental sync replaces the plain filter with a decorator
            # that also drops records older than the cursor.
            record_filter = ClientSideIncrementalRecordFilterDecorator(
                config=config,
                parameters=model.parameters,
                condition=model.record_filter.condition
                if (model.record_filter and hasattr(model.record_filter, "condition"))
                else None,
                cursor=client_side_incremental_sync_cursor,
            )
            # Default flips to True here so the cursor filter sees transformed records.
            transform_before_filtering = (
                True
                if model.transform_before_filtering is None
                else model.transform_before_filtering
            )

        if model.schema_normalization is None:
            # default to no schema normalization if not set
            model.schema_normalization = SchemaNormalizationModel.None_

        schema_normalization = (
            TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization])
            if isinstance(model.schema_normalization, SchemaNormalizationModel)
            else self._create_component_from_model(model.schema_normalization, config=config)  # type: ignore[arg-type]  # custom normalization model expected here
        )

        return RecordSelector(
            extractor=extractor,
            name=name,
            config=config,
            record_filter=record_filter,
            transformations=transformations or [],
            file_uploader=file_uploader,
            schema_normalization=schema_normalization,
            parameters=model.parameters or {},
            transform_before_filtering=transform_before_filtering,
        )

    @staticmethod
    def create_remove_fields(
        model: RemoveFieldsModel, config: Config, **kwargs: Any
    ) -> RemoveFields:
        """Build a RemoveFields transformation."""
        return RemoveFields(
            field_pointers=model.field_pointers, condition=model.condition or "", parameters={}
        )

    def create_selective_authenticator(
        self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any
    ) -> DeclarativeAuthenticator:
        """Build a SelectiveAuthenticator that picks among named authenticators at runtime."""
        authenticators = {
            name:
                self._create_component_from_model(model=auth, config=config)
            for name, auth in model.authenticators.items()
        }
        # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error
        return SelectiveAuthenticator(  # type: ignore[abstract]
            config=config,
            authenticators=authenticators,
            authenticator_selection_path=model.authenticator_selection_path,
            **kwargs,
        )

    @staticmethod
    def create_legacy_session_token_authenticator(
        model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any
    ) -> LegacySessionTokenAuthenticator:
        """Build the legacy session-token authenticator (login endpoint + session header)."""
        return LegacySessionTokenAuthenticator(
            api_url=url_base,
            header=model.header,
            login_url=model.login_url,
            password=model.password or "",
            session_token=model.session_token or "",
            session_token_response_key=model.session_token_response_key or "",
            username=model.username or "",
            validate_session_url=model.validate_session_url,
            config=config,
            parameters=model.parameters or {},
        )

    def create_simple_retriever(
        self,
        model: SimpleRetrieverModel,
        config: Config,
        *,
        name: str,
        primary_key: Optional[Union[str, List[str], List[List[str]]]],
        request_options_provider: Optional[RequestOptionsProvider] = None,
        cursor: Optional[Cursor] = None,
        has_stop_condition_cursor: bool = False,
        is_client_side_incremental_sync: bool = False,
        transformations: List[RecordTransformation],
        file_uploader: Optional[DefaultFileUploader] = None,
        incremental_sync: Optional[
            Union[IncrementingCountCursorModel, DatetimeBasedCursorModel]
        ] = None,
        use_cache: Optional[bool] = None,
        log_formatter: Optional[Callable[[Response], Any]] = None,
        partition_router: Optional[PartitionRouter] = None,
        **kwargs: Any,
    ) -> SimpleRetriever:
        """Assemble a SimpleRetriever (or LazySimpleRetriever for eligible substreams)
        from its model: decoder, record selector, query properties, requester,
        paginator, and pagination tracking.
        """

        def _get_url(req: Requester) -> str:
            """
            Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever.
            This is needed because the URL is not set until the requester is created.
            """

            _url: str = (
                model.requester.url
                if hasattr(model.requester, "url") and model.requester.url is not None
                else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
            )
            _url_base: str = (
                model.requester.url_base
                if hasattr(model.requester, "url_base") and model.requester.url_base is not None
                else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None)
            )

            return _url or _url_base

        if cursor is None:
            cursor = FinalStateCursor(name, None, self._message_repository)

        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        record_selector = self._create_component_from_model(
            model=model.record_selector,
            name=name,
            config=config,
            decoder=decoder,
            transformations=transformations,
            client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None,
            file_uploader=file_uploader,
        )

        query_properties: Optional[QueryProperties] = None
        query_properties_key: Optional[str] = None
        self._ensure_query_properties_to_model(model.requester)
        if self._has_query_properties_in_request_parameters(model.requester):
            # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple
            # places instead of default to request_parameters which isn't clearly documented
            if (
                hasattr(model.requester, "fetch_properties_from_endpoint")
                and model.requester.fetch_properties_from_endpoint
            ):
                raise ValueError(
                    f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
                )

            query_properties_definitions = []
            for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters()
                if isinstance(request_parameter, QueryPropertiesModel):
                    query_properties_key = key
                    query_properties_definitions.append(request_parameter)

            if len(query_properties_definitions) > 1:
                raise ValueError(
                    f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
                )

            if len(query_properties_definitions) == 1:
                query_properties = self._create_component_from_model(
                    model=query_properties_definitions[0], stream_name=name, config=config
                )

            # Removes QueryProperties components from the interpolated mappings because it has been designed
            # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly
            # instead of through jinja interpolation
            if hasattr(model.requester, "request_parameters") and isinstance(
                model.requester.request_parameters, Mapping
            ):
                model.requester.request_parameters = self._remove_query_properties(
                    model.requester.request_parameters
                )
        elif (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            # todo: Deprecate this condition once dependent connectors migrate to query_properties
            query_properties_definition = QueryPropertiesModel(
                type="QueryProperties",
                property_list=model.requester.fetch_properties_from_endpoint,
                always_include_properties=None,
                property_chunking=None,
            )  # type: ignore # $parameters has a default value

            query_properties = self.create_query_properties(
                model=query_properties_definition,
                stream_name=name,
                config=config,
            )
        elif hasattr(model.requester, "query_properties") and model.requester.query_properties:
            query_properties = self.create_query_properties(
                model=model.requester.query_properties,
                stream_name=name,
                config=config,
            )

        requester = self._create_component_from_model(
            model=model.requester,
            decoder=decoder,
            name=name,
            query_properties_key=query_properties_key,
            use_cache=use_cache,
            config=config,
        )

        if not request_options_provider:
            request_options_provider = DefaultRequestOptionsProvider(parameters={})
        # A partition router doubles as the request-options provider when no explicit one was given.
        if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(
            partition_router, PartitionRouter
        ):
            request_options_provider = partition_router

        paginator = (
            self._create_component_from_model(
                model=model.paginator,
                config=config,
                url_base=_get_url(requester),
                extractor_model=model.record_selector.extractor,
                decoder=decoder,
                cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None,
            )
            if model.paginator
            else NoPagination(parameters={})
        )

        ignore_stream_slicer_parameters_on_paginated_requests = (
            model.ignore_stream_slicer_parameters_on_paginated_requests or False
        )

        # Lazy substream path: only for SubstreamPartitionRouter with lazy_read_pointer,
        # no prior state, single-slice reads and JSON decoding.
        if (
            model.partition_router
            and isinstance(model.partition_router, SubstreamPartitionRouterModel)
            and not bool(self._connector_state_manager.get_stream_state(name, None))
            and any(
                parent_stream_config.lazy_read_pointer
                for parent_stream_config in model.partition_router.parent_stream_configs
            )
        ):
            if incremental_sync:
                if incremental_sync.type != "DatetimeBasedCursor":
                    raise ValueError(
                        f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                    )

                # NOTE(review): "more that one" in the message below should read "more than one";
                # fix separately since a documentation-only edit must not alter runtime strings.
                elif incremental_sync.step or incremental_sync.cursor_granularity:
                    raise ValueError(
                        f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                    )

            if model.decoder and model.decoder.type != "JsonDecoder":
                raise ValueError(
                    f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
                )

            return LazySimpleRetriever(
                name=name,
                paginator=paginator,
                primary_key=primary_key,
                requester=requester,
                record_selector=record_selector,
                stream_slicer=_NO_STREAM_SLICING,
                request_option_provider=request_options_provider,
                config=config,
                ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
                parameters=model.parameters or {},
            )

        if (
            model.record_selector.record_filter
            and model.pagination_reset
            and model.pagination_reset.limits
        ):
            raise ValueError("PaginationResetLimits are not supported while having record filter.")

        return SimpleRetriever(
            name=name,
            paginator=paginator,
            primary_key=primary_key,
            requester=requester,
            record_selector=record_selector,
            stream_slicer=_NO_STREAM_SLICING,
            request_option_provider=request_options_provider,
            config=config,
            ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
            additional_query_properties=query_properties,
            log_formatter=self._get_log_formatter(log_formatter, name),
            pagination_tracker_factory=self._create_pagination_tracker_factory(
                model.pagination_reset, cursor
            ),
            parameters=model.parameters or {},
        )

    def _create_pagination_tracker_factory(
        self, model: Optional[PaginationResetModel], cursor: Cursor
    ) -> Callable[[], PaginationTracker]:
        """Return a factory producing PaginationTracker instances for this stream's reset policy."""
        if model is None:
            return lambda: PaginationTracker()
3517 # Until we figure out a way to use any cursor for PaginationTracker, we will have to have this cursor selector logic 3518 cursor_factory: Callable[[], Optional[ConcurrentCursor]] = lambda: None 3519 if model.action == PaginationResetActionModel.RESET: 3520 # in that case, we will let cursor_factory to return None even if the stream has a cursor 3521 pass 3522 elif model.action == PaginationResetActionModel.SPLIT_USING_CURSOR: 3523 if isinstance(cursor, ConcurrentCursor): 3524 cursor_factory = lambda: cursor.copy_without_state() # type: ignore # the if condition validates that it is a ConcurrentCursor 3525 elif isinstance(cursor, ConcurrentPerPartitionCursor): 3526 cursor_factory = lambda: cursor._cursor_factory.create( # type: ignore # if this becomes a problem, we would need to extract the cursor_factory instantiation logic and make it accessible here 3527 {}, datetime.timedelta(0) 3528 ) 3529 elif not isinstance(cursor, FinalStateCursor): 3530 LOGGER.warning( 3531 "Unknown cursor for PaginationTracker. Pagination resets might not work properly" 3532 ) 3533 else: 3534 raise ValueError(f"Unknown PaginationReset action: {model.action}") 3535 3536 limit = model.limits.number_of_records if model and model.limits else None 3537 return lambda: PaginationTracker(cursor_factory(), limit) 3538 3539 def _get_log_formatter( 3540 self, log_formatter: Callable[[Response], Any] | None, name: str 3541 ) -> Callable[[Response], Any] | None: 3542 if self._should_limit_slices_fetched(): 3543 return ( 3544 ( 3545 lambda response: format_http_message( 3546 response, 3547 f"Stream '{name}' request", 3548 f"Request performed in order to extract records for stream '{name}'", 3549 name, 3550 ) 3551 ) 3552 if not log_formatter 3553 else log_formatter 3554 ) 3555 return None 3556 3557 def _should_limit_slices_fetched(self) -> bool: 3558 """ 3559 Returns True if the number of slices fetched should be limited, False otherwise. 
        This is used to limit the number of slices fetched during tests.
        """
        return bool(self._limit_slices_fetched or self._emit_connector_builder_messages)

    @staticmethod
    def _has_query_properties_in_request_parameters(
        requester: Union[HttpRequesterModel, CustomRequesterModel],
    ) -> bool:
        """Return True when any request_parameters value is a QueryPropertiesModel."""
        if not hasattr(requester, "request_parameters"):
            return False
        request_parameters = requester.request_parameters
        if request_parameters and isinstance(request_parameters, Mapping):
            for request_parameter in request_parameters.values():
                if isinstance(request_parameter, QueryPropertiesModel):
                    return True
        return False

    @staticmethod
    def _remove_query_properties(
        request_parameters: Mapping[str, Union[str, QueryPropertiesModel]],
    ) -> Mapping[str, str]:
        """Return request_parameters with all QueryPropertiesModel entries dropped."""
        return {
            parameter_field: request_parameter
            for parameter_field, request_parameter in request_parameters.items()
            if not isinstance(request_parameter, QueryPropertiesModel)
        }

    def create_state_delegating_stream(
        self,
        model: StateDelegatingStreamModel,
        config: Config,
        **kwargs: Any,
    ) -> DefaultStream:
        """Pick between the full-refresh and incremental variants of a stream based on
        existing state, optionally falling back to full refresh when the stored cursor
        is older than the API's retention period.
        """
        if (
            model.full_refresh_stream.name != model.name
            or model.name != model.incremental_stream.name
        ):
            raise ValueError(
                f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
            )

        # Resolve api_retention_period with config context (supports Jinja2 interpolation)
        resolved_retention_period: Optional[str] = None
        if model.api_retention_period:
            interpolated_retention = InterpolatedString.create(
                model.api_retention_period, parameters=model.parameters or {}
            )
            resolved_value = interpolated_retention.eval(config=config)
            if resolved_value:
                resolved_retention_period = str(resolved_value)

        if resolved_retention_period:
            for stream_model in (model.full_refresh_stream, model.incremental_stream):
                if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel):
                    raise ValueError(
                        f"Stream '{model.name}' uses IncrementingCountCursor which is not supported "
                        f"with api_retention_period. IncrementingCountCursor does not use datetime-based "
                        f"cursors, so cursor age validation cannot be performed."
                    )

        stream_state = self._connector_state_manager.get_stream_state(model.name, None)

        # No prior state means there is nothing to delegate on: use full refresh.
        if not stream_state:
            return self._create_component_from_model(  # type: ignore[no-any-return]
                model.full_refresh_stream, config=config, **kwargs
            )

        incremental_stream: DefaultStream = self._create_component_from_model(
            model.incremental_stream, config=config, **kwargs
        )  # type: ignore[assignment]

        # Only run cursor age validation for streams that are in the configured
        # catalog (or when no catalog was provided, e.g. during discover / connector
        # builder). Streams not selected by the user but instantiated as parent-stream
        # dependencies must not go through this path because it emits state messages
        # that the destination does not know about, causing "Stream not found" crashes.
        stream_is_in_catalog = (
            not self._stream_name_to_configured_stream  # no catalog → validate by default
            or model.name in self._stream_name_to_configured_stream
        )
        if resolved_retention_period and stream_is_in_catalog:
            full_refresh_stream: DefaultStream = self._create_component_from_model(
                model.full_refresh_stream, config=config, **kwargs
            )  # type: ignore[assignment]
            if self._is_cursor_older_than_retention_period(
                stream_state,
                full_refresh_stream.cursor,
                incremental_stream.cursor,
                resolved_retention_period,
                model.name,
            ):
                # Clear state BEFORE constructing the full_refresh_stream so that
                # its cursor starts from start_date instead of the stale cursor.
                self._connector_state_manager.update_state_for_stream(model.name, None, {})
                state_message = self._connector_state_manager.create_state_message(model.name, None)
                self._message_repository.emit_message(state_message)
                return self._create_component_from_model(  # type: ignore[no-any-return]
                    model.full_refresh_stream, config=config, **kwargs
                )

        return incremental_stream

    @staticmethod
    def _is_cursor_older_than_retention_period(
        stream_state: Mapping[str, Any],
        full_refresh_cursor: Cursor,
        incremental_cursor: Cursor,
        api_retention_period: str,
        stream_name: str,
    ) -> bool:
        """Check if the cursor value in the state is older than the API's retention period.

        Checks cursors in sequence: full refresh cursor first, then incremental cursor.
        FinalStateCursor returns now() for completed full refresh state (NO_CURSOR_STATE_KEY),
        which is always within retention, so we use incremental. For other states, it returns
        None and we fall back to checking the incremental cursor.

        Returns True if the cursor is older than the retention period (should use full refresh).
3678 Returns False if the cursor is within the retention period (safe to use incremental). 3679 """ 3680 retention_duration = parse_duration(api_retention_period) 3681 retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration 3682 3683 # Check full refresh cursor first 3684 cursor_datetime = full_refresh_cursor.get_cursor_datetime_from_state(stream_state) 3685 3686 # If full refresh cursor returns None, check incremental cursor 3687 if cursor_datetime is None: 3688 cursor_datetime = incremental_cursor.get_cursor_datetime_from_state(stream_state) 3689 3690 if cursor_datetime is None: 3691 # Neither cursor could parse the state - fall back to full refresh to be safe 3692 return True 3693 3694 if cursor_datetime < retention_cutoff: 3695 logging.warning( 3696 f"Stream '{stream_name}' has a cursor value older than " 3697 f"the API's retention period of {api_retention_period} " 3698 f"(cutoff: {retention_cutoff.isoformat()}). " 3699 f"Falling back to full refresh to avoid data loss." 
3700 ) 3701 return True 3702 3703 return False 3704 3705 def _get_state_delegating_stream_model( 3706 self, 3707 model: StateDelegatingStreamModel, 3708 parent_state: Optional[Mapping[str, Any]] = None, 3709 ) -> DeclarativeStreamModel: 3710 """Return the appropriate underlying stream model based on state.""" 3711 return ( 3712 model.incremental_stream 3713 if self._connector_state_manager.get_stream_state(model.name, None) or parent_state 3714 else model.full_refresh_stream 3715 ) 3716 3717 def _create_async_job_status_mapping( 3718 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3719 ) -> Mapping[str, AsyncJobStatus]: 3720 api_status_to_cdk_status = {} 3721 for cdk_status, api_statuses in model.dict().items(): 3722 if cdk_status == "type": 3723 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3724 continue 3725 3726 for status in api_statuses: 3727 if status in api_status_to_cdk_status: 3728 raise ValueError( 3729 f"API status {status} is already set for CDK status {cdk_status}. 
Please ensure API statuses are only provided once" 3730 ) 3731 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3732 return api_status_to_cdk_status 3733 3734 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3735 match status: 3736 case "running": 3737 return AsyncJobStatus.RUNNING 3738 case "completed": 3739 return AsyncJobStatus.COMPLETED 3740 case "failed": 3741 return AsyncJobStatus.FAILED 3742 case "timeout": 3743 return AsyncJobStatus.TIMED_OUT 3744 case _: 3745 raise ValueError(f"Unsupported CDK status {status}") 3746 3747 def create_async_retriever( 3748 self, 3749 model: AsyncRetrieverModel, 3750 config: Config, 3751 *, 3752 name: str, 3753 primary_key: Optional[ 3754 Union[str, List[str], List[List[str]]] 3755 ], # this seems to be needed to match create_simple_retriever 3756 stream_slicer: Optional[StreamSlicer], 3757 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3758 transformations: List[RecordTransformation], 3759 **kwargs: Any, 3760 ) -> AsyncRetriever: 3761 if model.download_target_requester and not model.download_target_extractor: 3762 raise ValueError( 3763 f"`download_target_extractor` required if using a `download_target_requester`" 3764 ) 3765 3766 def _get_download_retriever( 3767 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3768 ) -> SimpleRetriever: 3769 # We create a record selector for the download retriever 3770 # with no schema normalization and no transformations, neither record filter 3771 # as all this occurs in the record_selector of the AsyncRetriever 3772 record_selector = RecordSelector( 3773 extractor=extractor, 3774 name=name, 3775 record_filter=None, 3776 transformations=[], 3777 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3778 config=config, 3779 parameters={}, 3780 ) 3781 paginator = ( 3782 self._create_component_from_model( 3783 model=model.download_paginator, 3784 decoder=_decoder, 3785 config=config, 3786 url_base="", 
3787 ) 3788 if model.download_paginator 3789 else NoPagination(parameters={}) 3790 ) 3791 3792 return SimpleRetriever( 3793 requester=requester, 3794 record_selector=record_selector, 3795 primary_key=None, 3796 name=name, 3797 paginator=paginator, 3798 config=config, 3799 parameters={}, 3800 log_formatter=self._get_log_formatter(None, name), 3801 ) 3802 3803 def _get_job_timeout() -> datetime.timedelta: 3804 user_defined_timeout: Optional[int] = ( 3805 int( 3806 InterpolatedString.create( 3807 str(model.polling_job_timeout), 3808 parameters={}, 3809 ).eval(config) 3810 ) 3811 if model.polling_job_timeout 3812 else None 3813 ) 3814 3815 # check for user defined timeout during the test read or 15 minutes 3816 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3817 # default value for non-connector builder is 60 minutes. 3818 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3819 3820 return ( 3821 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3822 ) 3823 3824 decoder = ( 3825 self._create_component_from_model(model=model.decoder, config=config) 3826 if model.decoder 3827 else JsonDecoder(parameters={}) 3828 ) 3829 record_selector = self._create_component_from_model( 3830 model=model.record_selector, 3831 config=config, 3832 decoder=decoder, 3833 name=name, 3834 transformations=transformations, 3835 client_side_incremental_sync=client_side_incremental_sync, 3836 ) 3837 3838 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3839 if self._should_limit_slices_fetched(): 3840 stream_slicer = cast( 3841 StreamSlicer, 3842 StreamSlicerTestReadDecorator( 3843 wrapped_slicer=stream_slicer, 3844 maximum_number_of_slices=self._limit_slices_fetched or 5, 3845 ), 3846 ) 3847 3848 creation_requester = self._create_component_from_model( 3849 model=model.creation_requester, 3850 decoder=decoder, 3851 config=config, 3852 name=f"job creation - {name}", 3853 ) 3854 
polling_requester = self._create_component_from_model( 3855 model=model.polling_requester, 3856 decoder=decoder, 3857 config=config, 3858 name=f"job polling - {name}", 3859 ) 3860 job_download_components_name = f"job download - {name}" 3861 download_decoder = ( 3862 self._create_component_from_model(model=model.download_decoder, config=config) 3863 if model.download_decoder 3864 else JsonDecoder(parameters={}) 3865 ) 3866 download_extractor = ( 3867 self._create_component_from_model( 3868 model=model.download_extractor, 3869 config=config, 3870 decoder=download_decoder, 3871 parameters=model.parameters, 3872 ) 3873 if model.download_extractor 3874 else DpathExtractor( 3875 [], 3876 config=config, 3877 decoder=download_decoder, 3878 parameters=model.parameters or {}, 3879 ) 3880 ) 3881 download_requester = self._create_component_from_model( 3882 model=model.download_requester, 3883 decoder=download_decoder, 3884 config=config, 3885 name=job_download_components_name, 3886 ) 3887 download_retriever = _get_download_retriever( 3888 download_requester, download_extractor, download_decoder 3889 ) 3890 abort_requester = ( 3891 self._create_component_from_model( 3892 model=model.abort_requester, 3893 decoder=decoder, 3894 config=config, 3895 name=f"job abort - {name}", 3896 ) 3897 if model.abort_requester 3898 else None 3899 ) 3900 delete_requester = ( 3901 self._create_component_from_model( 3902 model=model.delete_requester, 3903 decoder=decoder, 3904 config=config, 3905 name=f"job delete - {name}", 3906 ) 3907 if model.delete_requester 3908 else None 3909 ) 3910 download_target_requester = ( 3911 self._create_component_from_model( 3912 model=model.download_target_requester, 3913 decoder=decoder, 3914 config=config, 3915 name=f"job extract_url - {name}", 3916 ) 3917 if model.download_target_requester 3918 else None 3919 ) 3920 status_extractor = self._create_component_from_model( 3921 model=model.status_extractor, decoder=decoder, config=config, name=name 3922 ) 3923 
download_target_extractor = ( 3924 self._create_component_from_model( 3925 model=model.download_target_extractor, 3926 decoder=decoder, 3927 config=config, 3928 name=name, 3929 ) 3930 if model.download_target_extractor 3931 else None 3932 ) 3933 3934 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3935 creation_requester=creation_requester, 3936 polling_requester=polling_requester, 3937 download_retriever=download_retriever, 3938 download_target_requester=download_target_requester, 3939 abort_requester=abort_requester, 3940 delete_requester=delete_requester, 3941 status_extractor=status_extractor, 3942 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3943 download_target_extractor=download_target_extractor, 3944 job_timeout=_get_job_timeout(), 3945 ) 3946 3947 async_job_partition_router = AsyncJobPartitionRouter( 3948 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3949 job_repository, 3950 stream_slices, 3951 self._job_tracker, 3952 self._message_repository, 3953 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3954 has_bulk_parent=False, 3955 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3956 # `None` == default retry is set to 3 attempts, under the hood. 
3957 job_max_retry=1 if self._emit_connector_builder_messages else None, 3958 ), 3959 stream_slicer=stream_slicer, 3960 config=config, 3961 parameters=model.parameters or {}, 3962 ) 3963 3964 return AsyncRetriever( 3965 record_selector=record_selector, 3966 stream_slicer=async_job_partition_router, 3967 config=config, 3968 parameters=model.parameters or {}, 3969 ) 3970 3971 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3972 config_migrations = [ 3973 self._create_component_from_model(migration, config) 3974 for migration in ( 3975 model.config_normalization_rules.config_migrations 3976 if ( 3977 model.config_normalization_rules 3978 and model.config_normalization_rules.config_migrations 3979 ) 3980 else [] 3981 ) 3982 ] 3983 config_transformations = [ 3984 self._create_component_from_model(transformation, config) 3985 for transformation in ( 3986 model.config_normalization_rules.transformations 3987 if ( 3988 model.config_normalization_rules 3989 and model.config_normalization_rules.transformations 3990 ) 3991 else [] 3992 ) 3993 ] 3994 config_validations = [ 3995 self._create_component_from_model(validation, config) 3996 for validation in ( 3997 model.config_normalization_rules.validations 3998 if ( 3999 model.config_normalization_rules 4000 and model.config_normalization_rules.validations 4001 ) 4002 else [] 4003 ) 4004 ] 4005 4006 return Spec( 4007 connection_specification=model.connection_specification, 4008 documentation_url=model.documentation_url, 4009 advanced_auth=model.advanced_auth, 4010 parameters={}, 4011 config_migrations=config_migrations, 4012 config_transformations=config_transformations, 4013 config_validations=config_validations, 4014 ) 4015 4016 def create_substream_partition_router( 4017 self, 4018 model: SubstreamPartitionRouterModel, 4019 config: Config, 4020 *, 4021 stream_name: str, 4022 **kwargs: Any, 4023 ) -> SubstreamPartitionRouter: 4024 parent_stream_configs = [] 4025 if model.parent_stream_configs: 
4026 parent_stream_configs.extend( 4027 [ 4028 self.create_parent_stream_config_with_substream_wrapper( 4029 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 4030 ) 4031 for parent_stream_config in model.parent_stream_configs 4032 ] 4033 ) 4034 4035 return SubstreamPartitionRouter( 4036 parent_stream_configs=parent_stream_configs, 4037 parameters=model.parameters or {}, 4038 config=config, 4039 ) 4040 4041 def create_parent_stream_config_with_substream_wrapper( 4042 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 4043 ) -> Any: 4044 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 4045 4046 parent_state: Optional[Mapping[str, Any]] = ( 4047 child_state if model.incremental_dependency and child_state else None 4048 ) 4049 connector_state_manager = self._instantiate_parent_stream_state_manager( 4050 child_state, config, model, parent_state 4051 ) 4052 4053 substream_factory = ModelToComponentFactory( 4054 connector_state_manager=connector_state_manager, 4055 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 4056 limit_slices_fetched=self._limit_slices_fetched, 4057 emit_connector_builder_messages=self._emit_connector_builder_messages, 4058 disable_retries=self._disable_retries, 4059 disable_cache=self._disable_cache, 4060 message_repository=StateFilteringMessageRepository( 4061 LogAppenderMessageRepositoryDecorator( 4062 { 4063 "airbyte_cdk": {"stream": {"is_substream": True}}, 4064 "http": {"is_auxiliary": True}, 4065 }, 4066 self._message_repository, 4067 self._evaluate_log_level(self._emit_connector_builder_messages), 4068 ), 4069 ), 4070 api_budget=self._api_budget, 4071 ) 4072 4073 return substream_factory.create_parent_stream_config( 4074 model=model, config=config, stream_name=stream_name, **kwargs 4075 ) 4076 4077 def _instantiate_parent_stream_state_manager( 4078 self, 4079 child_state: MutableMapping[str, Any], 4080 config: Config, 4081 
model: ParentStreamConfigModel, 4082 parent_state: Optional[Mapping[str, Any]] = None, 4083 ) -> ConnectorStateManager: 4084 """ 4085 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 4086 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 4087 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 4088 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 4089 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 4090 incremental_dependency is set. 4091 """ 4092 if model.incremental_dependency and child_state: 4093 parent_stream_name = model.stream.name or "" 4094 extracted_parent_state = ConcurrentPerPartitionCursor.get_parent_state( 4095 child_state, parent_stream_name 4096 ) 4097 4098 if not extracted_parent_state: 4099 extracted_parent_state = ConcurrentPerPartitionCursor.get_global_state( 4100 child_state, parent_stream_name 4101 ) 4102 4103 if not extracted_parent_state and not isinstance(extracted_parent_state, dict): 4104 cursor_values = child_state.values() 4105 if cursor_values and len(cursor_values) == 1: 4106 incremental_sync_model: Union[ 4107 DatetimeBasedCursorModel, 4108 IncrementingCountCursorModel, 4109 ] = ( 4110 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 4111 if isinstance(model.stream, DeclarativeStreamModel) 4112 else self._get_state_delegating_stream_model( 4113 model.stream, parent_state=parent_state 4114 ).incremental_sync 4115 ) 4116 cursor_field = InterpolatedString.create( 4117 incremental_sync_model.cursor_field, 4118 parameters=incremental_sync_model.parameters or {}, 4119 ).eval(config) 4120 extracted_parent_state = AirbyteStateMessage( 4121 
type=AirbyteStateType.STREAM, 4122 stream=AirbyteStreamState( 4123 stream_descriptor=StreamDescriptor( 4124 name=parent_stream_name, namespace=None 4125 ), 4126 stream_state=AirbyteStateBlob( 4127 {cursor_field: list(cursor_values)[0]} 4128 ), 4129 ), 4130 ) 4131 return ConnectorStateManager([extracted_parent_state] if extracted_parent_state else []) 4132 4133 return ConnectorStateManager([]) 4134 4135 @staticmethod 4136 def create_wait_time_from_header( 4137 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 4138 ) -> WaitTimeFromHeaderBackoffStrategy: 4139 return WaitTimeFromHeaderBackoffStrategy( 4140 header=model.header, 4141 parameters=model.parameters or {}, 4142 config=config, 4143 regex=model.regex, 4144 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 4145 if model.max_waiting_time_in_seconds is not None 4146 else None, 4147 ) 4148 4149 @staticmethod 4150 def create_wait_until_time_from_header( 4151 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 4152 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 4153 return WaitUntilTimeFromHeaderBackoffStrategy( 4154 header=model.header, 4155 parameters=model.parameters or {}, 4156 config=config, 4157 min_wait=model.min_wait, 4158 regex=model.regex, 4159 ) 4160 4161 def get_message_repository(self) -> MessageRepository: 4162 return self._message_repository 4163 4164 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 4165 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 4166 4167 @staticmethod 4168 def create_components_mapping_definition( 4169 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 4170 ) -> ComponentMappingDefinition: 4171 interpolated_value = InterpolatedString.create( 4172 model.value, parameters=model.parameters or {} 4173 ) 4174 field_path = [ 4175 InterpolatedString.create(path, parameters=model.parameters or {}) 4176 for path in model.field_path 4177 ] 4178 return ComponentMappingDefinition( 
4179 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 4180 value=interpolated_value, 4181 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 4182 create_or_update=model.create_or_update, 4183 condition=model.condition, 4184 parameters=model.parameters or {}, 4185 ) 4186 4187 def create_http_components_resolver( 4188 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 4189 ) -> Any: 4190 retriever = self._create_component_from_model( 4191 model=model.retriever, 4192 config=config, 4193 name=f"{stream_name if stream_name else '__http_components_resolver'}", 4194 primary_key=None, 4195 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4196 transformations=[], 4197 ) 4198 4199 components_mapping = [] 4200 for component_mapping_definition_model in model.components_mapping: 4201 if component_mapping_definition_model.condition: 4202 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4203 components_mapping.append( 4204 self._create_component_from_model( 4205 model=component_mapping_definition_model, 4206 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4207 component_mapping_definition_model.value_type 4208 ), 4209 config=config, 4210 ) 4211 ) 4212 4213 return HttpComponentsResolver( 4214 retriever=retriever, 4215 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4216 config=config, 4217 components_mapping=components_mapping, 4218 parameters=model.parameters or {}, 4219 ) 4220 4221 @staticmethod 4222 def create_stream_config( 4223 model: StreamConfigModel, config: Config, **kwargs: Any 4224 ) -> StreamConfig: 4225 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 4226 [x for x in model.configs_pointer] if model.configs_pointer else [] 4227 ) 4228 4229 return StreamConfig( 4230 configs_pointer=model_configs_pointer, 
4231 default_values=model.default_values, 4232 parameters=model.parameters or {}, 4233 ) 4234 4235 def create_config_components_resolver( 4236 self, 4237 model: ConfigComponentsResolverModel, 4238 config: Config, 4239 ) -> Any: 4240 model_stream_configs = ( 4241 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 4242 ) 4243 4244 stream_configs = [ 4245 self._create_component_from_model( 4246 stream_config, config=config, parameters=model.parameters or {} 4247 ) 4248 for stream_config in model_stream_configs 4249 ] 4250 4251 components_mapping = [ 4252 self._create_component_from_model( 4253 model=components_mapping_definition_model, 4254 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4255 components_mapping_definition_model.value_type 4256 ), 4257 config=config, 4258 parameters=model.parameters, 4259 ) 4260 for components_mapping_definition_model in model.components_mapping 4261 ] 4262 4263 return ConfigComponentsResolver( 4264 stream_configs=stream_configs, 4265 config=config, 4266 components_mapping=components_mapping, 4267 parameters=model.parameters or {}, 4268 ) 4269 4270 def create_parametrized_components_resolver( 4271 self, 4272 model: ParametrizedComponentsResolverModel, 4273 config: Config, 4274 ) -> ParametrizedComponentsResolver: 4275 stream_parameters = StreamParametersDefinition( 4276 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 4277 ) 4278 4279 components_mapping = [] 4280 for components_mapping_definition_model in model.components_mapping: 4281 if components_mapping_definition_model.condition: 4282 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4283 components_mapping.append( 4284 self._create_component_from_model( 4285 model=components_mapping_definition_model, 4286 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4287 components_mapping_definition_model.value_type 4288 ), 4289 config=config, 4290 ) 
4291 ) 4292 return ParametrizedComponentsResolver( 4293 stream_parameters=stream_parameters, 4294 config=config, 4295 components_mapping=components_mapping, 4296 parameters=model.parameters or {}, 4297 ) 4298 4299 _UNSUPPORTED_DECODER_ERROR = ( 4300 "Specified decoder of {decoder_type} is not supported for pagination." 4301 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4302 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4303 ) 4304 4305 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4306 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4307 return True 4308 elif isinstance(decoder, CompositeRawDecoder): 4309 return self._is_supported_parser_for_pagination(decoder.parser) 4310 else: 4311 return False 4312 4313 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4314 if isinstance(parser, JsonParser): 4315 return True 4316 elif isinstance(parser, GzipParser): 4317 return isinstance(parser.inner_parser, JsonParser) 4318 else: 4319 return False 4320 4321 def create_http_api_budget( 4322 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4323 ) -> HttpAPIBudget: 4324 policies = [ 4325 self._create_component_from_model(model=policy, config=config) 4326 for policy in model.policies 4327 ] 4328 4329 return HttpAPIBudget( 4330 policies=policies, 4331 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4332 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4333 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4334 ) 4335 4336 def create_fixed_window_call_rate_policy( 4337 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4338 ) -> FixedWindowCallRatePolicy: 4339 matchers = [ 4340 self._create_component_from_model(model=matcher, config=config) 4341 for matcher in 
model.matchers 4342 ] 4343 4344 # Set the initial reset timestamp to 10 days from now. 4345 # This value will be updated by the first request. 4346 return FixedWindowCallRatePolicy( 4347 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4348 period=parse_duration(model.period), 4349 call_limit=model.call_limit, 4350 matchers=matchers, 4351 ) 4352 4353 def create_file_uploader( 4354 self, model: FileUploaderModel, config: Config, **kwargs: Any 4355 ) -> FileUploader: 4356 name = "File Uploader" 4357 requester = self._create_component_from_model( 4358 model=model.requester, 4359 config=config, 4360 name=name, 4361 **kwargs, 4362 ) 4363 download_target_extractor = self._create_component_from_model( 4364 model=model.download_target_extractor, 4365 config=config, 4366 name=name, 4367 **kwargs, 4368 ) 4369 emit_connector_builder_messages = self._emit_connector_builder_messages 4370 file_uploader = DefaultFileUploader( 4371 requester=requester, 4372 download_target_extractor=download_target_extractor, 4373 config=config, 4374 file_writer=NoopFileWriter() 4375 if emit_connector_builder_messages 4376 else LocalFileSystemFileWriter(), 4377 parameters=model.parameters or {}, 4378 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4379 ) 4380 4381 return ( 4382 ConnectorBuilderFileUploader(file_uploader) 4383 if emit_connector_builder_messages 4384 else file_uploader 4385 ) 4386 4387 def create_moving_window_call_rate_policy( 4388 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4389 ) -> MovingWindowCallRatePolicy: 4390 rates = [ 4391 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4392 ] 4393 matchers = [ 4394 self._create_component_from_model(model=matcher, config=config) 4395 for matcher in model.matchers 4396 ] 4397 return MovingWindowCallRatePolicy( 4398 rates=rates, 4399 matchers=matchers, 4400 ) 4401 4402 def create_unlimited_call_rate_policy( 4403 self, 
        model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> UnlimitedCallRatePolicy:
        # Build each request matcher sub-component; the policy itself imposes no rate limit.
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]

        return UnlimitedCallRatePolicy(
            matchers=matchers,
        )

    def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate:
        """
        Build a Rate from its model: `limit` is interpolated against the config and
        coerced to int, `interval` is parsed as an ISO 8601 duration.
        """
        interpolated_limit = InterpolatedString.create(str(model.limit), parameters={})
        return Rate(
            limit=int(interpolated_limit.eval(config=config)),
            interval=parse_duration(model.interval),
        )

    def create_http_request_matcher(
        self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any
    ) -> HttpRequestRegexMatcher:
        """
        Build an HttpRequestRegexMatcher from its model.

        `weight` may be a string (interpolated against the config, then coerced to int)
        or a number; when provided it must be >= 1.

        :raises ValueError: if the evaluated weight is less than 1
        """
        weight = model.weight
        if weight is not None:
            if isinstance(weight, str):
                # Interpolated weights (e.g. "{{ config['w'] }}") are evaluated before coercion.
                weight = int(InterpolatedString.create(weight, parameters={}).eval(config))
            else:
                weight = int(weight)
            if weight < 1:
                raise ValueError(f"weight must be >= 1, got {weight}")
        return HttpRequestRegexMatcher(
            method=model.method,
            url_base=model.url_base,
            url_path_pattern=model.url_path_pattern,
            params=model.params,
            headers=model.headers,
            weight=weight,
        )

    def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None:
        """Create an HTTPAPIBudget from the given definition and store it on the factory."""
        self._api_budget = self.create_component(
            model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config
        )

    def create_grouping_partition_router(
        self,
        model: GroupingPartitionRouterModel,
        config: Config,
        *,
        stream_name: str,
        **kwargs: Any,
    ) -> GroupingPartitionRouter:
        """
        Build a GroupingPartitionRouter that batches partitions of an underlying router.

        :param stream_name: propagated to the underlying partition router's creation
        :raises ValueError: if group_size < 1, or if the underlying router configures
            request options (unsupported when grouping — see comment below)
        """
        underlying_router = self._create_component_from_model(
            model=model.underlying_partition_router,
            config=config,
            stream_name=stream_name,
            **kwargs,
        )
        if model.group_size < 1:
            raise ValueError(f"Group size must be greater than 0, got {model.group_size}")

        # Request options in underlying partition routers are not supported for GroupingPartitionRouter
        # because they are specific to individual partitions and cannot be aggregated or handled
        # when grouping, potentially leading to incorrect API calls. Any request customization
        # should be managed at the stream level through the requester's configuration.
        if isinstance(underlying_router, SubstreamPartitionRouter):
            if any(
                parent_config.request_option
                for parent_config in underlying_router.parent_stream_configs
            ):
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        if isinstance(underlying_router, ListPartitionRouter):
            if underlying_router.request_option:
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        return GroupingPartitionRouter(
            group_size=model.group_size,
            underlying_partition_router=underlying_router,
            # Deduplication defaults to on when the model leaves it unset.
            deduplicate=model.deduplicate if model.deduplicate is not None else True,
            config=config,
        )

    def _ensure_query_properties_to_model(
        self, requester: Union[HttpRequesterModel, CustomRequesterModel]
    ) -> None:
        """
        For some reason, it seems like CustomRequesterModel request_parameters stays as dictionaries which means that
        the other conditions relying on it being QueryPropertiesModel instead of a dict fail. Here, we migrate them to
        proper model.
        """
        if not hasattr(requester, "request_parameters"):
            return

        request_parameters = requester.request_parameters
        if request_parameters and isinstance(request_parameters, Dict):
            for request_parameter_key in request_parameters.keys():
                request_parameter = request_parameters[request_parameter_key]
                # Only dict entries explicitly tagged as QueryProperties are migrated in place.
                if (
                    isinstance(request_parameter, Dict)
                    and request_parameter.get("type") == "QueryProperties"
                ):
                    request_parameters[request_parameter_key] = QueryPropertiesModel.parse_obj(
                        request_parameter
                    )

    def _get_catalog_defined_cursor_field(
        self, stream_name: str, allow_catalog_defined_cursor_field: bool
    ) -> Optional[CursorField]:
        """
        Resolve the cursor field from the configured catalog, when the cursor model
        allows catalog-defined cursor fields.

        :return: a CursorField built from the configured stream's cursor_field, or None
            when the feature is disabled, the stream isn't configured, or no cursor is set
        :raises ValueError: if the configured cursor_field is nested (more than one element)
        """
        if not allow_catalog_defined_cursor_field:
            return None

        configured_stream = self._stream_name_to_configured_stream.get(stream_name)

        # Depending on the operation is being performed, there may not be a configured stream yet. In this
        # case we return None which will then use the default cursor field defined on the cursor model.
        # We also treat cursor_field: [""] (list with empty string) as no cursor field, since this can
        # occur when the platform serializes "no cursor configured" streams incorrectly.
        if (
            not configured_stream
            or not configured_stream.cursor_field
            or not configured_stream.cursor_field[0]
        ):
            return None
        elif len(configured_stream.cursor_field) > 1:
            raise ValueError(
                f"The `{stream_name}` stream does not support nested cursor_field. Please specify only a single cursor_field for the stream in the configured catalog."
            )
        else:
            return CursorField(
                cursor_field_key=configured_stream.cursor_field[0],
                supports_catalog_defined_cursor_field=allow_catalog_defined_cursor_field,
            )
688 def __init__( 689 self, 690 limit_pages_fetched_per_slice: Optional[int] = None, 691 limit_slices_fetched: Optional[int] = None, 692 emit_connector_builder_messages: bool = False, 693 disable_retries: bool = False, 694 disable_cache: bool = False, 695 message_repository: Optional[MessageRepository] = None, 696 connector_state_manager: Optional[ConnectorStateManager] = None, 697 max_concurrent_async_job_count: Optional[int] = None, 698 configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, 699 api_budget: Optional[APIBudget] = None, 700 ): 701 self._init_mappings() 702 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 703 self._limit_slices_fetched = limit_slices_fetched 704 self._emit_connector_builder_messages = emit_connector_builder_messages 705 self._disable_retries = disable_retries 706 self._disable_cache = disable_cache 707 self._message_repository = message_repository or InMemoryMessageRepository( 708 self._evaluate_log_level(emit_connector_builder_messages) 709 ) 710 self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream( 711 configured_catalog 712 ) 713 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 714 self._api_budget: Optional[Union[APIBudget]] = api_budget 715 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 716 # placeholder for deprecation warnings 717 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
841 def create_component( 842 self, 843 model_type: Type[BaseModel], 844 component_definition: ComponentDefinition, 845 config: Config, 846 **kwargs: Any, 847 ) -> Any: 848 """ 849 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 850 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 851 creating declarative components from that model. 852 853 :param model_type: The type of declarative component that is being initialized 854 :param component_definition: The mapping that represents a declarative component 855 :param config: The connector config that is provided by the customer 856 :return: The declarative component to be used at runtime 857 """ 858 859 component_type = component_definition.get("type") 860 if component_definition.get("type") != model_type.__name__: 861 raise ValueError( 862 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 863 ) 864 865 declarative_component_model = model_type.parse_obj(component_definition) 866 867 if not isinstance(declarative_component_model, model_type): 868 raise ValueError( 869 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 870 ) 871 872 return self._create_component_from_model( 873 model=declarative_component_model, config=config, **kwargs 874 )
Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating declarative components from that model.
Parameters
- model_type: The type of declarative component that is being initialized
- component_definition: The mapping that represents a declarative component
- config: The connector config that is provided by the customer
Returns
The declarative component to be used at runtime
891 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 892 """ 893 Returns the deprecation warnings that were collected during the creation of components. 894 """ 895 return self._collected_deprecation_logs
Returns the deprecation warnings that were collected during the creation of components.
912 def create_config_migration( 913 self, model: ConfigMigrationModel, config: Config 914 ) -> ConfigMigration: 915 transformations: List[ConfigTransformation] = [ 916 self._create_component_from_model(transformation, config) 917 for transformation in model.transformations 918 ] 919 920 return ConfigMigration( 921 description=model.description, 922 transformations=transformations, 923 )
925 def create_config_add_fields( 926 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 927 ) -> ConfigAddFields: 928 fields = [self._create_component_from_model(field, config) for field in model.fields] 929 return ConfigAddFields( 930 fields=fields, 931 condition=model.condition or "", 932 )
981 @staticmethod 982 def create_added_field_definition( 983 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 984 ) -> AddedFieldDefinition: 985 interpolated_value = InterpolatedString.create( 986 model.value, parameters=model.parameters or {} 987 ) 988 return AddedFieldDefinition( 989 path=model.path, 990 value=interpolated_value, 991 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 992 parameters=model.parameters or {}, 993 )
995 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 996 added_field_definitions = [ 997 self._create_component_from_model( 998 model=added_field_definition_model, 999 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 1000 added_field_definition_model.value_type 1001 ), 1002 config=config, 1003 ) 1004 for added_field_definition_model in model.fields 1005 ] 1006 return AddFields( 1007 fields=added_field_definitions, 1008 condition=model.condition or "", 1009 parameters=model.parameters or {}, 1010 )
1036 def create_dpath_flatten_fields( 1037 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 1038 ) -> DpathFlattenFields: 1039 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 1040 key_transformation = ( 1041 KeyTransformation( 1042 config=config, 1043 prefix=model.key_transformation.prefix, 1044 suffix=model.key_transformation.suffix, 1045 parameters=model.parameters or {}, 1046 ) 1047 if model.key_transformation is not None 1048 else None 1049 ) 1050 return DpathFlattenFields( 1051 config=config, 1052 field_path=model_field_path, 1053 delete_origin_value=model.delete_origin_value 1054 if model.delete_origin_value is not None 1055 else False, 1056 replace_record=model.replace_record if model.replace_record is not None else False, 1057 key_transformation=key_transformation, 1058 parameters=model.parameters or {}, 1059 )
1073 def create_api_key_authenticator( 1074 self, 1075 model: ApiKeyAuthenticatorModel, 1076 config: Config, 1077 token_provider: Optional[TokenProvider] = None, 1078 **kwargs: Any, 1079 ) -> ApiKeyAuthenticator: 1080 if model.inject_into is None and model.header is None: 1081 raise ValueError( 1082 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1083 ) 1084 1085 if model.inject_into is not None and model.header is not None: 1086 raise ValueError( 1087 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1088 ) 1089 1090 if token_provider is not None and model.api_token != "": 1091 raise ValueError( 1092 "If token_provider is set, api_token is ignored and has to be set to empty string." 1093 ) 1094 1095 request_option = ( 1096 self._create_component_from_model( 1097 model.inject_into, config, parameters=model.parameters or {} 1098 ) 1099 if model.inject_into 1100 else RequestOption( 1101 inject_into=RequestOptionType.header, 1102 field_name=model.header or "", 1103 parameters=model.parameters or {}, 1104 ) 1105 ) 1106 1107 return ApiKeyAuthenticator( 1108 token_provider=( 1109 token_provider 1110 if token_provider is not None 1111 else InterpolatedStringTokenProvider( 1112 api_token=model.api_token or "", 1113 config=config, 1114 parameters=model.parameters or {}, 1115 ) 1116 ), 1117 request_option=request_option, 1118 config=config, 1119 parameters=model.parameters or {}, 1120 )
1122 def create_legacy_to_per_partition_state_migration( 1123 self, 1124 model: LegacyToPerPartitionStateMigrationModel, 1125 config: Mapping[str, Any], 1126 declarative_stream: DeclarativeStreamModel, 1127 ) -> LegacyToPerPartitionStateMigration: 1128 retriever = declarative_stream.retriever 1129 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1130 raise ValueError( 1131 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1132 ) 1133 partition_router = retriever.partition_router 1134 if not isinstance( 1135 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1136 ): 1137 raise ValueError( 1138 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1139 ) 1140 if not hasattr(partition_router, "parent_stream_configs"): 1141 raise ValueError( 1142 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1143 ) 1144 1145 if not hasattr(declarative_stream, "incremental_sync"): 1146 raise ValueError( 1147 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1148 ) 1149 1150 return LegacyToPerPartitionStateMigration( 1151 partition_router, # type: ignore # was already checked above 1152 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 1153 config, 1154 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1155 )
1157 def create_session_token_authenticator( 1158 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1159 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1160 decoder = ( 1161 self._create_component_from_model(model=model.decoder, config=config) 1162 if model.decoder 1163 else JsonDecoder(parameters={}) 1164 ) 1165 login_requester = self._create_component_from_model( 1166 model=model.login_requester, 1167 config=config, 1168 name=f"{name}_login_requester", 1169 decoder=decoder, 1170 ) 1171 token_provider = SessionTokenProvider( 1172 login_requester=login_requester, 1173 session_token_path=model.session_token_path, 1174 expiration_duration=parse_duration(model.expiration_duration) 1175 if model.expiration_duration 1176 else None, 1177 parameters=model.parameters or {}, 1178 message_repository=self._message_repository, 1179 decoder=decoder, 1180 ) 1181 if model.request_authentication.type == "Bearer": 1182 return ModelToComponentFactory.create_bearer_authenticator( 1183 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1184 config, 1185 token_provider=token_provider, 1186 ) 1187 else: 1188 # Get the api_token template if specified, default to just the session token 1189 api_token_template = ( 1190 getattr(model.request_authentication, "api_token", None) or "{{ session_token }}" 1191 ) 1192 final_token_provider: TokenProvider = InterpolatedSessionTokenProvider( 1193 config=config, 1194 api_token=api_token_template, 1195 session_token_provider=token_provider, 1196 parameters=model.parameters or {}, 1197 ) 1198 return self.create_api_key_authenticator( 1199 ApiKeyAuthenticatorModel( 1200 type="ApiKeyAuthenticator", 1201 api_token="", 1202 inject_into=model.request_authentication.inject_into, 1203 ), # type: ignore # $parameters and headers default to None 1204 config=config, 1205 token_provider=final_token_provider, 1206 )
1208 @staticmethod 1209 def create_basic_http_authenticator( 1210 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1211 ) -> BasicHttpAuthenticator: 1212 return BasicHttpAuthenticator( 1213 password=model.password or "", 1214 username=model.username, 1215 config=config, 1216 parameters=model.parameters or {}, 1217 )
1219 @staticmethod 1220 def create_bearer_authenticator( 1221 model: BearerAuthenticatorModel, 1222 config: Config, 1223 token_provider: Optional[TokenProvider] = None, 1224 **kwargs: Any, 1225 ) -> BearerAuthenticator: 1226 if token_provider is not None and model.api_token != "": 1227 raise ValueError( 1228 "If token_provider is set, api_token is ignored and has to be set to empty string." 1229 ) 1230 return BearerAuthenticator( 1231 token_provider=( 1232 token_provider 1233 if token_provider is not None 1234 else InterpolatedStringTokenProvider( 1235 api_token=model.api_token or "", 1236 config=config, 1237 parameters=model.parameters or {}, 1238 ) 1239 ), 1240 config=config, 1241 parameters=model.parameters or {}, 1242 )
1244 @staticmethod 1245 def create_dynamic_stream_check_config( 1246 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1247 ) -> DynamicStreamCheckConfig: 1248 return DynamicStreamCheckConfig( 1249 dynamic_stream_name=model.dynamic_stream_name, 1250 stream_count=model.stream_count, 1251 )
1253 def create_check_stream( 1254 self, model: CheckStreamModel, config: Config, **kwargs: Any 1255 ) -> CheckStream: 1256 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1257 raise ValueError( 1258 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1259 ) 1260 1261 dynamic_streams_check_configs = ( 1262 [ 1263 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1264 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1265 ] 1266 if model.dynamic_streams_check_configs 1267 else [] 1268 ) 1269 1270 return CheckStream( 1271 stream_names=model.stream_names or [], 1272 dynamic_streams_check_configs=dynamic_streams_check_configs, 1273 parameters={}, 1274 )
1276 @staticmethod 1277 def create_check_dynamic_stream( 1278 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1279 ) -> CheckDynamicStream: 1280 assert model.use_check_availability is not None # for mypy 1281 1282 use_check_availability = model.use_check_availability 1283 1284 return CheckDynamicStream( 1285 stream_count=model.stream_count, 1286 use_check_availability=use_check_availability, 1287 parameters={}, 1288 )
1290 def create_composite_error_handler( 1291 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1292 ) -> CompositeErrorHandler: 1293 error_handlers = [ 1294 self._create_component_from_model(model=error_handler_model, config=config) 1295 for error_handler_model in model.error_handlers 1296 ] 1297 return CompositeErrorHandler( 1298 error_handlers=error_handlers, parameters=model.parameters or {} 1299 )
1301 @staticmethod 1302 def create_concurrency_level( 1303 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1304 ) -> ConcurrencyLevel: 1305 return ConcurrencyLevel( 1306 default_concurrency=model.default_concurrency, 1307 max_concurrency=model.max_concurrency, 1308 config=config, 1309 parameters={}, 1310 )
1312 @staticmethod 1313 def apply_stream_state_migrations( 1314 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1315 ) -> MutableMapping[str, Any]: 1316 if stream_state_migrations: 1317 for state_migration in stream_state_migrations: 1318 if state_migration.should_migrate(stream_state): 1319 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 1320 stream_state = dict(state_migration.migrate(stream_state)) 1321 return stream_state
    def create_concurrent_cursor_from_datetime_based_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: MutableMapping[str, Any],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        runtime_lookback_window: Optional[datetime.timedelta] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        """
        Build a ConcurrentCursor from a DatetimeBasedCursor component definition.

        :param model_type: expected to parse into a DatetimeBasedCursorModel
        :param component_definition: mapping form of the cursor (may carry "parameters" or "$parameters")
        :param stream_name: stream the cursor belongs to
        :param stream_namespace: namespace of the stream, if any
        :param stream_state: incoming state; NOTE: mutated in place when runtime_lookback_window applies
        :param config: connector config used for all interpolation
        :param message_repository: overrides the factory-level repository when provided
        :param runtime_lookback_window: extra lookback subtracted from the state value (failed-partition handling)
        :raises ValueError: on type mismatch, step/cursor_granularity inconsistency, or invalid clamping target
        """
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
        # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
        # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        model_parameters = datetime_based_cursor_model.parameters or {}

        # The catalog-defined cursor field (if allowed and configured) wins over the model's cursor_field.
        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                datetime_based_cursor_model.cursor_field,
                parameters=model_parameters,
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        interpolated_partition_field_start = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_start or "start_time",
            parameters=model_parameters,
        )
        interpolated_partition_field_end = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_end or "end_time",
            parameters=model_parameters,
        )

        # (start, end) keys each slice exposes for request interpolation.
        slice_boundary_fields = (
            interpolated_partition_field_start.eval(config=config),
            interpolated_partition_field_end.eval(config=config),
        )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        # lookback_window stays None unless the model's value interpolates to something truthy.
        lookback_window = None
        interpolated_lookback_window = (
            InterpolatedString.create(
                datetime_based_cursor_model.lookback_window,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.lookback_window
            else None
        )
        if interpolated_lookback_window:
            evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
            if evaluated_lookback_window:
                lookback_window = parse_duration(evaluated_lookback_window)

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Adjusts the stream state by applying the runtime lookback window.
        # This is used to ensure correct state handling in case of failed partitions.
        stream_state_value = stream_state.get(cursor_field.cursor_field_key)
        if runtime_lookback_window and stream_state_value:
            new_stream_state = (
                connector_state_converter.parse_timestamp(stream_state_value)
                - runtime_lookback_window
            )
            stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
                new_stream_state
            )

        start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
        if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
            start_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.start_datetime, config=config
            )
        else:
            start_date_runtime_value = datetime_based_cursor_model.start_datetime

        end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]]
        if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel):
            end_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.end_datetime, config=config
            )
        else:
            end_date_runtime_value = datetime_based_cursor_model.end_datetime

        interpolated_start_date = MinMaxDatetime.create(
            interpolated_string_or_min_max_datetime=start_date_runtime_value,
            parameters=datetime_based_cursor_model.parameters,
        )
        interpolated_end_date = (
            None
            if not end_date_runtime_value
            else MinMaxDatetime.create(
                end_date_runtime_value, datetime_based_cursor_model.parameters
            )
        )

        # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
        if not interpolated_start_date.datetime_format:
            interpolated_start_date.datetime_format = datetime_format
        if interpolated_end_date and not interpolated_end_date.datetime_format:
            interpolated_end_date.datetime_format = datetime_format

        start_date = interpolated_start_date.get_datetime(config=config)
        # Without an explicit end date, the state converter supplies the end provider.
        end_date_provider = (
            partial(interpolated_end_date.get_datetime, config)
            if interpolated_end_date
            else connector_state_converter.get_end_provider()
        )

        if (
            datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity
        ) or (
            not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity
        ):
            raise ValueError(
                f"If step is defined, cursor_granularity should be as well and vice-versa. "
                f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`"
            )

        # When step is not defined, default to a step size from the starting date to the present moment
        step_length = datetime.timedelta.max
        interpolated_step = (
            InterpolatedString.create(
                datetime_based_cursor_model.step,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.step
            else None
        )
        if interpolated_step:
            evaluated_step = interpolated_step.eval(config)
            if evaluated_step:
                step_length = parse_duration(evaluated_step)

        clamping_strategy: ClampingStrategy = NoClamping()
        if datetime_based_cursor_model.clamping:
            # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
            # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime
            # object which we want to keep agnostic of being low-code
            target = InterpolatedString(
                string=datetime_based_cursor_model.clamping.target,
                parameters=model_parameters,
            )
            evaluated_target = target.eval(config=config)
            match evaluated_target:
                case "DAY":
                    clamping_strategy = DayClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        DayClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(seconds=1),
                    )
                case "WEEK":
                    if (
                        not datetime_based_cursor_model.clamping.target_details
                        or "weekday" not in datetime_based_cursor_model.clamping.target_details
                    ):
                        raise ValueError(
                            "Given WEEK clamping, weekday needs to be provided as target_details"
                        )
                    weekday = self._assemble_weekday(
                        datetime_based_cursor_model.clamping.target_details["weekday"]
                    )
                    clamping_strategy = WeekClampingStrategy(weekday)
                    end_date_provider = ClampingEndProvider(
                        WeekClampingStrategy(weekday, is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case "MONTH":
                    clamping_strategy = MonthClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        MonthClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case _:
                    raise ValueError(
                        f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
                    )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=slice_boundary_fields,
            start=start_date,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            end_provider=end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            lookback_window=lookback_window,
            slice_range=step_length,
            cursor_granularity=cursor_granularity,
            clamping_strategy=clamping_strategy,
        )
    def create_concurrent_cursor_from_incrementing_count_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: MutableMapping[str, Any],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        """
        Build a ConcurrentCursor from an IncrementingCountCursor component definition.

        :param model_type: expected to parse into an IncrementingCountCursorModel
        :param component_definition: mapping form of the cursor model
        :param stream_name: stream the cursor belongs to
        :param stream_namespace: namespace of the stream, if any
        :param stream_state: incoming state handed to the cursor
        :param config: connector config used for interpolation
        :param message_repository: overrides the factory-level repository when provided
        :raises ValueError: when the definition's "type" or the parsed model class does not match
        """
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        incrementing_count_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}"
            )

        start_value: Union[int, str, None] = incrementing_count_cursor_model.start_value
        # Pydantic Union type coercion can convert int 0 to string '0' depending on Union order.
        # We need to handle both int and str representations of numeric values.
        # Evaluate the InterpolatedString and convert to int for the ConcurrentCursor.
        if start_value is not None:
            interpolated_start_value = InterpolatedString.create(
                str(start_value),  # Ensure we pass a string to InterpolatedString.create
                parameters=incrementing_count_cursor_model.parameters or {},
            )
            evaluated_start_value: int = int(interpolated_start_value.eval(config=config))
        else:
            # Missing start_value defaults to counting from 0.
            evaluated_start_value = 0

        # The catalog-defined cursor field (if allowed and configured) wins over the model's cursor_field.
        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                incrementing_count_cursor_model.cursor_field,
                parameters=incrementing_count_cursor_model.parameters or {},
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        connector_state_converter = IncrementingCountStreamStateConverter(
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
        )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=None,
            start=evaluated_start_value,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            end_provider=connector_state_converter.get_end_provider(),  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        )
def create_concurrent_cursor_from_perpartition_cursor(
    self,
    state_manager: ConnectorStateManager,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    stream_name: str,
    stream_namespace: Optional[str],
    config: Config,
    stream_state: MutableMapping[str, Any],
    partition_router: PartitionRouter,
    attempt_to_create_cursor_if_not_provided: bool = False,
    **kwargs: Any,
) -> ConcurrentPerPartitionCursor:
    """
    Build a ConcurrentPerPartitionCursor: one datetime-based child cursor per partition
    emitted by `partition_router`, all sharing a single state converter.

    :param state_manager: manager used to persist the per-partition stream state
    :param model_type: expected Pydantic model class; `component_definition["type"]` must match its name
    :param component_definition: raw manifest/model dict describing the datetime-based cursor
    :param stream_name: name of the stream the cursor belongs to
    :param stream_namespace: optional namespace of the stream
    :param config: connector config used to evaluate interpolated fields
    :param stream_state: previously-persisted state to resume from
    :param partition_router: produces the partitions each child cursor tracks
    :param attempt_to_create_cursor_if_not_provided: forwarded to ConcurrentPerPartitionCursor
    :raises ValueError: if the definition's type doesn't match `model_type`, or parsing
        yields something other than a DatetimeBasedCursorModel
    :return: the configured concurrent per-partition cursor
    """
    component_type = component_definition.get("type")
    if component_definition.get("type") != model_type.__name__:
        raise ValueError(
            f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
        )

    # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
    # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
    # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
    # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
    if "$parameters" not in component_definition and "parameters" in component_definition:
        component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
    datetime_based_cursor_model = model_type.parse_obj(component_definition)

    if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
        raise ValueError(
            f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
        )

    # A cursor field configured in the catalog takes precedence over the manifest's, when allowed.
    cursor_field = self._get_catalog_defined_cursor_field(
        stream_name=stream_name,
        allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
        or False,
    )

    if not cursor_field:
        interpolated_cursor_field = InterpolatedString.create(
            datetime_based_cursor_model.cursor_field,
            # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
            # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
            # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
            # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
            parameters=datetime_based_cursor_model.parameters or {},
        )
        cursor_field = CursorField(
            cursor_field_key=interpolated_cursor_field.eval(config=config),
            supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

    datetime_format = datetime_based_cursor_model.datetime_format

    cursor_granularity = (
        parse_duration(datetime_based_cursor_model.cursor_granularity)
        if datetime_based_cursor_model.cursor_granularity
        else None
    )

    connector_state_converter: DateTimeStreamStateConverter
    connector_state_converter = CustomFormatConcurrentStreamStateConverter(
        datetime_format=datetime_format,
        input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
        is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
        cursor_granularity=cursor_granularity,
    )

    # Create the cursor factory; each partition gets its own child cursor built from the
    # same definition. NoopMessageRepository keeps child cursors from double-emitting messages.
    cursor_factory = ConcurrentCursorFactory(
        partial(
            self.create_concurrent_cursor_from_datetime_based_cursor,
            state_manager=state_manager,
            model_type=model_type,
            component_definition=component_definition,
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            config=config,
            message_repository=NoopMessageRepository(),
        )
    )

    # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
    use_global_cursor = isinstance(
        partition_router, GroupingPartitionRouter
    ) or component_definition.get("global_substream_cursor", False)

    # Return the concurrent cursor and state converter
    return ConcurrentPerPartitionCursor(
        cursor_factory=cursor_factory,
        partition_router=partition_router,
        stream_name=stream_name,
        stream_namespace=stream_namespace,
        stream_state=stream_state,
        message_repository=self._message_repository,  # type: ignore
        connector_state_manager=state_manager,
        connector_state_converter=connector_state_converter,
        cursor_field=cursor_field,
        use_global_cursor=use_global_cursor,
        attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided,
    )
1752 @staticmethod 1753 def create_constant_backoff_strategy( 1754 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1755 ) -> ConstantBackoffStrategy: 1756 return ConstantBackoffStrategy( 1757 backoff_time_in_seconds=model.backoff_time_in_seconds, 1758 config=config, 1759 parameters=model.parameters or {}, 1760 )
1762 def create_cursor_pagination( 1763 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1764 ) -> CursorPaginationStrategy: 1765 if isinstance(decoder, PaginationDecoderDecorator): 1766 inner_decoder = decoder.decoder 1767 else: 1768 inner_decoder = decoder 1769 decoder = PaginationDecoderDecorator(decoder=decoder) 1770 1771 if self._is_supported_decoder_for_pagination(inner_decoder): 1772 decoder_to_use = decoder 1773 else: 1774 raise ValueError( 1775 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1776 ) 1777 1778 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 1779 # If page_size is a string that represents an integer (not an interpolation), convert it back. 1780 page_size = model.page_size 1781 if isinstance(page_size, str) and page_size.isdigit(): 1782 page_size = int(page_size) 1783 1784 return CursorPaginationStrategy( 1785 cursor_value=model.cursor_value, 1786 decoder=decoder_to_use, 1787 page_size=page_size, 1788 stop_condition=model.stop_condition, 1789 config=config, 1790 parameters=model.parameters or {}, 1791 )
def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any:
    """
    Generically creates a custom component based on the model type and a class_name reference to the custom Python class being
    instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor

    :param model: The Pydantic model of the custom component being created
    :param config: The custom defined connector config
    :param kwargs: caller-supplied overrides; on field collision these win over the model's values
    :return: The declarative component built from the Pydantic model to be used at runtime
    """
    custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name)
    # Type hints of the custom class drive both type inference and constructor-arg filtering below.
    component_fields = get_type_hints(custom_component_class)
    model_args = model.dict()
    model_args["config"] = config

    # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions
    # we defer to these arguments over the component's definition
    for key, arg in kwargs.items():
        model_args[key] = arg

    # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not
    # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to
    # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components
    for model_field, model_value in model_args.items():
        # If a custom component field doesn't have a type set, we try to use the type hints to infer the type
        if (
            isinstance(model_value, dict)
            and "type" not in model_value
            and model_field in component_fields
        ):
            derived_type = self._derive_component_type_from_type_hints(
                component_fields.get(model_field)
            )
            if derived_type:
                model_value["type"] = derived_type

        if self._is_component(model_value):
            model_args[model_field] = self._create_nested_component(
                model,
                model_field,
                model_value,
                config,
                **kwargs,
            )
        elif isinstance(model_value, list):
            # Lists may mix component dicts and plain values; only component entries are converted.
            vals = []
            for v in model_value:
                if isinstance(v, dict) and "type" not in v and model_field in component_fields:
                    derived_type = self._derive_component_type_from_type_hints(
                        component_fields.get(model_field)
                    )
                    if derived_type:
                        v["type"] = derived_type
                if self._is_component(v):
                    vals.append(
                        self._create_nested_component(
                            model,
                            model_field,
                            v,
                            config,
                            **kwargs,
                        )
                    )
                else:
                    vals.append(v)
            model_args[model_field] = vals

    # Only pass arguments that the custom class actually declares in its type hints.
    kwargs = {
        class_field: model_args[class_field]
        for class_field in component_fields.keys()
        if class_field in model_args
    }
    return custom_component_class(**kwargs)
Generically creates a custom component based on the model type and a class_name reference to the custom Python class being instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor.
Parameters
- model: The Pydantic model of the custom component being created
- config: The custom defined connector config
Returns
The declarative component built from the Pydantic model to be used at runtime
def create_default_stream(
    self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any
) -> AbstractStream:
    """
    Assemble a concurrent DefaultStream from a declarative stream model: partition router,
    concurrent cursor, request-options provider, retriever, transformations, and schema loader.

    :param model: declarative stream definition from the manifest
    :param config: connector config used when building nested components
    :param is_parent: present for interface compatibility; not read in this body — TODO confirm
        whether downstream kwargs consumers rely on it
    :param kwargs: forwarded to the partition-router builder
    :raises ValueError: when an IncrementingCountCursor is combined with per-partition state
    :return: the fully-wired concurrent stream
    """
    primary_key = model.primary_key.__root__ if model.primary_key else None
    self._migrate_state(model, config)

    partition_router = self._build_stream_slicer_from_partition_router(
        model.retriever,
        config,
        stream_name=model.name,
        **kwargs,
    )
    concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
    if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
        cursor_model: DatetimeBasedCursorModel = model.incremental_sync

        end_time_option = (
            self._create_component_from_model(
                cursor_model.end_time_option, config, parameters=cursor_model.parameters or {}
            )
            if cursor_model.end_time_option
            else None
        )
        start_time_option = (
            self._create_component_from_model(
                cursor_model.start_time_option, config, parameters=cursor_model.parameters or {}
            )
            if cursor_model.start_time_option
            else None
        )

        datetime_request_options_provider = DatetimeBasedRequestOptionsProvider(
            start_time_option=start_time_option,
            end_time_option=end_time_option,
            partition_field_start=cursor_model.partition_field_start,
            partition_field_end=cursor_model.partition_field_end,
            config=config,
            parameters=model.parameters or {},
        )
        # Per-partition cursors need a provider that resolves options per partition slice.
        request_options_provider = (
            datetime_request_options_provider
            if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor)
            else PerPartitionRequestOptionsProvider(
                partition_router, datetime_request_options_provider
            )
        )
    elif model.incremental_sync and isinstance(
        model.incremental_sync, IncrementingCountCursorModel
    ):
        if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor):
            raise ValueError(
                "PerPartition does not support per partition states because switching to global state is time based"
            )

        cursor_model: IncrementingCountCursorModel = model.incremental_sync  # type: ignore

        start_time_option = (
            self._create_component_from_model(
                cursor_model.start_value_option,  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
                config,
                parameters=cursor_model.parameters or {},
            )
            if cursor_model.start_value_option  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
            else None
        )

        # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but
        # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time
        partition_field_start = "start"

        request_options_provider = DatetimeBasedRequestOptionsProvider(
            start_time_option=start_time_option,
            partition_field_start=partition_field_start,
            config=config,
            parameters=model.parameters or {},
        )
    else:
        # Full-refresh (no incremental sync): retriever falls back to its own defaults.
        request_options_provider = None

    transformations = []
    if model.transformations:
        for transformation_model in model.transformations:
            transformations.append(
                self._create_component_from_model(model=transformation_model, config=config)
            )
    file_uploader = None
    if model.file_uploader:
        file_uploader = self._create_component_from_model(
            model=model.file_uploader, config=config
        )

    # FinalStateCursor means no real cursor-driven slicing; slice by partitions only.
    stream_slicer: ConcurrentStreamSlicer = (
        partition_router
        if isinstance(concurrent_cursor, FinalStateCursor)
        else concurrent_cursor
    )

    retriever = self._create_component_from_model(
        model=model.retriever,
        config=config,
        name=model.name,
        primary_key=primary_key,
        request_options_provider=request_options_provider,
        stream_slicer=stream_slicer,
        partition_router=partition_router,
        has_stop_condition_cursor=self._is_stop_condition_on_cursor(model),
        is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model),
        cursor=concurrent_cursor,
        transformations=transformations,
        file_uploader=file_uploader,
        incremental_sync=model.incremental_sync,
    )
    # Async retrievers own their slicer; prefer it over the one computed above.
    if isinstance(retriever, AsyncRetriever):
        stream_slicer = retriever.stream_slicer

    schema_loader: SchemaLoader
    if model.schema_loader and isinstance(model.schema_loader, list):
        nested_schema_loaders = [
            self._create_component_from_model(model=nested_schema_loader, config=config)
            for nested_schema_loader in model.schema_loader
        ]
        schema_loader = CompositeSchemaLoader(
            schema_loaders=nested_schema_loaders, parameters={}
        )
    elif model.schema_loader:
        schema_loader = self._create_component_from_model(
            model=model.schema_loader,  # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models
            config=config,
        )
    else:
        options = model.parameters or {}
        if "name" not in options:
            options["name"] = model.name
        schema_loader = DefaultSchemaLoader(config=config, parameters=options)
    # Cache schema lookups so repeated get_json_schema calls don't re-resolve the schema.
    schema_loader = CachingSchemaLoaderDecorator(schema_loader)

    stream_name = model.name or ""
    return DefaultStream(
        partition_generator=StreamSlicerPartitionGenerator(
            DeclarativePartitionFactory(
                stream_name,
                schema_loader,
                retriever,
                self._message_repository,
            ),
            stream_slicer,
            slice_limit=self._limit_slices_fetched,
        ),
        name=stream_name,
        json_schema=schema_loader.get_json_schema,
        primary_key=get_primary_key_from_stream(primary_key),
        cursor_field=(
            concurrent_cursor.cursor_field
            if hasattr(concurrent_cursor, "cursor_field")
            else None
        ),
        logger=logging.getLogger(f"airbyte.{stream_name}"),
        cursor=concurrent_cursor,
        supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader),
    )
2299 def create_default_error_handler( 2300 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2301 ) -> DefaultErrorHandler: 2302 backoff_strategies = [] 2303 if model.backoff_strategies: 2304 for backoff_strategy_model in model.backoff_strategies: 2305 backoff_strategies.append( 2306 self._create_component_from_model(model=backoff_strategy_model, config=config) 2307 ) 2308 2309 response_filters = [] 2310 if model.response_filters: 2311 for response_filter_model in model.response_filters: 2312 response_filters.append( 2313 self._create_component_from_model(model=response_filter_model, config=config) 2314 ) 2315 response_filters.append( 2316 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2317 ) 2318 2319 return DefaultErrorHandler( 2320 backoff_strategies=backoff_strategies, 2321 max_retries=model.max_retries, 2322 response_filters=response_filters, 2323 config=config, 2324 parameters=model.parameters or {}, 2325 )
2327 def create_default_paginator( 2328 self, 2329 model: DefaultPaginatorModel, 2330 config: Config, 2331 *, 2332 url_base: str, 2333 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2334 decoder: Optional[Decoder] = None, 2335 cursor_used_for_stop_condition: Optional[Cursor] = None, 2336 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2337 if decoder: 2338 if self._is_supported_decoder_for_pagination(decoder): 2339 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2340 else: 2341 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2342 else: 2343 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2344 page_size_option = ( 2345 self._create_component_from_model(model=model.page_size_option, config=config) 2346 if model.page_size_option 2347 else None 2348 ) 2349 page_token_option = ( 2350 self._create_component_from_model(model=model.page_token_option, config=config) 2351 if model.page_token_option 2352 else None 2353 ) 2354 pagination_strategy = self._create_component_from_model( 2355 model=model.pagination_strategy, 2356 config=config, 2357 decoder=decoder_to_use, 2358 extractor_model=extractor_model, 2359 ) 2360 if cursor_used_for_stop_condition: 2361 pagination_strategy = StopConditionPaginationStrategyDecorator( 2362 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2363 ) 2364 paginator = DefaultPaginator( 2365 decoder=decoder_to_use, 2366 page_size_option=page_size_option, 2367 page_token_option=page_token_option, 2368 pagination_strategy=pagination_strategy, 2369 url_base=url_base, 2370 config=config, 2371 parameters=model.parameters or {}, 2372 ) 2373 if self._limit_pages_fetched_per_slice: 2374 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2375 return paginator
2377 def create_dpath_extractor( 2378 self, 2379 model: DpathExtractorModel, 2380 config: Config, 2381 decoder: Optional[Decoder] = None, 2382 **kwargs: Any, 2383 ) -> DpathExtractor: 2384 if decoder: 2385 decoder_to_use = decoder 2386 else: 2387 decoder_to_use = JsonDecoder(parameters={}) 2388 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2389 2390 record_expander = None 2391 if model.record_expander: 2392 record_expander = self._create_component_from_model( 2393 model=model.record_expander, 2394 config=config, 2395 ) 2396 2397 return DpathExtractor( 2398 decoder=decoder_to_use, 2399 field_path=model_field_path, 2400 config=config, 2401 parameters=model.parameters or {}, 2402 record_expander=record_expander, 2403 )
2405 def create_record_expander( 2406 self, 2407 model: RecordExpanderModel, 2408 config: Config, 2409 **kwargs: Any, 2410 ) -> RecordExpander: 2411 return RecordExpander( 2412 expand_records_from_field=model.expand_records_from_field, 2413 config=config, 2414 parameters=model.parameters or {}, 2415 remain_original_record=model.remain_original_record or False, 2416 on_no_records=OnNoRecords(model.on_no_records.value) 2417 if model.on_no_records 2418 else OnNoRecords.skip, 2419 )
def create_http_requester(
    self,
    model: HttpRequesterModel,
    config: Config,
    decoder: Decoder = JsonDecoder(parameters={}),
    query_properties_key: Optional[str] = None,
    use_cache: Optional[bool] = None,
    *,
    name: str,
) -> HttpRequester:
    """
    Build an HttpRequester with its authenticator, error handler, and request-options
    provider resolved from the model.

    :param model: declarative HttpRequester definition
    :param config: connector config
    :param decoder: response decoder shared with the authenticator and requester.
        NOTE(review): the default is a single JsonDecoder instance evaluated once at
        import time and shared across calls — presumably harmless if JsonDecoder is
        stateless, but worth confirming; callers can also pass None explicitly, which
        is handled by the `if decoder` guard below.
    :param query_properties_key: key under which query properties are injected
    :param use_cache: caller-requested caching, OR-ed with the model's use_cache flag
    :param name: stream/component name the requester is built for
    :return: the configured HttpRequester
    """
    authenticator = (
        self._create_component_from_model(
            model=model.authenticator,
            config=config,
            url_base=model.url or model.url_base,
            name=name,
            decoder=decoder,
        )
        if model.authenticator
        else None
    )
    # No declared error handler -> a bare DefaultErrorHandler with no strategies/filters.
    error_handler = (
        self._create_component_from_model(model=model.error_handler, config=config)
        if model.error_handler
        else DefaultErrorHandler(
            backoff_strategies=[],
            response_filters=[],
            config=config,
            parameters=model.parameters or {},
        )
    )

    api_budget = self._api_budget

    request_options_provider = InterpolatedRequestOptionsProvider(
        request_body=model.request_body,
        request_body_data=model.request_body_data,
        request_body_json=model.request_body_json,
        request_headers=model.request_headers,
        request_parameters=model.request_parameters,  # type: ignore # QueryProperties have been removed in `create_simple_retriever`
        query_properties_key=query_properties_key,
        config=config,
        parameters=model.parameters or {},
    )

    assert model.use_cache is not None  # for mypy
    assert model.http_method is not None  # for mypy

    # Caching is on when either the model or the caller requests it, unless globally disabled.
    should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache

    return HttpRequester(
        name=name,
        url=model.url,
        url_base=model.url_base,
        path=model.path,
        authenticator=authenticator,
        error_handler=error_handler,
        api_budget=api_budget,
        http_method=HttpMethod[model.http_method.value],
        request_options_provider=request_options_provider,
        config=config,
        disable_retries=self._disable_retries,
        parameters=model.parameters or {},
        message_repository=self._message_repository,
        use_cache=should_use_cache,
        decoder=decoder,
        stream_response=decoder.is_stream_response() if decoder else False,
    )
2509 @staticmethod 2510 def create_http_response_filter( 2511 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2512 ) -> HttpResponseFilter: 2513 if model.action: 2514 action = ResponseAction(model.action.value) 2515 else: 2516 action = None 2517 2518 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2519 2520 http_codes = ( 2521 set(model.http_codes) if model.http_codes else set() 2522 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2523 2524 return HttpResponseFilter( 2525 action=action, 2526 failure_type=failure_type, 2527 error_message=model.error_message or "", 2528 error_message_contains=model.error_message_contains or "", 2529 http_codes=http_codes, 2530 predicate=model.predicate or "", 2531 config=config, 2532 parameters=model.parameters or {}, 2533 )
2541 def create_complex_field_type( 2542 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2543 ) -> ComplexFieldType: 2544 items = ( 2545 self._create_component_from_model(model=model.items, config=config) 2546 if isinstance(model.items, ComplexFieldTypeModel) 2547 else model.items 2548 ) 2549 2550 return ComplexFieldType(field_type=model.field_type, items=items)
2552 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2553 target_type = ( 2554 self._create_component_from_model(model=model.target_type, config=config) 2555 if isinstance(model.target_type, ComplexFieldTypeModel) 2556 else model.target_type 2557 ) 2558 2559 return TypesMap( 2560 target_type=target_type, 2561 current_type=model.current_type, 2562 condition=model.condition if model.condition is not None else "True", 2563 )
2565 def create_schema_type_identifier( 2566 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2567 ) -> SchemaTypeIdentifier: 2568 types_mapping = [] 2569 if model.types_mapping: 2570 types_mapping.extend( 2571 [ 2572 self._create_component_from_model(types_map, config=config) 2573 for types_map in model.types_mapping 2574 ] 2575 ) 2576 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2577 [x for x in model.schema_pointer] if model.schema_pointer else [] 2578 ) 2579 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2580 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2581 [x for x in model.type_pointer] if model.type_pointer else None 2582 ) 2583 2584 return SchemaTypeIdentifier( 2585 schema_pointer=model_schema_pointer, 2586 key_pointer=model_key_pointer, 2587 type_pointer=model_type_pointer, 2588 types_mapping=types_mapping, 2589 parameters=model.parameters or {}, 2590 )
2592 def create_dynamic_schema_loader( 2593 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2594 ) -> DynamicSchemaLoader: 2595 schema_transformations = [] 2596 if model.schema_transformations: 2597 for transformation_model in model.schema_transformations: 2598 schema_transformations.append( 2599 self._create_component_from_model(model=transformation_model, config=config) 2600 ) 2601 name = "dynamic_properties" 2602 retriever = self._create_component_from_model( 2603 model=model.retriever, 2604 config=config, 2605 name=name, 2606 primary_key=None, 2607 partition_router=self._build_stream_slicer_from_partition_router( 2608 model.retriever, config 2609 ), 2610 transformations=[], 2611 use_cache=True, 2612 log_formatter=( 2613 lambda response: format_http_message( 2614 response, 2615 f"Schema loader '{name}' request", 2616 f"Request performed in order to extract schema.", 2617 name, 2618 is_auxiliary=True, 2619 ) 2620 ), 2621 ) 2622 schema_type_identifier = self._create_component_from_model( 2623 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2624 ) 2625 schema_filter = ( 2626 self._create_component_from_model( 2627 model.schema_filter, config=config, parameters=model.parameters or {} 2628 ) 2629 if model.schema_filter is not None 2630 else None 2631 ) 2632 2633 return DynamicSchemaLoader( 2634 retriever=retriever, 2635 config=config, 2636 schema_transformations=schema_transformations, 2637 schema_filter=schema_filter, 2638 schema_type_identifier=schema_type_identifier, 2639 parameters=model.parameters or {}, 2640 )
2660 def create_gzip_decoder( 2661 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2662 ) -> Decoder: 2663 _compressed_response_types = { 2664 "gzip", 2665 "x-gzip", 2666 "gzip, deflate", 2667 "x-gzip, deflate", 2668 "application/zip", 2669 "application/gzip", 2670 "application/x-gzip", 2671 "application/x-zip-compressed", 2672 } 2673 2674 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2675 2676 if self._emit_connector_builder_messages: 2677 # This is very surprising but if the response is not streamed, 2678 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2679 # which uses urllib3 directly and does not uncompress the data. 2680 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2681 2682 return CompositeRawDecoder.by_headers( 2683 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2684 stream_response=True, 2685 fallback_parser=gzip_parser.inner_parser, 2686 )
2735 def create_jwt_authenticator( 2736 self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2737 ) -> JwtAuthenticator: 2738 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2739 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2740 request_option = ( 2741 self._create_component_from_model(model.request_option, config) 2742 if model.request_option 2743 else None 2744 ) 2745 return JwtAuthenticator( 2746 config=config, 2747 parameters=model.parameters or {}, 2748 algorithm=JwtAlgorithm(model.algorithm.value), 2749 secret_key=model.secret_key, 2750 base64_encode_secret_key=model.base64_encode_secret_key, 2751 token_duration=model.token_duration, 2752 header_prefix=model.header_prefix, 2753 kid=jwt_headers.kid, 2754 typ=jwt_headers.typ, 2755 cty=jwt_headers.cty, 2756 iss=jwt_payload.iss, 2757 sub=jwt_payload.sub, 2758 aud=jwt_payload.aud, 2759 additional_jwt_headers=model.additional_jwt_headers, 2760 additional_jwt_payload=model.additional_jwt_payload, 2761 passphrase=model.passphrase, 2762 request_option=request_option, 2763 )
2765 def create_list_partition_router( 2766 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2767 ) -> ListPartitionRouter: 2768 request_option = ( 2769 self._create_component_from_model(model.request_option, config) 2770 if model.request_option 2771 else None 2772 ) 2773 return ListPartitionRouter( 2774 cursor_field=model.cursor_field, 2775 request_option=request_option, 2776 values=model.values, 2777 config=config, 2778 parameters=model.parameters or {}, 2779 )
2781 @staticmethod 2782 def create_min_max_datetime( 2783 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2784 ) -> MinMaxDatetime: 2785 return MinMaxDatetime( 2786 datetime=model.datetime, 2787 datetime_format=model.datetime_format or "", 2788 max_datetime=model.max_datetime or "", 2789 min_datetime=model.min_datetime or "", 2790 parameters=model.parameters or {}, 2791 )
def create_oauth_authenticator(
    self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any
) -> DeclarativeOauth2Authenticator:
    """
    Build an OAuth2 authenticator. When a refresh_token_updater is configured, a
    single-use refresh-token authenticator is returned with every interpolated field
    eagerly evaluated against `config`; otherwise a standard declarative OAuth2
    authenticator is returned with fields left for lazy interpolation.

    :param model: declarative OAuthAuthenticator definition
    :param config: connector config used to evaluate interpolated fields
    :return: the configured authenticator (either variant)
    """
    profile_assertion = (
        self._create_component_from_model(model.profile_assertion, config=config)
        if model.profile_assertion
        else None
    )

    refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values = (
        self._get_refresh_token_error_information(model)
    )
    if model.refresh_token_updater:
        # ignore type error because fixing it would have a lot of dependencies, revisit later
        return DeclarativeSingleUseRefreshTokenOauth2Authenticator(  # type: ignore
            config,
            InterpolatedString.create(
                model.token_refresh_endpoint,  # type: ignore
                parameters=model.parameters or {},
            ).eval(config),
            access_token_name=InterpolatedString.create(
                model.access_token_name or "access_token", parameters=model.parameters or {}
            ).eval(config),
            refresh_token_name=model.refresh_token_updater.refresh_token_name,
            expires_in_name=InterpolatedString.create(
                model.expires_in_name or "expires_in", parameters=model.parameters or {}
            ).eval(config),
            client_id_name=InterpolatedString.create(
                model.client_id_name or "client_id", parameters=model.parameters or {}
            ).eval(config),
            # client_id/client_secret may legitimately be unset; only interpolate when present.
            client_id=InterpolatedString.create(
                model.client_id, parameters=model.parameters or {}
            ).eval(config)
            if model.client_id
            else model.client_id,
            client_secret_name=InterpolatedString.create(
                model.client_secret_name or "client_secret", parameters=model.parameters or {}
            ).eval(config),
            client_secret=InterpolatedString.create(
                model.client_secret, parameters=model.parameters or {}
            ).eval(config)
            if model.client_secret
            else model.client_secret,
            access_token_config_path=model.refresh_token_updater.access_token_config_path,
            refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path,
            token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path,
            grant_type_name=InterpolatedString.create(
                model.grant_type_name or "grant_type", parameters=model.parameters or {}
            ).eval(config),
            grant_type=InterpolatedString.create(
                model.grant_type or "refresh_token", parameters=model.parameters or {}
            ).eval(config),
            refresh_request_body=InterpolatedMapping(
                model.refresh_request_body or {}, parameters=model.parameters or {}
            ).eval(config),
            refresh_request_headers=InterpolatedMapping(
                model.refresh_request_headers or {}, parameters=model.parameters or {}
            ).eval(config),
            scopes=model.scopes,
            token_expiry_date_format=model.token_expiry_date_format,
            # A custom expiry format implies the endpoint returns an absolute expiry time.
            token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
            message_repository=self._message_repository,
            refresh_token_error_status_codes=refresh_token_error_status_codes,
            refresh_token_error_key=refresh_token_error_key,
            refresh_token_error_values=refresh_token_error_values,
        )
    # ignore type error because fixing it would have a lot of dependencies, revisit later
    return DeclarativeOauth2Authenticator(  # type: ignore
        access_token_name=model.access_token_name or "access_token",
        access_token_value=model.access_token_value,
        client_id_name=model.client_id_name or "client_id",
        client_id=model.client_id,
        client_secret_name=model.client_secret_name or "client_secret",
        client_secret=model.client_secret,
        expires_in_name=model.expires_in_name or "expires_in",
        grant_type_name=model.grant_type_name or "grant_type",
        grant_type=model.grant_type or "refresh_token",
        refresh_request_body=model.refresh_request_body,
        refresh_request_headers=model.refresh_request_headers,
        refresh_token_name=model.refresh_token_name or "refresh_token",
        refresh_token=model.refresh_token,
        scopes=model.scopes,
        token_expiry_date=model.token_expiry_date,
        token_expiry_date_format=model.token_expiry_date_format,
        token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
        token_refresh_endpoint=model.token_refresh_endpoint,
        config=config,
        parameters=model.parameters or {},
        message_repository=self._message_repository,
        profile_assertion=profile_assertion,
        use_profile_assertion=model.use_profile_assertion,
        refresh_token_error_status_codes=refresh_token_error_status_codes,
        refresh_token_error_key=refresh_token_error_key,
        refresh_token_error_values=refresh_token_error_values,
    )
2947 def create_offset_increment( 2948 self, 2949 model: OffsetIncrementModel, 2950 config: Config, 2951 decoder: Decoder, 2952 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2953 **kwargs: Any, 2954 ) -> OffsetIncrement: 2955 if isinstance(decoder, PaginationDecoderDecorator): 2956 inner_decoder = decoder.decoder 2957 else: 2958 inner_decoder = decoder 2959 decoder = PaginationDecoderDecorator(decoder=decoder) 2960 2961 if self._is_supported_decoder_for_pagination(inner_decoder): 2962 decoder_to_use = decoder 2963 else: 2964 raise ValueError( 2965 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2966 ) 2967 2968 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2969 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2970 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2971 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2972 # When we have more time to investigate we can look into reusing the same component. 2973 extractor = ( 2974 self._create_component_from_model( 2975 model=extractor_model, config=config, decoder=decoder_to_use 2976 ) 2977 if extractor_model 2978 else None 2979 ) 2980 2981 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 2982 # If page_size is a string that represents an integer (not an interpolation), convert it back. 2983 page_size = model.page_size 2984 if isinstance(page_size, str) and page_size.isdigit(): 2985 page_size = int(page_size) 2986 2987 return OffsetIncrement( 2988 page_size=page_size, 2989 config=config, 2990 decoder=decoder_to_use, 2991 extractor=extractor, 2992 inject_on_first_request=model.inject_on_first_request or False, 2993 parameters=model.parameters or {}, 2994 )
2996 @staticmethod 2997 def create_page_increment( 2998 model: PageIncrementModel, config: Config, **kwargs: Any 2999 ) -> PageIncrement: 3000 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 3001 # If page_size is a string that represents an integer (not an interpolation), convert it back. 3002 page_size = model.page_size 3003 if isinstance(page_size, str) and page_size.isdigit(): 3004 page_size = int(page_size) 3005 3006 return PageIncrement( 3007 page_size=page_size, 3008 config=config, 3009 start_from_page=model.start_from_page or 0, 3010 inject_on_first_request=model.inject_on_first_request or False, 3011 parameters=model.parameters or {}, 3012 )
3014 def create_parent_stream_config( 3015 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 3016 ) -> ParentStreamConfig: 3017 declarative_stream = self._create_component_from_model( 3018 model.stream, 3019 config=config, 3020 is_parent=True, 3021 **kwargs, 3022 ) 3023 request_option = ( 3024 self._create_component_from_model(model.request_option, config=config) 3025 if model.request_option 3026 else None 3027 ) 3028 3029 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 3030 raise ValueError( 3031 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 3032 ) 3033 3034 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 3035 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 3036 ) 3037 3038 return ParentStreamConfig( 3039 parent_key=model.parent_key, 3040 request_option=request_option, 3041 stream=declarative_stream, 3042 partition_field=model.partition_field, 3043 config=config, 3044 incremental_dependency=model.incremental_dependency or False, 3045 parameters=model.parameters or {}, 3046 extra_fields=model.extra_fields, 3047 lazy_read_pointer=model_lazy_read_pointer, 3048 )
3050 def create_properties_from_endpoint( 3051 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 3052 ) -> PropertiesFromEndpoint: 3053 retriever = self._create_component_from_model( 3054 model=model.retriever, 3055 config=config, 3056 name="dynamic_properties", 3057 primary_key=None, 3058 stream_slicer=None, 3059 transformations=[], 3060 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 3061 ) 3062 return PropertiesFromEndpoint( 3063 property_field_path=model.property_field_path, 3064 retriever=retriever, 3065 config=config, 3066 parameters=model.parameters or {}, 3067 )
3069 def create_property_chunking( 3070 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 3071 ) -> PropertyChunking: 3072 record_merge_strategy = ( 3073 self._create_component_from_model( 3074 model=model.record_merge_strategy, config=config, **kwargs 3075 ) 3076 if model.record_merge_strategy 3077 else None 3078 ) 3079 3080 property_limit_type: PropertyLimitType 3081 match model.property_limit_type: 3082 case PropertyLimitTypeModel.property_count: 3083 property_limit_type = PropertyLimitType.property_count 3084 case PropertyLimitTypeModel.characters: 3085 property_limit_type = PropertyLimitType.characters 3086 case _: 3087 raise ValueError(f"Invalid PropertyLimitType {property_limit_type}") 3088 3089 return PropertyChunking( 3090 property_limit_type=property_limit_type, 3091 property_limit=model.property_limit, 3092 record_merge_strategy=record_merge_strategy, 3093 config=config, 3094 parameters=model.parameters or {}, 3095 )
3097 def create_query_properties( 3098 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 3099 ) -> QueryProperties: 3100 if isinstance(model.property_list, list): 3101 property_list = model.property_list 3102 else: 3103 property_list = self._create_component_from_model( 3104 model=model.property_list, config=config, **kwargs 3105 ) 3106 3107 property_chunking = ( 3108 self._create_component_from_model( 3109 model=model.property_chunking, config=config, **kwargs 3110 ) 3111 if model.property_chunking 3112 else None 3113 ) 3114 3115 property_selector = ( 3116 self._create_component_from_model( 3117 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3118 ) 3119 if model.property_selector 3120 else None 3121 ) 3122 3123 return QueryProperties( 3124 property_list=property_list, 3125 always_include_properties=model.always_include_properties, 3126 property_chunking=property_chunking, 3127 property_selector=property_selector, 3128 config=config, 3129 parameters=model.parameters or {}, 3130 )
3132 def create_json_schema_property_selector( 3133 self, 3134 model: JsonSchemaPropertySelectorModel, 3135 config: Config, 3136 *, 3137 stream_name: str, 3138 **kwargs: Any, 3139 ) -> JsonSchemaPropertySelector: 3140 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3141 3142 transformations = [] 3143 if model.transformations: 3144 for transformation_model in model.transformations: 3145 transformations.append( 3146 self._create_component_from_model(model=transformation_model, config=config) 3147 ) 3148 3149 return JsonSchemaPropertySelector( 3150 configured_stream=configured_stream, 3151 properties_transformations=transformations, 3152 config=config, 3153 parameters=model.parameters or {}, 3154 )
3168 @staticmethod 3169 def create_request_option( 3170 model: RequestOptionModel, config: Config, **kwargs: Any 3171 ) -> RequestOption: 3172 inject_into = RequestOptionType(model.inject_into.value) 3173 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3174 [ 3175 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3176 for segment in model.field_path 3177 ] 3178 if model.field_path 3179 else None 3180 ) 3181 field_name = ( 3182 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3183 if model.field_name 3184 else None 3185 ) 3186 return RequestOption( 3187 field_name=field_name, 3188 field_path=field_path, 3189 inject_into=inject_into, 3190 parameters=kwargs.get("parameters", {}), 3191 )
    def create_record_selector(
        self,
        model: RecordSelectorModel,
        config: Config,
        *,
        name: str,
        transformations: List[RecordTransformation] | None = None,
        decoder: Decoder | None = None,
        client_side_incremental_sync_cursor: Optional[Cursor] = None,
        file_uploader: Optional[DefaultFileUploader] = None,
        **kwargs: Any,
    ) -> RecordSelector:
        """
        Build the RecordSelector responsible for extracting, filtering, normalizing
        and transforming records out of decoded responses.

        When ``client_side_incremental_sync_cursor`` is provided, the record filter is
        replaced by a ClientSideIncrementalRecordFilterDecorator that filters records
        against the cursor (preserving any declared filter condition), and
        ``transform_before_filtering`` defaults to True instead of False so transformed
        cursor fields are available to the filter.
        """
        extractor = self._create_component_from_model(
            model=model.extractor, decoder=decoder, config=config
        )
        record_filter = (
            self._create_component_from_model(model.record_filter, config=config)
            if model.record_filter
            else None
        )

        # Default is False unless explicitly set on the model.
        transform_before_filtering = (
            False if model.transform_before_filtering is None else model.transform_before_filtering
        )
        if client_side_incremental_sync_cursor:
            record_filter = ClientSideIncrementalRecordFilterDecorator(
                config=config,
                parameters=model.parameters,
                condition=model.record_filter.condition
                if (model.record_filter and hasattr(model.record_filter, "condition"))
                else None,
                cursor=client_side_incremental_sync_cursor,
            )
            # With client-side incremental filtering the default flips to True.
            transform_before_filtering = (
                True
                if model.transform_before_filtering is None
                else model.transform_before_filtering
            )

        if model.schema_normalization is None:
            # default to no schema normalization if not set
            model.schema_normalization = SchemaNormalizationModel.None_

        # Built-in normalization enums map to TypeTransformer configs; anything else is
        # treated as a custom normalization component to instantiate.
        schema_normalization = (
            TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization])
            if isinstance(model.schema_normalization, SchemaNormalizationModel)
            else self._create_component_from_model(model.schema_normalization, config=config)  # type: ignore[arg-type] # custom normalization model expected here
        )

        return RecordSelector(
            extractor=extractor,
            name=name,
            config=config,
            record_filter=record_filter,
            transformations=transformations or [],
            file_uploader=file_uploader,
            schema_normalization=schema_normalization,
            parameters=model.parameters or {},
            transform_before_filtering=transform_before_filtering,
        )
3262 def create_selective_authenticator( 3263 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3264 ) -> DeclarativeAuthenticator: 3265 authenticators = { 3266 name: self._create_component_from_model(model=auth, config=config) 3267 for name, auth in model.authenticators.items() 3268 } 3269 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 3270 return SelectiveAuthenticator( # type: ignore[abstract] 3271 config=config, 3272 authenticators=authenticators, 3273 authenticator_selection_path=model.authenticator_selection_path, 3274 **kwargs, 3275 )
3277 @staticmethod 3278 def create_legacy_session_token_authenticator( 3279 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3280 ) -> LegacySessionTokenAuthenticator: 3281 return LegacySessionTokenAuthenticator( 3282 api_url=url_base, 3283 header=model.header, 3284 login_url=model.login_url, 3285 password=model.password or "", 3286 session_token=model.session_token or "", 3287 session_token_response_key=model.session_token_response_key or "", 3288 username=model.username or "", 3289 validate_session_url=model.validate_session_url, 3290 config=config, 3291 parameters=model.parameters or {}, 3292 )
    def create_simple_retriever(
        self,
        model: SimpleRetrieverModel,
        config: Config,
        *,
        name: str,
        primary_key: Optional[Union[str, List[str], List[List[str]]]],
        request_options_provider: Optional[RequestOptionsProvider] = None,
        cursor: Optional[Cursor] = None,
        has_stop_condition_cursor: bool = False,
        is_client_side_incremental_sync: bool = False,
        transformations: List[RecordTransformation],
        file_uploader: Optional[DefaultFileUploader] = None,
        incremental_sync: Optional[
            Union[IncrementingCountCursorModel, DatetimeBasedCursorModel]
        ] = None,
        use_cache: Optional[bool] = None,
        log_formatter: Optional[Callable[[Response], Any]] = None,
        partition_router: Optional[PartitionRouter] = None,
        **kwargs: Any,
    ) -> SimpleRetriever:
        """
        Assemble a SimpleRetriever (or LazySimpleRetriever) from its declarative model:
        decoder, record selector, query properties, requester, request-options
        provider, and paginator.

        Returns a LazySimpleRetriever instead when the stream has a substream
        partition router with lazy_read_pointer parents and no prior stream state.
        """
        def _get_url(req: Requester) -> str:
            """
            Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever.
            This is needed because the URL is not set until the requester is created.
            """

            _url: str = (
                model.requester.url
                if hasattr(model.requester, "url") and model.requester.url is not None
                else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
            )
            _url_base: str = (
                model.requester.url_base
                if hasattr(model.requester, "url_base") and model.requester.url_base is not None
                else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None)
            )

            return _url or _url_base

        if cursor is None:
            cursor = FinalStateCursor(name, None, self._message_repository)

        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        record_selector = self._create_component_from_model(
            model=model.record_selector,
            name=name,
            config=config,
            decoder=decoder,
            transformations=transformations,
            client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None,
            file_uploader=file_uploader,
        )

        # Resolve QueryProperties from one of three mutually-exclusive locations:
        # request_parameters, fetch_properties_from_endpoint (deprecated), or query_properties.
        query_properties: Optional[QueryProperties] = None
        query_properties_key: Optional[str] = None
        self._ensure_query_properties_to_model(model.requester)
        if self._has_query_properties_in_request_parameters(model.requester):
            # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple
            # places instead of default to request_parameters which isn't clearly documented
            if (
                hasattr(model.requester, "fetch_properties_from_endpoint")
                and model.requester.fetch_properties_from_endpoint
            ):
                raise ValueError(
                    f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
                )

            query_properties_definitions = []
            for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters()
                if isinstance(request_parameter, QueryPropertiesModel):
                    query_properties_key = key
                    query_properties_definitions.append(request_parameter)

            if len(query_properties_definitions) > 1:
                raise ValueError(
                    f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
                )

            if len(query_properties_definitions) == 1:
                query_properties = self._create_component_from_model(
                    model=query_properties_definitions[0], stream_name=name, config=config
                )

            # Removes QueryProperties components from the interpolated mappings because it has been designed
            # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly
            # instead of through jinja interpolation
            if hasattr(model.requester, "request_parameters") and isinstance(
                model.requester.request_parameters, Mapping
            ):
                model.requester.request_parameters = self._remove_query_properties(
                    model.requester.request_parameters
                )
        elif (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            # todo: Deprecate this condition once dependent connectors migrate to query_properties
            query_properties_definition = QueryPropertiesModel(
                type="QueryProperties",
                property_list=model.requester.fetch_properties_from_endpoint,
                always_include_properties=None,
                property_chunking=None,
            )  # type: ignore # $parameters has a default value

            query_properties = self.create_query_properties(
                model=query_properties_definition,
                stream_name=name,
                config=config,
            )
        elif hasattr(model.requester, "query_properties") and model.requester.query_properties:
            query_properties = self.create_query_properties(
                model=model.requester.query_properties,
                stream_name=name,
                config=config,
            )

        requester = self._create_component_from_model(
            model=model.requester,
            decoder=decoder,
            name=name,
            query_properties_key=query_properties_key,
            use_cache=use_cache,
            config=config,
        )

        # Fall back to the partition router as request-options provider when no explicit
        # provider was supplied (the default provider contributes nothing).
        if not request_options_provider:
            request_options_provider = DefaultRequestOptionsProvider(parameters={})
        if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(
            partition_router, PartitionRouter
        ):
            request_options_provider = partition_router

        paginator = (
            self._create_component_from_model(
                model=model.paginator,
                config=config,
                url_base=_get_url(requester),
                extractor_model=model.record_selector.extractor,
                decoder=decoder,
                cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None,
            )
            if model.paginator
            else NoPagination(parameters={})
        )

        ignore_stream_slicer_parameters_on_paginated_requests = (
            model.ignore_stream_slicer_parameters_on_paginated_requests or False
        )

        # Lazy retrieval path: substream partition router with lazy_read_pointer parents
        # and no existing stream state.
        if (
            model.partition_router
            and isinstance(model.partition_router, SubstreamPartitionRouterModel)
            and not bool(self._connector_state_manager.get_stream_state(name, None))
            and any(
                parent_stream_config.lazy_read_pointer
                for parent_stream_config in model.partition_router.parent_stream_configs
            )
        ):
            if incremental_sync:
                if incremental_sync.type != "DatetimeBasedCursor":
                    raise ValueError(
                        f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                    )

                elif incremental_sync.step or incremental_sync.cursor_granularity:
                    raise ValueError(
                        f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                    )

            if model.decoder and model.decoder.type != "JsonDecoder":
                raise ValueError(
                    f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
                )

            return LazySimpleRetriever(
                name=name,
                paginator=paginator,
                primary_key=primary_key,
                requester=requester,
                record_selector=record_selector,
                stream_slicer=_NO_STREAM_SLICING,
                request_option_provider=request_options_provider,
                config=config,
                ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
                parameters=model.parameters or {},
            )

        if (
            model.record_selector.record_filter
            and model.pagination_reset
            and model.pagination_reset.limits
        ):
            raise ValueError("PaginationResetLimits are not supported while having record filter.")

        return SimpleRetriever(
            name=name,
            paginator=paginator,
            primary_key=primary_key,
            requester=requester,
            record_selector=record_selector,
            stream_slicer=_NO_STREAM_SLICING,
            request_option_provider=request_options_provider,
            config=config,
            ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
            additional_query_properties=query_properties,
            log_formatter=self._get_log_formatter(log_formatter, name),
            pagination_tracker_factory=self._create_pagination_tracker_factory(
                model.pagination_reset, cursor
            ),
            parameters=model.parameters or {},
        )
    def create_state_delegating_stream(
        self,
        model: StateDelegatingStreamModel,
        config: Config,
        **kwargs: Any,
    ) -> DefaultStream:
        """
        Create a stream that delegates to either its full-refresh or incremental
        variant based on the presence (and, optionally, the age) of stream state.

        No state -> full_refresh_stream. State present -> incremental_stream, unless
        ``api_retention_period`` is set and the saved cursor is older than the
        retention window, in which case state is cleared and the full-refresh stream
        is returned so the sync restarts from the configured start.
        """
        if (
            model.full_refresh_stream.name != model.name
            or model.name != model.incremental_stream.name
        ):
            raise ValueError(
                f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
            )

        # Resolve api_retention_period with config context (supports Jinja2 interpolation)
        resolved_retention_period: Optional[str] = None
        if model.api_retention_period:
            interpolated_retention = InterpolatedString.create(
                model.api_retention_period, parameters=model.parameters or {}
            )
            resolved_value = interpolated_retention.eval(config=config)
            if resolved_value:
                resolved_retention_period = str(resolved_value)

        # Retention-period validation needs datetime cursors; count-based cursors cannot
        # be compared against a time window.
        if resolved_retention_period:
            for stream_model in (model.full_refresh_stream, model.incremental_stream):
                if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel):
                    raise ValueError(
                        f"Stream '{model.name}' uses IncrementingCountCursor which is not supported "
                        f"with api_retention_period. IncrementingCountCursor does not use datetime-based "
                        f"cursors, so cursor age validation cannot be performed."
                    )

        stream_state = self._connector_state_manager.get_stream_state(model.name, None)

        if not stream_state:
            return self._create_component_from_model(  # type: ignore[no-any-return]
                model.full_refresh_stream, config=config, **kwargs
            )

        incremental_stream: DefaultStream = self._create_component_from_model(
            model.incremental_stream, config=config, **kwargs
        )  # type: ignore[assignment]

        # Only run cursor age validation for streams that are in the configured
        # catalog (or when no catalog was provided, e.g. during discover / connector
        # builder). Streams not selected by the user but instantiated as parent-stream
        # dependencies must not go through this path because it emits state messages
        # that the destination does not know about, causing "Stream not found" crashes.
        stream_is_in_catalog = (
            not self._stream_name_to_configured_stream  # no catalog → validate by default
            or model.name in self._stream_name_to_configured_stream
        )
        if resolved_retention_period and stream_is_in_catalog:
            full_refresh_stream: DefaultStream = self._create_component_from_model(
                model.full_refresh_stream, config=config, **kwargs
            )  # type: ignore[assignment]
            if self._is_cursor_older_than_retention_period(
                stream_state,
                full_refresh_stream.cursor,
                incremental_stream.cursor,
                resolved_retention_period,
                model.name,
            ):
                # Clear state BEFORE constructing the full_refresh_stream so that
                # its cursor starts from start_date instead of the stale cursor.
                self._connector_state_manager.update_state_for_stream(model.name, None, {})
                state_message = self._connector_state_manager.create_state_message(model.name, None)
                self._message_repository.emit_message(state_message)
                return self._create_component_from_model(  # type: ignore[no-any-return]
                    model.full_refresh_stream, config=config, **kwargs
                )

        return incremental_stream
    def create_async_retriever(
        self,
        model: AsyncRetrieverModel,
        config: Config,
        *,
        name: str,
        primary_key: Optional[
            Union[str, List[str], List[List[str]]]
        ],  # this seems to be needed to match create_simple_retriever
        stream_slicer: Optional[StreamSlicer],
        client_side_incremental_sync: Optional[Dict[str, Any]] = None,
        transformations: List[RecordTransformation],
        **kwargs: Any,
    ) -> AsyncRetriever:
        """
        Assemble an AsyncRetriever for async-job APIs: requesters for job creation,
        polling, download, and optional abort/delete/download-target steps are wired
        into an AsyncHttpJobRepository, which an AsyncJobPartitionRouter orchestrates
        per stream slice.
        """
        if model.download_target_requester and not model.download_target_extractor:
            raise ValueError(
                f"`download_target_extractor` required if using a `download_target_requester`"
            )

        def _get_download_retriever(
            requester: Requester, extractor: RecordExtractor, _decoder: Decoder
        ) -> SimpleRetriever:
            # We create a record selector for the download retriever
            # with no schema normalization and no transformations, neither record filter
            # as all this occurs in the record_selector of the AsyncRetriever
            record_selector = RecordSelector(
                extractor=extractor,
                name=name,
                record_filter=None,
                transformations=[],
                schema_normalization=TypeTransformer(TransformConfig.NoTransform),
                config=config,
                parameters={},
            )
            paginator = (
                self._create_component_from_model(
                    model=model.download_paginator,
                    decoder=_decoder,
                    config=config,
                    url_base="",
                )
                if model.download_paginator
                else NoPagination(parameters={})
            )

            return SimpleRetriever(
                requester=requester,
                record_selector=record_selector,
                primary_key=None,
                name=name,
                paginator=paginator,
                config=config,
                parameters={},
                log_formatter=self._get_log_formatter(None, name),
            )

        def _get_job_timeout() -> datetime.timedelta:
            # The model value is interpolated against config before being parsed as minutes.
            user_defined_timeout: Optional[int] = (
                int(
                    InterpolatedString.create(
                        str(model.polling_job_timeout),
                        parameters={},
                    ).eval(config)
                )
                if model.polling_job_timeout
                else None
            )

            # check for user defined timeout during the test read or 15 minutes
            test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15)
            # default value for non-connector builder is 60 minutes.
            default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60)

            return (
                test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout
            )

        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        record_selector = self._create_component_from_model(
            model=model.record_selector,
            config=config,
            decoder=decoder,
            name=name,
            transformations=transformations,
            client_side_incremental_sync=client_side_incremental_sync,
        )

        stream_slicer = stream_slicer or SinglePartitionRouter(parameters={})
        # Cap the number of slices during test reads (connector builder).
        if self._should_limit_slices_fetched():
            stream_slicer = cast(
                StreamSlicer,
                StreamSlicerTestReadDecorator(
                    wrapped_slicer=stream_slicer,
                    maximum_number_of_slices=self._limit_slices_fetched or 5,
                ),
            )

        creation_requester = self._create_component_from_model(
            model=model.creation_requester,
            decoder=decoder,
            config=config,
            name=f"job creation - {name}",
        )
        polling_requester = self._create_component_from_model(
            model=model.polling_requester,
            decoder=decoder,
            config=config,
            name=f"job polling - {name}",
        )
        job_download_components_name = f"job download - {name}"
        download_decoder = (
            self._create_component_from_model(model=model.download_decoder, config=config)
            if model.download_decoder
            else JsonDecoder(parameters={})
        )
        # Default extractor (empty field path) returns the decoded response as-is.
        download_extractor = (
            self._create_component_from_model(
                model=model.download_extractor,
                config=config,
                decoder=download_decoder,
                parameters=model.parameters,
            )
            if model.download_extractor
            else DpathExtractor(
                [],
                config=config,
                decoder=download_decoder,
                parameters=model.parameters or {},
            )
        )
        download_requester = self._create_component_from_model(
            model=model.download_requester,
            decoder=download_decoder,
            config=config,
            name=job_download_components_name,
        )
        download_retriever = _get_download_retriever(
            download_requester, download_extractor, download_decoder
        )
        abort_requester = (
            self._create_component_from_model(
                model=model.abort_requester,
                decoder=decoder,
                config=config,
                name=f"job abort - {name}",
            )
            if model.abort_requester
            else None
        )
        delete_requester = (
            self._create_component_from_model(
                model=model.delete_requester,
                decoder=decoder,
                config=config,
                name=f"job delete - {name}",
            )
            if model.delete_requester
            else None
        )
        download_target_requester = (
            self._create_component_from_model(
                model=model.download_target_requester,
                decoder=decoder,
                config=config,
                name=f"job extract_url - {name}",
            )
            if model.download_target_requester
            else None
        )
        status_extractor = self._create_component_from_model(
            model=model.status_extractor, decoder=decoder, config=config, name=name
        )
        download_target_extractor = (
            self._create_component_from_model(
                model=model.download_target_extractor,
                decoder=decoder,
                config=config,
                name=name,
            )
            if model.download_target_extractor
            else None
        )

        job_repository: AsyncJobRepository = AsyncHttpJobRepository(
            creation_requester=creation_requester,
            polling_requester=polling_requester,
            download_retriever=download_retriever,
            download_target_requester=download_target_requester,
            abort_requester=abort_requester,
            delete_requester=delete_requester,
            status_extractor=status_extractor,
            status_mapping=self._create_async_job_status_mapping(model.status_mapping, config),
            download_target_extractor=download_target_extractor,
            job_timeout=_get_job_timeout(),
        )

        async_job_partition_router = AsyncJobPartitionRouter(
            job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
                job_repository,
                stream_slices,
                self._job_tracker,
                self._message_repository,
                # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk
                has_bulk_parent=False,
                # set the `job_max_retry` to 1 for the `Connector Builder`` use-case.
                # `None` == default retry is set to 3 attempts, under the hood.
                job_max_retry=1 if self._emit_connector_builder_messages else None,
            ),
            stream_slicer=stream_slicer,
            config=config,
            parameters=model.parameters or {},
        )

        return AsyncRetriever(
            record_selector=record_selector,
            stream_slicer=async_job_partition_router,
            config=config,
            parameters=model.parameters or {},
        )
3971 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3972 config_migrations = [ 3973 self._create_component_from_model(migration, config) 3974 for migration in ( 3975 model.config_normalization_rules.config_migrations 3976 if ( 3977 model.config_normalization_rules 3978 and model.config_normalization_rules.config_migrations 3979 ) 3980 else [] 3981 ) 3982 ] 3983 config_transformations = [ 3984 self._create_component_from_model(transformation, config) 3985 for transformation in ( 3986 model.config_normalization_rules.transformations 3987 if ( 3988 model.config_normalization_rules 3989 and model.config_normalization_rules.transformations 3990 ) 3991 else [] 3992 ) 3993 ] 3994 config_validations = [ 3995 self._create_component_from_model(validation, config) 3996 for validation in ( 3997 model.config_normalization_rules.validations 3998 if ( 3999 model.config_normalization_rules 4000 and model.config_normalization_rules.validations 4001 ) 4002 else [] 4003 ) 4004 ] 4005 4006 return Spec( 4007 connection_specification=model.connection_specification, 4008 documentation_url=model.documentation_url, 4009 advanced_auth=model.advanced_auth, 4010 parameters={}, 4011 config_migrations=config_migrations, 4012 config_transformations=config_transformations, 4013 config_validations=config_validations, 4014 )
4016 def create_substream_partition_router( 4017 self, 4018 model: SubstreamPartitionRouterModel, 4019 config: Config, 4020 *, 4021 stream_name: str, 4022 **kwargs: Any, 4023 ) -> SubstreamPartitionRouter: 4024 parent_stream_configs = [] 4025 if model.parent_stream_configs: 4026 parent_stream_configs.extend( 4027 [ 4028 self.create_parent_stream_config_with_substream_wrapper( 4029 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 4030 ) 4031 for parent_stream_config in model.parent_stream_configs 4032 ] 4033 ) 4034 4035 return SubstreamPartitionRouter( 4036 parent_stream_configs=parent_stream_configs, 4037 parameters=model.parameters or {}, 4038 config=config, 4039 )
4041 def create_parent_stream_config_with_substream_wrapper( 4042 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 4043 ) -> Any: 4044 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 4045 4046 parent_state: Optional[Mapping[str, Any]] = ( 4047 child_state if model.incremental_dependency and child_state else None 4048 ) 4049 connector_state_manager = self._instantiate_parent_stream_state_manager( 4050 child_state, config, model, parent_state 4051 ) 4052 4053 substream_factory = ModelToComponentFactory( 4054 connector_state_manager=connector_state_manager, 4055 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 4056 limit_slices_fetched=self._limit_slices_fetched, 4057 emit_connector_builder_messages=self._emit_connector_builder_messages, 4058 disable_retries=self._disable_retries, 4059 disable_cache=self._disable_cache, 4060 message_repository=StateFilteringMessageRepository( 4061 LogAppenderMessageRepositoryDecorator( 4062 { 4063 "airbyte_cdk": {"stream": {"is_substream": True}}, 4064 "http": {"is_auxiliary": True}, 4065 }, 4066 self._message_repository, 4067 self._evaluate_log_level(self._emit_connector_builder_messages), 4068 ), 4069 ), 4070 api_budget=self._api_budget, 4071 ) 4072 4073 return substream_factory.create_parent_stream_config( 4074 model=model, config=config, stream_name=stream_name, **kwargs 4075 )
4135 @staticmethod 4136 def create_wait_time_from_header( 4137 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 4138 ) -> WaitTimeFromHeaderBackoffStrategy: 4139 return WaitTimeFromHeaderBackoffStrategy( 4140 header=model.header, 4141 parameters=model.parameters or {}, 4142 config=config, 4143 regex=model.regex, 4144 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 4145 if model.max_waiting_time_in_seconds is not None 4146 else None, 4147 )
4149 @staticmethod 4150 def create_wait_until_time_from_header( 4151 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 4152 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 4153 return WaitUntilTimeFromHeaderBackoffStrategy( 4154 header=model.header, 4155 parameters=model.parameters or {}, 4156 config=config, 4157 min_wait=model.min_wait, 4158 regex=model.regex, 4159 )
4167 @staticmethod 4168 def create_components_mapping_definition( 4169 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 4170 ) -> ComponentMappingDefinition: 4171 interpolated_value = InterpolatedString.create( 4172 model.value, parameters=model.parameters or {} 4173 ) 4174 field_path = [ 4175 InterpolatedString.create(path, parameters=model.parameters or {}) 4176 for path in model.field_path 4177 ] 4178 return ComponentMappingDefinition( 4179 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 4180 value=interpolated_value, 4181 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 4182 create_or_update=model.create_or_update, 4183 condition=model.condition, 4184 parameters=model.parameters or {}, 4185 )
4187 def create_http_components_resolver( 4188 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 4189 ) -> Any: 4190 retriever = self._create_component_from_model( 4191 model=model.retriever, 4192 config=config, 4193 name=f"{stream_name if stream_name else '__http_components_resolver'}", 4194 primary_key=None, 4195 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4196 transformations=[], 4197 ) 4198 4199 components_mapping = [] 4200 for component_mapping_definition_model in model.components_mapping: 4201 if component_mapping_definition_model.condition: 4202 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4203 components_mapping.append( 4204 self._create_component_from_model( 4205 model=component_mapping_definition_model, 4206 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4207 component_mapping_definition_model.value_type 4208 ), 4209 config=config, 4210 ) 4211 ) 4212 4213 return HttpComponentsResolver( 4214 retriever=retriever, 4215 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4216 config=config, 4217 components_mapping=components_mapping, 4218 parameters=model.parameters or {}, 4219 )
4221 @staticmethod 4222 def create_stream_config( 4223 model: StreamConfigModel, config: Config, **kwargs: Any 4224 ) -> StreamConfig: 4225 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 4226 [x for x in model.configs_pointer] if model.configs_pointer else [] 4227 ) 4228 4229 return StreamConfig( 4230 configs_pointer=model_configs_pointer, 4231 default_values=model.default_values, 4232 parameters=model.parameters or {}, 4233 )
4235 def create_config_components_resolver( 4236 self, 4237 model: ConfigComponentsResolverModel, 4238 config: Config, 4239 ) -> Any: 4240 model_stream_configs = ( 4241 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 4242 ) 4243 4244 stream_configs = [ 4245 self._create_component_from_model( 4246 stream_config, config=config, parameters=model.parameters or {} 4247 ) 4248 for stream_config in model_stream_configs 4249 ] 4250 4251 components_mapping = [ 4252 self._create_component_from_model( 4253 model=components_mapping_definition_model, 4254 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4255 components_mapping_definition_model.value_type 4256 ), 4257 config=config, 4258 parameters=model.parameters, 4259 ) 4260 for components_mapping_definition_model in model.components_mapping 4261 ] 4262 4263 return ConfigComponentsResolver( 4264 stream_configs=stream_configs, 4265 config=config, 4266 components_mapping=components_mapping, 4267 parameters=model.parameters or {}, 4268 )
4270 def create_parametrized_components_resolver( 4271 self, 4272 model: ParametrizedComponentsResolverModel, 4273 config: Config, 4274 ) -> ParametrizedComponentsResolver: 4275 stream_parameters = StreamParametersDefinition( 4276 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 4277 ) 4278 4279 components_mapping = [] 4280 for components_mapping_definition_model in model.components_mapping: 4281 if components_mapping_definition_model.condition: 4282 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4283 components_mapping.append( 4284 self._create_component_from_model( 4285 model=components_mapping_definition_model, 4286 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4287 components_mapping_definition_model.value_type 4288 ), 4289 config=config, 4290 ) 4291 ) 4292 return ParametrizedComponentsResolver( 4293 stream_parameters=stream_parameters, 4294 config=config, 4295 components_mapping=components_mapping, 4296 parameters=model.parameters or {}, 4297 )
4321 def create_http_api_budget( 4322 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4323 ) -> HttpAPIBudget: 4324 policies = [ 4325 self._create_component_from_model(model=policy, config=config) 4326 for policy in model.policies 4327 ] 4328 4329 return HttpAPIBudget( 4330 policies=policies, 4331 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4332 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4333 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4334 )
4336 def create_fixed_window_call_rate_policy( 4337 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4338 ) -> FixedWindowCallRatePolicy: 4339 matchers = [ 4340 self._create_component_from_model(model=matcher, config=config) 4341 for matcher in model.matchers 4342 ] 4343 4344 # Set the initial reset timestamp to 10 days from now. 4345 # This value will be updated by the first request. 4346 return FixedWindowCallRatePolicy( 4347 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4348 period=parse_duration(model.period), 4349 call_limit=model.call_limit, 4350 matchers=matchers, 4351 )
4353 def create_file_uploader( 4354 self, model: FileUploaderModel, config: Config, **kwargs: Any 4355 ) -> FileUploader: 4356 name = "File Uploader" 4357 requester = self._create_component_from_model( 4358 model=model.requester, 4359 config=config, 4360 name=name, 4361 **kwargs, 4362 ) 4363 download_target_extractor = self._create_component_from_model( 4364 model=model.download_target_extractor, 4365 config=config, 4366 name=name, 4367 **kwargs, 4368 ) 4369 emit_connector_builder_messages = self._emit_connector_builder_messages 4370 file_uploader = DefaultFileUploader( 4371 requester=requester, 4372 download_target_extractor=download_target_extractor, 4373 config=config, 4374 file_writer=NoopFileWriter() 4375 if emit_connector_builder_messages 4376 else LocalFileSystemFileWriter(), 4377 parameters=model.parameters or {}, 4378 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4379 ) 4380 4381 return ( 4382 ConnectorBuilderFileUploader(file_uploader) 4383 if emit_connector_builder_messages 4384 else file_uploader 4385 )
4387 def create_moving_window_call_rate_policy( 4388 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4389 ) -> MovingWindowCallRatePolicy: 4390 rates = [ 4391 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4392 ] 4393 matchers = [ 4394 self._create_component_from_model(model=matcher, config=config) 4395 for matcher in model.matchers 4396 ] 4397 return MovingWindowCallRatePolicy( 4398 rates=rates, 4399 matchers=matchers, 4400 )
4402 def create_unlimited_call_rate_policy( 4403 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4404 ) -> UnlimitedCallRatePolicy: 4405 matchers = [ 4406 self._create_component_from_model(model=matcher, config=config) 4407 for matcher in model.matchers 4408 ] 4409 4410 return UnlimitedCallRatePolicy( 4411 matchers=matchers, 4412 )
4421 def create_http_request_matcher( 4422 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4423 ) -> HttpRequestRegexMatcher: 4424 weight = model.weight 4425 if weight is not None: 4426 if isinstance(weight, str): 4427 weight = int(InterpolatedString.create(weight, parameters={}).eval(config)) 4428 else: 4429 weight = int(weight) 4430 if weight < 1: 4431 raise ValueError(f"weight must be >= 1, got {weight}") 4432 return HttpRequestRegexMatcher( 4433 method=model.method, 4434 url_base=model.url_base, 4435 url_path_pattern=model.url_path_pattern, 4436 params=model.params, 4437 headers=model.headers, 4438 weight=weight, 4439 )
4446 def create_grouping_partition_router( 4447 self, 4448 model: GroupingPartitionRouterModel, 4449 config: Config, 4450 *, 4451 stream_name: str, 4452 **kwargs: Any, 4453 ) -> GroupingPartitionRouter: 4454 underlying_router = self._create_component_from_model( 4455 model=model.underlying_partition_router, 4456 config=config, 4457 stream_name=stream_name, 4458 **kwargs, 4459 ) 4460 if model.group_size < 1: 4461 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4462 4463 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4464 # because they are specific to individual partitions and cannot be aggregated or handled 4465 # when grouping, potentially leading to incorrect API calls. Any request customization 4466 # should be managed at the stream level through the requester's configuration. 4467 if isinstance(underlying_router, SubstreamPartitionRouter): 4468 if any( 4469 parent_config.request_option 4470 for parent_config in underlying_router.parent_stream_configs 4471 ): 4472 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4473 4474 if isinstance(underlying_router, ListPartitionRouter): 4475 if underlying_router.request_option: 4476 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4477 4478 return GroupingPartitionRouter( 4479 group_size=model.group_size, 4480 underlying_partition_router=underlying_router, 4481 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4482 config=config, 4483 )