airbyte_cdk.sources.declarative.parsers.model_to_component_factory
1# 2# Copyright (c) 2025 Airbyte, Inc., all rights reserved. 3# 4 5from __future__ import annotations 6 7import datetime 8import importlib 9import inspect 10import logging 11import re 12from functools import partial 13from typing import ( 14 TYPE_CHECKING, 15 Any, 16 Callable, 17 Dict, 18 List, 19 Mapping, 20 MutableMapping, 21 Optional, 22 Tuple, 23 Type, 24 Union, 25 cast, 26 get_args, 27 get_origin, 28 get_type_hints, 29) 30 31if TYPE_CHECKING: 32 from airbyte_cdk.legacy.sources.declarative.incremental.datetime_based_cursor import ( 33 DatetimeBasedCursor, 34 ) 35 36from airbyte_protocol_dataclasses.models import ConfiguredAirbyteStream 37from isodate import parse_duration 38from pydantic.v1 import BaseModel 39from requests import Response 40 41from airbyte_cdk.connector_builder.models import ( 42 LogMessage as ConnectorBuilderLogMessage, 43) 44from airbyte_cdk.models import ( 45 AirbyteStateBlob, 46 AirbyteStateMessage, 47 AirbyteStateType, 48 AirbyteStreamState, 49 ConfiguredAirbyteCatalog, 50 FailureType, 51 Level, 52 StreamDescriptor, 53) 54from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager 55from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator 56from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker 57from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository 58from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus 59from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator 60from airbyte_cdk.sources.declarative.auth.declarative_authenticator import ( 61 DeclarativeAuthenticator, 62 NoAuth, 63) 64from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm 65from airbyte_cdk.sources.declarative.auth.oauth import ( 66 DeclarativeSingleUseRefreshTokenOauth2Authenticator, 67) 68from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator 69from 
airbyte_cdk.sources.declarative.auth.token import ( 70 ApiKeyAuthenticator, 71 BasicHttpAuthenticator, 72 BearerAuthenticator, 73 LegacySessionTokenAuthenticator, 74) 75from airbyte_cdk.sources.declarative.auth.token_provider import ( 76 InterpolatedSessionTokenProvider, 77 InterpolatedStringTokenProvider, 78 SessionTokenProvider, 79 TokenProvider, 80) 81from airbyte_cdk.sources.declarative.checks import ( 82 CheckDynamicStream, 83 CheckStream, 84 DynamicStreamCheckConfig, 85) 86from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel 87from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime 88from airbyte_cdk.sources.declarative.decoders import ( 89 Decoder, 90 IterableDecoder, 91 JsonDecoder, 92 PaginationDecoderDecorator, 93 XmlDecoder, 94 ZipfileDecoder, 95) 96from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import ( 97 CompositeRawDecoder, 98 CsvParser, 99 GzipParser, 100 JsonItemsParser, 101 JsonLineParser, 102 JsonParser, 103 Parser, 104) 105from airbyte_cdk.sources.declarative.expanders.record_expander import ( 106 OnNoRecords, 107 RecordExpander, 108) 109from airbyte_cdk.sources.declarative.extractors import ( 110 DpathExtractor, 111 RecordFilter, 112 RecordSelector, 113 ResponseToFileExtractor, 114) 115from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor 116from airbyte_cdk.sources.declarative.extractors.record_filter import ( 117 ClientSideIncrementalRecordFilterDecorator, 118) 119from airbyte_cdk.sources.declarative.incremental import ( 120 ConcurrentCursorFactory, 121 ConcurrentPerPartitionCursor, 122) 123from airbyte_cdk.sources.declarative.interpolation import InterpolatedString 124from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping 125from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import ( 126 LegacyToPerPartitionStateMigration, 127) 128from 
airbyte_cdk.sources.declarative.models import ( 129 CustomStateMigration, 130 PaginationResetLimits, 131) 132from airbyte_cdk.sources.declarative.models.base_model_with_deprecations import ( 133 DEPRECATION_LOGS_TAG, 134 BaseModelWithDeprecations, 135) 136from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 137 Action1 as PaginationResetActionModel, 138) 139from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 140 AddedFieldDefinition as AddedFieldDefinitionModel, 141) 142from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 143 AddFields as AddFieldsModel, 144) 145from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 146 ApiKeyAuthenticator as ApiKeyAuthenticatorModel, 147) 148from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 149 AsyncJobStatusMap as AsyncJobStatusMapModel, 150) 151from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 152 AsyncRetriever as AsyncRetrieverModel, 153) 154from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 155 BasicHttpAuthenticator as BasicHttpAuthenticatorModel, 156) 157from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 158 BearerAuthenticator as BearerAuthenticatorModel, 159) 160from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 161 CheckDynamicStream as CheckDynamicStreamModel, 162) 163from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 164 CheckStream as CheckStreamModel, 165) 166from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 167 ComplexFieldType as ComplexFieldTypeModel, 168) 169from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 170 ComponentMappingDefinition as ComponentMappingDefinitionModel, 171) 172from airbyte_cdk.sources.declarative.models.declarative_component_schema 
import ( 173 CompositeErrorHandler as CompositeErrorHandlerModel, 174) 175from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 176 ConcurrencyLevel as ConcurrencyLevelModel, 177) 178from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 179 ConfigAddFields as ConfigAddFieldsModel, 180) 181from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 182 ConfigComponentsResolver as ConfigComponentsResolverModel, 183) 184from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 185 ConfigMigration as ConfigMigrationModel, 186) 187from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 188 ConfigRemapField as ConfigRemapFieldModel, 189) 190from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 191 ConfigRemoveFields as ConfigRemoveFieldsModel, 192) 193from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 194 ConstantBackoffStrategy as ConstantBackoffStrategyModel, 195) 196from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 197 CsvDecoder as CsvDecoderModel, 198) 199from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 200 CursorPagination as CursorPaginationModel, 201) 202from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 203 CustomAuthenticator as CustomAuthenticatorModel, 204) 205from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 206 CustomBackoffStrategy as CustomBackoffStrategyModel, 207) 208from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 209 CustomConfigTransformation as CustomConfigTransformationModel, 210) 211from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 212 CustomDecoder as CustomDecoderModel, 213) 214from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 215 
CustomErrorHandler as CustomErrorHandlerModel, 216) 217from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 218 CustomPaginationStrategy as CustomPaginationStrategyModel, 219) 220from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 221 CustomPartitionRouter as CustomPartitionRouterModel, 222) 223from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 224 CustomRecordExtractor as CustomRecordExtractorModel, 225) 226from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 227 CustomRecordFilter as CustomRecordFilterModel, 228) 229from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 230 CustomRequester as CustomRequesterModel, 231) 232from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 233 CustomRetriever as CustomRetrieverModel, 234) 235from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 236 CustomSchemaLoader as CustomSchemaLoader, 237) 238from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 239 CustomSchemaNormalization as CustomSchemaNormalizationModel, 240) 241from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 242 CustomTransformation as CustomTransformationModel, 243) 244from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 245 CustomValidationStrategy as CustomValidationStrategyModel, 246) 247from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 248 DatetimeBasedCursor as DatetimeBasedCursorModel, 249) 250from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 251 DeclarativeStream as DeclarativeStreamModel, 252) 253from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 254 DefaultErrorHandler as DefaultErrorHandlerModel, 255) 256from airbyte_cdk.sources.declarative.models.declarative_component_schema 
import ( 257 DefaultPaginator as DefaultPaginatorModel, 258) 259from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 260 DpathExtractor as DpathExtractorModel, 261) 262from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 263 DpathFlattenFields as DpathFlattenFieldsModel, 264) 265from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 266 DpathValidator as DpathValidatorModel, 267) 268from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 269 DynamicSchemaLoader as DynamicSchemaLoaderModel, 270) 271from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 272 DynamicStreamCheckConfig as DynamicStreamCheckConfigModel, 273) 274from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 275 ExponentialBackoffStrategy as ExponentialBackoffStrategyModel, 276) 277from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 278 FileUploader as FileUploaderModel, 279) 280from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 281 FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel, 282) 283from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 284 FlattenFields as FlattenFieldsModel, 285) 286from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 287 GroupByKeyMergeStrategy as GroupByKeyMergeStrategyModel, 288) 289from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 290 GroupingPartitionRouter as GroupingPartitionRouterModel, 291) 292from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 293 GzipDecoder as GzipDecoderModel, 294) 295from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 296 HTTPAPIBudget as HTTPAPIBudgetModel, 297) 298from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 299 
HttpComponentsResolver as HttpComponentsResolverModel, 300) 301from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 302 HttpRequester as HttpRequesterModel, 303) 304from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 305 HttpRequestRegexMatcher as HttpRequestRegexMatcherModel, 306) 307from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 308 HttpResponseFilter as HttpResponseFilterModel, 309) 310from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 311 IncrementingCountCursor as IncrementingCountCursorModel, 312) 313from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 314 InlineSchemaLoader as InlineSchemaLoaderModel, 315) 316from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 317 IterableDecoder as IterableDecoderModel, 318) 319from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 320 JsonDecoder as JsonDecoderModel, 321) 322from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 323 JsonFileSchemaLoader as JsonFileSchemaLoaderModel, 324) 325from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 326 JsonItemsDecoder as JsonItemsDecoderModel, 327) 328from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 329 JsonlDecoder as JsonlDecoderModel, 330) 331from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 332 JsonSchemaPropertySelector as JsonSchemaPropertySelectorModel, 333) 334from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 335 JwtAuthenticator as JwtAuthenticatorModel, 336) 337from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 338 JwtHeaders as JwtHeadersModel, 339) 340from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 341 JwtPayload as JwtPayloadModel, 342) 343from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 344 KeysReplace as KeysReplaceModel, 345) 346from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 347 KeysToLower as KeysToLowerModel, 348) 349from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 350 KeysToSnakeCase as KeysToSnakeCaseModel, 351) 352from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 353 LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel, 354) 355from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 356 LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel, 357) 358from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 359 ListPartitionRouter as ListPartitionRouterModel, 360) 361from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 362 MinMaxDatetime as MinMaxDatetimeModel, 363) 364from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 365 MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel, 366) 367from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 368 NoAuth as NoAuthModel, 369) 370from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 371 NoPagination as NoPaginationModel, 372) 373from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 374 OAuthAuthenticator as OAuthAuthenticatorModel, 375) 376from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 377 OffsetIncrement as OffsetIncrementModel, 378) 379from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 380 PageIncrement as PageIncrementModel, 381) 382from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 383 PaginationReset as PaginationResetModel, 384) 385from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 386 ParametrizedComponentsResolver as ParametrizedComponentsResolverModel, 387) 388from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 389 ParentStreamConfig as ParentStreamConfigModel, 390) 391from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 392 PredicateValidator as PredicateValidatorModel, 393) 394from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 395 PropertiesFromEndpoint as PropertiesFromEndpointModel, 396) 397from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 398 PropertyChunking as PropertyChunkingModel, 399) 400from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 401 PropertyLimitType as PropertyLimitTypeModel, 402) 403from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 404 QueryProperties as QueryPropertiesModel, 405) 406from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 407 Rate as RateModel, 408) 409from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 410 RecordExpander as RecordExpanderModel, 411) 412from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 413 RecordFilter as RecordFilterModel, 414) 415from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 416 RecordSelector as RecordSelectorModel, 417) 418from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 419 RefreshTokenUpdater as RefreshTokenUpdaterModel, 420) 421from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 422 RemoveFields as RemoveFieldsModel, 423) 424from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 425 RequestOption as RequestOptionModel, 426) 427from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 428 
RequestPath as RequestPathModel, 429) 430from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 431 ResponseToFileExtractor as ResponseToFileExtractorModel, 432) 433from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 434 SchemaNormalization as SchemaNormalizationModel, 435) 436from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 437 SchemaTypeIdentifier as SchemaTypeIdentifierModel, 438) 439from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 440 SelectiveAuthenticator as SelectiveAuthenticatorModel, 441) 442from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 443 SessionTokenAuthenticator as SessionTokenAuthenticatorModel, 444) 445from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 446 SimpleRetriever as SimpleRetrieverModel, 447) 448from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel 449from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 450 StateDelegatingStream as StateDelegatingStreamModel, 451) 452from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 453 StreamConfig as StreamConfigModel, 454) 455from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 456 SubstreamPartitionRouter as SubstreamPartitionRouterModel, 457) 458from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 459 TypesMap as TypesMapModel, 460) 461from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 462 UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel, 463) 464from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 465 ValidateAdheresToSchema as ValidateAdheresToSchemaModel, 466) 467from airbyte_cdk.sources.declarative.models.declarative_component_schema import ValueType 468from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 469 WaitTimeFromHeader as WaitTimeFromHeaderModel, 470) 471from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 472 WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel, 473) 474from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 475 XmlDecoder as XmlDecoderModel, 476) 477from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 478 ZipfileDecoder as ZipfileDecoderModel, 479) 480from airbyte_cdk.sources.declarative.partition_routers import ( 481 CartesianProductStreamSlicer, 482 GroupingPartitionRouter, 483 ListPartitionRouter, 484 PartitionRouter, 485 SinglePartitionRouter, 486 SubstreamPartitionRouter, 487) 488from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import ( 489 AsyncJobPartitionRouter, 490) 491from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( 492 ParentStreamConfig, 493) 494from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption 495from airbyte_cdk.sources.declarative.requesters.error_handlers import ( 496 CompositeErrorHandler, 497 DefaultErrorHandler, 498 HttpResponseFilter, 499) 500from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import ( 501 ConstantBackoffStrategy, 502 ExponentialBackoffStrategy, 503 WaitTimeFromHeaderBackoffStrategy, 504 WaitUntilTimeFromHeaderBackoffStrategy, 505) 506from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository 507from airbyte_cdk.sources.declarative.requesters.paginators import ( 508 DefaultPaginator, 509 NoPagination, 510 PaginatorTestReadDecorator, 511) 512from airbyte_cdk.sources.declarative.requesters.paginators.strategies import ( 513 CursorPaginationStrategy, 514 CursorStopCondition, 515 OffsetIncrement, 516 PageIncrement, 517 StopConditionPaginationStrategyDecorator, 518) 
519from airbyte_cdk.sources.declarative.requesters.query_properties import ( 520 PropertiesFromEndpoint, 521 PropertyChunking, 522 QueryProperties, 523) 524from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import ( 525 PropertyLimitType, 526) 527from airbyte_cdk.sources.declarative.requesters.query_properties.property_selector import ( 528 JsonSchemaPropertySelector, 529) 530from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import ( 531 GroupByKey, 532) 533from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType 534from airbyte_cdk.sources.declarative.requesters.request_options import ( 535 DatetimeBasedRequestOptionsProvider, 536 DefaultRequestOptionsProvider, 537 InterpolatedRequestOptionsProvider, 538 RequestOptionsProvider, 539) 540from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import ( 541 PerPartitionRequestOptionsProvider, 542) 543from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath 544from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester 545from airbyte_cdk.sources.declarative.resolvers import ( 546 ComponentMappingDefinition, 547 ConfigComponentsResolver, 548 HttpComponentsResolver, 549 ParametrizedComponentsResolver, 550 StreamConfig, 551 StreamParametersDefinition, 552) 553from airbyte_cdk.sources.declarative.retrievers import ( 554 AsyncRetriever, 555 LazySimpleRetriever, 556 SimpleRetriever, 557) 558from airbyte_cdk.sources.declarative.retrievers.file_uploader import ( 559 ConnectorBuilderFileUploader, 560 DefaultFileUploader, 561 FileUploader, 562 LocalFileSystemFileWriter, 563 NoopFileWriter, 564) 565from airbyte_cdk.sources.declarative.retrievers.pagination_tracker import PaginationTracker 566from airbyte_cdk.sources.declarative.schema import ( 567 ComplexFieldType, 568 DefaultSchemaLoader, 569 DynamicSchemaLoader, 570 InlineSchemaLoader, 571 
JsonFileSchemaLoader, 572 SchemaLoader, 573 SchemaTypeIdentifier, 574 TypesMap, 575) 576from airbyte_cdk.sources.declarative.schema.caching_schema_loader_decorator import ( 577 CachingSchemaLoaderDecorator, 578) 579from airbyte_cdk.sources.declarative.schema.composite_schema_loader import CompositeSchemaLoader 580from airbyte_cdk.sources.declarative.spec import ConfigMigration, Spec 581from airbyte_cdk.sources.declarative.stream_slicers import ( 582 StreamSlicer, 583 StreamSlicerTestReadDecorator, 584) 585from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( 586 DeclarativePartitionFactory, 587 StreamSlicerPartitionGenerator, 588) 589from airbyte_cdk.sources.declarative.transformations import ( 590 AddFields, 591 RecordTransformation, 592 RemoveFields, 593) 594from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition 595from airbyte_cdk.sources.declarative.transformations.config_transformations import ( 596 ConfigAddFields, 597 ConfigRemapField, 598 ConfigRemoveFields, 599) 600from airbyte_cdk.sources.declarative.transformations.config_transformations.config_transformation import ( 601 ConfigTransformation, 602) 603from airbyte_cdk.sources.declarative.transformations.dpath_flatten_fields import ( 604 DpathFlattenFields, 605 KeyTransformation, 606) 607from airbyte_cdk.sources.declarative.transformations.flatten_fields import ( 608 FlattenFields, 609) 610from airbyte_cdk.sources.declarative.transformations.keys_replace_transformation import ( 611 KeysReplaceTransformation, 612) 613from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import ( 614 KeysToLowerTransformation, 615) 616from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import ( 617 KeysToSnakeCaseTransformation, 618) 619from airbyte_cdk.sources.declarative.validators import ( 620 DpathValidator, 621 PredicateValidator, 622 ValidateAdheresToSchema, 623) 624from 
airbyte_cdk.sources.http_logger import format_http_message 625from airbyte_cdk.sources.message import ( 626 InMemoryMessageRepository, 627 LogAppenderMessageRepositoryDecorator, 628 MessageRepository, 629 NoopMessageRepository, 630) 631from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository 632from airbyte_cdk.sources.streams.call_rate import ( 633 APIBudget, 634 FixedWindowCallRatePolicy, 635 HttpAPIBudget, 636 HttpRequestRegexMatcher, 637 MovingWindowCallRatePolicy, 638 Rate, 639 UnlimitedCallRatePolicy, 640) 641from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream 642from airbyte_cdk.sources.streams.concurrent.clamping import ( 643 ClampingEndProvider, 644 ClampingStrategy, 645 DayClampingStrategy, 646 MonthClampingStrategy, 647 NoClamping, 648 WeekClampingStrategy, 649 Weekday, 650) 651from airbyte_cdk.sources.streams.concurrent.cursor import ( 652 ConcurrentCursor, 653 Cursor, 654 CursorField, 655 FinalStateCursor, 656) 657from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream 658from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream 659from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import ( 660 StreamSlicer as ConcurrentStreamSlicer, 661) 662from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( 663 CustomFormatConcurrentStreamStateConverter, 664 DateTimeStreamStateConverter, 665) 666from airbyte_cdk.sources.streams.concurrent.state_converters.incrementing_count_stream_state_converter import ( 667 IncrementingCountStreamStateConverter, 668) 669from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction 670from airbyte_cdk.sources.types import Config 671from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer 672 673ComponentDefinition = Mapping[str, Any] 674 675SCHEMA_TRANSFORMER_TYPE_MAPPING = { 676 
SchemaNormalizationModel.None_: TransformConfig.NoTransform, 677 SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization, 678} 679_NO_STREAM_SLICING = SinglePartitionRouter(parameters={}) 680 681# Ideally this should use the value defined in ConcurrentDeclarativeSource, but 682# this would be a circular import 683MAX_SLICES = 5 684 685LOGGER = logging.getLogger(f"airbyte.model_to_component_factory") 686 687 688class ModelToComponentFactory: 689 EPOCH_DATETIME_FORMAT = "%s" 690 691 def __init__( 692 self, 693 limit_pages_fetched_per_slice: Optional[int] = None, 694 limit_slices_fetched: Optional[int] = None, 695 emit_connector_builder_messages: bool = False, 696 disable_retries: bool = False, 697 disable_cache: bool = False, 698 message_repository: Optional[MessageRepository] = None, 699 connector_state_manager: Optional[ConnectorStateManager] = None, 700 max_concurrent_async_job_count: Optional[int] = None, 701 configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, 702 api_budget: Optional[APIBudget] = None, 703 ): 704 self._init_mappings() 705 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 706 self._limit_slices_fetched = limit_slices_fetched 707 self._emit_connector_builder_messages = emit_connector_builder_messages 708 self._disable_retries = disable_retries 709 self._disable_cache = disable_cache 710 self._message_repository = message_repository or InMemoryMessageRepository( 711 self._evaluate_log_level(emit_connector_builder_messages) 712 ) 713 self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream( 714 configured_catalog 715 ) 716 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 717 self._api_budget: Optional[Union[APIBudget]] = api_budget 718 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 719 # placeholder for deprecation warnings 720 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 721 
722 def _init_mappings(self) -> None: 723 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 724 AddedFieldDefinitionModel: self.create_added_field_definition, 725 AddFieldsModel: self.create_add_fields, 726 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 727 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 728 BearerAuthenticatorModel: self.create_bearer_authenticator, 729 CheckStreamModel: self.create_check_stream, 730 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 731 CheckDynamicStreamModel: self.create_check_dynamic_stream, 732 CompositeErrorHandlerModel: self.create_composite_error_handler, 733 ConcurrencyLevelModel: self.create_concurrency_level, 734 ConfigMigrationModel: self.create_config_migration, 735 ConfigAddFieldsModel: self.create_config_add_fields, 736 ConfigRemapFieldModel: self.create_config_remap_field, 737 ConfigRemoveFieldsModel: self.create_config_remove_fields, 738 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 739 CsvDecoderModel: self.create_csv_decoder, 740 CursorPaginationModel: self.create_cursor_pagination, 741 CustomAuthenticatorModel: self.create_custom_component, 742 CustomBackoffStrategyModel: self.create_custom_component, 743 CustomDecoderModel: self.create_custom_component, 744 CustomErrorHandlerModel: self.create_custom_component, 745 CustomRecordExtractorModel: self.create_custom_component, 746 CustomRecordFilterModel: self.create_custom_component, 747 CustomRequesterModel: self.create_custom_component, 748 CustomRetrieverModel: self.create_custom_component, 749 CustomSchemaLoader: self.create_custom_component, 750 CustomSchemaNormalizationModel: self.create_custom_component, 751 CustomStateMigration: self.create_custom_component, 752 CustomPaginationStrategyModel: self.create_custom_component, 753 CustomPartitionRouterModel: self.create_custom_component, 754 CustomTransformationModel: self.create_custom_component, 755 
CustomValidationStrategyModel: self.create_custom_component, 756 CustomConfigTransformationModel: self.create_custom_component, 757 DeclarativeStreamModel: self.create_default_stream, 758 DefaultErrorHandlerModel: self.create_default_error_handler, 759 DefaultPaginatorModel: self.create_default_paginator, 760 DpathExtractorModel: self.create_dpath_extractor, 761 DpathValidatorModel: self.create_dpath_validator, 762 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 763 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 764 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 765 GroupByKeyMergeStrategyModel: self.create_group_by_key, 766 HttpRequesterModel: self.create_http_requester, 767 HttpResponseFilterModel: self.create_http_response_filter, 768 InlineSchemaLoaderModel: self.create_inline_schema_loader, 769 JsonDecoderModel: self.create_json_decoder, 770 JsonItemsDecoderModel: self.create_json_items_decoder, 771 JsonlDecoderModel: self.create_jsonl_decoder, 772 JsonSchemaPropertySelectorModel: self.create_json_schema_property_selector, 773 GzipDecoderModel: self.create_gzip_decoder, 774 KeysToLowerModel: self.create_keys_to_lower_transformation, 775 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 776 KeysReplaceModel: self.create_keys_replace_transformation, 777 FlattenFieldsModel: self.create_flatten_fields, 778 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 779 IterableDecoderModel: self.create_iterable_decoder, 780 XmlDecoderModel: self.create_xml_decoder, 781 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 782 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 783 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 784 TypesMapModel: self.create_types_map, 785 ComplexFieldTypeModel: self.create_complex_field_type, 786 JwtAuthenticatorModel: self.create_jwt_authenticator, 787 LegacyToPerPartitionStateMigrationModel: 
self.create_legacy_to_per_partition_state_migration, 788 ListPartitionRouterModel: self.create_list_partition_router, 789 MinMaxDatetimeModel: self.create_min_max_datetime, 790 NoAuthModel: self.create_no_auth, 791 NoPaginationModel: self.create_no_pagination, 792 OAuthAuthenticatorModel: self.create_oauth_authenticator, 793 OffsetIncrementModel: self.create_offset_increment, 794 PageIncrementModel: self.create_page_increment, 795 ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper, 796 PredicateValidatorModel: self.create_predicate_validator, 797 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 798 PropertyChunkingModel: self.create_property_chunking, 799 QueryPropertiesModel: self.create_query_properties, 800 RecordExpanderModel: self.create_record_expander, 801 RecordFilterModel: self.create_record_filter, 802 RecordSelectorModel: self.create_record_selector, 803 RemoveFieldsModel: self.create_remove_fields, 804 RequestPathModel: self.create_request_path, 805 RequestOptionModel: self.create_request_option, 806 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 807 SelectiveAuthenticatorModel: self.create_selective_authenticator, 808 SimpleRetrieverModel: self.create_simple_retriever, 809 StateDelegatingStreamModel: self.create_state_delegating_stream, 810 SpecModel: self.create_spec, 811 SubstreamPartitionRouterModel: self.create_substream_partition_router, 812 ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema, 813 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 814 WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, 815 AsyncRetrieverModel: self.create_async_retriever, 816 HttpComponentsResolverModel: self.create_http_components_resolver, 817 ConfigComponentsResolverModel: self.create_config_components_resolver, 818 ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver, 819 StreamConfigModel: 
self.create_stream_config, 820 ComponentMappingDefinitionModel: self.create_components_mapping_definition, 821 ZipfileDecoderModel: self.create_zipfile_decoder, 822 HTTPAPIBudgetModel: self.create_http_api_budget, 823 FileUploaderModel: self.create_file_uploader, 824 FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, 825 MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, 826 UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, 827 RateModel: self.create_rate, 828 HttpRequestRegexMatcherModel: self.create_http_request_matcher, 829 GroupingPartitionRouterModel: self.create_grouping_partition_router, 830 } 831 832 # Needed for the case where we need to perform a second parse on the fields of a custom component 833 self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR} 834 835 @staticmethod 836 def _create_stream_name_to_configured_stream( 837 configured_catalog: Optional[ConfiguredAirbyteCatalog], 838 ) -> Mapping[str, ConfiguredAirbyteStream]: 839 return ( 840 {stream.stream.name: stream for stream in configured_catalog.streams} 841 if configured_catalog 842 else {} 843 ) 844 845 def create_component( 846 self, 847 model_type: Type[BaseModel], 848 component_definition: ComponentDefinition, 849 config: Config, 850 **kwargs: Any, 851 ) -> Any: 852 """ 853 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 854 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 855 creating declarative components from that model. 
856 857 :param model_type: The type of declarative component that is being initialized 858 :param component_definition: The mapping that represents a declarative component 859 :param config: The connector config that is provided by the customer 860 :return: The declarative component to be used at runtime 861 """ 862 863 component_type = component_definition.get("type") 864 if component_definition.get("type") != model_type.__name__: 865 raise ValueError( 866 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 867 ) 868 869 declarative_component_model = model_type.parse_obj(component_definition) 870 871 if not isinstance(declarative_component_model, model_type): 872 raise ValueError( 873 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 874 ) 875 876 return self._create_component_from_model( 877 model=declarative_component_model, config=config, **kwargs 878 ) 879 880 def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: 881 if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: 882 raise ValueError( 883 f"{model.__class__} with attributes {model} is not a valid component type" 884 ) 885 component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) 886 if not component_constructor: 887 raise ValueError(f"Could not find constructor for {model.__class__}") 888 889 # collect deprecation warnings for supported models. 890 if isinstance(model, BaseModelWithDeprecations): 891 self._collect_model_deprecations(model) 892 893 return component_constructor(model=model, config=config, **kwargs) 894 895 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 896 """ 897 Returns the deprecation warnings that were collected during the creation of components. 
898 """ 899 return self._collected_deprecation_logs 900 901 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 902 """ 903 Collects deprecation logs from the given model and appends any new logs to the internal collection. 904 905 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 906 907 Args: 908 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 909 """ 910 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 911 for log in model._deprecation_logs: 912 # avoid duplicates for deprecation logs observed. 913 if log not in self._collected_deprecation_logs: 914 self._collected_deprecation_logs.append(log) 915 916 def create_config_migration( 917 self, model: ConfigMigrationModel, config: Config 918 ) -> ConfigMigration: 919 transformations: List[ConfigTransformation] = [ 920 self._create_component_from_model(transformation, config) 921 for transformation in model.transformations 922 ] 923 924 return ConfigMigration( 925 description=model.description, 926 transformations=transformations, 927 ) 928 929 def create_config_add_fields( 930 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 931 ) -> ConfigAddFields: 932 fields = [self._create_component_from_model(field, config) for field in model.fields] 933 return ConfigAddFields( 934 fields=fields, 935 condition=model.condition or "", 936 ) 937 938 @staticmethod 939 def create_config_remove_fields( 940 model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any 941 ) -> ConfigRemoveFields: 942 return ConfigRemoveFields( 943 field_pointers=model.field_pointers, 944 condition=model.condition or "", 
945 ) 946 947 @staticmethod 948 def create_config_remap_field( 949 model: ConfigRemapFieldModel, config: Config, **kwargs: Any 950 ) -> ConfigRemapField: 951 mapping = cast(Mapping[str, Any], model.map) 952 return ConfigRemapField( 953 map=mapping, 954 field_path=model.field_path, 955 config=config, 956 ) 957 958 def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator: 959 strategy = self._create_component_from_model(model.validation_strategy, config) 960 961 return DpathValidator( 962 field_path=model.field_path, 963 strategy=strategy, 964 ) 965 966 def create_predicate_validator( 967 self, model: PredicateValidatorModel, config: Config 968 ) -> PredicateValidator: 969 strategy = self._create_component_from_model(model.validation_strategy, config) 970 971 return PredicateValidator( 972 value=model.value, 973 strategy=strategy, 974 ) 975 976 @staticmethod 977 def create_validate_adheres_to_schema( 978 model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any 979 ) -> ValidateAdheresToSchema: 980 base_schema = cast(Mapping[str, Any], model.base_schema) 981 return ValidateAdheresToSchema( 982 schema=base_schema, 983 ) 984 985 @staticmethod 986 def create_added_field_definition( 987 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 988 ) -> AddedFieldDefinition: 989 interpolated_value = InterpolatedString.create( 990 model.value, parameters=model.parameters or {} 991 ) 992 return AddedFieldDefinition( 993 path=model.path, 994 value=interpolated_value, 995 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 996 parameters=model.parameters or {}, 997 ) 998 999 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 1000 added_field_definitions = [ 1001 self._create_component_from_model( 1002 model=added_field_definition_model, 1003 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 1004 
added_field_definition_model.value_type 1005 ), 1006 config=config, 1007 ) 1008 for added_field_definition_model in model.fields 1009 ] 1010 return AddFields( 1011 fields=added_field_definitions, 1012 condition=model.condition or "", 1013 parameters=model.parameters or {}, 1014 ) 1015 1016 def create_keys_to_lower_transformation( 1017 self, model: KeysToLowerModel, config: Config, **kwargs: Any 1018 ) -> KeysToLowerTransformation: 1019 return KeysToLowerTransformation() 1020 1021 def create_keys_to_snake_transformation( 1022 self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any 1023 ) -> KeysToSnakeCaseTransformation: 1024 return KeysToSnakeCaseTransformation() 1025 1026 def create_keys_replace_transformation( 1027 self, model: KeysReplaceModel, config: Config, **kwargs: Any 1028 ) -> KeysReplaceTransformation: 1029 return KeysReplaceTransformation( 1030 old=model.old, new=model.new, parameters=model.parameters or {} 1031 ) 1032 1033 def create_flatten_fields( 1034 self, model: FlattenFieldsModel, config: Config, **kwargs: Any 1035 ) -> FlattenFields: 1036 return FlattenFields( 1037 flatten_lists=model.flatten_lists if model.flatten_lists is not None else True 1038 ) 1039 1040 def create_dpath_flatten_fields( 1041 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 1042 ) -> DpathFlattenFields: 1043 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 1044 key_transformation = ( 1045 KeyTransformation( 1046 config=config, 1047 prefix=model.key_transformation.prefix, 1048 suffix=model.key_transformation.suffix, 1049 parameters=model.parameters or {}, 1050 ) 1051 if model.key_transformation is not None 1052 else None 1053 ) 1054 return DpathFlattenFields( 1055 config=config, 1056 field_path=model_field_path, 1057 delete_origin_value=model.delete_origin_value 1058 if model.delete_origin_value is not None 1059 else False, 1060 replace_record=model.replace_record if model.replace_record is not None else False, 
1061 key_transformation=key_transformation, 1062 parameters=model.parameters or {}, 1063 ) 1064 1065 @staticmethod 1066 def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]: 1067 if not value_type: 1068 return None 1069 names_to_types = { 1070 ValueType.string: str, 1071 ValueType.number: float, 1072 ValueType.integer: int, 1073 ValueType.boolean: bool, 1074 } 1075 return names_to_types[value_type] 1076 1077 def create_api_key_authenticator( 1078 self, 1079 model: ApiKeyAuthenticatorModel, 1080 config: Config, 1081 token_provider: Optional[TokenProvider] = None, 1082 **kwargs: Any, 1083 ) -> ApiKeyAuthenticator: 1084 if model.inject_into is None and model.header is None: 1085 raise ValueError( 1086 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1087 ) 1088 1089 if model.inject_into is not None and model.header is not None: 1090 raise ValueError( 1091 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1092 ) 1093 1094 if token_provider is not None and model.api_token != "": 1095 raise ValueError( 1096 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1097 ) 1098 1099 request_option = ( 1100 self._create_component_from_model( 1101 model.inject_into, config, parameters=model.parameters or {} 1102 ) 1103 if model.inject_into 1104 else RequestOption( 1105 inject_into=RequestOptionType.header, 1106 field_name=model.header or "", 1107 parameters=model.parameters or {}, 1108 ) 1109 ) 1110 1111 return ApiKeyAuthenticator( 1112 token_provider=( 1113 token_provider 1114 if token_provider is not None 1115 else InterpolatedStringTokenProvider( 1116 api_token=model.api_token or "", 1117 config=config, 1118 parameters=model.parameters or {}, 1119 ) 1120 ), 1121 request_option=request_option, 1122 config=config, 1123 parameters=model.parameters or {}, 1124 ) 1125 1126 def create_legacy_to_per_partition_state_migration( 1127 self, 1128 model: LegacyToPerPartitionStateMigrationModel, 1129 config: Mapping[str, Any], 1130 declarative_stream: DeclarativeStreamModel, 1131 ) -> LegacyToPerPartitionStateMigration: 1132 retriever = declarative_stream.retriever 1133 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1134 raise ValueError( 1135 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1136 ) 1137 partition_router = retriever.partition_router 1138 if not isinstance( 1139 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1140 ): 1141 raise ValueError( 1142 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1143 ) 1144 if not hasattr(partition_router, "parent_stream_configs"): 1145 raise ValueError( 1146 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1147 ) 1148 1149 if not hasattr(declarative_stream, "incremental_sync"): 1150 raise ValueError( 1151 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 
1152 ) 1153 1154 return LegacyToPerPartitionStateMigration( 1155 partition_router, # type: ignore # was already checked above 1156 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 1157 config, 1158 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1159 ) 1160 1161 def create_session_token_authenticator( 1162 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1163 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1164 decoder = ( 1165 self._create_component_from_model(model=model.decoder, config=config) 1166 if model.decoder 1167 else JsonDecoder(parameters={}) 1168 ) 1169 login_requester = self._create_component_from_model( 1170 model=model.login_requester, 1171 config=config, 1172 name=f"{name}_login_requester", 1173 decoder=decoder, 1174 ) 1175 token_provider = SessionTokenProvider( 1176 login_requester=login_requester, 1177 session_token_path=model.session_token_path, 1178 expiration_duration=parse_duration(model.expiration_duration) 1179 if model.expiration_duration 1180 else None, 1181 parameters=model.parameters or {}, 1182 message_repository=self._message_repository, 1183 decoder=decoder, 1184 ) 1185 if model.request_authentication.type == "Bearer": 1186 return ModelToComponentFactory.create_bearer_authenticator( 1187 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1188 config, 1189 token_provider=token_provider, 1190 ) 1191 else: 1192 # Get the api_token template if specified, default to just the session token 1193 api_token_template = ( 1194 getattr(model.request_authentication, "api_token", None) or "{{ session_token }}" 1195 ) 1196 final_token_provider: TokenProvider = InterpolatedSessionTokenProvider( 1197 config=config, 1198 api_token=api_token_template, 1199 session_token_provider=token_provider, 
1200 parameters=model.parameters or {}, 1201 ) 1202 return self.create_api_key_authenticator( 1203 ApiKeyAuthenticatorModel( 1204 type="ApiKeyAuthenticator", 1205 api_token="", 1206 inject_into=model.request_authentication.inject_into, 1207 ), # type: ignore # $parameters and headers default to None 1208 config=config, 1209 token_provider=final_token_provider, 1210 ) 1211 1212 @staticmethod 1213 def create_basic_http_authenticator( 1214 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1215 ) -> BasicHttpAuthenticator: 1216 return BasicHttpAuthenticator( 1217 password=model.password or "", 1218 username=model.username, 1219 config=config, 1220 parameters=model.parameters or {}, 1221 ) 1222 1223 @staticmethod 1224 def create_bearer_authenticator( 1225 model: BearerAuthenticatorModel, 1226 config: Config, 1227 token_provider: Optional[TokenProvider] = None, 1228 **kwargs: Any, 1229 ) -> BearerAuthenticator: 1230 if token_provider is not None and model.api_token != "": 1231 raise ValueError( 1232 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1233 ) 1234 return BearerAuthenticator( 1235 token_provider=( 1236 token_provider 1237 if token_provider is not None 1238 else InterpolatedStringTokenProvider( 1239 api_token=model.api_token or "", 1240 config=config, 1241 parameters=model.parameters or {}, 1242 ) 1243 ), 1244 config=config, 1245 parameters=model.parameters or {}, 1246 ) 1247 1248 @staticmethod 1249 def create_dynamic_stream_check_config( 1250 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1251 ) -> DynamicStreamCheckConfig: 1252 return DynamicStreamCheckConfig( 1253 dynamic_stream_name=model.dynamic_stream_name, 1254 stream_count=model.stream_count, 1255 ) 1256 1257 def create_check_stream( 1258 self, model: CheckStreamModel, config: Config, **kwargs: Any 1259 ) -> CheckStream: 1260 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1261 raise ValueError( 1262 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1263 ) 1264 1265 dynamic_streams_check_configs = ( 1266 [ 1267 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1268 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1269 ] 1270 if model.dynamic_streams_check_configs 1271 else [] 1272 ) 1273 1274 return CheckStream( 1275 stream_names=model.stream_names or [], 1276 dynamic_streams_check_configs=dynamic_streams_check_configs, 1277 parameters={}, 1278 ) 1279 1280 @staticmethod 1281 def create_check_dynamic_stream( 1282 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1283 ) -> CheckDynamicStream: 1284 assert model.use_check_availability is not None # for mypy 1285 1286 use_check_availability = model.use_check_availability 1287 1288 return CheckDynamicStream( 1289 stream_count=model.stream_count, 1290 use_check_availability=use_check_availability, 1291 parameters={}, 1292 ) 1293 1294 def create_composite_error_handler( 1295 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: 
Any 1296 ) -> CompositeErrorHandler: 1297 error_handlers = [ 1298 self._create_component_from_model(model=error_handler_model, config=config) 1299 for error_handler_model in model.error_handlers 1300 ] 1301 return CompositeErrorHandler( 1302 error_handlers=error_handlers, parameters=model.parameters or {} 1303 ) 1304 1305 @staticmethod 1306 def create_concurrency_level( 1307 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1308 ) -> ConcurrencyLevel: 1309 return ConcurrencyLevel( 1310 default_concurrency=model.default_concurrency, 1311 max_concurrency=model.max_concurrency, 1312 config=config, 1313 parameters={}, 1314 ) 1315 1316 @staticmethod 1317 def apply_stream_state_migrations( 1318 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1319 ) -> MutableMapping[str, Any]: 1320 if stream_state_migrations: 1321 for state_migration in stream_state_migrations: 1322 if state_migration.should_migrate(stream_state): 1323 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
1324 stream_state = dict(state_migration.migrate(stream_state)) 1325 return stream_state 1326 1327 def create_concurrent_cursor_from_datetime_based_cursor( 1328 self, 1329 model_type: Type[BaseModel], 1330 component_definition: ComponentDefinition, 1331 stream_name: str, 1332 stream_namespace: Optional[str], 1333 stream_state: MutableMapping[str, Any], 1334 config: Config, 1335 message_repository: Optional[MessageRepository] = None, 1336 runtime_lookback_window: Optional[datetime.timedelta] = None, 1337 **kwargs: Any, 1338 ) -> ConcurrentCursor: 1339 component_type = component_definition.get("type") 1340 if component_definition.get("type") != model_type.__name__: 1341 raise ValueError( 1342 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1343 ) 1344 1345 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1346 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1347 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1348 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 
1349 if "$parameters" not in component_definition and "parameters" in component_definition: 1350 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1351 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1352 1353 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1354 raise ValueError( 1355 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1356 ) 1357 1358 model_parameters = datetime_based_cursor_model.parameters or {} 1359 1360 cursor_field = self._get_catalog_defined_cursor_field( 1361 stream_name=stream_name, 1362 allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1363 or False, 1364 ) 1365 1366 if not cursor_field: 1367 interpolated_cursor_field = InterpolatedString.create( 1368 datetime_based_cursor_model.cursor_field, 1369 parameters=model_parameters, 1370 ) 1371 cursor_field = CursorField( 1372 cursor_field_key=interpolated_cursor_field.eval(config=config), 1373 supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1374 or False, 1375 ) 1376 1377 interpolated_partition_field_start = InterpolatedString.create( 1378 datetime_based_cursor_model.partition_field_start or "start_time", 1379 parameters=model_parameters, 1380 ) 1381 interpolated_partition_field_end = InterpolatedString.create( 1382 datetime_based_cursor_model.partition_field_end or "end_time", 1383 parameters=model_parameters, 1384 ) 1385 1386 slice_boundary_fields = ( 1387 interpolated_partition_field_start.eval(config=config), 1388 interpolated_partition_field_end.eval(config=config), 1389 ) 1390 1391 datetime_format = datetime_based_cursor_model.datetime_format 1392 1393 cursor_granularity = ( 1394 parse_duration(datetime_based_cursor_model.cursor_granularity) 1395 if datetime_based_cursor_model.cursor_granularity 1396 else None 1397 ) 1398 1399 
lookback_window = None 1400 interpolated_lookback_window = ( 1401 InterpolatedString.create( 1402 datetime_based_cursor_model.lookback_window, 1403 parameters=model_parameters, 1404 ) 1405 if datetime_based_cursor_model.lookback_window 1406 else None 1407 ) 1408 if interpolated_lookback_window: 1409 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1410 if evaluated_lookback_window: 1411 lookback_window = parse_duration(evaluated_lookback_window) 1412 1413 connector_state_converter: DateTimeStreamStateConverter 1414 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1415 datetime_format=datetime_format, 1416 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1417 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1418 cursor_granularity=cursor_granularity, 1419 ) 1420 1421 # Adjusts the stream state by applying the runtime lookback window. 1422 # This is used to ensure correct state handling in case of failed partitions. 
1423 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1424 if runtime_lookback_window and stream_state_value: 1425 new_stream_state = ( 1426 connector_state_converter.parse_timestamp(stream_state_value) 1427 - runtime_lookback_window 1428 ) 1429 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1430 new_stream_state 1431 ) 1432 1433 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1434 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1435 start_date_runtime_value = self.create_min_max_datetime( 1436 model=datetime_based_cursor_model.start_datetime, config=config 1437 ) 1438 else: 1439 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1440 1441 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1442 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1443 end_date_runtime_value = self.create_min_max_datetime( 1444 model=datetime_based_cursor_model.end_datetime, config=config 1445 ) 1446 else: 1447 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1448 1449 interpolated_start_date = MinMaxDatetime.create( 1450 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1451 parameters=datetime_based_cursor_model.parameters, 1452 ) 1453 interpolated_end_date = ( 1454 None 1455 if not end_date_runtime_value 1456 else MinMaxDatetime.create( 1457 end_date_runtime_value, datetime_based_cursor_model.parameters 1458 ) 1459 ) 1460 1461 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1462 if not interpolated_start_date.datetime_format: 1463 interpolated_start_date.datetime_format = datetime_format 1464 if interpolated_end_date and not interpolated_end_date.datetime_format: 1465 interpolated_end_date.datetime_format = datetime_format 1466 1467 start_date = interpolated_start_date.get_datetime(config=config) 1468 
end_date_provider = ( 1469 partial(interpolated_end_date.get_datetime, config) 1470 if interpolated_end_date 1471 else connector_state_converter.get_end_provider() 1472 ) 1473 1474 if ( 1475 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1476 ) or ( 1477 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1478 ): 1479 raise ValueError( 1480 f"If step is defined, cursor_granularity should be as well and vice-versa. " 1481 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1482 ) 1483 1484 # When step is not defined, default to a step size from the starting date to the present moment 1485 step_length = datetime.timedelta.max 1486 interpolated_step = ( 1487 InterpolatedString.create( 1488 datetime_based_cursor_model.step, 1489 parameters=model_parameters, 1490 ) 1491 if datetime_based_cursor_model.step 1492 else None 1493 ) 1494 if interpolated_step: 1495 evaluated_step = interpolated_step.eval(config) 1496 if evaluated_step: 1497 step_length = parse_duration(evaluated_step) 1498 1499 clamping_strategy: ClampingStrategy = NoClamping() 1500 if datetime_based_cursor_model.clamping: 1501 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1502 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1503 # object which we want to keep agnostic of being low-code 1504 target = InterpolatedString( 1505 string=datetime_based_cursor_model.clamping.target, 1506 parameters=model_parameters, 1507 ) 1508 evaluated_target = target.eval(config=config) 1509 match evaluated_target: 1510 case "DAY": 1511 clamping_strategy = DayClampingStrategy() 1512 end_date_provider = ClampingEndProvider( 1513 DayClampingStrategy(is_ceiling=False), 1514 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown 
in existing tests. Confirmed functionality is working in practice 1515 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1516 ) 1517 case "WEEK": 1518 if ( 1519 not datetime_based_cursor_model.clamping.target_details 1520 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1521 ): 1522 raise ValueError( 1523 "Given WEEK clamping, weekday needs to be provided as target_details" 1524 ) 1525 weekday = self._assemble_weekday( 1526 datetime_based_cursor_model.clamping.target_details["weekday"] 1527 ) 1528 clamping_strategy = WeekClampingStrategy(weekday) 1529 end_date_provider = ClampingEndProvider( 1530 WeekClampingStrategy(weekday, is_ceiling=False), 1531 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1532 granularity=cursor_granularity or datetime.timedelta(days=1), 1533 ) 1534 case "MONTH": 1535 clamping_strategy = MonthClampingStrategy() 1536 end_date_provider = ClampingEndProvider( 1537 MonthClampingStrategy(is_ceiling=False), 1538 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
def create_concurrent_cursor_from_incrementing_count_cursor(
    self,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    stream_name: str,
    stream_namespace: Optional[str],
    stream_state: MutableMapping[str, Any],
    config: Config,
    message_repository: Optional[MessageRepository] = None,
    **kwargs: Any,
) -> ConcurrentCursor:
    """
    Build a ConcurrentCursor from an IncrementingCountCursor component definition.

    :param model_type: The Pydantic model class used to parse the definition
    :param component_definition: The raw component definition; its "type" must equal model_type.__name__
    :param stream_name: Name of the stream the cursor belongs to
    :param stream_namespace: Namespace of the stream, if any
    :param stream_state: The state handed to the cursor
    :param config: The connector config used to evaluate interpolated values
    :param message_repository: Repository to use instead of the factory's own one
    :return: The cursor, starting at the evaluated start value (0 when none is declared)
    :raises ValueError: If the declared type does not match, or the parsed model is not an IncrementingCountCursorModel
    """
    component_type = component_definition.get("type")
    if component_type != model_type.__name__:
        raise ValueError(
            f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
        )

    cursor_model = model_type.parse_obj(component_definition)
    if not isinstance(cursor_model, IncrementingCountCursorModel):
        raise ValueError(
            f"Expected {model_type.__name__} component, but received {cursor_model.__class__.__name__}"
        )

    # Pydantic Union coercion may hand back either an int or its string form (e.g. 0 vs '0'), so the
    # value is always routed through InterpolatedString as a string and converted to int afterwards.
    raw_start_value: Union[int, str, None] = cursor_model.start_value
    evaluated_start_value: int
    if raw_start_value is None:
        evaluated_start_value = 0
    else:
        start_value_template = InterpolatedString.create(
            str(raw_start_value),
            parameters=cursor_model.parameters or {},
        )
        evaluated_start_value = int(start_value_template.eval(config=config))

    allow_catalog_cursor_field = cursor_model.allow_catalog_defined_cursor_field or False
    cursor_field = self._get_catalog_defined_cursor_field(
        stream_name=stream_name,
        allow_catalog_defined_cursor_field=allow_catalog_cursor_field,
    )
    if not cursor_field:
        # Nothing usable came from the catalog: fall back to the field declared on the component
        cursor_field_template = InterpolatedString.create(
            cursor_model.cursor_field,
            parameters=cursor_model.parameters or {},
        )
        cursor_field = CursorField(
            cursor_field_key=cursor_field_template.eval(config=config),
            supports_catalog_defined_cursor_field=allow_catalog_cursor_field,
        )

    state_converter = IncrementingCountStreamStateConverter(
        is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
    )

    return ConcurrentCursor(
        stream_name=stream_name,
        stream_namespace=stream_namespace,
        stream_state=stream_state,
        message_repository=message_repository or self._message_repository,
        connector_state_manager=self._connector_state_manager,
        connector_state_converter=state_converter,
        cursor_field=cursor_field,
        slice_boundary_fields=None,
        start=evaluated_start_value,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        end_provider=state_converter.get_end_provider(),  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
    )


def _assemble_weekday(self, weekday: str) -> Weekday:
    """
    Translate an upper-case weekday name into the matching Weekday member.

    :param weekday: One of "MONDAY" ... "SUNDAY"
    :return: The corresponding Weekday member
    :raises ValueError: If the value is not one of the seven known names
    """
    known_weekdays = (
        ("MONDAY", Weekday.MONDAY),
        ("TUESDAY", Weekday.TUESDAY),
        ("WEDNESDAY", Weekday.WEDNESDAY),
        ("THURSDAY", Weekday.THURSDAY),
        ("FRIDAY", Weekday.FRIDAY),
        ("SATURDAY", Weekday.SATURDAY),
        ("SUNDAY", Weekday.SUNDAY),
    )
    # Compared with == (not hashed) so that unhashable input still ends in the ValueError below
    for weekday_name, weekday_member in known_weekdays:
        if weekday == weekday_name:
            return weekday_member
    raise ValueError(f"Unknown weekday {weekday}")
This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1673 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1674 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1675 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1676 if "$parameters" not in component_definition and "parameters" in component_definition: 1677 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1678 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1679 1680 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1681 raise ValueError( 1682 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1683 ) 1684 1685 cursor_field = self._get_catalog_defined_cursor_field( 1686 stream_name=stream_name, 1687 allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1688 or False, 1689 ) 1690 1691 if not cursor_field: 1692 interpolated_cursor_field = InterpolatedString.create( 1693 datetime_based_cursor_model.cursor_field, 1694 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). 
So now we have two cases: 1695 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1696 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1697 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1698 parameters=datetime_based_cursor_model.parameters or {}, 1699 ) 1700 cursor_field = CursorField( 1701 cursor_field_key=interpolated_cursor_field.eval(config=config), 1702 supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1703 or False, 1704 ) 1705 1706 datetime_format = datetime_based_cursor_model.datetime_format 1707 1708 cursor_granularity = ( 1709 parse_duration(datetime_based_cursor_model.cursor_granularity) 1710 if datetime_based_cursor_model.cursor_granularity 1711 else None 1712 ) 1713 1714 connector_state_converter: DateTimeStreamStateConverter 1715 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1716 datetime_format=datetime_format, 1717 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1718 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1719 cursor_granularity=cursor_granularity, 1720 ) 1721 1722 # Create the cursor factory 1723 cursor_factory = ConcurrentCursorFactory( 1724 partial( 1725 self.create_concurrent_cursor_from_datetime_based_cursor, 1726 state_manager=state_manager, 1727 model_type=model_type, 1728 component_definition=component_definition, 1729 stream_name=stream_name, 1730 stream_namespace=stream_namespace, 1731 config=config, 1732 message_repository=NoopMessageRepository(), 1733 ) 1734 ) 1735 1736 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1737 use_global_cursor = isinstance( 1738 partition_router, GroupingPartitionRouter 1739 ) or 
@staticmethod
def create_constant_backoff_strategy(
    model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
) -> ConstantBackoffStrategy:
    """
    Build a ConstantBackoffStrategy from its model once the jitter range has been validated.

    :param model: The Pydantic model of the constant backoff strategy
    :param config: The connector config handed to the strategy
    :return: The backoff strategy to be used at runtime
    :raises ValueError: If the model's jitter range is negative
    """
    jitter_range = model.jitter_range_in_seconds
    ModelToComponentFactory._validate_jitter_range(jitter_range)
    strategy_parameters = model.parameters or {}
    return ConstantBackoffStrategy(
        backoff_time_in_seconds=model.backoff_time_in_seconds,
        jitter_range_in_seconds=jitter_range,
        config=config,
        parameters=strategy_parameters,
    )


@staticmethod
def _validate_jitter_range(jitter_range_in_seconds: Optional[float]) -> None:
    """
    Reject negative jitter ranges; a missing (None) range is accepted as-is.

    :param jitter_range_in_seconds: The jitter range to check, or None when not configured
    :raises ValueError: If the range is below zero
    """
    if jitter_range_in_seconds is None:
        return
    if jitter_range_in_seconds < 0:
        raise ValueError("jitter_range_in_seconds must be greater than or equal to 0")
def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any:
    """
    Generically creates a custom component from its model and the class_name reference to the custom Python class.
    Only the entries of the model (plus config and kwargs) that match a type-hinted field of the custom class are
    passed to its constructor.

    :param model: The Pydantic model of the custom component being created
    :param config: The custom defined connector config
    :param kwargs: Arguments passed down by a parent component; on a name collision they win over the model's values
    :return: The declarative component built from the Pydantic model to be used at runtime
    """
    custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name)
    component_fields = get_type_hints(custom_component_class)

    model_args = model.dict()
    model_args["config"] = config
    # Parent-provided arguments take precedence over the component's own definition
    model_args.update(kwargs)

    def add_missing_type(field_name: str, candidate: Any) -> None:
        # A dict without "type" may still describe a subcomponent: try the class' type hint for that field
        if (
            not isinstance(candidate, dict)
            or "type" in candidate
            or field_name not in component_fields
        ):
            return
        hinted_type = self._derive_component_type_from_type_hints(
            component_fields.get(field_name)
        )
        if hinted_type:
            candidate["type"] = hinted_type

    def build_nested(field_name: str, candidate: Any) -> Any:
        return self._create_nested_component(
            model,
            field_name,
            candidate,
            config,
            **kwargs,
        )

    # Pydantic cannot parse a custom component's subcomponents because their fields and types only exist in the
    # Python class. This second pass turns such sub-fields into models and then into declarative components.
    # Only existing keys are reassigned, so the dict is not resized while it is being iterated.
    for field_name, field_value in model_args.items():
        add_missing_type(field_name, field_value)

        if self._is_component(field_value):
            model_args[field_name] = build_nested(field_name, field_value)
        elif isinstance(field_value, list):
            resolved_items = []
            for item in field_value:
                add_missing_type(field_name, item)
                resolved_items.append(
                    build_nested(field_name, item) if self._is_component(item) else item
                )
            model_args[field_name] = resolved_items

    constructor_args = {
        class_field: model_args[class_field]
        for class_field in component_fields
        if class_field in model_args
    }

    if "api_budget" in component_fields and constructor_args.get("api_budget") is None:
        constructor_args["api_budget"] = self._api_budget

    return custom_component_class(**constructor_args)
@staticmethod
def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]:
    """
    Derive a component type name from a type hint by unwrapping generic aliases.

    Examples of the unwrapping: List[T] -> T, Optional[List[T]] -> T, Union[None, T] -> T.

    :param field_type: The type hint of a custom component field (may be None)
    :return: The name of the innermost class when it is a non-builtin class, otherwise None.
        None is also returned for a bare generic alias such as typing.List, which carries no type argument.
    """
    interface = field_type
    # Unnest types until we reach the raw type
    while get_origin(interface):
        # NoneType only expresses optionality; it is never the component type we are looking for
        type_arguments = [arg for arg in get_args(interface) if arg is not type(None)]
        if not type_arguments:
            # typing.List / typing.Dict without parameters have an origin but no arguments.
            # Indexing the empty tuple used to raise IndexError here.
            return None
        interface = type_arguments[0]
    if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface):
        return interface.__name__
    return None


@staticmethod
def is_builtin_type(cls: Optional[Type[Any]]) -> bool:
    """
    Tell whether the given class is defined in the `builtins` module.

    :param cls: The class to inspect; a falsy value (e.g. None) yields False
    :return: True when cls.__module__ is "builtins"
    """
    if not cls:
        return False
    return cls.__module__ == "builtins"


@staticmethod
def _extract_missing_parameters(error: TypeError) -> List[str]:
    """
    Pull the quoted argument names out of a "missing ... keyword-only argument(s): 'a' and 'b'" error text.

    :param error: The TypeError whose message is inspected
    :return: The quoted names found after the "keyword-only ...:" marker, or an empty list if the marker is absent
    """
    parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error))
    if not parameter_search:
        return []
    return re.findall(r"\'(.+?)\'", parameter_search.group(1))
1975 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1976 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1977 model_parameters = model_value.get("$parameters", {}) 1978 matching_parameters = { 1979 kwarg: model_parameters[kwarg] 1980 for kwarg in constructor_kwargs 1981 if kwarg in model_parameters 1982 } 1983 matching_kwargs = { 1984 kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs 1985 } 1986 return self._create_component_from_model( 1987 model=parsed_model, config=config, **(matching_parameters | matching_kwargs) 1988 ) 1989 except TypeError as error: 1990 missing_parameters = self._extract_missing_parameters(error) 1991 if missing_parameters: 1992 raise ValueError( 1993 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1994 + ", ".join( 1995 ( 1996 f"{type_name}.$parameters.{parameter}" 1997 for parameter in missing_parameters 1998 ) 1999 ) 2000 ) 2001 raise TypeError( 2002 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 2003 ) 2004 else: 2005 raise ValueError( 2006 f"Error creating custom component {model.class_name}. 
@staticmethod
def _is_component(model_value: Any) -> bool:
    """
    Tell whether a value looks like a component definition.

    :param model_value: Any value taken from a model's arguments
    :return: True only for a dict whose "type" entry is present and not None
    """
    if not isinstance(model_value, dict):
        return False
    declared_type = model_value.get("type")
    return declared_type is not None
model.incremental_sync and isinstance( 2060 model.incremental_sync, IncrementingCountCursorModel 2061 ): 2062 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 2063 raise ValueError( 2064 "PerPartition does not support per partition states because switching to global state is time based" 2065 ) 2066 2067 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 2068 2069 start_time_option = ( 2070 self._create_component_from_model( 2071 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2072 config, 2073 parameters=cursor_model.parameters or {}, 2074 ) 2075 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2076 else None 2077 ) 2078 2079 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2080 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2081 partition_field_start = "start" 2082 2083 request_options_provider = DatetimeBasedRequestOptionsProvider( 2084 start_time_option=start_time_option, 2085 partition_field_start=partition_field_start, 2086 config=config, 2087 parameters=model.parameters or {}, 2088 ) 2089 else: 2090 request_options_provider = None 2091 2092 transformations = [] 2093 if model.transformations: 2094 for transformation_model in model.transformations: 2095 transformations.append( 2096 self._create_component_from_model(model=transformation_model, config=config) 2097 ) 2098 file_uploader = None 2099 if model.file_uploader: 2100 file_uploader = self._create_component_from_model( 2101 model=model.file_uploader, config=config 2102 ) 2103 2104 stream_slicer: ConcurrentStreamSlicer = ( 2105 partition_router 2106 if isinstance(concurrent_cursor, FinalStateCursor) 2107 else concurrent_cursor 2108 ) 2109 2110 retriever = self._create_component_from_model( 2111 model=model.retriever, 2112 config=config, 2113 name=model.name, 
2114 primary_key=primary_key, 2115 request_options_provider=request_options_provider, 2116 stream_slicer=stream_slicer, 2117 partition_router=partition_router, 2118 has_stop_condition_cursor=self._is_stop_condition_on_cursor(model), 2119 is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model), 2120 cursor=concurrent_cursor, 2121 transformations=transformations, 2122 file_uploader=file_uploader, 2123 incremental_sync=model.incremental_sync, 2124 ) 2125 if isinstance(retriever, AsyncRetriever): 2126 stream_slicer = retriever.stream_slicer 2127 2128 schema_loader: SchemaLoader 2129 if model.schema_loader and isinstance(model.schema_loader, list): 2130 nested_schema_loaders = [ 2131 self._create_component_from_model(model=nested_schema_loader, config=config) 2132 for nested_schema_loader in model.schema_loader 2133 ] 2134 schema_loader = CompositeSchemaLoader( 2135 schema_loaders=nested_schema_loaders, parameters={} 2136 ) 2137 elif model.schema_loader: 2138 schema_loader = self._create_component_from_model( 2139 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2140 config=config, 2141 ) 2142 else: 2143 options = model.parameters or {} 2144 if "name" not in options: 2145 options["name"] = model.name 2146 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2147 schema_loader = CachingSchemaLoaderDecorator(schema_loader) 2148 2149 stream_name = model.name or "" 2150 return DefaultStream( 2151 partition_generator=StreamSlicerPartitionGenerator( 2152 DeclarativePartitionFactory( 2153 stream_name, 2154 schema_loader, 2155 retriever, 2156 self._message_repository, 2157 ), 2158 stream_slicer, 2159 slice_limit=self._limit_slices_fetched, 2160 ), 2161 name=stream_name, 2162 json_schema=schema_loader.get_json_schema, 2163 primary_key=get_primary_key_from_stream(primary_key), 2164 cursor_field=( 2165 concurrent_cursor.cursor_field 2166 if 
def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None:
    """
    Run the stream's declared state migrations over its current state and store the result.

    The state is written back to the connector state manager even when no migration is declared.

    :param model: The stream model whose name and state_migrations are used
    :param config: The connector config used to build the migration components
    """
    stream_name = model.name or ""
    current_state = self._connector_state_manager.get_stream_state(
        stream_name=stream_name, namespace=None
    )
    migrations = [
        self._create_component_from_model(migration_model, config, declarative_stream=model)
        for migration_model in (model.state_migrations or [])
    ]
    migrated_state = self.apply_stream_state_migrations(migrations, current_state)
    self._connector_state_manager.update_state_for_stream(
        stream_name=stream_name, namespace=None, value=migrated_state
    )


def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool:
    """
    Tell whether the stream's incremental sync is flagged as a data feed.

    :param model: The stream model to inspect
    :return: True when incremental_sync is set and has a truthy `is_data_feed` attribute
    """
    incremental_sync = model.incremental_sync
    if not incremental_sync:
        return False
    # Not every incremental sync model declares the flag, hence the defaulted lookup
    return bool(getattr(incremental_sync, "is_data_feed", False))


def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool:
    """
    Tell whether the stream's incremental sync asks for client-side incremental filtering.

    :param model: The stream model to inspect
    :return: True when incremental_sync is set and has a truthy `is_client_side_incremental` attribute
    """
    incremental_sync = model.incremental_sync
    if not incremental_sync:
        return False
    # Not every incremental sync model declares the flag, hence the defaulted lookup
    return bool(getattr(incremental_sync, "is_client_side_incremental", False))
def _build_concurrent_cursor(
    self,
    model: DeclarativeStreamModel,
    stream_slicer: Optional[PartitionRouter],
    config: Config,
) -> Cursor:
    """
    Pick and build the concurrent cursor matching the stream's incremental sync and partition routing.

    :param model: The stream model; its name and incremental_sync drive the choice
    :param stream_slicer: The stream's partition router, if any
    :param config: The connector config (an empty dict is passed on when falsy)
    :return: A FinalStateCursor when there is no incremental sync, a per-partition cursor when a partition
        router other than SinglePartitionRouter is in use, otherwise the cursor for the sync's exact model type
    :raises ValueError: For an IncrementingCountCursor combined with a partition router, or for an incremental
        sync whose exact type is neither IncrementingCountCursorModel nor DatetimeBasedCursorModel
    """
    stream_name = model.name or ""
    stream_state = self._connector_state_manager.get_stream_state(stream_name, None)

    incremental_sync = model.incremental_sync
    if not incremental_sync:
        return FinalStateCursor(stream_name, None, self._message_repository)

    if stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter):
        if isinstance(incremental_sync, IncrementingCountCursorModel):
            # We don't currently support usage of partition routing and IncrementingCountCursor at the
            # same time because we didn't solve for design questions like what the lookback window would
            # be as well as global cursor fall backs. We have not seen customers that have needed both
            # at the same time yet and are currently punting on this until we need to solve it.
            raise ValueError(
                f"The low-code framework does not currently support usage of a PartitionRouter and an IncrementingCountCursor at the same time. Please specify only one of these options for stream {stream_name}."
            )
        return self.create_concurrent_cursor_from_perpartition_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
            state_manager=self._connector_state_manager,
            model_type=DatetimeBasedCursorModel,
            component_definition=incremental_sync.__dict__,
            stream_name=stream_name,
            stream_state=stream_state,
            stream_namespace=None,
            config=config or {},
            partition_router=stream_slicer,
            attempt_to_create_cursor_if_not_provided=True,  # FIXME can we remove that now?
        )

    # Dispatch on the exact model class (subclasses are deliberately not matched here)
    sync_type = type(incremental_sync)
    if sync_type is IncrementingCountCursorModel:
        return self.create_concurrent_cursor_from_incrementing_count_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
            model_type=IncrementingCountCursorModel,
            component_definition=incremental_sync.__dict__,
            stream_name=stream_name,
            stream_namespace=None,
            stream_state=stream_state,
            config=config or {},
        )
    if sync_type is DatetimeBasedCursorModel:
        return self.create_concurrent_cursor_from_datetime_based_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
            model_type=sync_type,
            component_definition=incremental_sync.__dict__,
            stream_name=stream_name,
            stream_namespace=None,
            stream_state=stream_state,
            config=config or {},
            attempt_to_create_cursor_if_not_provided=True,
        )
    raise ValueError(f"Incremental sync of type {sync_type} is not supported")
HttpResponseFilter(config=config, parameters=model.parameters or {}) 2332 ) 2333 2334 return DefaultErrorHandler( 2335 backoff_strategies=backoff_strategies, 2336 max_retries=model.max_retries, 2337 response_filters=response_filters, 2338 config=config, 2339 parameters=model.parameters or {}, 2340 ) 2341 2342 def create_default_paginator( 2343 self, 2344 model: DefaultPaginatorModel, 2345 config: Config, 2346 *, 2347 url_base: str, 2348 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2349 decoder: Optional[Decoder] = None, 2350 cursor_used_for_stop_condition: Optional[Cursor] = None, 2351 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2352 if decoder: 2353 if self._is_supported_decoder_for_pagination(decoder): 2354 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2355 else: 2356 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2357 else: 2358 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2359 page_size_option = ( 2360 self._create_component_from_model(model=model.page_size_option, config=config) 2361 if model.page_size_option 2362 else None 2363 ) 2364 page_token_option = ( 2365 self._create_component_from_model(model=model.page_token_option, config=config) 2366 if model.page_token_option 2367 else None 2368 ) 2369 pagination_strategy = self._create_component_from_model( 2370 model=model.pagination_strategy, 2371 config=config, 2372 decoder=decoder_to_use, 2373 extractor_model=extractor_model, 2374 ) 2375 if cursor_used_for_stop_condition: 2376 pagination_strategy = StopConditionPaginationStrategyDecorator( 2377 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2378 ) 2379 paginator = DefaultPaginator( 2380 decoder=decoder_to_use, 2381 page_size_option=page_size_option, 2382 page_token_option=page_token_option, 2383 pagination_strategy=pagination_strategy, 2384 url_base=url_base, 2385 config=config, 2386 
def create_dpath_extractor(
    self,
    model: DpathExtractorModel,
    config: Config,
    decoder: Optional[Decoder] = None,
    **kwargs: Any,
) -> DpathExtractor:
    """Build a ``DpathExtractor`` from its declarative model.

    Args:
        model: Declarative description of the extractor.
        config: Connector configuration.
        decoder: Decoder to use; a ``JsonDecoder`` is created when none (or a falsy
            one) is supplied.
        **kwargs: Accepted for call-site compatibility; not used here.

    Returns:
        The configured ``DpathExtractor``.
    """
    effective_decoder = decoder if decoder else JsonDecoder(parameters={})
    field_path: List[Union[InterpolatedString, str]] = list(model.field_path)

    # The record expander is optional on the model.
    expander = (
        self._create_component_from_model(model=model.record_expander, config=config)
        if model.record_expander
        else None
    )

    return DpathExtractor(
        decoder=effective_decoder,
        field_path=field_path,
        config=config,
        parameters=model.parameters or {},
        record_expander=expander,
    )


def create_record_expander(
    self,
    model: RecordExpanderModel,
    config: Config,
    **kwargs: Any,
) -> RecordExpander:
    """Build a ``RecordExpander`` from its declarative model.

    ``remain_original_record`` falls back to ``False`` and ``on_no_records`` falls
    back to ``OnNoRecords.skip`` when the model leaves them unset.
    """
    if model.on_no_records:
        no_records_policy = OnNoRecords(model.on_no_records.value)
    else:
        no_records_policy = OnNoRecords.skip

    return RecordExpander(
        expand_records_from_field=model.expand_records_from_field,
        config=config,
        parameters=model.parameters or {},
        remain_original_record=model.remain_original_record or False,
        on_no_records=no_records_policy,
    )


@staticmethod
def create_response_to_file_extractor(
    model: ResponseToFileExtractorModel,
    **kwargs: Any,
) -> ResponseToFileExtractor:
    """Build a ``ResponseToFileExtractor``; only the model's parameters are used."""
    extractor_parameters = model.parameters or {}
    return ResponseToFileExtractor(parameters=extractor_parameters)
@staticmethod
def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey:
    """Build a ``GroupByKey`` merge strategy from the model's key and parameters."""
    return GroupByKey(model.key, config=config, parameters=model.parameters or {})


def create_http_requester(
    self,
    model: HttpRequesterModel,
    config: Config,
    decoder: Optional[Decoder] = None,
    query_properties_key: Optional[str] = None,
    use_cache: Optional[bool] = None,
    *,
    name: str,
) -> HttpRequester:
    """Build an ``HttpRequester`` from its declarative model.

    Args:
        model: Declarative description of the requester.
        config: Connector configuration.
        decoder: Response decoder. When omitted (``None``) a new ``JsonDecoder`` is
            created for this call, instead of one instance created at definition
            time being shared by every requester.
        query_properties_key: Forwarded to the request options provider.
        use_cache: Caller-side request to enable caching; combined with
            ``model.use_cache``.
        name: Name given to the requester and forwarded to the authenticator.

    Returns:
        The configured ``HttpRequester``.
    """
    if decoder is None:
        decoder = JsonDecoder(parameters={})

    authenticator = (
        self._create_component_from_model(
            model=model.authenticator,
            config=config,
            url_base=model.url or model.url_base,
            name=name,
            decoder=decoder,
        )
        if model.authenticator
        else None
    )

    # Without a declared error handler, use a default one with no strategies or filters.
    error_handler = (
        self._create_component_from_model(model=model.error_handler, config=config)
        if model.error_handler
        else DefaultErrorHandler(
            backoff_strategies=[],
            response_filters=[],
            config=config,
            parameters=model.parameters or {},
        )
    )

    api_budget = self._api_budget

    request_options_provider = InterpolatedRequestOptionsProvider(
        request_body=model.request_body,
        request_body_data=model.request_body_data,
        request_body_json=model.request_body_json,
        request_headers=model.request_headers,
        request_parameters=model.request_parameters,  # type: ignore  # QueryProperties have been removed in `create_simple_retriever`
        query_properties_key=query_properties_key,
        config=config,
        parameters=model.parameters or {},
    )

    assert model.use_cache is not None  # for mypy
    assert model.http_method is not None  # for mypy

    # Caching is on when either the model or the caller asks for it, unless the factory
    # was configured with caching disabled.
    should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache

    return HttpRequester(
        name=name,
        url=model.url,
        url_base=model.url_base,
        path=model.path,
        authenticator=authenticator,
        error_handler=error_handler,
        api_budget=api_budget,
        http_method=HttpMethod[model.http_method.value],
        request_options_provider=request_options_provider,
        config=config,
        disable_retries=self._disable_retries,
        parameters=model.parameters or {},
        message_repository=self._message_repository,
        use_cache=should_use_cache,
        decoder=decoder,
        stream_response=decoder.is_stream_response() if decoder else False,
    )
@staticmethod
def create_inline_schema_loader(
    model: InlineSchemaLoaderModel, config: Config, **kwargs: Any
) -> InlineSchemaLoader:
    """Build an ``InlineSchemaLoader``; a missing schema becomes an empty mapping."""
    inline_schema = model.schema_ or {}
    return InlineSchemaLoader(schema=inline_schema, parameters={})


def create_complex_field_type(
    self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any
) -> ComplexFieldType:
    """Build a ``ComplexFieldType``, converting nested complex ``items`` recursively."""
    if isinstance(model.items, ComplexFieldTypeModel):
        item_type = self._create_component_from_model(model=model.items, config=config)
    else:
        # Anything that is not a nested complex type is passed through untouched.
        item_type = model.items

    return ComplexFieldType(field_type=model.field_type, items=item_type)


def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap:
    """Build a ``TypesMap``; the condition defaults to the string ``"True"``."""
    if isinstance(model.target_type, ComplexFieldTypeModel):
        resolved_target = self._create_component_from_model(
            model=model.target_type, config=config
        )
    else:
        resolved_target = model.target_type

    condition = "True" if model.condition is None else model.condition

    return TypesMap(
        target_type=resolved_target,
        current_type=model.current_type,
        condition=condition,
    )


def create_schema_type_identifier(
    self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any
) -> SchemaTypeIdentifier:
    """Build a ``SchemaTypeIdentifier`` from its declarative model.

    ``schema_pointer`` defaults to an empty list and ``type_pointer`` to ``None``
    when unset or empty on the model; ``key_pointer`` is always copied.
    """
    mappings = [
        self._create_component_from_model(types_map, config=config)
        for types_map in (model.types_mapping or [])
    ]

    schema_pointer: List[Union[InterpolatedString, str]] = (
        list(model.schema_pointer) if model.schema_pointer else []
    )
    key_pointer: List[Union[InterpolatedString, str]] = list(model.key_pointer)
    type_pointer: Optional[List[Union[InterpolatedString, str]]] = (
        list(model.type_pointer) if model.type_pointer else None
    )

    return SchemaTypeIdentifier(
        schema_pointer=schema_pointer,
        key_pointer=key_pointer,
        type_pointer=type_pointer,
        types_mapping=mappings,
        parameters=model.parameters or {},
    )
@staticmethod
def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder:
    """Build a ``JsonDecoder`` with empty parameters."""
    return JsonDecoder(parameters={})


def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder:
    """Build a CSV ``CompositeRawDecoder``; streaming is off when emitting builder messages."""
    csv_parser = ModelToComponentFactory._get_parser(model, config)
    return CompositeRawDecoder(
        parser=csv_parser,
        stream_response=not self._emit_connector_builder_messages,
    )


def create_jsonl_decoder(
    self, model: JsonlDecoderModel, config: Config, **kwargs: Any
) -> Decoder:
    """Build a JSON-lines ``CompositeRawDecoder``; streaming is off when emitting builder messages."""
    jsonl_parser = ModelToComponentFactory._get_parser(model, config)
    return CompositeRawDecoder(
        parser=jsonl_parser,
        stream_response=not self._emit_connector_builder_messages,
    )


def create_json_items_decoder(
    self, model: JsonItemsDecoderModel, config: Config, **kwargs: Any
) -> Decoder:
    """Build a JSON-items ``CompositeRawDecoder``; streaming is off when emitting builder messages."""
    items_parser = ModelToComponentFactory._get_parser(model, config)
    return CompositeRawDecoder(
        parser=items_parser,
        stream_response=not self._emit_connector_builder_messages,
    )


def create_gzip_decoder(
    self, model: GzipDecoderModel, config: Config, **kwargs: Any
) -> Decoder:
    """Build the decoder for (possibly) gzip-compressed responses.

    When builder messages are emitted, only the inner parser is used on a
    non-streamed response. Otherwise the decoder selects the gzip parser from the
    ``Content-Encoding`` / ``Content-Type`` header values listed below and falls
    back to the inner parser.
    """
    compressed_header_values = {
        "gzip",
        "x-gzip",
        "gzip, deflate",
        "x-gzip, deflate",
        "application/zip",
        "application/gzip",
        "application/x-gzip",
        "application/x-zip-compressed",
    }

    parser: GzipParser = ModelToComponentFactory._get_parser(model, config)  # type: ignore  # based on the model, we know this will be a GzipParser

    if self._emit_connector_builder_messages:
        # Surprising but true: on a non-streamed response CompositeRawDecoder reads
        # response.content, which the requests library has already uncompressed, whereas
        # response.raw goes through urllib3 directly and stays compressed. Hence only the
        # inner parser is needed here.
        return CompositeRawDecoder(parser.inner_parser, False)

    header_rule = ({"Content-Encoding", "Content-Type"}, compressed_header_values, parser)
    return CompositeRawDecoder.by_headers(
        [header_rule],
        stream_response=True,
        fallback_parser=parser.inner_parser,
    )


@staticmethod
def create_iterable_decoder(
    model: IterableDecoderModel, config: Config, **kwargs: Any
) -> IterableDecoder:
    """Build an ``IterableDecoder`` with empty parameters."""
    return IterableDecoder(parameters={})


@staticmethod
def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder:
    """Build an ``XmlDecoder`` with empty parameters."""
    return XmlDecoder(parameters={})


def create_zipfile_decoder(
    self, model: ZipfileDecoderModel, config: Config, **kwargs: Any
) -> ZipfileDecoder:
    """Build a ``ZipfileDecoder`` around the parser of the model's nested decoder."""
    nested_parser = ModelToComponentFactory._get_parser(model.decoder, config)
    return ZipfileDecoder(parser=nested_parser)


@staticmethod
def _get_parser(model: BaseModel, config: Config) -> Parser:
    """Return the ``Parser`` matching a decoder model.

    Raises:
        ValueError: If the decoder type has no associated parser, or is unknown.
    """
    if isinstance(model, JsonDecoderModel):
        # The logic differs slightly from JsonDecoder, which keeps a legacy behaviour of
        # returning {} on error cases.
        return JsonParser()
    if isinstance(model, JsonItemsDecoderModel):
        return JsonItemsParser(
            items_path=model.items_path,
            encoding=model.encoding,
        )
    if isinstance(model, JsonlDecoderModel):
        return JsonLineParser()
    if isinstance(model, CsvDecoderModel):
        return CsvParser(
            encoding=model.encoding,
            delimiter=model.delimiter,
            set_values_to_none=model.set_values_to_none,
        )
    if isinstance(model, GzipDecoderModel):
        # Gzip wraps whatever parser the nested decoder maps to.
        wrapped_parser = ModelToComponentFactory._get_parser(model.decoder, config)
        return GzipParser(inner_parser=wrapped_parser)

    models_without_parser = (
        CustomDecoderModel,
        IterableDecoderModel,
        XmlDecoderModel,
        ZipfileDecoderModel,
    )
    if isinstance(model, models_without_parser):
        raise ValueError(f"Decoder type {model} does not have parser associated to it")

    raise ValueError(f"Unknown decoder type {model}")
@staticmethod
def create_json_file_schema_loader(
    model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any
) -> JsonFileSchemaLoader:
    """Build a ``JsonFileSchemaLoader``; a missing file path becomes an empty string."""
    schema_path = model.file_path or ""
    return JsonFileSchemaLoader(
        file_path=schema_path, config=config, parameters=model.parameters or {}
    )


def create_jwt_authenticator(
    self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any
) -> JwtAuthenticator:
    """Build a ``JwtAuthenticator`` from its declarative model.

    Missing JWT headers default to ``typ="JWT"`` with no ``kid``/``cty``; a missing
    payload defaults to no ``iss``/``sub``/``aud``. The request option is optional.
    """
    headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None)
    payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None)

    if model.request_option:
        request_option = self._create_component_from_model(model.request_option, config)
    else:
        request_option = None

    return JwtAuthenticator(
        config=config,
        parameters=model.parameters or {},
        algorithm=JwtAlgorithm(model.algorithm.value),
        secret_key=model.secret_key,
        base64_encode_secret_key=model.base64_encode_secret_key,
        token_duration=model.token_duration,
        header_prefix=model.header_prefix,
        kid=headers.kid,
        typ=headers.typ,
        cty=headers.cty,
        iss=payload.iss,
        sub=payload.sub,
        aud=payload.aud,
        additional_jwt_headers=model.additional_jwt_headers,
        additional_jwt_payload=model.additional_jwt_payload,
        passphrase=model.passphrase,
        request_option=request_option,
    )


def create_list_partition_router(
    self, model: ListPartitionRouterModel, config: Config, **kwargs: Any
) -> ListPartitionRouter:
    """Build a ``ListPartitionRouter``; the request option is created only if declared."""
    if model.request_option:
        request_option = self._create_component_from_model(model.request_option, config)
    else:
        request_option = None

    return ListPartitionRouter(
        cursor_field=model.cursor_field,
        request_option=request_option,
        values=model.values,
        config=config,
        parameters=model.parameters or {},
    )


@staticmethod
def create_min_max_datetime(
    model: MinMaxDatetimeModel, config: Config, **kwargs: Any
) -> MinMaxDatetime:
    """Build a ``MinMaxDatetime``; unset format and bounds become empty strings."""
    return MinMaxDatetime(
        datetime=model.datetime,
        datetime_format=model.datetime_format or "",
        max_datetime=model.max_datetime or "",
        min_datetime=model.min_datetime or "",
        parameters=model.parameters or {},
    )


@staticmethod
def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth:
    """Build a ``NoAuth`` authenticator carrying the model's parameters."""
    auth_parameters = model.parameters or {}
    return NoAuth(parameters=auth_parameters)


@staticmethod
def create_no_pagination(
    model: NoPaginationModel, config: Config, **kwargs: Any
) -> NoPagination:
    """Build a ``NoPagination`` paginator with empty parameters."""
    return NoPagination(parameters={})
{} 2864 ).eval(config), 2865 client_id=InterpolatedString.create( 2866 model.client_id, parameters=model.parameters or {} 2867 ).eval(config) 2868 if model.client_id 2869 else model.client_id, 2870 client_secret_name=InterpolatedString.create( 2871 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2872 ).eval(config), 2873 client_secret=InterpolatedString.create( 2874 model.client_secret, parameters=model.parameters or {} 2875 ).eval(config) 2876 if model.client_secret 2877 else model.client_secret, 2878 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2879 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2880 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2881 grant_type_name=InterpolatedString.create( 2882 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2883 ).eval(config), 2884 grant_type=InterpolatedString.create( 2885 model.grant_type or "refresh_token", parameters=model.parameters or {} 2886 ).eval(config), 2887 refresh_request_body=InterpolatedMapping( 2888 model.refresh_request_body or {}, parameters=model.parameters or {} 2889 ).eval(config), 2890 refresh_request_headers=InterpolatedMapping( 2891 model.refresh_request_headers or {}, parameters=model.parameters or {} 2892 ).eval(config), 2893 send_refresh_request_as_query_params=bool( 2894 model.send_refresh_request_as_query_params 2895 ), 2896 scopes=model.scopes, 2897 token_expiry_date_format=model.token_expiry_date_format, 2898 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2899 message_repository=self._message_repository, 2900 refresh_token_error_status_codes=refresh_token_error_status_codes, 2901 refresh_token_error_key=refresh_token_error_key, 2902 refresh_token_error_values=refresh_token_error_values, 2903 ) 2904 # ignore type error because fixing it would have a lot of dependencies, revisit later 2905 return 
DeclarativeOauth2Authenticator( # type: ignore 2906 access_token_name=model.access_token_name or "access_token", 2907 access_token_value=model.access_token_value, 2908 client_id_name=model.client_id_name or "client_id", 2909 client_id=model.client_id, 2910 client_secret_name=model.client_secret_name or "client_secret", 2911 client_secret=model.client_secret, 2912 expires_in_name=model.expires_in_name or "expires_in", 2913 grant_type_name=model.grant_type_name or "grant_type", 2914 grant_type=model.grant_type or "refresh_token", 2915 refresh_request_body=model.refresh_request_body, 2916 refresh_request_headers=model.refresh_request_headers, 2917 send_refresh_request_as_query_params=bool(model.send_refresh_request_as_query_params), 2918 refresh_token_name=model.refresh_token_name or "refresh_token", 2919 refresh_token=model.refresh_token, 2920 scopes=model.scopes, 2921 token_expiry_date=model.token_expiry_date, 2922 token_expiry_date_format=model.token_expiry_date_format, 2923 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2924 token_refresh_endpoint=model.token_refresh_endpoint, 2925 config=config, 2926 parameters=model.parameters or {}, 2927 message_repository=self._message_repository, 2928 profile_assertion=profile_assertion, 2929 use_profile_assertion=model.use_profile_assertion, 2930 refresh_token_error_status_codes=refresh_token_error_status_codes, 2931 refresh_token_error_key=refresh_token_error_key, 2932 refresh_token_error_values=refresh_token_error_values, 2933 ) 2934 2935 @staticmethod 2936 def _get_refresh_token_error_information( 2937 model: OAuthAuthenticatorModel, 2938 ) -> Tuple[Tuple[int, ...], str, Tuple[str, ...]]: 2939 """ 2940 In a previous version of the CDK, the auth error as config_error was only done if a refresh token updater was 2941 defined. As a transition, we added those fields on the OAuthAuthenticatorModel. This method ensures that the 2942 information is defined only once and return the right fields. 
2943 """ 2944 refresh_token_updater = model.refresh_token_updater 2945 is_defined_on_refresh_token_updated = refresh_token_updater and ( 2946 refresh_token_updater.refresh_token_error_status_codes 2947 or refresh_token_updater.refresh_token_error_key 2948 or refresh_token_updater.refresh_token_error_values 2949 ) 2950 is_defined_on_oauth_authenticator = ( 2951 model.refresh_token_error_status_codes 2952 or model.refresh_token_error_key 2953 or model.refresh_token_error_values 2954 ) 2955 if is_defined_on_refresh_token_updated and is_defined_on_oauth_authenticator: 2956 raise ValueError( 2957 "refresh_token_error should either be defined on the OAuthAuthenticatorModel or the RefreshTokenUpdaterModel, not both" 2958 ) 2959 2960 if is_defined_on_refresh_token_updated: 2961 not_optional_refresh_token_updater: RefreshTokenUpdaterModel = refresh_token_updater # type: ignore # we know from the condition that this is not None 2962 return ( 2963 tuple(not_optional_refresh_token_updater.refresh_token_error_status_codes) 2964 if not_optional_refresh_token_updater.refresh_token_error_status_codes 2965 else (), 2966 not_optional_refresh_token_updater.refresh_token_error_key or "", 2967 tuple(not_optional_refresh_token_updater.refresh_token_error_values) 2968 if not_optional_refresh_token_updater.refresh_token_error_values 2969 else (), 2970 ) 2971 elif is_defined_on_oauth_authenticator: 2972 return ( 2973 tuple(model.refresh_token_error_status_codes) 2974 if model.refresh_token_error_status_codes 2975 else (), 2976 model.refresh_token_error_key or "", 2977 tuple(model.refresh_token_error_values) if model.refresh_token_error_values else (), 2978 ) 2979 2980 # returning default values we think cover most cases 2981 return (400,), "error", ("invalid_grant", "invalid_permissions") 2982 2983 def create_offset_increment( 2984 self, 2985 model: OffsetIncrementModel, 2986 config: Config, 2987 decoder: Decoder, 2988 extractor_model: Optional[Union[CustomRecordExtractorModel, 
DpathExtractorModel]] = None, 2989 **kwargs: Any, 2990 ) -> OffsetIncrement: 2991 if isinstance(decoder, PaginationDecoderDecorator): 2992 inner_decoder = decoder.decoder 2993 else: 2994 inner_decoder = decoder 2995 decoder = PaginationDecoderDecorator(decoder=decoder) 2996 2997 if self._is_supported_decoder_for_pagination(inner_decoder): 2998 decoder_to_use = decoder 2999 else: 3000 raise ValueError( 3001 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 3002 ) 3003 3004 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 3005 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 3006 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 3007 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 3008 # When we have more time to investigate we can look into reusing the same component. 3009 extractor = ( 3010 self._create_component_from_model( 3011 model=extractor_model, config=config, decoder=decoder_to_use 3012 ) 3013 if extractor_model 3014 else None 3015 ) 3016 3017 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 3018 # If page_size is a string that represents an integer (not an interpolation), convert it back. 
3019 page_size = model.page_size 3020 if isinstance(page_size, str) and page_size.isdigit(): 3021 page_size = int(page_size) 3022 3023 return OffsetIncrement( 3024 page_size=page_size, 3025 config=config, 3026 decoder=decoder_to_use, 3027 extractor=extractor, 3028 inject_on_first_request=model.inject_on_first_request or False, 3029 parameters=model.parameters or {}, 3030 ) 3031 3032 @staticmethod 3033 def create_page_increment( 3034 model: PageIncrementModel, config: Config, **kwargs: Any 3035 ) -> PageIncrement: 3036 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 3037 # If page_size is a string that represents an integer (not an interpolation), convert it back. 3038 page_size = model.page_size 3039 if isinstance(page_size, str) and page_size.isdigit(): 3040 page_size = int(page_size) 3041 3042 return PageIncrement( 3043 page_size=page_size, 3044 config=config, 3045 start_from_page=model.start_from_page or 0, 3046 inject_on_first_request=model.inject_on_first_request or False, 3047 parameters=model.parameters or {}, 3048 ) 3049 3050 def create_parent_stream_config( 3051 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 3052 ) -> ParentStreamConfig: 3053 declarative_stream = self._create_component_from_model( 3054 model.stream, 3055 config=config, 3056 is_parent=True, 3057 **kwargs, 3058 ) 3059 request_option = ( 3060 self._create_component_from_model(model.request_option, config=config) 3061 if model.request_option 3062 else None 3063 ) 3064 3065 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 3066 raise ValueError( 3067 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 
3068 ) 3069 3070 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 3071 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 3072 ) 3073 3074 return ParentStreamConfig( 3075 parent_key=model.parent_key, 3076 request_option=request_option, 3077 stream=declarative_stream, 3078 partition_field=model.partition_field, 3079 config=config, 3080 incremental_dependency=model.incremental_dependency or False, 3081 parameters=model.parameters or {}, 3082 extra_fields=model.extra_fields, 3083 lazy_read_pointer=model_lazy_read_pointer, 3084 ) 3085 3086 def create_properties_from_endpoint( 3087 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 3088 ) -> PropertiesFromEndpoint: 3089 retriever = self._create_component_from_model( 3090 model=model.retriever, 3091 config=config, 3092 name="dynamic_properties", 3093 primary_key=None, 3094 stream_slicer=None, 3095 transformations=[], 3096 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 3097 ) 3098 return PropertiesFromEndpoint( 3099 property_field_path=model.property_field_path, 3100 retriever=retriever, 3101 config=config, 3102 parameters=model.parameters or {}, 3103 ) 3104 3105 def create_property_chunking( 3106 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 3107 ) -> PropertyChunking: 3108 record_merge_strategy = ( 3109 self._create_component_from_model( 3110 model=model.record_merge_strategy, config=config, **kwargs 3111 ) 3112 if model.record_merge_strategy 3113 else None 3114 ) 3115 3116 property_limit_type: PropertyLimitType 3117 match model.property_limit_type: 3118 case PropertyLimitTypeModel.property_count: 3119 property_limit_type = PropertyLimitType.property_count 3120 case PropertyLimitTypeModel.characters: 3121 property_limit_type = PropertyLimitType.characters 3122 case _: 3123 raise 
ValueError(f"Invalid PropertyLimitType {property_limit_type}") 3124 3125 return PropertyChunking( 3126 property_limit_type=property_limit_type, 3127 property_limit=model.property_limit, 3128 record_merge_strategy=record_merge_strategy, 3129 config=config, 3130 parameters=model.parameters or {}, 3131 ) 3132 3133 def create_query_properties( 3134 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 3135 ) -> QueryProperties: 3136 if isinstance(model.property_list, list): 3137 property_list = model.property_list 3138 else: 3139 property_list = self._create_component_from_model( 3140 model=model.property_list, config=config, **kwargs 3141 ) 3142 3143 property_chunking = ( 3144 self._create_component_from_model( 3145 model=model.property_chunking, config=config, **kwargs 3146 ) 3147 if model.property_chunking 3148 else None 3149 ) 3150 3151 property_selector = ( 3152 self._create_component_from_model( 3153 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3154 ) 3155 if model.property_selector 3156 else None 3157 ) 3158 3159 return QueryProperties( 3160 property_list=property_list, 3161 always_include_properties=model.always_include_properties, 3162 property_chunking=property_chunking, 3163 property_selector=property_selector, 3164 config=config, 3165 parameters=model.parameters or {}, 3166 ) 3167 3168 def create_json_schema_property_selector( 3169 self, 3170 model: JsonSchemaPropertySelectorModel, 3171 config: Config, 3172 *, 3173 stream_name: str, 3174 **kwargs: Any, 3175 ) -> JsonSchemaPropertySelector: 3176 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3177 3178 transformations = [] 3179 if model.transformations: 3180 for transformation_model in model.transformations: 3181 transformations.append( 3182 self._create_component_from_model(model=transformation_model, config=config) 3183 ) 3184 3185 return JsonSchemaPropertySelector( 3186 configured_stream=configured_stream, 3187 
def create_record_selector(
    self,
    model: RecordSelectorModel,
    config: Config,
    *,
    name: str,
    transformations: List[RecordTransformation] | None = None,
    decoder: Decoder | None = None,
    client_side_incremental_sync_cursor: Optional[Cursor] = None,
    file_uploader: Optional[DefaultFileUploader] = None,
    **kwargs: Any,
) -> RecordSelector:
    """Build the RecordSelector described by ``model``.

    Args:
        model: Declarative record selector definition.
        config: Connector configuration.
        name: Stream name given to the selector.
        transformations: Record transformations to apply; ``None`` means no transformation.
        decoder: Decoder forwarded to the extractor factory.
        client_side_incremental_sync_cursor: When truthy, the record filter is replaced by a
            ClientSideIncrementalRecordFilterDecorator bound to this cursor, and
            ``transform_before_filtering`` defaults to ``True`` instead of ``False``.
        file_uploader: Optional file uploader handed to the selector.
        **kwargs: Accepted for factory compatibility; not used here.

    Returns:
        The configured RecordSelector.
    """
    extractor = self._create_component_from_model(
        model=model.extractor, decoder=decoder, config=config
    )
    record_filter = (
        self._create_component_from_model(model.record_filter, config=config)
        if model.record_filter
        else None
    )

    has_client_side_cursor = bool(client_side_incremental_sync_cursor)
    if has_client_side_cursor:
        record_filter = ClientSideIncrementalRecordFilterDecorator(
            config=config,
            # `or {}` keeps this consistent with the RecordSelector built below
            # (model.parameters may be None).
            parameters=model.parameters or {},
            condition=(
                getattr(model.record_filter, "condition", None) if model.record_filter else None
            ),
            cursor=client_side_incremental_sync_cursor,
        )

    # An explicit value on the model always wins. Otherwise transform first only when
    # filtering client-side on the cursor.
    transform_before_filtering = model.transform_before_filtering
    if transform_before_filtering is None:
        transform_before_filtering = has_client_side_cursor

    # Resolve the default locally instead of writing it back onto the caller's model.
    normalization_model = model.schema_normalization
    if normalization_model is None:
        # default to no schema normalization if not set
        normalization_model = SchemaNormalizationModel.None_

    schema_normalization = (
        TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[normalization_model])
        if isinstance(normalization_model, SchemaNormalizationModel)
        else self._create_component_from_model(normalization_model, config=config)  # type: ignore[arg-type] # custom normalization model expected here
    )

    return RecordSelector(
        extractor=extractor,
        name=name,
        config=config,
        record_filter=record_filter,
        transformations=transformations or [],
        file_uploader=file_uploader,
        schema_normalization=schema_normalization,
        parameters=model.parameters or {},
        transform_before_filtering=transform_before_filtering,
    )
def create_simple_retriever(
    self,
    model: SimpleRetrieverModel,
    config: Config,
    *,
    name: str,
    primary_key: Optional[Union[str, List[str], List[List[str]]]],
    request_options_provider: Optional[RequestOptionsProvider] = None,
    cursor: Optional[Cursor] = None,
    has_stop_condition_cursor: bool = False,
    is_client_side_incremental_sync: bool = False,
    transformations: List[RecordTransformation],
    file_uploader: Optional[DefaultFileUploader] = None,
    incremental_sync: Optional[
        Union[IncrementingCountCursorModel, DatetimeBasedCursorModel]
    ] = None,
    use_cache: Optional[bool] = None,
    log_formatter: Optional[Callable[[Response], Any]] = None,
    partition_router: Optional[PartitionRouter] = None,
    **kwargs: Any,
) -> SimpleRetriever:
    """Build a SimpleRetriever (or a LazySimpleRetriever) from its declarative model.

    The method builds the decoder, record selector, optional query properties, requester
    and paginator, then returns a LazySimpleRetriever when the lazy-read conditions below
    are met and a SimpleRetriever otherwise.

    Side effect: when QueryProperties entries are found in
    ``model.requester.request_parameters``, that mapping is replaced on the model by a
    copy without those entries.

    Args:
        model: Declarative retriever definition.
        config: Connector configuration.
        name: Stream name.
        primary_key: Primary key passed through to the retriever.
        request_options_provider: Provider of request options. Falls back to a
            DefaultRequestOptionsProvider, which is itself replaced by ``partition_router``
            when that argument is a PartitionRouter.
        cursor: Stream cursor. Defaults to a FinalStateCursor when ``None``.
        has_stop_condition_cursor: When true, ``cursor`` is handed to the paginator factory
            as ``cursor_used_for_stop_condition``.
        is_client_side_incremental_sync: When true, ``cursor`` is handed to the record
            selector factory as ``client_side_incremental_sync_cursor``.
        transformations: Record transformations forwarded to the record selector.
        file_uploader: Optional file uploader forwarded to the record selector.
        incremental_sync: Incremental sync model; only read to validate the lazy-read path.
        use_cache: Forwarded to the requester factory.
        log_formatter: Optional formatter, resolved through ``_get_log_formatter``.
        partition_router: See ``request_options_provider``.
        **kwargs: Accepted for factory compatibility; not used here.

    Returns:
        A LazySimpleRetriever or a SimpleRetriever.

    Raises:
        ValueError: If properties are declared both in ``fetch_properties_from_endpoint``
            and ``request_parameters``; if more than one QueryProperties entry is found in
            ``request_parameters``; if the lazy path is used with an unsupported cursor
            type, with a step/cursor granularity, or with a non-JSON decoder; or if
            pagination reset limits are combined with a record filter.
    """

    def _get_url(req: Requester) -> str:
        """
        Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever.
        This is needed because the URL is not set until the requester is created.
        """

        # Prefer the value declared on the model; otherwise ask the built requester.
        _url: str = (
            model.requester.url
            if hasattr(model.requester, "url") and model.requester.url is not None
            else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
        )
        _url_base: str = (
            model.requester.url_base
            if hasattr(model.requester, "url_base") and model.requester.url_base is not None
            else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None)
        )

        # An empty/None url falls back to the url_base.
        return _url or _url_base

    if cursor is None:
        cursor = FinalStateCursor(name, None, self._message_repository)

    decoder = (
        self._create_component_from_model(model=model.decoder, config=config)
        if model.decoder
        else JsonDecoder(parameters={})
    )
    record_selector = self._create_component_from_model(
        model=model.record_selector,
        name=name,
        config=config,
        decoder=decoder,
        transformations=transformations,
        client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None,
        file_uploader=file_uploader,
    )

    # Query properties can come from three places, checked in this order:
    # request_parameters, fetch_properties_from_endpoint, then query_properties.
    query_properties: Optional[QueryProperties] = None
    query_properties_key: Optional[str] = None
    self._ensure_query_properties_to_model(model.requester)
    if self._has_query_properties_in_request_parameters(model.requester):
        # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple
        # places instead of default to request_parameters which isn't clearly documented
        if (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            raise ValueError(
                f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
            )

        query_properties_definitions = []
        for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters()
            if isinstance(request_parameter, QueryPropertiesModel):
                query_properties_key = key
                query_properties_definitions.append(request_parameter)

        if len(query_properties_definitions) > 1:
            raise ValueError(
                f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
            )

        if len(query_properties_definitions) == 1:
            query_properties = self._create_component_from_model(
                model=query_properties_definitions[0], stream_name=name, config=config
            )

        # Removes QueryProperties components from the interpolated mappings because it has been designed
        # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly
        # instead of through jinja interpolation
        if hasattr(model.requester, "request_parameters") and isinstance(
            model.requester.request_parameters, Mapping
        ):
            model.requester.request_parameters = self._remove_query_properties(
                model.requester.request_parameters
            )
    elif (
        hasattr(model.requester, "fetch_properties_from_endpoint")
        and model.requester.fetch_properties_from_endpoint
    ):
        # todo: Deprecate this condition once dependent connectors migrate to query_properties
        query_properties_definition = QueryPropertiesModel(
            type="QueryProperties",
            property_list=model.requester.fetch_properties_from_endpoint,
            always_include_properties=None,
            property_chunking=None,
        )  # type: ignore # $parameters has a default value

        query_properties = self.create_query_properties(
            model=query_properties_definition,
            stream_name=name,
            config=config,
        )
    elif hasattr(model.requester, "query_properties") and model.requester.query_properties:
        query_properties = self.create_query_properties(
            model=model.requester.query_properties,
            stream_name=name,
            config=config,
        )

    # The requester is built after the QueryProperties entries were stripped from
    # request_parameters above.
    requester = self._create_component_from_model(
        model=model.requester,
        decoder=decoder,
        name=name,
        query_properties_key=query_properties_key,
        use_cache=use_cache,
        config=config,
    )

    if not request_options_provider:
        request_options_provider = DefaultRequestOptionsProvider(parameters={})
    # A default provider is replaced by the partition router when one is supplied.
    if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(
        partition_router, PartitionRouter
    ):
        request_options_provider = partition_router

    paginator = (
        self._create_component_from_model(
            model=model.paginator,
            config=config,
            url_base=_get_url(requester),
            extractor_model=model.record_selector.extractor,
            decoder=decoder,
            cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None,
        )
        if model.paginator
        else NoPagination(parameters={})
    )

    ignore_stream_slicer_parameters_on_paginated_requests = (
        model.ignore_stream_slicer_parameters_on_paginated_requests or False
    )

    # Lazy-read path: substream partition router, no existing state for this stream, and at
    # least one parent stream config declaring a lazy_read_pointer.
    if (
        model.partition_router
        and isinstance(model.partition_router, SubstreamPartitionRouterModel)
        and not bool(self._connector_state_manager.get_stream_state(name, None))
        and any(
            parent_stream_config.lazy_read_pointer
            for parent_stream_config in model.partition_router.parent_stream_configs
        )
    ):
        if incremental_sync:
            if incremental_sync.type != "DatetimeBasedCursor":
                raise ValueError(
                    f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                )

            elif incremental_sync.step or incremental_sync.cursor_granularity:
                raise ValueError(
                    f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                )

        if model.decoder and model.decoder.type != "JsonDecoder":
            raise ValueError(
                f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
            )

        # NOTE(review): query_properties, log_formatter and the pagination tracker factory
        # are not passed on this path - confirm this is intended for LazySimpleRetriever.
        return LazySimpleRetriever(
            name=name,
            paginator=paginator,
            primary_key=primary_key,
            requester=requester,
            record_selector=record_selector,
            stream_slicer=_NO_STREAM_SLICING,
            request_option_provider=request_options_provider,
            config=config,
            ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
            parameters=model.parameters or {},
        )

    # This check runs only on the regular path; the lazy path has already returned above.
    if (
        model.record_selector.record_filter
        and model.pagination_reset
        and model.pagination_reset.limits
    ):
        raise ValueError("PaginationResetLimits are not supported while having record filter.")

    return SimpleRetriever(
        name=name,
        paginator=paginator,
        primary_key=primary_key,
        requester=requester,
        record_selector=record_selector,
        stream_slicer=_NO_STREAM_SLICING,
        request_option_provider=request_options_provider,
        config=config,
        ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
        additional_query_properties=query_properties,
        log_formatter=self._get_log_formatter(log_formatter, name),
        pagination_tracker_factory=self._create_pagination_tracker_factory(
            model.pagination_reset, cursor
        ),
        parameters=model.parameters or {},
    )
def _get_log_formatter(
    self, log_formatter: Callable[[Response], Any] | None, name: str
) -> Callable[[Response], Any] | None:
    """Pick the HTTP log formatter to use for stream ``name``.

    Returns ``None`` unless ``_should_limit_slices_fetched()`` is true. In that case the
    caller-supplied ``log_formatter`` is returned when truthy, otherwise a default
    formatter built on ``format_http_message`` is returned.
    """
    if not self._should_limit_slices_fetched():
        return None

    if log_formatter:
        return log_formatter

    def _default_formatter(response: Response) -> Any:
        return format_http_message(
            response,
            f"Stream '{name}' request",
            f"Request performed in order to extract records for stream '{name}'",
            name,
        )

    return _default_formatter
3596 This is used to limit the number of slices fetched during tests. 3597 """ 3598 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3599 3600 @staticmethod 3601 def _has_query_properties_in_request_parameters( 3602 requester: Union[HttpRequesterModel, CustomRequesterModel], 3603 ) -> bool: 3604 if not hasattr(requester, "request_parameters"): 3605 return False 3606 request_parameters = requester.request_parameters 3607 if request_parameters and isinstance(request_parameters, Mapping): 3608 for request_parameter in request_parameters.values(): 3609 if isinstance(request_parameter, QueryPropertiesModel): 3610 return True 3611 return False 3612 3613 @staticmethod 3614 def _remove_query_properties( 3615 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3616 ) -> Mapping[str, str]: 3617 return { 3618 parameter_field: request_parameter 3619 for parameter_field, request_parameter in request_parameters.items() 3620 if not isinstance(request_parameter, QueryPropertiesModel) 3621 } 3622 3623 def create_state_delegating_stream( 3624 self, 3625 model: StateDelegatingStreamModel, 3626 config: Config, 3627 **kwargs: Any, 3628 ) -> DefaultStream: 3629 if ( 3630 model.full_refresh_stream.name != model.name 3631 or model.name != model.incremental_stream.name 3632 ): 3633 raise ValueError( 3634 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3635 ) 3636 3637 # Resolve api_retention_period with config context (supports Jinja2 interpolation) 3638 resolved_retention_period: Optional[str] = None 3639 if model.api_retention_period: 3640 interpolated_retention = InterpolatedString.create( 3641 model.api_retention_period, parameters=model.parameters or {} 3642 ) 3643 resolved_value = interpolated_retention.eval(config=config) 3644 if resolved_value: 3645 resolved_retention_period = str(resolved_value) 3646 3647 if resolved_retention_period: 3648 for stream_model in (model.full_refresh_stream, model.incremental_stream): 3649 if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel): 3650 raise ValueError( 3651 f"Stream '{model.name}' uses IncrementingCountCursor which is not supported " 3652 f"with api_retention_period. IncrementingCountCursor does not use datetime-based " 3653 f"cursors, so cursor age validation cannot be performed." 3654 ) 3655 3656 stream_state = self._connector_state_manager.get_stream_state(model.name, None) 3657 3658 if not stream_state: 3659 return self._create_component_from_model( # type: ignore[no-any-return] 3660 model.full_refresh_stream, config=config, **kwargs 3661 ) 3662 3663 incremental_stream: DefaultStream = self._create_component_from_model( 3664 model.incremental_stream, config=config, **kwargs 3665 ) # type: ignore[assignment] 3666 3667 # Only run cursor age validation for streams that are in the configured 3668 # catalog (or when no catalog was provided, e.g. during discover / connector 3669 # builder). Streams not selected by the user but instantiated as parent-stream 3670 # dependencies must not go through this path because it emits state messages 3671 # that the destination does not know about, causing "Stream not found" crashes. 
@staticmethod
def _is_cursor_older_than_retention_period(
    stream_state: Mapping[str, Any],
    full_refresh_cursor: Cursor,
    incremental_cursor: Cursor,
    api_retention_period: str,
    stream_name: str,
) -> bool:
    """Check whether the cursor value stored in the state is older than the API retention period.

    The state is interpreted by the full refresh cursor first. If that cursor cannot
    extract a datetime (returns ``None``), the incremental cursor is tried. The original
    notes on this method state that FinalStateCursor reports "now" for a completed full
    refresh state, which is always within retention.

    Args:
        stream_state: Current state of the stream.
        full_refresh_cursor: Cursor of the full refresh stream, consulted first.
        incremental_cursor: Cursor of the incremental stream, consulted as a fallback.
        api_retention_period: ISO 8601 duration, parsed with ``parse_duration``.
        stream_name: Stream name, used in the warning message only.

    Returns:
        ``True`` if the cursor is older than the retention period, or if neither cursor
        could read the state (full refresh should be used). ``False`` if the cursor is
        within the retention period (incremental is safe).
    """
    retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - parse_duration(
        api_retention_period
    )

    cursor_datetime = full_refresh_cursor.get_cursor_datetime_from_state(stream_state)
    if cursor_datetime is None:
        cursor_datetime = incremental_cursor.get_cursor_datetime_from_state(stream_state)

    if cursor_datetime is None:
        # Neither cursor could parse the state - fall back to full refresh to be safe
        return True

    if cursor_datetime.tzinfo is None:
        # Comparing a naive datetime with the aware cutoff raises TypeError.
        # NOTE(review): naive values are assumed to be UTC - confirm against the cursors.
        cursor_datetime = cursor_datetime.replace(tzinfo=datetime.timezone.utc)

    if cursor_datetime < retention_cutoff:
        # Use the module logger, like the rest of this file, instead of the root logger.
        LOGGER.warning(
            "Stream '%s' has a cursor value older than "
            "the API's retention period of %s "
            "(cutoff: %s). "
            "Falling back to full refresh to avoid data loss.",
            stream_name,
            api_retention_period,
            retention_cutoff.isoformat(),
        )
        return True

    return False
def _get_async_job_status(self, status: str) -> AsyncJobStatus:
    """Translate a CDK status field name into its AsyncJobStatus member.

    Raises:
        ValueError: If ``status`` is not one of the supported CDK status names.
    """
    if status == "running":
        return AsyncJobStatus.RUNNING
    if status == "completed":
        return AsyncJobStatus.COMPLETED
    if status == "failed":
        return AsyncJobStatus.FAILED
    if status == "timeout":
        return AsyncJobStatus.TIMED_OUT
    if status == "skipped":
        return AsyncJobStatus.SKIPPED
    raise ValueError(f"Unsupported CDK status {status}")
decoder=_decoder, 3833 config=config, 3834 url_base="", 3835 ) 3836 if model.download_paginator 3837 else NoPagination(parameters={}) 3838 ) 3839 3840 return SimpleRetriever( 3841 requester=requester, 3842 record_selector=record_selector, 3843 primary_key=None, 3844 name=name, 3845 paginator=paginator, 3846 config=config, 3847 parameters={}, 3848 log_formatter=self._get_log_formatter(None, name), 3849 ) 3850 3851 def _get_job_timeout() -> datetime.timedelta: 3852 user_defined_timeout: Optional[int] = ( 3853 int( 3854 InterpolatedString.create( 3855 str(model.polling_job_timeout), 3856 parameters={}, 3857 ).eval(config) 3858 ) 3859 if model.polling_job_timeout 3860 else None 3861 ) 3862 3863 # check for user defined timeout during the test read or 15 minutes 3864 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3865 # default value for non-connector builder is 60 minutes. 3866 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3867 3868 return ( 3869 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3870 ) 3871 3872 decoder = ( 3873 self._create_component_from_model(model=model.decoder, config=config) 3874 if model.decoder 3875 else JsonDecoder(parameters={}) 3876 ) 3877 record_selector = self._create_component_from_model( 3878 model=model.record_selector, 3879 config=config, 3880 decoder=decoder, 3881 name=name, 3882 transformations=transformations, 3883 client_side_incremental_sync=client_side_incremental_sync, 3884 ) 3885 3886 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3887 if self._should_limit_slices_fetched(): 3888 stream_slicer = cast( 3889 StreamSlicer, 3890 StreamSlicerTestReadDecorator( 3891 wrapped_slicer=stream_slicer, 3892 maximum_number_of_slices=self._limit_slices_fetched or 5, 3893 ), 3894 ) 3895 3896 creation_requester = self._create_component_from_model( 3897 model=model.creation_requester, 3898 decoder=decoder, 3899 config=config, 
3900 name=f"job creation - {name}", 3901 ) 3902 polling_requester = self._create_component_from_model( 3903 model=model.polling_requester, 3904 decoder=decoder, 3905 config=config, 3906 name=f"job polling - {name}", 3907 ) 3908 job_download_components_name = f"job download - {name}" 3909 download_decoder = ( 3910 self._create_component_from_model(model=model.download_decoder, config=config) 3911 if model.download_decoder 3912 else JsonDecoder(parameters={}) 3913 ) 3914 download_extractor = ( 3915 self._create_component_from_model( 3916 model=model.download_extractor, 3917 config=config, 3918 decoder=download_decoder, 3919 parameters=model.parameters, 3920 ) 3921 if model.download_extractor 3922 else DpathExtractor( 3923 [], 3924 config=config, 3925 decoder=download_decoder, 3926 parameters=model.parameters or {}, 3927 ) 3928 ) 3929 download_requester = self._create_component_from_model( 3930 model=model.download_requester, 3931 decoder=download_decoder, 3932 config=config, 3933 name=job_download_components_name, 3934 ) 3935 download_retriever = _get_download_retriever( 3936 download_requester, download_extractor, download_decoder 3937 ) 3938 abort_requester = ( 3939 self._create_component_from_model( 3940 model=model.abort_requester, 3941 decoder=decoder, 3942 config=config, 3943 name=f"job abort - {name}", 3944 ) 3945 if model.abort_requester 3946 else None 3947 ) 3948 delete_requester = ( 3949 self._create_component_from_model( 3950 model=model.delete_requester, 3951 decoder=decoder, 3952 config=config, 3953 name=f"job delete - {name}", 3954 ) 3955 if model.delete_requester 3956 else None 3957 ) 3958 download_target_requester = ( 3959 self._create_component_from_model( 3960 model=model.download_target_requester, 3961 decoder=decoder, 3962 config=config, 3963 name=f"job extract_url - {name}", 3964 ) 3965 if model.download_target_requester 3966 else None 3967 ) 3968 status_extractor = self._create_component_from_model( 3969 model=model.status_extractor, 
decoder=decoder, config=config, name=name 3970 ) 3971 download_target_extractor = ( 3972 self._create_component_from_model( 3973 model=model.download_target_extractor, 3974 decoder=decoder, 3975 config=config, 3976 name=name, 3977 ) 3978 if model.download_target_extractor 3979 else None 3980 ) 3981 3982 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3983 creation_requester=creation_requester, 3984 polling_requester=polling_requester, 3985 download_retriever=download_retriever, 3986 download_target_requester=download_target_requester, 3987 abort_requester=abort_requester, 3988 delete_requester=delete_requester, 3989 status_extractor=status_extractor, 3990 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3991 download_target_extractor=download_target_extractor, 3992 job_timeout=_get_job_timeout(), 3993 ) 3994 3995 failed_retry_wait_time_in_seconds: Optional[int] = ( 3996 int( 3997 InterpolatedString.create( 3998 str(model.failed_retry_wait_time_in_seconds), 3999 parameters={}, 4000 ).eval(config) 4001 ) 4002 if model.failed_retry_wait_time_in_seconds 4003 else None 4004 ) 4005 4006 async_job_partition_router = AsyncJobPartitionRouter( 4007 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 4008 job_repository, 4009 stream_slices, 4010 self._job_tracker, 4011 self._message_repository, 4012 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 4013 has_bulk_parent=False, 4014 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 4015 # `None` == default retry is set to 3 attempts, under the hood. 
def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec:
    """Build the connector Spec, including its config normalization components.

    Migrations, transformations and validations are read from
    ``model.config_normalization_rules`` when present; each missing or empty group
    produces an empty list.
    """
    rules = model.config_normalization_rules

    def _build_all(rule_models: Optional[List[Any]]) -> List[Any]:
        # Build one component per declared rule model; None/empty yields no component.
        return [
            self._create_component_from_model(rule_model, config)
            for rule_model in (rule_models or [])
        ]

    config_migrations = _build_all(rules.config_migrations if rules else None)
    config_transformations = _build_all(rules.transformations if rules else None)
    config_validations = _build_all(rules.validations if rules else None)

    return Spec(
        connection_specification=model.connection_specification,
        documentation_url=model.documentation_url,
        advanced_auth=model.advanced_auth,
        parameters={},
        config_migrations=config_migrations,
        config_transformations=config_transformations,
        config_validations=config_validations,
    )
SubstreamPartitionRouter: 4084 parent_stream_configs = [] 4085 if model.parent_stream_configs: 4086 parent_stream_configs.extend( 4087 [ 4088 self.create_parent_stream_config_with_substream_wrapper( 4089 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 4090 ) 4091 for parent_stream_config in model.parent_stream_configs 4092 ] 4093 ) 4094 4095 return SubstreamPartitionRouter( 4096 parent_stream_configs=parent_stream_configs, 4097 parameters=model.parameters or {}, 4098 config=config, 4099 ) 4100 4101 def create_parent_stream_config_with_substream_wrapper( 4102 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 4103 ) -> Any: 4104 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 4105 4106 parent_state: Optional[Mapping[str, Any]] = ( 4107 child_state if model.incremental_dependency and child_state else None 4108 ) 4109 connector_state_manager = self._instantiate_parent_stream_state_manager( 4110 child_state, config, model, parent_state 4111 ) 4112 4113 substream_factory = ModelToComponentFactory( 4114 connector_state_manager=connector_state_manager, 4115 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 4116 limit_slices_fetched=self._limit_slices_fetched, 4117 emit_connector_builder_messages=self._emit_connector_builder_messages, 4118 disable_retries=self._disable_retries, 4119 disable_cache=self._disable_cache, 4120 message_repository=StateFilteringMessageRepository( 4121 LogAppenderMessageRepositoryDecorator( 4122 { 4123 "airbyte_cdk": {"stream": {"is_substream": True}}, 4124 "http": {"is_auxiliary": True}, 4125 }, 4126 self._message_repository, 4127 self._evaluate_log_level(self._emit_connector_builder_messages), 4128 ), 4129 ), 4130 api_budget=self._api_budget, 4131 ) 4132 4133 return substream_factory.create_parent_stream_config( 4134 model=model, config=config, stream_name=stream_name, **kwargs 4135 ) 4136 4137 def 
_instantiate_parent_stream_state_manager( 4138 self, 4139 child_state: MutableMapping[str, Any], 4140 config: Config, 4141 model: ParentStreamConfigModel, 4142 parent_state: Optional[Mapping[str, Any]] = None, 4143 ) -> ConnectorStateManager: 4144 """ 4145 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 4146 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 4147 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 4148 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 4149 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 4150 incremental_dependency is set. 4151 """ 4152 if model.incremental_dependency and child_state: 4153 parent_stream_name = model.stream.name or "" 4154 extracted_parent_state = ConcurrentPerPartitionCursor.get_parent_state( 4155 child_state, parent_stream_name 4156 ) 4157 4158 if not extracted_parent_state: 4159 extracted_parent_state = ConcurrentPerPartitionCursor.get_global_state( 4160 child_state, parent_stream_name 4161 ) 4162 4163 if not extracted_parent_state and not isinstance(extracted_parent_state, dict): 4164 cursor_values = child_state.values() 4165 if cursor_values and len(cursor_values) == 1: 4166 incremental_sync_model: Union[ 4167 DatetimeBasedCursorModel, 4168 IncrementingCountCursorModel, 4169 ] = ( 4170 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 4171 if isinstance(model.stream, DeclarativeStreamModel) 4172 else self._get_state_delegating_stream_model( 4173 model.stream, parent_state=parent_state 4174 ).incremental_sync 4175 ) 4176 cursor_field = InterpolatedString.create( 4177 incremental_sync_model.cursor_field, 4178 
parameters=incremental_sync_model.parameters or {}, 4179 ).eval(config) 4180 extracted_parent_state = AirbyteStateMessage( 4181 type=AirbyteStateType.STREAM, 4182 stream=AirbyteStreamState( 4183 stream_descriptor=StreamDescriptor( 4184 name=parent_stream_name, namespace=None 4185 ), 4186 stream_state=AirbyteStateBlob( 4187 {cursor_field: list(cursor_values)[0]} 4188 ), 4189 ), 4190 ) 4191 return ConnectorStateManager([extracted_parent_state] if extracted_parent_state else []) 4192 4193 return ConnectorStateManager([]) 4194 4195 @staticmethod 4196 def create_wait_time_from_header( 4197 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 4198 ) -> WaitTimeFromHeaderBackoffStrategy: 4199 return WaitTimeFromHeaderBackoffStrategy( 4200 header=model.header, 4201 parameters=model.parameters or {}, 4202 config=config, 4203 regex=model.regex, 4204 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 4205 if model.max_waiting_time_in_seconds is not None 4206 else None, 4207 ) 4208 4209 @staticmethod 4210 def create_wait_until_time_from_header( 4211 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 4212 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 4213 return WaitUntilTimeFromHeaderBackoffStrategy( 4214 header=model.header, 4215 parameters=model.parameters or {}, 4216 config=config, 4217 min_wait=model.min_wait, 4218 regex=model.regex, 4219 ) 4220 4221 def get_message_repository(self) -> MessageRepository: 4222 return self._message_repository 4223 4224 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 4225 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 4226 4227 @staticmethod 4228 def create_components_mapping_definition( 4229 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 4230 ) -> ComponentMappingDefinition: 4231 interpolated_value = InterpolatedString.create( 4232 model.value, parameters=model.parameters or {} 4233 ) 4234 field_path = [ 4235 
InterpolatedString.create(path, parameters=model.parameters or {}) 4236 for path in model.field_path 4237 ] 4238 return ComponentMappingDefinition( 4239 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 4240 value=interpolated_value, 4241 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 4242 create_or_update=model.create_or_update, 4243 condition=model.condition, 4244 parameters=model.parameters or {}, 4245 ) 4246 4247 def create_http_components_resolver( 4248 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 4249 ) -> Any: 4250 retriever = self._create_component_from_model( 4251 model=model.retriever, 4252 config=config, 4253 name=f"{stream_name if stream_name else '__http_components_resolver'}", 4254 primary_key=None, 4255 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4256 transformations=[], 4257 ) 4258 4259 components_mapping = [] 4260 for component_mapping_definition_model in model.components_mapping: 4261 if component_mapping_definition_model.condition: 4262 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4263 components_mapping.append( 4264 self._create_component_from_model( 4265 model=component_mapping_definition_model, 4266 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4267 component_mapping_definition_model.value_type 4268 ), 4269 config=config, 4270 ) 4271 ) 4272 4273 return HttpComponentsResolver( 4274 retriever=retriever, 4275 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4276 config=config, 4277 components_mapping=components_mapping, 4278 parameters=model.parameters or {}, 4279 ) 4280 4281 @staticmethod 4282 def create_stream_config( 4283 model: StreamConfigModel, config: Config, **kwargs: Any 4284 ) -> StreamConfig: 4285 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 4286 [x 
for x in model.configs_pointer] if model.configs_pointer else [] 4287 ) 4288 4289 return StreamConfig( 4290 configs_pointer=model_configs_pointer, 4291 default_values=model.default_values, 4292 parameters=model.parameters or {}, 4293 ) 4294 4295 def create_config_components_resolver( 4296 self, 4297 model: ConfigComponentsResolverModel, 4298 config: Config, 4299 ) -> Any: 4300 model_stream_configs = ( 4301 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 4302 ) 4303 4304 stream_configs = [ 4305 self._create_component_from_model( 4306 stream_config, config=config, parameters=model.parameters or {} 4307 ) 4308 for stream_config in model_stream_configs 4309 ] 4310 4311 components_mapping = [ 4312 self._create_component_from_model( 4313 model=components_mapping_definition_model, 4314 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4315 components_mapping_definition_model.value_type 4316 ), 4317 config=config, 4318 parameters=model.parameters, 4319 ) 4320 for components_mapping_definition_model in model.components_mapping 4321 ] 4322 4323 return ConfigComponentsResolver( 4324 stream_configs=stream_configs, 4325 config=config, 4326 components_mapping=components_mapping, 4327 parameters=model.parameters or {}, 4328 ) 4329 4330 def create_parametrized_components_resolver( 4331 self, 4332 model: ParametrizedComponentsResolverModel, 4333 config: Config, 4334 ) -> ParametrizedComponentsResolver: 4335 stream_parameters = StreamParametersDefinition( 4336 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 4337 ) 4338 4339 components_mapping = [] 4340 for components_mapping_definition_model in model.components_mapping: 4341 if components_mapping_definition_model.condition: 4342 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4343 components_mapping.append( 4344 self._create_component_from_model( 4345 model=components_mapping_definition_model, 4346 
value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4347 components_mapping_definition_model.value_type 4348 ), 4349 config=config, 4350 ) 4351 ) 4352 return ParametrizedComponentsResolver( 4353 stream_parameters=stream_parameters, 4354 config=config, 4355 components_mapping=components_mapping, 4356 parameters=model.parameters or {}, 4357 ) 4358 4359 _UNSUPPORTED_DECODER_ERROR = ( 4360 "Specified decoder of {decoder_type} is not supported for pagination." 4361 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4362 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4363 ) 4364 4365 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4366 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4367 return True 4368 elif isinstance(decoder, CompositeRawDecoder): 4369 return self._is_supported_parser_for_pagination(decoder.parser) 4370 else: 4371 return False 4372 4373 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4374 if isinstance(parser, JsonParser): 4375 return True 4376 elif isinstance(parser, GzipParser): 4377 return isinstance(parser.inner_parser, JsonParser) 4378 else: 4379 return False 4380 4381 def create_http_api_budget( 4382 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4383 ) -> HttpAPIBudget: 4384 policies = [ 4385 self._create_component_from_model(model=policy, config=config) 4386 for policy in model.policies 4387 ] 4388 4389 return HttpAPIBudget( 4390 policies=policies, 4391 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4392 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4393 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4394 ) 4395 4396 def create_fixed_window_call_rate_policy( 4397 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 
4398 ) -> FixedWindowCallRatePolicy: 4399 matchers = [ 4400 self._create_component_from_model(model=matcher, config=config) 4401 for matcher in model.matchers 4402 ] 4403 4404 # Set the initial reset timestamp to 10 days from now. 4405 # This value will be updated by the first request. 4406 return FixedWindowCallRatePolicy( 4407 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4408 period=parse_duration(model.period), 4409 call_limit=model.call_limit, 4410 matchers=matchers, 4411 ) 4412 4413 def create_file_uploader( 4414 self, model: FileUploaderModel, config: Config, **kwargs: Any 4415 ) -> FileUploader: 4416 name = "File Uploader" 4417 requester = self._create_component_from_model( 4418 model=model.requester, 4419 config=config, 4420 name=name, 4421 **kwargs, 4422 ) 4423 download_target_extractor = self._create_component_from_model( 4424 model=model.download_target_extractor, 4425 config=config, 4426 name=name, 4427 **kwargs, 4428 ) 4429 emit_connector_builder_messages = self._emit_connector_builder_messages 4430 file_uploader = DefaultFileUploader( 4431 requester=requester, 4432 download_target_extractor=download_target_extractor, 4433 config=config, 4434 file_writer=NoopFileWriter() 4435 if emit_connector_builder_messages 4436 else LocalFileSystemFileWriter(), 4437 parameters=model.parameters or {}, 4438 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4439 ) 4440 4441 return ( 4442 ConnectorBuilderFileUploader(file_uploader) 4443 if emit_connector_builder_messages 4444 else file_uploader 4445 ) 4446 4447 def create_moving_window_call_rate_policy( 4448 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4449 ) -> MovingWindowCallRatePolicy: 4450 rates = [ 4451 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4452 ] 4453 matchers = [ 4454 self._create_component_from_model(model=matcher, config=config) 4455 for matcher in model.matchers 4456 ] 4457 
return MovingWindowCallRatePolicy( 4458 rates=rates, 4459 matchers=matchers, 4460 ) 4461 4462 def create_unlimited_call_rate_policy( 4463 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4464 ) -> UnlimitedCallRatePolicy: 4465 matchers = [ 4466 self._create_component_from_model(model=matcher, config=config) 4467 for matcher in model.matchers 4468 ] 4469 4470 return UnlimitedCallRatePolicy( 4471 matchers=matchers, 4472 ) 4473 4474 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4475 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4476 return Rate( 4477 limit=int(interpolated_limit.eval(config=config)), 4478 interval=parse_duration(model.interval), 4479 ) 4480 4481 def create_http_request_matcher( 4482 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4483 ) -> HttpRequestRegexMatcher: 4484 weight = model.weight 4485 if weight is not None: 4486 if isinstance(weight, str): 4487 weight = int(InterpolatedString.create(weight, parameters={}).eval(config)) 4488 else: 4489 weight = int(weight) 4490 if weight < 1: 4491 raise ValueError(f"weight must be >= 1, got {weight}") 4492 return HttpRequestRegexMatcher( 4493 method=model.method, 4494 url_base=model.url_base, 4495 url_path_pattern=model.url_path_pattern, 4496 params=model.params, 4497 headers=model.headers, 4498 weight=weight, 4499 ) 4500 4501 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4502 self._api_budget = self.create_component( 4503 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4504 ) 4505 4506 def create_grouping_partition_router( 4507 self, 4508 model: GroupingPartitionRouterModel, 4509 config: Config, 4510 *, 4511 stream_name: str, 4512 **kwargs: Any, 4513 ) -> GroupingPartitionRouter: 4514 underlying_router = self._create_component_from_model( 4515 model=model.underlying_partition_router, 4516 config=config, 
4517 stream_name=stream_name, 4518 **kwargs, 4519 ) 4520 if model.group_size < 1: 4521 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4522 4523 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4524 # because they are specific to individual partitions and cannot be aggregated or handled 4525 # when grouping, potentially leading to incorrect API calls. Any request customization 4526 # should be managed at the stream level through the requester's configuration. 4527 if isinstance(underlying_router, SubstreamPartitionRouter): 4528 if any( 4529 parent_config.request_option 4530 for parent_config in underlying_router.parent_stream_configs 4531 ): 4532 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4533 4534 if isinstance(underlying_router, ListPartitionRouter): 4535 if underlying_router.request_option: 4536 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4537 4538 return GroupingPartitionRouter( 4539 group_size=model.group_size, 4540 underlying_partition_router=underlying_router, 4541 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4542 config=config, 4543 ) 4544 4545 def _ensure_query_properties_to_model( 4546 self, requester: Union[HttpRequesterModel, CustomRequesterModel] 4547 ) -> None: 4548 """ 4549 For some reason, it seems like CustomRequesterModel request_parameters stays as dictionaries which means that 4550 the other conditions relying on it being QueryPropertiesModel instead of a dict fail. Here, we migrate them to 4551 proper model. 
4552 """ 4553 if not hasattr(requester, "request_parameters"): 4554 return 4555 4556 request_parameters = requester.request_parameters 4557 if request_parameters and isinstance(request_parameters, Dict): 4558 for request_parameter_key in request_parameters.keys(): 4559 request_parameter = request_parameters[request_parameter_key] 4560 if ( 4561 isinstance(request_parameter, Dict) 4562 and request_parameter.get("type") == "QueryProperties" 4563 ): 4564 request_parameters[request_parameter_key] = QueryPropertiesModel.parse_obj( 4565 request_parameter 4566 ) 4567 4568 def _get_catalog_defined_cursor_field( 4569 self, stream_name: str, allow_catalog_defined_cursor_field: bool 4570 ) -> Optional[CursorField]: 4571 if not allow_catalog_defined_cursor_field: 4572 return None 4573 4574 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 4575 4576 # Depending on the operation is being performed, there may not be a configured stream yet. In this 4577 # case we return None which will then use the default cursor field defined on the cursor model. 4578 # We also treat cursor_field: [""] (list with empty string) as no cursor field, since this can 4579 # occur when the platform serializes "no cursor configured" streams incorrectly. 4580 if ( 4581 not configured_stream 4582 or not configured_stream.cursor_field 4583 or not configured_stream.cursor_field[0] 4584 ): 4585 return None 4586 elif len(configured_stream.cursor_field) > 1: 4587 raise ValueError( 4588 f"The `{stream_name}` stream does not support nested cursor_field. Please specify only a single cursor_field for the stream in the configured catalog." 4589 ) 4590 else: 4591 return CursorField( 4592 cursor_field_key=configured_stream.cursor_field[0], 4593 supports_catalog_defined_cursor_field=allow_catalog_defined_cursor_field, 4594 )
689class ModelToComponentFactory: 690 EPOCH_DATETIME_FORMAT = "%s" 691 692 def __init__( 693 self, 694 limit_pages_fetched_per_slice: Optional[int] = None, 695 limit_slices_fetched: Optional[int] = None, 696 emit_connector_builder_messages: bool = False, 697 disable_retries: bool = False, 698 disable_cache: bool = False, 699 message_repository: Optional[MessageRepository] = None, 700 connector_state_manager: Optional[ConnectorStateManager] = None, 701 max_concurrent_async_job_count: Optional[int] = None, 702 configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, 703 api_budget: Optional[APIBudget] = None, 704 ): 705 self._init_mappings() 706 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 707 self._limit_slices_fetched = limit_slices_fetched 708 self._emit_connector_builder_messages = emit_connector_builder_messages 709 self._disable_retries = disable_retries 710 self._disable_cache = disable_cache 711 self._message_repository = message_repository or InMemoryMessageRepository( 712 self._evaluate_log_level(emit_connector_builder_messages) 713 ) 714 self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream( 715 configured_catalog 716 ) 717 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 718 self._api_budget: Optional[Union[APIBudget]] = api_budget 719 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 720 # placeholder for deprecation warnings 721 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 722 723 def _init_mappings(self) -> None: 724 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 725 AddedFieldDefinitionModel: self.create_added_field_definition, 726 AddFieldsModel: self.create_add_fields, 727 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 728 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 729 BearerAuthenticatorModel: 
self.create_bearer_authenticator, 730 CheckStreamModel: self.create_check_stream, 731 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 732 CheckDynamicStreamModel: self.create_check_dynamic_stream, 733 CompositeErrorHandlerModel: self.create_composite_error_handler, 734 ConcurrencyLevelModel: self.create_concurrency_level, 735 ConfigMigrationModel: self.create_config_migration, 736 ConfigAddFieldsModel: self.create_config_add_fields, 737 ConfigRemapFieldModel: self.create_config_remap_field, 738 ConfigRemoveFieldsModel: self.create_config_remove_fields, 739 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 740 CsvDecoderModel: self.create_csv_decoder, 741 CursorPaginationModel: self.create_cursor_pagination, 742 CustomAuthenticatorModel: self.create_custom_component, 743 CustomBackoffStrategyModel: self.create_custom_component, 744 CustomDecoderModel: self.create_custom_component, 745 CustomErrorHandlerModel: self.create_custom_component, 746 CustomRecordExtractorModel: self.create_custom_component, 747 CustomRecordFilterModel: self.create_custom_component, 748 CustomRequesterModel: self.create_custom_component, 749 CustomRetrieverModel: self.create_custom_component, 750 CustomSchemaLoader: self.create_custom_component, 751 CustomSchemaNormalizationModel: self.create_custom_component, 752 CustomStateMigration: self.create_custom_component, 753 CustomPaginationStrategyModel: self.create_custom_component, 754 CustomPartitionRouterModel: self.create_custom_component, 755 CustomTransformationModel: self.create_custom_component, 756 CustomValidationStrategyModel: self.create_custom_component, 757 CustomConfigTransformationModel: self.create_custom_component, 758 DeclarativeStreamModel: self.create_default_stream, 759 DefaultErrorHandlerModel: self.create_default_error_handler, 760 DefaultPaginatorModel: self.create_default_paginator, 761 DpathExtractorModel: self.create_dpath_extractor, 762 DpathValidatorModel: 
self.create_dpath_validator, 763 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 764 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 765 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 766 GroupByKeyMergeStrategyModel: self.create_group_by_key, 767 HttpRequesterModel: self.create_http_requester, 768 HttpResponseFilterModel: self.create_http_response_filter, 769 InlineSchemaLoaderModel: self.create_inline_schema_loader, 770 JsonDecoderModel: self.create_json_decoder, 771 JsonItemsDecoderModel: self.create_json_items_decoder, 772 JsonlDecoderModel: self.create_jsonl_decoder, 773 JsonSchemaPropertySelectorModel: self.create_json_schema_property_selector, 774 GzipDecoderModel: self.create_gzip_decoder, 775 KeysToLowerModel: self.create_keys_to_lower_transformation, 776 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 777 KeysReplaceModel: self.create_keys_replace_transformation, 778 FlattenFieldsModel: self.create_flatten_fields, 779 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 780 IterableDecoderModel: self.create_iterable_decoder, 781 XmlDecoderModel: self.create_xml_decoder, 782 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 783 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 784 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 785 TypesMapModel: self.create_types_map, 786 ComplexFieldTypeModel: self.create_complex_field_type, 787 JwtAuthenticatorModel: self.create_jwt_authenticator, 788 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 789 ListPartitionRouterModel: self.create_list_partition_router, 790 MinMaxDatetimeModel: self.create_min_max_datetime, 791 NoAuthModel: self.create_no_auth, 792 NoPaginationModel: self.create_no_pagination, 793 OAuthAuthenticatorModel: self.create_oauth_authenticator, 794 OffsetIncrementModel: self.create_offset_increment, 795 
PageIncrementModel: self.create_page_increment, 796 ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper, 797 PredicateValidatorModel: self.create_predicate_validator, 798 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 799 PropertyChunkingModel: self.create_property_chunking, 800 QueryPropertiesModel: self.create_query_properties, 801 RecordExpanderModel: self.create_record_expander, 802 RecordFilterModel: self.create_record_filter, 803 RecordSelectorModel: self.create_record_selector, 804 RemoveFieldsModel: self.create_remove_fields, 805 RequestPathModel: self.create_request_path, 806 RequestOptionModel: self.create_request_option, 807 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 808 SelectiveAuthenticatorModel: self.create_selective_authenticator, 809 SimpleRetrieverModel: self.create_simple_retriever, 810 StateDelegatingStreamModel: self.create_state_delegating_stream, 811 SpecModel: self.create_spec, 812 SubstreamPartitionRouterModel: self.create_substream_partition_router, 813 ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema, 814 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 815 WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, 816 AsyncRetrieverModel: self.create_async_retriever, 817 HttpComponentsResolverModel: self.create_http_components_resolver, 818 ConfigComponentsResolverModel: self.create_config_components_resolver, 819 ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver, 820 StreamConfigModel: self.create_stream_config, 821 ComponentMappingDefinitionModel: self.create_components_mapping_definition, 822 ZipfileDecoderModel: self.create_zipfile_decoder, 823 HTTPAPIBudgetModel: self.create_http_api_budget, 824 FileUploaderModel: self.create_file_uploader, 825 FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, 826 MovingWindowCallRatePolicyModel: 
self.create_moving_window_call_rate_policy, 827 UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, 828 RateModel: self.create_rate, 829 HttpRequestRegexMatcherModel: self.create_http_request_matcher, 830 GroupingPartitionRouterModel: self.create_grouping_partition_router, 831 } 832 833 # Needed for the case where we need to perform a second parse on the fields of a custom component 834 self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR} 835 836 @staticmethod 837 def _create_stream_name_to_configured_stream( 838 configured_catalog: Optional[ConfiguredAirbyteCatalog], 839 ) -> Mapping[str, ConfiguredAirbyteStream]: 840 return ( 841 {stream.stream.name: stream for stream in configured_catalog.streams} 842 if configured_catalog 843 else {} 844 ) 845 846 def create_component( 847 self, 848 model_type: Type[BaseModel], 849 component_definition: ComponentDefinition, 850 config: Config, 851 **kwargs: Any, 852 ) -> Any: 853 """ 854 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 855 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 856 creating declarative components from that model. 
857 858 :param model_type: The type of declarative component that is being initialized 859 :param component_definition: The mapping that represents a declarative component 860 :param config: The connector config that is provided by the customer 861 :return: The declarative component to be used at runtime 862 """ 863 864 component_type = component_definition.get("type") 865 if component_definition.get("type") != model_type.__name__: 866 raise ValueError( 867 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 868 ) 869 870 declarative_component_model = model_type.parse_obj(component_definition) 871 872 if not isinstance(declarative_component_model, model_type): 873 raise ValueError( 874 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 875 ) 876 877 return self._create_component_from_model( 878 model=declarative_component_model, config=config, **kwargs 879 ) 880 881 def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: 882 if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: 883 raise ValueError( 884 f"{model.__class__} with attributes {model} is not a valid component type" 885 ) 886 component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) 887 if not component_constructor: 888 raise ValueError(f"Could not find constructor for {model.__class__}") 889 890 # collect deprecation warnings for supported models. 891 if isinstance(model, BaseModelWithDeprecations): 892 self._collect_model_deprecations(model) 893 894 return component_constructor(model=model, config=config, **kwargs) 895 896 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 897 """ 898 Returns the deprecation warnings that were collected during the creation of components. 
899 """ 900 return self._collected_deprecation_logs 901 902 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 903 """ 904 Collects deprecation logs from the given model and appends any new logs to the internal collection. 905 906 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 907 908 Args: 909 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 910 """ 911 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 912 for log in model._deprecation_logs: 913 # avoid duplicates for deprecation logs observed. 914 if log not in self._collected_deprecation_logs: 915 self._collected_deprecation_logs.append(log) 916 917 def create_config_migration( 918 self, model: ConfigMigrationModel, config: Config 919 ) -> ConfigMigration: 920 transformations: List[ConfigTransformation] = [ 921 self._create_component_from_model(transformation, config) 922 for transformation in model.transformations 923 ] 924 925 return ConfigMigration( 926 description=model.description, 927 transformations=transformations, 928 ) 929 930 def create_config_add_fields( 931 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 932 ) -> ConfigAddFields: 933 fields = [self._create_component_from_model(field, config) for field in model.fields] 934 return ConfigAddFields( 935 fields=fields, 936 condition=model.condition or "", 937 ) 938 939 @staticmethod 940 def create_config_remove_fields( 941 model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any 942 ) -> ConfigRemoveFields: 943 return ConfigRemoveFields( 944 field_pointers=model.field_pointers, 945 condition=model.condition or "", 
@staticmethod
def create_added_field_definition(
    model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
) -> AddedFieldDefinition:
    """
    Builds an AddedFieldDefinition from its model.

    :param model: The model holding the field path, the value template and the optional value type
    :param config: The connector config (accepted for constructor-signature uniformity, not read here)
    :return: The definition with its value wrapped as an interpolated string and its value type
        translated through `_json_schema_type_name_to_type`
    """
    value_template = InterpolatedString.create(model.value, parameters=model.parameters or {})
    python_type = ModelToComponentFactory._json_schema_type_name_to_type(model.value_type)

    return AddedFieldDefinition(
        path=model.path,
        value=value_template,
        value_type=python_type,
        parameters=model.parameters or {},
    )
def create_dpath_flatten_fields(
    self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
) -> DpathFlattenFields:
    """
    Builds a DpathFlattenFields transformation from its model.

    :param model: The model describing the field path to flatten and the optional key transformation
    :param config: The connector config that is provided by the customer
    :return: The transformation; `delete_origin_value` and `replace_record` fall back to False when
        the model leaves them unset, and `key_transformation` is None when the model has none
    """
    field_path: List[Union[InterpolatedString, str]] = list(model.field_path)

    key_transformation = None
    if model.key_transformation is not None:
        key_transformation = KeyTransformation(
            config=config,
            prefix=model.key_transformation.prefix,
            suffix=model.key_transformation.suffix,
            parameters=model.parameters or {},
        )

    if model.delete_origin_value is None:
        delete_origin_value = False
    else:
        delete_origin_value = model.delete_origin_value

    if model.replace_record is None:
        replace_record = False
    else:
        replace_record = model.replace_record

    return DpathFlattenFields(
        config=config,
        field_path=field_path,
        delete_origin_value=delete_origin_value,
        replace_record=replace_record,
        key_transformation=key_transformation,
        parameters=model.parameters or {},
    )
def create_legacy_to_per_partition_state_migration(
    self,
    model: LegacyToPerPartitionStateMigrationModel,
    config: Mapping[str, Any],
    declarative_stream: DeclarativeStreamModel,
) -> LegacyToPerPartitionStateMigration:
    """
    Builds the state migration that converts legacy state into per-partition state.

    :param model: The migration model (not read here; the migration is configured from the stream)
    :param config: The connector config that is provided by the customer
    :param declarative_stream: The stream model the migration applies to
    :return: The migration built from the stream's partition router, incremental sync, config and parameters
    :raises ValueError: If the retriever is not a SimpleRetriever or AsyncRetriever model, if the partition
        router is not a substream or custom partition router model, if the router has no
        `parent_stream_configs`, or if the stream has no incremental sync configured
    """
    retriever = declarative_stream.retriever
    if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
        raise ValueError(
            f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
        )
    partition_router = retriever.partition_router
    if not isinstance(
        partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
    ):
        raise ValueError(
            f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
        )
    if not hasattr(partition_router, "parent_stream_configs"):
        raise ValueError(
            "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
        )

    # `hasattr` alone is satisfied by a declared-but-unset attribute, so an unset (None) incremental_sync
    # would slip through and be handed to the migration. Reject both the missing and the None case here.
    # NOTE(review): presumably `incremental_sync` is an Optional field on DeclarativeStreamModel - confirm.
    if getattr(declarative_stream, "incremental_sync", None) is None:
        raise ValueError(
            "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
        )

    return LegacyToPerPartitionStateMigration(
        partition_router,  # type: ignore # was already checked above
        declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
        config,
        declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
    )
@staticmethod
def create_basic_http_authenticator(
    model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
) -> BasicHttpAuthenticator:
    """
    Builds a BasicHttpAuthenticator from its model.

    :param model: The model holding the username, the optional password and the optional parameters
    :param config: The connector config that is provided by the customer
    :return: The authenticator; an unset password becomes an empty string and unset parameters an empty dict
    """
    password = model.password or ""
    username = model.username
    parameters = model.parameters or {}

    return BasicHttpAuthenticator(
        password=password,
        username=username,
        config=config,
        parameters=parameters,
    )
def create_check_stream(
    self, model: CheckStreamModel, config: Config, **kwargs: Any
) -> CheckStream:
    """
    Builds a CheckStream from its model.

    :param model: The model listing the stream names and/or the dynamic stream check configs
    :param config: The connector config that is provided by the customer
    :return: The CheckStream with one built component per dynamic stream check config
    :raises ValueError: If both `stream_names` and `dynamic_streams_check_configs` are None
    """
    dynamic_config_models = model.dynamic_streams_check_configs
    if dynamic_config_models is None and model.stream_names is None:
        raise ValueError(
            "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
        )

    built_check_configs = []
    for dynamic_config_model in dynamic_config_models or []:
        built_check_configs.append(
            self._create_component_from_model(model=dynamic_config_model, config=config)
        )

    return CheckStream(
        stream_names=model.stream_names or [],
        dynamic_streams_check_configs=built_check_configs,
        parameters={},
    )
@staticmethod
def create_concurrency_level(
    model: ConcurrencyLevelModel, config: Config, **kwargs: Any
) -> ConcurrencyLevel:
    """
    Builds a ConcurrencyLevel from its model.

    :param model: The model holding the default and the maximum concurrency
    :param config: The connector config that is provided by the customer
    :return: The ConcurrencyLevel, always created with empty parameters
    """
    default_level = model.default_concurrency
    upper_bound = model.max_concurrency

    return ConcurrencyLevel(
        default_concurrency=default_level,
        max_concurrency=upper_bound,
        config=config,
        parameters={},
    )
1325 stream_state = dict(state_migration.migrate(stream_state)) 1326 return stream_state 1327 1328 def create_concurrent_cursor_from_datetime_based_cursor( 1329 self, 1330 model_type: Type[BaseModel], 1331 component_definition: ComponentDefinition, 1332 stream_name: str, 1333 stream_namespace: Optional[str], 1334 stream_state: MutableMapping[str, Any], 1335 config: Config, 1336 message_repository: Optional[MessageRepository] = None, 1337 runtime_lookback_window: Optional[datetime.timedelta] = None, 1338 **kwargs: Any, 1339 ) -> ConcurrentCursor: 1340 component_type = component_definition.get("type") 1341 if component_definition.get("type") != model_type.__name__: 1342 raise ValueError( 1343 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1344 ) 1345 1346 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1347 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1348 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1349 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 
1350 if "$parameters" not in component_definition and "parameters" in component_definition: 1351 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1352 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1353 1354 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1355 raise ValueError( 1356 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1357 ) 1358 1359 model_parameters = datetime_based_cursor_model.parameters or {} 1360 1361 cursor_field = self._get_catalog_defined_cursor_field( 1362 stream_name=stream_name, 1363 allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1364 or False, 1365 ) 1366 1367 if not cursor_field: 1368 interpolated_cursor_field = InterpolatedString.create( 1369 datetime_based_cursor_model.cursor_field, 1370 parameters=model_parameters, 1371 ) 1372 cursor_field = CursorField( 1373 cursor_field_key=interpolated_cursor_field.eval(config=config), 1374 supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1375 or False, 1376 ) 1377 1378 interpolated_partition_field_start = InterpolatedString.create( 1379 datetime_based_cursor_model.partition_field_start or "start_time", 1380 parameters=model_parameters, 1381 ) 1382 interpolated_partition_field_end = InterpolatedString.create( 1383 datetime_based_cursor_model.partition_field_end or "end_time", 1384 parameters=model_parameters, 1385 ) 1386 1387 slice_boundary_fields = ( 1388 interpolated_partition_field_start.eval(config=config), 1389 interpolated_partition_field_end.eval(config=config), 1390 ) 1391 1392 datetime_format = datetime_based_cursor_model.datetime_format 1393 1394 cursor_granularity = ( 1395 parse_duration(datetime_based_cursor_model.cursor_granularity) 1396 if datetime_based_cursor_model.cursor_granularity 1397 else None 1398 ) 1399 1400 
lookback_window = None 1401 interpolated_lookback_window = ( 1402 InterpolatedString.create( 1403 datetime_based_cursor_model.lookback_window, 1404 parameters=model_parameters, 1405 ) 1406 if datetime_based_cursor_model.lookback_window 1407 else None 1408 ) 1409 if interpolated_lookback_window: 1410 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1411 if evaluated_lookback_window: 1412 lookback_window = parse_duration(evaluated_lookback_window) 1413 1414 connector_state_converter: DateTimeStreamStateConverter 1415 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1416 datetime_format=datetime_format, 1417 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1418 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1419 cursor_granularity=cursor_granularity, 1420 ) 1421 1422 # Adjusts the stream state by applying the runtime lookback window. 1423 # This is used to ensure correct state handling in case of failed partitions. 
1424 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1425 if runtime_lookback_window and stream_state_value: 1426 new_stream_state = ( 1427 connector_state_converter.parse_timestamp(stream_state_value) 1428 - runtime_lookback_window 1429 ) 1430 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1431 new_stream_state 1432 ) 1433 1434 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1435 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1436 start_date_runtime_value = self.create_min_max_datetime( 1437 model=datetime_based_cursor_model.start_datetime, config=config 1438 ) 1439 else: 1440 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1441 1442 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1443 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1444 end_date_runtime_value = self.create_min_max_datetime( 1445 model=datetime_based_cursor_model.end_datetime, config=config 1446 ) 1447 else: 1448 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1449 1450 interpolated_start_date = MinMaxDatetime.create( 1451 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1452 parameters=datetime_based_cursor_model.parameters, 1453 ) 1454 interpolated_end_date = ( 1455 None 1456 if not end_date_runtime_value 1457 else MinMaxDatetime.create( 1458 end_date_runtime_value, datetime_based_cursor_model.parameters 1459 ) 1460 ) 1461 1462 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1463 if not interpolated_start_date.datetime_format: 1464 interpolated_start_date.datetime_format = datetime_format 1465 if interpolated_end_date and not interpolated_end_date.datetime_format: 1466 interpolated_end_date.datetime_format = datetime_format 1467 1468 start_date = interpolated_start_date.get_datetime(config=config) 1469 
end_date_provider = ( 1470 partial(interpolated_end_date.get_datetime, config) 1471 if interpolated_end_date 1472 else connector_state_converter.get_end_provider() 1473 ) 1474 1475 if ( 1476 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1477 ) or ( 1478 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1479 ): 1480 raise ValueError( 1481 f"If step is defined, cursor_granularity should be as well and vice-versa. " 1482 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1483 ) 1484 1485 # When step is not defined, default to a step size from the starting date to the present moment 1486 step_length = datetime.timedelta.max 1487 interpolated_step = ( 1488 InterpolatedString.create( 1489 datetime_based_cursor_model.step, 1490 parameters=model_parameters, 1491 ) 1492 if datetime_based_cursor_model.step 1493 else None 1494 ) 1495 if interpolated_step: 1496 evaluated_step = interpolated_step.eval(config) 1497 if evaluated_step: 1498 step_length = parse_duration(evaluated_step) 1499 1500 clamping_strategy: ClampingStrategy = NoClamping() 1501 if datetime_based_cursor_model.clamping: 1502 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1503 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1504 # object which we want to keep agnostic of being low-code 1505 target = InterpolatedString( 1506 string=datetime_based_cursor_model.clamping.target, 1507 parameters=model_parameters, 1508 ) 1509 evaluated_target = target.eval(config=config) 1510 match evaluated_target: 1511 case "DAY": 1512 clamping_strategy = DayClampingStrategy() 1513 end_date_provider = ClampingEndProvider( 1514 DayClampingStrategy(is_ceiling=False), 1515 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown 
in existing tests. Confirmed functionality is working in practice 1516 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1517 ) 1518 case "WEEK": 1519 if ( 1520 not datetime_based_cursor_model.clamping.target_details 1521 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1522 ): 1523 raise ValueError( 1524 "Given WEEK clamping, weekday needs to be provided as target_details" 1525 ) 1526 weekday = self._assemble_weekday( 1527 datetime_based_cursor_model.clamping.target_details["weekday"] 1528 ) 1529 clamping_strategy = WeekClampingStrategy(weekday) 1530 end_date_provider = ClampingEndProvider( 1531 WeekClampingStrategy(weekday, is_ceiling=False), 1532 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1533 granularity=cursor_granularity or datetime.timedelta(days=1), 1534 ) 1535 case "MONTH": 1536 clamping_strategy = MonthClampingStrategy() 1537 end_date_provider = ClampingEndProvider( 1538 MonthClampingStrategy(is_ceiling=False), 1539 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1540 granularity=cursor_granularity or datetime.timedelta(days=1), 1541 ) 1542 case _: 1543 raise ValueError( 1544 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1545 ) 1546 1547 return ConcurrentCursor( 1548 stream_name=stream_name, 1549 stream_namespace=stream_namespace, 1550 stream_state=stream_state, 1551 message_repository=message_repository or self._message_repository, 1552 connector_state_manager=self._connector_state_manager, 1553 connector_state_converter=connector_state_converter, 1554 cursor_field=cursor_field, 1555 slice_boundary_fields=slice_boundary_fields, 1556 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1557 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1558 lookback_window=lookback_window, 1559 slice_range=step_length, 1560 cursor_granularity=cursor_granularity, 1561 clamping_strategy=clamping_strategy, 1562 ) 1563 1564 def create_concurrent_cursor_from_incrementing_count_cursor( 1565 self, 1566 model_type: Type[BaseModel], 1567 component_definition: ComponentDefinition, 1568 stream_name: str, 1569 stream_namespace: Optional[str], 1570 stream_state: MutableMapping[str, Any], 1571 config: Config, 1572 message_repository: Optional[MessageRepository] = None, 1573 **kwargs: Any, 1574 ) -> ConcurrentCursor: 1575 component_type = component_definition.get("type") 1576 if component_definition.get("type") != model_type.__name__: 1577 raise ValueError( 1578 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1579 ) 1580 1581 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1582 1583 if not isinstance(incrementing_count_cursor_model, 
IncrementingCountCursorModel): 1584 raise ValueError( 1585 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1586 ) 1587 1588 start_value: Union[int, str, None] = incrementing_count_cursor_model.start_value 1589 # Pydantic Union type coercion can convert int 0 to string '0' depending on Union order. 1590 # We need to handle both int and str representations of numeric values. 1591 # Evaluate the InterpolatedString and convert to int for the ConcurrentCursor. 1592 if start_value is not None: 1593 interpolated_start_value = InterpolatedString.create( 1594 str(start_value), # Ensure we pass a string to InterpolatedString.create 1595 parameters=incrementing_count_cursor_model.parameters or {}, 1596 ) 1597 evaluated_start_value: int = int(interpolated_start_value.eval(config=config)) 1598 else: 1599 evaluated_start_value = 0 1600 1601 cursor_field = self._get_catalog_defined_cursor_field( 1602 stream_name=stream_name, 1603 allow_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field 1604 or False, 1605 ) 1606 1607 if not cursor_field: 1608 interpolated_cursor_field = InterpolatedString.create( 1609 incrementing_count_cursor_model.cursor_field, 1610 parameters=incrementing_count_cursor_model.parameters or {}, 1611 ) 1612 cursor_field = CursorField( 1613 cursor_field_key=interpolated_cursor_field.eval(config=config), 1614 supports_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field 1615 or False, 1616 ) 1617 1618 connector_state_converter = IncrementingCountStreamStateConverter( 1619 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1620 ) 1621 1622 return ConcurrentCursor( 1623 stream_name=stream_name, 1624 stream_namespace=stream_namespace, 1625 stream_state=stream_state, 1626 message_repository=message_repository or self._message_repository, 1627 
connector_state_manager=self._connector_state_manager, 1628 connector_state_converter=connector_state_converter, 1629 cursor_field=cursor_field, 1630 slice_boundary_fields=None, 1631 start=evaluated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1632 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1633 ) 1634 1635 def _assemble_weekday(self, weekday: str) -> Weekday: 1636 match weekday: 1637 case "MONDAY": 1638 return Weekday.MONDAY 1639 case "TUESDAY": 1640 return Weekday.TUESDAY 1641 case "WEDNESDAY": 1642 return Weekday.WEDNESDAY 1643 case "THURSDAY": 1644 return Weekday.THURSDAY 1645 case "FRIDAY": 1646 return Weekday.FRIDAY 1647 case "SATURDAY": 1648 return Weekday.SATURDAY 1649 case "SUNDAY": 1650 return Weekday.SUNDAY 1651 case _: 1652 raise ValueError(f"Unknown weekday {weekday}") 1653 1654 def create_concurrent_cursor_from_perpartition_cursor( 1655 self, 1656 state_manager: ConnectorStateManager, 1657 model_type: Type[BaseModel], 1658 component_definition: ComponentDefinition, 1659 stream_name: str, 1660 stream_namespace: Optional[str], 1661 config: Config, 1662 stream_state: MutableMapping[str, Any], 1663 partition_router: PartitionRouter, 1664 attempt_to_create_cursor_if_not_provided: bool = False, 1665 **kwargs: Any, 1666 ) -> ConcurrentPerPartitionCursor: 1667 component_type = component_definition.get("type") 1668 if component_definition.get("type") != model_type.__name__: 1669 raise ValueError( 1670 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1671 ) 1672 1673 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. 
This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1674 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1675 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1676 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1677 if "$parameters" not in component_definition and "parameters" in component_definition: 1678 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1679 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1680 1681 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1682 raise ValueError( 1683 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1684 ) 1685 1686 cursor_field = self._get_catalog_defined_cursor_field( 1687 stream_name=stream_name, 1688 allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1689 or False, 1690 ) 1691 1692 if not cursor_field: 1693 interpolated_cursor_field = InterpolatedString.create( 1694 datetime_based_cursor_model.cursor_field, 1695 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). 
So now we have two cases: 1696 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1697 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1698 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1699 parameters=datetime_based_cursor_model.parameters or {}, 1700 ) 1701 cursor_field = CursorField( 1702 cursor_field_key=interpolated_cursor_field.eval(config=config), 1703 supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field 1704 or False, 1705 ) 1706 1707 datetime_format = datetime_based_cursor_model.datetime_format 1708 1709 cursor_granularity = ( 1710 parse_duration(datetime_based_cursor_model.cursor_granularity) 1711 if datetime_based_cursor_model.cursor_granularity 1712 else None 1713 ) 1714 1715 connector_state_converter: DateTimeStreamStateConverter 1716 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1717 datetime_format=datetime_format, 1718 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1719 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1720 cursor_granularity=cursor_granularity, 1721 ) 1722 1723 # Create the cursor factory 1724 cursor_factory = ConcurrentCursorFactory( 1725 partial( 1726 self.create_concurrent_cursor_from_datetime_based_cursor, 1727 state_manager=state_manager, 1728 model_type=model_type, 1729 component_definition=component_definition, 1730 stream_name=stream_name, 1731 stream_namespace=stream_namespace, 1732 config=config, 1733 message_repository=NoopMessageRepository(), 1734 ) 1735 ) 1736 1737 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1738 use_global_cursor = isinstance( 1739 partition_router, GroupingPartitionRouter 1740 ) or 
def create_cursor_pagination(
    self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any
) -> CursorPaginationStrategy:
    """
    Creates the CursorPaginationStrategy described by a CursorPaginationModel.

    :param model: The Pydantic model of the cursor pagination strategy being created
    :param config: The custom defined connector config
    :param decoder: The decoder for paginated responses. It is wrapped in a PaginationDecoderDecorator unless it
        already is one
    :return: The cursor pagination strategy built from the Pydantic model to be used at runtime
    :raises ValueError: If the underlying (unwrapped) decoder is not supported for pagination
    """
    if isinstance(decoder, PaginationDecoderDecorator):
        inner_decoder = decoder.decoder
        decoder_to_use = decoder
    else:
        inner_decoder = decoder
        decoder_to_use = PaginationDecoderDecorator(decoder=decoder)

    # Support is decided on the underlying decoder, not on the pagination wrapper around it
    if not self._is_supported_decoder_for_pagination(inner_decoder):
        raise ValueError(
            self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
        )

    # Pydantic v1 Union type coercion can convert int to string depending on Union order.
    # If page_size is a string that represents an integer (not an interpolation), convert it back.
    # `isdecimal()` is used instead of `isdigit()`: the latter also accepts characters such as "²"
    # which `int()` rejects with a ValueError.
    page_size = model.page_size
    if isinstance(page_size, str) and page_size.isdecimal():
        page_size = int(page_size)

    return CursorPaginationStrategy(
        cursor_value=model.cursor_value,
        decoder=decoder_to_use,
        page_size=page_size,
        stop_condition=model.stop_condition,
        config=config,
        parameters=model.parameters or {},
    )
Pydantic can only parse down to 1825 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1826 for model_field, model_value in model_args.items(): 1827 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1828 if ( 1829 isinstance(model_value, dict) 1830 and "type" not in model_value 1831 and model_field in component_fields 1832 ): 1833 derived_type = self._derive_component_type_from_type_hints( 1834 component_fields.get(model_field) 1835 ) 1836 if derived_type: 1837 model_value["type"] = derived_type 1838 1839 if self._is_component(model_value): 1840 model_args[model_field] = self._create_nested_component( 1841 model, 1842 model_field, 1843 model_value, 1844 config, 1845 **kwargs, 1846 ) 1847 elif isinstance(model_value, list): 1848 vals = [] 1849 for v in model_value: 1850 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1851 derived_type = self._derive_component_type_from_type_hints( 1852 component_fields.get(model_field) 1853 ) 1854 if derived_type: 1855 v["type"] = derived_type 1856 if self._is_component(v): 1857 vals.append( 1858 self._create_nested_component( 1859 model, 1860 model_field, 1861 v, 1862 config, 1863 **kwargs, 1864 ) 1865 ) 1866 else: 1867 vals.append(v) 1868 model_args[model_field] = vals 1869 1870 kwargs = { 1871 class_field: model_args[class_field] 1872 for class_field in component_fields.keys() 1873 if class_field in model_args 1874 } 1875 1876 if "api_budget" in component_fields and kwargs.get("api_budget") is None: 1877 kwargs["api_budget"] = self._api_budget 1878 1879 return custom_component_class(**kwargs) 1880 1881 @staticmethod 1882 def _get_class_from_fully_qualified_class_name( 1883 full_qualified_class_name: str, 1884 ) -> Any: 1885 """Get a class from its fully qualified name. 
1886 1887 If a custom components module is needed, we assume it is already registered - probably 1888 as `source_declarative_manifest.components` or `components`. 1889 1890 Args: 1891 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1892 1893 Returns: 1894 Any: The class object. 1895 1896 Raises: 1897 ValueError: If the class cannot be loaded. 1898 """ 1899 split = full_qualified_class_name.split(".") 1900 module_name_full = ".".join(split[:-1]) 1901 class_name = split[-1] 1902 1903 try: 1904 module_ref = importlib.import_module(module_name_full) 1905 except ModuleNotFoundError as e: 1906 if split[0] == "source_declarative_manifest": 1907 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1908 try: 1909 import os 1910 1911 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1912 module_ref = importlib.import_module( 1913 module_name_with_source_declarative_manifest 1914 ) 1915 except ModuleNotFoundError: 1916 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1917 else: 1918 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1919 1920 try: 1921 return getattr(module_ref, class_name) 1922 except AttributeError as e: 1923 raise ValueError( 1924 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1925 ) from e 1926 1927 @staticmethod 1928 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1929 interface = field_type 1930 while True: 1931 origin = get_origin(interface) 1932 if origin: 1933 # Unnest types until we reach the raw type 1934 # List[T] -> T 1935 # Optional[List[T]] -> T 1936 args = get_args(interface) 1937 interface = args[0] 1938 else: 1939 break 1940 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 
1941 return interface.__name__ 1942 return None 1943 1944 @staticmethod 1945 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1946 if not cls: 1947 return False 1948 return cls.__module__ == "builtins" 1949 1950 @staticmethod 1951 def _extract_missing_parameters(error: TypeError) -> List[str]: 1952 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1953 if parameter_search: 1954 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1955 else: 1956 return [] 1957 1958 def _create_nested_component( 1959 self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any 1960 ) -> Any: 1961 type_name = model_value.get("type", None) 1962 if not type_name: 1963 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1964 return model_value 1965 1966 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1967 if model_type: 1968 parsed_model = model_type.parse_obj(model_value) 1969 try: 1970 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1971 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1972 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1973 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1974 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1975 # are needed by a component and could not be shared. 
1976 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1977 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1978 model_parameters = model_value.get("$parameters", {}) 1979 matching_parameters = { 1980 kwarg: model_parameters[kwarg] 1981 for kwarg in constructor_kwargs 1982 if kwarg in model_parameters 1983 } 1984 matching_kwargs = { 1985 kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs 1986 } 1987 return self._create_component_from_model( 1988 model=parsed_model, config=config, **(matching_parameters | matching_kwargs) 1989 ) 1990 except TypeError as error: 1991 missing_parameters = self._extract_missing_parameters(error) 1992 if missing_parameters: 1993 raise ValueError( 1994 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1995 + ", ".join( 1996 ( 1997 f"{type_name}.$parameters.{parameter}" 1998 for parameter in missing_parameters 1999 ) 2000 ) 2001 ) 2002 raise TypeError( 2003 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 2004 ) 2005 else: 2006 raise ValueError( 2007 f"Error creating custom component {model.class_name}. 
Subcomponent creation has not been implemented for '{type_name}'" 2008 ) 2009 2010 @staticmethod 2011 def _is_component(model_value: Any) -> bool: 2012 return isinstance(model_value, dict) and model_value.get("type") is not None 2013 2014 def create_default_stream( 2015 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 2016 ) -> AbstractStream: 2017 primary_key = model.primary_key.__root__ if model.primary_key else None 2018 self._migrate_state(model, config) 2019 2020 partition_router = self._build_stream_slicer_from_partition_router( 2021 model.retriever, 2022 config, 2023 stream_name=model.name, 2024 **kwargs, 2025 ) 2026 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 2027 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 2028 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 2029 2030 end_time_option = ( 2031 self._create_component_from_model( 2032 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 2033 ) 2034 if cursor_model.end_time_option 2035 else None 2036 ) 2037 start_time_option = ( 2038 self._create_component_from_model( 2039 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 2040 ) 2041 if cursor_model.start_time_option 2042 else None 2043 ) 2044 2045 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 2046 start_time_option=start_time_option, 2047 end_time_option=end_time_option, 2048 partition_field_start=cursor_model.partition_field_start, 2049 partition_field_end=cursor_model.partition_field_end, 2050 config=config, 2051 parameters=model.parameters or {}, 2052 ) 2053 request_options_provider = ( 2054 datetime_request_options_provider 2055 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 2056 else PerPartitionRequestOptionsProvider( 2057 partition_router, datetime_request_options_provider 2058 ) 2059 ) 2060 elif 
model.incremental_sync and isinstance( 2061 model.incremental_sync, IncrementingCountCursorModel 2062 ): 2063 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 2064 raise ValueError( 2065 "PerPartition does not support per partition states because switching to global state is time based" 2066 ) 2067 2068 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 2069 2070 start_time_option = ( 2071 self._create_component_from_model( 2072 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2073 config, 2074 parameters=cursor_model.parameters or {}, 2075 ) 2076 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2077 else None 2078 ) 2079 2080 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2081 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2082 partition_field_start = "start" 2083 2084 request_options_provider = DatetimeBasedRequestOptionsProvider( 2085 start_time_option=start_time_option, 2086 partition_field_start=partition_field_start, 2087 config=config, 2088 parameters=model.parameters or {}, 2089 ) 2090 else: 2091 request_options_provider = None 2092 2093 transformations = [] 2094 if model.transformations: 2095 for transformation_model in model.transformations: 2096 transformations.append( 2097 self._create_component_from_model(model=transformation_model, config=config) 2098 ) 2099 file_uploader = None 2100 if model.file_uploader: 2101 file_uploader = self._create_component_from_model( 2102 model=model.file_uploader, config=config 2103 ) 2104 2105 stream_slicer: ConcurrentStreamSlicer = ( 2106 partition_router 2107 if isinstance(concurrent_cursor, FinalStateCursor) 2108 else concurrent_cursor 2109 ) 2110 2111 retriever = self._create_component_from_model( 2112 model=model.retriever, 2113 config=config, 2114 name=model.name, 
2115 primary_key=primary_key, 2116 request_options_provider=request_options_provider, 2117 stream_slicer=stream_slicer, 2118 partition_router=partition_router, 2119 has_stop_condition_cursor=self._is_stop_condition_on_cursor(model), 2120 is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model), 2121 cursor=concurrent_cursor, 2122 transformations=transformations, 2123 file_uploader=file_uploader, 2124 incremental_sync=model.incremental_sync, 2125 ) 2126 if isinstance(retriever, AsyncRetriever): 2127 stream_slicer = retriever.stream_slicer 2128 2129 schema_loader: SchemaLoader 2130 if model.schema_loader and isinstance(model.schema_loader, list): 2131 nested_schema_loaders = [ 2132 self._create_component_from_model(model=nested_schema_loader, config=config) 2133 for nested_schema_loader in model.schema_loader 2134 ] 2135 schema_loader = CompositeSchemaLoader( 2136 schema_loaders=nested_schema_loaders, parameters={} 2137 ) 2138 elif model.schema_loader: 2139 schema_loader = self._create_component_from_model( 2140 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2141 config=config, 2142 ) 2143 else: 2144 options = model.parameters or {} 2145 if "name" not in options: 2146 options["name"] = model.name 2147 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2148 schema_loader = CachingSchemaLoaderDecorator(schema_loader) 2149 2150 stream_name = model.name or "" 2151 return DefaultStream( 2152 partition_generator=StreamSlicerPartitionGenerator( 2153 DeclarativePartitionFactory( 2154 stream_name, 2155 schema_loader, 2156 retriever, 2157 self._message_repository, 2158 ), 2159 stream_slicer, 2160 slice_limit=self._limit_slices_fetched, 2161 ), 2162 name=stream_name, 2163 json_schema=schema_loader.get_json_schema, 2164 primary_key=get_primary_key_from_stream(primary_key), 2165 cursor_field=( 2166 concurrent_cursor.cursor_field 2167 if 
hasattr(concurrent_cursor, "cursor_field") 2168 else None 2169 ), 2170 logger=logging.getLogger(f"airbyte.{stream_name}"), 2171 cursor=concurrent_cursor, 2172 supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), 2173 ) 2174 2175 def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: 2176 stream_name = model.name or "" 2177 stream_state = self._connector_state_manager.get_stream_state( 2178 stream_name=stream_name, namespace=None 2179 ) 2180 if model.state_migrations: 2181 state_transformations = [ 2182 self._create_component_from_model(state_migration, config, declarative_stream=model) 2183 for state_migration in model.state_migrations 2184 ] 2185 else: 2186 state_transformations = [] 2187 stream_state = self.apply_stream_state_migrations(state_transformations, stream_state) 2188 self._connector_state_manager.update_state_for_stream( 2189 stream_name=stream_name, namespace=None, value=stream_state 2190 ) 2191 2192 def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool: 2193 return bool( 2194 model.incremental_sync 2195 and hasattr(model.incremental_sync, "is_data_feed") 2196 and model.incremental_sync.is_data_feed 2197 ) 2198 2199 def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool: 2200 return bool( 2201 model.incremental_sync 2202 and hasattr(model.incremental_sync, "is_client_side_incremental") 2203 and model.incremental_sync.is_client_side_incremental 2204 ) 2205 2206 def _build_stream_slicer_from_partition_router( 2207 self, 2208 model: Union[ 2209 AsyncRetrieverModel, 2210 CustomRetrieverModel, 2211 SimpleRetrieverModel, 2212 ], 2213 config: Config, 2214 stream_name: Optional[str] = None, 2215 **kwargs: Any, 2216 ) -> PartitionRouter: 2217 if ( 2218 hasattr(model, "partition_router") 2219 and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel)) 2220 and model.partition_router 2221 ): 2222 stream_slicer_model = 
def _build_concurrent_cursor(
    self,
    model: DeclarativeStreamModel,
    stream_slicer: Optional[PartitionRouter],
    config: Config,
) -> Cursor:
    """
    Selects and creates the cursor of a stream from its incremental sync and partition router.

    :param model: The Pydantic model of the stream
    :param stream_slicer: The partition router of the stream, if any
    :param config: The custom defined connector config
    :return: A per-partition cursor when there is an incremental sync and a partition router that is not a
        SinglePartitionRouter, the cursor matching the incremental sync type when there is only an incremental
        sync, and a FinalStateCursor when there is no incremental sync
    :raises ValueError: If an IncrementingCountCursor is combined with partition routing, or if the incremental
        sync type is not supported
    """
    stream_name = model.name or ""
    stream_state = self._connector_state_manager.get_stream_state(stream_name, None)

    incremental_sync = model.incremental_sync
    if not incremental_sync:
        return FinalStateCursor(stream_name, None, self._message_repository)

    is_partitioned = bool(stream_slicer) and not isinstance(stream_slicer, SinglePartitionRouter)
    if is_partitioned:
        if isinstance(incremental_sync, IncrementingCountCursorModel):
            # We don't currently support usage of partition routing and IncrementingCountCursor at the
            # same time because we didn't solve for design questions like what the lookback window would
            # be as well as global cursor fall backs. We have not seen customers that have needed both
            # at the same time yet and are currently punting on this until we need to solve it.
            raise ValueError(
                f"The low-code framework does not currently support usage of a PartitionRouter and an IncrementingCountCursor at the same time. Please specify only one of these options for stream {stream_name}."
            )
        # Known issue for the `type: ignore` below and the two that follow: the returned ConcurrentCursor does not
        # technically implement the (low-code) StreamSlicer. However, both implement StreamSlicer.stream_slices()
        # which is the primary method needed for checkpointing
        return self.create_concurrent_cursor_from_perpartition_cursor(  # type: ignore
            state_manager=self._connector_state_manager,
            model_type=DatetimeBasedCursorModel,
            component_definition=incremental_sync.__dict__,
            stream_name=stream_name,
            stream_state=stream_state,
            stream_namespace=None,
            config=config or {},
            partition_router=stream_slicer,
            attempt_to_create_cursor_if_not_provided=True,  # FIXME can we remove that now?
        )

    # Exact type comparison: an instance of a subclass of these models ends up in the error below
    incremental_sync_type = type(incremental_sync)
    if incremental_sync_type == IncrementingCountCursorModel:
        return self.create_concurrent_cursor_from_incrementing_count_cursor(  # type: ignore
            model_type=IncrementingCountCursorModel,
            component_definition=incremental_sync.__dict__,
            stream_name=stream_name,
            stream_namespace=None,
            stream_state=stream_state,
            config=config or {},
        )
    if incremental_sync_type == DatetimeBasedCursorModel:
        return self.create_concurrent_cursor_from_datetime_based_cursor(  # type: ignore
            model_type=incremental_sync_type,
            component_definition=incremental_sync.__dict__,
            stream_name=stream_name,
            stream_namespace=None,
            stream_state=stream_state,
            config=config or {},
            attempt_to_create_cursor_if_not_provided=True,
        )
    raise ValueError(
        f"Incremental sync of type {type(model.incremental_sync)} is not supported"
    )
def create_default_paginator(
    self,
    model: DefaultPaginatorModel,
    config: Config,
    *,
    url_base: str,
    extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
    decoder: Optional[Decoder] = None,
    cursor_used_for_stop_condition: Optional[Cursor] = None,
) -> Union[DefaultPaginator, PaginatorTestReadDecorator]:
    """
    Creates the DefaultPaginator described by a DefaultPaginatorModel.

    :param model: The Pydantic model of the paginator being created
    :param config: The custom defined connector config
    :param url_base: The base URL passed to the paginator
    :param extractor_model: The extractor model forwarded to the pagination strategy
    :param decoder: The decoder for paginated responses; a JsonDecoder is used when it is not provided
    :param cursor_used_for_stop_condition: When provided, the pagination strategy is decorated with a stop
        condition based on this cursor
    :return: The paginator, wrapped in a PaginatorTestReadDecorator when a limit of pages fetched per slice is set
    :raises ValueError: If the provided decoder is not supported for pagination
    """
    if decoder and not self._is_supported_decoder_for_pagination(decoder):
        raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder)))
    decoder_to_use = PaginationDecoderDecorator(
        decoder=decoder if decoder else JsonDecoder(parameters={})
    )

    page_size_option = None
    if model.page_size_option:
        page_size_option = self._create_component_from_model(
            model=model.page_size_option, config=config
        )

    page_token_option = None
    if model.page_token_option:
        page_token_option = self._create_component_from_model(
            model=model.page_token_option, config=config
        )

    pagination_strategy = self._create_component_from_model(
        model=model.pagination_strategy,
        config=config,
        decoder=decoder_to_use,
        extractor_model=extractor_model,
    )
    if cursor_used_for_stop_condition:
        stop_condition = CursorStopCondition(cursor_used_for_stop_condition)
        pagination_strategy = StopConditionPaginationStrategyDecorator(
            pagination_strategy, stop_condition
        )

    paginator = DefaultPaginator(
        decoder=decoder_to_use,
        page_size_option=page_size_option,
        page_token_option=page_token_option,
        pagination_strategy=pagination_strategy,
        url_base=url_base,
        config=config,
        parameters=model.parameters or {},
    )
    if not self._limit_pages_fetched_per_slice:
        return paginator
    return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice)
def create_http_requester(
    self,
    model: HttpRequesterModel,
    config: Config,
    decoder: Decoder = JsonDecoder(parameters={}),
    query_properties_key: Optional[str] = None,
    use_cache: Optional[bool] = None,
    *,
    name: str,
) -> HttpRequester:
    """
    Creates the HttpRequester described by an HttpRequesterModel.

    :param model: The Pydantic model of the requester being created
    :param config: The custom defined connector config
    :param decoder: The decoder given to the requester and to its authenticator
    :param query_properties_key: Forwarded to the request options provider
    :param use_cache: Enables caching in addition to the model's own `use_cache` flag
    :param name: The name given to the requester and to its authenticator
    :return: The HTTP requester built from the Pydantic model to be used at runtime
    """
    authenticator = None
    if model.authenticator:
        authenticator = self._create_component_from_model(
            model=model.authenticator,
            config=config,
            url_base=model.url or model.url_base,
            name=name,
            decoder=decoder,
        )

    if model.error_handler:
        error_handler = self._create_component_from_model(
            model=model.error_handler, config=config
        )
    else:
        # No error handler defined: use a default one without backoff strategies nor response filters
        error_handler = DefaultErrorHandler(
            backoff_strategies=[],
            response_filters=[],
            config=config,
            parameters=model.parameters or {},
        )

    api_budget = self._api_budget

    request_options_provider = InterpolatedRequestOptionsProvider(
        request_body=model.request_body,
        request_body_data=model.request_body_data,
        request_body_json=model.request_body_json,
        request_headers=model.request_headers,
        request_parameters=model.request_parameters,  # type: ignore  # QueryProperties have been removed in `create_simple_retriever`
        query_properties_key=query_properties_key,
        config=config,
        parameters=model.parameters or {},
    )

    assert model.use_cache is not None  # for mypy
    assert model.http_method is not None  # for mypy

    # Caching can be requested by the model or by the caller, but the factory-wide switch always wins
    should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache

    if decoder:
        stream_response = decoder.is_stream_response()
    else:
        stream_response = False

    return HttpRequester(
        name=name,
        url=model.url,
        url_base=model.url_base,
        path=model.path,
        authenticator=authenticator,
        error_handler=error_handler,
        api_budget=api_budget,
        http_method=HttpMethod[model.http_method.value],
        request_options_provider=request_options_provider,
        config=config,
        disable_retries=self._disable_retries,
        parameters=model.parameters or {},
        message_repository=self._message_repository,
        use_cache=should_use_cache,
        decoder=decoder,
        stream_response=stream_response,
    )
The schema enforces an array of unique elements 2543 2544 return HttpResponseFilter( 2545 action=action, 2546 failure_type=failure_type, 2547 error_message=model.error_message or "", 2548 error_message_contains=model.error_message_contains or "", 2549 http_codes=http_codes, 2550 predicate=model.predicate or "", 2551 config=config, 2552 parameters=model.parameters or {}, 2553 ) 2554 2555 @staticmethod 2556 def create_inline_schema_loader( 2557 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2558 ) -> InlineSchemaLoader: 2559 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2560 2561 def create_complex_field_type( 2562 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2563 ) -> ComplexFieldType: 2564 items = ( 2565 self._create_component_from_model(model=model.items, config=config) 2566 if isinstance(model.items, ComplexFieldTypeModel) 2567 else model.items 2568 ) 2569 2570 return ComplexFieldType(field_type=model.field_type, items=items) 2571 2572 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2573 target_type = ( 2574 self._create_component_from_model(model=model.target_type, config=config) 2575 if isinstance(model.target_type, ComplexFieldTypeModel) 2576 else model.target_type 2577 ) 2578 2579 return TypesMap( 2580 target_type=target_type, 2581 current_type=model.current_type, 2582 condition=model.condition if model.condition is not None else "True", 2583 ) 2584 2585 def create_schema_type_identifier( 2586 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2587 ) -> SchemaTypeIdentifier: 2588 types_mapping = [] 2589 if model.types_mapping: 2590 types_mapping.extend( 2591 [ 2592 self._create_component_from_model(types_map, config=config) 2593 for types_map in model.types_mapping 2594 ] 2595 ) 2596 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2597 [x for x in model.schema_pointer] if model.schema_pointer else [] 2598 ) 2599 
def create_dynamic_schema_loader(
    self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any
) -> DynamicSchemaLoader:
    """Build a ``DynamicSchemaLoader`` together with the retriever it uses to fetch the schema.

    The retriever is created under the fixed name ``"dynamic_properties"``, with no primary
    key, no transformations, caching enabled and a log formatter that marks its requests as
    auxiliary.

    :param model: parsed ``DynamicSchemaLoader`` definition
    :param config: connector configuration
    :param kwargs: accepted for factory-signature compatibility; not used here
    :return: the configured ``DynamicSchemaLoader``
    """
    transformations = [
        self._create_component_from_model(model=transformation_model, config=config)
        for transformation_model in (model.schema_transformations or [])
    ]

    name = "dynamic_properties"

    def _format_schema_request(response: Any) -> Any:
        # Describes the HTTP exchange as an auxiliary request in emitted log messages.
        return format_http_message(
            response,
            f"Schema loader '{name}' request",
            "Request performed in order to extract schema.",
            name,
            is_auxiliary=True,
        )

    partition_router = self._build_stream_slicer_from_partition_router(model.retriever, config)
    retriever = self._create_component_from_model(
        model=model.retriever,
        config=config,
        name=name,
        primary_key=None,
        partition_router=partition_router,
        transformations=[],
        use_cache=True,
        log_formatter=_format_schema_request,
    )

    schema_type_identifier = self._create_component_from_model(
        model.schema_type_identifier, config=config, parameters=model.parameters or {}
    )

    schema_filter = None
    if model.schema_filter is not None:
        schema_filter = self._create_component_from_model(
            model.schema_filter, config=config, parameters=model.parameters or {}
        )

    return DynamicSchemaLoader(
        retriever=retriever,
        config=config,
        schema_transformations=transformations,
        schema_filter=schema_filter,
        schema_type_identifier=schema_type_identifier,
        parameters=model.parameters or {},
    )
@staticmethod
def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder:
    """Build a ``JsonDecoder`` with empty parameters."""
    return JsonDecoder(parameters={})


def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder:
    """Build a ``CompositeRawDecoder`` around the CSV parser.

    Responses are streamed unless connector-builder messages are being emitted.
    """
    csv_parser = ModelToComponentFactory._get_parser(model, config)
    return CompositeRawDecoder(
        parser=csv_parser,
        stream_response=not self._emit_connector_builder_messages,
    )


def create_jsonl_decoder(
    self, model: JsonlDecoderModel, config: Config, **kwargs: Any
) -> Decoder:
    """Build a ``CompositeRawDecoder`` around the JSON-lines parser.

    Responses are streamed unless connector-builder messages are being emitted.
    """
    jsonl_parser = ModelToComponentFactory._get_parser(model, config)
    return CompositeRawDecoder(
        parser=jsonl_parser,
        stream_response=not self._emit_connector_builder_messages,
    )


def create_json_items_decoder(
    self, model: JsonItemsDecoderModel, config: Config, **kwargs: Any
) -> Decoder:
    """Build a ``CompositeRawDecoder`` around the JSON-items parser.

    Responses are streamed unless connector-builder messages are being emitted.
    """
    json_items_parser = ModelToComponentFactory._get_parser(model, config)
    return CompositeRawDecoder(
        parser=json_items_parser,
        stream_response=not self._emit_connector_builder_messages,
    )


def create_gzip_decoder(
    self, model: GzipDecoderModel, config: Config, **kwargs: Any
) -> Decoder:
    """Build the decoder used for (possibly) gzip-compressed responses.

    Outside the connector builder, the gzip parser is selected from the ``Content-Encoding``
    / ``Content-Type`` headers and the inner parser is the fallback. When connector-builder
    messages are emitted, only the inner parser is used on a non-streamed response.
    """
    compressed_header_values = {
        "gzip",
        "x-gzip",
        "gzip, deflate",
        "x-gzip, deflate",
        "application/zip",
        "application/gzip",
        "application/x-gzip",
        "application/x-zip-compressed",
    }

    gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config)  # type: ignore  # based on the model, we know this will be a GzipParser

    if not self._emit_connector_builder_messages:
        header_rules = [
            ({"Content-Encoding", "Content-Type"}, compressed_header_values, gzip_parser)
        ]
        return CompositeRawDecoder.by_headers(
            header_rules,
            stream_response=True,
            fallback_parser=gzip_parser.inner_parser,
        )

    # This is very surprising, but if the response is not streamed, CompositeRawDecoder calls
    # response.content and the requests library uncompresses the data, as opposed to
    # response.raw, which uses urllib3 directly and does not uncompress the data.
    return CompositeRawDecoder(gzip_parser.inner_parser, False)
@staticmethod
def create_iterable_decoder(
    model: IterableDecoderModel, config: Config, **kwargs: Any
) -> IterableDecoder:
    """Build an ``IterableDecoder`` with empty parameters."""
    return IterableDecoder(parameters={})


@staticmethod
def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder:
    """Build an ``XmlDecoder`` with empty parameters."""
    return XmlDecoder(parameters={})


def create_zipfile_decoder(
    self, model: ZipfileDecoderModel, config: Config, **kwargs: Any
) -> ZipfileDecoder:
    """Build a ``ZipfileDecoder`` whose parser is derived from the model's inner decoder."""
    inner_parser = ModelToComponentFactory._get_parser(model.decoder, config)
    return ZipfileDecoder(parser=inner_parser)


@staticmethod
def _get_parser(model: BaseModel, config: Config) -> Parser:
    """Return the ``Parser`` matching a decoder model.

    :param model: decoder model to translate
    :param config: connector configuration, forwarded when recursing into a gzip decoder
    :return: the parser for ``model``
    :raises ValueError: when the decoder type has no parser, or is not a known decoder type
    """
    if isinstance(model, JsonDecoderModel):
        # Note that the logic is a bit different from the JsonDecoder as there is some legacy
        # that is maintained to return {} on error cases
        return JsonParser()

    if isinstance(model, JsonItemsDecoderModel):
        return JsonItemsParser(
            items_path=model.items_path,
            encoding=model.encoding,
        )

    if isinstance(model, JsonlDecoderModel):
        return JsonLineParser()

    if isinstance(model, CsvDecoderModel):
        return CsvParser(
            encoding=model.encoding,
            delimiter=model.delimiter,
            set_values_to_none=model.set_values_to_none,
        )

    if isinstance(model, GzipDecoderModel):
        # The gzip parser wraps whatever parser the nested decoder maps to.
        wrapped_parser = ModelToComponentFactory._get_parser(model.decoder, config)
        return GzipParser(inner_parser=wrapped_parser)

    decoders_without_parser = (
        CustomDecoderModel,
        IterableDecoderModel,
        XmlDecoderModel,
        ZipfileDecoderModel,
    )
    if isinstance(model, decoders_without_parser):
        raise ValueError(f"Decoder type {model} does not have parser associated to it")

    raise ValueError(f"Unknown decoder type {model}")
@staticmethod
def create_json_file_schema_loader(
    model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any
) -> JsonFileSchemaLoader:
    """Build a ``JsonFileSchemaLoader``; a missing file path is passed as ``""``."""
    schema_path = model.file_path or ""
    return JsonFileSchemaLoader(
        file_path=schema_path, config=config, parameters=model.parameters or {}
    )


def create_jwt_authenticator(
    self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any
) -> JwtAuthenticator:
    """Build a ``JwtAuthenticator`` from its declarative model.

    When the model has no ``jwt_headers`` / ``jwt_payload``, default models are substituted
    (``typ="JWT"``, every other claim ``None``). ``request_option`` is built only when set.
    """
    headers = model.jwt_headers
    if not headers:
        headers = JwtHeadersModel(kid=None, typ="JWT", cty=None)

    payload = model.jwt_payload
    if not payload:
        payload = JwtPayloadModel(iss=None, sub=None, aud=None)

    request_option = None
    if model.request_option:
        request_option = self._create_component_from_model(model.request_option, config)

    return JwtAuthenticator(
        config=config,
        parameters=model.parameters or {},
        algorithm=JwtAlgorithm(model.algorithm.value),
        secret_key=model.secret_key,
        base64_encode_secret_key=model.base64_encode_secret_key,
        token_duration=model.token_duration,
        header_prefix=model.header_prefix,
        kid=headers.kid,
        typ=headers.typ,
        cty=headers.cty,
        iss=payload.iss,
        sub=payload.sub,
        aud=payload.aud,
        additional_jwt_headers=model.additional_jwt_headers,
        additional_jwt_payload=model.additional_jwt_payload,
        passphrase=model.passphrase,
        request_option=request_option,
    )


def create_list_partition_router(
    self, model: ListPartitionRouterModel, config: Config, **kwargs: Any
) -> ListPartitionRouter:
    """Build a ``ListPartitionRouter``; ``request_option`` is built only when the model has one."""
    request_option = None
    if model.request_option:
        request_option = self._create_component_from_model(model.request_option, config)

    return ListPartitionRouter(
        cursor_field=model.cursor_field,
        request_option=request_option,
        values=model.values,
        config=config,
        parameters=model.parameters or {},
    )


@staticmethod
def create_min_max_datetime(
    model: MinMaxDatetimeModel, config: Config, **kwargs: Any
) -> MinMaxDatetime:
    """Build a ``MinMaxDatetime``; missing format and bounds are passed as ``""``."""
    datetime_format = model.datetime_format or ""
    upper_bound = model.max_datetime or ""
    lower_bound = model.min_datetime or ""

    return MinMaxDatetime(
        datetime=model.datetime,
        datetime_format=datetime_format,
        max_datetime=upper_bound,
        min_datetime=lower_bound,
        parameters=model.parameters or {},
    )
@staticmethod
def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth:
    """Build a ``NoAuth`` authenticator carrying the model's parameters (``{}`` when unset)."""
    return NoAuth(parameters=model.parameters or {})


@staticmethod
def create_no_pagination(
    model: NoPaginationModel, config: Config, **kwargs: Any
) -> NoPagination:
    """Build a ``NoPagination`` paginator with empty parameters."""
    return NoPagination(parameters={})


def create_oauth_authenticator(
    self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any
) -> DeclarativeOauth2Authenticator:
    """Build the OAuth2 authenticator described by ``model``.

    Without a ``refresh_token_updater`` a ``DeclarativeOauth2Authenticator`` is returned and
    the model values are passed through unevaluated. With one, a
    ``DeclarativeSingleUseRefreshTokenOauth2Authenticator`` is returned and its templated
    values are interpolated against ``config`` immediately.

    :param model: parsed ``OAuthAuthenticator`` definition
    :param config: connector configuration
    :param kwargs: accepted for factory-signature compatibility; not used here
    :return: the configured authenticator
    :raises ValueError: propagated from ``_get_refresh_token_error_information``
    """

    def _interpolated(template: Any) -> Any:
        # Evaluate a (possibly templated) string against the config right away.
        return InterpolatedString.create(template, parameters=model.parameters or {}).eval(
            config
        )

    def _interpolated_mapping(mapping: Any) -> Any:
        # Same as above for request body/header mappings; a missing mapping becomes {}.
        return InterpolatedMapping(mapping or {}, parameters=model.parameters or {}).eval(
            config
        )

    profile_assertion = None
    if model.profile_assertion:
        profile_assertion = self._create_component_from_model(
            model.profile_assertion, config=config
        )

    error_status_codes, error_key, error_values = self._get_refresh_token_error_information(
        model
    )

    updater = model.refresh_token_updater
    if not updater:
        # ignore type error because fixing it would have a lot of dependencies, revisit later
        return DeclarativeOauth2Authenticator(  # type: ignore
            access_token_name=model.access_token_name or "access_token",
            access_token_value=model.access_token_value,
            client_id_name=model.client_id_name or "client_id",
            client_id=model.client_id,
            client_secret_name=model.client_secret_name or "client_secret",
            client_secret=model.client_secret,
            expires_in_name=model.expires_in_name or "expires_in",
            grant_type_name=model.grant_type_name or "grant_type",
            grant_type=model.grant_type or "refresh_token",
            refresh_request_body=model.refresh_request_body,
            refresh_request_headers=model.refresh_request_headers,
            send_refresh_request_as_query_params=bool(model.send_refresh_request_as_query_params),
            refresh_token_name=model.refresh_token_name or "refresh_token",
            refresh_token=model.refresh_token,
            scopes=model.scopes,
            token_expiry_date=model.token_expiry_date,
            token_expiry_date_format=model.token_expiry_date_format,
            token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
            token_refresh_endpoint=model.token_refresh_endpoint,
            config=config,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            profile_assertion=profile_assertion,
            use_profile_assertion=model.use_profile_assertion,
            refresh_token_error_status_codes=error_status_codes,
            refresh_token_error_key=error_key,
            refresh_token_error_values=error_values,
        )

    # ignore type error because fixing it would have a lot of dependencies, revisit later
    return DeclarativeSingleUseRefreshTokenOauth2Authenticator(  # type: ignore
        config,
        _interpolated(model.token_refresh_endpoint),  # type: ignore
        access_token_name=_interpolated(model.access_token_name or "access_token"),
        refresh_token_name=updater.refresh_token_name,
        expires_in_name=_interpolated(model.expires_in_name or "expires_in"),
        client_id_name=_interpolated(model.client_id_name or "client_id"),
        # Falsy credentials are forwarded untouched instead of being interpolated.
        client_id=_interpolated(model.client_id) if model.client_id else model.client_id,
        client_secret_name=_interpolated(model.client_secret_name or "client_secret"),
        client_secret=_interpolated(model.client_secret)
        if model.client_secret
        else model.client_secret,
        access_token_config_path=updater.access_token_config_path,
        refresh_token_config_path=updater.refresh_token_config_path,
        token_expiry_date_config_path=updater.token_expiry_date_config_path,
        grant_type_name=_interpolated(model.grant_type_name or "grant_type"),
        grant_type=_interpolated(model.grant_type or "refresh_token"),
        refresh_request_body=_interpolated_mapping(model.refresh_request_body),
        refresh_request_headers=_interpolated_mapping(model.refresh_request_headers),
        send_refresh_request_as_query_params=bool(model.send_refresh_request_as_query_params),
        scopes=model.scopes,
        token_expiry_date_format=model.token_expiry_date_format,
        token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
        message_repository=self._message_repository,
        refresh_token_error_status_codes=error_status_codes,
        refresh_token_error_key=error_key,
        refresh_token_error_values=error_values,
    )
DeclarativeOauth2Authenticator( # type: ignore 2907 access_token_name=model.access_token_name or "access_token", 2908 access_token_value=model.access_token_value, 2909 client_id_name=model.client_id_name or "client_id", 2910 client_id=model.client_id, 2911 client_secret_name=model.client_secret_name or "client_secret", 2912 client_secret=model.client_secret, 2913 expires_in_name=model.expires_in_name or "expires_in", 2914 grant_type_name=model.grant_type_name or "grant_type", 2915 grant_type=model.grant_type or "refresh_token", 2916 refresh_request_body=model.refresh_request_body, 2917 refresh_request_headers=model.refresh_request_headers, 2918 send_refresh_request_as_query_params=bool(model.send_refresh_request_as_query_params), 2919 refresh_token_name=model.refresh_token_name or "refresh_token", 2920 refresh_token=model.refresh_token, 2921 scopes=model.scopes, 2922 token_expiry_date=model.token_expiry_date, 2923 token_expiry_date_format=model.token_expiry_date_format, 2924 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2925 token_refresh_endpoint=model.token_refresh_endpoint, 2926 config=config, 2927 parameters=model.parameters or {}, 2928 message_repository=self._message_repository, 2929 profile_assertion=profile_assertion, 2930 use_profile_assertion=model.use_profile_assertion, 2931 refresh_token_error_status_codes=refresh_token_error_status_codes, 2932 refresh_token_error_key=refresh_token_error_key, 2933 refresh_token_error_values=refresh_token_error_values, 2934 ) 2935 2936 @staticmethod 2937 def _get_refresh_token_error_information( 2938 model: OAuthAuthenticatorModel, 2939 ) -> Tuple[Tuple[int, ...], str, Tuple[str, ...]]: 2940 """ 2941 In a previous version of the CDK, the auth error as config_error was only done if a refresh token updater was 2942 defined. As a transition, we added those fields on the OAuthAuthenticatorModel. This method ensures that the 2943 information is defined only once and return the right fields. 
def create_offset_increment(
    self,
    model: OffsetIncrementModel,
    config: Config,
    decoder: Decoder,
    extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
    **kwargs: Any,
) -> OffsetIncrement:
    """Build an ``OffsetIncrement`` pagination strategy.

    :param model: parsed ``OffsetIncrement`` definition
    :param config: connector configuration
    :param decoder: response decoder; wrapped in a ``PaginationDecoderDecorator`` unless it
        already is one
    :param extractor_model: optional record extractor model; when given, a dedicated
        extractor is built for the strategy using the decorated decoder
    :param kwargs: accepted for factory-signature compatibility; not used here
    :return: the configured ``OffsetIncrement``
    :raises ValueError: when the underlying decoder is not supported for pagination
    """
    if isinstance(decoder, PaginationDecoderDecorator):
        inner_decoder = decoder.decoder
    else:
        inner_decoder = decoder
        decoder = PaginationDecoderDecorator(decoder=decoder)

    if self._is_supported_decoder_for_pagination(inner_decoder):
        decoder_to_use = decoder
    else:
        raise ValueError(
            self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
        )

    # Ideally we would instantiate the runtime extractor from the highest level (in this
    # case the SimpleRetriever) so that it can be shared by OffsetIncrement and
    # RecordSelector. However, due to how we instantiate the decoder with various decorators
    # here, but not in create_record_selector, it is simpler to retain existing behavior by
    # having two separate extractors with identical behavior since they use the same
    # extractor model. When we have more time to investigate we can look into reusing the
    # same component.
    extractor = (
        self._create_component_from_model(
            model=extractor_model, config=config, decoder=decoder_to_use
        )
        if extractor_model
        else None
    )

    # Pydantic v1 Union type coercion can convert int to string depending on Union order.
    # If page_size is a string that represents an integer (not an interpolation), convert it
    # back. `isdecimal()` is used rather than `isdigit()` because the latter also accepts
    # characters such as "²" that `int()` cannot parse, which would raise here.
    page_size = model.page_size
    if isinstance(page_size, str) and page_size.isdecimal():
        page_size = int(page_size)

    return OffsetIncrement(
        page_size=page_size,
        config=config,
        decoder=decoder_to_use,
        extractor=extractor,
        inject_on_first_request=model.inject_on_first_request or False,
        parameters=model.parameters or {},
    )
@staticmethod
def create_page_increment(
    model: PageIncrementModel, config: Config, **kwargs: Any
) -> PageIncrement:
    """Build a ``PageIncrement`` pagination strategy.

    ``start_from_page`` defaults to ``0`` and ``inject_on_first_request`` to ``False`` when
    the model leaves them unset.
    """
    # Pydantic v1 Union type coercion can convert int to string depending on Union order.
    # If page_size is a string that represents an integer (not an interpolation), convert it
    # back. `isdecimal()` is used rather than `isdigit()` because the latter also accepts
    # characters such as "²" that `int()` cannot parse, which would raise here.
    page_size = model.page_size
    if isinstance(page_size, str) and page_size.isdecimal():
        page_size = int(page_size)

    return PageIncrement(
        page_size=page_size,
        config=config,
        start_from_page=model.start_from_page or 0,
        inject_on_first_request=model.inject_on_first_request or False,
        parameters=model.parameters or {},
    )


def create_parent_stream_config(
    self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any
) -> ParentStreamConfig:
    """Build a ``ParentStreamConfig`` and the parent stream it refers to.

    :param model: parsed ``ParentStreamConfig`` definition
    :param config: connector configuration
    :param stream_name: required keyword argument; it is not referenced in this method body
    :param kwargs: forwarded to the factory call that builds the parent stream
    :return: the configured ``ParentStreamConfig``
    :raises ValueError: when any ``lazy_read_pointer`` segment contains the ``*`` wildcard
    """
    # The parent stream is built with is_parent=True.
    declarative_stream = self._create_component_from_model(
        model.stream,
        config=config,
        is_parent=True,
        **kwargs,
    )
    request_option = (
        self._create_component_from_model(model.request_option, config=config)
        if model.request_option
        else None
    )

    if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer):
        raise ValueError(
            "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed."
        )

    # Copy the pointer so the component does not share the model's list.
    model_lazy_read_pointer: List[Union[InterpolatedString, str]] = (
        list(model.lazy_read_pointer) if model.lazy_read_pointer else []
    )

    return ParentStreamConfig(
        parent_key=model.parent_key,
        request_option=request_option,
        stream=declarative_stream,
        partition_field=model.partition_field,
        config=config,
        incremental_dependency=model.incremental_dependency or False,
        parameters=model.parameters or {},
        extra_fields=model.extra_fields,
        lazy_read_pointer=model_lazy_read_pointer,
    )
3069 ) 3070 3071 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 3072 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 3073 ) 3074 3075 return ParentStreamConfig( 3076 parent_key=model.parent_key, 3077 request_option=request_option, 3078 stream=declarative_stream, 3079 partition_field=model.partition_field, 3080 config=config, 3081 incremental_dependency=model.incremental_dependency or False, 3082 parameters=model.parameters or {}, 3083 extra_fields=model.extra_fields, 3084 lazy_read_pointer=model_lazy_read_pointer, 3085 ) 3086 3087 def create_properties_from_endpoint( 3088 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 3089 ) -> PropertiesFromEndpoint: 3090 retriever = self._create_component_from_model( 3091 model=model.retriever, 3092 config=config, 3093 name="dynamic_properties", 3094 primary_key=None, 3095 stream_slicer=None, 3096 transformations=[], 3097 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 3098 ) 3099 return PropertiesFromEndpoint( 3100 property_field_path=model.property_field_path, 3101 retriever=retriever, 3102 config=config, 3103 parameters=model.parameters or {}, 3104 ) 3105 3106 def create_property_chunking( 3107 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 3108 ) -> PropertyChunking: 3109 record_merge_strategy = ( 3110 self._create_component_from_model( 3111 model=model.record_merge_strategy, config=config, **kwargs 3112 ) 3113 if model.record_merge_strategy 3114 else None 3115 ) 3116 3117 property_limit_type: PropertyLimitType 3118 match model.property_limit_type: 3119 case PropertyLimitTypeModel.property_count: 3120 property_limit_type = PropertyLimitType.property_count 3121 case PropertyLimitTypeModel.characters: 3122 property_limit_type = PropertyLimitType.characters 3123 case _: 3124 raise 
def create_query_properties(
    self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any
) -> QueryProperties:
    """Build a ``QueryProperties`` component.

    A ``property_list`` that is already a list is used as-is; any other value is built
    through the factory. ``stream_name`` is forwarded only when building the property
    selector.
    """
    property_list = model.property_list
    if not isinstance(property_list, list):
        property_list = self._create_component_from_model(
            model=property_list, config=config, **kwargs
        )

    property_chunking = None
    if model.property_chunking:
        property_chunking = self._create_component_from_model(
            model=model.property_chunking, config=config, **kwargs
        )

    property_selector = None
    if model.property_selector:
        property_selector = self._create_component_from_model(
            model=model.property_selector, config=config, stream_name=stream_name, **kwargs
        )

    return QueryProperties(
        property_list=property_list,
        always_include_properties=model.always_include_properties,
        property_chunking=property_chunking,
        property_selector=property_selector,
        config=config,
        parameters=model.parameters or {},
    )


def create_json_schema_property_selector(
    self,
    model: JsonSchemaPropertySelectorModel,
    config: Config,
    *,
    stream_name: str,
    **kwargs: Any,
) -> JsonSchemaPropertySelector:
    """Build a ``JsonSchemaPropertySelector`` for the stream called ``stream_name``.

    The configured stream is looked up by name with ``dict.get``, so it is ``None`` when the
    stream is not present in the factory's mapping.
    """
    configured_stream = self._stream_name_to_configured_stream.get(stream_name)

    properties_transformations = [
        self._create_component_from_model(model=transformation_model, config=config)
        for transformation_model in (model.transformations or [])
    ]

    return JsonSchemaPropertySelector(
        configured_stream=configured_stream,
        properties_transformations=properties_transformations,
        config=config,
        parameters=model.parameters or {},
    )
@staticmethod
def create_record_filter(
    model: RecordFilterModel, config: Config, **kwargs: Any
) -> RecordFilter:
    """Build a ``RecordFilter``; a missing condition is passed as ``""``."""
    return RecordFilter(
        condition=model.condition or "", config=config, parameters=model.parameters or {}
    )


@staticmethod
def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath:
    """Build a ``RequestPath`` with empty parameters."""
    return RequestPath(parameters={})


@staticmethod
def create_request_option(
    model: RequestOptionModel, config: Config, **kwargs: Any
) -> RequestOption:
    """Build a ``RequestOption``.

    ``field_name`` and every ``field_path`` segment are wrapped with
    ``InterpolatedString.create``, using the ``parameters`` keyword argument when supplied.

    :param model: parsed ``RequestOption`` definition
    :param config: connector configuration (not used by this method)
    :param kwargs: may carry ``parameters``; defaults to an empty mapping
    :return: the configured ``RequestOption``
    """
    parameters = kwargs.get("parameters", {})
    inject_into = RequestOptionType(model.inject_into.value)
    field_path: Optional[List[Union[InterpolatedString, str]]] = (
        [
            InterpolatedString.create(segment, parameters=parameters)
            for segment in model.field_path
        ]
        if model.field_path
        else None
    )
    field_name = (
        InterpolatedString.create(model.field_name, parameters=parameters)
        if model.field_name
        else None
    )
    return RequestOption(
        field_name=field_name,
        field_path=field_path,
        inject_into=inject_into,
        parameters=parameters,
    )


def create_record_selector(
    self,
    model: RecordSelectorModel,
    config: Config,
    *,
    name: str,
    transformations: List[RecordTransformation] | None = None,
    decoder: Decoder | None = None,
    client_side_incremental_sync_cursor: Optional[Cursor] = None,
    file_uploader: Optional[DefaultFileUploader] = None,
    **kwargs: Any,
) -> RecordSelector:
    """Build a ``RecordSelector`` with its extractor, filter and schema normalization.

    :param model: parsed ``RecordSelector`` definition
    :param config: connector configuration
    :param name: name passed to the ``RecordSelector``
    :param transformations: record transformations; ``None`` is treated as an empty list
    :param decoder: decoder handed to the extractor
    :param client_side_incremental_sync_cursor: when set, the record filter is replaced by a
        ``ClientSideIncrementalRecordFilterDecorator`` bound to this cursor
    :param file_uploader: optional file uploader passed through to the selector
    :param kwargs: accepted for factory-signature compatibility; not used here
    :return: the configured ``RecordSelector``
    """
    extractor = self._create_component_from_model(
        model=model.extractor, decoder=decoder, config=config
    )
    record_filter = (
        self._create_component_from_model(model.record_filter, config=config)
        if model.record_filter
        else None
    )

    transform_before_filtering = (
        False if model.transform_before_filtering is None else model.transform_before_filtering
    )
    if client_side_incremental_sync_cursor:
        # The decorator replaces whatever filter was built above; only the declared condition
        # (if any) is carried over.
        record_filter = ClientSideIncrementalRecordFilterDecorator(
            config=config,
            # Normalised to {} like every other component built by this factory, instead of
            # forwarding a possible None.
            parameters=model.parameters or {},
            condition=model.record_filter.condition
            if (model.record_filter and hasattr(model.record_filter, "condition"))
            else None,
            cursor=client_side_incremental_sync_cursor,
        )
        # With client-side incremental sync the default flips to transforming first, unless
        # the manifest sets the flag explicitly.
        transform_before_filtering = (
            True
            if model.transform_before_filtering is None
            else model.transform_before_filtering
        )

    if model.schema_normalization is None:
        # default to no schema normalization if not set
        # (note: this writes the default back onto the model)
        model.schema_normalization = SchemaNormalizationModel.None_

    schema_normalization = (
        TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization])
        if isinstance(model.schema_normalization, SchemaNormalizationModel)
        else self._create_component_from_model(model.schema_normalization, config=config)  # type: ignore[arg-type] # custom normalization model expected here
    )

    return RecordSelector(
        extractor=extractor,
        name=name,
        config=config,
        record_filter=record_filter,
        transformations=transformations or [],
        file_uploader=file_uploader,
        schema_normalization=schema_normalization,
        parameters=model.parameters or {},
        transform_before_filtering=transform_before_filtering,
    )
@staticmethod
def create_remove_fields(
    model: RemoveFieldsModel, config: Config, **kwargs: Any
) -> RemoveFields:
    """Build a ``RemoveFields`` transformation; a missing condition is passed as ``""``."""
    removal_condition = model.condition or ""
    return RemoveFields(
        field_pointers=model.field_pointers, condition=removal_condition, parameters={}
    )


def create_selective_authenticator(
    self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any
) -> DeclarativeAuthenticator:
    """Build every candidate authenticator and hand them to ``SelectiveAuthenticator``.

    :param model: parsed ``SelectiveAuthenticator`` definition
    :param config: connector configuration
    :param kwargs: forwarded unchanged to ``SelectiveAuthenticator``
    :return: whatever ``SelectiveAuthenticator(...)`` returns
    """
    authenticators_by_name = {}
    for option_name, authenticator_model in model.authenticators.items():
        authenticators_by_name[option_name] = self._create_component_from_model(
            model=authenticator_model, config=config
        )

    # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error
    return SelectiveAuthenticator(  # type: ignore[abstract]
        config=config,
        authenticators=authenticators_by_name,
        authenticator_selection_path=model.authenticator_selection_path,
        **kwargs,
    )
@staticmethod
def create_legacy_session_token_authenticator(
    model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any
) -> LegacySessionTokenAuthenticator:
    """Build a ``LegacySessionTokenAuthenticator``.

    ``url_base`` is passed as the authenticator's ``api_url``; missing credentials and token
    fields are passed as empty strings.
    """
    return LegacySessionTokenAuthenticator(
        api_url=url_base,
        header=model.header,
        login_url=model.login_url,
        password=model.password or "",
        session_token=model.session_token or "",
        session_token_response_key=model.session_token_response_key or "",
        username=model.username or "",
        validate_session_url=model.validate_session_url,
        config=config,
        parameters=model.parameters or {},
    )


def create_simple_retriever(
    self,
    model: SimpleRetrieverModel,
    config: Config,
    *,
    name: str,
    primary_key: Optional[Union[str, List[str], List[List[str]]]],
    request_options_provider: Optional[RequestOptionsProvider] = None,
    cursor: Optional[Cursor] = None,
    has_stop_condition_cursor: bool = False,
    is_client_side_incremental_sync: bool = False,
    transformations: List[RecordTransformation],
    file_uploader: Optional[DefaultFileUploader] = None,
    incremental_sync: Optional[
        Union[IncrementingCountCursorModel, DatetimeBasedCursorModel]
    ] = None,
    use_cache: Optional[bool] = None,
    log_formatter: Optional[Callable[[Response], Any]] = None,
    partition_router: Optional[PartitionRouter] = None,
    **kwargs: Any,
) -> SimpleRetriever:
    """Build the retriever for a stream: decoder, record selector, requester and paginator.

    Returns a ``LazySimpleRetriever`` when the model uses a substream partition router with at
    least one ``lazy_read_pointer`` and the stream has no saved state; otherwise returns a
    ``SimpleRetriever``.

    Note that this method may rewrite ``model.requester.request_parameters`` in place (see
    the query-properties handling below).

    :param model: parsed ``SimpleRetriever`` definition
    :param config: connector configuration
    :param name: stream name, used for state lookup and passed to the built components
    :param primary_key: primary key passed through to the retriever
    :param request_options_provider: provider of request options; a
        ``DefaultRequestOptionsProvider`` is used when omitted
    :param cursor: stream cursor; a ``FinalStateCursor`` is created when ``None``
    :param has_stop_condition_cursor: when true, ``cursor`` is handed to the paginator as its
        stop-condition cursor
    :param is_client_side_incremental_sync: when true, ``cursor`` is handed to the record
        selector for client-side filtering
    :param transformations: record transformations passed to the record selector
    :param file_uploader: optional file uploader passed to the record selector
    :param incremental_sync: incremental sync model, only inspected to validate the lazy
        retriever case
    :param use_cache: forwarded to the requester
    :param log_formatter: forwarded through ``self._get_log_formatter``
    :param partition_router: replaces a default request options provider when it is a
        ``PartitionRouter``
    :param kwargs: accepted for factory-signature compatibility; not used here
    :raises ValueError: on conflicting query-properties definitions, on unsupported lazy
        retriever settings, or when pagination reset limits are combined with a record filter
    """

    def _get_url(req: Requester) -> str:
        """
        Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever.
        This is needed because the URL is not set until the requester is created.
        """

        _url: str = (
            model.requester.url
            if hasattr(model.requester, "url") and model.requester.url is not None
            else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
        )
        _url_base: str = (
            model.requester.url_base
            if hasattr(model.requester, "url_base") and model.requester.url_base is not None
            else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None)
        )

        # Prefer the full URL; fall back to the base URL when it is empty.
        return _url or _url_base

    if cursor is None:
        cursor = FinalStateCursor(name, None, self._message_repository)

    # JSON is the default decoder when the manifest does not declare one.
    decoder = (
        self._create_component_from_model(model=model.decoder, config=config)
        if model.decoder
        else JsonDecoder(parameters={})
    )
    record_selector = self._create_component_from_model(
        model=model.record_selector,
        name=name,
        config=config,
        decoder=decoder,
        transformations=transformations,
        client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None,
        file_uploader=file_uploader,
    )

    # Query properties can be declared in three places, checked in this order:
    # request_parameters, fetch_properties_from_endpoint, then query_properties.
    query_properties: Optional[QueryProperties] = None
    query_properties_key: Optional[str] = None
    self._ensure_query_properties_to_model(model.requester)
    if self._has_query_properties_in_request_parameters(model.requester):
        # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple
        # places instead of default to request_parameters which isn't clearly documented
        if (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            raise ValueError(
                f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
            )

        query_properties_definitions = []
        for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters()
            if isinstance(request_parameter, QueryPropertiesModel):
                query_properties_key = key
                query_properties_definitions.append(request_parameter)

        if len(query_properties_definitions) > 1:
            raise ValueError(
                f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
            )

        if len(query_properties_definitions) == 1:
            query_properties = self._create_component_from_model(
                model=query_properties_definitions[0], stream_name=name, config=config
            )

        # Removes QueryProperties components from the interpolated mappings because it has been designed
        # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly
        # instead of through jinja interpolation
        if hasattr(model.requester, "request_parameters") and isinstance(
            model.requester.request_parameters, Mapping
        ):
            model.requester.request_parameters = self._remove_query_properties(
                model.requester.request_parameters
            )
    elif (
        hasattr(model.requester, "fetch_properties_from_endpoint")
        and model.requester.fetch_properties_from_endpoint
    ):
        # todo: Deprecate this condition once dependent connectors migrate to query_properties
        query_properties_definition = QueryPropertiesModel(
            type="QueryProperties",
            property_list=model.requester.fetch_properties_from_endpoint,
            always_include_properties=None,
            property_chunking=None,
        )  # type: ignore # $parameters has a default value

        query_properties = self.create_query_properties(
            model=query_properties_definition,
            stream_name=name,
            config=config,
        )
    elif hasattr(model.requester, "query_properties") and model.requester.query_properties:
        query_properties = self.create_query_properties(
            model=model.requester.query_properties,
            stream_name=name,
            config=config,
        )

    # The requester is built after the query-properties handling above, which may have
    # rewritten model.requester.request_parameters.
    requester = self._create_component_from_model(
        model=model.requester,
        decoder=decoder,
        name=name,
        query_properties_key=query_properties_key,
        use_cache=use_cache,
        config=config,
    )

    if not request_options_provider:
        request_options_provider = DefaultRequestOptionsProvider(parameters={})
    # A default provider is replaced by the partition router when one is supplied.
    if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(
        partition_router, PartitionRouter
    ):
        request_options_provider = partition_router

    paginator = (
        self._create_component_from_model(
            model=model.paginator,
            config=config,
            url_base=_get_url(requester),
            extractor_model=model.record_selector.extractor,
            decoder=decoder,
            cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None,
        )
        if model.paginator
        else NoPagination(parameters={})
    )

    ignore_stream_slicer_parameters_on_paginated_requests = (
        model.ignore_stream_slicer_parameters_on_paginated_requests or False
    )

    # Lazy retriever case: substream partition router, no saved state for this stream, and at
    # least one parent stream config declaring a lazy_read_pointer.
    if (
        model.partition_router
        and isinstance(model.partition_router, SubstreamPartitionRouterModel)
        and not bool(self._connector_state_manager.get_stream_state(name, None))
        and any(
            parent_stream_config.lazy_read_pointer
            for parent_stream_config in model.partition_router.parent_stream_configs
        )
    ):
        if incremental_sync:
            if incremental_sync.type != "DatetimeBasedCursor":
                raise ValueError(
                    f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                )

            elif incremental_sync.step or incremental_sync.cursor_granularity:
                # NOTE(review): the message below reads "more that one"; presumably "more than
                # one" was intended. Left as-is because it is runtime text.
                raise ValueError(
                    f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                )

        if model.decoder and model.decoder.type != "JsonDecoder":
            raise ValueError(
                f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
            )

        # NOTE(review): unlike the SimpleRetriever below, no query properties, log formatter or
        # pagination tracker factory are passed here - confirm this is intentional.
        return LazySimpleRetriever(
            name=name,
            paginator=paginator,
            primary_key=primary_key,
            requester=requester,
            record_selector=record_selector,
            stream_slicer=_NO_STREAM_SLICING,
            request_option_provider=request_options_provider,
            config=config,
            ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
            parameters=model.parameters or {},
        )

    if (
        model.record_selector.record_filter
        and model.pagination_reset
        and model.pagination_reset.limits
    ):
        raise ValueError("PaginationResetLimits are not supported while having record filter.")

    return SimpleRetriever(
        name=name,
        paginator=paginator,
        primary_key=primary_key,
        requester=requester,
        record_selector=record_selector,
        stream_slicer=_NO_STREAM_SLICING,
        request_option_provider=request_options_provider,
        config=config,
        ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
        additional_query_properties=query_properties,
        log_formatter=self._get_log_formatter(log_formatter, name),
        pagination_tracker_factory=self._create_pagination_tracker_factory(
            model.pagination_reset, cursor
        ),
        parameters=model.parameters or {},
    )


def _create_pagination_tracker_factory(
    self, model: Optional[PaginationResetModel], cursor: Cursor
) -> Callable[[], PaginationTracker]:
    if model is None:
        return lambda: PaginationTracker()
3554 # Until we figure out a way to use any cursor for PaginationTracker, we will have to have this cursor selector logic 3555 cursor_factory: Callable[[], Optional[ConcurrentCursor]] = lambda: None 3556 if model.action == PaginationResetActionModel.RESET: 3557 # in that case, we will let cursor_factory to return None even if the stream has a cursor 3558 pass 3559 elif model.action == PaginationResetActionModel.SPLIT_USING_CURSOR: 3560 if isinstance(cursor, ConcurrentCursor): 3561 cursor_factory = lambda: cursor.copy_without_state() # type: ignore # the if condition validates that it is a ConcurrentCursor 3562 elif isinstance(cursor, ConcurrentPerPartitionCursor): 3563 cursor_factory = lambda: cursor._cursor_factory.create( # type: ignore # if this becomes a problem, we would need to extract the cursor_factory instantiation logic and make it accessible here 3564 {}, datetime.timedelta(0) 3565 ) 3566 elif not isinstance(cursor, FinalStateCursor): 3567 LOGGER.warning( 3568 "Unknown cursor for PaginationTracker. Pagination resets might not work properly" 3569 ) 3570 else: 3571 raise ValueError(f"Unknown PaginationReset action: {model.action}") 3572 3573 limit = model.limits.number_of_records if model and model.limits else None 3574 return lambda: PaginationTracker(cursor_factory(), limit) 3575 3576 def _get_log_formatter( 3577 self, log_formatter: Callable[[Response], Any] | None, name: str 3578 ) -> Callable[[Response], Any] | None: 3579 if self._should_limit_slices_fetched(): 3580 return ( 3581 ( 3582 lambda response: format_http_message( 3583 response, 3584 f"Stream '{name}' request", 3585 f"Request performed in order to extract records for stream '{name}'", 3586 name, 3587 ) 3588 ) 3589 if not log_formatter 3590 else log_formatter 3591 ) 3592 return None 3593 3594 def _should_limit_slices_fetched(self) -> bool: 3595 """ 3596 Returns True if the number of slices fetched should be limited, False otherwise. 
3597 This is used to limit the number of slices fetched during tests. 3598 """ 3599 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3600 3601 @staticmethod 3602 def _has_query_properties_in_request_parameters( 3603 requester: Union[HttpRequesterModel, CustomRequesterModel], 3604 ) -> bool: 3605 if not hasattr(requester, "request_parameters"): 3606 return False 3607 request_parameters = requester.request_parameters 3608 if request_parameters and isinstance(request_parameters, Mapping): 3609 for request_parameter in request_parameters.values(): 3610 if isinstance(request_parameter, QueryPropertiesModel): 3611 return True 3612 return False 3613 3614 @staticmethod 3615 def _remove_query_properties( 3616 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3617 ) -> Mapping[str, str]: 3618 return { 3619 parameter_field: request_parameter 3620 for parameter_field, request_parameter in request_parameters.items() 3621 if not isinstance(request_parameter, QueryPropertiesModel) 3622 } 3623 3624 def create_state_delegating_stream( 3625 self, 3626 model: StateDelegatingStreamModel, 3627 config: Config, 3628 **kwargs: Any, 3629 ) -> DefaultStream: 3630 if ( 3631 model.full_refresh_stream.name != model.name 3632 or model.name != model.incremental_stream.name 3633 ): 3634 raise ValueError( 3635 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3636 ) 3637 3638 # Resolve api_retention_period with config context (supports Jinja2 interpolation) 3639 resolved_retention_period: Optional[str] = None 3640 if model.api_retention_period: 3641 interpolated_retention = InterpolatedString.create( 3642 model.api_retention_period, parameters=model.parameters or {} 3643 ) 3644 resolved_value = interpolated_retention.eval(config=config) 3645 if resolved_value: 3646 resolved_retention_period = str(resolved_value) 3647 3648 if resolved_retention_period: 3649 for stream_model in (model.full_refresh_stream, model.incremental_stream): 3650 if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel): 3651 raise ValueError( 3652 f"Stream '{model.name}' uses IncrementingCountCursor which is not supported " 3653 f"with api_retention_period. IncrementingCountCursor does not use datetime-based " 3654 f"cursors, so cursor age validation cannot be performed." 3655 ) 3656 3657 stream_state = self._connector_state_manager.get_stream_state(model.name, None) 3658 3659 if not stream_state: 3660 return self._create_component_from_model( # type: ignore[no-any-return] 3661 model.full_refresh_stream, config=config, **kwargs 3662 ) 3663 3664 incremental_stream: DefaultStream = self._create_component_from_model( 3665 model.incremental_stream, config=config, **kwargs 3666 ) # type: ignore[assignment] 3667 3668 # Only run cursor age validation for streams that are in the configured 3669 # catalog (or when no catalog was provided, e.g. during discover / connector 3670 # builder). Streams not selected by the user but instantiated as parent-stream 3671 # dependencies must not go through this path because it emits state messages 3672 # that the destination does not know about, causing "Stream not found" crashes. 
@staticmethod
def _is_cursor_older_than_retention_period(
    stream_state: Mapping[str, Any],
    full_refresh_cursor: Cursor,
    incremental_cursor: Cursor,
    api_retention_period: str,
    stream_name: str,
) -> bool:
    """Check if the cursor value in the state is older than the API's retention period.

    The cursor datetime is read from the state with the full refresh cursor first; when that
    yields None, the incremental cursor is asked instead. The retention cutoff is the current
    UTC time minus `api_retention_period` (parsed with `parse_duration`).

    Args:
        stream_state: the stored state of the stream.
        full_refresh_cursor: cursor of the full refresh stream, consulted first.
        incremental_cursor: cursor of the incremental stream, consulted as a fallback.
        api_retention_period: duration string understood by `parse_duration`.
        stream_name: only used in the warning message.

    Returns:
        True if the cursor is older than the retention period, or if neither cursor could
        extract a datetime from the state (should use full refresh).
        False if the cursor is within the retention period (safe to use incremental).
    """
    retention_duration = parse_duration(api_retention_period)
    retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration

    # Check full refresh cursor first
    cursor_datetime = full_refresh_cursor.get_cursor_datetime_from_state(stream_state)

    # If full refresh cursor returns None, check incremental cursor
    if cursor_datetime is None:
        cursor_datetime = incremental_cursor.get_cursor_datetime_from_state(stream_state)

    if cursor_datetime is None:
        # Neither cursor could parse the state - fall back to full refresh to be safe
        return True

    if cursor_datetime < retention_cutoff:
        # Use the module logger rather than `logging.warning`: the module-level helper logs on
        # the root logger and implicitly calls `logging.basicConfig()` when it has no handlers.
        LOGGER.warning(
            "Stream '%s' has a cursor value older than "
            "the API's retention period of %s "
            "(cutoff: %s). "
            "Falling back to full refresh to avoid data loss.",
            stream_name,
            api_retention_period,
            retention_cutoff.isoformat(),
        )
        return True

    return False
3737 ) 3738 return True 3739 3740 return False 3741 3742 def _get_state_delegating_stream_model( 3743 self, 3744 model: StateDelegatingStreamModel, 3745 parent_state: Optional[Mapping[str, Any]] = None, 3746 ) -> DeclarativeStreamModel: 3747 """Return the appropriate underlying stream model based on state.""" 3748 return ( 3749 model.incremental_stream 3750 if self._connector_state_manager.get_stream_state(model.name, None) or parent_state 3751 else model.full_refresh_stream 3752 ) 3753 3754 _OPTIONAL_ASYNC_STATUS_FIELDS = {"skipped"} 3755 3756 def _create_async_job_status_mapping( 3757 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3758 ) -> Mapping[str, AsyncJobStatus]: 3759 api_status_to_cdk_status = {} 3760 for cdk_status, api_statuses in model.dict().items(): 3761 if cdk_status == "type": 3762 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3763 continue 3764 3765 if api_statuses is None: 3766 if cdk_status in self._OPTIONAL_ASYNC_STATUS_FIELDS: 3767 continue 3768 raise ValueError( 3769 f"Required CDK status '{cdk_status}' has no API statuses mapped. " 3770 f"Please provide at least an empty list for required status fields." 3771 ) 3772 3773 for status in api_statuses: 3774 if status in api_status_to_cdk_status: 3775 raise ValueError( 3776 f"API status {status} is already set for CDK status {cdk_status}. 
Please ensure API statuses are only provided once" 3777 ) 3778 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3779 return api_status_to_cdk_status 3780 3781 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3782 match status: 3783 case "running": 3784 return AsyncJobStatus.RUNNING 3785 case "completed": 3786 return AsyncJobStatus.COMPLETED 3787 case "failed": 3788 return AsyncJobStatus.FAILED 3789 case "timeout": 3790 return AsyncJobStatus.TIMED_OUT 3791 case "skipped": 3792 return AsyncJobStatus.SKIPPED 3793 case _: 3794 raise ValueError(f"Unsupported CDK status {status}") 3795 3796 def create_async_retriever( 3797 self, 3798 model: AsyncRetrieverModel, 3799 config: Config, 3800 *, 3801 name: str, 3802 primary_key: Optional[ 3803 Union[str, List[str], List[List[str]]] 3804 ], # this seems to be needed to match create_simple_retriever 3805 stream_slicer: Optional[StreamSlicer], 3806 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3807 transformations: List[RecordTransformation], 3808 **kwargs: Any, 3809 ) -> AsyncRetriever: 3810 if model.download_target_requester and not model.download_target_extractor: 3811 raise ValueError( 3812 f"`download_target_extractor` required if using a `download_target_requester`" 3813 ) 3814 3815 def _get_download_retriever( 3816 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3817 ) -> SimpleRetriever: 3818 # We create a record selector for the download retriever 3819 # with no schema normalization and no transformations, neither record filter 3820 # as all this occurs in the record_selector of the AsyncRetriever 3821 record_selector = RecordSelector( 3822 extractor=extractor, 3823 name=name, 3824 record_filter=None, 3825 transformations=[], 3826 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3827 config=config, 3828 parameters={}, 3829 ) 3830 paginator = ( 3831 self._create_component_from_model( 3832 model=model.download_paginator, 3833 
decoder=_decoder, 3834 config=config, 3835 url_base="", 3836 ) 3837 if model.download_paginator 3838 else NoPagination(parameters={}) 3839 ) 3840 3841 return SimpleRetriever( 3842 requester=requester, 3843 record_selector=record_selector, 3844 primary_key=None, 3845 name=name, 3846 paginator=paginator, 3847 config=config, 3848 parameters={}, 3849 log_formatter=self._get_log_formatter(None, name), 3850 ) 3851 3852 def _get_job_timeout() -> datetime.timedelta: 3853 user_defined_timeout: Optional[int] = ( 3854 int( 3855 InterpolatedString.create( 3856 str(model.polling_job_timeout), 3857 parameters={}, 3858 ).eval(config) 3859 ) 3860 if model.polling_job_timeout 3861 else None 3862 ) 3863 3864 # check for user defined timeout during the test read or 15 minutes 3865 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3866 # default value for non-connector builder is 60 minutes. 3867 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3868 3869 return ( 3870 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3871 ) 3872 3873 decoder = ( 3874 self._create_component_from_model(model=model.decoder, config=config) 3875 if model.decoder 3876 else JsonDecoder(parameters={}) 3877 ) 3878 record_selector = self._create_component_from_model( 3879 model=model.record_selector, 3880 config=config, 3881 decoder=decoder, 3882 name=name, 3883 transformations=transformations, 3884 client_side_incremental_sync=client_side_incremental_sync, 3885 ) 3886 3887 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3888 if self._should_limit_slices_fetched(): 3889 stream_slicer = cast( 3890 StreamSlicer, 3891 StreamSlicerTestReadDecorator( 3892 wrapped_slicer=stream_slicer, 3893 maximum_number_of_slices=self._limit_slices_fetched or 5, 3894 ), 3895 ) 3896 3897 creation_requester = self._create_component_from_model( 3898 model=model.creation_requester, 3899 decoder=decoder, 3900 config=config, 
3901 name=f"job creation - {name}", 3902 ) 3903 polling_requester = self._create_component_from_model( 3904 model=model.polling_requester, 3905 decoder=decoder, 3906 config=config, 3907 name=f"job polling - {name}", 3908 ) 3909 job_download_components_name = f"job download - {name}" 3910 download_decoder = ( 3911 self._create_component_from_model(model=model.download_decoder, config=config) 3912 if model.download_decoder 3913 else JsonDecoder(parameters={}) 3914 ) 3915 download_extractor = ( 3916 self._create_component_from_model( 3917 model=model.download_extractor, 3918 config=config, 3919 decoder=download_decoder, 3920 parameters=model.parameters, 3921 ) 3922 if model.download_extractor 3923 else DpathExtractor( 3924 [], 3925 config=config, 3926 decoder=download_decoder, 3927 parameters=model.parameters or {}, 3928 ) 3929 ) 3930 download_requester = self._create_component_from_model( 3931 model=model.download_requester, 3932 decoder=download_decoder, 3933 config=config, 3934 name=job_download_components_name, 3935 ) 3936 download_retriever = _get_download_retriever( 3937 download_requester, download_extractor, download_decoder 3938 ) 3939 abort_requester = ( 3940 self._create_component_from_model( 3941 model=model.abort_requester, 3942 decoder=decoder, 3943 config=config, 3944 name=f"job abort - {name}", 3945 ) 3946 if model.abort_requester 3947 else None 3948 ) 3949 delete_requester = ( 3950 self._create_component_from_model( 3951 model=model.delete_requester, 3952 decoder=decoder, 3953 config=config, 3954 name=f"job delete - {name}", 3955 ) 3956 if model.delete_requester 3957 else None 3958 ) 3959 download_target_requester = ( 3960 self._create_component_from_model( 3961 model=model.download_target_requester, 3962 decoder=decoder, 3963 config=config, 3964 name=f"job extract_url - {name}", 3965 ) 3966 if model.download_target_requester 3967 else None 3968 ) 3969 status_extractor = self._create_component_from_model( 3970 model=model.status_extractor, 
decoder=decoder, config=config, name=name 3971 ) 3972 download_target_extractor = ( 3973 self._create_component_from_model( 3974 model=model.download_target_extractor, 3975 decoder=decoder, 3976 config=config, 3977 name=name, 3978 ) 3979 if model.download_target_extractor 3980 else None 3981 ) 3982 3983 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3984 creation_requester=creation_requester, 3985 polling_requester=polling_requester, 3986 download_retriever=download_retriever, 3987 download_target_requester=download_target_requester, 3988 abort_requester=abort_requester, 3989 delete_requester=delete_requester, 3990 status_extractor=status_extractor, 3991 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3992 download_target_extractor=download_target_extractor, 3993 job_timeout=_get_job_timeout(), 3994 ) 3995 3996 failed_retry_wait_time_in_seconds: Optional[int] = ( 3997 int( 3998 InterpolatedString.create( 3999 str(model.failed_retry_wait_time_in_seconds), 4000 parameters={}, 4001 ).eval(config) 4002 ) 4003 if model.failed_retry_wait_time_in_seconds 4004 else None 4005 ) 4006 4007 async_job_partition_router = AsyncJobPartitionRouter( 4008 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 4009 job_repository, 4010 stream_slices, 4011 self._job_tracker, 4012 self._message_repository, 4013 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 4014 has_bulk_parent=False, 4015 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 4016 # `None` == default retry is set to 3 attempts, under the hood. 
def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec:
    """
    Build the Spec component, creating a component for every config migration, transformation
    and validation listed under `config_normalization_rules` (each list defaults to empty).
    """
    normalization_rules = model.config_normalization_rules

    def _build_each(rule_models: Any) -> List[Any]:
        return [
            self._create_component_from_model(rule_model, config) for rule_model in rule_models
        ]

    config_migrations = _build_each(
        (normalization_rules.config_migrations if normalization_rules else None) or []
    )
    config_transformations = _build_each(
        (normalization_rules.transformations if normalization_rules else None) or []
    )
    config_validations = _build_each(
        (normalization_rules.validations if normalization_rules else None) or []
    )

    return Spec(
        connection_specification=model.connection_specification,
        documentation_url=model.documentation_url,
        advanced_auth=model.advanced_auth,
        parameters={},
        config_migrations=config_migrations,
        config_transformations=config_transformations,
        config_validations=config_validations,
    )

def create_substream_partition_router(
    self,
    model: SubstreamPartitionRouterModel,
    config: Config,
    *,
    stream_name: str,
    **kwargs: Any,
) -> SubstreamPartitionRouter:
    """
    Build a SubstreamPartitionRouter whose parent stream configs are each created through
    `create_parent_stream_config_with_substream_wrapper`.
    """
    parent_stream_configs = []
    for parent_config_model in model.parent_stream_configs or []:
        parent_stream_configs.append(
            self.create_parent_stream_config_with_substream_wrapper(
                model=parent_config_model, config=config, stream_name=stream_name, **kwargs
            )
        )

    return SubstreamPartitionRouter(
        parent_stream_configs=parent_stream_configs,
        parameters=model.parameters or {},
        config=config,
    )
def create_parent_stream_config_with_substream_wrapper(
    self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any
) -> Any:
    """
    Create a parent stream config through a dedicated ModelToComponentFactory.

    The dedicated factory copies this factory's limits and flags, receives a state manager
    seeded by `_instantiate_parent_stream_state_manager`, and wraps this factory's message
    repository so that log messages are tagged as substream / auxiliary HTTP traffic.
    """
    child_state = self._connector_state_manager.get_stream_state(stream_name, None)

    parent_state: Optional[Mapping[str, Any]] = None
    if model.incremental_dependency and child_state:
        parent_state = child_state

    parent_state_manager = self._instantiate_parent_stream_state_manager(
        child_state, config, model, parent_state
    )

    substream_log_context = {
        "airbyte_cdk": {"stream": {"is_substream": True}},
        "http": {"is_auxiliary": True},
    }
    substream_message_repository = StateFilteringMessageRepository(
        LogAppenderMessageRepositoryDecorator(
            substream_log_context,
            self._message_repository,
            self._evaluate_log_level(self._emit_connector_builder_messages),
        ),
    )

    substream_factory = ModelToComponentFactory(
        connector_state_manager=parent_state_manager,
        limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice,
        limit_slices_fetched=self._limit_slices_fetched,
        emit_connector_builder_messages=self._emit_connector_builder_messages,
        disable_retries=self._disable_retries,
        disable_cache=self._disable_cache,
        message_repository=substream_message_repository,
        api_budget=self._api_budget,
    )

    return substream_factory.create_parent_stream_config(
        model=model, config=config, stream_name=stream_name, **kwargs
    )

def _instantiate_parent_stream_state_manager(
    self,
    child_state: MutableMapping[str, Any],
    config: Config,
    model: ParentStreamConfigModel,
    parent_state: Optional[Mapping[str, Any]] = None,
) -> ConnectorStateManager:
    """
    With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the
    `set_initial_state` flow that existed for the declarative cursors. This state is taken from
    self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account
    for the MessageRepository being different). So we need to pass a ConnectorStateManager to the
    ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if
    incremental_dependency is set.

    Lookup order for the parent state: the per-parent state of the child state, then its global
    state, and finally (only when the global lookup gave a falsy non-dict value and the child
    state holds exactly one value) a stream state built from that single value keyed by the
    parent's interpolated cursor field.
    """
    if not (model.incremental_dependency and child_state):
        return ConnectorStateManager([])

    parent_stream_name = model.stream.name or ""
    parent_stream_state = ConcurrentPerPartitionCursor.get_parent_state(
        child_state, parent_stream_name
    )
    if not parent_stream_state:
        parent_stream_state = ConcurrentPerPartitionCursor.get_global_state(
            child_state, parent_stream_name
        )

    if not parent_stream_state and not isinstance(parent_stream_state, dict):
        child_cursor_values = child_state.values()
        if child_cursor_values and len(child_cursor_values) == 1:
            incremental_sync_model: Union[
                DatetimeBasedCursorModel,
                IncrementingCountCursorModel,
            ]
            if isinstance(model.stream, DeclarativeStreamModel):
                # if we are there, it is because there is incremental_dependency and therefore
                # there is an incremental_sync on the parent stream
                incremental_sync_model = model.stream.incremental_sync  # type: ignore
            else:
                incremental_sync_model = self._get_state_delegating_stream_model(
                    model.stream, parent_state=parent_state
                ).incremental_sync
            cursor_field = InterpolatedString.create(
                incremental_sync_model.cursor_field,
                parameters=incremental_sync_model.parameters or {},
            ).eval(config)
            parent_stream_state = AirbyteStateMessage(
                type=AirbyteStateType.STREAM,
                stream=AirbyteStreamState(
                    stream_descriptor=StreamDescriptor(name=parent_stream_name, namespace=None),
                    stream_state=AirbyteStateBlob(
                        {cursor_field: next(iter(child_cursor_values))}
                    ),
                ),
            )

    initial_states = [parent_stream_state] if parent_stream_state else []
    return ConnectorStateManager(initial_states)
@staticmethod
def create_wait_time_from_header(
    model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any
) -> WaitTimeFromHeaderBackoffStrategy:
    """Build a WaitTimeFromHeaderBackoffStrategy from the model's header, regex and max wait."""
    # The model value is forwarded unchanged, None included.
    max_wait = model.max_waiting_time_in_seconds
    return WaitTimeFromHeaderBackoffStrategy(
        header=model.header,
        parameters=model.parameters or {},
        config=config,
        regex=model.regex,
        max_waiting_time_in_seconds=max_wait,
    )

@staticmethod
def create_wait_until_time_from_header(
    model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any
) -> WaitUntilTimeFromHeaderBackoffStrategy:
    """Build a WaitUntilTimeFromHeaderBackoffStrategy from the model's header, min_wait and regex."""
    strategy = WaitUntilTimeFromHeaderBackoffStrategy(
        header=model.header,
        parameters=model.parameters or {},
        config=config,
        min_wait=model.min_wait,
        regex=model.regex,
    )
    return strategy

def get_message_repository(self) -> MessageRepository:
    """Return the message repository this factory was configured with."""
    repository = self._message_repository
    return repository

def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level:
    """Return Level.DEBUG when connector builder messages are emitted, Level.INFO otherwise."""
    if emit_connector_builder_messages:
        return Level.DEBUG
    return Level.INFO

@staticmethod
def create_components_mapping_definition(
    model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any
) -> ComponentMappingDefinition:
    """
    Build a ComponentMappingDefinition whose value and every field_path entry are wrapped in
    InterpolatedString, and whose value_type is resolved from the JSON schema type name.
    """

    def _interpolated(raw: Any) -> InterpolatedString:
        return InterpolatedString.create(raw, parameters=model.parameters or {})

    value = _interpolated(model.value)
    path_segments = []
    for segment in model.field_path:
        path_segments.append(_interpolated(segment))

    return ComponentMappingDefinition(
        field_path=path_segments,  # type: ignore[arg-type] # field_path can be str and InterpolatedString
        value=value,
        value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
        create_or_update=model.create_or_update,
        condition=model.condition,
        parameters=model.parameters or {},
    )
def create_http_components_resolver(
    self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None
) -> Any:
    """
    Build an HttpComponentsResolver: its retriever, its own stream slicer and its components mapping.

    The retriever is named after `stream_name`, or `__http_components_resolver` when none is given.

    Raises:
        ValueError: if any components mapping definition sets `condition`.
    """
    retriever_name = stream_name if stream_name else "__http_components_resolver"
    retriever_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config)
    retriever = self._create_component_from_model(
        model=model.retriever,
        config=config,
        name=f"{retriever_name}",
        primary_key=None,
        stream_slicer=retriever_slicer,
        transformations=[],
    )

    def _build_mapping(definition_model: Any) -> Any:
        if definition_model.condition:
            raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
        return self._create_component_from_model(
            model=definition_model,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                definition_model.value_type
            ),
            config=config,
        )

    components_mapping = [
        _build_mapping(definition_model) for definition_model in model.components_mapping
    ]

    # The resolver gets a second, separately built slicer rather than the retriever's instance.
    resolver_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config)
    return HttpComponentsResolver(
        retriever=retriever,
        stream_slicer=resolver_slicer,
        config=config,
        components_mapping=components_mapping,
        parameters=model.parameters or {},
    )

@staticmethod
def create_stream_config(
    model: StreamConfigModel, config: Config, **kwargs: Any
) -> StreamConfig:
    """Build a StreamConfig from a copy of the model's configs_pointer (empty list when unset)."""
    pointer: List[Union[InterpolatedString, str]] = []
    if model.configs_pointer:
        pointer = list(model.configs_pointer)

    return StreamConfig(
        configs_pointer=pointer,
        default_values=model.default_values,
        parameters=model.parameters or {},
    )
def create_config_components_resolver(
    self,
    model: ConfigComponentsResolverModel,
    config: Config,
) -> Any:
    """
    Build a ConfigComponentsResolver from one or several stream configs and the components mapping.

    `model.stream_config` may be a single model or a list; a single model is treated as a
    one-element list.
    """
    if isinstance(model.stream_config, list):
        stream_config_models = model.stream_config
    else:
        stream_config_models = [model.stream_config]

    stream_configs = []
    for stream_config_model in stream_config_models:
        stream_configs.append(
            self._create_component_from_model(
                stream_config_model, config=config, parameters=model.parameters or {}
            )
        )

    components_mapping = []
    for definition_model in model.components_mapping:
        components_mapping.append(
            self._create_component_from_model(
                model=definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    definition_model.value_type
                ),
                config=config,
                parameters=model.parameters,
            )
        )

    return ConfigComponentsResolver(
        stream_configs=stream_configs,
        config=config,
        components_mapping=components_mapping,
        parameters=model.parameters or {},
    )

def create_parametrized_components_resolver(
    self,
    model: ParametrizedComponentsResolverModel,
    config: Config,
) -> ParametrizedComponentsResolver:
    """
    Build a ParametrizedComponentsResolver from the model's stream parameters and components mapping.

    Raises:
        ValueError: if any components mapping definition sets `condition`.
    """
    stream_parameters = StreamParametersDefinition(
        list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream
    )

    def _build_mapping(definition_model: Any) -> Any:
        if definition_model.condition:
            raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
        return self._create_component_from_model(
            model=definition_model,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                definition_model.value_type
            ),
            config=config,
        )

    components_mapping = [
        _build_mapping(definition_model) for definition_model in model.components_mapping
    ]
    return ParametrizedComponentsResolver(
        stream_parameters=stream_parameters,
        config=config,
        components_mapping=components_mapping,
        parameters=model.parameters or {},
    )
value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4348 components_mapping_definition_model.value_type 4349 ), 4350 config=config, 4351 ) 4352 ) 4353 return ParametrizedComponentsResolver( 4354 stream_parameters=stream_parameters, 4355 config=config, 4356 components_mapping=components_mapping, 4357 parameters=model.parameters or {}, 4358 ) 4359 4360 _UNSUPPORTED_DECODER_ERROR = ( 4361 "Specified decoder of {decoder_type} is not supported for pagination." 4362 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4363 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4364 ) 4365 4366 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4367 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4368 return True 4369 elif isinstance(decoder, CompositeRawDecoder): 4370 return self._is_supported_parser_for_pagination(decoder.parser) 4371 else: 4372 return False 4373 4374 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4375 if isinstance(parser, JsonParser): 4376 return True 4377 elif isinstance(parser, GzipParser): 4378 return isinstance(parser.inner_parser, JsonParser) 4379 else: 4380 return False 4381 4382 def create_http_api_budget( 4383 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4384 ) -> HttpAPIBudget: 4385 policies = [ 4386 self._create_component_from_model(model=policy, config=config) 4387 for policy in model.policies 4388 ] 4389 4390 return HttpAPIBudget( 4391 policies=policies, 4392 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4393 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4394 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4395 ) 4396 4397 def create_fixed_window_call_rate_policy( 4398 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 
4399 ) -> FixedWindowCallRatePolicy: 4400 matchers = [ 4401 self._create_component_from_model(model=matcher, config=config) 4402 for matcher in model.matchers 4403 ] 4404 4405 # Set the initial reset timestamp to 10 days from now. 4406 # This value will be updated by the first request. 4407 return FixedWindowCallRatePolicy( 4408 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4409 period=parse_duration(model.period), 4410 call_limit=model.call_limit, 4411 matchers=matchers, 4412 ) 4413 4414 def create_file_uploader( 4415 self, model: FileUploaderModel, config: Config, **kwargs: Any 4416 ) -> FileUploader: 4417 name = "File Uploader" 4418 requester = self._create_component_from_model( 4419 model=model.requester, 4420 config=config, 4421 name=name, 4422 **kwargs, 4423 ) 4424 download_target_extractor = self._create_component_from_model( 4425 model=model.download_target_extractor, 4426 config=config, 4427 name=name, 4428 **kwargs, 4429 ) 4430 emit_connector_builder_messages = self._emit_connector_builder_messages 4431 file_uploader = DefaultFileUploader( 4432 requester=requester, 4433 download_target_extractor=download_target_extractor, 4434 config=config, 4435 file_writer=NoopFileWriter() 4436 if emit_connector_builder_messages 4437 else LocalFileSystemFileWriter(), 4438 parameters=model.parameters or {}, 4439 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4440 ) 4441 4442 return ( 4443 ConnectorBuilderFileUploader(file_uploader) 4444 if emit_connector_builder_messages 4445 else file_uploader 4446 ) 4447 4448 def create_moving_window_call_rate_policy( 4449 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4450 ) -> MovingWindowCallRatePolicy: 4451 rates = [ 4452 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4453 ] 4454 matchers = [ 4455 self._create_component_from_model(model=matcher, config=config) 4456 for matcher in model.matchers 4457 ] 4458 
return MovingWindowCallRatePolicy( 4459 rates=rates, 4460 matchers=matchers, 4461 ) 4462 4463 def create_unlimited_call_rate_policy( 4464 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4465 ) -> UnlimitedCallRatePolicy: 4466 matchers = [ 4467 self._create_component_from_model(model=matcher, config=config) 4468 for matcher in model.matchers 4469 ] 4470 4471 return UnlimitedCallRatePolicy( 4472 matchers=matchers, 4473 ) 4474 4475 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4476 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4477 return Rate( 4478 limit=int(interpolated_limit.eval(config=config)), 4479 interval=parse_duration(model.interval), 4480 ) 4481 4482 def create_http_request_matcher( 4483 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4484 ) -> HttpRequestRegexMatcher: 4485 weight = model.weight 4486 if weight is not None: 4487 if isinstance(weight, str): 4488 weight = int(InterpolatedString.create(weight, parameters={}).eval(config)) 4489 else: 4490 weight = int(weight) 4491 if weight < 1: 4492 raise ValueError(f"weight must be >= 1, got {weight}") 4493 return HttpRequestRegexMatcher( 4494 method=model.method, 4495 url_base=model.url_base, 4496 url_path_pattern=model.url_path_pattern, 4497 params=model.params, 4498 headers=model.headers, 4499 weight=weight, 4500 ) 4501 4502 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4503 self._api_budget = self.create_component( 4504 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4505 ) 4506 4507 def create_grouping_partition_router( 4508 self, 4509 model: GroupingPartitionRouterModel, 4510 config: Config, 4511 *, 4512 stream_name: str, 4513 **kwargs: Any, 4514 ) -> GroupingPartitionRouter: 4515 underlying_router = self._create_component_from_model( 4516 model=model.underlying_partition_router, 4517 config=config, 
4518 stream_name=stream_name, 4519 **kwargs, 4520 ) 4521 if model.group_size < 1: 4522 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4523 4524 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4525 # because they are specific to individual partitions and cannot be aggregated or handled 4526 # when grouping, potentially leading to incorrect API calls. Any request customization 4527 # should be managed at the stream level through the requester's configuration. 4528 if isinstance(underlying_router, SubstreamPartitionRouter): 4529 if any( 4530 parent_config.request_option 4531 for parent_config in underlying_router.parent_stream_configs 4532 ): 4533 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4534 4535 if isinstance(underlying_router, ListPartitionRouter): 4536 if underlying_router.request_option: 4537 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4538 4539 return GroupingPartitionRouter( 4540 group_size=model.group_size, 4541 underlying_partition_router=underlying_router, 4542 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4543 config=config, 4544 ) 4545 4546 def _ensure_query_properties_to_model( 4547 self, requester: Union[HttpRequesterModel, CustomRequesterModel] 4548 ) -> None: 4549 """ 4550 For some reason, it seems like CustomRequesterModel request_parameters stays as dictionaries which means that 4551 the other conditions relying on it being QueryPropertiesModel instead of a dict fail. Here, we migrate them to 4552 proper model. 
4553 """ 4554 if not hasattr(requester, "request_parameters"): 4555 return 4556 4557 request_parameters = requester.request_parameters 4558 if request_parameters and isinstance(request_parameters, Dict): 4559 for request_parameter_key in request_parameters.keys(): 4560 request_parameter = request_parameters[request_parameter_key] 4561 if ( 4562 isinstance(request_parameter, Dict) 4563 and request_parameter.get("type") == "QueryProperties" 4564 ): 4565 request_parameters[request_parameter_key] = QueryPropertiesModel.parse_obj( 4566 request_parameter 4567 ) 4568 4569 def _get_catalog_defined_cursor_field( 4570 self, stream_name: str, allow_catalog_defined_cursor_field: bool 4571 ) -> Optional[CursorField]: 4572 if not allow_catalog_defined_cursor_field: 4573 return None 4574 4575 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 4576 4577 # Depending on the operation is being performed, there may not be a configured stream yet. In this 4578 # case we return None which will then use the default cursor field defined on the cursor model. 4579 # We also treat cursor_field: [""] (list with empty string) as no cursor field, since this can 4580 # occur when the platform serializes "no cursor configured" streams incorrectly. 4581 if ( 4582 not configured_stream 4583 or not configured_stream.cursor_field 4584 or not configured_stream.cursor_field[0] 4585 ): 4586 return None 4587 elif len(configured_stream.cursor_field) > 1: 4588 raise ValueError( 4589 f"The `{stream_name}` stream does not support nested cursor_field. Please specify only a single cursor_field for the stream in the configured catalog." 4590 ) 4591 else: 4592 return CursorField( 4593 cursor_field_key=configured_stream.cursor_field[0], 4594 supports_catalog_defined_cursor_field=allow_catalog_defined_cursor_field, 4595 )
def __init__(
    self,
    limit_pages_fetched_per_slice: Optional[int] = None,
    limit_slices_fetched: Optional[int] = None,
    emit_connector_builder_messages: bool = False,
    disable_retries: bool = False,
    disable_cache: bool = False,
    message_repository: Optional[MessageRepository] = None,
    connector_state_manager: Optional[ConnectorStateManager] = None,
    max_concurrent_async_job_count: Optional[int] = None,
    configured_catalog: Optional[ConfiguredAirbyteCatalog] = None,
    api_budget: Optional[APIBudget] = None,
):
    """
    Set up the factory state.

    :param limit_pages_fetched_per_slice: Stored as given
    :param limit_slices_fetched: Stored as given
    :param emit_connector_builder_messages: Stored as given; also passed to `_evaluate_log_level` when the default message repository is created
    :param disable_retries: Stored as given
    :param disable_cache: Stored as given
    :param message_repository: Repository to use; an `InMemoryMessageRepository` is created when falsy
    :param connector_state_manager: State manager to use; a new `ConnectorStateManager` is created when falsy
    :param max_concurrent_async_job_count: Limit handed to the `JobTracker`; 1 is used when falsy
    :param configured_catalog: Catalog used to build the stream-name-to-configured-stream lookup
    :param api_budget: Stored as given
    """
    self._init_mappings()

    self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
    self._limit_slices_fetched = limit_slices_fetched
    self._emit_connector_builder_messages = emit_connector_builder_messages
    self._disable_retries = disable_retries
    self._disable_cache = disable_cache

    # Fall back to an in-memory repository only when the caller did not supply one.
    if not message_repository:
        log_level = self._evaluate_log_level(emit_connector_builder_messages)
        message_repository = InMemoryMessageRepository(log_level)
    self._message_repository = message_repository

    self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream(
        configured_catalog
    )

    if not connector_state_manager:
        connector_state_manager = ConnectorStateManager()
    self._connector_state_manager = connector_state_manager

    self._api_budget: Optional[APIBudget] = api_budget

    job_limit = max_concurrent_async_job_count or 1
    self._job_tracker: JobTracker = JobTracker(job_limit)

    # placeholder for deprecation warnings
    self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
def create_component(
    self,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    config: Config,
    **kwargs: Any,
) -> Any:
    """
    Build a runtime declarative component, and its subcomponents, from a component definition.

    The mapping is first parsed into the given Pydantic model type; declarative components are
    then created from that model.

    :param model_type: The type of declarative component that is being initialized
    :param component_definition: The mapping that represents a declarative component
    :param config: The connector config that is provided by the customer
    :param kwargs: Forwarded to the component creation
    :return: The declarative component to be used at runtime
    :raises ValueError: If the `type` of the mapping is not the name of `model_type`, or the parsed model is not an instance of `model_type`
    """
    expected_type_name = model_type.__name__
    declared_type = component_definition.get("type")
    if declared_type != expected_type_name:
        raise ValueError(
            f"Expected manifest component of type {expected_type_name}, but received {declared_type} instead"
        )

    parsed_model = model_type.parse_obj(component_definition)
    if not isinstance(parsed_model, model_type):
        raise ValueError(
            f"Expected {expected_type_name} component, but received {parsed_model.__class__.__name__}"
        )

    return self._create_component_from_model(model=parsed_model, config=config, **kwargs)
Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating declarative components from that model.
Parameters
- model_type: The type of declarative component that is being initialized
- component_definition: The mapping that represents a declarative component
- config: The connector config that is provided by the customer
Returns
The declarative component to be used at runtime
def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]:
    """
    Give access to the deprecation warnings gathered while components were created.

    :return: The list stored on the factory itself, not a copy
    """
    collected_logs = self._collected_deprecation_logs
    return collected_logs
Returns the deprecation warnings that were collected during the creation of components.
def create_config_migration(
    self, model: ConfigMigrationModel, config: Config
) -> ConfigMigration:
    """
    Build a `ConfigMigration` from its model.

    :param model: The config migration model
    :param config: The connector config that is provided by the customer
    :return: The migration with its description and one created transformation per model transformation
    """
    transformations: List[ConfigTransformation] = []
    for transformation_model in model.transformations:
        transformations.append(self._create_component_from_model(transformation_model, config))

    return ConfigMigration(description=model.description, transformations=transformations)
def create_config_add_fields(
    self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any
) -> ConfigAddFields:
    """
    Build a `ConfigAddFields` transformation from its model.

    :param model: The model listing the fields to add and an optional condition
    :param config: The connector config that is provided by the customer
    :return: The transformation; an unset (falsy) condition becomes an empty string
    """
    created_fields = []
    for field_model in model.fields:
        created_fields.append(self._create_component_from_model(field_model, config))

    condition = model.condition or ""
    return ConfigAddFields(fields=created_fields, condition=condition)
@staticmethod
def create_added_field_definition(
    model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
) -> AddedFieldDefinition:
    """
    Build an `AddedFieldDefinition` from its model.

    :param model: The model holding the path, the value and the optional value type of the field
    :param config: The connector config (not used by this method)
    :return: The definition with an interpolated value and the value type resolved from its JSON schema type name
    """
    value = InterpolatedString.create(model.value, parameters=model.parameters or {})
    value_type = ModelToComponentFactory._json_schema_type_name_to_type(model.value_type)
    return AddedFieldDefinition(
        path=model.path,
        value=value,
        value_type=value_type,
        parameters=model.parameters or {},
    )
def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields:
    """
    Build an `AddFields` transformation from its model.

    :param model: The model listing the field definitions and an optional condition
    :param config: The connector config that is provided by the customer
    :return: The transformation; an unset (falsy) condition becomes an empty string
    """
    added_field_definitions = []
    for field_definition_model in model.fields:
        # The value type is resolved here and handed to the component creation as a keyword.
        value_type = ModelToComponentFactory._json_schema_type_name_to_type(
            field_definition_model.value_type
        )
        added_field_definitions.append(
            self._create_component_from_model(
                model=field_definition_model,
                value_type=value_type,
                config=config,
            )
        )

    return AddFields(
        fields=added_field_definitions,
        condition=model.condition or "",
        parameters=model.parameters or {},
    )
def create_dpath_flatten_fields(
    self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
) -> DpathFlattenFields:
    """
    Build a `DpathFlattenFields` transformation from its model.

    :param model: The flatten-fields model
    :param config: The connector config that is provided by the customer
    :return: The transformation; `delete_origin_value` and `replace_record` default to False when unset, and `key_transformation` is None when the model has none
    """
    field_path: List[Union[InterpolatedString, str]] = list(model.field_path)

    if model.key_transformation is None:
        key_transformation = None
    else:
        key_transformation = KeyTransformation(
            config=config,
            prefix=model.key_transformation.prefix,
            suffix=model.key_transformation.suffix,
            parameters=model.parameters or {},
        )

    delete_origin_value = (
        False if model.delete_origin_value is None else model.delete_origin_value
    )
    replace_record = False if model.replace_record is None else model.replace_record

    return DpathFlattenFields(
        config=config,
        field_path=field_path,
        delete_origin_value=delete_origin_value,
        replace_record=replace_record,
        key_transformation=key_transformation,
        parameters=model.parameters or {},
    )
def create_api_key_authenticator(
    self,
    model: ApiKeyAuthenticatorModel,
    config: Config,
    token_provider: Optional[TokenProvider] = None,
    **kwargs: Any,
) -> ApiKeyAuthenticator:
    """
    Build an `ApiKeyAuthenticator` from its model.

    :param model: The authenticator model; exactly one of `inject_into` and the deprecated `header` must be set
    :param config: The connector config that is provided by the customer
    :param token_provider: Optional provider used instead of the `api_token` of the model
    :return: The authenticator with its token provider and request option
    :raises ValueError: If neither or both of `inject_into` and `header` are set, or if a token provider is given while `api_token` is not an empty string
    """
    has_inject_into = model.inject_into is not None
    has_header = model.header is not None

    if not has_inject_into and not has_header:
        raise ValueError(
            "Expected either inject_into or header to be set for ApiKeyAuthenticator"
        )
    if has_inject_into and has_header:
        raise ValueError(
            "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option"
        )
    if token_provider is not None and model.api_token != "":
        raise ValueError(
            "If token_provider is set, api_token is ignored and has to be set to empty string."
        )

    if model.inject_into:
        request_option = self._create_component_from_model(
            model.inject_into, config, parameters=model.parameters or {}
        )
    else:
        # Deprecated path: the key is injected as a header named after `header`.
        request_option = RequestOption(
            inject_into=RequestOptionType.header,
            field_name=model.header or "",
            parameters=model.parameters or {},
        )

    if token_provider is None:
        effective_token_provider = InterpolatedStringTokenProvider(
            api_token=model.api_token or "",
            config=config,
            parameters=model.parameters or {},
        )
    else:
        effective_token_provider = token_provider

    return ApiKeyAuthenticator(
        token_provider=effective_token_provider,
        request_option=request_option,
        config=config,
        parameters=model.parameters or {},
    )
def create_legacy_to_per_partition_state_migration(
    self,
    model: LegacyToPerPartitionStateMigrationModel,
    config: Mapping[str, Any],
    declarative_stream: DeclarativeStreamModel,
) -> LegacyToPerPartitionStateMigration:
    """
    Build a `LegacyToPerPartitionStateMigration` for a declarative stream.

    :param model: The migration model (not read by this method)
    :param config: The connector config that is provided by the customer
    :param declarative_stream: The stream model the migration applies to
    :return: The migration built from the partition router, incremental sync and parameters of the stream
    :raises ValueError: If the retriever is neither a SimpleRetriever nor an AsyncRetriever, the partition router is neither a substream nor a custom partition router, or the expected attributes are missing
    """
    supported_retrievers = (SimpleRetrieverModel, AsyncRetrieverModel)
    supported_routers = (SubstreamPartitionRouterModel, CustomPartitionRouterModel)

    retriever = declarative_stream.retriever
    if not isinstance(retriever, supported_retrievers):
        raise ValueError(
            f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
        )

    partition_router = retriever.partition_router
    if not isinstance(partition_router, supported_routers):
        raise ValueError(
            f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
        )

    # NOTE(review): these are attribute-presence checks only; if the models always declare
    # these fields, the checks never fail even when the value is None - confirm.
    if not hasattr(partition_router, "parent_stream_configs"):
        raise ValueError(
            "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
        )
    if not hasattr(declarative_stream, "incremental_sync"):
        raise ValueError(
            "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
        )

    return LegacyToPerPartitionStateMigration(
        partition_router,  # type: ignore # was already checked above
        declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
        config,
        declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
    )
def create_session_token_authenticator(
    self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
    """
    Build the authenticator described by a session token authenticator model.

    A login requester and a `SessionTokenProvider` are created first. The provider is then
    plugged into a bearer authenticator when the request authentication type is `Bearer`,
    and into an API key authenticator otherwise.

    :param model: The session token authenticator model
    :param config: The connector config that is provided by the customer
    :param name: Prefix of the login requester name (`<name>_login_requester`)
    :return: A bearer or an API key authenticator backed by the session token
    """
    if model.decoder:
        decoder = self._create_component_from_model(model=model.decoder, config=config)
    else:
        decoder = JsonDecoder(parameters={})

    login_requester = self._create_component_from_model(
        model=model.login_requester,
        config=config,
        name=f"{name}_login_requester",
        decoder=decoder,
    )

    expiration_duration = (
        parse_duration(model.expiration_duration) if model.expiration_duration else None
    )
    session_token_provider = SessionTokenProvider(
        login_requester=login_requester,
        session_token_path=model.session_token_path,
        expiration_duration=expiration_duration,
        parameters=model.parameters or {},
        message_repository=self._message_repository,
        decoder=decoder,
    )

    request_authentication = model.request_authentication
    if request_authentication.type != "Bearer":
        # Get the api_token template if specified, default to just the session token
        api_token_template = (
            getattr(request_authentication, "api_token", None) or "{{ session_token }}"
        )
        interpolated_token_provider: TokenProvider = InterpolatedSessionTokenProvider(
            config=config,
            api_token=api_token_template,
            session_token_provider=session_token_provider,
            parameters=model.parameters or {},
        )
        return self.create_api_key_authenticator(
            ApiKeyAuthenticatorModel(
                type="ApiKeyAuthenticator",
                api_token="",
                inject_into=request_authentication.inject_into,
            ),  # type: ignore # $parameters and headers default to None
            config=config,
            token_provider=interpolated_token_provider,
        )

    return ModelToComponentFactory.create_bearer_authenticator(
        BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
        config,
        token_provider=session_token_provider,
    )
@staticmethod
def create_basic_http_authenticator(
    model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
) -> BasicHttpAuthenticator:
    """
    Build a `BasicHttpAuthenticator` from its model.

    :param model: The authenticator model
    :param config: The connector config that is provided by the customer
    :return: The authenticator; an unset (falsy) password becomes an empty string
    """
    password = model.password or ""
    parameters = model.parameters or {}
    return BasicHttpAuthenticator(
        password=password,
        username=model.username,
        config=config,
        parameters=parameters,
    )
1224 @staticmethod 1225 def create_bearer_authenticator( 1226 model: BearerAuthenticatorModel, 1227 config: Config, 1228 token_provider: Optional[TokenProvider] = None, 1229 **kwargs: Any, 1230 ) -> BearerAuthenticator: 1231 if token_provider is not None and model.api_token != "": 1232 raise ValueError( 1233 "If token_provider is set, api_token is ignored and has to be set to empty string." 1234 ) 1235 return BearerAuthenticator( 1236 token_provider=( 1237 token_provider 1238 if token_provider is not None 1239 else InterpolatedStringTokenProvider( 1240 api_token=model.api_token or "", 1241 config=config, 1242 parameters=model.parameters or {}, 1243 ) 1244 ), 1245 config=config, 1246 parameters=model.parameters or {}, 1247 )
@staticmethod
def create_dynamic_stream_check_config(
    model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any
) -> DynamicStreamCheckConfig:
    """
    Build a `DynamicStreamCheckConfig` carrying the stream name and stream count of the model.

    :param model: The dynamic stream check config model
    :param config: The connector config (not used by this method)
    """
    stream_name = model.dynamic_stream_name
    stream_count = model.stream_count
    return DynamicStreamCheckConfig(dynamic_stream_name=stream_name, stream_count=stream_count)
def create_check_stream(
    self, model: CheckStreamModel, config: Config, **kwargs: Any
) -> CheckStream:
    """
    Build a `CheckStream` from its model.

    :param model: The check stream model
    :param config: The connector config that is provided by the customer
    :return: The check with the stream names and the created dynamic stream check configs; both default to empty lists
    :raises ValueError: If both `stream_names` and `dynamic_streams_check_configs` are None
    """
    check_config_models = model.dynamic_streams_check_configs
    if check_config_models is None and model.stream_names is None:
        raise ValueError(
            "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
        )

    dynamic_streams_check_configs = []
    for check_config_model in check_config_models or []:
        dynamic_streams_check_configs.append(
            self._create_component_from_model(model=check_config_model, config=config)
        )

    return CheckStream(
        stream_names=model.stream_names or [],
        dynamic_streams_check_configs=dynamic_streams_check_configs,
        parameters={},
    )
1281 @staticmethod 1282 def create_check_dynamic_stream( 1283 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1284 ) -> CheckDynamicStream: 1285 assert model.use_check_availability is not None # for mypy 1286 1287 use_check_availability = model.use_check_availability 1288 1289 return CheckDynamicStream( 1290 stream_count=model.stream_count, 1291 use_check_availability=use_check_availability, 1292 parameters={}, 1293 )
def create_composite_error_handler(
    self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any
) -> CompositeErrorHandler:
    """
    Build a `CompositeErrorHandler` with one created handler per error handler of the model.

    :param model: The composite error handler model
    :param config: The connector config that is provided by the customer
    """
    error_handlers = []
    for handler_model in model.error_handlers:
        error_handlers.append(
            self._create_component_from_model(model=handler_model, config=config)
        )

    parameters = model.parameters or {}
    return CompositeErrorHandler(error_handlers=error_handlers, parameters=parameters)
@staticmethod
def create_concurrency_level(
    model: ConcurrencyLevelModel, config: Config, **kwargs: Any
) -> ConcurrencyLevel:
    """
    Build a `ConcurrencyLevel` carrying the default and maximum concurrency of the model.

    :param model: The concurrency level model
    :param config: The connector config that is provided by the customer
    """
    default_concurrency = model.default_concurrency
    max_concurrency = model.max_concurrency
    return ConcurrencyLevel(
        default_concurrency=default_concurrency,
        max_concurrency=max_concurrency,
        config=config,
        parameters={},
    )
1317 @staticmethod 1318 def apply_stream_state_migrations( 1319 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1320 ) -> MutableMapping[str, Any]: 1321 if stream_state_migrations: 1322 for state_migration in stream_state_migrations: 1323 if state_migration.should_migrate(stream_state): 1324 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 1325 stream_state = dict(state_migration.migrate(stream_state)) 1326 return stream_state
def create_concurrent_cursor_from_datetime_based_cursor(
    self,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    stream_name: str,
    stream_namespace: Optional[str],
    stream_state: MutableMapping[str, Any],
    config: Config,
    message_repository: Optional[MessageRepository] = None,
    runtime_lookback_window: Optional[datetime.timedelta] = None,
    **kwargs: Any,
) -> ConcurrentCursor:
    """
    Build a `ConcurrentCursor` from a datetime based cursor component definition.

    Note that two of the inputs may be modified in place: `component_definition` gets a
    `$parameters` entry copied from `parameters` when only the latter is present, and
    `stream_state` gets its cursor value shifted back when `runtime_lookback_window` is set.

    :param model_type: The Pydantic model type the definition is parsed into
    :param component_definition: The mapping that represents the datetime based cursor
    :param stream_name: Name of the stream the cursor belongs to
    :param stream_namespace: Namespace of the stream, if any
    :param stream_state: Current state of the stream
    :param config: The connector config that is provided by the customer
    :param message_repository: Repository used by the cursor; the repository of the factory is used when falsy
    :param runtime_lookback_window: Duration subtracted from the cursor value found in `stream_state`
    :return: The concurrent cursor
    :raises ValueError: If the definition `type` is not the name of `model_type`, the parsed model is not a
        `DatetimeBasedCursorModel`, only one of `step` and `cursor_granularity` is defined, WEEK clamping
        has no `weekday` target detail, or the clamping target is not DAY, WEEK or MONTH
    """
    component_type = component_definition.get("type")
    if component_definition.get("type") != model_type.__name__:
        raise ValueError(
            f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
        )

    # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
    # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
    # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
    # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
    if "$parameters" not in component_definition and "parameters" in component_definition:
        component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore  # This is a dict
    datetime_based_cursor_model = model_type.parse_obj(component_definition)

    if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
        raise ValueError(
            f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
        )

    model_parameters = datetime_based_cursor_model.parameters or {}

    # A cursor field configured in the catalog takes precedence over the one of the model.
    cursor_field = self._get_catalog_defined_cursor_field(
        stream_name=stream_name,
        allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
        or False,
    )

    if not cursor_field:
        interpolated_cursor_field = InterpolatedString.create(
            datetime_based_cursor_model.cursor_field,
            parameters=model_parameters,
        )
        cursor_field = CursorField(
            cursor_field_key=interpolated_cursor_field.eval(config=config),
            supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

    # Slice boundary field names default to `start_time` / `end_time` when unset on the model.
    interpolated_partition_field_start = InterpolatedString.create(
        datetime_based_cursor_model.partition_field_start or "start_time",
        parameters=model_parameters,
    )
    interpolated_partition_field_end = InterpolatedString.create(
        datetime_based_cursor_model.partition_field_end or "end_time",
        parameters=model_parameters,
    )

    slice_boundary_fields = (
        interpolated_partition_field_start.eval(config=config),
        interpolated_partition_field_end.eval(config=config),
    )

    datetime_format = datetime_based_cursor_model.datetime_format

    cursor_granularity = (
        parse_duration(datetime_based_cursor_model.cursor_granularity)
        if datetime_based_cursor_model.cursor_granularity
        else None
    )

    # The lookback window stays None unless it is defined and evaluates to a non-empty value.
    lookback_window = None
    interpolated_lookback_window = (
        InterpolatedString.create(
            datetime_based_cursor_model.lookback_window,
            parameters=model_parameters,
        )
        if datetime_based_cursor_model.lookback_window
        else None
    )
    if interpolated_lookback_window:
        evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
        if evaluated_lookback_window:
            lookback_window = parse_duration(evaluated_lookback_window)

    connector_state_converter: DateTimeStreamStateConverter
    connector_state_converter = CustomFormatConcurrentStreamStateConverter(
        datetime_format=datetime_format,
        input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
        is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
        cursor_granularity=cursor_granularity,
    )

    # Adjusts the stream state by applying the runtime lookback window.
    # This is used to ensure correct state handling in case of failed partitions.
    stream_state_value = stream_state.get(cursor_field.cursor_field_key)
    if runtime_lookback_window and stream_state_value:
        new_stream_state = (
            connector_state_converter.parse_timestamp(stream_state_value)
            - runtime_lookback_window
        )
        # The caller's `stream_state` mapping is updated in place.
        stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
            new_stream_state
        )

    # Start and end datetimes are either MinMaxDatetime models, which are turned into
    # components, or plain values that are passed on as they are.
    start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
    if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
        start_date_runtime_value = self.create_min_max_datetime(
            model=datetime_based_cursor_model.start_datetime, config=config
        )
    else:
        start_date_runtime_value = datetime_based_cursor_model.start_datetime

    end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]]
    if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel):
        end_date_runtime_value = self.create_min_max_datetime(
            model=datetime_based_cursor_model.end_datetime, config=config
        )
    else:
        end_date_runtime_value = datetime_based_cursor_model.end_datetime

    interpolated_start_date = MinMaxDatetime.create(
        interpolated_string_or_min_max_datetime=start_date_runtime_value,
        parameters=datetime_based_cursor_model.parameters,
    )
    interpolated_end_date = (
        None
        if not end_date_runtime_value
        else MinMaxDatetime.create(
            end_date_runtime_value, datetime_based_cursor_model.parameters
        )
    )

    # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
    if not interpolated_start_date.datetime_format:
        interpolated_start_date.datetime_format = datetime_format
    if interpolated_end_date and not interpolated_end_date.datetime_format:
        interpolated_end_date.datetime_format = datetime_format

    # The start date is resolved once here; the end date is resolved lazily through a provider.
    start_date = interpolated_start_date.get_datetime(config=config)
    end_date_provider = (
        partial(interpolated_end_date.get_datetime, config)
        if interpolated_end_date
        else connector_state_converter.get_end_provider()
    )

    # `step` and `cursor_granularity` must be defined together or not at all.
    if (
        datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity
    ) or (
        not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity
    ):
        raise ValueError(
            f"If step is defined, cursor_granularity should be as well and vice-versa. "
            f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`"
        )

    # When step is not defined, default to a step size from the starting date to the present moment
    step_length = datetime.timedelta.max
    interpolated_step = (
        InterpolatedString.create(
            datetime_based_cursor_model.step,
            parameters=model_parameters,
        )
        if datetime_based_cursor_model.step
        else None
    )
    if interpolated_step:
        evaluated_step = interpolated_step.eval(config)
        if evaluated_step:
            step_length = parse_duration(evaluated_step)

    clamping_strategy: ClampingStrategy = NoClamping()
    if datetime_based_cursor_model.clamping:
        # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
        # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime
        # object which we want to keep agnostic of being low-code
        target = InterpolatedString(
            string=datetime_based_cursor_model.clamping.target,
            parameters=model_parameters,
        )
        evaluated_target = target.eval(config=config)
        # Each supported target sets the clamping strategy and wraps the end provider in a
        # ClampingEndProvider that uses the same strategy type with `is_ceiling=False`.
        match evaluated_target:
            case "DAY":
                clamping_strategy = DayClampingStrategy()
                end_date_provider = ClampingEndProvider(
                    DayClampingStrategy(is_ceiling=False),
                    end_date_provider,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                    granularity=cursor_granularity or datetime.timedelta(seconds=1),
                )
            case "WEEK":
                if (
                    not datetime_based_cursor_model.clamping.target_details
                    or "weekday" not in datetime_based_cursor_model.clamping.target_details
                ):
                    raise ValueError(
                        "Given WEEK clamping, weekday needs to be provided as target_details"
                    )
                weekday = self._assemble_weekday(
                    datetime_based_cursor_model.clamping.target_details["weekday"]
                )
                clamping_strategy = WeekClampingStrategy(weekday)
                end_date_provider = ClampingEndProvider(
                    WeekClampingStrategy(weekday, is_ceiling=False),
                    end_date_provider,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                    granularity=cursor_granularity or datetime.timedelta(days=1),
                )
            case "MONTH":
                clamping_strategy = MonthClampingStrategy()
                end_date_provider = ClampingEndProvider(
                    MonthClampingStrategy(is_ceiling=False),
                    end_date_provider,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                    granularity=cursor_granularity or datetime.timedelta(days=1),
                )
            case _:
                raise ValueError(
                    f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
                )

    return ConcurrentCursor(
        stream_name=stream_name,
        stream_namespace=stream_namespace,
        stream_state=stream_state,
        message_repository=message_repository or self._message_repository,
        connector_state_manager=self._connector_state_manager,
        connector_state_converter=connector_state_converter,
        cursor_field=cursor_field,
        slice_boundary_fields=slice_boundary_fields,
        start=start_date,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        end_provider=end_date_provider,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        lookback_window=lookback_window,
        slice_range=step_length,
        cursor_granularity=cursor_granularity,
        clamping_strategy=clamping_strategy,
    )
def create_concurrent_cursor_from_incrementing_count_cursor(
    self,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    stream_name: str,
    stream_namespace: Optional[str],
    stream_state: MutableMapping[str, Any],
    config: Config,
    message_repository: Optional[MessageRepository] = None,
    **kwargs: Any,
) -> ConcurrentCursor:
    """
    Builds a ConcurrentCursor from an IncrementingCountCursor component definition.

    :param model_type: The Pydantic model class the definition is parsed with; its name must match the definition's `type`
    :param component_definition: The raw component definition of the cursor
    :param stream_name: Name of the stream the cursor belongs to
    :param stream_namespace: Namespace of the stream, if any
    :param stream_state: The incoming state of the stream
    :param config: The connector config used to evaluate interpolated values
    :param message_repository: Optional repository; falls back to the factory's own repository when not provided
    :return: A ConcurrentCursor using an IncrementingCountStreamStateConverter with sequential state
    :raises ValueError: If the definition's `type` does not match `model_type` or does not parse into an IncrementingCountCursorModel
    """
    declared_type = component_definition.get("type")
    if declared_type != model_type.__name__:
        raise ValueError(
            f"Expected manifest component of type {model_type.__name__}, but received {declared_type} instead"
        )

    cursor_model = model_type.parse_obj(component_definition)
    if not isinstance(cursor_model, IncrementingCountCursorModel):
        raise ValueError(
            f"Expected {model_type.__name__} component, but received {cursor_model.__class__.__name__}"
        )

    # Pydantic Union type coercion can convert int 0 to string '0' depending on Union order,
    # so both int and str representations are funnelled through str() -> interpolation -> int().
    raw_start: Union[int, str, None] = cursor_model.start_value
    if raw_start is None:
        start: int = 0
    else:
        start_template = InterpolatedString.create(
            str(raw_start),  # InterpolatedString.create must receive a string
            parameters=cursor_model.parameters or {},
        )
        start = int(start_template.eval(config=config))

    catalog_cursor_allowed = cursor_model.allow_catalog_defined_cursor_field or False

    # A cursor field coming from the catalog takes precedence over the one declared on the model.
    cursor_field = self._get_catalog_defined_cursor_field(
        stream_name=stream_name,
        allow_catalog_defined_cursor_field=catalog_cursor_allowed,
    )
    if not cursor_field:
        cursor_field_template = InterpolatedString.create(
            cursor_model.cursor_field,
            parameters=cursor_model.parameters or {},
        )
        cursor_field = CursorField(
            cursor_field_key=cursor_field_template.eval(config=config),
            supports_catalog_defined_cursor_field=catalog_cursor_allowed,
        )

    state_converter = IncrementingCountStreamStateConverter(
        is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
    )

    return ConcurrentCursor(
        stream_name=stream_name,
        stream_namespace=stream_namespace,
        stream_state=stream_state,
        message_repository=message_repository or self._message_repository,
        connector_state_manager=self._connector_state_manager,
        connector_state_converter=state_converter,
        cursor_field=cursor_field,
        slice_boundary_fields=None,
        start=start,  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        end_provider=state_converter.get_end_provider(),  # type: ignore  # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
    )
def create_concurrent_cursor_from_perpartition_cursor(
    self,
    state_manager: ConnectorStateManager,
    model_type: Type[BaseModel],
    component_definition: ComponentDefinition,
    stream_name: str,
    stream_namespace: Optional[str],
    config: Config,
    stream_state: MutableMapping[str, Any],
    partition_router: PartitionRouter,
    attempt_to_create_cursor_if_not_provided: bool = False,
    **kwargs: Any,
) -> ConcurrentPerPartitionCursor:
    """
    Builds a ConcurrentPerPartitionCursor whose per-partition cursors are datetime based.

    :param state_manager: State manager handed to both the per-partition cursor and the cursors it creates
    :param model_type: The Pydantic model class the definition is parsed with; its name must match the definition's `type`
    :param component_definition: The raw cursor definition; `$parameters` is filled in place from `parameters` when missing
    :param stream_name: Name of the stream the cursor belongs to
    :param stream_namespace: Namespace of the stream, if any
    :param config: The connector config used to evaluate interpolated values
    :param stream_state: The incoming state of the stream
    :param partition_router: Router producing the partitions the cursor tracks
    :param attempt_to_create_cursor_if_not_provided: Forwarded as-is to ConcurrentPerPartitionCursor
    :return: The assembled ConcurrentPerPartitionCursor
    :raises ValueError: If the definition's `type` does not match `model_type` or does not parse into a DatetimeBasedCursorModel
    """
    declared_type = component_definition.get("type")
    if declared_type != model_type.__name__:
        raise ValueError(
            f"Expected manifest component of type {model_type.__name__}, but received {declared_type} instead"
        )

    # FIXME the interfaces of the concurrent cursor take a `ComponentDefinition` instead of the actual model
    # because the ConcurrentDeclarativeSource didn't have access to the models. So now we have two cases:
    #  * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
    #  * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
    # Those interfaces should use the model once the code in CDS is cleaned up, at which point the parameter
    # propagation should happen as part of the ModelToComponentFactory.
    if "parameters" in component_definition and "$parameters" not in component_definition:
        component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore  # This is a dict

    cursor_model = model_type.parse_obj(component_definition)
    if not isinstance(cursor_model, DatetimeBasedCursorModel):
        raise ValueError(
            f"Expected {model_type.__name__} component, but received {cursor_model.__class__.__name__}"
        )

    catalog_cursor_allowed = cursor_model.allow_catalog_defined_cursor_field or False

    # A cursor field coming from the catalog takes precedence over the one declared on the model.
    cursor_field = self._get_catalog_defined_cursor_field(
        stream_name=stream_name,
        allow_catalog_defined_cursor_field=catalog_cursor_allowed,
    )
    if not cursor_field:
        cursor_field_template = InterpolatedString.create(
            cursor_model.cursor_field,
            # See the FIXME above: parameters are read from the parsed model here.
            parameters=cursor_model.parameters or {},
        )
        cursor_field = CursorField(
            cursor_field_key=cursor_field_template.eval(config=config),
            supports_catalog_defined_cursor_field=catalog_cursor_allowed,
        )

    granularity = None
    if cursor_model.cursor_granularity:
        granularity = parse_duration(cursor_model.cursor_granularity)

    state_converter: DateTimeStreamStateConverter = CustomFormatConcurrentStreamStateConverter(
        datetime_format=cursor_model.datetime_format,
        input_datetime_formats=cursor_model.cursor_datetime_formats,
        is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
        cursor_granularity=granularity,
    )

    # Factory used to create one datetime-based cursor per partition; these cursors get a no-op repository.
    build_partition_cursor = partial(
        self.create_concurrent_cursor_from_datetime_based_cursor,
        state_manager=state_manager,
        model_type=model_type,
        component_definition=component_definition,
        stream_name=stream_name,
        stream_namespace=stream_namespace,
        config=config,
        message_repository=NoopMessageRepository(),
    )
    cursor_factory = ConcurrentCursorFactory(build_partition_cursor)

    # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
    use_global_cursor = isinstance(
        partition_router, GroupingPartitionRouter
    ) or component_definition.get("global_substream_cursor", False)

    return ConcurrentPerPartitionCursor(
        cursor_factory=cursor_factory,
        partition_router=partition_router,
        stream_name=stream_name,
        stream_namespace=stream_namespace,
        stream_state=stream_state,
        message_repository=self._message_repository,  # type: ignore
        connector_state_manager=state_manager,
        connector_state_converter=state_converter,
        cursor_field=cursor_field,
        use_global_cursor=use_global_cursor,
        attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided,
    )
@staticmethod
def create_constant_backoff_strategy(
    model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
) -> ConstantBackoffStrategy:
    """
    Builds a ConstantBackoffStrategy from its model.

    The model's jitter range is passed to `_validate_jitter_range` before the strategy is constructed.

    :param model: The Pydantic model of the constant backoff strategy
    :param config: The connector config
    :return: The ConstantBackoffStrategy to be used at runtime
    """
    jitter_range = model.jitter_range_in_seconds
    ModelToComponentFactory._validate_jitter_range(jitter_range)
    strategy = ConstantBackoffStrategy(
        backoff_time_in_seconds=model.backoff_time_in_seconds,
        jitter_range_in_seconds=jitter_range,
        config=config,
        parameters=model.parameters or {},
    )
    return strategy
def create_cursor_pagination(
    self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any
) -> CursorPaginationStrategy:
    """
    Builds a CursorPaginationStrategy from its model.

    :param model: The Pydantic model of the cursor pagination strategy
    :param config: The connector config
    :param decoder: Decoder for responses; wrapped in a PaginationDecoderDecorator unless it already is one
    :return: The CursorPaginationStrategy to be used at runtime
    :raises ValueError: If the underlying (unwrapped) decoder is not supported for pagination
    """
    if isinstance(decoder, PaginationDecoderDecorator):
        wrapped_decoder = decoder
        inner_decoder = decoder.decoder
    else:
        inner_decoder = decoder
        wrapped_decoder = PaginationDecoderDecorator(decoder=decoder)

    # The support check is performed on the unwrapped decoder, not on the decorator.
    if not self._is_supported_decoder_for_pagination(inner_decoder):
        raise ValueError(
            self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
        )

    # Pydantic v1 Union type coercion can convert int to string depending on Union order.
    # A purely numeric string (i.e. not an interpolation) is converted back to an int.
    page_size = model.page_size
    if isinstance(page_size, str) and page_size.isdigit():
        page_size = int(page_size)

    return CursorPaginationStrategy(
        cursor_value=model.cursor_value,
        decoder=wrapped_decoder,
        page_size=page_size,
        stop_condition=model.stop_condition,
        config=config,
        parameters=model.parameters or {},
    )
def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any:
    """
    Generically creates a custom component based on the model type and a class_name reference to the custom Python class being
    instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor
    :param model: The Pydantic model of the custom component being created
    :param config: The custom defined connector config
    :return: The declarative component built from the Pydantic model to be used at runtime
    """
    component_cls = self._get_class_from_fully_qualified_class_name(model.class_name)
    hinted_fields = get_type_hints(component_cls)

    # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field
    # collisions we defer to these arguments over the component's definition
    model_args = model.dict()
    model_args["config"] = config
    model_args.update(kwargs)

    def _infer_missing_type(field_name: str, candidate: Any) -> None:
        # If a custom component field doesn't have a type set, we try to use the type hints to infer the type
        if (
            isinstance(candidate, dict)
            and "type" not in candidate
            and field_name in hinted_fields
        ):
            inferred = self._derive_component_type_from_type_hints(hinted_fields.get(field_name))
            if inferred:
                candidate["type"] = inferred

    def _build_nested(field_name: str, definition: Any) -> Any:
        return self._create_nested_component(model, field_name, definition, config, **kwargs)

    def _resolve_list_item(field_name: str, item: Any) -> Any:
        _infer_missing_type(field_name, item)
        return _build_nested(field_name, item) if self._is_component(item) else item

    # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and
    # types are not defined in the schema. The fields and types are defined within the Python class implementation.
    # Pydantic can only parse down to the custom component and this code performs a second parse to convert the
    # sub-fields first into models, then declarative components
    for field_name, field_value in model_args.items():
        _infer_missing_type(field_name, field_value)

        if self._is_component(field_value):
            model_args[field_name] = _build_nested(field_name, field_value)
        elif isinstance(field_value, list):
            model_args[field_name] = [
                _resolve_list_item(field_name, item) for item in field_value
            ]

    # Only arguments matching a type-hinted field of the custom class reach its constructor.
    constructor_args = {
        field_name: model_args[field_name]
        for field_name in hinted_fields
        if field_name in model_args
    }

    if "api_budget" in hinted_fields and constructor_args.get("api_budget") is None:
        constructor_args["api_budget"] = self._api_budget

    return component_cls(**constructor_args)
Generically creates a custom component based on the model type and a class_name reference to the custom Python class being instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor
Parameters
- model: The Pydantic model of the custom component being created
- config: The custom defined connector config
Returns
The declarative component built from the Pydantic model to be used at runtime
def create_default_stream(
    self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any
) -> AbstractStream:
    """
    Builds a DefaultStream from a DeclarativeStream model.

    The method wires together the partition router, the concurrent cursor, the request options provider matching
    the incremental sync type, the retriever and the schema loader.

    :param model: The Pydantic model of the declarative stream
    :param config: The connector config
    :param is_parent: Not referenced in this method body
    :param kwargs: Forwarded to `_build_stream_slicer_from_partition_router`
    :return: The DefaultStream to be used at runtime
    :raises ValueError: If an IncrementingCountCursor is combined with a ConcurrentPerPartitionCursor
    """
    primary_key = model.primary_key.__root__ if model.primary_key else None
    # State migration runs before the cursor is built.
    self._migrate_state(model, config)

    partition_router = self._build_stream_slicer_from_partition_router(
        model.retriever,
        config,
        stream_name=model.name,
        **kwargs,
    )
    concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
    # Pick the request options provider based on the kind of incremental sync declared on the stream.
    if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
        cursor_model: DatetimeBasedCursorModel = model.incremental_sync

        end_time_option = (
            self._create_component_from_model(
                cursor_model.end_time_option, config, parameters=cursor_model.parameters or {}
            )
            if cursor_model.end_time_option
            else None
        )
        start_time_option = (
            self._create_component_from_model(
                cursor_model.start_time_option, config, parameters=cursor_model.parameters or {}
            )
            if cursor_model.start_time_option
            else None
        )

        datetime_request_options_provider = DatetimeBasedRequestOptionsProvider(
            start_time_option=start_time_option,
            end_time_option=end_time_option,
            partition_field_start=cursor_model.partition_field_start,
            partition_field_end=cursor_model.partition_field_end,
            config=config,
            parameters=model.parameters or {},
        )
        # With a per-partition cursor, the datetime provider is wrapped together with the partition router.
        request_options_provider = (
            datetime_request_options_provider
            if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor)
            else PerPartitionRequestOptionsProvider(
                partition_router, datetime_request_options_provider
            )
        )
    elif model.incremental_sync and isinstance(
        model.incremental_sync, IncrementingCountCursorModel
    ):
        if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor):
            raise ValueError(
                "PerPartition does not support per partition states because switching to global state is time based"
            )

        cursor_model: IncrementingCountCursorModel = model.incremental_sync  # type: ignore

        start_time_option = (
            self._create_component_from_model(
                cursor_model.start_value_option,  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
                config,
                parameters=cursor_model.parameters or {},
            )
            if cursor_model.start_value_option  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
            else None
        )

        # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but
        # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time
        partition_field_start = "start"

        request_options_provider = DatetimeBasedRequestOptionsProvider(
            start_time_option=start_time_option,
            partition_field_start=partition_field_start,
            config=config,
            parameters=model.parameters or {},
        )
    else:
        request_options_provider = None

    transformations = []
    if model.transformations:
        for transformation_model in model.transformations:
            transformations.append(
                self._create_component_from_model(model=transformation_model, config=config)
            )
    file_uploader = None
    if model.file_uploader:
        file_uploader = self._create_component_from_model(
            model=model.file_uploader, config=config
        )

    # When the cursor is a FinalStateCursor, slicing is driven by the partition router; otherwise by the cursor.
    stream_slicer: ConcurrentStreamSlicer = (
        partition_router
        if isinstance(concurrent_cursor, FinalStateCursor)
        else concurrent_cursor
    )

    retriever = self._create_component_from_model(
        model=model.retriever,
        config=config,
        name=model.name,
        primary_key=primary_key,
        request_options_provider=request_options_provider,
        stream_slicer=stream_slicer,
        partition_router=partition_router,
        has_stop_condition_cursor=self._is_stop_condition_on_cursor(model),
        is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model),
        cursor=concurrent_cursor,
        transformations=transformations,
        file_uploader=file_uploader,
        incremental_sync=model.incremental_sync,
    )
    # An AsyncRetriever exposes its own stream slicer, which replaces the one computed above.
    if isinstance(retriever, AsyncRetriever):
        stream_slicer = retriever.stream_slicer

    schema_loader: SchemaLoader
    if model.schema_loader and isinstance(model.schema_loader, list):
        # A list of schema loaders is combined into a single CompositeSchemaLoader.
        nested_schema_loaders = [
            self._create_component_from_model(model=nested_schema_loader, config=config)
            for nested_schema_loader in model.schema_loader
        ]
        schema_loader = CompositeSchemaLoader(
            schema_loaders=nested_schema_loaders, parameters={}
        )
    elif model.schema_loader:
        schema_loader = self._create_component_from_model(
            model=model.schema_loader,  # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models
            config=config,
        )
    else:
        # NOTE(review): when model.parameters is a non-empty dict, `options` aliases it, so the "name" key is
        # written onto the model's own parameters - confirm this side effect is intended.
        options = model.parameters or {}
        if "name" not in options:
            options["name"] = model.name
        schema_loader = DefaultSchemaLoader(config=config, parameters=options)
    # Whatever loader was selected is wrapped in a CachingSchemaLoaderDecorator.
    schema_loader = CachingSchemaLoaderDecorator(schema_loader)

    stream_name = model.name or ""
    return DefaultStream(
        partition_generator=StreamSlicerPartitionGenerator(
            DeclarativePartitionFactory(
                stream_name,
                schema_loader,
                retriever,
                self._message_repository,
            ),
            stream_slicer,
            slice_limit=self._limit_slices_fetched,
        ),
        name=stream_name,
        # The bound method is passed (not called), so the schema is resolved by the consumer.
        json_schema=schema_loader.get_json_schema,
        primary_key=get_primary_key_from_stream(primary_key),
        cursor_field=(
            concurrent_cursor.cursor_field
            if hasattr(concurrent_cursor, "cursor_field")
            else None
        ),
        logger=logging.getLogger(f"airbyte.{stream_name}"),
        cursor=concurrent_cursor,
        supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader),
    )
def create_default_error_handler(
    self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any
) -> DefaultErrorHandler:
    """
    Builds a DefaultErrorHandler from its model.

    When the model declares response filters, a default HttpResponseFilter is appended after them;
    when it declares none, the handler receives an empty list of filters.

    :param model: The Pydantic model of the default error handler
    :param config: The connector config
    :return: The DefaultErrorHandler to be used at runtime
    """
    backoff_strategies = [
        self._create_component_from_model(model=strategy_model, config=config)
        for strategy_model in (model.backoff_strategies or [])
    ]

    response_filters = []
    if model.response_filters:
        response_filters = [
            self._create_component_from_model(model=filter_model, config=config)
            for filter_model in model.response_filters
        ]
        # Trailing default filter, only added when user-defined filters exist.
        response_filters.append(
            HttpResponseFilter(config=config, parameters=model.parameters or {})
        )

    return DefaultErrorHandler(
        backoff_strategies=backoff_strategies,
        max_retries=model.max_retries,
        response_filters=response_filters,
        config=config,
        parameters=model.parameters or {},
    )
def create_default_paginator(
    self,
    model: DefaultPaginatorModel,
    config: Config,
    *,
    url_base: str,
    extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
    decoder: Optional[Decoder] = None,
    cursor_used_for_stop_condition: Optional[Cursor] = None,
) -> Union[DefaultPaginator, PaginatorTestReadDecorator]:
    """
    Builds a DefaultPaginator from its model.

    :param model: The Pydantic model of the default paginator
    :param config: The connector config
    :param url_base: Base URL handed to the paginator
    :param extractor_model: Extractor model forwarded to the pagination strategy
    :param decoder: Response decoder; a JsonDecoder is used when none is given
    :param cursor_used_for_stop_condition: When set, the pagination strategy is decorated with a cursor stop condition
    :return: The paginator, wrapped in a PaginatorTestReadDecorator when a per-slice page limit is configured on the factory
    :raises ValueError: If the given decoder is not supported for pagination
    """
    if not decoder:
        decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
    elif self._is_supported_decoder_for_pagination(decoder):
        decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
    else:
        raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder)))

    page_size_option = None
    if model.page_size_option:
        page_size_option = self._create_component_from_model(
            model=model.page_size_option, config=config
        )

    page_token_option = None
    if model.page_token_option:
        page_token_option = self._create_component_from_model(
            model=model.page_token_option, config=config
        )

    pagination_strategy = self._create_component_from_model(
        model=model.pagination_strategy,
        config=config,
        decoder=decoder_to_use,
        extractor_model=extractor_model,
    )
    if cursor_used_for_stop_condition:
        stop_condition = CursorStopCondition(cursor_used_for_stop_condition)
        pagination_strategy = StopConditionPaginationStrategyDecorator(
            pagination_strategy, stop_condition
        )

    paginator = DefaultPaginator(
        decoder=decoder_to_use,
        page_size_option=page_size_option,
        page_token_option=page_token_option,
        pagination_strategy=pagination_strategy,
        url_base=url_base,
        config=config,
        parameters=model.parameters or {},
    )

    page_limit = self._limit_pages_fetched_per_slice
    if not page_limit:
        return paginator
    return PaginatorTestReadDecorator(paginator, page_limit)
def create_dpath_extractor(
    self,
    model: DpathExtractorModel,
    config: Config,
    decoder: Optional[Decoder] = None,
    **kwargs: Any,
) -> DpathExtractor:
    """
    Builds a DpathExtractor from its model.

    :param model: The Pydantic model of the dpath extractor
    :param config: The connector config
    :param decoder: Response decoder; a JsonDecoder is used when none is given
    :return: The DpathExtractor to be used at runtime
    """
    decoder_to_use = decoder if decoder else JsonDecoder(parameters={})
    field_path: List[Union[InterpolatedString, str]] = list(model.field_path)

    if model.record_expander:
        record_expander = self._create_component_from_model(
            model=model.record_expander,
            config=config,
        )
    else:
        record_expander = None

    return DpathExtractor(
        decoder=decoder_to_use,
        field_path=field_path,
        config=config,
        parameters=model.parameters or {},
        record_expander=record_expander,
    )
def create_record_expander(
    self,
    model: RecordExpanderModel,
    config: Config,
    **kwargs: Any,
) -> RecordExpander:
    """
    Builds a RecordExpander from its model.

    :param model: The Pydantic model of the record expander
    :param config: The connector config
    :return: The RecordExpander to be used at runtime
    """
    # Fall back to OnNoRecords.skip when the model does not set on_no_records.
    if model.on_no_records:
        on_no_records = OnNoRecords(model.on_no_records.value)
    else:
        on_no_records = OnNoRecords.skip

    return RecordExpander(
        expand_records_from_field=model.expand_records_from_field,
        config=config,
        parameters=model.parameters or {},
        remain_original_record=model.remain_original_record or False,
        on_no_records=on_no_records,
    )
@staticmethod
def create_exponential_backoff_strategy(
    model: ExponentialBackoffStrategyModel, config: Config, **kwargs: Any
) -> ExponentialBackoffStrategy:
    """
    Builds an ExponentialBackoffStrategy from its model.

    The model's jitter range is passed to `_validate_jitter_range` before the strategy is constructed.

    :param model: The Pydantic model of the exponential backoff strategy
    :param config: The connector config
    :param kwargs: Accepted and ignored, matching the signature of `create_constant_backoff_strategy`, so that
        extra keyword arguments forwarded to this method do not raise a TypeError
    :return: The ExponentialBackoffStrategy to be used at runtime
    """
    ModelToComponentFactory._validate_jitter_range(model.jitter_range_in_seconds)
    return ExponentialBackoffStrategy(
        # A falsy factor (unset or 0) falls back to 5.
        factor=model.factor or 5,
        jitter_range_in_seconds=model.jitter_range_in_seconds,
        parameters=model.parameters or {},
        config=config,
    )
def create_http_requester(
    self,
    model: HttpRequesterModel,
    config: Config,
    decoder: Decoder = JsonDecoder(parameters={}),
    query_properties_key: Optional[str] = None,
    use_cache: Optional[bool] = None,
    *,
    name: str,
) -> HttpRequester:
    """
    Builds an HttpRequester from its model.

    :param model: The Pydantic model of the HTTP requester
    :param config: The connector config
    :param decoder: Response decoder, also forwarded to the authenticator
    :param query_properties_key: Forwarded to the InterpolatedRequestOptionsProvider
    :param use_cache: Enables caching in addition to the model's own `use_cache` flag
    :param name: Name given to the requester and forwarded to the authenticator
    :return: The HttpRequester to be used at runtime
    """
    authenticator = None
    if model.authenticator:
        authenticator = self._create_component_from_model(
            model=model.authenticator,
            config=config,
            url_base=model.url or model.url_base,
            name=name,
            decoder=decoder,
        )

    if model.error_handler:
        error_handler = self._create_component_from_model(
            model=model.error_handler, config=config
        )
    else:
        # No error handler declared: use a DefaultErrorHandler without strategies or filters.
        error_handler = DefaultErrorHandler(
            backoff_strategies=[],
            response_filters=[],
            config=config,
            parameters=model.parameters or {},
        )

    request_options_provider = InterpolatedRequestOptionsProvider(
        request_body=model.request_body,
        request_body_data=model.request_body_data,
        request_body_json=model.request_body_json,
        request_headers=model.request_headers,
        request_parameters=model.request_parameters,  # type: ignore  # QueryProperties have been removed in `create_simple_retriever`
        query_properties_key=query_properties_key,
        config=config,
        parameters=model.parameters or {},
    )

    assert model.use_cache is not None  # for mypy
    assert model.http_method is not None  # for mypy

    # Caching is on when requested by the model or the caller, unless disabled on the factory.
    cache_requested = model.use_cache or bool(use_cache)
    should_use_cache = cache_requested and not self._disable_cache

    return HttpRequester(
        name=name,
        url=model.url,
        url_base=model.url_base,
        path=model.path,
        authenticator=authenticator,
        error_handler=error_handler,
        api_budget=self._api_budget,
        http_method=HttpMethod[model.http_method.value],
        request_options_provider=request_options_provider,
        config=config,
        disable_retries=self._disable_retries,
        parameters=model.parameters or {},
        message_repository=self._message_repository,
        use_cache=should_use_cache,
        decoder=decoder,
        stream_response=decoder.is_stream_response() if decoder else False,
    )
@staticmethod
def create_http_response_filter(
    model: HttpResponseFilterModel, config: Config, **kwargs: Any
) -> HttpResponseFilter:
    """
    Builds an HttpResponseFilter from its model.

    :param model: The Pydantic model of the HTTP response filter
    :param config: The connector config
    :return: The HttpResponseFilter to be used at runtime
    """
    action = ResponseAction(model.action.value) if model.action else None

    if model.failure_type:
        failure_type = FailureType(model.failure_type.value)
    else:
        failure_type = None

    # JSON schema notation has no set data type. The schema enforces an array of unique elements
    http_codes = set(model.http_codes or ())

    return HttpResponseFilter(
        action=action,
        failure_type=failure_type,
        error_message=model.error_message or "",
        error_message_contains=model.error_message_contains or "",
        http_codes=http_codes,
        predicate=model.predicate or "",
        config=config,
        parameters=model.parameters or {},
    )
def create_complex_field_type(
    self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any
) -> ComplexFieldType:
    """
    Builds a ComplexFieldType from its model.

    :param model: The Pydantic model of the complex field type
    :param config: The connector config
    :return: The ComplexFieldType to be used at runtime
    """
    items = model.items
    # Nested complex types are built recursively; any other value is passed through unchanged.
    if isinstance(items, ComplexFieldTypeModel):
        items = self._create_component_from_model(model=items, config=config)

    return ComplexFieldType(field_type=model.field_type, items=items)
def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap:
    """
    Builds a TypesMap from its model.

    :param model: The Pydantic model of the types map
    :param config: The connector config
    :return: The TypesMap to be used at runtime
    """
    target_type = model.target_type
    # A complex target type is built into a component; any other value is passed through unchanged.
    if isinstance(target_type, ComplexFieldTypeModel):
        target_type = self._create_component_from_model(model=target_type, config=config)

    # A missing condition defaults to the string "True".
    condition = "True" if model.condition is None else model.condition

    return TypesMap(
        target_type=target_type,
        current_type=model.current_type,
        condition=condition,
    )
def create_schema_type_identifier(
    self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any
) -> SchemaTypeIdentifier:
    """
    Builds a SchemaTypeIdentifier from its model.

    :param model: The Pydantic model of the schema type identifier
    :param config: The connector config
    :return: The SchemaTypeIdentifier to be used at runtime
    """
    types_mapping = [
        self._create_component_from_model(types_map, config=config)
        for types_map in (model.types_mapping or [])
    ]

    # An unset schema pointer becomes an empty path, while an unset type pointer stays None.
    schema_pointer: List[Union[InterpolatedString, str]] = (
        list(model.schema_pointer) if model.schema_pointer else []
    )
    key_pointer: List[Union[InterpolatedString, str]] = list(model.key_pointer)
    type_pointer: Optional[List[Union[InterpolatedString, str]]] = (
        list(model.type_pointer) if model.type_pointer else None
    )

    return SchemaTypeIdentifier(
        schema_pointer=schema_pointer,
        key_pointer=key_pointer,
        type_pointer=type_pointer,
        types_mapping=types_mapping,
        parameters=model.parameters or {},
    )
def create_dynamic_schema_loader(
    self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any
) -> DynamicSchemaLoader:
    """
    Builds a DynamicSchemaLoader from its model.

    The retriever used to fetch the schema is created under the fixed name "dynamic_properties", with caching
    enabled and a log formatter marking its requests as auxiliary.

    :param model: The Pydantic model of the dynamic schema loader
    :param config: The connector config
    :return: The DynamicSchemaLoader to be used at runtime
    """
    schema_transformations = [
        self._create_component_from_model(model=transformation_model, config=config)
        for transformation_model in (model.schema_transformations or [])
    ]

    name = "dynamic_properties"

    def _format_schema_request(response: Any) -> Any:
        return format_http_message(
            response,
            f"Schema loader '{name}' request",
            "Request performed in order to extract schema.",
            name,
            is_auxiliary=True,
        )

    retriever = self._create_component_from_model(
        model=model.retriever,
        config=config,
        name=name,
        primary_key=None,
        partition_router=self._build_stream_slicer_from_partition_router(
            model.retriever, config
        ),
        transformations=[],
        use_cache=True,
        log_formatter=_format_schema_request,
    )

    schema_type_identifier = self._create_component_from_model(
        model.schema_type_identifier, config=config, parameters=model.parameters or {}
    )

    schema_filter = None
    if model.schema_filter is not None:
        schema_filter = self._create_component_from_model(
            model.schema_filter, config=config, parameters=model.parameters or {}
        )

    return DynamicSchemaLoader(
        retriever=retriever,
        config=config,
        schema_transformations=schema_transformations,
        schema_filter=schema_filter,
        schema_type_identifier=schema_type_identifier,
        parameters=model.parameters or {},
    )
def create_json_items_decoder(
    self, model: JsonItemsDecoderModel, config: Config, **kwargs: Any
) -> Decoder:
    """Wrap the parser derived from ``model`` in a CompositeRawDecoder.

    The response is streamed unless connector builder messages are being emitted.
    """
    parser = ModelToComponentFactory._get_parser(model, config)
    stream_response = not self._emit_connector_builder_messages
    return CompositeRawDecoder(parser=parser, stream_response=stream_response)
def create_gzip_decoder(
    self, model: GzipDecoderModel, config: Config, **kwargs: Any
) -> Decoder:
    """Build the decoder used for (possibly) gzip-compressed responses.

    In normal syncs the decoder picks the gzip parser when the
    ``Content-Encoding`` / ``Content-Type`` headers carry one of the known
    compressed values, and falls back to the inner parser otherwise.
    """
    gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config)  # type: ignore  # based on the model, we know this will be a GzipParser

    if self._emit_connector_builder_messages:
        # This is very surprising but if the response is not streamed,
        # CompositeRawDecoder calls response.content and the requests library actually
        # uncompresses the data, as opposed to response.raw, which uses urllib3 directly
        # and does not uncompress the data. Hence the inner parser is used as-is here.
        return CompositeRawDecoder(gzip_parser.inner_parser, False)

    header_names = {"Content-Encoding", "Content-Type"}
    compressed_response_types = {
        "gzip",
        "x-gzip",
        "gzip, deflate",
        "x-gzip, deflate",
        "application/zip",
        "application/gzip",
        "application/x-gzip",
        "application/x-zip-compressed",
    }
    return CompositeRawDecoder.by_headers(
        [(header_names, compressed_response_types, gzip_parser)],
        stream_response=True,
        fallback_parser=gzip_parser.inner_parser,
    )
def create_jwt_authenticator(
    self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any
) -> JwtAuthenticator:
    """Build a JwtAuthenticator from its declarative model.

    When the model defines no JWT headers or payload, defaults are used:
    headers with ``typ="JWT"`` and every other header/payload claim set to ``None``.
    """
    headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None)
    payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None)

    request_option = None
    if model.request_option:
        request_option = self._create_component_from_model(model.request_option, config)

    return JwtAuthenticator(
        config=config,
        parameters=model.parameters or {},
        algorithm=JwtAlgorithm(model.algorithm.value),
        secret_key=model.secret_key,
        base64_encode_secret_key=model.base64_encode_secret_key,
        token_duration=model.token_duration,
        header_prefix=model.header_prefix,
        kid=headers.kid,
        typ=headers.typ,
        cty=headers.cty,
        iss=payload.iss,
        sub=payload.sub,
        aud=payload.aud,
        additional_jwt_headers=model.additional_jwt_headers,
        additional_jwt_payload=model.additional_jwt_payload,
        passphrase=model.passphrase,
        request_option=request_option,
    )
def create_list_partition_router(
    self, model: ListPartitionRouterModel, config: Config, **kwargs: Any
) -> ListPartitionRouter:
    """Build a ListPartitionRouter, creating its request option only when the model has one."""
    request_option = None
    if model.request_option:
        request_option = self._create_component_from_model(model.request_option, config)

    return ListPartitionRouter(
        cursor_field=model.cursor_field,
        request_option=request_option,
        values=model.values,
        config=config,
        parameters=model.parameters or {},
    )
@staticmethod
def create_min_max_datetime(
    model: MinMaxDatetimeModel, config: Config, **kwargs: Any
) -> MinMaxDatetime:
    """Build a MinMaxDatetime; unset format and bounds are passed as empty strings."""
    datetime_format = model.datetime_format or ""
    upper_bound = model.max_datetime or ""
    lower_bound = model.min_datetime or ""
    parameters = model.parameters or {}
    return MinMaxDatetime(
        datetime=model.datetime,
        datetime_format=datetime_format,
        max_datetime=upper_bound,
        min_datetime=lower_bound,
        parameters=parameters,
    )
def create_oauth_authenticator(
    self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any
) -> DeclarativeOauth2Authenticator:
    """Build the OAuth2 authenticator described by ``model``.

    Without a ``refresh_token_updater`` a DeclarativeOauth2Authenticator is
    returned and receives the raw model values. With one, a
    DeclarativeSingleUseRefreshTokenOauth2Authenticator is returned and its
    string/mapping options are interpolated against ``config`` up front.
    """
    profile_assertion = None
    if model.profile_assertion:
        profile_assertion = self._create_component_from_model(
            model.profile_assertion, config=config
        )

    error_status_codes, error_key, error_values = self._get_refresh_token_error_information(
        model
    )

    if not model.refresh_token_updater:
        # ignore type error because fixing it would have a lot of dependencies, revisit later
        return DeclarativeOauth2Authenticator(  # type: ignore
            access_token_name=model.access_token_name or "access_token",
            access_token_value=model.access_token_value,
            client_id_name=model.client_id_name or "client_id",
            client_id=model.client_id,
            client_secret_name=model.client_secret_name or "client_secret",
            client_secret=model.client_secret,
            expires_in_name=model.expires_in_name or "expires_in",
            grant_type_name=model.grant_type_name or "grant_type",
            grant_type=model.grant_type or "refresh_token",
            refresh_request_body=model.refresh_request_body,
            refresh_request_headers=model.refresh_request_headers,
            send_refresh_request_as_query_params=bool(model.send_refresh_request_as_query_params),
            refresh_token_name=model.refresh_token_name or "refresh_token",
            refresh_token=model.refresh_token,
            scopes=model.scopes,
            token_expiry_date=model.token_expiry_date,
            token_expiry_date_format=model.token_expiry_date_format,
            token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
            token_refresh_endpoint=model.token_refresh_endpoint,
            config=config,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            profile_assertion=profile_assertion,
            use_profile_assertion=model.use_profile_assertion,
            refresh_token_error_status_codes=error_status_codes,
            refresh_token_error_key=error_key,
            refresh_token_error_values=error_values,
        )

    def _render(template: Any) -> Any:
        # Interpolate a single string option against the connector config.
        return InterpolatedString.create(
            template,  # type: ignore
            parameters=model.parameters or {},
        ).eval(config)

    def _render_mapping(mapping: Any) -> Any:
        # Interpolate a mapping option; an unset mapping is treated as empty.
        return InterpolatedMapping(mapping or {}, parameters=model.parameters or {}).eval(config)

    updater = model.refresh_token_updater
    # ignore type error because fixing it would have a lot of dependencies, revisit later
    return DeclarativeSingleUseRefreshTokenOauth2Authenticator(  # type: ignore
        config,
        _render(model.token_refresh_endpoint),
        access_token_name=_render(model.access_token_name or "access_token"),
        refresh_token_name=updater.refresh_token_name,
        expires_in_name=_render(model.expires_in_name or "expires_in"),
        client_id_name=_render(model.client_id_name or "client_id"),
        # Falsy credentials are forwarded untouched rather than interpolated.
        client_id=_render(model.client_id) if model.client_id else model.client_id,
        client_secret_name=_render(model.client_secret_name or "client_secret"),
        client_secret=(
            _render(model.client_secret) if model.client_secret else model.client_secret
        ),
        access_token_config_path=updater.access_token_config_path,
        refresh_token_config_path=updater.refresh_token_config_path,
        token_expiry_date_config_path=updater.token_expiry_date_config_path,
        grant_type_name=_render(model.grant_type_name or "grant_type"),
        grant_type=_render(model.grant_type or "refresh_token"),
        refresh_request_body=_render_mapping(model.refresh_request_body),
        refresh_request_headers=_render_mapping(model.refresh_request_headers),
        send_refresh_request_as_query_params=bool(model.send_refresh_request_as_query_params),
        scopes=model.scopes,
        token_expiry_date_format=model.token_expiry_date_format,
        token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
        message_repository=self._message_repository,
        refresh_token_error_status_codes=error_status_codes,
        refresh_token_error_key=error_key,
        refresh_token_error_values=error_values,
    )
def create_offset_increment(
    self,
    model: OffsetIncrementModel,
    config: Config,
    decoder: Decoder,
    extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
    **kwargs: Any,
) -> OffsetIncrement:
    """Build an OffsetIncrement pagination strategy.

    The decoder is wrapped in a PaginationDecoderDecorator unless it already is
    one, and the wrapped (inner) decoder must be supported for pagination.

    Raises:
        ValueError: if the inner decoder is not supported for pagination.
    """
    already_decorated = isinstance(decoder, PaginationDecoderDecorator)
    inner_decoder = decoder.decoder if already_decorated else decoder
    decoder_to_use = decoder if already_decorated else PaginationDecoderDecorator(decoder=decoder)

    if not self._is_supported_decoder_for_pagination(inner_decoder):
        raise ValueError(
            self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
        )

    # Ideally the runtime extractor would be instantiated once at the highest level (the
    # SimpleRetriever) and shared by OffsetIncrement and RecordSelector. Because the decoder
    # is decorated here but not in create_record_selector, it is simpler to keep two separate
    # extractors built from the same extractor model, which behave identically.
    extractor = None
    if extractor_model:
        extractor = self._create_component_from_model(
            model=extractor_model, config=config, decoder=decoder_to_use
        )

    # Pydantic v1 Union type coercion can convert int to string depending on Union order.
    # A string made only of digits (i.e. not an interpolation) is converted back to int.
    raw_page_size = model.page_size
    if isinstance(raw_page_size, str) and raw_page_size.isdigit():
        page_size = int(raw_page_size)
    else:
        page_size = raw_page_size

    return OffsetIncrement(
        page_size=page_size,
        config=config,
        decoder=decoder_to_use,
        extractor=extractor,
        inject_on_first_request=model.inject_on_first_request or False,
        parameters=model.parameters or {},
    )
@staticmethod
def create_page_increment(
    model: PageIncrementModel, config: Config, **kwargs: Any
) -> PageIncrement:
    """Build a PageIncrement pagination strategy (``start_from_page`` defaults to 0)."""
    # Pydantic v1 Union type coercion can convert int to string depending on Union order.
    # A string made only of digits (i.e. not an interpolation) is converted back to int.
    raw_page_size = model.page_size
    if isinstance(raw_page_size, str) and raw_page_size.isdigit():
        page_size = int(raw_page_size)
    else:
        page_size = raw_page_size

    return PageIncrement(
        page_size=page_size,
        config=config,
        start_from_page=model.start_from_page or 0,
        inject_on_first_request=model.inject_on_first_request or False,
        parameters=model.parameters or {},
    )
def create_parent_stream_config(
    self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any
) -> ParentStreamConfig:
    """Build a ParentStreamConfig, including the parent stream it points to.

    ``stream_name`` is accepted as a keyword but is not forwarded to the parent
    stream; the remaining ``kwargs`` are.

    Raises:
        ValueError: if any segment of ``lazy_read_pointer`` contains ``*``.
    """
    parent_stream = self._create_component_from_model(
        model.stream,
        config=config,
        is_parent=True,
        **kwargs,
    )

    request_option = None
    if model.request_option:
        request_option = self._create_component_from_model(model.request_option, config=config)

    lazy_read_pointer: List[Union[InterpolatedString, str]] = (
        list(model.lazy_read_pointer) if model.lazy_read_pointer else []
    )
    for segment in lazy_read_pointer:
        if "*" in segment:
            raise ValueError(
                "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed."
            )

    return ParentStreamConfig(
        parent_key=model.parent_key,
        request_option=request_option,
        stream=parent_stream,
        partition_field=model.partition_field,
        config=config,
        incremental_dependency=model.incremental_dependency or False,
        parameters=model.parameters or {},
        extra_fields=model.extra_fields,
        lazy_read_pointer=lazy_read_pointer,
    )
def create_properties_from_endpoint(
    self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any
) -> PropertiesFromEndpoint:
    """Build a PropertiesFromEndpoint together with the retriever that fetches the properties."""
    retriever_options = {
        "name": "dynamic_properties",
        "primary_key": None,
        "stream_slicer": None,
        "transformations": [],
        # Enable caching on the HttpRequester/HttpClient because the properties endpoint
        # will be called for every slice being processed, and it is highly unlikely for
        # the response to differ.
        "use_cache": True,
    }
    properties_retriever = self._create_component_from_model(
        model=model.retriever, config=config, **retriever_options
    )

    return PropertiesFromEndpoint(
        property_field_path=model.property_field_path,
        retriever=properties_retriever,
        config=config,
        parameters=model.parameters or {},
    )
3106 def create_property_chunking( 3107 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 3108 ) -> PropertyChunking: 3109 record_merge_strategy = ( 3110 self._create_component_from_model( 3111 model=model.record_merge_strategy, config=config, **kwargs 3112 ) 3113 if model.record_merge_strategy 3114 else None 3115 ) 3116 3117 property_limit_type: PropertyLimitType 3118 match model.property_limit_type: 3119 case PropertyLimitTypeModel.property_count: 3120 property_limit_type = PropertyLimitType.property_count 3121 case PropertyLimitTypeModel.characters: 3122 property_limit_type = PropertyLimitType.characters 3123 case _: 3124 raise ValueError(f"Invalid PropertyLimitType {property_limit_type}") 3125 3126 return PropertyChunking( 3127 property_limit_type=property_limit_type, 3128 property_limit=model.property_limit, 3129 record_merge_strategy=record_merge_strategy, 3130 config=config, 3131 parameters=model.parameters or {}, 3132 )
def create_query_properties(
    self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any
) -> QueryProperties:
    """Build a QueryProperties component.

    ``model.property_list`` is used as-is when it is already a list; otherwise it
    is treated as a component model and instantiated. Only the property selector
    receives ``stream_name``.
    """
    raw_property_list = model.property_list
    property_list = (
        raw_property_list
        if isinstance(raw_property_list, list)
        else self._create_component_from_model(
            model=raw_property_list, config=config, **kwargs
        )
    )

    property_chunking = None
    if model.property_chunking:
        property_chunking = self._create_component_from_model(
            model=model.property_chunking, config=config, **kwargs
        )

    property_selector = None
    if model.property_selector:
        property_selector = self._create_component_from_model(
            model=model.property_selector, config=config, stream_name=stream_name, **kwargs
        )

    return QueryProperties(
        property_list=property_list,
        always_include_properties=model.always_include_properties,
        property_chunking=property_chunking,
        property_selector=property_selector,
        config=config,
        parameters=model.parameters or {},
    )
def create_json_schema_property_selector(
    self,
    model: JsonSchemaPropertySelectorModel,
    config: Config,
    *,
    stream_name: str,
    **kwargs: Any,
) -> JsonSchemaPropertySelector:
    """Build a JsonSchemaPropertySelector for ``stream_name``.

    The configured stream is looked up by name and is ``None`` when the stream
    is not present in the lookup table.
    """
    configured_stream = self._stream_name_to_configured_stream.get(stream_name)

    properties_transformations = [
        self._create_component_from_model(model=transformation_model, config=config)
        for transformation_model in (model.transformations or [])
    ]

    return JsonSchemaPropertySelector(
        configured_stream=configured_stream,
        properties_transformations=properties_transformations,
        config=config,
        parameters=model.parameters or {},
    )
@staticmethod
def create_request_option(
    model: RequestOptionModel, config: Config, **kwargs: Any
) -> RequestOption:
    """Build a RequestOption, interpolating its field name / field path segments.

    Parameters are taken from ``kwargs["parameters"]`` (empty when absent).
    """
    inject_into = RequestOptionType(model.inject_into.value)
    parameters = kwargs.get("parameters", {})

    field_path: Optional[List[Union[InterpolatedString, str]]] = None
    if model.field_path:
        field_path = [
            InterpolatedString.create(segment, parameters=parameters)
            for segment in model.field_path
        ]

    field_name = None
    if model.field_name:
        field_name = InterpolatedString.create(model.field_name, parameters=parameters)

    return RequestOption(
        field_name=field_name,
        field_path=field_path,
        inject_into=inject_into,
        parameters=parameters,
    )
def create_record_selector(
    self,
    model: RecordSelectorModel,
    config: Config,
    *,
    name: str,
    transformations: List[RecordTransformation] | None = None,
    decoder: Decoder | None = None,
    client_side_incremental_sync_cursor: Optional[Cursor] = None,
    file_uploader: Optional[DefaultFileUploader] = None,
    **kwargs: Any,
) -> RecordSelector:
    """Build a RecordSelector (extractor, filter, schema normalization).

    When a client-side incremental cursor is given, the record filter is replaced
    by a ClientSideIncrementalRecordFilterDecorator carrying the model filter's
    condition (if any), and transformations default to running before filtering.
    An explicit ``model.transform_before_filtering`` always wins.

    Note: an unset ``model.schema_normalization`` is defaulted on the model itself.
    """
    extractor = self._create_component_from_model(
        model=model.extractor, decoder=decoder, config=config
    )

    record_filter = None
    if model.record_filter:
        record_filter = self._create_component_from_model(model.record_filter, config=config)

    default_transform_before_filtering = False
    if client_side_incremental_sync_cursor:
        condition = None
        if model.record_filter and hasattr(model.record_filter, "condition"):
            condition = model.record_filter.condition
        record_filter = ClientSideIncrementalRecordFilterDecorator(
            config=config,
            parameters=model.parameters,
            condition=condition,
            cursor=client_side_incremental_sync_cursor,
        )
        default_transform_before_filtering = True

    if model.transform_before_filtering is None:
        transform_before_filtering = default_transform_before_filtering
    else:
        transform_before_filtering = model.transform_before_filtering

    if model.schema_normalization is None:
        # default to no schema normalization if not set
        model.schema_normalization = SchemaNormalizationModel.None_

    normalization_model = model.schema_normalization
    if isinstance(normalization_model, SchemaNormalizationModel):
        schema_normalization = TypeTransformer(
            SCHEMA_TRANSFORMER_TYPE_MAPPING[normalization_model]
        )
    else:
        # custom normalization model expected here
        schema_normalization = self._create_component_from_model(normalization_model, config=config)  # type: ignore[arg-type]

    return RecordSelector(
        extractor=extractor,
        name=name,
        config=config,
        record_filter=record_filter,
        transformations=transformations or [],
        file_uploader=file_uploader,
        schema_normalization=schema_normalization,
        parameters=model.parameters or {},
        transform_before_filtering=transform_before_filtering,
    )
def create_selective_authenticator(
    self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any
) -> DeclarativeAuthenticator:
    """Instantiate every candidate authenticator and hand them to SelectiveAuthenticator."""
    authenticators = {}
    for auth_name, auth_model in model.authenticators.items():
        authenticators[auth_name] = self._create_component_from_model(
            model=auth_model, config=config
        )

    # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error
    return SelectiveAuthenticator(  # type: ignore[abstract]
        config=config,
        authenticators=authenticators,
        authenticator_selection_path=model.authenticator_selection_path,
        **kwargs,
    )
@staticmethod
def create_legacy_session_token_authenticator(
    model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any
) -> LegacySessionTokenAuthenticator:
    """Build a LegacySessionTokenAuthenticator; unset credential fields become empty strings."""
    username = model.username or ""
    password = model.password or ""
    session_token = model.session_token or ""
    session_token_response_key = model.session_token_response_key or ""

    return LegacySessionTokenAuthenticator(
        api_url=url_base,
        header=model.header,
        login_url=model.login_url,
        password=password,
        session_token=session_token,
        session_token_response_key=session_token_response_key,
        username=username,
        validate_session_url=model.validate_session_url,
        config=config,
        parameters=model.parameters or {},
    )
def create_simple_retriever(
    self,
    model: SimpleRetrieverModel,
    config: Config,
    *,
    name: str,
    primary_key: Optional[Union[str, List[str], List[List[str]]]],
    request_options_provider: Optional[RequestOptionsProvider] = None,
    cursor: Optional[Cursor] = None,
    has_stop_condition_cursor: bool = False,
    is_client_side_incremental_sync: bool = False,
    transformations: List[RecordTransformation],
    file_uploader: Optional[DefaultFileUploader] = None,
    incremental_sync: Optional[
        Union[IncrementingCountCursorModel, DatetimeBasedCursorModel]
    ] = None,
    use_cache: Optional[bool] = None,
    log_formatter: Optional[Callable[[Response], Any]] = None,
    partition_router: Optional[PartitionRouter] = None,
    **kwargs: Any,
) -> SimpleRetriever:
    """Build the retriever for a stream: decoder, record selector, requester, paginator.

    Steps, in order:
      1. Default the cursor to a FinalStateCursor and the decoder to JsonDecoder.
      2. Create the record selector (the cursor is only handed over for
         client-side incremental syncs).
      3. Resolve QueryProperties from one of: a QueryProperties entry in the
         requester's ``request_parameters``, the requester's
         ``fetch_properties_from_endpoint``, or the requester's ``query_properties``.
      4. Create the requester and the paginator.
      5. Return a LazySimpleRetriever when the partition router is a substream
         router with at least one ``lazy_read_pointer`` and the stream has no
         state yet; otherwise return a SimpleRetriever.

    Note: ``model.requester.request_parameters`` is reassigned on the model after
    QueryProperties entries have been stripped from it.

    Raises:
        ValueError: if properties are declared both in ``request_parameters`` and in
            ``fetch_properties_from_endpoint``; if ``request_parameters`` holds more
            than one QueryProperties; if the lazy retriever is requested with a
            cursor other than DatetimeBasedCursor, with ``step`` or
            ``cursor_granularity`` set, or with a decoder other than JsonDecoder; or
            if pagination reset limits are combined with a record filter.
    """

    def _get_url(req: Requester) -> str:
        """
        Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever.
        This is needed because the URL is not set until the requester is created.
        """

        # Prefer the values declared on the model; fall back to asking the requester.
        _url: str = (
            model.requester.url
            if hasattr(model.requester, "url") and model.requester.url is not None
            else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
        )
        _url_base: str = (
            model.requester.url_base
            if hasattr(model.requester, "url_base") and model.requester.url_base is not None
            else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None)
        )

        return _url or _url_base

    if cursor is None:
        cursor = FinalStateCursor(name, None, self._message_repository)

    decoder = (
        self._create_component_from_model(model=model.decoder, config=config)
        if model.decoder
        else JsonDecoder(parameters={})
    )
    record_selector = self._create_component_from_model(
        model=model.record_selector,
        name=name,
        config=config,
        decoder=decoder,
        transformations=transformations,
        client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None,
        file_uploader=file_uploader,
    )

    query_properties: Optional[QueryProperties] = None
    query_properties_key: Optional[str] = None
    self._ensure_query_properties_to_model(model.requester)
    if self._has_query_properties_in_request_parameters(model.requester):
        # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple
        # places instead of default to request_parameters which isn't clearly documented
        if (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            raise ValueError(
                f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
            )

        # Collect every QueryProperties entry; the key of the last one found is the
        # one forwarded to the requester as `query_properties_key`.
        query_properties_definitions = []
        for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters()
            if isinstance(request_parameter, QueryPropertiesModel):
                query_properties_key = key
                query_properties_definitions.append(request_parameter)

        if len(query_properties_definitions) > 1:
            raise ValueError(
                f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
            )

        if len(query_properties_definitions) == 1:
            query_properties = self._create_component_from_model(
                model=query_properties_definitions[0], stream_name=name, config=config
            )

        # Removes QueryProperties components from the interpolated mappings because it has been designed
        # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly
        # instead of through jinja interpolation
        if hasattr(model.requester, "request_parameters") and isinstance(
            model.requester.request_parameters, Mapping
        ):
            model.requester.request_parameters = self._remove_query_properties(
                model.requester.request_parameters
            )
    elif (
        hasattr(model.requester, "fetch_properties_from_endpoint")
        and model.requester.fetch_properties_from_endpoint
    ):
        # todo: Deprecate this condition once dependent connectors migrate to query_properties
        # The legacy field is wrapped in a QueryProperties model with no chunking and
        # no always-included properties.
        query_properties_definition = QueryPropertiesModel(
            type="QueryProperties",
            property_list=model.requester.fetch_properties_from_endpoint,
            always_include_properties=None,
            property_chunking=None,
        )  # type: ignore # $parameters has a default value

        query_properties = self.create_query_properties(
            model=query_properties_definition,
            stream_name=name,
            config=config,
        )
    elif hasattr(model.requester, "query_properties") and model.requester.query_properties:
        query_properties = self.create_query_properties(
            model=model.requester.query_properties,
            stream_name=name,
            config=config,
        )

    requester = self._create_component_from_model(
        model=model.requester,
        decoder=decoder,
        name=name,
        query_properties_key=query_properties_key,
        use_cache=use_cache,
        config=config,
    )

    # A partition router takes over as request options provider only when the caller
    # did not supply a provider other than the default one.
    if not request_options_provider:
        request_options_provider = DefaultRequestOptionsProvider(parameters={})
    if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(
        partition_router, PartitionRouter
    ):
        request_options_provider = partition_router

    paginator = (
        self._create_component_from_model(
            model=model.paginator,
            config=config,
            url_base=_get_url(requester),
            extractor_model=model.record_selector.extractor,
            decoder=decoder,
            cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None,
        )
        if model.paginator
        else NoPagination(parameters={})
    )

    ignore_stream_slicer_parameters_on_paginated_requests = (
        model.ignore_stream_slicer_parameters_on_paginated_requests or False
    )

    # Lazy path: substream router, no stored state for this stream, and at least one
    # parent declaring a lazy_read_pointer.
    if (
        model.partition_router
        and isinstance(model.partition_router, SubstreamPartitionRouterModel)
        and not bool(self._connector_state_manager.get_stream_state(name, None))
        and any(
            parent_stream_config.lazy_read_pointer
            for parent_stream_config in model.partition_router.parent_stream_configs
        )
    ):
        if incremental_sync:
            if incremental_sync.type != "DatetimeBasedCursor":
                raise ValueError(
                    f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                )

            elif incremental_sync.step or incremental_sync.cursor_granularity:
                raise ValueError(
                    f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                )

        if model.decoder and model.decoder.type != "JsonDecoder":
            raise ValueError(
                f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
            )

        # NOTE(review): unlike the SimpleRetriever below, no query properties, log
        # formatter or pagination tracker factory are passed here — confirm this is intended.
        return LazySimpleRetriever(
            name=name,
            paginator=paginator,
            primary_key=primary_key,
            requester=requester,
            record_selector=record_selector,
            stream_slicer=_NO_STREAM_SLICING,
            request_option_provider=request_options_provider,
            config=config,
            ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
            parameters=model.parameters or {},
        )

    if (
        model.record_selector.record_filter
        and model.pagination_reset
        and model.pagination_reset.limits
    ):
        raise ValueError("PaginationResetLimits are not supported while having record filter.")

    return SimpleRetriever(
        name=name,
        paginator=paginator,
        primary_key=primary_key,
        requester=requester,
        record_selector=record_selector,
        stream_slicer=_NO_STREAM_SLICING,
        request_option_provider=request_options_provider,
        config=config,
        ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
        additional_query_properties=query_properties,
        log_formatter=self._get_log_formatter(log_formatter, name),
        pagination_tracker_factory=self._create_pagination_tracker_factory(
            model.pagination_reset, cursor
        ),
        parameters=model.parameters or {},
    )
def create_state_delegating_stream(
    self,
    model: StateDelegatingStreamModel,
    config: Config,
    **kwargs: Any,
) -> DefaultStream:
    """Pick and build either the full-refresh or the incremental variant of a stream.

    Without stored state for ``model.name`` the full-refresh stream is returned.
    With state, the incremental stream is returned unless ``api_retention_period``
    resolves to a non-empty value, the stream is eligible for validation (see the
    catalog check below), and the stored cursor is judged older than that period;
    in that case the stream state is reset, a state message is emitted, and a
    freshly built full-refresh stream is returned.

    Raises:
        ValueError: if the three stream names differ, or if a retention period is
            set while either underlying stream uses an IncrementingCountCursor.
    """
    if (
        model.full_refresh_stream.name != model.name
        or model.name != model.incremental_stream.name
    ):
        raise ValueError(
            f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
        )

    # Resolve api_retention_period with config context (supports Jinja2 interpolation)
    resolved_retention_period: Optional[str] = None
    if model.api_retention_period:
        interpolated_retention = InterpolatedString.create(
            model.api_retention_period, parameters=model.parameters or {}
        )
        resolved_value = interpolated_retention.eval(config=config)
        # A falsy evaluation result leaves the retention period unset.
        if resolved_value:
            resolved_retention_period = str(resolved_value)

    if resolved_retention_period:
        for stream_model in (model.full_refresh_stream, model.incremental_stream):
            if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel):
                raise ValueError(
                    f"Stream '{model.name}' uses IncrementingCountCursor which is not supported "
                    f"with api_retention_period. IncrementingCountCursor does not use datetime-based "
                    f"cursors, so cursor age validation cannot be performed."
                )

    stream_state = self._connector_state_manager.get_stream_state(model.name, None)

    if not stream_state:
        return self._create_component_from_model(  # type: ignore[no-any-return]
            model.full_refresh_stream, config=config, **kwargs
        )

    incremental_stream: DefaultStream = self._create_component_from_model(
        model.incremental_stream, config=config, **kwargs
    )  # type: ignore[assignment]

    # Only run cursor age validation for streams that are in the configured
    # catalog (or when no catalog was provided, e.g. during discover / connector
    # builder). Streams not selected by the user but instantiated as parent-stream
    # dependencies must not go through this path because it emits state messages
    # that the destination does not know about, causing "Stream not found" crashes.
    stream_is_in_catalog = (
        not self._stream_name_to_configured_stream  # no catalog → validate by default
        or model.name in self._stream_name_to_configured_stream
    )
    if resolved_retention_period and stream_is_in_catalog:
        # This instance is built only to obtain its cursor for the age check below;
        # it is not the stream that gets returned.
        full_refresh_stream: DefaultStream = self._create_component_from_model(
            model.full_refresh_stream, config=config, **kwargs
        )  # type: ignore[assignment]
        if self._is_cursor_older_than_retention_period(
            stream_state,
            full_refresh_stream.cursor,
            incremental_stream.cursor,
            resolved_retention_period,
            model.name,
        ):
            # Clear state BEFORE constructing the full_refresh_stream so that
            # its cursor starts from start_date instead of the stale cursor.
            self._connector_state_manager.update_state_for_stream(model.name, None, {})
            state_message = self._connector_state_manager.create_state_message(model.name, None)
            self._message_repository.emit_message(state_message)
            return self._create_component_from_model(  # type: ignore[no-any-return]
                model.full_refresh_stream, config=config, **kwargs
            )

    return incremental_stream
def create_async_retriever(
    self,
    model: AsyncRetrieverModel,
    config: Config,
    *,
    name: str,
    primary_key: Optional[
        Union[str, List[str], List[List[str]]]
    ],  # accepted to mirror create_simple_retriever; not read in this method body
    stream_slicer: Optional[StreamSlicer],
    client_side_incremental_sync: Optional[Dict[str, Any]] = None,
    transformations: List[RecordTransformation],
    **kwargs: Any,
) -> AsyncRetriever:
    """Build an `AsyncRetriever` from its declarative model.

    The method instantiates the creation, polling and download requesters (plus the
    optional abort, delete and download-target requesters), bundles them into an
    `AsyncHttpJobRepository`, and wraps the stream slicer in an
    `AsyncJobPartitionRouter` that creates an `AsyncJobOrchestrator` per set of
    stream slices.

    Args:
        model: The parsed `AsyncRetriever` model.
        config: The connector configuration, used for interpolation and passed to
            every created component.
        name: Stream name; also used to derive the names of the job requesters.
        primary_key: Not used in this method body.
        stream_slicer: Slicer to wrap; a `SinglePartitionRouter` is used when falsy.
        client_side_incremental_sync: Forwarded to the record selector.
        transformations: Forwarded to the record selector.
        **kwargs: Not used in this method body.

    Returns:
        The assembled `AsyncRetriever`.

    Raises:
        ValueError: If `download_target_requester` is set without a
            `download_target_extractor`.
    """
    if model.download_target_requester and not model.download_target_extractor:
        raise ValueError(
            f"`download_target_extractor` required if using a `download_target_requester`"
        )

    def _get_download_retriever(
        requester: Requester, extractor: RecordExtractor, _decoder: Decoder
    ) -> SimpleRetriever:
        """Build the `SimpleRetriever` used to download the results of a job."""
        # We create a record selector for the download retriever
        # with no schema normalization, no transformations and no record filter,
        # as all of this occurs in the record_selector of the AsyncRetriever
        record_selector = RecordSelector(
            extractor=extractor,
            name=name,
            record_filter=None,
            transformations=[],
            schema_normalization=TypeTransformer(TransformConfig.NoTransform),
            config=config,
            parameters={},
        )
        # Fall back to no pagination when the model defines no download paginator.
        paginator = (
            self._create_component_from_model(
                model=model.download_paginator,
                decoder=_decoder,
                config=config,
                url_base="",
            )
            if model.download_paginator
            else NoPagination(parameters={})
        )

        return SimpleRetriever(
            requester=requester,
            record_selector=record_selector,
            primary_key=None,
            name=name,
            paginator=paginator,
            config=config,
            parameters={},
            log_formatter=self._get_log_formatter(None, name),
        )

    def _get_job_timeout() -> datetime.timedelta:
        """Return the polling job timeout, in minutes, as a `timedelta`."""
        # `polling_job_timeout` is stringified and interpolated against the config
        # before being cast to int; a falsy value means "no user-defined timeout".
        user_defined_timeout: Optional[int] = (
            int(
                InterpolatedString.create(
                    str(model.polling_job_timeout),
                    parameters={},
                ).eval(config)
            )
            if model.polling_job_timeout
            else None
        )

        # check for user defined timeout during the test read or 15 minutes
        test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15)
        # default value for non-connector builder is 60 minutes.
        default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60)

        return (
            test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout
        )

    # Decoder shared by the record selector and every requester except the download one.
    decoder = (
        self._create_component_from_model(model=model.decoder, config=config)
        if model.decoder
        else JsonDecoder(parameters={})
    )
    record_selector = self._create_component_from_model(
        model=model.record_selector,
        config=config,
        decoder=decoder,
        name=name,
        transformations=transformations,
        client_side_incremental_sync=client_side_incremental_sync,
    )

    stream_slicer = stream_slicer or SinglePartitionRouter(parameters={})
    # When slice limiting is enabled, cap the number of slices (5 when no limit is set).
    if self._should_limit_slices_fetched():
        stream_slicer = cast(
            StreamSlicer,
            StreamSlicerTestReadDecorator(
                wrapped_slicer=stream_slicer,
                maximum_number_of_slices=self._limit_slices_fetched or 5,
            ),
        )

    creation_requester = self._create_component_from_model(
        model=model.creation_requester,
        decoder=decoder,
        config=config,
        name=f"job creation - {name}",
    )
    polling_requester = self._create_component_from_model(
        model=model.polling_requester,
        decoder=decoder,
        config=config,
        name=f"job polling - {name}",
    )
    job_download_components_name = f"job download - {name}"
    # The download side has its own decoder, independent of `decoder` above.
    download_decoder = (
        self._create_component_from_model(model=model.download_decoder, config=config)
        if model.download_decoder
        else JsonDecoder(parameters={})
    )
    # Without an explicit download extractor, extract from the root (empty field path).
    download_extractor = (
        self._create_component_from_model(
            model=model.download_extractor,
            config=config,
            decoder=download_decoder,
            parameters=model.parameters,
        )
        if model.download_extractor
        else DpathExtractor(
            [],
            config=config,
            decoder=download_decoder,
            parameters=model.parameters or {},
        )
    )
    download_requester = self._create_component_from_model(
        model=model.download_requester,
        decoder=download_decoder,
        config=config,
        name=job_download_components_name,
    )
    download_retriever = _get_download_retriever(
        download_requester, download_extractor, download_decoder
    )
    # The abort, delete and download-target requesters are optional.
    abort_requester = (
        self._create_component_from_model(
            model=model.abort_requester,
            decoder=decoder,
            config=config,
            name=f"job abort - {name}",
        )
        if model.abort_requester
        else None
    )
    delete_requester = (
        self._create_component_from_model(
            model=model.delete_requester,
            decoder=decoder,
            config=config,
            name=f"job delete - {name}",
        )
        if model.delete_requester
        else None
    )
    download_target_requester = (
        self._create_component_from_model(
            model=model.download_target_requester,
            decoder=decoder,
            config=config,
            name=f"job extract_url - {name}",
        )
        if model.download_target_requester
        else None
    )
    status_extractor = self._create_component_from_model(
        model=model.status_extractor, decoder=decoder, config=config, name=name
    )
    download_target_extractor = (
        self._create_component_from_model(
            model=model.download_target_extractor,
            decoder=decoder,
            config=config,
            name=name,
        )
        if model.download_target_extractor
        else None
    )

    job_repository: AsyncJobRepository = AsyncHttpJobRepository(
        creation_requester=creation_requester,
        polling_requester=polling_requester,
        download_retriever=download_retriever,
        download_target_requester=download_target_requester,
        abort_requester=abort_requester,
        delete_requester=delete_requester,
        status_extractor=status_extractor,
        status_mapping=self._create_async_job_status_mapping(model.status_mapping, config),
        download_target_extractor=download_target_extractor,
        job_timeout=_get_job_timeout(),
    )

    # Interpolated against the config like the job timeout; a falsy value yields None.
    failed_retry_wait_time_in_seconds: Optional[int] = (
        int(
            InterpolatedString.create(
                str(model.failed_retry_wait_time_in_seconds),
                parameters={},
            ).eval(config)
        )
        if model.failed_retry_wait_time_in_seconds
        else None
    )

    async_job_partition_router = AsyncJobPartitionRouter(
        job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
            job_repository,
            stream_slices,
            self._job_tracker,
            self._message_repository,
            # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk
            has_bulk_parent=False,
            # set `job_max_retry` to 1 for the Connector Builder use-case.
            # `None` == default retry is set to 3 attempts, under the hood.
            job_max_retry=1 if self._emit_connector_builder_messages else None,
            failed_retry_wait_time_in_seconds=failed_retry_wait_time_in_seconds,
        ),
        stream_slicer=stream_slicer,
        config=config,
        parameters=model.parameters or {},
    )

    return AsyncRetriever(
        record_selector=record_selector,
        stream_slicer=async_job_partition_router,
        config=config,
        parameters=model.parameters or {},
    )
def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec:
    """Build a `Spec` from its model, including the config normalization components.

    Migrations, transformations and validations are read from
    `model.config_normalization_rules`; each list is empty when the rules object
    or the corresponding attribute is missing or empty.
    """
    rules = model.config_normalization_rules

    def _build_all(rule_models: Any) -> List[Any]:
        # A falsy collection (None or empty) produces no components.
        return [
            self._create_component_from_model(rule_model, config)
            for rule_model in (rule_models or [])
        ]

    # Built in the same order as before: migrations, transformations, validations.
    migrations = _build_all(rules.config_migrations if rules else None)
    transformations = _build_all(rules.transformations if rules else None)
    validations = _build_all(rules.validations if rules else None)

    return Spec(
        connection_specification=model.connection_specification,
        documentation_url=model.documentation_url,
        advanced_auth=model.advanced_auth,
        parameters={},
        config_migrations=migrations,
        config_transformations=transformations,
        config_validations=validations,
    )
def create_substream_partition_router(
    self,
    model: SubstreamPartitionRouterModel,
    config: Config,
    *,
    stream_name: str,
    **kwargs: Any,
) -> SubstreamPartitionRouter:
    """Build a `SubstreamPartitionRouter` with one parent stream config per model entry.

    Each parent config is created through
    `create_parent_stream_config_with_substream_wrapper`; a missing or empty
    `parent_stream_configs` yields an empty list.
    """
    parent_models = model.parent_stream_configs or []
    built_parent_configs = [
        self.create_parent_stream_config_with_substream_wrapper(
            model=parent_model, config=config, stream_name=stream_name, **kwargs
        )
        for parent_model in parent_models
    ]

    return SubstreamPartitionRouter(
        parent_stream_configs=built_parent_configs,
        parameters=model.parameters or {},
        config=config,
    )
def create_parent_stream_config_with_substream_wrapper(
    self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any
) -> Any:
    """Create a parent stream config using a dedicated factory for the parent stream.

    The dedicated `ModelToComponentFactory` copies this factory's limits and flags,
    receives its own connector state manager, and wraps the message repository so
    that log messages are tagged as coming from a substream / auxiliary request.
    """
    child_state = self._connector_state_manager.get_stream_state(stream_name, None)

    # The child state is only handed to the parent when the parent is an
    # incremental dependency and there is a non-empty child state.
    parent_state: Optional[Mapping[str, Any]] = None
    if model.incremental_dependency and child_state:
        parent_state = child_state

    parent_state_manager = self._instantiate_parent_stream_state_manager(
        child_state, config, model, parent_state
    )

    log_tags = {
        "airbyte_cdk": {"stream": {"is_substream": True}},
        "http": {"is_auxiliary": True},
    }
    decorated_repository = LogAppenderMessageRepositoryDecorator(
        log_tags,
        self._message_repository,
        self._evaluate_log_level(self._emit_connector_builder_messages),
    )

    parent_factory = ModelToComponentFactory(
        connector_state_manager=parent_state_manager,
        limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice,
        limit_slices_fetched=self._limit_slices_fetched,
        emit_connector_builder_messages=self._emit_connector_builder_messages,
        disable_retries=self._disable_retries,
        disable_cache=self._disable_cache,
        message_repository=StateFilteringMessageRepository(decorated_repository),
        api_budget=self._api_budget,
    )

    return parent_factory.create_parent_stream_config(
        model=model, config=config, stream_name=stream_name, **kwargs
    )
@staticmethod
def create_wait_time_from_header(
    model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any
) -> WaitTimeFromHeaderBackoffStrategy:
    """Build a `WaitTimeFromHeaderBackoffStrategy` from its model."""
    # Passed through unchanged: None stays None, any other value is kept.
    max_wait = model.max_waiting_time_in_seconds
    strategy_parameters = model.parameters or {}

    return WaitTimeFromHeaderBackoffStrategy(
        header=model.header,
        parameters=strategy_parameters,
        config=config,
        regex=model.regex,
        max_waiting_time_in_seconds=max_wait,
    )
@staticmethod
def create_wait_until_time_from_header(
    model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any
) -> WaitUntilTimeFromHeaderBackoffStrategy:
    """Build a `WaitUntilTimeFromHeaderBackoffStrategy` from its model."""
    strategy_parameters = model.parameters or {}

    strategy = WaitUntilTimeFromHeaderBackoffStrategy(
        header=model.header,
        parameters=strategy_parameters,
        config=config,
        min_wait=model.min_wait,
        regex=model.regex,
    )
    return strategy
@staticmethod
def create_components_mapping_definition(
    model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any
) -> ComponentMappingDefinition:
    """Build a `ComponentMappingDefinition` with interpolated value and field path.

    The mapping value and every segment of `field_path` are wrapped with
    `InterpolatedString.create`; the JSON-schema type name in `value_type` is
    converted through `_json_schema_type_name_to_type`.
    """
    value_template = InterpolatedString.create(model.value, parameters=model.parameters or {})

    path_segments = []
    for segment in model.field_path:
        path_segments.append(
            InterpolatedString.create(segment, parameters=model.parameters or {})
        )

    resolved_value_type = ModelToComponentFactory._json_schema_type_name_to_type(
        model.value_type
    )

    return ComponentMappingDefinition(
        field_path=path_segments,  # type: ignore[arg-type] # field_path can be str and InterpolatedString
        value=value_template,
        value_type=resolved_value_type,
        create_or_update=model.create_or_update,
        condition=model.condition,
        parameters=model.parameters or {},
    )
def create_http_components_resolver(
    self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None
) -> Any:
    """Build an `HttpComponentsResolver` and its retriever from the model.

    Raises:
        ValueError: If any component mapping defines a `condition`.
    """
    resolver_name = stream_name if stream_name else "__http_components_resolver"

    retriever = self._create_component_from_model(
        model=model.retriever,
        config=config,
        name=f"{resolver_name}",
        primary_key=None,
        stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config),
        transformations=[],
    )

    mapping_definitions = []
    for mapping_model in model.components_mapping:
        if mapping_model.condition:
            raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
        mapping_value_type = ModelToComponentFactory._json_schema_type_name_to_type(
            mapping_model.value_type
        )
        mapping_definitions.append(
            self._create_component_from_model(
                model=mapping_model,
                value_type=mapping_value_type,
                config=config,
            )
        )

    # The stream slicer is built a second time here for the resolver itself.
    resolver_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config)

    return HttpComponentsResolver(
        retriever=retriever,
        stream_slicer=resolver_slicer,
        config=config,
        components_mapping=mapping_definitions,
        parameters=model.parameters or {},
    )
@staticmethod
def create_stream_config(
    model: StreamConfigModel, config: Config, **kwargs: Any
) -> StreamConfig:
    """Build a `StreamConfig`, copying `configs_pointer` into a new list."""
    # A missing or empty pointer becomes an empty list.
    pointer_copy: List[Union[InterpolatedString, str]] = list(model.configs_pointer or [])

    return StreamConfig(
        configs_pointer=pointer_copy,
        default_values=model.default_values,
        parameters=model.parameters or {},
    )
def create_config_components_resolver(
    self,
    model: ConfigComponentsResolverModel,
    config: Config,
) -> Any:
    """Build a `ConfigComponentsResolver` from its model.

    `model.stream_config` may be a single model or a list of models; a single
    model is treated as a one-element list.
    """
    raw_stream_config = model.stream_config
    if isinstance(raw_stream_config, list):
        stream_config_models = raw_stream_config
    else:
        stream_config_models = [raw_stream_config]

    built_stream_configs = []
    for stream_config_model in stream_config_models:
        built_stream_configs.append(
            self._create_component_from_model(
                stream_config_model, config=config, parameters=model.parameters or {}
            )
        )

    built_mappings = []
    for mapping_model in model.components_mapping:
        mapping_value_type = ModelToComponentFactory._json_schema_type_name_to_type(
            mapping_model.value_type
        )
        built_mappings.append(
            self._create_component_from_model(
                model=mapping_model,
                value_type=mapping_value_type,
                config=config,
                parameters=model.parameters,
            )
        )

    return ConfigComponentsResolver(
        stream_configs=built_stream_configs,
        config=config,
        components_mapping=built_mappings,
        parameters=model.parameters or {},
    )
def create_parametrized_components_resolver(
    self,
    model: ParametrizedComponentsResolverModel,
    config: Config,
) -> ParametrizedComponentsResolver:
    """Build a `ParametrizedComponentsResolver` from its model.

    Raises:
        ValueError: If any component mapping defines a `condition`.
    """
    parameters_definition = StreamParametersDefinition(
        list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream
    )

    mapping_definitions = []
    for mapping_model in model.components_mapping:
        if mapping_model.condition:
            raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
        mapping_value_type = ModelToComponentFactory._json_schema_type_name_to_type(
            mapping_model.value_type
        )
        mapping_definitions.append(
            self._create_component_from_model(
                model=mapping_model,
                value_type=mapping_value_type,
                config=config,
            )
        )

    return ParametrizedComponentsResolver(
        stream_parameters=parameters_definition,
        config=config,
        components_mapping=mapping_definitions,
        parameters=model.parameters or {},
    )
def create_http_api_budget(
    self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any
) -> HttpAPIBudget:
    """Build an `HttpAPIBudget` from its model.

    Falsy header names and status codes fall back to "ratelimit-reset",
    "ratelimit-remaining" and `[429]` respectively.
    """
    built_policies = []
    for policy_model in model.policies:
        built_policies.append(self._create_component_from_model(model=policy_model, config=config))

    reset_header = model.ratelimit_reset_header or "ratelimit-reset"
    remaining_header = model.ratelimit_remaining_header or "ratelimit-remaining"
    rate_limited_statuses = model.status_codes_for_ratelimit_hit or [429]

    return HttpAPIBudget(
        policies=built_policies,
        ratelimit_reset_header=reset_header,
        ratelimit_remaining_header=remaining_header,
        status_codes_for_ratelimit_hit=rate_limited_statuses,
    )
def create_fixed_window_call_rate_policy(
    self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any
) -> FixedWindowCallRatePolicy:
    """Build a `FixedWindowCallRatePolicy` from its model.

    `model.period` is parsed with `parse_duration`, and one matcher component is
    created per entry in `model.matchers`.
    """
    built_matchers = []
    for matcher_model in model.matchers:
        built_matchers.append(self._create_component_from_model(model=matcher_model, config=config))

    # Set the initial reset timestamp to 10 days from now.
    # This value will be updated by the first request.
    initial_reset_ts = datetime.datetime.now() + datetime.timedelta(days=10)
    window = parse_duration(model.period)

    return FixedWindowCallRatePolicy(
        next_reset_ts=initial_reset_ts,
        period=window,
        call_limit=model.call_limit,
        matchers=built_matchers,
    )
def create_file_uploader(
    self, model: FileUploaderModel, config: Config, **kwargs: Any
) -> FileUploader:
    """Build a file uploader from its model.

    When connector-builder messages are emitted, a `NoopFileWriter` is used and
    the uploader is wrapped in a `ConnectorBuilderFileUploader`; otherwise a
    `LocalFileSystemFileWriter` is used and the `DefaultFileUploader` is returned
    directly.
    """
    component_name = "File Uploader"
    in_connector_builder = self._emit_connector_builder_messages

    requester = self._create_component_from_model(
        model=model.requester,
        config=config,
        name=component_name,
        **kwargs,
    )
    target_extractor = self._create_component_from_model(
        model=model.download_target_extractor,
        config=config,
        name=component_name,
        **kwargs,
    )

    if in_connector_builder:
        writer = NoopFileWriter()
    else:
        writer = LocalFileSystemFileWriter()

    uploader = DefaultFileUploader(
        requester=requester,
        download_target_extractor=target_extractor,
        config=config,
        file_writer=writer,
        parameters=model.parameters or {},
        # A falsy filename extractor is normalized to None.
        filename_extractor=model.filename_extractor or None,
    )

    if in_connector_builder:
        return ConnectorBuilderFileUploader(uploader)
    return uploader
def create_moving_window_call_rate_policy(
    self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any
) -> MovingWindowCallRatePolicy:
    """Build a `MovingWindowCallRatePolicy` with its rates and request matchers."""

    def _build(component_model: Any) -> Any:
        return self._create_component_from_model(model=component_model, config=config)

    # Rates are created before matchers, as in the model-declared order.
    built_rates = [_build(rate_model) for rate_model in model.rates]
    built_matchers = [_build(matcher_model) for matcher_model in model.matchers]

    return MovingWindowCallRatePolicy(
        rates=built_rates,
        matchers=built_matchers,
    )
def create_unlimited_call_rate_policy(
    self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any
) -> UnlimitedCallRatePolicy:
    """Build an `UnlimitedCallRatePolicy` with one matcher per model entry."""
    built_matchers = []
    for matcher_model in model.matchers:
        built_matchers.append(self._create_component_from_model(model=matcher_model, config=config))

    return UnlimitedCallRatePolicy(matchers=built_matchers)
def create_http_request_matcher(
    self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any
) -> HttpRequestRegexMatcher:
    """Build an `HttpRequestRegexMatcher`, resolving and validating its weight.

    A string weight is interpolated against `config` and cast to int; any other
    non-None weight is cast to int directly. A `None` weight is passed through.

    Raises:
        ValueError: If the resolved weight is lower than 1.
    """
    raw_weight = model.weight
    if raw_weight is None:
        resolved_weight = None
    else:
        if isinstance(raw_weight, str):
            evaluated = InterpolatedString.create(raw_weight, parameters={}).eval(config)
            resolved_weight = int(evaluated)
        else:
            resolved_weight = int(raw_weight)
        if resolved_weight < 1:
            raise ValueError(f"weight must be >= 1, got {resolved_weight}")

    return HttpRequestRegexMatcher(
        method=model.method,
        url_base=model.url_base,
        url_path_pattern=model.url_path_pattern,
        params=model.params,
        headers=model.headers,
        weight=resolved_weight,
    )
def create_grouping_partition_router(
    self,
    model: GroupingPartitionRouterModel,
    config: Config,
    *,
    stream_name: str,
    **kwargs: Any,
) -> GroupingPartitionRouter:
    """Build a `GroupingPartitionRouter` around the model's underlying router.

    Raises:
        ValueError: If `group_size` is lower than 1, or if the underlying router
            (substream or list) defines request options.
    """
    router = self._create_component_from_model(
        model=model.underlying_partition_router,
        config=config,
        stream_name=stream_name,
        **kwargs,
    )

    group_size = model.group_size
    if group_size < 1:
        raise ValueError(f"Group size must be greater than 0, got {group_size}")

    # Request options in underlying partition routers are not supported for GroupingPartitionRouter
    # because they are specific to individual partitions and cannot be aggregated or handled
    # when grouping, potentially leading to incorrect API calls. Any request customization
    # should be managed at the stream level through the requester's configuration.
    if isinstance(router, SubstreamPartitionRouter) and any(
        parent.request_option for parent in router.parent_stream_configs
    ):
        raise ValueError("Request options are not supported for GroupingPartitionRouter.")

    if isinstance(router, ListPartitionRouter) and router.request_option:
        raise ValueError("Request options are not supported for GroupingPartitionRouter.")

    # Deduplication defaults to enabled when the model leaves it unset.
    deduplicate = True if model.deduplicate is None else model.deduplicate

    return GroupingPartitionRouter(
        group_size=group_size,
        underlying_partition_router=router,
        deduplicate=deduplicate,
        config=config,
    )