airbyte_cdk.sources.declarative.parsers.model_to_component_factory
1# 2# Copyright (c) 2025 Airbyte, Inc., all rights reserved. 3# 4 5from __future__ import annotations 6 7import datetime 8import importlib 9import inspect 10import logging 11import re 12from functools import partial 13from typing import ( 14 TYPE_CHECKING, 15 Any, 16 Callable, 17 Dict, 18 List, 19 Mapping, 20 MutableMapping, 21 Optional, 22 Tuple, 23 Type, 24 Union, 25 cast, 26 get_args, 27 get_origin, 28 get_type_hints, 29) 30 31if TYPE_CHECKING: 32 from airbyte_cdk.legacy.sources.declarative.incremental.datetime_based_cursor import ( 33 DatetimeBasedCursor, 34 ) 35 36from airbyte_protocol_dataclasses.models import ConfiguredAirbyteStream 37from isodate import parse_duration 38from pydantic.v1 import BaseModel 39from requests import Response 40 41from airbyte_cdk.connector_builder.models import ( 42 LogMessage as ConnectorBuilderLogMessage, 43) 44from airbyte_cdk.models import ( 45 AirbyteStateBlob, 46 AirbyteStateMessage, 47 AirbyteStateType, 48 AirbyteStreamState, 49 ConfiguredAirbyteCatalog, 50 FailureType, 51 Level, 52 StreamDescriptor, 53) 54from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager 55from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator 56from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker 57from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository 58from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus 59from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator 60from airbyte_cdk.sources.declarative.auth.declarative_authenticator import ( 61 DeclarativeAuthenticator, 62 NoAuth, 63) 64from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm 65from airbyte_cdk.sources.declarative.auth.oauth import ( 66 DeclarativeSingleUseRefreshTokenOauth2Authenticator, 67) 68from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator 69from 
airbyte_cdk.sources.declarative.auth.token import ( 70 ApiKeyAuthenticator, 71 BasicHttpAuthenticator, 72 BearerAuthenticator, 73 LegacySessionTokenAuthenticator, 74) 75from airbyte_cdk.sources.declarative.auth.token_provider import ( 76 InterpolatedSessionTokenProvider, 77 InterpolatedStringTokenProvider, 78 SessionTokenProvider, 79 TokenProvider, 80) 81from airbyte_cdk.sources.declarative.checks import ( 82 CheckDynamicStream, 83 CheckStream, 84 DynamicStreamCheckConfig, 85) 86from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel 87from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime 88from airbyte_cdk.sources.declarative.decoders import ( 89 Decoder, 90 IterableDecoder, 91 JsonDecoder, 92 PaginationDecoderDecorator, 93 XmlDecoder, 94 ZipfileDecoder, 95) 96from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import ( 97 CompositeRawDecoder, 98 CsvParser, 99 GzipParser, 100 JsonLineParser, 101 JsonParser, 102 Parser, 103) 104from airbyte_cdk.sources.declarative.extractors import ( 105 DpathExtractor, 106 RecordFilter, 107 RecordSelector, 108 ResponseToFileExtractor, 109) 110from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor 111from airbyte_cdk.sources.declarative.extractors.record_filter import ( 112 ClientSideIncrementalRecordFilterDecorator, 113) 114from airbyte_cdk.sources.declarative.incremental import ( 115 ConcurrentCursorFactory, 116 ConcurrentPerPartitionCursor, 117) 118from airbyte_cdk.sources.declarative.interpolation import InterpolatedString 119from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping 120from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import ( 121 LegacyToPerPartitionStateMigration, 122) 123from airbyte_cdk.sources.declarative.models import ( 124 CustomStateMigration, 125 PaginationResetLimits, 126) 127from 
airbyte_cdk.sources.declarative.models.base_model_with_deprecations import ( 128 DEPRECATION_LOGS_TAG, 129 BaseModelWithDeprecations, 130) 131from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 132 Action1 as PaginationResetActionModel, 133) 134from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 135 AddedFieldDefinition as AddedFieldDefinitionModel, 136) 137from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 138 AddFields as AddFieldsModel, 139) 140from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 141 ApiKeyAuthenticator as ApiKeyAuthenticatorModel, 142) 143from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 144 AsyncJobStatusMap as AsyncJobStatusMapModel, 145) 146from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 147 AsyncRetriever as AsyncRetrieverModel, 148) 149from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 150 BasicHttpAuthenticator as BasicHttpAuthenticatorModel, 151) 152from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 153 BearerAuthenticator as BearerAuthenticatorModel, 154) 155from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 156 CheckDynamicStream as CheckDynamicStreamModel, 157) 158from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 159 CheckStream as CheckStreamModel, 160) 161from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 162 ComplexFieldType as ComplexFieldTypeModel, 163) 164from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 165 ComponentMappingDefinition as ComponentMappingDefinitionModel, 166) 167from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 168 CompositeErrorHandler as CompositeErrorHandlerModel, 169) 170from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 171 ConcurrencyLevel as ConcurrencyLevelModel, 172) 173from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 174 ConfigAddFields as ConfigAddFieldsModel, 175) 176from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 177 ConfigComponentsResolver as ConfigComponentsResolverModel, 178) 179from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 180 ConfigMigration as ConfigMigrationModel, 181) 182from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 183 ConfigRemapField as ConfigRemapFieldModel, 184) 185from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 186 ConfigRemoveFields as ConfigRemoveFieldsModel, 187) 188from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 189 ConstantBackoffStrategy as ConstantBackoffStrategyModel, 190) 191from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 192 CsvDecoder as CsvDecoderModel, 193) 194from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 195 CursorPagination as CursorPaginationModel, 196) 197from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 198 CustomAuthenticator as CustomAuthenticatorModel, 199) 200from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 201 CustomBackoffStrategy as CustomBackoffStrategyModel, 202) 203from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 204 CustomConfigTransformation as CustomConfigTransformationModel, 205) 206from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 207 CustomDecoder as CustomDecoderModel, 208) 209from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 210 CustomErrorHandler as CustomErrorHandlerModel, 211) 212from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 213 CustomPaginationStrategy as CustomPaginationStrategyModel, 214) 215from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 216 CustomPartitionRouter as CustomPartitionRouterModel, 217) 218from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 219 CustomRecordExtractor as CustomRecordExtractorModel, 220) 221from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 222 CustomRecordFilter as CustomRecordFilterModel, 223) 224from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 225 CustomRequester as CustomRequesterModel, 226) 227from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 228 CustomRetriever as CustomRetrieverModel, 229) 230from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 231 CustomSchemaLoader as CustomSchemaLoader, 232) 233from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 234 CustomSchemaNormalization as CustomSchemaNormalizationModel, 235) 236from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 237 CustomTransformation as CustomTransformationModel, 238) 239from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 240 CustomValidationStrategy as CustomValidationStrategyModel, 241) 242from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 243 DatetimeBasedCursor as DatetimeBasedCursorModel, 244) 245from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 246 DeclarativeStream as DeclarativeStreamModel, 247) 248from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 249 DefaultErrorHandler as DefaultErrorHandlerModel, 250) 251from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 252 DefaultPaginator as DefaultPaginatorModel, 
253) 254from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 255 DpathExtractor as DpathExtractorModel, 256) 257from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 258 DpathFlattenFields as DpathFlattenFieldsModel, 259) 260from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 261 DpathValidator as DpathValidatorModel, 262) 263from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 264 DynamicSchemaLoader as DynamicSchemaLoaderModel, 265) 266from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 267 DynamicStreamCheckConfig as DynamicStreamCheckConfigModel, 268) 269from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 270 ExponentialBackoffStrategy as ExponentialBackoffStrategyModel, 271) 272from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 273 FileUploader as FileUploaderModel, 274) 275from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 276 FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel, 277) 278from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 279 FlattenFields as FlattenFieldsModel, 280) 281from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 282 GroupByKeyMergeStrategy as GroupByKeyMergeStrategyModel, 283) 284from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 285 GroupingPartitionRouter as GroupingPartitionRouterModel, 286) 287from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 288 GzipDecoder as GzipDecoderModel, 289) 290from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 291 HTTPAPIBudget as HTTPAPIBudgetModel, 292) 293from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 294 HttpComponentsResolver as HttpComponentsResolverModel, 295) 296from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 297 HttpRequester as HttpRequesterModel, 298) 299from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 300 HttpRequestRegexMatcher as HttpRequestRegexMatcherModel, 301) 302from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 303 HttpResponseFilter as HttpResponseFilterModel, 304) 305from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 306 IncrementingCountCursor as IncrementingCountCursorModel, 307) 308from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 309 InlineSchemaLoader as InlineSchemaLoaderModel, 310) 311from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 312 IterableDecoder as IterableDecoderModel, 313) 314from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 315 JsonDecoder as JsonDecoderModel, 316) 317from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 318 JsonFileSchemaLoader as JsonFileSchemaLoaderModel, 319) 320from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 321 JsonlDecoder as JsonlDecoderModel, 322) 323from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 324 JsonSchemaPropertySelector as JsonSchemaPropertySelectorModel, 325) 326from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 327 JwtAuthenticator as JwtAuthenticatorModel, 328) 329from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 330 JwtHeaders as JwtHeadersModel, 331) 332from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 333 JwtPayload as JwtPayloadModel, 334) 335from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 336 KeysReplace as KeysReplaceModel, 337) 338from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 
339 KeysToLower as KeysToLowerModel, 340) 341from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 342 KeysToSnakeCase as KeysToSnakeCaseModel, 343) 344from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 345 LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel, 346) 347from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 348 LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel, 349) 350from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 351 ListPartitionRouter as ListPartitionRouterModel, 352) 353from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 354 MinMaxDatetime as MinMaxDatetimeModel, 355) 356from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 357 MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel, 358) 359from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 360 NoAuth as NoAuthModel, 361) 362from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 363 NoPagination as NoPaginationModel, 364) 365from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 366 OAuthAuthenticator as OAuthAuthenticatorModel, 367) 368from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 369 OffsetIncrement as OffsetIncrementModel, 370) 371from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 372 PageIncrement as PageIncrementModel, 373) 374from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 375 PaginationReset as PaginationResetModel, 376) 377from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 378 ParametrizedComponentsResolver as ParametrizedComponentsResolverModel, 379) 380from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 381 
ParentStreamConfig as ParentStreamConfigModel, 382) 383from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 384 PredicateValidator as PredicateValidatorModel, 385) 386from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 387 PropertiesFromEndpoint as PropertiesFromEndpointModel, 388) 389from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 390 PropertyChunking as PropertyChunkingModel, 391) 392from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 393 PropertyLimitType as PropertyLimitTypeModel, 394) 395from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 396 QueryProperties as QueryPropertiesModel, 397) 398from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 399 Rate as RateModel, 400) 401from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 402 RecordFilter as RecordFilterModel, 403) 404from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 405 RecordSelector as RecordSelectorModel, 406) 407from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 408 RefreshTokenUpdater as RefreshTokenUpdaterModel, 409) 410from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 411 RemoveFields as RemoveFieldsModel, 412) 413from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 414 RequestOption as RequestOptionModel, 415) 416from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 417 RequestPath as RequestPathModel, 418) 419from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 420 ResponseToFileExtractor as ResponseToFileExtractorModel, 421) 422from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 423 SchemaNormalization as SchemaNormalizationModel, 424) 425from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 426 SchemaTypeIdentifier as SchemaTypeIdentifierModel, 427) 428from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 429 SelectiveAuthenticator as SelectiveAuthenticatorModel, 430) 431from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 432 SessionTokenAuthenticator as SessionTokenAuthenticatorModel, 433) 434from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 435 SimpleRetriever as SimpleRetrieverModel, 436) 437from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel 438from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 439 StateDelegatingStream as StateDelegatingStreamModel, 440) 441from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 442 StreamConfig as StreamConfigModel, 443) 444from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 445 SubstreamPartitionRouter as SubstreamPartitionRouterModel, 446) 447from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 448 TypesMap as TypesMapModel, 449) 450from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 451 UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel, 452) 453from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 454 ValidateAdheresToSchema as ValidateAdheresToSchemaModel, 455) 456from airbyte_cdk.sources.declarative.models.declarative_component_schema import ValueType 457from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 458 WaitTimeFromHeader as WaitTimeFromHeaderModel, 459) 460from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 461 WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel, 462) 463from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 
464 XmlDecoder as XmlDecoderModel, 465) 466from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( 467 ZipfileDecoder as ZipfileDecoderModel, 468) 469from airbyte_cdk.sources.declarative.partition_routers import ( 470 CartesianProductStreamSlicer, 471 GroupingPartitionRouter, 472 ListPartitionRouter, 473 PartitionRouter, 474 SinglePartitionRouter, 475 SubstreamPartitionRouter, 476) 477from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import ( 478 AsyncJobPartitionRouter, 479) 480from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( 481 ParentStreamConfig, 482) 483from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption 484from airbyte_cdk.sources.declarative.requesters.error_handlers import ( 485 CompositeErrorHandler, 486 DefaultErrorHandler, 487 HttpResponseFilter, 488) 489from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import ( 490 ConstantBackoffStrategy, 491 ExponentialBackoffStrategy, 492 WaitTimeFromHeaderBackoffStrategy, 493 WaitUntilTimeFromHeaderBackoffStrategy, 494) 495from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository 496from airbyte_cdk.sources.declarative.requesters.paginators import ( 497 DefaultPaginator, 498 NoPagination, 499 PaginatorTestReadDecorator, 500) 501from airbyte_cdk.sources.declarative.requesters.paginators.strategies import ( 502 CursorPaginationStrategy, 503 CursorStopCondition, 504 OffsetIncrement, 505 PageIncrement, 506 StopConditionPaginationStrategyDecorator, 507) 508from airbyte_cdk.sources.declarative.requesters.query_properties import ( 509 PropertiesFromEndpoint, 510 PropertyChunking, 511 QueryProperties, 512) 513from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import ( 514 PropertyLimitType, 515) 516from airbyte_cdk.sources.declarative.requesters.query_properties.property_selector import ( 517 
JsonSchemaPropertySelector, 518) 519from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import ( 520 GroupByKey, 521) 522from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType 523from airbyte_cdk.sources.declarative.requesters.request_options import ( 524 DatetimeBasedRequestOptionsProvider, 525 DefaultRequestOptionsProvider, 526 InterpolatedRequestOptionsProvider, 527 RequestOptionsProvider, 528) 529from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import ( 530 PerPartitionRequestOptionsProvider, 531) 532from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath 533from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester 534from airbyte_cdk.sources.declarative.resolvers import ( 535 ComponentMappingDefinition, 536 ConfigComponentsResolver, 537 HttpComponentsResolver, 538 ParametrizedComponentsResolver, 539 StreamConfig, 540 StreamParametersDefinition, 541) 542from airbyte_cdk.sources.declarative.retrievers import ( 543 AsyncRetriever, 544 LazySimpleRetriever, 545 SimpleRetriever, 546) 547from airbyte_cdk.sources.declarative.retrievers.file_uploader import ( 548 ConnectorBuilderFileUploader, 549 DefaultFileUploader, 550 FileUploader, 551 LocalFileSystemFileWriter, 552 NoopFileWriter, 553) 554from airbyte_cdk.sources.declarative.retrievers.pagination_tracker import PaginationTracker 555from airbyte_cdk.sources.declarative.schema import ( 556 ComplexFieldType, 557 DefaultSchemaLoader, 558 DynamicSchemaLoader, 559 InlineSchemaLoader, 560 JsonFileSchemaLoader, 561 SchemaLoader, 562 SchemaTypeIdentifier, 563 TypesMap, 564) 565from airbyte_cdk.sources.declarative.schema.caching_schema_loader_decorator import ( 566 CachingSchemaLoaderDecorator, 567) 568from airbyte_cdk.sources.declarative.schema.composite_schema_loader import CompositeSchemaLoader 569from airbyte_cdk.sources.declarative.spec import ConfigMigration, 
Spec 570from airbyte_cdk.sources.declarative.stream_slicers import ( 571 StreamSlicer, 572 StreamSlicerTestReadDecorator, 573) 574from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( 575 DeclarativePartitionFactory, 576 StreamSlicerPartitionGenerator, 577) 578from airbyte_cdk.sources.declarative.transformations import ( 579 AddFields, 580 RecordTransformation, 581 RemoveFields, 582) 583from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition 584from airbyte_cdk.sources.declarative.transformations.config_transformations import ( 585 ConfigAddFields, 586 ConfigRemapField, 587 ConfigRemoveFields, 588) 589from airbyte_cdk.sources.declarative.transformations.config_transformations.config_transformation import ( 590 ConfigTransformation, 591) 592from airbyte_cdk.sources.declarative.transformations.dpath_flatten_fields import ( 593 DpathFlattenFields, 594 KeyTransformation, 595) 596from airbyte_cdk.sources.declarative.transformations.flatten_fields import ( 597 FlattenFields, 598) 599from airbyte_cdk.sources.declarative.transformations.keys_replace_transformation import ( 600 KeysReplaceTransformation, 601) 602from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import ( 603 KeysToLowerTransformation, 604) 605from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import ( 606 KeysToSnakeCaseTransformation, 607) 608from airbyte_cdk.sources.declarative.validators import ( 609 DpathValidator, 610 PredicateValidator, 611 ValidateAdheresToSchema, 612) 613from airbyte_cdk.sources.http_logger import format_http_message 614from airbyte_cdk.sources.message import ( 615 InMemoryMessageRepository, 616 LogAppenderMessageRepositoryDecorator, 617 MessageRepository, 618 NoopMessageRepository, 619) 620from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository 621from airbyte_cdk.sources.streams.call_rate import ( 622 APIBudget, 623 
FixedWindowCallRatePolicy, 624 HttpAPIBudget, 625 HttpRequestRegexMatcher, 626 MovingWindowCallRatePolicy, 627 Rate, 628 UnlimitedCallRatePolicy, 629) 630from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream 631from airbyte_cdk.sources.streams.concurrent.clamping import ( 632 ClampingEndProvider, 633 ClampingStrategy, 634 DayClampingStrategy, 635 MonthClampingStrategy, 636 NoClamping, 637 WeekClampingStrategy, 638 Weekday, 639) 640from airbyte_cdk.sources.streams.concurrent.cursor import ( 641 ConcurrentCursor, 642 Cursor, 643 CursorField, 644 FinalStateCursor, 645) 646from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream 647from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream 648from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import ( 649 StreamSlicer as ConcurrentStreamSlicer, 650) 651from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( 652 CustomFormatConcurrentStreamStateConverter, 653 DateTimeStreamStateConverter, 654) 655from airbyte_cdk.sources.streams.concurrent.state_converters.incrementing_count_stream_state_converter import ( 656 IncrementingCountStreamStateConverter, 657) 658from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction 659from airbyte_cdk.sources.types import Config 660from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer 661 662ComponentDefinition = Mapping[str, Any] 663 664SCHEMA_TRANSFORMER_TYPE_MAPPING = { 665 SchemaNormalizationModel.None_: TransformConfig.NoTransform, 666 SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization, 667} 668_NO_STREAM_SLICING = SinglePartitionRouter(parameters={}) 669 670# Ideally this should use the value defined in ConcurrentDeclarativeSource, but 671# this would be a circular import 672MAX_SLICES = 5 673 674LOGGER = 
logging.getLogger(f"airbyte.model_to_component_factory") 675 676 677class ModelToComponentFactory: 678 EPOCH_DATETIME_FORMAT = "%s" 679 680 def __init__( 681 self, 682 limit_pages_fetched_per_slice: Optional[int] = None, 683 limit_slices_fetched: Optional[int] = None, 684 emit_connector_builder_messages: bool = False, 685 disable_retries: bool = False, 686 disable_cache: bool = False, 687 message_repository: Optional[MessageRepository] = None, 688 connector_state_manager: Optional[ConnectorStateManager] = None, 689 max_concurrent_async_job_count: Optional[int] = None, 690 configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, 691 api_budget: Optional[APIBudget] = None, 692 ): 693 self._init_mappings() 694 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 695 self._limit_slices_fetched = limit_slices_fetched 696 self._emit_connector_builder_messages = emit_connector_builder_messages 697 self._disable_retries = disable_retries 698 self._disable_cache = disable_cache 699 self._message_repository = message_repository or InMemoryMessageRepository( 700 self._evaluate_log_level(emit_connector_builder_messages) 701 ) 702 self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream( 703 configured_catalog 704 ) 705 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 706 self._api_budget: Optional[Union[APIBudget]] = api_budget 707 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 708 # placeholder for deprecation warnings 709 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 710 711 def _init_mappings(self) -> None: 712 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 713 AddedFieldDefinitionModel: self.create_added_field_definition, 714 AddFieldsModel: self.create_add_fields, 715 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 716 BasicHttpAuthenticatorModel: 
self.create_basic_http_authenticator, 717 BearerAuthenticatorModel: self.create_bearer_authenticator, 718 CheckStreamModel: self.create_check_stream, 719 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 720 CheckDynamicStreamModel: self.create_check_dynamic_stream, 721 CompositeErrorHandlerModel: self.create_composite_error_handler, 722 ConcurrencyLevelModel: self.create_concurrency_level, 723 ConfigMigrationModel: self.create_config_migration, 724 ConfigAddFieldsModel: self.create_config_add_fields, 725 ConfigRemapFieldModel: self.create_config_remap_field, 726 ConfigRemoveFieldsModel: self.create_config_remove_fields, 727 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 728 CsvDecoderModel: self.create_csv_decoder, 729 CursorPaginationModel: self.create_cursor_pagination, 730 CustomAuthenticatorModel: self.create_custom_component, 731 CustomBackoffStrategyModel: self.create_custom_component, 732 CustomDecoderModel: self.create_custom_component, 733 CustomErrorHandlerModel: self.create_custom_component, 734 CustomRecordExtractorModel: self.create_custom_component, 735 CustomRecordFilterModel: self.create_custom_component, 736 CustomRequesterModel: self.create_custom_component, 737 CustomRetrieverModel: self.create_custom_component, 738 CustomSchemaLoader: self.create_custom_component, 739 CustomSchemaNormalizationModel: self.create_custom_component, 740 CustomStateMigration: self.create_custom_component, 741 CustomPaginationStrategyModel: self.create_custom_component, 742 CustomPartitionRouterModel: self.create_custom_component, 743 CustomTransformationModel: self.create_custom_component, 744 CustomValidationStrategyModel: self.create_custom_component, 745 CustomConfigTransformationModel: self.create_custom_component, 746 DeclarativeStreamModel: self.create_default_stream, 747 DefaultErrorHandlerModel: self.create_default_error_handler, 748 DefaultPaginatorModel: self.create_default_paginator, 749 DpathExtractorModel: 
self.create_dpath_extractor, 750 DpathValidatorModel: self.create_dpath_validator, 751 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 752 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 753 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 754 GroupByKeyMergeStrategyModel: self.create_group_by_key, 755 HttpRequesterModel: self.create_http_requester, 756 HttpResponseFilterModel: self.create_http_response_filter, 757 InlineSchemaLoaderModel: self.create_inline_schema_loader, 758 JsonDecoderModel: self.create_json_decoder, 759 JsonlDecoderModel: self.create_jsonl_decoder, 760 JsonSchemaPropertySelectorModel: self.create_json_schema_property_selector, 761 GzipDecoderModel: self.create_gzip_decoder, 762 KeysToLowerModel: self.create_keys_to_lower_transformation, 763 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 764 KeysReplaceModel: self.create_keys_replace_transformation, 765 FlattenFieldsModel: self.create_flatten_fields, 766 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 767 IterableDecoderModel: self.create_iterable_decoder, 768 XmlDecoderModel: self.create_xml_decoder, 769 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 770 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 771 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 772 TypesMapModel: self.create_types_map, 773 ComplexFieldTypeModel: self.create_complex_field_type, 774 JwtAuthenticatorModel: self.create_jwt_authenticator, 775 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 776 ListPartitionRouterModel: self.create_list_partition_router, 777 MinMaxDatetimeModel: self.create_min_max_datetime, 778 NoAuthModel: self.create_no_auth, 779 NoPaginationModel: self.create_no_pagination, 780 OAuthAuthenticatorModel: self.create_oauth_authenticator, 781 OffsetIncrementModel: self.create_offset_increment, 782 PageIncrementModel: 
self.create_page_increment, 783 ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper, 784 PredicateValidatorModel: self.create_predicate_validator, 785 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 786 PropertyChunkingModel: self.create_property_chunking, 787 QueryPropertiesModel: self.create_query_properties, 788 RecordFilterModel: self.create_record_filter, 789 RecordSelectorModel: self.create_record_selector, 790 RemoveFieldsModel: self.create_remove_fields, 791 RequestPathModel: self.create_request_path, 792 RequestOptionModel: self.create_request_option, 793 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 794 SelectiveAuthenticatorModel: self.create_selective_authenticator, 795 SimpleRetrieverModel: self.create_simple_retriever, 796 StateDelegatingStreamModel: self.create_state_delegating_stream, 797 SpecModel: self.create_spec, 798 SubstreamPartitionRouterModel: self.create_substream_partition_router, 799 ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema, 800 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 801 WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, 802 AsyncRetrieverModel: self.create_async_retriever, 803 HttpComponentsResolverModel: self.create_http_components_resolver, 804 ConfigComponentsResolverModel: self.create_config_components_resolver, 805 ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver, 806 StreamConfigModel: self.create_stream_config, 807 ComponentMappingDefinitionModel: self.create_components_mapping_definition, 808 ZipfileDecoderModel: self.create_zipfile_decoder, 809 HTTPAPIBudgetModel: self.create_http_api_budget, 810 FileUploaderModel: self.create_file_uploader, 811 FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, 812 MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, 813 UnlimitedCallRatePolicyModel: 
            self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}

    @staticmethod
    def _create_stream_name_to_configured_stream(
        configured_catalog: Optional[ConfiguredAirbyteCatalog],
    ) -> Mapping[str, ConfiguredAirbyteStream]:
        """Indexes the configured catalog's streams by stream name; an absent catalog yields an empty mapping."""
        return (
            {stream.stream.name: stream for stream in configured_catalog.streams}
            if configured_catalog
            else {}
        )

    def create_component(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        config: Config,
        **kwargs: Any,
    ) -> Any:
        """
        Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and
        subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then
        creating declarative components from that model.

        :param model_type: The type of declarative component that is being initialized
        :param component_definition: The mapping that represents a declarative component
        :param config: The connector config that is provided by the customer
        :return: The declarative component to be used at runtime
        """

        # The declared "type" in the manifest must match the model class we were asked to build.
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        declarative_component_model = model_type.parse_obj(component_definition)

        if not isinstance(declarative_component_model, model_type):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}"
            )

        return self._create_component_from_model(
            model=declarative_component_model, config=config, **kwargs
        )

    def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any:
        # Dispatches to the constructor registered for this model class in PYDANTIC_MODEL_TO_CONSTRUCTOR.
        if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR:
            raise ValueError(
                f"{model.__class__} with attributes {model} is not a valid component type"
            )
        component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__)
        if not component_constructor:
            raise ValueError(f"Could not find constructor for {model.__class__}")

        # collect deprecation warnings for supported models.
        if isinstance(model, BaseModelWithDeprecations):
            self._collect_model_deprecations(model)

        return component_constructor(model=model, config=config, **kwargs)

    def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]:
        """
        Returns the deprecation warnings that were collected during the creation of components.
        """
        return self._collected_deprecation_logs

    def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None:
        """
        Collects deprecation logs from the given model and appends any new logs to the internal collection.

        This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided.

        Args:
            model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs.
        """
        if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None:
            for log in model._deprecation_logs:
                # avoid duplicates for deprecation logs observed.
                if log not in self._collected_deprecation_logs:
                    self._collected_deprecation_logs.append(log)

    def create_config_migration(
        self, model: ConfigMigrationModel, config: Config
    ) -> ConfigMigration:
        """Builds a ConfigMigration, instantiating each nested config transformation component."""
        transformations: List[ConfigTransformation] = [
            self._create_component_from_model(transformation, config)
            for transformation in model.transformations
        ]

        return ConfigMigration(
            description=model.description,
            transformations=transformations,
        )

    def create_config_add_fields(
        self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigAddFields:
        """Builds a ConfigAddFields transformation; an unset condition defaults to "" (always apply)."""
        fields = [self._create_component_from_model(field, config) for field in model.fields]
        return ConfigAddFields(
            fields=fields,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remove_fields(
        model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigRemoveFields:
        """Builds a ConfigRemoveFields transformation; an unset condition defaults to "" (always apply)."""
        return ConfigRemoveFields(
            field_pointers=model.field_pointers,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remap_field(
        model: ConfigRemapFieldModel, config: Config, **kwargs: Any
    ) -> ConfigRemapField:
        """Builds a ConfigRemapField transformation that remaps the value at field_path using the model's map."""
        mapping = cast(Mapping[str, Any], model.map)
        return ConfigRemapField(
            map=mapping,
            field_path=model.field_path,
            config=config,
        )

    def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator:
        # The validation strategy is itself a declarative component (e.g. ValidateAdheresToSchema).
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return DpathValidator(
            field_path=model.field_path,
            strategy=strategy,
        )

    def create_predicate_validator(
        self, model: PredicateValidatorModel, config: Config
    ) -> PredicateValidator:
        # The validation strategy is itself a declarative component (e.g. ValidateAdheresToSchema).
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return PredicateValidator(
            value=model.value,
            strategy=strategy,
        )

    @staticmethod
    def create_validate_adheres_to_schema(
        model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any
    ) -> ValidateAdheresToSchema:
        """Builds a ValidateAdheresToSchema strategy from the model's base schema."""
        base_schema = cast(Mapping[str, Any], model.base_schema)
        return ValidateAdheresToSchema(
            schema=base_schema,
        )

    @staticmethod
    def create_added_field_definition(
        model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
    ) -> AddedFieldDefinition:
        # The value may contain interpolation placeholders that get evaluated at runtime.
        interpolated_value = InterpolatedString.create(
            model.value, parameters=model.parameters or {}
        )
        return AddedFieldDefinition(
            path=model.path,
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            parameters=model.parameters or {},
        )

    def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields:
        added_field_definitions = [
            self._create_component_from_model(
                model=added_field_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    added_field_definition_model.value_type
                ),
                config=config,
            )
            for added_field_definition_model in model.fields
        ]
        return AddFields(
            fields=added_field_definitions,
            condition=model.condition or "",
            parameters=model.parameters or {},
        )

    def create_keys_to_lower_transformation(
        self, model: KeysToLowerModel, config: Config, **kwargs: Any
    ) -> KeysToLowerTransformation:
        # Stateless transformation; the model carries no configuration.
        return KeysToLowerTransformation()

    def create_keys_to_snake_transformation(
        self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any
    ) -> KeysToSnakeCaseTransformation:
        # Stateless transformation; the model carries no configuration.
        return KeysToSnakeCaseTransformation()

    def create_keys_replace_transformation(
        self, model: KeysReplaceModel, config: Config, **kwargs: Any
    ) -> KeysReplaceTransformation:
        return KeysReplaceTransformation(
            old=model.old, new=model.new, parameters=model.parameters or {}
        )

    def create_flatten_fields(
        self, model: FlattenFieldsModel, config: Config, **kwargs: Any
    ) -> FlattenFields:
        # flatten_lists defaults to True when not set in the manifest.
        return FlattenFields(
            flatten_lists=model.flatten_lists if model.flatten_lists is not None else True
        )

    def create_dpath_flatten_fields(
        self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
    ) -> DpathFlattenFields:
        model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path]
        # Optional prefix/suffix renaming applied to the keys lifted out of the flattened path.
        key_transformation = (
            KeyTransformation(
                config=config,
                prefix=model.key_transformation.prefix,
                suffix=model.key_transformation.suffix,
                parameters=model.parameters or {},
            )
            if model.key_transformation is not None
            else None
        )
        return DpathFlattenFields(
            config=config,
            field_path=model_field_path,
            delete_origin_value=model.delete_origin_value
            if model.delete_origin_value is not None
            else False,
            replace_record=model.replace_record if model.replace_record is not None else False,
            key_transformation=key_transformation,
            parameters=model.parameters or {},
        )

    @staticmethod
    def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
        """Maps a declarative ValueType onto the Python type used to cast added-field values; None when unset."""
        if not value_type:
            return None
        names_to_types = {
            ValueType.string: str,
            ValueType.number: float,
            ValueType.integer: int,
            ValueType.boolean: bool,
        }
        return names_to_types[value_type]

    def create_api_key_authenticator(
        self,
        model: ApiKeyAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> ApiKeyAuthenticator:
        # `header` is the deprecated way of configuring the injection target; exactly one of
        # `inject_into` / `header` must be provided.
        if model.inject_into is None and model.header is None:
            raise ValueError(
                "Expected either inject_into or header to be set for ApiKeyAuthenticator"
            )

        if model.inject_into is not None and model.header is not None:
            raise ValueError(
                "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option"
            )

        # When an external token provider is supplied, the model's api_token must be left empty
        # so precedence between the two sources is explicit.
        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )

        # Fall back to injecting into a header named by the deprecated `header` option.
        request_option = (
            self._create_component_from_model(
                model.inject_into, config, parameters=model.parameters or {}
            )
            if model.inject_into
            else RequestOption(
                inject_into=RequestOptionType.header,
                field_name=model.header or "",
                parameters=model.parameters or {},
            )
        )

        return ApiKeyAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            request_option=request_option,
            config=config,
            parameters=model.parameters or {},
        )

    def create_legacy_to_per_partition_state_migration(
        self,
        model: LegacyToPerPartitionStateMigrationModel,
        config: Mapping[str, Any],
        declarative_stream: DeclarativeStreamModel,
    ) -> LegacyToPerPartitionStateMigration:
        """
        Builds the migration that converts legacy state into the per-partition format.

        Only valid for a declarative stream with a Simple/Async retriever, a substream-style
        partition router with parent stream configs, and an incremental_sync configuration;
        raises ValueError otherwise.
        """
        retriever = declarative_stream.retriever
        if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
            )
        partition_router = retriever.partition_router
        if not isinstance(
            partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
        ):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
            )
        if not hasattr(partition_router, "parent_stream_configs"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
            )

        if not hasattr(declarative_stream, "incremental_sync"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
            )

        return LegacyToPerPartitionStateMigration(
            partition_router,  # type: ignore # was already checked above
            declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
            config,
            declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
        )

    def create_session_token_authenticator(
        self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
    ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
        """
        Builds an authenticator whose token is fetched by logging in through a dedicated requester.

        The obtained session token is then exposed through either a Bearer or an ApiKey
        authenticator, depending on the configured request_authentication type.
        """
        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        login_requester = self._create_component_from_model(
            model=model.login_requester,
            config=config,
            name=f"{name}_login_requester",
            decoder=decoder,
        )
        token_provider = SessionTokenProvider(
            login_requester=login_requester,
            session_token_path=model.session_token_path,
            expiration_duration=parse_duration(model.expiration_duration)
            if model.expiration_duration
            else None,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            decoder=decoder,
        )
        if model.request_authentication.type == "Bearer":
            return ModelToComponentFactory.create_bearer_authenticator(
                BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
                config,
                token_provider=token_provider,
            )
        else:
            # Get the api_token template if specified, default to just the session token
            api_token_template = (
                getattr(model.request_authentication, "api_token", None) or "{{ session_token }}"
            )
            final_token_provider: TokenProvider = InterpolatedSessionTokenProvider(
                config=config,
                api_token=api_token_template,
                session_token_provider=token_provider,
                parameters=model.parameters or {},
            )
            return self.create_api_key_authenticator(
                ApiKeyAuthenticatorModel(
                    type="ApiKeyAuthenticator",
                    api_token="",
                    inject_into=model.request_authentication.inject_into,
                ),  # type: ignore # $parameters and headers default to None
                config=config,
                token_provider=final_token_provider,
            )

    @staticmethod
    def create_basic_http_authenticator(
        model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
    ) -> BasicHttpAuthenticator:
        return BasicHttpAuthenticator(
            password=model.password or "",
            username=model.username,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_bearer_authenticator(
        model: BearerAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> BearerAuthenticator:
        # When an external token provider is supplied (e.g. session-token flow), the model's
        # api_token must be left empty so precedence between the two sources is explicit.
        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )
        return BearerAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_dynamic_stream_check_config(
        model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any
    ) -> DynamicStreamCheckConfig:
        return DynamicStreamCheckConfig(
            dynamic_stream_name=model.dynamic_stream_name,
            stream_count=model.stream_count or 0,
        )

    def create_check_stream(
        self, model: CheckStreamModel, config: Config, **kwargs: Any
    ) -> CheckStream:
        """Builds a CheckStream; at least one of stream_names / dynamic_streams_check_configs must be set."""
        if model.dynamic_streams_check_configs is None and model.stream_names is None:
            raise ValueError(
                "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
            )

        dynamic_streams_check_configs = (
            [
                self._create_component_from_model(model=dynamic_stream_check_config, config=config)
                for dynamic_stream_check_config in model.dynamic_streams_check_configs
            ]
            if model.dynamic_streams_check_configs
            else []
        )

        return CheckStream(
            stream_names=model.stream_names or [],
            dynamic_streams_check_configs=dynamic_streams_check_configs,
            parameters={},
        )

    @staticmethod
    def create_check_dynamic_stream(
        model: CheckDynamicStreamModel, config: Config, **kwargs: Any
    ) -> CheckDynamicStream:
        assert model.use_check_availability is not None  # for mypy

        use_check_availability = model.use_check_availability

        return CheckDynamicStream(
            stream_count=model.stream_count,
            use_check_availability=use_check_availability,
            parameters={},
        )

    def create_composite_error_handler(
        self, model: CompositeErrorHandlerModel, config: Config,
        **kwargs: Any
    ) -> CompositeErrorHandler:
        """Builds a CompositeErrorHandler delegating to each configured child error handler in order."""
        error_handlers = [
            self._create_component_from_model(model=error_handler_model, config=config)
            for error_handler_model in model.error_handlers
        ]
        return CompositeErrorHandler(
            error_handlers=error_handlers, parameters=model.parameters or {}
        )

    @staticmethod
    def create_concurrency_level(
        model: ConcurrencyLevelModel, config: Config, **kwargs: Any
    ) -> ConcurrencyLevel:
        return ConcurrencyLevel(
            default_concurrency=model.default_concurrency,
            max_concurrency=model.max_concurrency,
            config=config,
            parameters={},
        )

    @staticmethod
    def apply_stream_state_migrations(
        stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any]
    ) -> MutableMapping[str, Any]:
        """Applies each applicable state migration in order and returns the (possibly migrated) stream state."""
        if stream_state_migrations:
            for state_migration in stream_state_migrations:
                if state_migration.should_migrate(stream_state):
                    # The state variable is expected to be mutable but the migrate method returns an immutable mapping.
                    stream_state = dict(state_migration.migrate(stream_state))
        return stream_state

    def create_concurrent_cursor_from_datetime_based_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: MutableMapping[str, Any],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        runtime_lookback_window: Optional[datetime.timedelta] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        # The declared "type" in the definition must match the model class we were asked to build.
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
        # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
        # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        model_parameters = datetime_based_cursor_model.parameters or {}

        # A cursor field declared in the configured catalog (when allowed) takes precedence over the manifest one.
        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                datetime_based_cursor_model.cursor_field,
                parameters=model_parameters,
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        # The partition field names are the keys under which slice boundaries are exposed to requests.
        interpolated_partition_field_start = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_start or "start_time",
            parameters=model_parameters,
        )
        interpolated_partition_field_end = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_end or "end_time",
            parameters=model_parameters,
        )

        slice_boundary_fields = (
            interpolated_partition_field_start.eval(config=config),
            interpolated_partition_field_end.eval(config=config),
        )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        # lookback_window stays None unless the manifest provides one that evaluates to a non-empty duration.
        lookback_window = None
        interpolated_lookback_window = (
            InterpolatedString.create(
                datetime_based_cursor_model.lookback_window,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.lookback_window
            else None
        )
        if interpolated_lookback_window:
            evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
            if evaluated_lookback_window:
                lookback_window = parse_duration(evaluated_lookback_window)

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Adjusts the stream state by applying the runtime lookback window.
        # This is used to ensure correct state handling in case of failed partitions.
        stream_state_value = stream_state.get(cursor_field.cursor_field_key)
        if runtime_lookback_window and stream_state_value:
            new_stream_state = (
                connector_state_converter.parse_timestamp(stream_state_value)
                - runtime_lookback_window
            )
            stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
                new_stream_state
            )

        start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
        if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
            start_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.start_datetime, config=config
            )
        else:
            start_date_runtime_value = datetime_based_cursor_model.start_datetime

        end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]]
        if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel):
            end_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.end_datetime, config=config
            )
        else:
            end_date_runtime_value = datetime_based_cursor_model.end_datetime

        interpolated_start_date = MinMaxDatetime.create(
            interpolated_string_or_min_max_datetime=start_date_runtime_value,
            parameters=datetime_based_cursor_model.parameters,
        )
        interpolated_end_date = (
            None
            if not end_date_runtime_value
            else MinMaxDatetime.create(
                end_date_runtime_value, datetime_based_cursor_model.parameters
            )
        )

        # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
        if not interpolated_start_date.datetime_format:
            interpolated_start_date.datetime_format = datetime_format
        if interpolated_end_date and not interpolated_end_date.datetime_format:
            interpolated_end_date.datetime_format = datetime_format

        start_date = interpolated_start_date.get_datetime(config=config)
        # Without an explicit end date, the converter supplies the end provider (typically "now").
        end_date_provider = (
            partial(interpolated_end_date.get_datetime, config)
            if interpolated_end_date
            else connector_state_converter.get_end_provider()
        )

        # step and cursor_granularity must be defined together (or not at all).
        if (
            datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity
        ) or (
            not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity
        ):
            raise ValueError(
                f"If step is defined, cursor_granularity should be as well and vice-versa. "
                f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`"
            )

        # When step is not defined, default to a step size from the starting date to the present moment
        step_length = datetime.timedelta.max
        interpolated_step = (
            InterpolatedString.create(
                datetime_based_cursor_model.step,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.step
            else None
        )
        if interpolated_step:
            evaluated_step = interpolated_step.eval(config)
            if evaluated_step:
                step_length = parse_duration(evaluated_step)

        clamping_strategy: ClampingStrategy = NoClamping()
        if datetime_based_cursor_model.clamping:
            # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
            # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime
            # object which we want to keep agnostic of being low-code
            target = InterpolatedString(
                string=datetime_based_cursor_model.clamping.target,
                parameters=model_parameters,
            )
            evaluated_target = target.eval(config=config)
            match evaluated_target:
                case "DAY":
                    clamping_strategy = DayClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        DayClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(seconds=1),
                    )
                case "WEEK":
                    if (
                        not datetime_based_cursor_model.clamping.target_details
                        or "weekday" not in datetime_based_cursor_model.clamping.target_details
                    ):
                        raise ValueError(
                            "Given WEEK clamping, weekday needs to be provided as target_details"
                        )
                    weekday = self._assemble_weekday(
                        datetime_based_cursor_model.clamping.target_details["weekday"]
                    )
                    clamping_strategy = WeekClampingStrategy(weekday)
                    end_date_provider = ClampingEndProvider(
                        WeekClampingStrategy(weekday, is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case "MONTH":
                    clamping_strategy = MonthClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        MonthClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case _:
                    raise ValueError(
                        f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
                    )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=slice_boundary_fields,
            start=start_date,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            end_provider=end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            lookback_window=lookback_window,
            slice_range=step_length,
            cursor_granularity=cursor_granularity,
            clamping_strategy=clamping_strategy,
        )

    def create_concurrent_cursor_from_incrementing_count_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: MutableMapping[str, Any],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        # The declared "type" in the definition must match the model class we were asked to build.
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        incrementing_count_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(incrementing_count_cursor_model,
IncrementingCountCursorModel): 1570 raise ValueError( 1571 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1572 ) 1573 1574 start_value: Union[int, str, None] = incrementing_count_cursor_model.start_value 1575 # Pydantic Union type coercion can convert int 0 to string '0' depending on Union order. 1576 # We need to handle both int and str representations of numeric values. 1577 # Evaluate the InterpolatedString and convert to int for the ConcurrentCursor. 1578 if start_value is not None: 1579 interpolated_start_value = InterpolatedString.create( 1580 str(start_value), # Ensure we pass a string to InterpolatedString.create 1581 parameters=incrementing_count_cursor_model.parameters or {}, 1582 ) 1583 evaluated_start_value: int = int(interpolated_start_value.eval(config=config)) 1584 else: 1585 evaluated_start_value = 0 1586 1587 cursor_field = self._get_catalog_defined_cursor_field( 1588 stream_name=stream_name, 1589 allow_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field 1590 or False, 1591 ) 1592 1593 if not cursor_field: 1594 interpolated_cursor_field = InterpolatedString.create( 1595 incrementing_count_cursor_model.cursor_field, 1596 parameters=incrementing_count_cursor_model.parameters or {}, 1597 ) 1598 cursor_field = CursorField( 1599 cursor_field_key=interpolated_cursor_field.eval(config=config), 1600 supports_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field 1601 or False, 1602 ) 1603 1604 connector_state_converter = IncrementingCountStreamStateConverter( 1605 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1606 ) 1607 1608 return ConcurrentCursor( 1609 stream_name=stream_name, 1610 stream_namespace=stream_namespace, 1611 stream_state=stream_state, 1612 message_repository=message_repository or self._message_repository, 1613 
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=None,
            start=evaluated_start_value,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            end_provider=connector_state_converter.get_end_provider(),  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        )

    def _assemble_weekday(self, weekday: str) -> Weekday:
        """Map an upper-case weekday name from the manifest to the ``Weekday`` enum.

        :raises ValueError: If ``weekday`` is not one of the seven expected names.
        """
        match weekday:
            case "MONDAY":
                return Weekday.MONDAY
            case "TUESDAY":
                return Weekday.TUESDAY
            case "WEDNESDAY":
                return Weekday.WEDNESDAY
            case "THURSDAY":
                return Weekday.THURSDAY
            case "FRIDAY":
                return Weekday.FRIDAY
            case "SATURDAY":
                return Weekday.SATURDAY
            case "SUNDAY":
                return Weekday.SUNDAY
            case _:
                raise ValueError(f"Unknown weekday {weekday}")

    def create_concurrent_cursor_from_perpartition_cursor(
        self,
        state_manager: ConnectorStateManager,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        config: Config,
        stream_state: MutableMapping[str, Any],
        partition_router: PartitionRouter,
        attempt_to_create_cursor_if_not_provided: bool = False,
        **kwargs: Any,
    ) -> ConcurrentPerPartitionCursor:
        """Build a ConcurrentPerPartitionCursor for a datetime-based cursor combined with a partition router.

        Per-partition cursors are created lazily via a ConcurrentCursorFactory that delegates
        to ``create_concurrent_cursor_from_datetime_based_cursor`` with a no-op message
        repository (so that per-partition cursor creation does not emit duplicate messages).

        :raises ValueError: If the component type or parsed model is not a DatetimeBasedCursor
        """
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
        # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
        # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
        # NOTE(review): this mutates the caller-provided `component_definition` in place — confirm callers don't reuse it.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        # Fall back to the manifest-defined cursor field when the catalog doesn't provide one.
        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                datetime_based_cursor_model.cursor_field,
                # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
                # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
                # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
                # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
                parameters=datetime_based_cursor_model.parameters or {},
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Create the cursor factory
        cursor_factory = ConcurrentCursorFactory(
            partial(
                self.create_concurrent_cursor_from_datetime_based_cursor,
                state_manager=state_manager,
                model_type=model_type,
                component_definition=component_definition,
                stream_name=stream_name,
                stream_namespace=stream_namespace,
                config=config,
                message_repository=NoopMessageRepository(),
            )
        )

        # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
        use_global_cursor = isinstance(
            partition_router, GroupingPartitionRouter
        ) or component_definition.get("global_substream_cursor", False)

        # Return the concurrent cursor and state converter
        return ConcurrentPerPartitionCursor(
            cursor_factory=cursor_factory,
            partition_router=partition_router,
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=self._message_repository,  # type: ignore
            connector_state_manager=state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            use_global_cursor=use_global_cursor,
            attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided,
        )

    @staticmethod
    def create_constant_backoff_strategy(
        model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
    ) -> ConstantBackoffStrategy:
        """Build a ConstantBackoffStrategy from its manifest model."""
        return ConstantBackoffStrategy(
            backoff_time_in_seconds=model.backoff_time_in_seconds,
            config=config,
            parameters=model.parameters or {},
        )

    def create_cursor_pagination(
        self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any
    ) -> CursorPaginationStrategy:
        """Build a CursorPaginationStrategy, wrapping the decoder for pagination use.

        :raises ValueError: If the (inner) decoder type is not supported for pagination
        """
        # Unwrap an already-decorated decoder so we can validate the inner decoder type.
        if isinstance(decoder, PaginationDecoderDecorator):
            inner_decoder = decoder.decoder
        else:
            inner_decoder = decoder
            decoder = PaginationDecoderDecorator(decoder=decoder)

        if self._is_supported_decoder_for_pagination(inner_decoder):
            decoder_to_use = decoder
        else:
            raise ValueError(
                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
            )

        # Pydantic v1 Union type coercion can convert int to string depending on Union order.
        # If page_size is a string that represents an integer (not an interpolation), convert it back.
        page_size = model.page_size
        if isinstance(page_size, str) and page_size.isdigit():
            page_size = int(page_size)

        return CursorPaginationStrategy(
            cursor_value=model.cursor_value,
            decoder=decoder_to_use,
            page_size=page_size,
            stop_condition=model.stop_condition,
            config=config,
            parameters=model.parameters or {},
        )

    def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any:
        """
        Generically creates a custom component based on the model type and a class_name reference to the custom Python class being
        instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor

        :param model: The Pydantic model of the custom component being created
        :param config: The custom defined connector config
        :return: The declarative component built from the Pydantic model to be used at runtime
        """
        custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name)
        component_fields = get_type_hints(custom_component_class)
        model_args = model.dict()
        model_args["config"] = config

        # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions
        # we defer to these arguments over the component's definition
        for key, arg in kwargs.items():
            model_args[key] = arg

        # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not
        # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to
        # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components
        for model_field, model_value in model_args.items():
            # If a custom component field doesn't have a type set, we try to use the type hints to infer the type
            if (
                isinstance(model_value, dict)
                and "type" not in model_value
                and model_field in component_fields
            ):
                derived_type = self._derive_component_type_from_type_hints(
                    component_fields.get(model_field)
                )
                if derived_type:
                    model_value["type"] = derived_type

            if self._is_component(model_value):
                model_args[model_field] = self._create_nested_component(
                    model,
                    model_field,
                    model_value,
                    config,
                    **kwargs,
                )
            elif isinstance(model_value, list):
                # Same treatment for list-valued fields: infer types and build each element.
                vals = []
                for v in model_value:
                    if isinstance(v, dict) and "type" not in v and model_field in component_fields:
                        derived_type = self._derive_component_type_from_type_hints(
                            component_fields.get(model_field)
                        )
                        if derived_type:
                            v["type"] = derived_type
                    if self._is_component(v):
                        vals.append(
                            self._create_nested_component(
                                model,
                                model_field,
                                v,
                                config,
                                **kwargs,
                            )
                        )
                    else:
                        vals.append(v)
                model_args[model_field] = vals

        # Only pass arguments that the custom class actually declares.
        kwargs = {
            class_field: model_args[class_field]
            for class_field in component_fields.keys()
            if class_field in model_args
        }
        return custom_component_class(**kwargs)

    @staticmethod
    def _get_class_from_fully_qualified_class_name(
        full_qualified_class_name: str,
    ) -> Any:
        """Get a class from its fully qualified name.

        If a custom components module is needed, we assume it is already registered - probably
        as `source_declarative_manifest.components` or `components`.

        Args:
            full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName").

        Returns:
            Any: The class object.

        Raises:
            ValueError: If the class cannot be loaded.
        """
        split = full_qualified_class_name.split(".")
        module_name_full = ".".join(split[:-1])
        class_name = split[-1]

        try:
            module_ref = importlib.import_module(module_name_full)
        except ModuleNotFoundError as e:
            if split[0] == "source_declarative_manifest":
                # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append
                try:
                    import os  # NOTE(review): this import appears unused — candidate for removal

                    module_name_with_source_declarative_manifest = ".".join(split[1:-1])
                    module_ref = importlib.import_module(
                        module_name_with_source_declarative_manifest
                    )
                except ModuleNotFoundError:
                    raise ValueError(f"Could not load module `{module_name_full}`.") from e
            else:
                raise ValueError(f"Could not load module `{module_name_full}`.") from e

        try:
            return getattr(module_ref, class_name)
        except AttributeError as e:
            raise ValueError(
                f"Could not load class `{class_name}` from module `{module_name_full}`.",
            ) from e

    @staticmethod
    def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]:
        """Return the class name of the innermost non-builtin type in a (possibly generic) type hint.

        e.g. Optional[List[MyComponent]] -> "MyComponent"; returns None for builtins or untyped fields.
        """
        interface = field_type
        while True:
            origin = get_origin(interface)
            if origin:
                # Unnest types until we reach the raw type
                # List[T] -> T
                # Optional[List[T]] -> T
                args = get_args(interface)
                interface = args[0]
            else:
                break
        if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface):
            return interface.__name__
        return None

    @staticmethod
    def is_builtin_type(cls: Optional[Type[Any]]) -> bool:
        """Return True if ``cls`` is a Python builtin type (or falsy)."""
        if not cls:
            return False
        return cls.__module__ == "builtins"

    @staticmethod
    def _extract_missing_parameters(error: TypeError) -> List[str]:
        """Parse a TypeError message for missing keyword-only argument names."""
        parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error))
        if parameter_search:
            return re.findall(r"\'(.+?)\'", parameter_search.group(1))
        else:
            return []

    def _create_nested_component(
        self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any
    ) -> Any:
        """Build a declarative subcomponent of a custom component from its raw dict definition.

        :param model: The parent custom component model (used for error messages)
        :param model_field: Name of the field on the parent holding this subcomponent
        :param model_value: Raw dict definition of the subcomponent
        :raises ValueError: If the subcomponent type has no registered model/constructor or required $parameters are missing
        """
        type_name = model_value.get("type", None)
        if not type_name:
            # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent
            return model_value

        model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None)
        if model_type:
            parsed_model = model_type.parse_obj(model_value)
            try:
                # To improve usability of the language, certain fields are shared between components. This can come in the form of
                # a parent component passing some of its fields to a child component or the parent extracting fields from other child
                # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base
                # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created
                # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that
                # are needed by a component and could not be shared.
                model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__)
                constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs
                model_parameters = model_value.get("$parameters", {})
                matching_parameters = {
                    kwarg: model_parameters[kwarg]
                    for kwarg in constructor_kwargs
                    if kwarg in model_parameters
                }
                matching_kwargs = {
                    kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs
                }
                # Explicit kwargs (from the parent) take precedence over $parameters on collisions.
                return self._create_component_from_model(
                    model=parsed_model, config=config, **(matching_parameters | matching_kwargs)
                )
            except TypeError as error:
                missing_parameters = self._extract_missing_parameters(error)
                if missing_parameters:
                    raise ValueError(
                        f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide "
                        + ", ".join(
                            (
                                f"{type_name}.$parameters.{parameter}"
                                for parameter in missing_parameters
                            )
                        )
                    )
                raise TypeError(
                    f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}"
                )
        else:
            raise ValueError(
                f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'"
            )

    @staticmethod
    def _is_component(model_value: Any) -> bool:
        """Return True if the value looks like a component definition (a dict with a "type")."""
        return isinstance(model_value, dict) and model_value.get("type") is not None

    def create_default_stream(
        self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any
    ) -> AbstractStream:
        """Assemble a concurrent DefaultStream from a DeclarativeStream manifest model.

        Builds, in order: state migrations, the partition router, the concurrent cursor,
        the request options provider matching the incremental sync type, transformations,
        the retriever, and the schema loader, then wires them into a DefaultStream.
        """
        primary_key = model.primary_key.__root__ if model.primary_key else None
        self._migrate_state(model, config)

        partition_router = self._build_stream_slicer_from_partition_router(
            model.retriever,
            config,
            stream_name=model.name,
            **kwargs,
        )
        concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
        if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
            cursor_model: DatetimeBasedCursorModel = model.incremental_sync

            end_time_option = (
                self._create_component_from_model(
                    cursor_model.end_time_option, config, parameters=cursor_model.parameters or {}
                )
                if cursor_model.end_time_option
                else None
            )
            start_time_option = (
                self._create_component_from_model(
                    cursor_model.start_time_option, config, parameters=cursor_model.parameters or {}
                )
                if cursor_model.start_time_option
                else None
            )

            datetime_request_options_provider = DatetimeBasedRequestOptionsProvider(
                start_time_option=start_time_option,
                end_time_option=end_time_option,
                partition_field_start=cursor_model.partition_field_start,
                partition_field_end=cursor_model.partition_field_end,
                config=config,
                parameters=model.parameters or {},
            )
            # Per-partition cursors need request options resolved per partition.
            request_options_provider = (
                datetime_request_options_provider
                if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor)
                else PerPartitionRequestOptionsProvider(
                    partition_router, datetime_request_options_provider
                )
            )
        elif model.incremental_sync and isinstance(
            model.incremental_sync, IncrementingCountCursorModel
        ):
            if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor):
                raise ValueError(
                    "PerPartition does not support per partition states because switching to global state is time based"
                )

            cursor_model: IncrementingCountCursorModel = model.incremental_sync  # type: ignore

            start_time_option = (
                self._create_component_from_model(
                    cursor_model.start_value_option,  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
                    config,
                    parameters=cursor_model.parameters or {},
                )
                if cursor_model.start_value_option  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
                else None
            )

            # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but
            # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time
            partition_field_start = "start"

            request_options_provider = DatetimeBasedRequestOptionsProvider(
                start_time_option=start_time_option,
                partition_field_start=partition_field_start,
                config=config,
                parameters=model.parameters or {},
            )
        else:
            request_options_provider = None

        transformations = []
        if model.transformations:
            for transformation_model in model.transformations:
                transformations.append(
                    self._create_component_from_model(model=transformation_model, config=config)
                )
        file_uploader = None
        if model.file_uploader:
            file_uploader = self._create_component_from_model(
                model=model.file_uploader, config=config
            )

        # A FinalStateCursor means no incremental sync: slice by the partition router alone.
        stream_slicer: ConcurrentStreamSlicer = (
            partition_router
            if isinstance(concurrent_cursor, FinalStateCursor)
            else concurrent_cursor
        )

        retriever = self._create_component_from_model(
            model=model.retriever,
            config=config,
            name=model.name,
            primary_key=primary_key,
            request_options_provider=request_options_provider,
            stream_slicer=stream_slicer,
            partition_router=partition_router,
            has_stop_condition_cursor=self._is_stop_condition_on_cursor(model),
            is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model),
            cursor=concurrent_cursor,
            transformations=transformations,
            file_uploader=file_uploader,
            incremental_sync=model.incremental_sync,
        )
        # Async retrievers own their slicing; defer to the retriever's slicer.
        if isinstance(retriever, AsyncRetriever):
            stream_slicer = retriever.stream_slicer

        schema_loader: SchemaLoader
        if model.schema_loader and isinstance(model.schema_loader, list):
            nested_schema_loaders = [
                self._create_component_from_model(model=nested_schema_loader, config=config)
                for nested_schema_loader in model.schema_loader
            ]
            schema_loader = CompositeSchemaLoader(
                schema_loaders=nested_schema_loaders, parameters={}
            )
        elif model.schema_loader:
            schema_loader = self._create_component_from_model(
                model=model.schema_loader,  # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models
                config=config,
            )
        else:
            options = model.parameters or {}
            if "name" not in options:
                options["name"] = model.name
            schema_loader = DefaultSchemaLoader(config=config, parameters=options)
        # Cache schema resolution so the (possibly expensive) load happens at most once.
        schema_loader = CachingSchemaLoaderDecorator(schema_loader)

        stream_name = model.name or ""
        return DefaultStream(
            partition_generator=StreamSlicerPartitionGenerator(
                DeclarativePartitionFactory(
                    stream_name,
                    schema_loader,
                    retriever,
                    self._message_repository,
                ),
                stream_slicer,
                slice_limit=self._limit_slices_fetched,
            ),
            name=stream_name,
            json_schema=schema_loader.get_json_schema,
            primary_key=get_primary_key_from_stream(primary_key),
            cursor_field=(
                concurrent_cursor.cursor_field
                if hasattr(concurrent_cursor, "cursor_field")
                else None
            ),
            logger=logging.getLogger(f"airbyte.{stream_name}"),
            cursor=concurrent_cursor,
            supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader),
        )

    def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None:
        """Apply the stream's configured state migrations and store the result back in the state manager."""
        stream_name = model.name or ""
        stream_state = self._connector_state_manager.get_stream_state(
            stream_name=stream_name, namespace=None
        )
        if model.state_migrations:
            state_transformations = [
                self._create_component_from_model(state_migration, config, declarative_stream=model)
                for state_migration in model.state_migrations
            ]
        else:
            state_transformations = []
        stream_state = self.apply_stream_state_migrations(state_transformations, stream_state)
        self._connector_state_manager.update_state_for_stream(
            stream_name=stream_name, namespace=None, value=stream_state
        )

    def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool:
        """True if the incremental sync is a data feed, i.e. pagination must stop at the cursor value."""
        return bool(
            model.incremental_sync
            and hasattr(model.incremental_sync, "is_data_feed")
            and model.incremental_sync.is_data_feed
        )

    def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool:
        """True if records must be filtered client-side against the cursor (API can't filter server-side)."""
        return bool(
            model.incremental_sync
            and hasattr(model.incremental_sync, "is_client_side_incremental")
            and model.incremental_sync.is_client_side_incremental
        )

    def _build_stream_slicer_from_partition_router(
        self,
        model: Union[
            AsyncRetrieverModel,
            CustomRetrieverModel,
            SimpleRetrieverModel,
        ],
        config: Config,
        stream_name: Optional[str] = None,
        **kwargs: Any,
    ) -> PartitionRouter:
        """Build the PartitionRouter declared on a retriever model.

        Handles three shapes of ``partition_router``: a list (combined into a
        CartesianProductStreamSlicer), a raw dict (from a CustomRetrieverModel, built as a
        nested component), or a parsed model. Falls back to a SinglePartitionRouter when
        none is declared.
        """
        if (
            hasattr(model, "partition_router")
            and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel))
            and model.partition_router
        ):
            stream_slicer_model = model.partition_router
            if isinstance(stream_slicer_model, list):
                return CartesianProductStreamSlicer(
                    [
                        self._create_component_from_model(
                            model=slicer, config=config, stream_name=stream_name or ""
                        )
                        for slicer in stream_slicer_model
                    ],
                    parameters={},
                )
            elif isinstance(stream_slicer_model, dict):
                # partition router comes from CustomRetrieverModel therefore has not been parsed as a model
                params = stream_slicer_model.get("$parameters")
                if not isinstance(params, dict):
                    params = {}
                    stream_slicer_model["$parameters"] = params

                if stream_name is not None:
                    params["stream_name"] = stream_name

                return self._create_nested_component(  # type: ignore[no-any-return] # There is no guarantee that this will return a stream slicer. If not, we expect an AttributeError during the call to `stream_slices`
                    model,
                    "partition_router",
                    stream_slicer_model,
                    config,
                    **kwargs,
                )
            else:
                return self._create_component_from_model(  # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router
                    model=stream_slicer_model, config=config, stream_name=stream_name or ""
                )
        return SinglePartitionRouter(parameters={})

    def _build_concurrent_cursor(
        self,
        model: DeclarativeStreamModel,
        stream_slicer: Optional[PartitionRouter],
        config: Config,
    ) -> Cursor:
        """Build the appropriate concurrent cursor for a stream's incremental sync config.

        Returns a ConcurrentPerPartitionCursor when a real partition router is present,
        a plain ConcurrentCursor for datetime/incrementing-count syncs without partitioning,
        and a FinalStateCursor for full-refresh streams.
        """
        stream_name = model.name or ""
        stream_state = self._connector_state_manager.get_stream_state(stream_name, None)

        if (
            model.incremental_sync
            and stream_slicer
            and not isinstance(stream_slicer, SinglePartitionRouter)
        ):
            if isinstance(model.incremental_sync, IncrementingCountCursorModel):
                # We don't currently support usage of partition routing and IncrementingCountCursor at the
                # same time because we didn't solve for design questions like what the lookback window would
                # be as well as global cursor fall backs. We have not seen customers that have needed both
                # at the same time yet and are currently punting on this until we need to solve it.
                raise ValueError(
                    f"The low-code framework does not currently support usage of a PartitionRouter and an IncrementingCountCursor at the same time. Please specify only one of these options for stream {stream_name}."
                )
            return self.create_concurrent_cursor_from_perpartition_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
                state_manager=self._connector_state_manager,
                model_type=DatetimeBasedCursorModel,
                component_definition=model.incremental_sync.__dict__,
                stream_name=stream_name,
                stream_state=stream_state,
                stream_namespace=None,
                config=config or {},
                partition_router=stream_slicer,
                attempt_to_create_cursor_if_not_provided=True,  # FIXME can we remove that now?
            )
        elif model.incremental_sync:
            # NOTE(review): exact-type checks (`type(...) ==`) rather than isinstance — presumably
            # intentional to exclude subclasses; confirm before changing.
            if type(model.incremental_sync) == IncrementingCountCursorModel:
                return self.create_concurrent_cursor_from_incrementing_count_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
                    model_type=IncrementingCountCursorModel,
                    component_definition=model.incremental_sync.__dict__,
                    stream_name=stream_name,
                    stream_namespace=None,
                    stream_state=stream_state,
                    config=config or {},
                )
            elif type(model.incremental_sync) == DatetimeBasedCursorModel:
                return self.create_concurrent_cursor_from_datetime_based_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
                    model_type=type(model.incremental_sync),
                    component_definition=model.incremental_sync.__dict__,
                    stream_name=stream_name,
                    stream_namespace=None,
                    stream_state=stream_state,
                    config=config or {},
                    attempt_to_create_cursor_if_not_provided=True,
                )
            else:
                raise ValueError(
                    f"Incremental sync of type {type(model.incremental_sync)} is not supported"
                )
        return FinalStateCursor(stream_name, None, self._message_repository)

    def create_default_error_handler(
        self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any
    ) -> DefaultErrorHandler:
        """Build a DefaultErrorHandler with its backoff strategies and response filters.

        A default HttpResponseFilter is always appended after any configured filters as a
        catch-all.
        """
        backoff_strategies = []
        if model.backoff_strategies:
            for backoff_strategy_model in model.backoff_strategies:
                backoff_strategies.append(
                    self._create_component_from_model(model=backoff_strategy_model, config=config)
                )

        response_filters = []
        if model.response_filters:
            for response_filter_model in model.response_filters:
                response_filters.append(
                    self._create_component_from_model(model=response_filter_model, config=config)
                )
        response_filters.append(
            HttpResponseFilter(config=config, parameters=model.parameters or {})
        )

        return DefaultErrorHandler(
            backoff_strategies=backoff_strategies,
            max_retries=model.max_retries,
            response_filters=response_filters,
            config=config,
            parameters=model.parameters or {},
        )

    def create_default_paginator(
        self,
        model: DefaultPaginatorModel,
        config: Config,
        *,
        url_base: str,
        extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
        decoder: Optional[Decoder] = None,
        cursor_used_for_stop_condition: Optional[Cursor] = None,
    ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]:
        """Build a DefaultPaginator (optionally wrapped for test reads).

        :param url_base: Base URL the paginator resolves page tokens against
        :param cursor_used_for_stop_condition: When set, wraps the pagination strategy so
            pagination stops once records are older than the cursor (data-feed streams)
        :raises ValueError: If the decoder type is not supported for pagination
        """
        if decoder:
            if self._is_supported_decoder_for_pagination(decoder):
                decoder_to_use = PaginationDecoderDecorator(decoder=decoder)
            else:
                raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder)))
        else:
            decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={}))
        page_size_option = (
            self._create_component_from_model(model=model.page_size_option, config=config)
            if model.page_size_option
            else None
        )
        page_token_option = (
            self._create_component_from_model(model=model.page_token_option, config=config)
            if model.page_token_option
            else None
        )
        pagination_strategy = self._create_component_from_model(
            model=model.pagination_strategy,
            config=config,
            decoder=decoder_to_use,
            extractor_model=extractor_model,
        )
        if cursor_used_for_stop_condition:
            pagination_strategy = StopConditionPaginationStrategyDecorator(
                pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition)
            )
        paginator = DefaultPaginator(
            decoder=decoder_to_use,
            page_size_option=page_size_option,
            page_token_option=page_token_option,
            pagination_strategy=pagination_strategy,
            url_base=url_base,
            config=config,
parameters=model.parameters or {}, 2363 ) 2364 if self._limit_pages_fetched_per_slice: 2365 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2366 return paginator 2367 2368 def create_dpath_extractor( 2369 self, 2370 model: DpathExtractorModel, 2371 config: Config, 2372 decoder: Optional[Decoder] = None, 2373 **kwargs: Any, 2374 ) -> DpathExtractor: 2375 if decoder: 2376 decoder_to_use = decoder 2377 else: 2378 decoder_to_use = JsonDecoder(parameters={}) 2379 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2380 return DpathExtractor( 2381 decoder=decoder_to_use, 2382 field_path=model_field_path, 2383 config=config, 2384 parameters=model.parameters or {}, 2385 ) 2386 2387 @staticmethod 2388 def create_response_to_file_extractor( 2389 model: ResponseToFileExtractorModel, 2390 **kwargs: Any, 2391 ) -> ResponseToFileExtractor: 2392 return ResponseToFileExtractor(parameters=model.parameters or {}) 2393 2394 @staticmethod 2395 def create_exponential_backoff_strategy( 2396 model: ExponentialBackoffStrategyModel, config: Config 2397 ) -> ExponentialBackoffStrategy: 2398 return ExponentialBackoffStrategy( 2399 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2400 ) 2401 2402 @staticmethod 2403 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2404 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2405 2406 def create_http_requester( 2407 self, 2408 model: HttpRequesterModel, 2409 config: Config, 2410 decoder: Decoder = JsonDecoder(parameters={}), 2411 query_properties_key: Optional[str] = None, 2412 use_cache: Optional[bool] = None, 2413 *, 2414 name: str, 2415 ) -> HttpRequester: 2416 authenticator = ( 2417 self._create_component_from_model( 2418 model=model.authenticator, 2419 config=config, 2420 url_base=model.url or model.url_base, 2421 name=name, 2422 decoder=decoder, 2423 ) 2424 if model.authenticator 
2425 else None 2426 ) 2427 error_handler = ( 2428 self._create_component_from_model(model=model.error_handler, config=config) 2429 if model.error_handler 2430 else DefaultErrorHandler( 2431 backoff_strategies=[], 2432 response_filters=[], 2433 config=config, 2434 parameters=model.parameters or {}, 2435 ) 2436 ) 2437 2438 api_budget = self._api_budget 2439 2440 request_options_provider = InterpolatedRequestOptionsProvider( 2441 request_body=model.request_body, 2442 request_body_data=model.request_body_data, 2443 request_body_json=model.request_body_json, 2444 request_headers=model.request_headers, 2445 request_parameters=model.request_parameters, # type: ignore # QueryProperties have been removed in `create_simple_retriever` 2446 query_properties_key=query_properties_key, 2447 config=config, 2448 parameters=model.parameters or {}, 2449 ) 2450 2451 assert model.use_cache is not None # for mypy 2452 assert model.http_method is not None # for mypy 2453 2454 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2455 2456 return HttpRequester( 2457 name=name, 2458 url=model.url, 2459 url_base=model.url_base, 2460 path=model.path, 2461 authenticator=authenticator, 2462 error_handler=error_handler, 2463 api_budget=api_budget, 2464 http_method=HttpMethod[model.http_method.value], 2465 request_options_provider=request_options_provider, 2466 config=config, 2467 disable_retries=self._disable_retries, 2468 parameters=model.parameters or {}, 2469 message_repository=self._message_repository, 2470 use_cache=should_use_cache, 2471 decoder=decoder, 2472 stream_response=decoder.is_stream_response() if decoder else False, 2473 ) 2474 2475 @staticmethod 2476 def create_http_response_filter( 2477 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2478 ) -> HttpResponseFilter: 2479 if model.action: 2480 action = ResponseAction(model.action.value) 2481 else: 2482 action = None 2483 2484 failure_type = FailureType(model.failure_type.value) if 
model.failure_type else None 2485 2486 http_codes = ( 2487 set(model.http_codes) if model.http_codes else set() 2488 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2489 2490 return HttpResponseFilter( 2491 action=action, 2492 failure_type=failure_type, 2493 error_message=model.error_message or "", 2494 error_message_contains=model.error_message_contains or "", 2495 http_codes=http_codes, 2496 predicate=model.predicate or "", 2497 config=config, 2498 parameters=model.parameters or {}, 2499 ) 2500 2501 @staticmethod 2502 def create_inline_schema_loader( 2503 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2504 ) -> InlineSchemaLoader: 2505 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2506 2507 def create_complex_field_type( 2508 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2509 ) -> ComplexFieldType: 2510 items = ( 2511 self._create_component_from_model(model=model.items, config=config) 2512 if isinstance(model.items, ComplexFieldTypeModel) 2513 else model.items 2514 ) 2515 2516 return ComplexFieldType(field_type=model.field_type, items=items) 2517 2518 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2519 target_type = ( 2520 self._create_component_from_model(model=model.target_type, config=config) 2521 if isinstance(model.target_type, ComplexFieldTypeModel) 2522 else model.target_type 2523 ) 2524 2525 return TypesMap( 2526 target_type=target_type, 2527 current_type=model.current_type, 2528 condition=model.condition if model.condition is not None else "True", 2529 ) 2530 2531 def create_schema_type_identifier( 2532 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2533 ) -> SchemaTypeIdentifier: 2534 types_mapping = [] 2535 if model.types_mapping: 2536 types_mapping.extend( 2537 [ 2538 self._create_component_from_model(types_map, config=config) 2539 for types_map in model.types_mapping 2540 ] 2541 
) 2542 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2543 [x for x in model.schema_pointer] if model.schema_pointer else [] 2544 ) 2545 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2546 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2547 [x for x in model.type_pointer] if model.type_pointer else None 2548 ) 2549 2550 return SchemaTypeIdentifier( 2551 schema_pointer=model_schema_pointer, 2552 key_pointer=model_key_pointer, 2553 type_pointer=model_type_pointer, 2554 types_mapping=types_mapping, 2555 parameters=model.parameters or {}, 2556 ) 2557 2558 def create_dynamic_schema_loader( 2559 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2560 ) -> DynamicSchemaLoader: 2561 schema_transformations = [] 2562 if model.schema_transformations: 2563 for transformation_model in model.schema_transformations: 2564 schema_transformations.append( 2565 self._create_component_from_model(model=transformation_model, config=config) 2566 ) 2567 name = "dynamic_properties" 2568 retriever = self._create_component_from_model( 2569 model=model.retriever, 2570 config=config, 2571 name=name, 2572 primary_key=None, 2573 partition_router=self._build_stream_slicer_from_partition_router( 2574 model.retriever, config 2575 ), 2576 transformations=[], 2577 use_cache=True, 2578 log_formatter=( 2579 lambda response: format_http_message( 2580 response, 2581 f"Schema loader '{name}' request", 2582 f"Request performed in order to extract schema.", 2583 name, 2584 is_auxiliary=True, 2585 ) 2586 ), 2587 ) 2588 schema_type_identifier = self._create_component_from_model( 2589 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2590 ) 2591 schema_filter = ( 2592 self._create_component_from_model( 2593 model.schema_filter, config=config, parameters=model.parameters or {} 2594 ) 2595 if model.schema_filter is not None 2596 else None 2597 ) 2598 2599 return DynamicSchemaLoader( 
2600 retriever=retriever, 2601 config=config, 2602 schema_transformations=schema_transformations, 2603 schema_filter=schema_filter, 2604 schema_type_identifier=schema_type_identifier, 2605 parameters=model.parameters or {}, 2606 ) 2607 2608 @staticmethod 2609 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2610 return JsonDecoder(parameters={}) 2611 2612 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2613 return CompositeRawDecoder( 2614 parser=ModelToComponentFactory._get_parser(model, config), 2615 stream_response=False if self._emit_connector_builder_messages else True, 2616 ) 2617 2618 def create_jsonl_decoder( 2619 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2620 ) -> Decoder: 2621 return CompositeRawDecoder( 2622 parser=ModelToComponentFactory._get_parser(model, config), 2623 stream_response=False if self._emit_connector_builder_messages else True, 2624 ) 2625 2626 def create_gzip_decoder( 2627 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2628 ) -> Decoder: 2629 _compressed_response_types = { 2630 "gzip", 2631 "x-gzip", 2632 "gzip, deflate", 2633 "x-gzip, deflate", 2634 "application/zip", 2635 "application/gzip", 2636 "application/x-gzip", 2637 "application/x-zip-compressed", 2638 } 2639 2640 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2641 2642 if self._emit_connector_builder_messages: 2643 # This is very surprising but if the response is not streamed, 2644 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2645 # which uses urllib3 directly and does not uncompress the data. 
2646 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2647 2648 return CompositeRawDecoder.by_headers( 2649 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2650 stream_response=True, 2651 fallback_parser=gzip_parser.inner_parser, 2652 ) 2653 2654 @staticmethod 2655 def create_iterable_decoder( 2656 model: IterableDecoderModel, config: Config, **kwargs: Any 2657 ) -> IterableDecoder: 2658 return IterableDecoder(parameters={}) 2659 2660 @staticmethod 2661 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2662 return XmlDecoder(parameters={}) 2663 2664 def create_zipfile_decoder( 2665 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2666 ) -> ZipfileDecoder: 2667 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2668 2669 @staticmethod 2670 def _get_parser(model: BaseModel, config: Config) -> Parser: 2671 if isinstance(model, JsonDecoderModel): 2672 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2673 return JsonParser() 2674 elif isinstance(model, JsonlDecoderModel): 2675 return JsonLineParser() 2676 elif isinstance(model, CsvDecoderModel): 2677 return CsvParser( 2678 encoding=model.encoding, 2679 delimiter=model.delimiter, 2680 set_values_to_none=model.set_values_to_none, 2681 ) 2682 elif isinstance(model, GzipDecoderModel): 2683 return GzipParser( 2684 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2685 ) 2686 elif isinstance( 2687 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2688 ): 2689 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2690 2691 raise ValueError(f"Unknown decoder type {model}") 2692 2693 @staticmethod 2694 def create_json_file_schema_loader( 2695 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2696 ) -> 
JsonFileSchemaLoader: 2697 return JsonFileSchemaLoader( 2698 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2699 ) 2700 2701 def create_jwt_authenticator( 2702 self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2703 ) -> JwtAuthenticator: 2704 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2705 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2706 request_option = ( 2707 self._create_component_from_model(model.request_option, config) 2708 if model.request_option 2709 else None 2710 ) 2711 return JwtAuthenticator( 2712 config=config, 2713 parameters=model.parameters or {}, 2714 algorithm=JwtAlgorithm(model.algorithm.value), 2715 secret_key=model.secret_key, 2716 base64_encode_secret_key=model.base64_encode_secret_key, 2717 token_duration=model.token_duration, 2718 header_prefix=model.header_prefix, 2719 kid=jwt_headers.kid, 2720 typ=jwt_headers.typ, 2721 cty=jwt_headers.cty, 2722 iss=jwt_payload.iss, 2723 sub=jwt_payload.sub, 2724 aud=jwt_payload.aud, 2725 additional_jwt_headers=model.additional_jwt_headers, 2726 additional_jwt_payload=model.additional_jwt_payload, 2727 passphrase=model.passphrase, 2728 request_option=request_option, 2729 ) 2730 2731 def create_list_partition_router( 2732 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2733 ) -> ListPartitionRouter: 2734 request_option = ( 2735 self._create_component_from_model(model.request_option, config) 2736 if model.request_option 2737 else None 2738 ) 2739 return ListPartitionRouter( 2740 cursor_field=model.cursor_field, 2741 request_option=request_option, 2742 values=model.values, 2743 config=config, 2744 parameters=model.parameters or {}, 2745 ) 2746 2747 @staticmethod 2748 def create_min_max_datetime( 2749 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2750 ) -> MinMaxDatetime: 2751 return MinMaxDatetime( 2752 datetime=model.datetime, 2753 
datetime_format=model.datetime_format or "", 2754 max_datetime=model.max_datetime or "", 2755 min_datetime=model.min_datetime or "", 2756 parameters=model.parameters or {}, 2757 ) 2758 2759 @staticmethod 2760 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2761 return NoAuth(parameters=model.parameters or {}) 2762 2763 @staticmethod 2764 def create_no_pagination( 2765 model: NoPaginationModel, config: Config, **kwargs: Any 2766 ) -> NoPagination: 2767 return NoPagination(parameters={}) 2768 2769 def create_oauth_authenticator( 2770 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2771 ) -> DeclarativeOauth2Authenticator: 2772 profile_assertion = ( 2773 self._create_component_from_model(model.profile_assertion, config=config) 2774 if model.profile_assertion 2775 else None 2776 ) 2777 2778 refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values = ( 2779 self._get_refresh_token_error_information(model) 2780 ) 2781 if model.refresh_token_updater: 2782 # ignore type error because fixing it would have a lot of dependencies, revisit later 2783 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2784 config, 2785 InterpolatedString.create( 2786 model.token_refresh_endpoint, # type: ignore 2787 parameters=model.parameters or {}, 2788 ).eval(config), 2789 access_token_name=InterpolatedString.create( 2790 model.access_token_name or "access_token", parameters=model.parameters or {} 2791 ).eval(config), 2792 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2793 expires_in_name=InterpolatedString.create( 2794 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2795 ).eval(config), 2796 client_id_name=InterpolatedString.create( 2797 model.client_id_name or "client_id", parameters=model.parameters or {} 2798 ).eval(config), 2799 client_id=InterpolatedString.create( 2800 model.client_id, parameters=model.parameters or {} 2801 ).eval(config) 
2802 if model.client_id 2803 else model.client_id, 2804 client_secret_name=InterpolatedString.create( 2805 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2806 ).eval(config), 2807 client_secret=InterpolatedString.create( 2808 model.client_secret, parameters=model.parameters or {} 2809 ).eval(config) 2810 if model.client_secret 2811 else model.client_secret, 2812 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2813 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2814 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2815 grant_type_name=InterpolatedString.create( 2816 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2817 ).eval(config), 2818 grant_type=InterpolatedString.create( 2819 model.grant_type or "refresh_token", parameters=model.parameters or {} 2820 ).eval(config), 2821 refresh_request_body=InterpolatedMapping( 2822 model.refresh_request_body or {}, parameters=model.parameters or {} 2823 ).eval(config), 2824 refresh_request_headers=InterpolatedMapping( 2825 model.refresh_request_headers or {}, parameters=model.parameters or {} 2826 ).eval(config), 2827 scopes=model.scopes, 2828 token_expiry_date_format=model.token_expiry_date_format, 2829 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2830 message_repository=self._message_repository, 2831 refresh_token_error_status_codes=refresh_token_error_status_codes, 2832 refresh_token_error_key=refresh_token_error_key, 2833 refresh_token_error_values=refresh_token_error_values, 2834 ) 2835 # ignore type error because fixing it would have a lot of dependencies, revisit later 2836 return DeclarativeOauth2Authenticator( # type: ignore 2837 access_token_name=model.access_token_name or "access_token", 2838 access_token_value=model.access_token_value, 2839 client_id_name=model.client_id_name or "client_id", 2840 client_id=model.client_id, 2841 
client_secret_name=model.client_secret_name or "client_secret", 2842 client_secret=model.client_secret, 2843 expires_in_name=model.expires_in_name or "expires_in", 2844 grant_type_name=model.grant_type_name or "grant_type", 2845 grant_type=model.grant_type or "refresh_token", 2846 refresh_request_body=model.refresh_request_body, 2847 refresh_request_headers=model.refresh_request_headers, 2848 refresh_token_name=model.refresh_token_name or "refresh_token", 2849 refresh_token=model.refresh_token, 2850 scopes=model.scopes, 2851 token_expiry_date=model.token_expiry_date, 2852 token_expiry_date_format=model.token_expiry_date_format, 2853 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2854 token_refresh_endpoint=model.token_refresh_endpoint, 2855 config=config, 2856 parameters=model.parameters or {}, 2857 message_repository=self._message_repository, 2858 profile_assertion=profile_assertion, 2859 use_profile_assertion=model.use_profile_assertion, 2860 refresh_token_error_status_codes=refresh_token_error_status_codes, 2861 refresh_token_error_key=refresh_token_error_key, 2862 refresh_token_error_values=refresh_token_error_values, 2863 ) 2864 2865 @staticmethod 2866 def _get_refresh_token_error_information( 2867 model: OAuthAuthenticatorModel, 2868 ) -> Tuple[Tuple[int, ...], str, Tuple[str, ...]]: 2869 """ 2870 In a previous version of the CDK, the auth error as config_error was only done if a refresh token updater was 2871 defined. As a transition, we added those fields on the OAuthAuthenticatorModel. This method ensures that the 2872 information is defined only once and return the right fields. 
2873 """ 2874 refresh_token_updater = model.refresh_token_updater 2875 is_defined_on_refresh_token_updated = refresh_token_updater and ( 2876 refresh_token_updater.refresh_token_error_status_codes 2877 or refresh_token_updater.refresh_token_error_key 2878 or refresh_token_updater.refresh_token_error_values 2879 ) 2880 is_defined_on_oauth_authenticator = ( 2881 model.refresh_token_error_status_codes 2882 or model.refresh_token_error_key 2883 or model.refresh_token_error_values 2884 ) 2885 if is_defined_on_refresh_token_updated and is_defined_on_oauth_authenticator: 2886 raise ValueError( 2887 "refresh_token_error should either be defined on the OAuthAuthenticatorModel or the RefreshTokenUpdaterModel, not both" 2888 ) 2889 2890 if is_defined_on_refresh_token_updated: 2891 not_optional_refresh_token_updater: RefreshTokenUpdaterModel = refresh_token_updater # type: ignore # we know from the condition that this is not None 2892 return ( 2893 tuple(not_optional_refresh_token_updater.refresh_token_error_status_codes) 2894 if not_optional_refresh_token_updater.refresh_token_error_status_codes 2895 else (), 2896 not_optional_refresh_token_updater.refresh_token_error_key or "", 2897 tuple(not_optional_refresh_token_updater.refresh_token_error_values) 2898 if not_optional_refresh_token_updater.refresh_token_error_values 2899 else (), 2900 ) 2901 elif is_defined_on_oauth_authenticator: 2902 return ( 2903 tuple(model.refresh_token_error_status_codes) 2904 if model.refresh_token_error_status_codes 2905 else (), 2906 model.refresh_token_error_key or "", 2907 tuple(model.refresh_token_error_values) if model.refresh_token_error_values else (), 2908 ) 2909 2910 # returning default values we think cover most cases 2911 return (400,), "error", ("invalid_grant", "invalid_permissions") 2912 2913 def create_offset_increment( 2914 self, 2915 model: OffsetIncrementModel, 2916 config: Config, 2917 decoder: Decoder, 2918 extractor_model: Optional[Union[CustomRecordExtractorModel, 
DpathExtractorModel]] = None, 2919 **kwargs: Any, 2920 ) -> OffsetIncrement: 2921 if isinstance(decoder, PaginationDecoderDecorator): 2922 inner_decoder = decoder.decoder 2923 else: 2924 inner_decoder = decoder 2925 decoder = PaginationDecoderDecorator(decoder=decoder) 2926 2927 if self._is_supported_decoder_for_pagination(inner_decoder): 2928 decoder_to_use = decoder 2929 else: 2930 raise ValueError( 2931 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2932 ) 2933 2934 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2935 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2936 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2937 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2938 # When we have more time to investigate we can look into reusing the same component. 2939 extractor = ( 2940 self._create_component_from_model( 2941 model=extractor_model, config=config, decoder=decoder_to_use 2942 ) 2943 if extractor_model 2944 else None 2945 ) 2946 2947 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 2948 # If page_size is a string that represents an integer (not an interpolation), convert it back. 
2949 page_size = model.page_size 2950 if isinstance(page_size, str) and page_size.isdigit(): 2951 page_size = int(page_size) 2952 2953 return OffsetIncrement( 2954 page_size=page_size, 2955 config=config, 2956 decoder=decoder_to_use, 2957 extractor=extractor, 2958 inject_on_first_request=model.inject_on_first_request or False, 2959 parameters=model.parameters or {}, 2960 ) 2961 2962 @staticmethod 2963 def create_page_increment( 2964 model: PageIncrementModel, config: Config, **kwargs: Any 2965 ) -> PageIncrement: 2966 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 2967 # If page_size is a string that represents an integer (not an interpolation), convert it back. 2968 page_size = model.page_size 2969 if isinstance(page_size, str) and page_size.isdigit(): 2970 page_size = int(page_size) 2971 2972 return PageIncrement( 2973 page_size=page_size, 2974 config=config, 2975 start_from_page=model.start_from_page or 0, 2976 inject_on_first_request=model.inject_on_first_request or False, 2977 parameters=model.parameters or {}, 2978 ) 2979 2980 def create_parent_stream_config( 2981 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 2982 ) -> ParentStreamConfig: 2983 declarative_stream = self._create_component_from_model( 2984 model.stream, 2985 config=config, 2986 is_parent=True, 2987 **kwargs, 2988 ) 2989 request_option = ( 2990 self._create_component_from_model(model.request_option, config=config) 2991 if model.request_option 2992 else None 2993 ) 2994 2995 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2996 raise ValueError( 2997 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 
2998 ) 2999 3000 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 3001 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 3002 ) 3003 3004 return ParentStreamConfig( 3005 parent_key=model.parent_key, 3006 request_option=request_option, 3007 stream=declarative_stream, 3008 partition_field=model.partition_field, 3009 config=config, 3010 incremental_dependency=model.incremental_dependency or False, 3011 parameters=model.parameters or {}, 3012 extra_fields=model.extra_fields, 3013 lazy_read_pointer=model_lazy_read_pointer, 3014 ) 3015 3016 def create_properties_from_endpoint( 3017 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 3018 ) -> PropertiesFromEndpoint: 3019 retriever = self._create_component_from_model( 3020 model=model.retriever, 3021 config=config, 3022 name="dynamic_properties", 3023 primary_key=None, 3024 stream_slicer=None, 3025 transformations=[], 3026 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 3027 ) 3028 return PropertiesFromEndpoint( 3029 property_field_path=model.property_field_path, 3030 retriever=retriever, 3031 config=config, 3032 parameters=model.parameters or {}, 3033 ) 3034 3035 def create_property_chunking( 3036 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 3037 ) -> PropertyChunking: 3038 record_merge_strategy = ( 3039 self._create_component_from_model( 3040 model=model.record_merge_strategy, config=config, **kwargs 3041 ) 3042 if model.record_merge_strategy 3043 else None 3044 ) 3045 3046 property_limit_type: PropertyLimitType 3047 match model.property_limit_type: 3048 case PropertyLimitTypeModel.property_count: 3049 property_limit_type = PropertyLimitType.property_count 3050 case PropertyLimitTypeModel.characters: 3051 property_limit_type = PropertyLimitType.characters 3052 case _: 3053 raise 
ValueError(f"Invalid PropertyLimitType {property_limit_type}") 3054 3055 return PropertyChunking( 3056 property_limit_type=property_limit_type, 3057 property_limit=model.property_limit, 3058 record_merge_strategy=record_merge_strategy, 3059 config=config, 3060 parameters=model.parameters or {}, 3061 ) 3062 3063 def create_query_properties( 3064 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 3065 ) -> QueryProperties: 3066 if isinstance(model.property_list, list): 3067 property_list = model.property_list 3068 else: 3069 property_list = self._create_component_from_model( 3070 model=model.property_list, config=config, **kwargs 3071 ) 3072 3073 property_chunking = ( 3074 self._create_component_from_model( 3075 model=model.property_chunking, config=config, **kwargs 3076 ) 3077 if model.property_chunking 3078 else None 3079 ) 3080 3081 property_selector = ( 3082 self._create_component_from_model( 3083 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3084 ) 3085 if model.property_selector 3086 else None 3087 ) 3088 3089 return QueryProperties( 3090 property_list=property_list, 3091 always_include_properties=model.always_include_properties, 3092 property_chunking=property_chunking, 3093 property_selector=property_selector, 3094 config=config, 3095 parameters=model.parameters or {}, 3096 ) 3097 3098 def create_json_schema_property_selector( 3099 self, 3100 model: JsonSchemaPropertySelectorModel, 3101 config: Config, 3102 *, 3103 stream_name: str, 3104 **kwargs: Any, 3105 ) -> JsonSchemaPropertySelector: 3106 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3107 3108 transformations = [] 3109 if model.transformations: 3110 for transformation_model in model.transformations: 3111 transformations.append( 3112 self._create_component_from_model(model=transformation_model, config=config) 3113 ) 3114 3115 return JsonSchemaPropertySelector( 3116 configured_stream=configured_stream, 3117 
properties_transformations=transformations, 3118 config=config, 3119 parameters=model.parameters or {}, 3120 ) 3121 3122 @staticmethod 3123 def create_record_filter( 3124 model: RecordFilterModel, config: Config, **kwargs: Any 3125 ) -> RecordFilter: 3126 return RecordFilter( 3127 condition=model.condition or "", config=config, parameters=model.parameters or {} 3128 ) 3129 3130 @staticmethod 3131 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 3132 return RequestPath(parameters={}) 3133 3134 @staticmethod 3135 def create_request_option( 3136 model: RequestOptionModel, config: Config, **kwargs: Any 3137 ) -> RequestOption: 3138 inject_into = RequestOptionType(model.inject_into.value) 3139 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3140 [ 3141 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3142 for segment in model.field_path 3143 ] 3144 if model.field_path 3145 else None 3146 ) 3147 field_name = ( 3148 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3149 if model.field_name 3150 else None 3151 ) 3152 return RequestOption( 3153 field_name=field_name, 3154 field_path=field_path, 3155 inject_into=inject_into, 3156 parameters=kwargs.get("parameters", {}), 3157 ) 3158 3159 def create_record_selector( 3160 self, 3161 model: RecordSelectorModel, 3162 config: Config, 3163 *, 3164 name: str, 3165 transformations: List[RecordTransformation] | None = None, 3166 decoder: Decoder | None = None, 3167 client_side_incremental_sync_cursor: Optional[Cursor] = None, 3168 file_uploader: Optional[DefaultFileUploader] = None, 3169 **kwargs: Any, 3170 ) -> RecordSelector: 3171 extractor = self._create_component_from_model( 3172 model=model.extractor, decoder=decoder, config=config 3173 ) 3174 record_filter = ( 3175 self._create_component_from_model(model.record_filter, config=config) 3176 if model.record_filter 3177 else None 3178 ) 3179 3180 
transform_before_filtering = ( 3181 False if model.transform_before_filtering is None else model.transform_before_filtering 3182 ) 3183 if client_side_incremental_sync_cursor: 3184 record_filter = ClientSideIncrementalRecordFilterDecorator( 3185 config=config, 3186 parameters=model.parameters, 3187 condition=model.record_filter.condition 3188 if (model.record_filter and hasattr(model.record_filter, "condition")) 3189 else None, 3190 cursor=client_side_incremental_sync_cursor, 3191 ) 3192 transform_before_filtering = ( 3193 True 3194 if model.transform_before_filtering is None 3195 else model.transform_before_filtering 3196 ) 3197 3198 if model.schema_normalization is None: 3199 # default to no schema normalization if not set 3200 model.schema_normalization = SchemaNormalizationModel.None_ 3201 3202 schema_normalization = ( 3203 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3204 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3205 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3206 ) 3207 3208 return RecordSelector( 3209 extractor=extractor, 3210 name=name, 3211 config=config, 3212 record_filter=record_filter, 3213 transformations=transformations or [], 3214 file_uploader=file_uploader, 3215 schema_normalization=schema_normalization, 3216 parameters=model.parameters or {}, 3217 transform_before_filtering=transform_before_filtering, 3218 ) 3219 3220 @staticmethod 3221 def create_remove_fields( 3222 model: RemoveFieldsModel, config: Config, **kwargs: Any 3223 ) -> RemoveFields: 3224 return RemoveFields( 3225 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3226 ) 3227 3228 def create_selective_authenticator( 3229 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3230 ) -> DeclarativeAuthenticator: 3231 authenticators = { 3232 name: 
                self._create_component_from_model(model=auth, config=config)
            for name, auth in model.authenticators.items()
        }
        # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error
        return SelectiveAuthenticator(  # type: ignore[abstract]
            config=config,
            authenticators=authenticators,
            authenticator_selection_path=model.authenticator_selection_path,
            **kwargs,
        )

    @staticmethod
    def create_legacy_session_token_authenticator(
        model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any
    ) -> LegacySessionTokenAuthenticator:
        """Build the legacy session-token authenticator; missing credential fields default to empty strings."""
        return LegacySessionTokenAuthenticator(
            api_url=url_base,
            header=model.header,
            login_url=model.login_url,
            password=model.password or "",
            session_token=model.session_token or "",
            session_token_response_key=model.session_token_response_key or "",
            username=model.username or "",
            validate_session_url=model.validate_session_url,
            config=config,
            parameters=model.parameters or {},
        )

    def create_simple_retriever(
        self,
        model: SimpleRetrieverModel,
        config: Config,
        *,
        name: str,
        primary_key: Optional[Union[str, List[str], List[List[str]]]],
        request_options_provider: Optional[RequestOptionsProvider] = None,
        cursor: Optional[Cursor] = None,
        has_stop_condition_cursor: bool = False,
        is_client_side_incremental_sync: bool = False,
        transformations: List[RecordTransformation],
        file_uploader: Optional[DefaultFileUploader] = None,
        incremental_sync: Optional[
            Union[IncrementingCountCursorModel, DatetimeBasedCursorModel]
        ] = None,
        use_cache: Optional[bool] = None,
        log_formatter: Optional[Callable[[Response], Any]] = None,
        partition_router: Optional[PartitionRouter] = None,
        **kwargs: Any,
    ) -> SimpleRetriever:
        """Assemble a SimpleRetriever (requester, record selector, paginator, query properties).

        Returns a LazySimpleRetriever instead when the partition router is a substream router
        whose parent configs use lazy_read_pointer and no state exists yet for the stream.
        """

        def _get_url(req: Requester) -> str:
            """
            Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever.
            This is needed because the URL is not set until the requester is created.
            """

            _url: str = (
                model.requester.url
                if hasattr(model.requester, "url") and model.requester.url is not None
                else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
            )
            _url_base: str = (
                model.requester.url_base
                if hasattr(model.requester, "url_base") and model.requester.url_base is not None
                else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None)
            )

            return _url or _url_base

        if cursor is None:
            # Streams without an incremental cursor still need one to emit a final state message.
            cursor = FinalStateCursor(name, None, self._message_repository)

        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        record_selector = self._create_component_from_model(
            model=model.record_selector,
            name=name,
            config=config,
            decoder=decoder,
            transformations=transformations,
            client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None,
            file_uploader=file_uploader,
        )

        query_properties: Optional[QueryProperties] = None
        query_properties_key: Optional[str] = None
        self._ensure_query_properties_to_model(model.requester)
        if self._has_query_properties_in_request_parameters(model.requester):
            # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple
            # places instead of default to request_parameters which isn't clearly documented
            if (
                hasattr(model.requester, "fetch_properties_from_endpoint")
                and model.requester.fetch_properties_from_endpoint
            ):
                raise ValueError(
                    f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
                )

            query_properties_definitions = []
            for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters()
                if isinstance(request_parameter, QueryPropertiesModel):
                    query_properties_key = key
                    query_properties_definitions.append(request_parameter)

            if len(query_properties_definitions) > 1:
                raise ValueError(
                    f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
                )

            if len(query_properties_definitions) == 1:
                query_properties = self._create_component_from_model(
                    model=query_properties_definitions[0], stream_name=name, config=config
                )

            # Removes QueryProperties components from the interpolated mappings because it has been designed
            # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly
            # instead of through jinja interpolation
            if hasattr(model.requester, "request_parameters") and isinstance(
                model.requester.request_parameters, Mapping
            ):
                model.requester.request_parameters = self._remove_query_properties(
                    model.requester.request_parameters
                )
        elif (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            # todo: Deprecate this condition once dependent connectors migrate to query_properties
            query_properties_definition = QueryPropertiesModel(
                type="QueryProperties",
                property_list=model.requester.fetch_properties_from_endpoint,
                always_include_properties=None,
                property_chunking=None,
            )  # type: ignore # $parameters has a default value

            query_properties = self.create_query_properties(
                model=query_properties_definition,
                stream_name=name,
                config=config,
            )
        elif hasattr(model.requester, "query_properties") and model.requester.query_properties:
            query_properties = self.create_query_properties(
                model=model.requester.query_properties,
                stream_name=name,
                config=config,
            )

        requester = self._create_component_from_model(
            model=model.requester,
            decoder=decoder,
            name=name,
            query_properties_key=query_properties_key,
            use_cache=use_cache,
            config=config,
        )

        if not request_options_provider:
            request_options_provider = DefaultRequestOptionsProvider(parameters={})
        # A partition router doubles as the request-options provider when no explicit one was set.
        if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(
            partition_router, PartitionRouter
        ):
            request_options_provider = partition_router

        paginator = (
            self._create_component_from_model(
                model=model.paginator,
                config=config,
                url_base=_get_url(requester),
                extractor_model=model.record_selector.extractor,
                decoder=decoder,
                cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None,
            )
            if model.paginator
            else NoPagination(parameters={})
        )

        ignore_stream_slicer_parameters_on_paginated_requests = (
            model.ignore_stream_slicer_parameters_on_paginated_requests or False
        )

        # Lazy read path: substream router with lazy_read_pointer parents and no prior state.
        if (
            model.partition_router
            and isinstance(model.partition_router, SubstreamPartitionRouterModel)
            and not bool(self._connector_state_manager.get_stream_state(name, None))
            and any(
                parent_stream_config.lazy_read_pointer
                for parent_stream_config in model.partition_router.parent_stream_configs
            )
        ):
            if incremental_sync:
                if incremental_sync.type != "DatetimeBasedCursor":
                    raise ValueError(
                        f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                    )

                elif incremental_sync.step or incremental_sync.cursor_granularity:
                    raise ValueError(
                        f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                    )

            if model.decoder and model.decoder.type != "JsonDecoder":
                raise ValueError(
                    f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
                )

            return LazySimpleRetriever(
                name=name,
                paginator=paginator,
                primary_key=primary_key,
                requester=requester,
                record_selector=record_selector,
                stream_slicer=_NO_STREAM_SLICING,
                request_option_provider=request_options_provider,
                config=config,
                ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
                parameters=model.parameters or {},
            )

        if (
            model.record_selector.record_filter
            and model.pagination_reset
            and model.pagination_reset.limits
        ):
            raise ValueError("PaginationResetLimits are not supported while having record filter.")

        return SimpleRetriever(
            name=name,
            paginator=paginator,
            primary_key=primary_key,
            requester=requester,
            record_selector=record_selector,
            stream_slicer=_NO_STREAM_SLICING,
            request_option_provider=request_options_provider,
            config=config,
            ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
            additional_query_properties=query_properties,
            log_formatter=self._get_log_formatter(log_formatter, name),
            pagination_tracker_factory=self._create_pagination_tracker_factory(
                model.pagination_reset, cursor
            ),
            parameters=model.parameters or {},
        )

    def _create_pagination_tracker_factory(
        self, model: Optional[PaginationResetModel], cursor: Cursor
    ) -> Callable[[], PaginationTracker]:
        """Build a factory producing a PaginationTracker per the pagination-reset configuration."""
        if model is None:
            return lambda: PaginationTracker()
        # Until we figure out a way to use any cursor for PaginationTracker, we will have to have this cursor selector logic
        cursor_factory: Callable[[], Optional[ConcurrentCursor]] = lambda: None
        if model.action == PaginationResetActionModel.RESET:
            # in that case, we will let cursor_factory to return None even if the stream has a cursor
            pass
        elif model.action == PaginationResetActionModel.SPLIT_USING_CURSOR:
            if isinstance(cursor, ConcurrentCursor):
                cursor_factory = lambda: cursor.copy_without_state()  # type: ignore # the if condition validates that it is a ConcurrentCursor
            elif isinstance(cursor, ConcurrentPerPartitionCursor):
                cursor_factory = lambda: cursor._cursor_factory.create(  # type: ignore # if this becomes a problem, we would need to extract the cursor_factory instantiation logic and make it accessible here
                    {}, datetime.timedelta(0)
                )
            elif not isinstance(cursor, FinalStateCursor):
                LOGGER.warning(
                    "Unknown cursor for PaginationTracker. Pagination resets might not work properly"
                )
        else:
            raise ValueError(f"Unknown PaginationReset action: {model.action}")

        # NOTE(review): `model` is already known non-None here (early return above),
        # so the `model and` guard is redundant.
        limit = model.limits.number_of_records if model and model.limits else None
        return lambda: PaginationTracker(cursor_factory(), limit)

    def _get_log_formatter(
        self, log_formatter: Callable[[Response], Any] | None, name: str
    ) -> Callable[[Response], Any] | None:
        """Return an HTTP log formatter for slice-limited (test read) runs, None otherwise."""
        if self._should_limit_slices_fetched():
            return (
                (
                    lambda response: format_http_message(
                        response,
                        f"Stream '{name}' request",
                        f"Request performed in order to extract records for stream '{name}'",
                        name,
                    )
                )
                if not log_formatter
                else log_formatter
            )
        return None

    def _should_limit_slices_fetched(self) -> bool:
        """
        Returns True if the number of slices fetched should be limited, False otherwise.
        This is used to limit the number of slices fetched during tests.
        """
        return bool(self._limit_slices_fetched or self._emit_connector_builder_messages)

    @staticmethod
    def _has_query_properties_in_request_parameters(
        requester: Union[HttpRequesterModel, CustomRequesterModel],
    ) -> bool:
        """Return True if any request_parameters value is a QueryPropertiesModel."""
        if not hasattr(requester, "request_parameters"):
            return False
        request_parameters = requester.request_parameters
        if request_parameters and isinstance(request_parameters, Mapping):
            for request_parameter in request_parameters.values():
                if isinstance(request_parameter, QueryPropertiesModel):
                    return True
        return False

    @staticmethod
    def _remove_query_properties(
        request_parameters: Mapping[str, Union[str, QueryPropertiesModel]],
    ) -> Mapping[str, str]:
        """Return request_parameters with every QueryPropertiesModel entry dropped."""
        return {
            parameter_field: request_parameter
            for parameter_field, request_parameter in request_parameters.items()
            if not isinstance(request_parameter, QueryPropertiesModel)
        }

    def create_state_delegating_stream(
        self,
        model: StateDelegatingStreamModel,
        config: Config,
        **kwargs: Any,
    ) -> DefaultStream:
        """Build the full-refresh or incremental variant of the stream based on existing state.

        When api_retention_period is configured (interpolated against config), a stored cursor
        older than the retention period forces a fall back to the full-refresh variant.
        """
        if (
            model.full_refresh_stream.name != model.name
            or model.name != model.incremental_stream.name
        ):
            raise ValueError(
                f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
3565 ) 3566 3567 # Resolve api_retention_period with config context (supports Jinja2 interpolation) 3568 resolved_retention_period: Optional[str] = None 3569 if model.api_retention_period: 3570 interpolated_retention = InterpolatedString.create( 3571 model.api_retention_period, parameters=model.parameters or {} 3572 ) 3573 resolved_value = interpolated_retention.eval(config=config) 3574 if resolved_value: 3575 resolved_retention_period = str(resolved_value) 3576 3577 if resolved_retention_period: 3578 for stream_model in (model.full_refresh_stream, model.incremental_stream): 3579 if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel): 3580 raise ValueError( 3581 f"Stream '{model.name}' uses IncrementingCountCursor which is not supported " 3582 f"with api_retention_period. IncrementingCountCursor does not use datetime-based " 3583 f"cursors, so cursor age validation cannot be performed." 3584 ) 3585 3586 stream_state = self._connector_state_manager.get_stream_state(model.name, None) 3587 3588 if not stream_state: 3589 return self._create_component_from_model( # type: ignore[no-any-return] 3590 model.full_refresh_stream, config=config, **kwargs 3591 ) 3592 3593 incremental_stream: DefaultStream = self._create_component_from_model( 3594 model.incremental_stream, config=config, **kwargs 3595 ) # type: ignore[assignment] 3596 3597 # Only run cursor age validation for streams that are in the configured 3598 # catalog (or when no catalog was provided, e.g. during discover / connector 3599 # builder). Streams not selected by the user but instantiated as parent-stream 3600 # dependencies must not go through this path because it emits state messages 3601 # that the destination does not know about, causing "Stream not found" crashes. 
3602 stream_is_in_catalog = ( 3603 not self._stream_name_to_configured_stream # no catalog → validate by default 3604 or model.name in self._stream_name_to_configured_stream 3605 ) 3606 if resolved_retention_period and stream_is_in_catalog: 3607 full_refresh_stream: DefaultStream = self._create_component_from_model( 3608 model.full_refresh_stream, config=config, **kwargs 3609 ) # type: ignore[assignment] 3610 if self._is_cursor_older_than_retention_period( 3611 stream_state, 3612 full_refresh_stream.cursor, 3613 incremental_stream.cursor, 3614 resolved_retention_period, 3615 model.name, 3616 ): 3617 # Clear state BEFORE constructing the full_refresh_stream so that 3618 # its cursor starts from start_date instead of the stale cursor. 3619 self._connector_state_manager.update_state_for_stream(model.name, None, {}) 3620 state_message = self._connector_state_manager.create_state_message(model.name, None) 3621 self._message_repository.emit_message(state_message) 3622 return self._create_component_from_model( # type: ignore[no-any-return] 3623 model.full_refresh_stream, config=config, **kwargs 3624 ) 3625 3626 return incremental_stream 3627 3628 @staticmethod 3629 def _is_cursor_older_than_retention_period( 3630 stream_state: Mapping[str, Any], 3631 full_refresh_cursor: Cursor, 3632 incremental_cursor: Cursor, 3633 api_retention_period: str, 3634 stream_name: str, 3635 ) -> bool: 3636 """Check if the cursor value in the state is older than the API's retention period. 3637 3638 Checks cursors in sequence: full refresh cursor first, then incremental cursor. 3639 FinalStateCursor returns now() for completed full refresh state (NO_CURSOR_STATE_KEY), 3640 which is always within retention, so we use incremental. For other states, it returns 3641 None and we fall back to checking the incremental cursor. 3642 3643 Returns True if the cursor is older than the retention period (should use full refresh). 
3644 Returns False if the cursor is within the retention period (safe to use incremental). 3645 """ 3646 retention_duration = parse_duration(api_retention_period) 3647 retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration 3648 3649 # Check full refresh cursor first 3650 cursor_datetime = full_refresh_cursor.get_cursor_datetime_from_state(stream_state) 3651 3652 # If full refresh cursor returns None, check incremental cursor 3653 if cursor_datetime is None: 3654 cursor_datetime = incremental_cursor.get_cursor_datetime_from_state(stream_state) 3655 3656 if cursor_datetime is None: 3657 # Neither cursor could parse the state - fall back to full refresh to be safe 3658 return True 3659 3660 if cursor_datetime < retention_cutoff: 3661 logging.warning( 3662 f"Stream '{stream_name}' has a cursor value older than " 3663 f"the API's retention period of {api_retention_period} " 3664 f"(cutoff: {retention_cutoff.isoformat()}). " 3665 f"Falling back to full refresh to avoid data loss." 
3666 ) 3667 return True 3668 3669 return False 3670 3671 def _get_state_delegating_stream_model( 3672 self, 3673 model: StateDelegatingStreamModel, 3674 parent_state: Optional[Mapping[str, Any]] = None, 3675 ) -> DeclarativeStreamModel: 3676 """Return the appropriate underlying stream model based on state.""" 3677 return ( 3678 model.incremental_stream 3679 if self._connector_state_manager.get_stream_state(model.name, None) or parent_state 3680 else model.full_refresh_stream 3681 ) 3682 3683 def _create_async_job_status_mapping( 3684 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3685 ) -> Mapping[str, AsyncJobStatus]: 3686 api_status_to_cdk_status = {} 3687 for cdk_status, api_statuses in model.dict().items(): 3688 if cdk_status == "type": 3689 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3690 continue 3691 3692 for status in api_statuses: 3693 if status in api_status_to_cdk_status: 3694 raise ValueError( 3695 f"API status {status} is already set for CDK status {cdk_status}. 
Please ensure API statuses are only provided once" 3696 ) 3697 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3698 return api_status_to_cdk_status 3699 3700 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3701 match status: 3702 case "running": 3703 return AsyncJobStatus.RUNNING 3704 case "completed": 3705 return AsyncJobStatus.COMPLETED 3706 case "failed": 3707 return AsyncJobStatus.FAILED 3708 case "timeout": 3709 return AsyncJobStatus.TIMED_OUT 3710 case _: 3711 raise ValueError(f"Unsupported CDK status {status}") 3712 3713 def create_async_retriever( 3714 self, 3715 model: AsyncRetrieverModel, 3716 config: Config, 3717 *, 3718 name: str, 3719 primary_key: Optional[ 3720 Union[str, List[str], List[List[str]]] 3721 ], # this seems to be needed to match create_simple_retriever 3722 stream_slicer: Optional[StreamSlicer], 3723 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3724 transformations: List[RecordTransformation], 3725 **kwargs: Any, 3726 ) -> AsyncRetriever: 3727 if model.download_target_requester and not model.download_target_extractor: 3728 raise ValueError( 3729 f"`download_target_extractor` required if using a `download_target_requester`" 3730 ) 3731 3732 def _get_download_retriever( 3733 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3734 ) -> SimpleRetriever: 3735 # We create a record selector for the download retriever 3736 # with no schema normalization and no transformations, neither record filter 3737 # as all this occurs in the record_selector of the AsyncRetriever 3738 record_selector = RecordSelector( 3739 extractor=extractor, 3740 name=name, 3741 record_filter=None, 3742 transformations=[], 3743 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3744 config=config, 3745 parameters={}, 3746 ) 3747 paginator = ( 3748 self._create_component_from_model( 3749 model=model.download_paginator, 3750 decoder=_decoder, 3751 config=config, 3752 url_base="", 
3753 ) 3754 if model.download_paginator 3755 else NoPagination(parameters={}) 3756 ) 3757 3758 return SimpleRetriever( 3759 requester=requester, 3760 record_selector=record_selector, 3761 primary_key=None, 3762 name=name, 3763 paginator=paginator, 3764 config=config, 3765 parameters={}, 3766 log_formatter=self._get_log_formatter(None, name), 3767 ) 3768 3769 def _get_job_timeout() -> datetime.timedelta: 3770 user_defined_timeout: Optional[int] = ( 3771 int( 3772 InterpolatedString.create( 3773 str(model.polling_job_timeout), 3774 parameters={}, 3775 ).eval(config) 3776 ) 3777 if model.polling_job_timeout 3778 else None 3779 ) 3780 3781 # check for user defined timeout during the test read or 15 minutes 3782 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3783 # default value for non-connector builder is 60 minutes. 3784 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3785 3786 return ( 3787 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3788 ) 3789 3790 decoder = ( 3791 self._create_component_from_model(model=model.decoder, config=config) 3792 if model.decoder 3793 else JsonDecoder(parameters={}) 3794 ) 3795 record_selector = self._create_component_from_model( 3796 model=model.record_selector, 3797 config=config, 3798 decoder=decoder, 3799 name=name, 3800 transformations=transformations, 3801 client_side_incremental_sync=client_side_incremental_sync, 3802 ) 3803 3804 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3805 if self._should_limit_slices_fetched(): 3806 stream_slicer = cast( 3807 StreamSlicer, 3808 StreamSlicerTestReadDecorator( 3809 wrapped_slicer=stream_slicer, 3810 maximum_number_of_slices=self._limit_slices_fetched or 5, 3811 ), 3812 ) 3813 3814 creation_requester = self._create_component_from_model( 3815 model=model.creation_requester, 3816 decoder=decoder, 3817 config=config, 3818 name=f"job creation - {name}", 3819 ) 3820 
        polling_requester = self._create_component_from_model(
            model=model.polling_requester,
            decoder=decoder,
            config=config,
            name=f"job polling - {name}",
        )
        job_download_components_name = f"job download - {name}"
        download_decoder = (
            self._create_component_from_model(model=model.download_decoder, config=config)
            if model.download_decoder
            else JsonDecoder(parameters={})
        )
        download_extractor = (
            self._create_component_from_model(
                model=model.download_extractor,
                config=config,
                decoder=download_decoder,
                parameters=model.parameters,
            )
            if model.download_extractor
            else DpathExtractor(
                [],
                config=config,
                decoder=download_decoder,
                parameters=model.parameters or {},
            )
        )
        download_requester = self._create_component_from_model(
            model=model.download_requester,
            decoder=download_decoder,
            config=config,
            name=job_download_components_name,
        )
        download_retriever = _get_download_retriever(
            download_requester, download_extractor, download_decoder
        )
        # abort / delete / download-target requesters are optional in the model.
        abort_requester = (
            self._create_component_from_model(
                model=model.abort_requester,
                decoder=decoder,
                config=config,
                name=f"job abort - {name}",
            )
            if model.abort_requester
            else None
        )
        delete_requester = (
            self._create_component_from_model(
                model=model.delete_requester,
                decoder=decoder,
                config=config,
                name=f"job delete - {name}",
            )
            if model.delete_requester
            else None
        )
        download_target_requester = (
            self._create_component_from_model(
                model=model.download_target_requester,
                decoder=decoder,
                config=config,
                name=f"job extract_url - {name}",
            )
            if model.download_target_requester
            else None
        )
        status_extractor = self._create_component_from_model(
            model=model.status_extractor, decoder=decoder, config=config, name=name
        )
        download_target_extractor = (
            self._create_component_from_model(
                model=model.download_target_extractor,
                decoder=decoder,
                config=config,
                name=name,
            )
            if model.download_target_extractor
            else None
        )

        job_repository: AsyncJobRepository = AsyncHttpJobRepository(
            creation_requester=creation_requester,
            polling_requester=polling_requester,
            download_retriever=download_retriever,
            download_target_requester=download_target_requester,
            abort_requester=abort_requester,
            delete_requester=delete_requester,
            status_extractor=status_extractor,
            status_mapping=self._create_async_job_status_mapping(model.status_mapping, config),
            download_target_extractor=download_target_extractor,
            job_timeout=_get_job_timeout(),
        )

        async_job_partition_router = AsyncJobPartitionRouter(
            job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
                job_repository,
                stream_slices,
                self._job_tracker,
                self._message_repository,
                # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk
                has_bulk_parent=False,
                # set the `job_max_retry` to 1 for the `Connector Builder`` use-case.
                # `None` == default retry is set to 3 attempts, under the hood.
                job_max_retry=1 if self._emit_connector_builder_messages else None,
            ),
            stream_slicer=stream_slicer,
            config=config,
            parameters=model.parameters or {},
        )

        return AsyncRetriever(
            record_selector=record_selector,
            stream_slicer=async_job_partition_router,
            config=config,
            parameters=model.parameters or {},
        )

    def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec:
        """Build the connector Spec, including config migrations, transformations and validations."""
        config_migrations = [
            self._create_component_from_model(migration, config)
            for migration in (
                model.config_normalization_rules.config_migrations
                if (
                    model.config_normalization_rules
                    and model.config_normalization_rules.config_migrations
                )
                else []
            )
        ]
        config_transformations = [
            self._create_component_from_model(transformation, config)
            for transformation in (
                model.config_normalization_rules.transformations
                if (
                    model.config_normalization_rules
                    and model.config_normalization_rules.transformations
                )
                else []
            )
        ]
        config_validations = [
            self._create_component_from_model(validation, config)
            for validation in (
                model.config_normalization_rules.validations
                if (
                    model.config_normalization_rules
                    and model.config_normalization_rules.validations
                )
                else []
            )
        ]

        return Spec(
            connection_specification=model.connection_specification,
            documentation_url=model.documentation_url,
            advanced_auth=model.advanced_auth,
            parameters={},
            config_migrations=config_migrations,
            config_transformations=config_transformations,
            config_validations=config_validations,
        )

    def create_substream_partition_router(
        self,
        model: SubstreamPartitionRouterModel,
        config: Config,
        *,
        stream_name: str,
        **kwargs: Any,
    ) -> SubstreamPartitionRouter:
        """Build a SubstreamPartitionRouter whose parent streams are built via a dedicated sub-factory."""
        parent_stream_configs = []
        if model.parent_stream_configs:
3992 parent_stream_configs.extend( 3993 [ 3994 self.create_parent_stream_config_with_substream_wrapper( 3995 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 3996 ) 3997 for parent_stream_config in model.parent_stream_configs 3998 ] 3999 ) 4000 4001 return SubstreamPartitionRouter( 4002 parent_stream_configs=parent_stream_configs, 4003 parameters=model.parameters or {}, 4004 config=config, 4005 ) 4006 4007 def create_parent_stream_config_with_substream_wrapper( 4008 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 4009 ) -> Any: 4010 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 4011 4012 parent_state: Optional[Mapping[str, Any]] = ( 4013 child_state if model.incremental_dependency and child_state else None 4014 ) 4015 connector_state_manager = self._instantiate_parent_stream_state_manager( 4016 child_state, config, model, parent_state 4017 ) 4018 4019 substream_factory = ModelToComponentFactory( 4020 connector_state_manager=connector_state_manager, 4021 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 4022 limit_slices_fetched=self._limit_slices_fetched, 4023 emit_connector_builder_messages=self._emit_connector_builder_messages, 4024 disable_retries=self._disable_retries, 4025 disable_cache=self._disable_cache, 4026 message_repository=StateFilteringMessageRepository( 4027 LogAppenderMessageRepositoryDecorator( 4028 { 4029 "airbyte_cdk": {"stream": {"is_substream": True}}, 4030 "http": {"is_auxiliary": True}, 4031 }, 4032 self._message_repository, 4033 self._evaluate_log_level(self._emit_connector_builder_messages), 4034 ), 4035 ), 4036 api_budget=self._api_budget, 4037 ) 4038 4039 return substream_factory.create_parent_stream_config( 4040 model=model, config=config, stream_name=stream_name, **kwargs 4041 ) 4042 4043 def _instantiate_parent_stream_state_manager( 4044 self, 4045 child_state: MutableMapping[str, Any], 4046 config: Config, 4047 
model: ParentStreamConfigModel, 4048 parent_state: Optional[Mapping[str, Any]] = None, 4049 ) -> ConnectorStateManager: 4050 """ 4051 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 4052 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 4053 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 4054 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 4055 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 4056 incremental_dependency is set. 4057 """ 4058 if model.incremental_dependency and child_state: 4059 parent_stream_name = model.stream.name or "" 4060 extracted_parent_state = ConcurrentPerPartitionCursor.get_parent_state( 4061 child_state, parent_stream_name 4062 ) 4063 4064 if not extracted_parent_state: 4065 extracted_parent_state = ConcurrentPerPartitionCursor.get_global_state( 4066 child_state, parent_stream_name 4067 ) 4068 4069 if not extracted_parent_state and not isinstance(extracted_parent_state, dict): 4070 cursor_values = child_state.values() 4071 if cursor_values and len(cursor_values) == 1: 4072 incremental_sync_model: Union[ 4073 DatetimeBasedCursorModel, 4074 IncrementingCountCursorModel, 4075 ] = ( 4076 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 4077 if isinstance(model.stream, DeclarativeStreamModel) 4078 else self._get_state_delegating_stream_model( 4079 model.stream, parent_state=parent_state 4080 ).incremental_sync 4081 ) 4082 cursor_field = InterpolatedString.create( 4083 incremental_sync_model.cursor_field, 4084 parameters=incremental_sync_model.parameters or {}, 4085 ).eval(config) 4086 extracted_parent_state = AirbyteStateMessage( 4087 
type=AirbyteStateType.STREAM, 4088 stream=AirbyteStreamState( 4089 stream_descriptor=StreamDescriptor( 4090 name=parent_stream_name, namespace=None 4091 ), 4092 stream_state=AirbyteStateBlob( 4093 {cursor_field: list(cursor_values)[0]} 4094 ), 4095 ), 4096 ) 4097 return ConnectorStateManager([extracted_parent_state] if extracted_parent_state else []) 4098 4099 return ConnectorStateManager([]) 4100 4101 @staticmethod 4102 def create_wait_time_from_header( 4103 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 4104 ) -> WaitTimeFromHeaderBackoffStrategy: 4105 return WaitTimeFromHeaderBackoffStrategy( 4106 header=model.header, 4107 parameters=model.parameters or {}, 4108 config=config, 4109 regex=model.regex, 4110 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 4111 if model.max_waiting_time_in_seconds is not None 4112 else None, 4113 ) 4114 4115 @staticmethod 4116 def create_wait_until_time_from_header( 4117 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 4118 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 4119 return WaitUntilTimeFromHeaderBackoffStrategy( 4120 header=model.header, 4121 parameters=model.parameters or {}, 4122 config=config, 4123 min_wait=model.min_wait, 4124 regex=model.regex, 4125 ) 4126 4127 def get_message_repository(self) -> MessageRepository: 4128 return self._message_repository 4129 4130 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 4131 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 4132 4133 @staticmethod 4134 def create_components_mapping_definition( 4135 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 4136 ) -> ComponentMappingDefinition: 4137 interpolated_value = InterpolatedString.create( 4138 model.value, parameters=model.parameters or {} 4139 ) 4140 field_path = [ 4141 InterpolatedString.create(path, parameters=model.parameters or {}) 4142 for path in model.field_path 4143 ] 4144 return ComponentMappingDefinition( 
            field_path=field_path,  # type: ignore[arg-type] # field_path can be str and InterpolatedString
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            create_or_update=model.create_or_update,
            condition=model.condition,
            parameters=model.parameters or {},
        )

    def create_http_components_resolver(
        self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None
    ) -> Any:
        """
        Build an HttpComponentsResolver that resolves component values from an HTTP endpoint.

        :param model: the HttpComponentsResolver manifest model
        :param config: the user-provided connector config
        :param stream_name: optional name used for the retriever; a placeholder is used otherwise
        """
        retriever = self._create_component_from_model(
            model=model.retriever,
            config=config,
            name=f"{stream_name if stream_name else '__http_components_resolver'}",
            primary_key=None,
            stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config),
            transformations=[],
        )

        components_mapping = []
        for component_mapping_definition_model in model.components_mapping:
            # `condition` is only meaningful when resolving from config, not over HTTP.
            if component_mapping_definition_model.condition:
                raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
            components_mapping.append(
                self._create_component_from_model(
                    model=component_mapping_definition_model,
                    value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                        component_mapping_definition_model.value_type
                    ),
                    config=config,
                )
            )

        # NOTE(review): the stream slicer is constructed twice (here and for the retriever
        # above), yielding two distinct instances — confirm that is intentional before
        # deduplicating, since sharing one instance could change slicing behavior.
        return HttpComponentsResolver(
            retriever=retriever,
            stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config),
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_stream_config(
        model: StreamConfigModel, config: Config, **kwargs: Any
    ) -> StreamConfig:
        """Build a StreamConfig pointing at the config keys that define dynamic streams."""
        # Copy the pointer list so the pydantic model's own list is not shared with the component.
        model_configs_pointer: List[Union[InterpolatedString, str]] = (
            [x for x in model.configs_pointer] if model.configs_pointer else []
        )

        return StreamConfig(
            configs_pointer=model_configs_pointer,
            default_values=model.default_values,
            parameters=model.parameters or {},
        )

    def create_config_components_resolver(
        self,
        model: ConfigComponentsResolverModel,
        config: Config,
    ) -> Any:
        """Build a ConfigComponentsResolver that resolves component values from the connector config."""
        # stream_config may be a single model or a list; normalize to a list.
        model_stream_configs = (
            model.stream_config if isinstance(model.stream_config, list) else [model.stream_config]
        )

        stream_configs = [
            self._create_component_from_model(
                stream_config, config=config, parameters=model.parameters or {}
            )
            for stream_config in model_stream_configs
        ]

        components_mapping = [
            self._create_component_from_model(
                model=components_mapping_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    components_mapping_definition_model.value_type
                ),
                config=config,
                parameters=model.parameters,
            )
            for components_mapping_definition_model in model.components_mapping
        ]

        return ConfigComponentsResolver(
            stream_configs=stream_configs,
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )

    def create_parametrized_components_resolver(
        self,
        model: ParametrizedComponentsResolverModel,
        config: Config,
    ) -> ParametrizedComponentsResolver:
        """Build a ParametrizedComponentsResolver driven by a static list of stream parameters."""
        stream_parameters = StreamParametersDefinition(
            list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream
        )

        components_mapping = []
        for components_mapping_definition_model in model.components_mapping:
            # As above, `condition` is only supported by the config-based resolver.
            if components_mapping_definition_model.condition:
                raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
            components_mapping.append(
                self._create_component_from_model(
                    model=components_mapping_definition_model,
                    value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                        components_mapping_definition_model.value_type
                    ),
                    config=config,
                )
4257 ) 4258 return ParametrizedComponentsResolver( 4259 stream_parameters=stream_parameters, 4260 config=config, 4261 components_mapping=components_mapping, 4262 parameters=model.parameters or {}, 4263 ) 4264 4265 _UNSUPPORTED_DECODER_ERROR = ( 4266 "Specified decoder of {decoder_type} is not supported for pagination." 4267 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4268 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4269 ) 4270 4271 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4272 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4273 return True 4274 elif isinstance(decoder, CompositeRawDecoder): 4275 return self._is_supported_parser_for_pagination(decoder.parser) 4276 else: 4277 return False 4278 4279 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4280 if isinstance(parser, JsonParser): 4281 return True 4282 elif isinstance(parser, GzipParser): 4283 return isinstance(parser.inner_parser, JsonParser) 4284 else: 4285 return False 4286 4287 def create_http_api_budget( 4288 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4289 ) -> HttpAPIBudget: 4290 policies = [ 4291 self._create_component_from_model(model=policy, config=config) 4292 for policy in model.policies 4293 ] 4294 4295 return HttpAPIBudget( 4296 policies=policies, 4297 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4298 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4299 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4300 ) 4301 4302 def create_fixed_window_call_rate_policy( 4303 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4304 ) -> FixedWindowCallRatePolicy: 4305 matchers = [ 4306 self._create_component_from_model(model=matcher, config=config) 4307 for matcher in 
model.matchers 4308 ] 4309 4310 # Set the initial reset timestamp to 10 days from now. 4311 # This value will be updated by the first request. 4312 return FixedWindowCallRatePolicy( 4313 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4314 period=parse_duration(model.period), 4315 call_limit=model.call_limit, 4316 matchers=matchers, 4317 ) 4318 4319 def create_file_uploader( 4320 self, model: FileUploaderModel, config: Config, **kwargs: Any 4321 ) -> FileUploader: 4322 name = "File Uploader" 4323 requester = self._create_component_from_model( 4324 model=model.requester, 4325 config=config, 4326 name=name, 4327 **kwargs, 4328 ) 4329 download_target_extractor = self._create_component_from_model( 4330 model=model.download_target_extractor, 4331 config=config, 4332 name=name, 4333 **kwargs, 4334 ) 4335 emit_connector_builder_messages = self._emit_connector_builder_messages 4336 file_uploader = DefaultFileUploader( 4337 requester=requester, 4338 download_target_extractor=download_target_extractor, 4339 config=config, 4340 file_writer=NoopFileWriter() 4341 if emit_connector_builder_messages 4342 else LocalFileSystemFileWriter(), 4343 parameters=model.parameters or {}, 4344 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4345 ) 4346 4347 return ( 4348 ConnectorBuilderFileUploader(file_uploader) 4349 if emit_connector_builder_messages 4350 else file_uploader 4351 ) 4352 4353 def create_moving_window_call_rate_policy( 4354 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4355 ) -> MovingWindowCallRatePolicy: 4356 rates = [ 4357 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4358 ] 4359 matchers = [ 4360 self._create_component_from_model(model=matcher, config=config) 4361 for matcher in model.matchers 4362 ] 4363 return MovingWindowCallRatePolicy( 4364 rates=rates, 4365 matchers=matchers, 4366 ) 4367 4368 def create_unlimited_call_rate_policy( 4369 self, 
        model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> UnlimitedCallRatePolicy:
        """Build an UnlimitedCallRatePolicy (no throttling) for the matched requests."""
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]

        return UnlimitedCallRatePolicy(
            matchers=matchers,
        )

    def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate:
        """Build a Rate; `limit` may be an interpolated string evaluated against the config."""
        interpolated_limit = InterpolatedString.create(str(model.limit), parameters={})
        return Rate(
            limit=int(interpolated_limit.eval(config=config)),
            interval=parse_duration(model.interval),
        )

    def create_http_request_matcher(
        self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any
    ) -> HttpRequestRegexMatcher:
        """
        Build an HttpRequestRegexMatcher.

        :raises ValueError: if the (possibly interpolated) weight evaluates to less than 1
        """
        weight = model.weight
        if weight is not None:
            # weight may be an interpolated string; coerce to int either way
            if isinstance(weight, str):
                weight = int(InterpolatedString.create(weight, parameters={}).eval(config))
            else:
                weight = int(weight)
            if weight < 1:
                raise ValueError(f"weight must be >= 1, got {weight}")
        return HttpRequestRegexMatcher(
            method=model.method,
            url_base=model.url_base,
            url_path_pattern=model.url_path_pattern,
            params=model.params,
            headers=model.headers,
            weight=weight,
        )

    def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None:
        """Side effect only: build and store the API budget used by subsequently created requesters."""
        self._api_budget = self.create_component(
            model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config
        )

    def create_grouping_partition_router(
        self,
        model: GroupingPartitionRouterModel,
        config: Config,
        *,
        stream_name: str,
        **kwargs: Any,
    ) -> GroupingPartitionRouter:
        """
        Build a GroupingPartitionRouter that batches partitions of an underlying router.

        :raises ValueError: if group_size < 1 or the underlying router uses request options
        """
        underlying_router = self._create_component_from_model(
            model=model.underlying_partition_router,
            config=config,
            stream_name=stream_name,
            **kwargs,
        )
        if model.group_size < 1:
            raise ValueError(f"Group size must be greater than 0, got {model.group_size}")

        # Request options in underlying partition routers are not supported for GroupingPartitionRouter
        # because they are specific to individual partitions and cannot be aggregated or handled
        # when grouping, potentially leading to incorrect API calls. Any request customization
        # should be managed at the stream level through the requester's configuration.
        if isinstance(underlying_router, SubstreamPartitionRouter):
            if any(
                parent_config.request_option
                for parent_config in underlying_router.parent_stream_configs
            ):
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        if isinstance(underlying_router, ListPartitionRouter):
            if underlying_router.request_option:
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        return GroupingPartitionRouter(
            group_size=model.group_size,
            underlying_partition_router=underlying_router,
            # deduplication defaults to on when the manifest does not specify it
            deduplicate=model.deduplicate if model.deduplicate is not None else True,
            config=config,
        )

    def _ensure_query_properties_to_model(
        self, requester: Union[HttpRequesterModel, CustomRequesterModel]
    ) -> None:
        """
        For some reason, it seems like CustomRequesterModel request_parameters stays as dictionaries which means that
        the other conditions relying on it being QueryPropertiesModel instead of a dict fail. Here, we migrate them to
        proper model.
        """
        if not hasattr(requester, "request_parameters"):
            return

        request_parameters = requester.request_parameters
        if request_parameters and isinstance(request_parameters, Dict):
            # Mutates the mapping in place: raw dicts tagged as QueryProperties are
            # re-parsed into QueryPropertiesModel instances.
            for request_parameter_key in request_parameters.keys():
                request_parameter = request_parameters[request_parameter_key]
                if (
                    isinstance(request_parameter, Dict)
                    and request_parameter.get("type") == "QueryProperties"
                ):
                    request_parameters[request_parameter_key] = QueryPropertiesModel.parse_obj(
                        request_parameter
                    )

    def _get_catalog_defined_cursor_field(
        self, stream_name: str, allow_catalog_defined_cursor_field: bool
    ) -> Optional[CursorField]:
        """
        Return the cursor field declared in the configured catalog for `stream_name`,
        or None when disabled, absent, or empty.

        :raises ValueError: if the catalog declares a nested (multi-element) cursor_field
        """
        if not allow_catalog_defined_cursor_field:
            return None

        configured_stream = self._stream_name_to_configured_stream.get(stream_name)

        # Depending on the operation being performed, there may not be a configured stream yet. In this
        # case we return None which will then use the default cursor field defined on the cursor model.
        # We also treat cursor_field: [""] (list with empty string) as no cursor field, since this can
        # occur when the platform serializes "no cursor configured" streams incorrectly.
        if (
            not configured_stream
            or not configured_stream.cursor_field
            or not configured_stream.cursor_field[0]
        ):
            return None
        elif len(configured_stream.cursor_field) > 1:
            raise ValueError(
                f"The `{stream_name}` stream does not support nested cursor_field. Please specify only a single cursor_field for the stream in the configured catalog."
            )
        else:
            return CursorField(
                cursor_field_key=configured_stream.cursor_field[0],
                supports_catalog_defined_cursor_field=allow_catalog_defined_cursor_field,
            )
class ModelToComponentFactory:
    """
    Translates parsed declarative-manifest (Pydantic) models into the runtime
    components that execute a sync, via a model-type -> constructor registry.
    """

    # strftime-style format token meaning "seconds since the epoch"
    EPOCH_DATETIME_FORMAT = "%s"

    def __init__(
        self,
        limit_pages_fetched_per_slice: Optional[int] = None,
        limit_slices_fetched: Optional[int] = None,
        emit_connector_builder_messages: bool = False,
        disable_retries: bool = False,
        disable_cache: bool = False,
        message_repository: Optional[MessageRepository] = None,
        connector_state_manager: Optional[ConnectorStateManager] = None,
        max_concurrent_async_job_count: Optional[int] = None,
        configured_catalog: Optional[ConfiguredAirbyteCatalog] = None,
        api_budget: Optional[APIBudget] = None,
    ):
        self._init_mappings()
        self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
        self._limit_slices_fetched = limit_slices_fetched
        self._emit_connector_builder_messages = emit_connector_builder_messages
        self._disable_retries = disable_retries
        self._disable_cache = disable_cache
        # Default repository logs at DEBUG in connector-builder mode, INFO otherwise.
        self._message_repository = message_repository or InMemoryMessageRepository(
            self._evaluate_log_level(emit_connector_builder_messages)
        )
        self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream(
            configured_catalog
        )
        self._connector_state_manager = connector_state_manager or ConnectorStateManager()
        # NOTE: annotation tightened from `Optional[Union[APIBudget]]` (single-arg Union is a no-op).
        self._api_budget: Optional[APIBudget] = api_budget
        self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1)
        # placeholder for deprecation warnings
        self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []

    def _init_mappings(self) -> None:
        # Registry mapping each manifest model type to the factory method that builds
        # its runtime component. Custom* models all route through create_custom_component.
        self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = {
            AddedFieldDefinitionModel: self.create_added_field_definition,
            AddFieldsModel: self.create_add_fields,
            ApiKeyAuthenticatorModel: self.create_api_key_authenticator,
            BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
            BearerAuthenticatorModel: self.create_bearer_authenticator,
            CheckStreamModel: self.create_check_stream,
            DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config,
            CheckDynamicStreamModel: self.create_check_dynamic_stream,
            CompositeErrorHandlerModel: self.create_composite_error_handler,
            ConcurrencyLevelModel: self.create_concurrency_level,
            ConfigMigrationModel: self.create_config_migration,
            ConfigAddFieldsModel: self.create_config_add_fields,
            ConfigRemapFieldModel: self.create_config_remap_field,
            ConfigRemoveFieldsModel: self.create_config_remove_fields,
            ConstantBackoffStrategyModel: self.create_constant_backoff_strategy,
            CsvDecoderModel: self.create_csv_decoder,
            CursorPaginationModel: self.create_cursor_pagination,
            CustomAuthenticatorModel: self.create_custom_component,
            CustomBackoffStrategyModel: self.create_custom_component,
            CustomDecoderModel: self.create_custom_component,
            CustomErrorHandlerModel: self.create_custom_component,
            CustomRecordExtractorModel: self.create_custom_component,
            CustomRecordFilterModel: self.create_custom_component,
            CustomRequesterModel: self.create_custom_component,
            CustomRetrieverModel: self.create_custom_component,
            CustomSchemaLoader: self.create_custom_component,
            CustomSchemaNormalizationModel: self.create_custom_component,
            CustomStateMigration: self.create_custom_component,
            CustomPaginationStrategyModel: self.create_custom_component,
            CustomPartitionRouterModel: self.create_custom_component,
            CustomTransformationModel: self.create_custom_component,
            CustomValidationStrategyModel: self.create_custom_component,
            CustomConfigTransformationModel: self.create_custom_component,
            DeclarativeStreamModel: self.create_default_stream,
            DefaultErrorHandlerModel: self.create_default_error_handler,
            DefaultPaginatorModel: self.create_default_paginator,
            DpathExtractorModel: self.create_dpath_extractor,
            DpathValidatorModel: self.create_dpath_validator,
            ResponseToFileExtractorModel: self.create_response_to_file_extractor,
            ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy,
            SessionTokenAuthenticatorModel: self.create_session_token_authenticator,
            GroupByKeyMergeStrategyModel: self.create_group_by_key,
            HttpRequesterModel: self.create_http_requester,
            HttpResponseFilterModel: self.create_http_response_filter,
            InlineSchemaLoaderModel: self.create_inline_schema_loader,
            JsonDecoderModel: self.create_json_decoder,
            JsonlDecoderModel: self.create_jsonl_decoder,
            JsonSchemaPropertySelectorModel: self.create_json_schema_property_selector,
            GzipDecoderModel: self.create_gzip_decoder,
            KeysToLowerModel: self.create_keys_to_lower_transformation,
            KeysToSnakeCaseModel: self.create_keys_to_snake_transformation,
            KeysReplaceModel: self.create_keys_replace_transformation,
            FlattenFieldsModel: self.create_flatten_fields,
            DpathFlattenFieldsModel: self.create_dpath_flatten_fields,
            IterableDecoderModel: self.create_iterable_decoder,
            XmlDecoderModel: self.create_xml_decoder,
            JsonFileSchemaLoaderModel: self.create_json_file_schema_loader,
            DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
            SchemaTypeIdentifierModel: self.create_schema_type_identifier,
            TypesMapModel: self.create_types_map,
            ComplexFieldTypeModel: self.create_complex_field_type,
            JwtAuthenticatorModel: self.create_jwt_authenticator,
            LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
            ListPartitionRouterModel: self.create_list_partition_router,
            MinMaxDatetimeModel: self.create_min_max_datetime,
            NoAuthModel: self.create_no_auth,
            NoPaginationModel: self.create_no_pagination,
            OAuthAuthenticatorModel: self.create_oauth_authenticator,
            OffsetIncrementModel: self.create_offset_increment,
            PageIncrementModel: self.create_page_increment,
            ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper,
            PredicateValidatorModel: self.create_predicate_validator,
            PropertiesFromEndpointModel: self.create_properties_from_endpoint,
            PropertyChunkingModel: self.create_property_chunking,
            QueryPropertiesModel: self.create_query_properties,
            RecordFilterModel: self.create_record_filter,
            RecordSelectorModel: self.create_record_selector,
            RemoveFieldsModel: self.create_remove_fields,
            RequestPathModel: self.create_request_path,
            RequestOptionModel: self.create_request_option,
            LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator,
            SelectiveAuthenticatorModel: self.create_selective_authenticator,
            SimpleRetrieverModel: self.create_simple_retriever,
            StateDelegatingStreamModel: self.create_state_delegating_stream,
            SpecModel: self.create_spec,
            SubstreamPartitionRouterModel: self.create_substream_partition_router,
            ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema,
            WaitTimeFromHeaderModel: self.create_wait_time_from_header,
            WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header,
            AsyncRetrieverModel: self.create_async_retriever,
            HttpComponentsResolverModel: self.create_http_components_resolver,
            ConfigComponentsResolverModel: self.create_config_components_resolver,
            ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver,
            StreamConfigModel: self.create_stream_config,
            ComponentMappingDefinitionModel: self.create_components_mapping_definition,
            ZipfileDecoderModel: self.create_zipfile_decoder,
            HTTPAPIBudgetModel: self.create_http_api_budget,
            FileUploaderModel: self.create_file_uploader,
            FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
            MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
            UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}

    @staticmethod
    def _create_stream_name_to_configured_stream(
        configured_catalog: Optional[ConfiguredAirbyteCatalog],
    ) -> Mapping[str, ConfiguredAirbyteStream]:
        """Index the configured catalog's streams by stream name (empty when no catalog)."""
        return (
            {stream.stream.name: stream for stream in configured_catalog.streams}
            if configured_catalog
            else {}
        )

    def create_component(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        config: Config,
        **kwargs: Any,
    ) -> Any:
        """
        Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and
        subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then
        creating declarative components from that model.
844 845 :param model_type: The type of declarative component that is being initialized 846 :param component_definition: The mapping that represents a declarative component 847 :param config: The connector config that is provided by the customer 848 :return: The declarative component to be used at runtime 849 """ 850 851 component_type = component_definition.get("type") 852 if component_definition.get("type") != model_type.__name__: 853 raise ValueError( 854 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 855 ) 856 857 declarative_component_model = model_type.parse_obj(component_definition) 858 859 if not isinstance(declarative_component_model, model_type): 860 raise ValueError( 861 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 862 ) 863 864 return self._create_component_from_model( 865 model=declarative_component_model, config=config, **kwargs 866 ) 867 868 def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: 869 if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: 870 raise ValueError( 871 f"{model.__class__} with attributes {model} is not a valid component type" 872 ) 873 component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) 874 if not component_constructor: 875 raise ValueError(f"Could not find constructor for {model.__class__}") 876 877 # collect deprecation warnings for supported models. 878 if isinstance(model, BaseModelWithDeprecations): 879 self._collect_model_deprecations(model) 880 881 return component_constructor(model=model, config=config, **kwargs) 882 883 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 884 """ 885 Returns the deprecation warnings that were collected during the creation of components. 
        """
        return self._collected_deprecation_logs

    def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None:
        """
        Collects deprecation logs from the given model and appends any new logs to the internal collection.

        This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided.

        Args:
            model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs.
        """
        if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None:
            for log in model._deprecation_logs:
                # avoid duplicates for deprecation logs observed.
                if log not in self._collected_deprecation_logs:
                    self._collected_deprecation_logs.append(log)

    def create_config_migration(
        self, model: ConfigMigrationModel, config: Config
    ) -> ConfigMigration:
        """Build a ConfigMigration composed of its ordered config transformations."""
        transformations: List[ConfigTransformation] = [
            self._create_component_from_model(transformation, config)
            for transformation in model.transformations
        ]

        return ConfigMigration(
            description=model.description,
            transformations=transformations,
        )

    def create_config_add_fields(
        self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigAddFields:
        """Build a ConfigAddFields transformation; an absent condition means 'always apply'."""
        fields = [self._create_component_from_model(field, config) for field in model.fields]
        return ConfigAddFields(
            fields=fields,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remove_fields(
        model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigRemoveFields:
        """Build a ConfigRemoveFields transformation; an absent condition means 'always apply'."""
        return ConfigRemoveFields(
            field_pointers=model.field_pointers,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remap_field(
        model: ConfigRemapFieldModel, config: Config, **kwargs: Any
    ) -> ConfigRemapField:
        """Build a ConfigRemapField that rewrites a config value through a lookup map."""
        mapping = cast(Mapping[str, Any], model.map)
        return ConfigRemapField(
            map=mapping,
            field_path=model.field_path,
            config=config,
        )

    def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator:
        """Build a DpathValidator applying the configured strategy at a dpath location."""
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return DpathValidator(
            field_path=model.field_path,
            strategy=strategy,
        )

    def create_predicate_validator(
        self, model: PredicateValidatorModel, config: Config
    ) -> PredicateValidator:
        """Build a PredicateValidator applying the configured strategy to a value."""
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return PredicateValidator(
            value=model.value,
            strategy=strategy,
        )

    @staticmethod
    def create_validate_adheres_to_schema(
        model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any
    ) -> ValidateAdheresToSchema:
        """Build a ValidateAdheresToSchema strategy from the declared base schema."""
        base_schema = cast(Mapping[str, Any], model.base_schema)
        return ValidateAdheresToSchema(
            schema=base_schema,
        )

    @staticmethod
    def create_added_field_definition(
        model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
    ) -> AddedFieldDefinition:
        """Build an AddedFieldDefinition whose value is interpolated at runtime."""
        interpolated_value = InterpolatedString.create(
            model.value, parameters=model.parameters or {}
        )
        return AddedFieldDefinition(
            path=model.path,
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            parameters=model.parameters or {},
        )

    def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields:
        """Build an AddFields record transformation from its field definitions."""
        added_field_definitions = [
            self._create_component_from_model(
                model=added_field_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    added_field_definition_model.value_type
                ),
                config=config,
            )
            for added_field_definition_model in model.fields
        ]
        return AddFields(
            fields=added_field_definitions,
            condition=model.condition or "",
            parameters=model.parameters or {},
        )

    def create_keys_to_lower_transformation(
        self, model: KeysToLowerModel, config: Config, **kwargs: Any
    ) -> KeysToLowerTransformation:
        # Stateless transformation: the model carries no configuration.
        return KeysToLowerTransformation()

    def create_keys_to_snake_transformation(
        self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any
    ) -> KeysToSnakeCaseTransformation:
        # Stateless transformation: the model carries no configuration.
        return KeysToSnakeCaseTransformation()

    def create_keys_replace_transformation(
        self, model: KeysReplaceModel, config: Config, **kwargs: Any
    ) -> KeysReplaceTransformation:
        """Build a KeysReplaceTransformation substituting `old` with `new` in record keys."""
        return KeysReplaceTransformation(
            old=model.old, new=model.new, parameters=model.parameters or {}
        )

    def create_flatten_fields(
        self, model: FlattenFieldsModel, config: Config, **kwargs: Any
    ) -> FlattenFields:
        """Build a FlattenFields transformation; list flattening defaults to enabled."""
        return FlattenFields(
            flatten_lists=model.flatten_lists if model.flatten_lists is not None else True
        )

    def create_dpath_flatten_fields(
        self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
    ) -> DpathFlattenFields:
        """Build a DpathFlattenFields transformation with optional key prefix/suffix rewriting."""
        # Copy the field path so the pydantic model's list is not shared with the component.
        model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path]
        key_transformation = (
            KeyTransformation(
                config=config,
                prefix=model.key_transformation.prefix,
                suffix=model.key_transformation.suffix,
                parameters=model.parameters or {},
            )
            if model.key_transformation is not None
            else None
        )
        return DpathFlattenFields(
            config=config,
            field_path=model_field_path,
            delete_origin_value=model.delete_origin_value
            if model.delete_origin_value is not None
            else False,
            replace_record=model.replace_record if model.replace_record is not None else False,
            key_transformation=key_transformation,
            parameters=model.parameters or {},
        )

    @staticmethod
    def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
        """Map a declarative ValueType enum member to the corresponding Python type.

        Returns None when no value_type was specified, so callers can skip coercion.
        """
        if not value_type:
            return None
        names_to_types = {
            ValueType.string: str,
            ValueType.number: float,
            ValueType.integer: int,
            ValueType.boolean: bool,
        }
        return names_to_types[value_type]

    def create_api_key_authenticator(
        self,
        model: ApiKeyAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> ApiKeyAuthenticator:
        """Build an ApiKeyAuthenticator.

        Exactly one of `inject_into` (preferred) or the deprecated `header` option must be set
        on the model. When an external `token_provider` is supplied, the model's `api_token`
        must be the empty string — it would otherwise be silently ignored.
        """
        if model.inject_into is None and model.header is None:
            raise ValueError(
                "Expected either inject_into or header to be set for ApiKeyAuthenticator"
            )

        if model.inject_into is not None and model.header is not None:
            raise ValueError(
                "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option"
            )

        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )

        # Fall back to a header-based RequestOption when only the deprecated `header` is given.
        request_option = (
            self._create_component_from_model(
                model.inject_into, config, parameters=model.parameters or {}
            )
            if model.inject_into
            else RequestOption(
                inject_into=RequestOptionType.header,
                field_name=model.header or "",
                parameters=model.parameters or {},
            )
        )

        return ApiKeyAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            request_option=request_option,
            config=config,
            parameters=model.parameters or {},
        )

    def create_legacy_to_per_partition_state_migration(
        self,
        model: LegacyToPerPartitionStateMigrationModel,
        config: Mapping[str, Any],
        declarative_stream: DeclarativeStreamModel,
    ) -> LegacyToPerPartitionStateMigration:
        """Build a LegacyToPerPartitionStateMigration after validating the stream shape.

        The migration only applies to streams that use a Simple/Async retriever with a
        substream (or custom) partition router carrying parent stream configs, and that
        have an incremental_sync configuration.
        """
        retriever = declarative_stream.retriever
        if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
            )
        partition_router = retriever.partition_router
        if not isinstance(
            partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
        ):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
            )
        if not hasattr(partition_router, "parent_stream_configs"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
            )

        if not hasattr(declarative_stream, "incremental_sync"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
            )

        return LegacyToPerPartitionStateMigration(
            partition_router,  # type: ignore # was already checked above
            declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
            config,
            declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
        )

    def create_session_token_authenticator(
        self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
    ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
        """Build an authenticator that first logs in to obtain a session token.

        A dedicated login requester fetches the session token (optionally refreshed after
        `expiration_duration`); the token is then injected either as a Bearer header or via
        an ApiKey-style request option, depending on `request_authentication.type`.
        """
        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        login_requester = self._create_component_from_model(
            model=model.login_requester,
            config=config,
            name=f"{name}_login_requester",
            decoder=decoder,
        )
        token_provider = SessionTokenProvider(
            login_requester=login_requester,
            session_token_path=model.session_token_path,
            expiration_duration=parse_duration(model.expiration_duration)
            if model.expiration_duration
            else None,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            decoder=decoder,
        )
        if model.request_authentication.type == "Bearer":
            return ModelToComponentFactory.create_bearer_authenticator(
                BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
                config,
                token_provider=token_provider,
            )
        else:
            # Get the api_token template if specified, default to just the session token
            api_token_template = (
                getattr(model.request_authentication, "api_token", None) or "{{ session_token }}"
            )
            final_token_provider: TokenProvider = InterpolatedSessionTokenProvider(
                config=config,
                api_token=api_token_template,
                session_token_provider=token_provider,
                parameters=model.parameters or {},
            )
            return self.create_api_key_authenticator(
                ApiKeyAuthenticatorModel(
                    type="ApiKeyAuthenticator",
                    api_token="",
                    inject_into=model.request_authentication.inject_into,
                ),  # type: ignore # $parameters and headers default to None
                config=config,
                token_provider=final_token_provider,
            )

    @staticmethod
    def create_basic_http_authenticator(
        model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
    ) -> BasicHttpAuthenticator:
        """Build a BasicHttpAuthenticator; an absent password defaults to the empty string."""
        return BasicHttpAuthenticator(
            password=model.password or "",
            username=model.username,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_bearer_authenticator(
        model: BearerAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> BearerAuthenticator:
        """Build a BearerAuthenticator.

        When an external `token_provider` is supplied, the model's `api_token` must be the
        empty string — it would otherwise be silently ignored.
        """
        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
1221 ) 1222 return BearerAuthenticator( 1223 token_provider=( 1224 token_provider 1225 if token_provider is not None 1226 else InterpolatedStringTokenProvider( 1227 api_token=model.api_token or "", 1228 config=config, 1229 parameters=model.parameters or {}, 1230 ) 1231 ), 1232 config=config, 1233 parameters=model.parameters or {}, 1234 ) 1235 1236 @staticmethod 1237 def create_dynamic_stream_check_config( 1238 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1239 ) -> DynamicStreamCheckConfig: 1240 return DynamicStreamCheckConfig( 1241 dynamic_stream_name=model.dynamic_stream_name, 1242 stream_count=model.stream_count or 0, 1243 ) 1244 1245 def create_check_stream( 1246 self, model: CheckStreamModel, config: Config, **kwargs: Any 1247 ) -> CheckStream: 1248 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1249 raise ValueError( 1250 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1251 ) 1252 1253 dynamic_streams_check_configs = ( 1254 [ 1255 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1256 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1257 ] 1258 if model.dynamic_streams_check_configs 1259 else [] 1260 ) 1261 1262 return CheckStream( 1263 stream_names=model.stream_names or [], 1264 dynamic_streams_check_configs=dynamic_streams_check_configs, 1265 parameters={}, 1266 ) 1267 1268 @staticmethod 1269 def create_check_dynamic_stream( 1270 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1271 ) -> CheckDynamicStream: 1272 assert model.use_check_availability is not None # for mypy 1273 1274 use_check_availability = model.use_check_availability 1275 1276 return CheckDynamicStream( 1277 stream_count=model.stream_count, 1278 use_check_availability=use_check_availability, 1279 parameters={}, 1280 ) 1281 1282 def create_composite_error_handler( 1283 self, model: CompositeErrorHandlerModel, config: Config, 
**kwargs: Any 1284 ) -> CompositeErrorHandler: 1285 error_handlers = [ 1286 self._create_component_from_model(model=error_handler_model, config=config) 1287 for error_handler_model in model.error_handlers 1288 ] 1289 return CompositeErrorHandler( 1290 error_handlers=error_handlers, parameters=model.parameters or {} 1291 ) 1292 1293 @staticmethod 1294 def create_concurrency_level( 1295 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1296 ) -> ConcurrencyLevel: 1297 return ConcurrencyLevel( 1298 default_concurrency=model.default_concurrency, 1299 max_concurrency=model.max_concurrency, 1300 config=config, 1301 parameters={}, 1302 ) 1303 1304 @staticmethod 1305 def apply_stream_state_migrations( 1306 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1307 ) -> MutableMapping[str, Any]: 1308 if stream_state_migrations: 1309 for state_migration in stream_state_migrations: 1310 if state_migration.should_migrate(stream_state): 1311 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
                    stream_state = dict(state_migration.migrate(stream_state))
        return stream_state

    def create_concurrent_cursor_from_datetime_based_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: MutableMapping[str, Any],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        runtime_lookback_window: Optional[datetime.timedelta] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        """Build a ConcurrentCursor from a DatetimeBasedCursor component definition.

        Resolves the cursor field (catalog-defined or interpolated from the model), the
        slice boundary fields, lookback window, step/granularity, start/end datetimes and
        optional DAY/WEEK/MONTH clamping, then assembles the concurrent cursor.

        Note: `stream_state` may be mutated in place when `runtime_lookback_window` is
        applied, and `component_definition` may gain a `$parameters` key (see FIXME below).
        """
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
        # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
        # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        model_parameters = datetime_based_cursor_model.parameters or {}

        # Prefer a cursor field pinned by the configured catalog, when allowed.
        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                datetime_based_cursor_model.cursor_field,
                parameters=model_parameters,
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        interpolated_partition_field_start = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_start or "start_time",
            parameters=model_parameters,
        )
        interpolated_partition_field_end = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_end or "end_time",
            parameters=model_parameters,
        )

        slice_boundary_fields = (
            interpolated_partition_field_start.eval(config=config),
            interpolated_partition_field_end.eval(config=config),
        )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        lookback_window = None
        interpolated_lookback_window = (
            InterpolatedString.create(
                datetime_based_cursor_model.lookback_window,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.lookback_window
            else None
        )
        if interpolated_lookback_window:
            evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
            if evaluated_lookback_window:
                lookback_window = parse_duration(evaluated_lookback_window)

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Adjusts the stream state by applying the runtime lookback window.
        # This is used to ensure correct state handling in case of failed partitions.
        stream_state_value = stream_state.get(cursor_field.cursor_field_key)
        if runtime_lookback_window and stream_state_value:
            new_stream_state = (
                connector_state_converter.parse_timestamp(stream_state_value)
                - runtime_lookback_window
            )
            stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
                new_stream_state
            )

        start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
        if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
            start_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.start_datetime, config=config
            )
        else:
            start_date_runtime_value = datetime_based_cursor_model.start_datetime

        end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]]
        if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel):
            end_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.end_datetime, config=config
            )
        else:
            end_date_runtime_value = datetime_based_cursor_model.end_datetime

        interpolated_start_date = MinMaxDatetime.create(
            interpolated_string_or_min_max_datetime=start_date_runtime_value,
            parameters=datetime_based_cursor_model.parameters,
        )
        interpolated_end_date = (
            None
            if not end_date_runtime_value
            else MinMaxDatetime.create(
                end_date_runtime_value, datetime_based_cursor_model.parameters
            )
        )

        # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
        if not interpolated_start_date.datetime_format:
            interpolated_start_date.datetime_format = datetime_format
        if interpolated_end_date and not interpolated_end_date.datetime_format:
            interpolated_end_date.datetime_format = datetime_format

        start_date = interpolated_start_date.get_datetime(config=config)
        end_date_provider = (
            partial(interpolated_end_date.get_datetime, config)
            if interpolated_end_date
            else connector_state_converter.get_end_provider()
        )

        # step and cursor_granularity must be provided together (both or neither).
        if (
            datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity
        ) or (
            not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity
        ):
            raise ValueError(
                f"If step is defined, cursor_granularity should be as well and vice-versa. "
                f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`"
            )

        # When step is not defined, default to a step size from the starting date to the present moment
        step_length = datetime.timedelta.max
        interpolated_step = (
            InterpolatedString.create(
                datetime_based_cursor_model.step,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.step
            else None
        )
        if interpolated_step:
            evaluated_step = interpolated_step.eval(config)
            if evaluated_step:
                step_length = parse_duration(evaluated_step)

        clamping_strategy: ClampingStrategy = NoClamping()
        if datetime_based_cursor_model.clamping:
            # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
            # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime
            # object which we want to keep agnostic of being low-code
            target = InterpolatedString(
                string=datetime_based_cursor_model.clamping.target,
                parameters=model_parameters,
            )
            evaluated_target = target.eval(config=config)
            match evaluated_target:
                case "DAY":
                    clamping_strategy = DayClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        DayClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(seconds=1),
                    )
                case "WEEK":
                    if (
                        not datetime_based_cursor_model.clamping.target_details
                        or "weekday" not in datetime_based_cursor_model.clamping.target_details
                    ):
                        raise ValueError(
                            "Given WEEK clamping, weekday needs to be provided as target_details"
                        )
                    weekday = self._assemble_weekday(
                        datetime_based_cursor_model.clamping.target_details["weekday"]
                    )
                    clamping_strategy = WeekClampingStrategy(weekday)
                    end_date_provider = ClampingEndProvider(
                        WeekClampingStrategy(weekday, is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case "MONTH":
                    clamping_strategy = MonthClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        MonthClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case _:
                    raise ValueError(
                        f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
                    )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=slice_boundary_fields,
            start=start_date,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            end_provider=end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            lookback_window=lookback_window,
            slice_range=step_length,
            cursor_granularity=cursor_granularity,
            clamping_strategy=clamping_strategy,
        )

    def create_concurrent_cursor_from_incrementing_count_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: MutableMapping[str, Any],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        """Build a ConcurrentCursor from an IncrementingCountCursor component definition."""
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        incrementing_count_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(incrementing_count_cursor_model,
                          IncrementingCountCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}"
            )

        start_value: Union[int, str, None] = incrementing_count_cursor_model.start_value
        # Pydantic Union type coercion can convert int 0 to string '0' depending on Union order.
        # We need to handle both int and str representations of numeric values.
        # Evaluate the InterpolatedString and convert to int for the ConcurrentCursor.
        if start_value is not None:
            interpolated_start_value = InterpolatedString.create(
                str(start_value),  # Ensure we pass a string to InterpolatedString.create
                parameters=incrementing_count_cursor_model.parameters or {},
            )
            evaluated_start_value: int = int(interpolated_start_value.eval(config=config))
        else:
            evaluated_start_value = 0

        # Prefer a cursor field pinned by the configured catalog, when allowed.
        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                incrementing_count_cursor_model.cursor_field,
                parameters=incrementing_count_cursor_model.parameters or {},
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        connector_state_converter = IncrementingCountStreamStateConverter(
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
        )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=None,
            start=evaluated_start_value,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            end_provider=connector_state_converter.get_end_provider(),  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        )

    def _assemble_weekday(self, weekday: str) -> Weekday:
        """Translate an uppercase weekday name into the Weekday enum.

        Raises:
            ValueError: If the name is not one of MONDAY..SUNDAY.
        """
        match weekday:
            case "MONDAY":
                return Weekday.MONDAY
            case "TUESDAY":
                return Weekday.TUESDAY
            case "WEDNESDAY":
                return Weekday.WEDNESDAY
            case "THURSDAY":
                return Weekday.THURSDAY
            case "FRIDAY":
                return Weekday.FRIDAY
            case "SATURDAY":
                return Weekday.SATURDAY
            case "SUNDAY":
                return Weekday.SUNDAY
            case _:
                raise ValueError(f"Unknown weekday {weekday}")

    def create_concurrent_cursor_from_perpartition_cursor(
        self,
        state_manager: ConnectorStateManager,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        config: Config,
        stream_state: MutableMapping[str, Any],
        partition_router: PartitionRouter,
        attempt_to_create_cursor_if_not_provided: bool = False,
        **kwargs: Any,
    ) -> ConcurrentPerPartitionCursor:
        """Build a ConcurrentPerPartitionCursor that creates one datetime-based cursor per partition."""
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model.
        # This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
        # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
        # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        # Prefer a cursor field pinned by the configured catalog, when allowed.
        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                datetime_based_cursor_model.cursor_field,
                # See the FIXME above: `parameters` vs `$parameters` depends on where the
                # ComponentDefinition came from.
                parameters=datetime_based_cursor_model.parameters or {},
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Create the cursor factory; per-partition cursors use a no-op message repository so
        # only the parent cursor emits state.
        cursor_factory = ConcurrentCursorFactory(
            partial(
                self.create_concurrent_cursor_from_datetime_based_cursor,
                state_manager=state_manager,
                model_type=model_type,
                component_definition=component_definition,
                stream_name=stream_name,
                stream_namespace=stream_namespace,
                config=config,
                message_repository=NoopMessageRepository(),
            )
        )

        # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
        use_global_cursor = isinstance(
            partition_router, GroupingPartitionRouter
        ) or component_definition.get("global_substream_cursor", False)

        # Return the concurrent cursor and state converter
        return ConcurrentPerPartitionCursor(
            cursor_factory=cursor_factory,
            partition_router=partition_router,
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=self._message_repository,  # type: ignore
            connector_state_manager=state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            use_global_cursor=use_global_cursor,
            attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided,
        )

    @staticmethod
    def create_constant_backoff_strategy(
        model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
    ) -> ConstantBackoffStrategy:
        """Build a ConstantBackoffStrategy from its declarative model."""
        return ConstantBackoffStrategy(
            backoff_time_in_seconds=model.backoff_time_in_seconds,
            config=config,
            parameters=model.parameters or {},
        )

    def create_cursor_pagination(
        self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any
    ) -> CursorPaginationStrategy:
        """Build a CursorPaginationStrategy, wrapping the decoder for pagination use.

        Raises:
            ValueError: If the (inner) decoder type is not supported for pagination.
        """
        if isinstance(decoder, PaginationDecoderDecorator):
            inner_decoder = decoder.decoder
        else:
            inner_decoder = decoder
            decoder = PaginationDecoderDecorator(decoder=decoder)

        if self._is_supported_decoder_for_pagination(inner_decoder):
            decoder_to_use = decoder
        else:
            raise ValueError(
                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
            )

        # Pydantic v1 Union type coercion can convert int to string depending on Union order.
        # If page_size is a string that represents an integer (not an interpolation), convert it back.
        page_size = model.page_size
        if isinstance(page_size, str) and page_size.isdigit():
            page_size = int(page_size)

        return CursorPaginationStrategy(
            cursor_value=model.cursor_value,
            decoder=decoder_to_use,
            page_size=page_size,
            stop_condition=model.stop_condition,
            config=config,
            parameters=model.parameters or {},
        )

    def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any:
        """
        Generically creates a custom component based on the model type and a class_name reference to the custom Python class being
        instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor

        :param model: The Pydantic model of the custom component being created
        :param config: The custom defined connector config
        :return: The declarative component built from the Pydantic model to be used at runtime
        """
        custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name)
        component_fields = get_type_hints(custom_component_class)
        model_args = model.dict()
        model_args["config"] = config

        # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions
        # we defer to these arguments over the component's definition
        for key, arg in kwargs.items():
            model_args[key] = arg

        # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not
        # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to
        # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components
        for model_field, model_value in model_args.items():
            # If a custom component field doesn't have a type set, we try to use the type hints to infer the type
            if (
                isinstance(model_value, dict)
                and "type" not in model_value
                and model_field in component_fields
            ):
                derived_type = self._derive_component_type_from_type_hints(
                    component_fields.get(model_field)
                )
                if derived_type:
                    model_value["type"] = derived_type

            if self._is_component(model_value):
                model_args[model_field] = self._create_nested_component(
                    model,
                    model_field,
                    model_value,
                    config,
                    **kwargs,
                )
            elif isinstance(model_value, list):
                # Same type inference and nested-component conversion, applied per list element.
                vals = []
                for v in model_value:
                    if isinstance(v, dict) and "type" not in v and model_field in component_fields:
                        derived_type = self._derive_component_type_from_type_hints(
                            component_fields.get(model_field)
                        )
                        if derived_type:
                            v["type"] = derived_type
                    if self._is_component(v):
                        vals.append(
                            self._create_nested_component(
                                model,
                                model_field,
                                v,
                                config,
                                **kwargs,
                            )
                        )
                    else:
                        vals.append(v)
                model_args[model_field] = vals

        # Only pass through the arguments the custom class actually declares; this reuses (and
        # shadows) the `kwargs` name for the constructor arguments.
        kwargs = {
            class_field: model_args[class_field]
            for class_field in component_fields.keys()
            if class_field in model_args
        }
        return custom_component_class(**kwargs)

    @staticmethod
    def _get_class_from_fully_qualified_class_name(
        full_qualified_class_name: str,
    ) -> Any:
        """Get a class from its fully qualified name.

        If a custom components module is needed, we assume it is already registered - probably
        as `source_declarative_manifest.components` or `components`.
1865 1866 Args: 1867 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1868 1869 Returns: 1870 Any: The class object. 1871 1872 Raises: 1873 ValueError: If the class cannot be loaded. 1874 """ 1875 split = full_qualified_class_name.split(".") 1876 module_name_full = ".".join(split[:-1]) 1877 class_name = split[-1] 1878 1879 try: 1880 module_ref = importlib.import_module(module_name_full) 1881 except ModuleNotFoundError as e: 1882 if split[0] == "source_declarative_manifest": 1883 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1884 try: 1885 import os 1886 1887 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1888 module_ref = importlib.import_module( 1889 module_name_with_source_declarative_manifest 1890 ) 1891 except ModuleNotFoundError: 1892 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1893 else: 1894 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1895 1896 try: 1897 return getattr(module_ref, class_name) 1898 except AttributeError as e: 1899 raise ValueError( 1900 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1901 ) from e 1902 1903 @staticmethod 1904 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1905 interface = field_type 1906 while True: 1907 origin = get_origin(interface) 1908 if origin: 1909 # Unnest types until we reach the raw type 1910 # List[T] -> T 1911 # Optional[List[T]] -> T 1912 args = get_args(interface) 1913 interface = args[0] 1914 else: 1915 break 1916 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1917 return interface.__name__ 1918 return None 1919 1920 @staticmethod 1921 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1922 if not cls: 1923 return 
False 1924 return cls.__module__ == "builtins" 1925 1926 @staticmethod 1927 def _extract_missing_parameters(error: TypeError) -> List[str]: 1928 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1929 if parameter_search: 1930 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1931 else: 1932 return [] 1933 1934 def _create_nested_component( 1935 self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any 1936 ) -> Any: 1937 type_name = model_value.get("type", None) 1938 if not type_name: 1939 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1940 return model_value 1941 1942 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1943 if model_type: 1944 parsed_model = model_type.parse_obj(model_value) 1945 try: 1946 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1947 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1948 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1949 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1950 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1951 # are needed by a component and could not be shared. 
1952 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1953 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1954 model_parameters = model_value.get("$parameters", {}) 1955 matching_parameters = { 1956 kwarg: model_parameters[kwarg] 1957 for kwarg in constructor_kwargs 1958 if kwarg in model_parameters 1959 } 1960 matching_kwargs = { 1961 kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs 1962 } 1963 return self._create_component_from_model( 1964 model=parsed_model, config=config, **(matching_parameters | matching_kwargs) 1965 ) 1966 except TypeError as error: 1967 missing_parameters = self._extract_missing_parameters(error) 1968 if missing_parameters: 1969 raise ValueError( 1970 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1971 + ", ".join( 1972 ( 1973 f"{type_name}.$parameters.{parameter}" 1974 for parameter in missing_parameters 1975 ) 1976 ) 1977 ) 1978 raise TypeError( 1979 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1980 ) 1981 else: 1982 raise ValueError( 1983 f"Error creating custom component {model.class_name}. 
Subcomponent creation has not been implemented for '{type_name}'" 1984 ) 1985 1986 @staticmethod 1987 def _is_component(model_value: Any) -> bool: 1988 return isinstance(model_value, dict) and model_value.get("type") is not None 1989 1990 def create_default_stream( 1991 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 1992 ) -> AbstractStream: 1993 primary_key = model.primary_key.__root__ if model.primary_key else None 1994 self._migrate_state(model, config) 1995 1996 partition_router = self._build_stream_slicer_from_partition_router( 1997 model.retriever, 1998 config, 1999 stream_name=model.name, 2000 **kwargs, 2001 ) 2002 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 2003 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 2004 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 2005 2006 end_time_option = ( 2007 self._create_component_from_model( 2008 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 2009 ) 2010 if cursor_model.end_time_option 2011 else None 2012 ) 2013 start_time_option = ( 2014 self._create_component_from_model( 2015 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 2016 ) 2017 if cursor_model.start_time_option 2018 else None 2019 ) 2020 2021 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 2022 start_time_option=start_time_option, 2023 end_time_option=end_time_option, 2024 partition_field_start=cursor_model.partition_field_start, 2025 partition_field_end=cursor_model.partition_field_end, 2026 config=config, 2027 parameters=model.parameters or {}, 2028 ) 2029 request_options_provider = ( 2030 datetime_request_options_provider 2031 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 2032 else PerPartitionRequestOptionsProvider( 2033 partition_router, datetime_request_options_provider 2034 ) 2035 ) 2036 elif 
model.incremental_sync and isinstance( 2037 model.incremental_sync, IncrementingCountCursorModel 2038 ): 2039 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 2040 raise ValueError( 2041 "PerPartition does not support per partition states because switching to global state is time based" 2042 ) 2043 2044 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 2045 2046 start_time_option = ( 2047 self._create_component_from_model( 2048 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2049 config, 2050 parameters=cursor_model.parameters or {}, 2051 ) 2052 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2053 else None 2054 ) 2055 2056 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2057 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2058 partition_field_start = "start" 2059 2060 request_options_provider = DatetimeBasedRequestOptionsProvider( 2061 start_time_option=start_time_option, 2062 partition_field_start=partition_field_start, 2063 config=config, 2064 parameters=model.parameters or {}, 2065 ) 2066 else: 2067 request_options_provider = None 2068 2069 transformations = [] 2070 if model.transformations: 2071 for transformation_model in model.transformations: 2072 transformations.append( 2073 self._create_component_from_model(model=transformation_model, config=config) 2074 ) 2075 file_uploader = None 2076 if model.file_uploader: 2077 file_uploader = self._create_component_from_model( 2078 model=model.file_uploader, config=config 2079 ) 2080 2081 stream_slicer: ConcurrentStreamSlicer = ( 2082 partition_router 2083 if isinstance(concurrent_cursor, FinalStateCursor) 2084 else concurrent_cursor 2085 ) 2086 2087 retriever = self._create_component_from_model( 2088 model=model.retriever, 2089 config=config, 2090 name=model.name, 
2091 primary_key=primary_key, 2092 request_options_provider=request_options_provider, 2093 stream_slicer=stream_slicer, 2094 partition_router=partition_router, 2095 has_stop_condition_cursor=self._is_stop_condition_on_cursor(model), 2096 is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model), 2097 cursor=concurrent_cursor, 2098 transformations=transformations, 2099 file_uploader=file_uploader, 2100 incremental_sync=model.incremental_sync, 2101 ) 2102 if isinstance(retriever, AsyncRetriever): 2103 stream_slicer = retriever.stream_slicer 2104 2105 schema_loader: SchemaLoader 2106 if model.schema_loader and isinstance(model.schema_loader, list): 2107 nested_schema_loaders = [ 2108 self._create_component_from_model(model=nested_schema_loader, config=config) 2109 for nested_schema_loader in model.schema_loader 2110 ] 2111 schema_loader = CompositeSchemaLoader( 2112 schema_loaders=nested_schema_loaders, parameters={} 2113 ) 2114 elif model.schema_loader: 2115 schema_loader = self._create_component_from_model( 2116 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2117 config=config, 2118 ) 2119 else: 2120 options = model.parameters or {} 2121 if "name" not in options: 2122 options["name"] = model.name 2123 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2124 schema_loader = CachingSchemaLoaderDecorator(schema_loader) 2125 2126 stream_name = model.name or "" 2127 return DefaultStream( 2128 partition_generator=StreamSlicerPartitionGenerator( 2129 DeclarativePartitionFactory( 2130 stream_name, 2131 schema_loader, 2132 retriever, 2133 self._message_repository, 2134 ), 2135 stream_slicer, 2136 slice_limit=self._limit_slices_fetched, 2137 ), 2138 name=stream_name, 2139 json_schema=schema_loader.get_json_schema, 2140 primary_key=get_primary_key_from_stream(primary_key), 2141 cursor_field=( 2142 concurrent_cursor.cursor_field 2143 if 
hasattr(concurrent_cursor, "cursor_field") 2144 else None 2145 ), 2146 logger=logging.getLogger(f"airbyte.{stream_name}"), 2147 cursor=concurrent_cursor, 2148 supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), 2149 ) 2150 2151 def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: 2152 stream_name = model.name or "" 2153 stream_state = self._connector_state_manager.get_stream_state( 2154 stream_name=stream_name, namespace=None 2155 ) 2156 if model.state_migrations: 2157 state_transformations = [ 2158 self._create_component_from_model(state_migration, config, declarative_stream=model) 2159 for state_migration in model.state_migrations 2160 ] 2161 else: 2162 state_transformations = [] 2163 stream_state = self.apply_stream_state_migrations(state_transformations, stream_state) 2164 self._connector_state_manager.update_state_for_stream( 2165 stream_name=stream_name, namespace=None, value=stream_state 2166 ) 2167 2168 def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool: 2169 return bool( 2170 model.incremental_sync 2171 and hasattr(model.incremental_sync, "is_data_feed") 2172 and model.incremental_sync.is_data_feed 2173 ) 2174 2175 def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool: 2176 return bool( 2177 model.incremental_sync 2178 and hasattr(model.incremental_sync, "is_client_side_incremental") 2179 and model.incremental_sync.is_client_side_incremental 2180 ) 2181 2182 def _build_stream_slicer_from_partition_router( 2183 self, 2184 model: Union[ 2185 AsyncRetrieverModel, 2186 CustomRetrieverModel, 2187 SimpleRetrieverModel, 2188 ], 2189 config: Config, 2190 stream_name: Optional[str] = None, 2191 **kwargs: Any, 2192 ) -> PartitionRouter: 2193 if ( 2194 hasattr(model, "partition_router") 2195 and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel)) 2196 and model.partition_router 2197 ): 2198 stream_slicer_model = 
model.partition_router 2199 if isinstance(stream_slicer_model, list): 2200 return CartesianProductStreamSlicer( 2201 [ 2202 self._create_component_from_model( 2203 model=slicer, config=config, stream_name=stream_name or "" 2204 ) 2205 for slicer in stream_slicer_model 2206 ], 2207 parameters={}, 2208 ) 2209 elif isinstance(stream_slicer_model, dict): 2210 # partition router comes from CustomRetrieverModel therefore has not been parsed as a model 2211 params = stream_slicer_model.get("$parameters") 2212 if not isinstance(params, dict): 2213 params = {} 2214 stream_slicer_model["$parameters"] = params 2215 2216 if stream_name is not None: 2217 params["stream_name"] = stream_name 2218 2219 return self._create_nested_component( # type: ignore[no-any-return] # There is no guarantee that this will return a stream slicer. If not, we expect an AttributeError during the call to `stream_slices` 2220 model, 2221 "partition_router", 2222 stream_slicer_model, 2223 config, 2224 **kwargs, 2225 ) 2226 else: 2227 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2228 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2229 ) 2230 return SinglePartitionRouter(parameters={}) 2231 2232 def _build_concurrent_cursor( 2233 self, 2234 model: DeclarativeStreamModel, 2235 stream_slicer: Optional[PartitionRouter], 2236 config: Config, 2237 ) -> Cursor: 2238 stream_name = model.name or "" 2239 stream_state = self._connector_state_manager.get_stream_state(stream_name, None) 2240 2241 if ( 2242 model.incremental_sync 2243 and stream_slicer 2244 and not isinstance(stream_slicer, SinglePartitionRouter) 2245 ): 2246 if isinstance(model.incremental_sync, IncrementingCountCursorModel): 2247 # We don't currently support usage of partition routing and IncrementingCountCursor at the 2248 # same time because we didn't solve for design questions like what the lookback window would 
2249 # be as well as global cursor fall backs. We have not seen customers that have needed both 2250 # at the same time yet and are currently punting on this until we need to solve it. 2251 raise ValueError( 2252 f"The low-code framework does not currently support usage of a PartitionRouter and an IncrementingCountCursor at the same time. Please specify only one of these options for stream {stream_name}." 2253 ) 2254 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2255 state_manager=self._connector_state_manager, 2256 model_type=DatetimeBasedCursorModel, 2257 component_definition=model.incremental_sync.__dict__, 2258 stream_name=stream_name, 2259 stream_state=stream_state, 2260 stream_namespace=None, 2261 config=config or {}, 2262 partition_router=stream_slicer, 2263 attempt_to_create_cursor_if_not_provided=True, # FIXME can we remove that now? 2264 ) 2265 elif model.incremental_sync: 2266 if type(model.incremental_sync) == IncrementingCountCursorModel: 2267 return self.create_concurrent_cursor_from_incrementing_count_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2268 model_type=IncrementingCountCursorModel, 2269 component_definition=model.incremental_sync.__dict__, 2270 stream_name=stream_name, 2271 stream_namespace=None, 2272 stream_state=stream_state, 2273 config=config or {}, 2274 ) 2275 elif type(model.incremental_sync) == DatetimeBasedCursorModel: 2276 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2277 model_type=type(model.incremental_sync), 2278 component_definition=model.incremental_sync.__dict__, 2279 stream_name=stream_name, 2280 stream_namespace=None, 2281 stream_state=stream_state, 2282 config=config or {}, 2283 attempt_to_create_cursor_if_not_provided=True, 2284 ) 2285 else: 2286 raise ValueError( 2287 f"Incremental sync of type {type(model.incremental_sync)} is not supported" 2288 ) 2289 return FinalStateCursor(stream_name, None, self._message_repository) 2290 2291 def create_default_error_handler( 2292 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2293 ) -> DefaultErrorHandler: 2294 backoff_strategies = [] 2295 if model.backoff_strategies: 2296 for backoff_strategy_model in model.backoff_strategies: 2297 backoff_strategies.append( 2298 self._create_component_from_model(model=backoff_strategy_model, config=config) 2299 ) 2300 2301 response_filters = [] 2302 if model.response_filters: 2303 for response_filter_model in model.response_filters: 2304 response_filters.append( 2305 self._create_component_from_model(model=response_filter_model, config=config) 2306 ) 2307 response_filters.append( 2308 
HttpResponseFilter(config=config, parameters=model.parameters or {}) 2309 ) 2310 2311 return DefaultErrorHandler( 2312 backoff_strategies=backoff_strategies, 2313 max_retries=model.max_retries, 2314 response_filters=response_filters, 2315 config=config, 2316 parameters=model.parameters or {}, 2317 ) 2318 2319 def create_default_paginator( 2320 self, 2321 model: DefaultPaginatorModel, 2322 config: Config, 2323 *, 2324 url_base: str, 2325 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2326 decoder: Optional[Decoder] = None, 2327 cursor_used_for_stop_condition: Optional[Cursor] = None, 2328 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2329 if decoder: 2330 if self._is_supported_decoder_for_pagination(decoder): 2331 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2332 else: 2333 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2334 else: 2335 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2336 page_size_option = ( 2337 self._create_component_from_model(model=model.page_size_option, config=config) 2338 if model.page_size_option 2339 else None 2340 ) 2341 page_token_option = ( 2342 self._create_component_from_model(model=model.page_token_option, config=config) 2343 if model.page_token_option 2344 else None 2345 ) 2346 pagination_strategy = self._create_component_from_model( 2347 model=model.pagination_strategy, 2348 config=config, 2349 decoder=decoder_to_use, 2350 extractor_model=extractor_model, 2351 ) 2352 if cursor_used_for_stop_condition: 2353 pagination_strategy = StopConditionPaginationStrategyDecorator( 2354 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2355 ) 2356 paginator = DefaultPaginator( 2357 decoder=decoder_to_use, 2358 page_size_option=page_size_option, 2359 page_token_option=page_token_option, 2360 pagination_strategy=pagination_strategy, 2361 url_base=url_base, 2362 config=config, 2363 
parameters=model.parameters or {}, 2364 ) 2365 if self._limit_pages_fetched_per_slice: 2366 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2367 return paginator 2368 2369 def create_dpath_extractor( 2370 self, 2371 model: DpathExtractorModel, 2372 config: Config, 2373 decoder: Optional[Decoder] = None, 2374 **kwargs: Any, 2375 ) -> DpathExtractor: 2376 if decoder: 2377 decoder_to_use = decoder 2378 else: 2379 decoder_to_use = JsonDecoder(parameters={}) 2380 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2381 return DpathExtractor( 2382 decoder=decoder_to_use, 2383 field_path=model_field_path, 2384 config=config, 2385 parameters=model.parameters or {}, 2386 ) 2387 2388 @staticmethod 2389 def create_response_to_file_extractor( 2390 model: ResponseToFileExtractorModel, 2391 **kwargs: Any, 2392 ) -> ResponseToFileExtractor: 2393 return ResponseToFileExtractor(parameters=model.parameters or {}) 2394 2395 @staticmethod 2396 def create_exponential_backoff_strategy( 2397 model: ExponentialBackoffStrategyModel, config: Config 2398 ) -> ExponentialBackoffStrategy: 2399 return ExponentialBackoffStrategy( 2400 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2401 ) 2402 2403 @staticmethod 2404 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2405 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2406 2407 def create_http_requester( 2408 self, 2409 model: HttpRequesterModel, 2410 config: Config, 2411 decoder: Decoder = JsonDecoder(parameters={}), 2412 query_properties_key: Optional[str] = None, 2413 use_cache: Optional[bool] = None, 2414 *, 2415 name: str, 2416 ) -> HttpRequester: 2417 authenticator = ( 2418 self._create_component_from_model( 2419 model=model.authenticator, 2420 config=config, 2421 url_base=model.url or model.url_base, 2422 name=name, 2423 decoder=decoder, 2424 ) 2425 if model.authenticator 
2426 else None 2427 ) 2428 error_handler = ( 2429 self._create_component_from_model(model=model.error_handler, config=config) 2430 if model.error_handler 2431 else DefaultErrorHandler( 2432 backoff_strategies=[], 2433 response_filters=[], 2434 config=config, 2435 parameters=model.parameters or {}, 2436 ) 2437 ) 2438 2439 api_budget = self._api_budget 2440 2441 request_options_provider = InterpolatedRequestOptionsProvider( 2442 request_body=model.request_body, 2443 request_body_data=model.request_body_data, 2444 request_body_json=model.request_body_json, 2445 request_headers=model.request_headers, 2446 request_parameters=model.request_parameters, # type: ignore # QueryProperties have been removed in `create_simple_retriever` 2447 query_properties_key=query_properties_key, 2448 config=config, 2449 parameters=model.parameters or {}, 2450 ) 2451 2452 assert model.use_cache is not None # for mypy 2453 assert model.http_method is not None # for mypy 2454 2455 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2456 2457 return HttpRequester( 2458 name=name, 2459 url=model.url, 2460 url_base=model.url_base, 2461 path=model.path, 2462 authenticator=authenticator, 2463 error_handler=error_handler, 2464 api_budget=api_budget, 2465 http_method=HttpMethod[model.http_method.value], 2466 request_options_provider=request_options_provider, 2467 config=config, 2468 disable_retries=self._disable_retries, 2469 parameters=model.parameters or {}, 2470 message_repository=self._message_repository, 2471 use_cache=should_use_cache, 2472 decoder=decoder, 2473 stream_response=decoder.is_stream_response() if decoder else False, 2474 ) 2475 2476 @staticmethod 2477 def create_http_response_filter( 2478 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2479 ) -> HttpResponseFilter: 2480 if model.action: 2481 action = ResponseAction(model.action.value) 2482 else: 2483 action = None 2484 2485 failure_type = FailureType(model.failure_type.value) if 
model.failure_type else None 2486 2487 http_codes = ( 2488 set(model.http_codes) if model.http_codes else set() 2489 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2490 2491 return HttpResponseFilter( 2492 action=action, 2493 failure_type=failure_type, 2494 error_message=model.error_message or "", 2495 error_message_contains=model.error_message_contains or "", 2496 http_codes=http_codes, 2497 predicate=model.predicate or "", 2498 config=config, 2499 parameters=model.parameters or {}, 2500 ) 2501 2502 @staticmethod 2503 def create_inline_schema_loader( 2504 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2505 ) -> InlineSchemaLoader: 2506 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2507 2508 def create_complex_field_type( 2509 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2510 ) -> ComplexFieldType: 2511 items = ( 2512 self._create_component_from_model(model=model.items, config=config) 2513 if isinstance(model.items, ComplexFieldTypeModel) 2514 else model.items 2515 ) 2516 2517 return ComplexFieldType(field_type=model.field_type, items=items) 2518 2519 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2520 target_type = ( 2521 self._create_component_from_model(model=model.target_type, config=config) 2522 if isinstance(model.target_type, ComplexFieldTypeModel) 2523 else model.target_type 2524 ) 2525 2526 return TypesMap( 2527 target_type=target_type, 2528 current_type=model.current_type, 2529 condition=model.condition if model.condition is not None else "True", 2530 ) 2531 2532 def create_schema_type_identifier( 2533 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2534 ) -> SchemaTypeIdentifier: 2535 types_mapping = [] 2536 if model.types_mapping: 2537 types_mapping.extend( 2538 [ 2539 self._create_component_from_model(types_map, config=config) 2540 for types_map in model.types_mapping 2541 ] 2542 
) 2543 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2544 [x for x in model.schema_pointer] if model.schema_pointer else [] 2545 ) 2546 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2547 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2548 [x for x in model.type_pointer] if model.type_pointer else None 2549 ) 2550 2551 return SchemaTypeIdentifier( 2552 schema_pointer=model_schema_pointer, 2553 key_pointer=model_key_pointer, 2554 type_pointer=model_type_pointer, 2555 types_mapping=types_mapping, 2556 parameters=model.parameters or {}, 2557 ) 2558 2559 def create_dynamic_schema_loader( 2560 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2561 ) -> DynamicSchemaLoader: 2562 schema_transformations = [] 2563 if model.schema_transformations: 2564 for transformation_model in model.schema_transformations: 2565 schema_transformations.append( 2566 self._create_component_from_model(model=transformation_model, config=config) 2567 ) 2568 name = "dynamic_properties" 2569 retriever = self._create_component_from_model( 2570 model=model.retriever, 2571 config=config, 2572 name=name, 2573 primary_key=None, 2574 partition_router=self._build_stream_slicer_from_partition_router( 2575 model.retriever, config 2576 ), 2577 transformations=[], 2578 use_cache=True, 2579 log_formatter=( 2580 lambda response: format_http_message( 2581 response, 2582 f"Schema loader '{name}' request", 2583 f"Request performed in order to extract schema.", 2584 name, 2585 is_auxiliary=True, 2586 ) 2587 ), 2588 ) 2589 schema_type_identifier = self._create_component_from_model( 2590 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2591 ) 2592 schema_filter = ( 2593 self._create_component_from_model( 2594 model.schema_filter, config=config, parameters=model.parameters or {} 2595 ) 2596 if model.schema_filter is not None 2597 else None 2598 ) 2599 2600 return DynamicSchemaLoader( 
            retriever=retriever,
            config=config,
            schema_transformations=schema_transformations,
            schema_filter=schema_filter,
            schema_type_identifier=schema_type_identifier,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder:
        """Build a plain JSON decoder; the model carries no configurable options."""
        return JsonDecoder(parameters={})

    def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder:
        """Build a CSV decoder; streaming is disabled in connector-builder mode so full responses can be logged."""
        return CompositeRawDecoder(
            parser=ModelToComponentFactory._get_parser(model, config),
            stream_response=False if self._emit_connector_builder_messages else True,
        )

    def create_jsonl_decoder(
        self, model: JsonlDecoderModel, config: Config, **kwargs: Any
    ) -> Decoder:
        """Build a JSON-Lines decoder; streaming is disabled in connector-builder mode so full responses can be logged."""
        return CompositeRawDecoder(
            parser=ModelToComponentFactory._get_parser(model, config),
            stream_response=False if self._emit_connector_builder_messages else True,
        )

    def create_gzip_decoder(
        self, model: GzipDecoderModel, config: Config, **kwargs: Any
    ) -> Decoder:
        """
        Build a gzip-aware decoder. Outside connector-builder mode, the gzip parser is only applied
        when the response headers advertise one of the compressed content types below; otherwise the
        inner (uncompressed) parser is used as a fallback.
        """
        _compressed_response_types = {
            "gzip",
            "x-gzip",
            "gzip, deflate",
            "x-gzip, deflate",
            "application/zip",
            "application/gzip",
            "application/x-gzip",
            "application/x-zip-compressed",
        }

        gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config)  # type: ignore # based on the model, we know this will be a GzipParser

        if self._emit_connector_builder_messages:
            # This is very surprising but if the response is not streamed,
            # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw,
            # which uses urllib3 directly and does not uncompress the data.
            return CompositeRawDecoder(gzip_parser.inner_parser, False)

        return CompositeRawDecoder.by_headers(
            [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)],
            stream_response=True,
            fallback_parser=gzip_parser.inner_parser,
        )

    @staticmethod
    def create_iterable_decoder(
        model: IterableDecoderModel, config: Config, **kwargs: Any
    ) -> IterableDecoder:
        """Build a decoder that yields each response line as a record."""
        return IterableDecoder(parameters={})

    @staticmethod
    def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder:
        """Build an XML decoder; the model carries no configurable options."""
        return XmlDecoder(parameters={})

    def create_zipfile_decoder(
        self, model: ZipfileDecoderModel, config: Config, **kwargs: Any
    ) -> ZipfileDecoder:
        """Build a decoder that unpacks a zip archive and parses entries with the nested decoder's parser."""
        return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config))

    @staticmethod
    def _get_parser(model: BaseModel, config: Config) -> Parser:
        """
        Map a decoder model onto the Parser used by CompositeRawDecoder/ZipfileDecoder.

        Raises ValueError for decoder models that have no standalone parser (custom, iterable,
        xml, zipfile) and for unknown models.
        """
        if isinstance(model, JsonDecoderModel):
            # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases
            return JsonParser()
        elif isinstance(model, JsonlDecoderModel):
            return JsonLineParser()
        elif isinstance(model, CsvDecoderModel):
            return CsvParser(
                encoding=model.encoding,
                delimiter=model.delimiter,
                set_values_to_none=model.set_values_to_none,
            )
        elif isinstance(model, GzipDecoderModel):
            # Recurse so the inner (decompressed) payload gets its own parser.
            return GzipParser(
                inner_parser=ModelToComponentFactory._get_parser(model.decoder, config)
            )
        elif isinstance(
            model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel)
        ):
            raise ValueError(f"Decoder type {model} does not have parser associated to it")

        raise ValueError(f"Unknown decoder type {model}")

    @staticmethod
    def create_json_file_schema_loader(
        model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any
    ) -> JsonFileSchemaLoader:
        """Build a schema loader that reads the stream's JSON schema from a local file path."""
        return JsonFileSchemaLoader(
            file_path=model.file_path or "", config=config, parameters=model.parameters or {}
        )

    def create_jwt_authenticator(
        self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any
    ) -> JwtAuthenticator:
        """Build a JWT authenticator; header/payload sub-models default to empty shells when omitted."""
        jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None)
        jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None)
        request_option = (
            self._create_component_from_model(model.request_option, config)
            if model.request_option
            else None
        )
        return JwtAuthenticator(
            config=config,
            parameters=model.parameters or {},
            algorithm=JwtAlgorithm(model.algorithm.value),
            secret_key=model.secret_key,
            base64_encode_secret_key=model.base64_encode_secret_key,
            token_duration=model.token_duration,
            header_prefix=model.header_prefix,
            kid=jwt_headers.kid,
            typ=jwt_headers.typ,
            cty=jwt_headers.cty,
            iss=jwt_payload.iss,
            sub=jwt_payload.sub,
            aud=jwt_payload.aud,
            additional_jwt_headers=model.additional_jwt_headers,
            additional_jwt_payload=model.additional_jwt_payload,
            passphrase=model.passphrase,
            request_option=request_option,
        )

    def create_list_partition_router(
        self, model: ListPartitionRouterModel, config: Config, **kwargs: Any
    ) -> ListPartitionRouter:
        """Build a partition router that slices the stream over a static list of values."""
        request_option = (
            self._create_component_from_model(model.request_option, config)
            if model.request_option
            else None
        )
        return ListPartitionRouter(
            cursor_field=model.cursor_field,
            request_option=request_option,
            values=model.values,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_min_max_datetime(
        model: MinMaxDatetimeModel, config: Config, **kwargs: Any
    ) -> MinMaxDatetime:
        """Build a MinMaxDatetime; unset bounds/format default to empty strings."""
        return MinMaxDatetime(
            datetime=model.datetime,
            datetime_format=model.datetime_format or "",
            max_datetime=model.max_datetime or "",
            min_datetime=model.min_datetime or "",
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth:
        """Build the no-op authenticator."""
        return NoAuth(parameters=model.parameters or {})

    @staticmethod
    def create_no_pagination(
        model: NoPaginationModel, config: Config, **kwargs: Any
    ) -> NoPagination:
        """Build the no-op paginator."""
        return NoPagination(parameters={})

    def create_oauth_authenticator(
        self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any
    ) -> DeclarativeOauth2Authenticator:
        """
        Build an OAuth2 authenticator. When a refresh_token_updater is configured, a single-use
        refresh-token variant is returned (presumably persisting rotated tokens back into config
        via the *_config_path fields — confirm against the authenticator implementation).
        """
        profile_assertion = (
            self._create_component_from_model(model.profile_assertion, config=config)
            if model.profile_assertion
            else None
        )

        refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values = (
            self._get_refresh_token_error_information(model)
        )
        if model.refresh_token_updater:
            # ignore type error because fixing it would have a lot of dependencies, revisit later
            return DeclarativeSingleUseRefreshTokenOauth2Authenticator(  # type: ignore
                config,
                InterpolatedString.create(
                    model.token_refresh_endpoint,  # type: ignore
                    parameters=model.parameters or {},
                ).eval(config),
                access_token_name=InterpolatedString.create(
                    model.access_token_name or "access_token", parameters=model.parameters or {}
                ).eval(config),
                refresh_token_name=model.refresh_token_updater.refresh_token_name,
                expires_in_name=InterpolatedString.create(
                    model.expires_in_name or "expires_in", parameters=model.parameters or {}
                ).eval(config),
                client_id_name=InterpolatedString.create(
                    model.client_id_name or "client_id", parameters=model.parameters or {}
                ).eval(config),
                client_id=InterpolatedString.create(
                    model.client_id, parameters=model.parameters or {}
                ).eval(config)
                if model.client_id
                else model.client_id,
                client_secret_name=InterpolatedString.create(
                    model.client_secret_name or "client_secret", parameters=model.parameters or {}
                ).eval(config),
                client_secret=InterpolatedString.create(
                    model.client_secret, parameters=model.parameters or {}
                ).eval(config)
                if model.client_secret
                else model.client_secret,
                access_token_config_path=model.refresh_token_updater.access_token_config_path,
                refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path,
                token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path,
                grant_type_name=InterpolatedString.create(
                    model.grant_type_name or "grant_type", parameters=model.parameters or {}
                ).eval(config),
                grant_type=InterpolatedString.create(
                    model.grant_type or "refresh_token", parameters=model.parameters or {}
                ).eval(config),
                refresh_request_body=InterpolatedMapping(
                    model.refresh_request_body or {}, parameters=model.parameters or {}
                ).eval(config),
                refresh_request_headers=InterpolatedMapping(
                    model.refresh_request_headers or {}, parameters=model.parameters or {}
                ).eval(config),
                scopes=model.scopes,
                token_expiry_date_format=model.token_expiry_date_format,
                token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
                message_repository=self._message_repository,
                refresh_token_error_status_codes=refresh_token_error_status_codes,
                refresh_token_error_key=refresh_token_error_key,
                refresh_token_error_values=refresh_token_error_values,
            )
        # No refresh_token_updater: build the standard declarative OAuth2 authenticator, which
        # interpolates its fields lazily at request time rather than eagerly via .eval() above.
        # ignore type error because fixing it would have a lot of dependencies, revisit later
        return DeclarativeOauth2Authenticator(  # type: ignore
            access_token_name=model.access_token_name or "access_token",
            access_token_value=model.access_token_value,
            client_id_name=model.client_id_name or "client_id",
            client_id=model.client_id,
            client_secret_name=model.client_secret_name or "client_secret",
            client_secret=model.client_secret,
            expires_in_name=model.expires_in_name or "expires_in",
            grant_type_name=model.grant_type_name or "grant_type",
            grant_type=model.grant_type or "refresh_token",
            refresh_request_body=model.refresh_request_body,
            refresh_request_headers=model.refresh_request_headers,
            refresh_token_name=model.refresh_token_name or "refresh_token",
            refresh_token=model.refresh_token,
            scopes=model.scopes,
            token_expiry_date=model.token_expiry_date,
            token_expiry_date_format=model.token_expiry_date_format,
            token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
            token_refresh_endpoint=model.token_refresh_endpoint,
            config=config,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            profile_assertion=profile_assertion,
            use_profile_assertion=model.use_profile_assertion,
            refresh_token_error_status_codes=refresh_token_error_status_codes,
            refresh_token_error_key=refresh_token_error_key,
            refresh_token_error_values=refresh_token_error_values,
        )

    @staticmethod
    def _get_refresh_token_error_information(
        model: OAuthAuthenticatorModel,
    ) -> Tuple[Tuple[int, ...], str, Tuple[str, ...]]:
        """
        In a previous version of the CDK, the auth error as config_error was only done if a refresh token updater was
        defined. As a transition, we added those fields on the OAuthAuthenticatorModel. This method ensures that the
        information is defined only once and return the right fields.
2874 """ 2875 refresh_token_updater = model.refresh_token_updater 2876 is_defined_on_refresh_token_updated = refresh_token_updater and ( 2877 refresh_token_updater.refresh_token_error_status_codes 2878 or refresh_token_updater.refresh_token_error_key 2879 or refresh_token_updater.refresh_token_error_values 2880 ) 2881 is_defined_on_oauth_authenticator = ( 2882 model.refresh_token_error_status_codes 2883 or model.refresh_token_error_key 2884 or model.refresh_token_error_values 2885 ) 2886 if is_defined_on_refresh_token_updated and is_defined_on_oauth_authenticator: 2887 raise ValueError( 2888 "refresh_token_error should either be defined on the OAuthAuthenticatorModel or the RefreshTokenUpdaterModel, not both" 2889 ) 2890 2891 if is_defined_on_refresh_token_updated: 2892 not_optional_refresh_token_updater: RefreshTokenUpdaterModel = refresh_token_updater # type: ignore # we know from the condition that this is not None 2893 return ( 2894 tuple(not_optional_refresh_token_updater.refresh_token_error_status_codes) 2895 if not_optional_refresh_token_updater.refresh_token_error_status_codes 2896 else (), 2897 not_optional_refresh_token_updater.refresh_token_error_key or "", 2898 tuple(not_optional_refresh_token_updater.refresh_token_error_values) 2899 if not_optional_refresh_token_updater.refresh_token_error_values 2900 else (), 2901 ) 2902 elif is_defined_on_oauth_authenticator: 2903 return ( 2904 tuple(model.refresh_token_error_status_codes) 2905 if model.refresh_token_error_status_codes 2906 else (), 2907 model.refresh_token_error_key or "", 2908 tuple(model.refresh_token_error_values) if model.refresh_token_error_values else (), 2909 ) 2910 2911 # returning default values we think cover most cases 2912 return (400,), "error", ("invalid_grant", "invalid_permissions") 2913 2914 def create_offset_increment( 2915 self, 2916 model: OffsetIncrementModel, 2917 config: Config, 2918 decoder: Decoder, 2919 extractor_model: Optional[Union[CustomRecordExtractorModel, 
DpathExtractorModel]] = None, 2920 **kwargs: Any, 2921 ) -> OffsetIncrement: 2922 if isinstance(decoder, PaginationDecoderDecorator): 2923 inner_decoder = decoder.decoder 2924 else: 2925 inner_decoder = decoder 2926 decoder = PaginationDecoderDecorator(decoder=decoder) 2927 2928 if self._is_supported_decoder_for_pagination(inner_decoder): 2929 decoder_to_use = decoder 2930 else: 2931 raise ValueError( 2932 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2933 ) 2934 2935 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2936 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2937 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2938 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2939 # When we have more time to investigate we can look into reusing the same component. 2940 extractor = ( 2941 self._create_component_from_model( 2942 model=extractor_model, config=config, decoder=decoder_to_use 2943 ) 2944 if extractor_model 2945 else None 2946 ) 2947 2948 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 2949 # If page_size is a string that represents an integer (not an interpolation), convert it back. 
2950 page_size = model.page_size 2951 if isinstance(page_size, str) and page_size.isdigit(): 2952 page_size = int(page_size) 2953 2954 return OffsetIncrement( 2955 page_size=page_size, 2956 config=config, 2957 decoder=decoder_to_use, 2958 extractor=extractor, 2959 inject_on_first_request=model.inject_on_first_request or False, 2960 parameters=model.parameters or {}, 2961 ) 2962 2963 @staticmethod 2964 def create_page_increment( 2965 model: PageIncrementModel, config: Config, **kwargs: Any 2966 ) -> PageIncrement: 2967 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 2968 # If page_size is a string that represents an integer (not an interpolation), convert it back. 2969 page_size = model.page_size 2970 if isinstance(page_size, str) and page_size.isdigit(): 2971 page_size = int(page_size) 2972 2973 return PageIncrement( 2974 page_size=page_size, 2975 config=config, 2976 start_from_page=model.start_from_page or 0, 2977 inject_on_first_request=model.inject_on_first_request or False, 2978 parameters=model.parameters or {}, 2979 ) 2980 2981 def create_parent_stream_config( 2982 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 2983 ) -> ParentStreamConfig: 2984 declarative_stream = self._create_component_from_model( 2985 model.stream, 2986 config=config, 2987 is_parent=True, 2988 **kwargs, 2989 ) 2990 request_option = ( 2991 self._create_component_from_model(model.request_option, config=config) 2992 if model.request_option 2993 else None 2994 ) 2995 2996 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2997 raise ValueError( 2998 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 
2999 ) 3000 3001 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 3002 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 3003 ) 3004 3005 return ParentStreamConfig( 3006 parent_key=model.parent_key, 3007 request_option=request_option, 3008 stream=declarative_stream, 3009 partition_field=model.partition_field, 3010 config=config, 3011 incremental_dependency=model.incremental_dependency or False, 3012 parameters=model.parameters or {}, 3013 extra_fields=model.extra_fields, 3014 lazy_read_pointer=model_lazy_read_pointer, 3015 ) 3016 3017 def create_properties_from_endpoint( 3018 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 3019 ) -> PropertiesFromEndpoint: 3020 retriever = self._create_component_from_model( 3021 model=model.retriever, 3022 config=config, 3023 name="dynamic_properties", 3024 primary_key=None, 3025 stream_slicer=None, 3026 transformations=[], 3027 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 3028 ) 3029 return PropertiesFromEndpoint( 3030 property_field_path=model.property_field_path, 3031 retriever=retriever, 3032 config=config, 3033 parameters=model.parameters or {}, 3034 ) 3035 3036 def create_property_chunking( 3037 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 3038 ) -> PropertyChunking: 3039 record_merge_strategy = ( 3040 self._create_component_from_model( 3041 model=model.record_merge_strategy, config=config, **kwargs 3042 ) 3043 if model.record_merge_strategy 3044 else None 3045 ) 3046 3047 property_limit_type: PropertyLimitType 3048 match model.property_limit_type: 3049 case PropertyLimitTypeModel.property_count: 3050 property_limit_type = PropertyLimitType.property_count 3051 case PropertyLimitTypeModel.characters: 3052 property_limit_type = PropertyLimitType.characters 3053 case _: 3054 raise 
ValueError(f"Invalid PropertyLimitType {property_limit_type}") 3055 3056 return PropertyChunking( 3057 property_limit_type=property_limit_type, 3058 property_limit=model.property_limit, 3059 record_merge_strategy=record_merge_strategy, 3060 config=config, 3061 parameters=model.parameters or {}, 3062 ) 3063 3064 def create_query_properties( 3065 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 3066 ) -> QueryProperties: 3067 if isinstance(model.property_list, list): 3068 property_list = model.property_list 3069 else: 3070 property_list = self._create_component_from_model( 3071 model=model.property_list, config=config, **kwargs 3072 ) 3073 3074 property_chunking = ( 3075 self._create_component_from_model( 3076 model=model.property_chunking, config=config, **kwargs 3077 ) 3078 if model.property_chunking 3079 else None 3080 ) 3081 3082 property_selector = ( 3083 self._create_component_from_model( 3084 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3085 ) 3086 if model.property_selector 3087 else None 3088 ) 3089 3090 return QueryProperties( 3091 property_list=property_list, 3092 always_include_properties=model.always_include_properties, 3093 property_chunking=property_chunking, 3094 property_selector=property_selector, 3095 config=config, 3096 parameters=model.parameters or {}, 3097 ) 3098 3099 def create_json_schema_property_selector( 3100 self, 3101 model: JsonSchemaPropertySelectorModel, 3102 config: Config, 3103 *, 3104 stream_name: str, 3105 **kwargs: Any, 3106 ) -> JsonSchemaPropertySelector: 3107 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3108 3109 transformations = [] 3110 if model.transformations: 3111 for transformation_model in model.transformations: 3112 transformations.append( 3113 self._create_component_from_model(model=transformation_model, config=config) 3114 ) 3115 3116 return JsonSchemaPropertySelector( 3117 configured_stream=configured_stream, 3118 
            properties_transformations=transformations,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_record_filter(
        model: RecordFilterModel, config: Config, **kwargs: Any
    ) -> RecordFilter:
        """Build a record filter evaluating the model's interpolated condition per record."""
        return RecordFilter(
            condition=model.condition or "", config=config, parameters=model.parameters or {}
        )

    @staticmethod
    def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath:
        """Build a RequestPath marker (inject a value into the request path); no options."""
        return RequestPath(parameters={})

    @staticmethod
    def create_request_option(
        model: RequestOptionModel, config: Config, **kwargs: Any
    ) -> RequestOption:
        """Build a RequestOption; field_path segments and field_name are interpolated strings."""
        inject_into = RequestOptionType(model.inject_into.value)
        field_path: Optional[List[Union[InterpolatedString, str]]] = (
            [
                InterpolatedString.create(segment, parameters=kwargs.get("parameters", {}))
                for segment in model.field_path
            ]
            if model.field_path
            else None
        )
        field_name = (
            InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {}))
            if model.field_name
            else None
        )
        return RequestOption(
            field_name=field_name,
            field_path=field_path,
            inject_into=inject_into,
            parameters=kwargs.get("parameters", {}),
        )

    def create_record_selector(
        self,
        model: RecordSelectorModel,
        config: Config,
        *,
        name: str,
        transformations: List[RecordTransformation] | None = None,
        decoder: Decoder | None = None,
        client_side_incremental_sync_cursor: Optional[Cursor] = None,
        file_uploader: Optional[DefaultFileUploader] = None,
        **kwargs: Any,
    ) -> RecordSelector:
        """
        Build the RecordSelector (extractor + optional filter + transformations + schema
        normalization). When a client-side incremental cursor is provided, the record filter is
        wrapped in a decorator that also drops records older than the cursor.
        """
        extractor = self._create_component_from_model(
            model=model.extractor, decoder=decoder, config=config
        )
        record_filter = (
            self._create_component_from_model(model.record_filter, config=config)
            if model.record_filter
            else None
        )

        transform_before_filtering = (
            False if model.transform_before_filtering is None else model.transform_before_filtering
        )
        if client_side_incremental_sync_cursor:
            record_filter = ClientSideIncrementalRecordFilterDecorator(
                config=config,
                parameters=model.parameters,
                condition=model.record_filter.condition
                if (model.record_filter and hasattr(model.record_filter, "condition"))
                else None,
                cursor=client_side_incremental_sync_cursor,
            )
            # Client-side incremental filtering flips the default: transform first so the cursor
            # field is available to the filter, unless the model says otherwise.
            transform_before_filtering = (
                True
                if model.transform_before_filtering is None
                else model.transform_before_filtering
            )

        if model.schema_normalization is None:
            # default to no schema normalization if not set
            model.schema_normalization = SchemaNormalizationModel.None_

        schema_normalization = (
            TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization])
            if isinstance(model.schema_normalization, SchemaNormalizationModel)
            else self._create_component_from_model(model.schema_normalization, config=config)  # type: ignore[arg-type] # custom normalization model expected here
        )

        return RecordSelector(
            extractor=extractor,
            name=name,
            config=config,
            record_filter=record_filter,
            transformations=transformations or [],
            file_uploader=file_uploader,
            schema_normalization=schema_normalization,
            parameters=model.parameters or {},
            transform_before_filtering=transform_before_filtering,
        )

    @staticmethod
    def create_remove_fields(
        model: RemoveFieldsModel, config: Config, **kwargs: Any
    ) -> RemoveFields:
        """Build a transformation that removes the given field pointers, optionally conditionally."""
        return RemoveFields(
            field_pointers=model.field_pointers, condition=model.condition or "", parameters={}
        )

    def create_selective_authenticator(
        self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any
    ) -> DeclarativeAuthenticator:
        """Build an authenticator that picks one of several named authenticators based on a config path."""
        authenticators = {
            name: self._create_component_from_model(model=auth, config=config)
            for name, auth in model.authenticators.items()
        }
        # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error
        return SelectiveAuthenticator(  # type: ignore[abstract]
            config=config,
            authenticators=authenticators,
            authenticator_selection_path=model.authenticator_selection_path,
            **kwargs,
        )

    @staticmethod
    def create_legacy_session_token_authenticator(
        model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any
    ) -> LegacySessionTokenAuthenticator:
        """Build the legacy session-token authenticator (login endpoint + session token header)."""
        return LegacySessionTokenAuthenticator(
            api_url=url_base,
            header=model.header,
            login_url=model.login_url,
            password=model.password or "",
            session_token=model.session_token or "",
            session_token_response_key=model.session_token_response_key or "",
            username=model.username or "",
            validate_session_url=model.validate_session_url,
            config=config,
            parameters=model.parameters or {},
        )

    def create_simple_retriever(
        self,
        model: SimpleRetrieverModel,
        config: Config,
        *,
        name: str,
        primary_key: Optional[Union[str, List[str], List[List[str]]]],
        request_options_provider: Optional[RequestOptionsProvider] = None,
        cursor: Optional[Cursor] = None,
        has_stop_condition_cursor: bool = False,
        is_client_side_incremental_sync: bool = False,
        transformations: List[RecordTransformation],
        file_uploader: Optional[DefaultFileUploader] = None,
        incremental_sync: Optional[
            Union[IncrementingCountCursorModel, DatetimeBasedCursorModel]
        ] = None,
        use_cache: Optional[bool] = None,
        log_formatter: Optional[Callable[[Response], Any]] = None,
        partition_router: Optional[PartitionRouter] = None,
        **kwargs: Any,
    ) -> SimpleRetriever:
        """
        Assemble a SimpleRetriever (or LazySimpleRetriever) from its model: record selector,
        requester, paginator, request-options provider and query-properties handling.
        NOTE: this method mutates model.requester.request_parameters to strip QueryProperties
        entries before the requester is created — statement order matters here.
        """
        def _get_url(req: Requester) -> str:
            """
            Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever.
            This is needed because the URL is not set until the requester is created.
            """

            _url: str = (
                model.requester.url
                if hasattr(model.requester, "url") and model.requester.url is not None
                else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
            )
            _url_base: str = (
                model.requester.url_base
                if hasattr(model.requester, "url_base") and model.requester.url_base is not None
                else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None)
            )

            return _url or _url_base

        if cursor is None:
            cursor = FinalStateCursor(name, None, self._message_repository)

        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        record_selector = self._create_component_from_model(
            model=model.record_selector,
            name=name,
            config=config,
            decoder=decoder,
            transformations=transformations,
            client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None,
            file_uploader=file_uploader,
        )

        query_properties: Optional[QueryProperties] = None
        query_properties_key: Optional[str] = None
        self._ensure_query_properties_to_model(model.requester)
        if self._has_query_properties_in_request_parameters(model.requester):
            # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple
            # places instead of default to request_parameters which isn't clearly documented
            if (
                hasattr(model.requester, "fetch_properties_from_endpoint")
                and model.requester.fetch_properties_from_endpoint
            ):
                raise ValueError(
                    f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
                )

            query_properties_definitions = []
            for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters()
                if isinstance(request_parameter, QueryPropertiesModel):
                    query_properties_key = key
                    query_properties_definitions.append(request_parameter)

            if len(query_properties_definitions) > 1:
                raise ValueError(
                    f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
                )

            if len(query_properties_definitions) == 1:
                query_properties = self._create_component_from_model(
                    model=query_properties_definitions[0], stream_name=name, config=config
                )

            # Removes QueryProperties components from the interpolated mappings because it has been designed
            # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly
            # instead of through jinja interpolation
            if hasattr(model.requester, "request_parameters") and isinstance(
                model.requester.request_parameters, Mapping
            ):
                model.requester.request_parameters = self._remove_query_properties(
                    model.requester.request_parameters
                )
        elif (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            # todo: Deprecate this condition once dependent connectors migrate to query_properties
            query_properties_definition = QueryPropertiesModel(
                type="QueryProperties",
                property_list=model.requester.fetch_properties_from_endpoint,
                always_include_properties=None,
                property_chunking=None,
            )  # type: ignore # $parameters has a default value

            query_properties = self.create_query_properties(
                model=query_properties_definition,
                stream_name=name,
                config=config,
            )
        elif hasattr(model.requester, "query_properties") and model.requester.query_properties:
            query_properties = self.create_query_properties(
                model=model.requester.query_properties,
                stream_name=name,
                config=config,
            )

        requester = self._create_component_from_model(
            model=model.requester,
            decoder=decoder,
            name=name,
            query_properties_key=query_properties_key,
            use_cache=use_cache,
            config=config,
        )

        if not request_options_provider:
            request_options_provider = DefaultRequestOptionsProvider(parameters={})
        # A partition router can double as the request-options provider when no explicit one is set.
        if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(
            partition_router, PartitionRouter
        ):
            request_options_provider = partition_router

        paginator = (
            self._create_component_from_model(
                model=model.paginator,
                config=config,
                url_base=_get_url(requester),
                extractor_model=model.record_selector.extractor,
                decoder=decoder,
                cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None,
            )
            if model.paginator
            else NoPagination(parameters={})
        )

        ignore_stream_slicer_parameters_on_paginated_requests = (
            model.ignore_stream_slicer_parameters_on_paginated_requests or False
        )

        # Lazy-read path: only for substreams with lazy_read_pointer, no prior state, and a
        # (single-slice) DatetimeBasedCursor + JsonDecoder setup.
        if (
            model.partition_router
            and isinstance(model.partition_router, SubstreamPartitionRouterModel)
            and not bool(self._connector_state_manager.get_stream_state(name, None))
            and any(
                parent_stream_config.lazy_read_pointer
                for parent_stream_config in model.partition_router.parent_stream_configs
            )
        ):
            if incremental_sync:
                if incremental_sync.type != "DatetimeBasedCursor":
                    raise ValueError(
                        f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                    )

                elif incremental_sync.step or incremental_sync.cursor_granularity:
                    raise ValueError(
                        f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                    )

            if model.decoder and model.decoder.type != "JsonDecoder":
                raise ValueError(
                    f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
                )

            return LazySimpleRetriever(
                name=name,
                paginator=paginator,
                primary_key=primary_key,
                requester=requester,
                record_selector=record_selector,
                stream_slicer=_NO_STREAM_SLICING,
                request_option_provider=request_options_provider,
                config=config,
                ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
                parameters=model.parameters or {},
            )

        if (
            model.record_selector.record_filter
            and model.pagination_reset
            and model.pagination_reset.limits
        ):
            raise ValueError("PaginationResetLimits are not supported while having record filter.")

        return SimpleRetriever(
            name=name,
            paginator=paginator,
            primary_key=primary_key,
            requester=requester,
            record_selector=record_selector,
            stream_slicer=_NO_STREAM_SLICING,
            request_option_provider=request_options_provider,
            config=config,
            ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
            additional_query_properties=query_properties,
            log_formatter=self._get_log_formatter(log_formatter, name),
            pagination_tracker_factory=self._create_pagination_tracker_factory(
                model.pagination_reset, cursor
            ),
            parameters=model.parameters or {},
        )

    def _create_pagination_tracker_factory(
        self, model: Optional[PaginationResetModel], cursor: Cursor
    ) -> Callable[[], PaginationTracker]:
        """Build the factory producing a PaginationTracker per use, honoring the reset action/limits."""
        if model is None:
            return lambda: PaginationTracker()
        # Until we figure out a way to use any cursor for PaginationTracker, we will have to have this cursor selector logic
        cursor_factory: Callable[[], Optional[ConcurrentCursor]] = lambda: None
        if model.action == PaginationResetActionModel.RESET:
            # in that case, we will let cursor_factory to return None even if the stream has a cursor
            pass
        elif model.action == PaginationResetActionModel.SPLIT_USING_CURSOR:
            if isinstance(cursor, ConcurrentCursor):
                cursor_factory = lambda: cursor.copy_without_state()  # type: ignore # the if condition validates that it is a ConcurrentCursor
            elif isinstance(cursor, ConcurrentPerPartitionCursor):
                cursor_factory = lambda: cursor._cursor_factory.create(  # type: ignore # if this becomes a problem, we would need to extract the cursor_factory instantiation logic and make it accessible here
                    {}, datetime.timedelta(0)
                )
            elif not isinstance(cursor, FinalStateCursor):
                LOGGER.warning(
                    "Unknown cursor for PaginationTracker. Pagination resets might not work properly"
                )
        else:
            raise ValueError(f"Unknown PaginationReset action: {model.action}")

        limit = model.limits.number_of_records if model and model.limits else None
        return lambda: PaginationTracker(cursor_factory(), limit)

    def _get_log_formatter(
        self, log_formatter: Callable[[Response], Any] | None, name: str
    ) -> Callable[[Response], Any] | None:
        """Return the log formatter to use, defaulting to an HTTP-message formatter when slice-limited (tests/builder)."""
        if self._should_limit_slices_fetched():
            return (
                (
                    lambda response: format_http_message(
                        response,
                        f"Stream '{name}' request",
                        f"Request performed in order to extract records for stream '{name}'",
                        name,
                    )
                )
                if not log_formatter
                else log_formatter
            )
        return None

    def _should_limit_slices_fetched(self) -> bool:
        """
        Returns True if the number of slices fetched should be limited, False otherwise.
        This is used to limit the number of slices fetched during tests.
        """
        return bool(self._limit_slices_fetched or self._emit_connector_builder_messages)

    @staticmethod
    def _has_query_properties_in_request_parameters(
        requester: Union[HttpRequesterModel, CustomRequesterModel],
    ) -> bool:
        """Return True when any request_parameters value is a QueryPropertiesModel."""
        if not hasattr(requester, "request_parameters"):
            return False
        request_parameters = requester.request_parameters
        if request_parameters and isinstance(request_parameters, Mapping):
            for request_parameter in request_parameters.values():
                if isinstance(request_parameter, QueryPropertiesModel):
                    return True
        return False

    @staticmethod
    def _remove_query_properties(
        request_parameters: Mapping[str, Union[str, QueryPropertiesModel]],
    ) -> Mapping[str, str]:
        """Return request_parameters with all QueryPropertiesModel entries stripped out."""
        return {
            parameter_field: request_parameter
            for parameter_field, request_parameter in request_parameters.items()
            if not isinstance(request_parameter, QueryPropertiesModel)
        }

    def create_state_delegating_stream(
        self,
        model: StateDelegatingStreamModel,
        config: Config,
        **kwargs: Any,
    ) -> DefaultStream:
        """Build a stream that delegates to its full-refresh or incremental variant based on existing state."""
        if (
            model.full_refresh_stream.name != model.name
            or model.name != model.incremental_stream.name
        ):
            raise ValueError(
                f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
3566 ) 3567 3568 # Resolve api_retention_period with config context (supports Jinja2 interpolation) 3569 resolved_retention_period: Optional[str] = None 3570 if model.api_retention_period: 3571 interpolated_retention = InterpolatedString.create( 3572 model.api_retention_period, parameters=model.parameters or {} 3573 ) 3574 resolved_value = interpolated_retention.eval(config=config) 3575 if resolved_value: 3576 resolved_retention_period = str(resolved_value) 3577 3578 if resolved_retention_period: 3579 for stream_model in (model.full_refresh_stream, model.incremental_stream): 3580 if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel): 3581 raise ValueError( 3582 f"Stream '{model.name}' uses IncrementingCountCursor which is not supported " 3583 f"with api_retention_period. IncrementingCountCursor does not use datetime-based " 3584 f"cursors, so cursor age validation cannot be performed." 3585 ) 3586 3587 stream_state = self._connector_state_manager.get_stream_state(model.name, None) 3588 3589 if not stream_state: 3590 return self._create_component_from_model( # type: ignore[no-any-return] 3591 model.full_refresh_stream, config=config, **kwargs 3592 ) 3593 3594 incremental_stream: DefaultStream = self._create_component_from_model( 3595 model.incremental_stream, config=config, **kwargs 3596 ) # type: ignore[assignment] 3597 3598 # Only run cursor age validation for streams that are in the configured 3599 # catalog (or when no catalog was provided, e.g. during discover / connector 3600 # builder). Streams not selected by the user but instantiated as parent-stream 3601 # dependencies must not go through this path because it emits state messages 3602 # that the destination does not know about, causing "Stream not found" crashes. 
3603 stream_is_in_catalog = ( 3604 not self._stream_name_to_configured_stream # no catalog → validate by default 3605 or model.name in self._stream_name_to_configured_stream 3606 ) 3607 if resolved_retention_period and stream_is_in_catalog: 3608 full_refresh_stream: DefaultStream = self._create_component_from_model( 3609 model.full_refresh_stream, config=config, **kwargs 3610 ) # type: ignore[assignment] 3611 if self._is_cursor_older_than_retention_period( 3612 stream_state, 3613 full_refresh_stream.cursor, 3614 incremental_stream.cursor, 3615 resolved_retention_period, 3616 model.name, 3617 ): 3618 # Clear state BEFORE constructing the full_refresh_stream so that 3619 # its cursor starts from start_date instead of the stale cursor. 3620 self._connector_state_manager.update_state_for_stream(model.name, None, {}) 3621 state_message = self._connector_state_manager.create_state_message(model.name, None) 3622 self._message_repository.emit_message(state_message) 3623 return self._create_component_from_model( # type: ignore[no-any-return] 3624 model.full_refresh_stream, config=config, **kwargs 3625 ) 3626 3627 return incremental_stream 3628 3629 @staticmethod 3630 def _is_cursor_older_than_retention_period( 3631 stream_state: Mapping[str, Any], 3632 full_refresh_cursor: Cursor, 3633 incremental_cursor: Cursor, 3634 api_retention_period: str, 3635 stream_name: str, 3636 ) -> bool: 3637 """Check if the cursor value in the state is older than the API's retention period. 3638 3639 Checks cursors in sequence: full refresh cursor first, then incremental cursor. 3640 FinalStateCursor returns now() for completed full refresh state (NO_CURSOR_STATE_KEY), 3641 which is always within retention, so we use incremental. For other states, it returns 3642 None and we fall back to checking the incremental cursor. 3643 3644 Returns True if the cursor is older than the retention period (should use full refresh). 
3645 Returns False if the cursor is within the retention period (safe to use incremental). 3646 """ 3647 retention_duration = parse_duration(api_retention_period) 3648 retention_cutoff = datetime.datetime.now(datetime.timezone.utc) - retention_duration 3649 3650 # Check full refresh cursor first 3651 cursor_datetime = full_refresh_cursor.get_cursor_datetime_from_state(stream_state) 3652 3653 # If full refresh cursor returns None, check incremental cursor 3654 if cursor_datetime is None: 3655 cursor_datetime = incremental_cursor.get_cursor_datetime_from_state(stream_state) 3656 3657 if cursor_datetime is None: 3658 # Neither cursor could parse the state - fall back to full refresh to be safe 3659 return True 3660 3661 if cursor_datetime < retention_cutoff: 3662 logging.warning( 3663 f"Stream '{stream_name}' has a cursor value older than " 3664 f"the API's retention period of {api_retention_period} " 3665 f"(cutoff: {retention_cutoff.isoformat()}). " 3666 f"Falling back to full refresh to avoid data loss." 
            )
            return True

        return False

    def _get_state_delegating_stream_model(
        self,
        model: StateDelegatingStreamModel,
        parent_state: Optional[Mapping[str, Any]] = None,
    ) -> DeclarativeStreamModel:
        """Return the appropriate underlying stream model based on state."""
        # Incremental variant is selected as soon as any state exists for the
        # stream, either its own saved state or an explicitly provided parent state.
        return (
            model.incremental_stream
            if self._connector_state_manager.get_stream_state(model.name, None) or parent_state
            else model.full_refresh_stream
        )

    def _create_async_job_status_mapping(
        self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any
    ) -> Mapping[str, AsyncJobStatus]:
        # Invert the model mapping (CDK status -> list of API statuses) into
        # API status -> CDK status, rejecting duplicate API statuses.
        api_status_to_cdk_status = {}
        for cdk_status, api_statuses in model.dict().items():
            if cdk_status == "type":
                # This is an element of the dict because of the typing of the CDK but it is not a CDK status
                continue

            for status in api_statuses:
                if status in api_status_to_cdk_status:
                    raise ValueError(
                        f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once"
                    )
                api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status)
        return api_status_to_cdk_status

    def _get_async_job_status(self, status: str) -> AsyncJobStatus:
        # Map the manifest's status keys onto the CDK's AsyncJobStatus enum.
        match status:
            case "running":
                return AsyncJobStatus.RUNNING
            case "completed":
                return AsyncJobStatus.COMPLETED
            case "failed":
                return AsyncJobStatus.FAILED
            case "timeout":
                return AsyncJobStatus.TIMED_OUT
            case _:
                raise ValueError(f"Unsupported CDK status {status}")

    def create_async_retriever(
        self,
        model: AsyncRetrieverModel,
        config: Config,
        *,
        name: str,
        primary_key: Optional[
            Union[str, List[str], List[List[str]]]
        ],  # this seems to be needed to match create_simple_retriever
        stream_slicer: Optional[StreamSlicer],
        client_side_incremental_sync: Optional[Dict[str, Any]] = None,
        transformations: List[RecordTransformation],
        **kwargs: Any,
    ) -> AsyncRetriever:
        # A download_target_requester produces a URL whose payload must still be
        # located by an extractor, hence the pairing requirement.
        # NOTE(review): the f-string below has no placeholders; a plain string would do.
        if model.download_target_requester and not model.download_target_extractor:
            raise ValueError(
                f"`download_target_extractor` required if using a `download_target_requester`"
            )

        def _get_download_retriever(
            requester: Requester, extractor: RecordExtractor, _decoder: Decoder
        ) -> SimpleRetriever:
            # We create a record selector for the download retriever
            # with no schema normalization and no transformations, neither record filter
            # as all this occurs in the record_selector of the AsyncRetriever
            record_selector = RecordSelector(
                extractor=extractor,
                name=name,
                record_filter=None,
                transformations=[],
                schema_normalization=TypeTransformer(TransformConfig.NoTransform),
                config=config,
                parameters={},
            )
            paginator = (
                self._create_component_from_model(
                    model=model.download_paginator,
                    decoder=_decoder,
                    config=config,
                    url_base="",
                )
                if model.download_paginator
                else NoPagination(parameters={})
            )

            return SimpleRetriever(
                requester=requester,
                record_selector=record_selector,
                primary_key=None,
                name=name,
                paginator=paginator,
                config=config,
                parameters={},
                log_formatter=self._get_log_formatter(None, name),
            )

        def _get_job_timeout() -> datetime.timedelta:
            # Optional user-defined timeout (minutes), interpolated from the manifest.
            user_defined_timeout: Optional[int] = (
                int(
                    InterpolatedString.create(
                        str(model.polling_job_timeout),
                        parameters={},
                    ).eval(config)
                )
                if model.polling_job_timeout
                else None
            )

            # check for user defined timeout during the test read or 15 minutes
            test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15)
            # default value for non-connector builder is 60 minutes.
            default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60)

            return (
                test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout
            )

        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        record_selector = self._create_component_from_model(
            model=model.record_selector,
            config=config,
            decoder=decoder,
            name=name,
            transformations=transformations,
            client_side_incremental_sync=client_side_incremental_sync,
        )

        stream_slicer = stream_slicer or SinglePartitionRouter(parameters={})
        if self._should_limit_slices_fetched():
            # Test reads cap the number of slices to keep output bounded.
            stream_slicer = cast(
                StreamSlicer,
                StreamSlicerTestReadDecorator(
                    wrapped_slicer=stream_slicer,
                    maximum_number_of_slices=self._limit_slices_fetched or 5,
                ),
            )

        # The async flow needs up to six requesters: create / poll / download,
        # plus optional abort, delete, and download-target (URL extraction) steps.
        creation_requester = self._create_component_from_model(
            model=model.creation_requester,
            decoder=decoder,
            config=config,
            name=f"job creation - {name}",
        )

        polling_requester = self._create_component_from_model(
            model=model.polling_requester,
            decoder=decoder,
            config=config,
            name=f"job polling - {name}",
        )
        job_download_components_name = f"job download - {name}"
        download_decoder = (
            self._create_component_from_model(model=model.download_decoder, config=config)
            if model.download_decoder
            else JsonDecoder(parameters={})
        )
        download_extractor = (
            self._create_component_from_model(
                model=model.download_extractor,
                config=config,
                decoder=download_decoder,
                parameters=model.parameters,
            )
            if model.download_extractor
            else DpathExtractor(
                [],
                config=config,
                decoder=download_decoder,
                parameters=model.parameters or {},
            )
        )
        download_requester = self._create_component_from_model(
            model=model.download_requester,
            decoder=download_decoder,
            config=config,
            name=job_download_components_name,
        )
        download_retriever = _get_download_retriever(
            download_requester, download_extractor, download_decoder
        )
        abort_requester = (
            self._create_component_from_model(
                model=model.abort_requester,
                decoder=decoder,
                config=config,
                name=f"job abort - {name}",
            )
            if model.abort_requester
            else None
        )
        delete_requester = (
            self._create_component_from_model(
                model=model.delete_requester,
                decoder=decoder,
                config=config,
                name=f"job delete - {name}",
            )
            if model.delete_requester
            else None
        )
        download_target_requester = (
            self._create_component_from_model(
                model=model.download_target_requester,
                decoder=decoder,
                config=config,
                name=f"job extract_url - {name}",
            )
            if model.download_target_requester
            else None
        )
        status_extractor = self._create_component_from_model(
            model=model.status_extractor, decoder=decoder, config=config, name=name
        )

        download_target_extractor = (
            self._create_component_from_model(
                model=model.download_target_extractor,
                decoder=decoder,
                config=config,
                name=name,
            )
            if model.download_target_extractor
            else None
        )

        job_repository: AsyncJobRepository = AsyncHttpJobRepository(
            creation_requester=creation_requester,
            polling_requester=polling_requester,
            download_retriever=download_retriever,
            download_target_requester=download_target_requester,
            abort_requester=abort_requester,
            delete_requester=delete_requester,
            status_extractor=status_extractor,
            status_mapping=self._create_async_job_status_mapping(model.status_mapping, config),
            download_target_extractor=download_target_extractor,
            job_timeout=_get_job_timeout(),
        )

        async_job_partition_router = AsyncJobPartitionRouter(
            job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
                job_repository,
                stream_slices,
                self._job_tracker,
                self._message_repository,
                # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk
                has_bulk_parent=False,
                # set the `job_max_retry` to 1 for the `Connector Builder`` use-case.
                # `None` == default retry is set to 3 attempts, under the hood.
3924 job_max_retry=1 if self._emit_connector_builder_messages else None, 3925 ), 3926 stream_slicer=stream_slicer, 3927 config=config, 3928 parameters=model.parameters or {}, 3929 ) 3930 3931 return AsyncRetriever( 3932 record_selector=record_selector, 3933 stream_slicer=async_job_partition_router, 3934 config=config, 3935 parameters=model.parameters or {}, 3936 ) 3937 3938 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3939 config_migrations = [ 3940 self._create_component_from_model(migration, config) 3941 for migration in ( 3942 model.config_normalization_rules.config_migrations 3943 if ( 3944 model.config_normalization_rules 3945 and model.config_normalization_rules.config_migrations 3946 ) 3947 else [] 3948 ) 3949 ] 3950 config_transformations = [ 3951 self._create_component_from_model(transformation, config) 3952 for transformation in ( 3953 model.config_normalization_rules.transformations 3954 if ( 3955 model.config_normalization_rules 3956 and model.config_normalization_rules.transformations 3957 ) 3958 else [] 3959 ) 3960 ] 3961 config_validations = [ 3962 self._create_component_from_model(validation, config) 3963 for validation in ( 3964 model.config_normalization_rules.validations 3965 if ( 3966 model.config_normalization_rules 3967 and model.config_normalization_rules.validations 3968 ) 3969 else [] 3970 ) 3971 ] 3972 3973 return Spec( 3974 connection_specification=model.connection_specification, 3975 documentation_url=model.documentation_url, 3976 advanced_auth=model.advanced_auth, 3977 parameters={}, 3978 config_migrations=config_migrations, 3979 config_transformations=config_transformations, 3980 config_validations=config_validations, 3981 ) 3982 3983 def create_substream_partition_router( 3984 self, 3985 model: SubstreamPartitionRouterModel, 3986 config: Config, 3987 *, 3988 stream_name: str, 3989 **kwargs: Any, 3990 ) -> SubstreamPartitionRouter: 3991 parent_stream_configs = [] 3992 if model.parent_stream_configs: 
3993 parent_stream_configs.extend( 3994 [ 3995 self.create_parent_stream_config_with_substream_wrapper( 3996 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 3997 ) 3998 for parent_stream_config in model.parent_stream_configs 3999 ] 4000 ) 4001 4002 return SubstreamPartitionRouter( 4003 parent_stream_configs=parent_stream_configs, 4004 parameters=model.parameters or {}, 4005 config=config, 4006 ) 4007 4008 def create_parent_stream_config_with_substream_wrapper( 4009 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 4010 ) -> Any: 4011 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 4012 4013 parent_state: Optional[Mapping[str, Any]] = ( 4014 child_state if model.incremental_dependency and child_state else None 4015 ) 4016 connector_state_manager = self._instantiate_parent_stream_state_manager( 4017 child_state, config, model, parent_state 4018 ) 4019 4020 substream_factory = ModelToComponentFactory( 4021 connector_state_manager=connector_state_manager, 4022 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 4023 limit_slices_fetched=self._limit_slices_fetched, 4024 emit_connector_builder_messages=self._emit_connector_builder_messages, 4025 disable_retries=self._disable_retries, 4026 disable_cache=self._disable_cache, 4027 message_repository=StateFilteringMessageRepository( 4028 LogAppenderMessageRepositoryDecorator( 4029 { 4030 "airbyte_cdk": {"stream": {"is_substream": True}}, 4031 "http": {"is_auxiliary": True}, 4032 }, 4033 self._message_repository, 4034 self._evaluate_log_level(self._emit_connector_builder_messages), 4035 ), 4036 ), 4037 api_budget=self._api_budget, 4038 ) 4039 4040 return substream_factory.create_parent_stream_config( 4041 model=model, config=config, stream_name=stream_name, **kwargs 4042 ) 4043 4044 def _instantiate_parent_stream_state_manager( 4045 self, 4046 child_state: MutableMapping[str, Any], 4047 config: Config, 4048 
model: ParentStreamConfigModel, 4049 parent_state: Optional[Mapping[str, Any]] = None, 4050 ) -> ConnectorStateManager: 4051 """ 4052 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 4053 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 4054 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 4055 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 4056 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 4057 incremental_dependency is set. 4058 """ 4059 if model.incremental_dependency and child_state: 4060 parent_stream_name = model.stream.name or "" 4061 extracted_parent_state = ConcurrentPerPartitionCursor.get_parent_state( 4062 child_state, parent_stream_name 4063 ) 4064 4065 if not extracted_parent_state: 4066 extracted_parent_state = ConcurrentPerPartitionCursor.get_global_state( 4067 child_state, parent_stream_name 4068 ) 4069 4070 if not extracted_parent_state and not isinstance(extracted_parent_state, dict): 4071 cursor_values = child_state.values() 4072 if cursor_values and len(cursor_values) == 1: 4073 incremental_sync_model: Union[ 4074 DatetimeBasedCursorModel, 4075 IncrementingCountCursorModel, 4076 ] = ( 4077 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 4078 if isinstance(model.stream, DeclarativeStreamModel) 4079 else self._get_state_delegating_stream_model( 4080 model.stream, parent_state=parent_state 4081 ).incremental_sync 4082 ) 4083 cursor_field = InterpolatedString.create( 4084 incremental_sync_model.cursor_field, 4085 parameters=incremental_sync_model.parameters or {}, 4086 ).eval(config) 4087 extracted_parent_state = AirbyteStateMessage( 4088 
type=AirbyteStateType.STREAM, 4089 stream=AirbyteStreamState( 4090 stream_descriptor=StreamDescriptor( 4091 name=parent_stream_name, namespace=None 4092 ), 4093 stream_state=AirbyteStateBlob( 4094 {cursor_field: list(cursor_values)[0]} 4095 ), 4096 ), 4097 ) 4098 return ConnectorStateManager([extracted_parent_state] if extracted_parent_state else []) 4099 4100 return ConnectorStateManager([]) 4101 4102 @staticmethod 4103 def create_wait_time_from_header( 4104 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 4105 ) -> WaitTimeFromHeaderBackoffStrategy: 4106 return WaitTimeFromHeaderBackoffStrategy( 4107 header=model.header, 4108 parameters=model.parameters or {}, 4109 config=config, 4110 regex=model.regex, 4111 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 4112 if model.max_waiting_time_in_seconds is not None 4113 else None, 4114 ) 4115 4116 @staticmethod 4117 def create_wait_until_time_from_header( 4118 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 4119 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 4120 return WaitUntilTimeFromHeaderBackoffStrategy( 4121 header=model.header, 4122 parameters=model.parameters or {}, 4123 config=config, 4124 min_wait=model.min_wait, 4125 regex=model.regex, 4126 ) 4127 4128 def get_message_repository(self) -> MessageRepository: 4129 return self._message_repository 4130 4131 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 4132 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 4133 4134 @staticmethod 4135 def create_components_mapping_definition( 4136 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 4137 ) -> ComponentMappingDefinition: 4138 interpolated_value = InterpolatedString.create( 4139 model.value, parameters=model.parameters or {} 4140 ) 4141 field_path = [ 4142 InterpolatedString.create(path, parameters=model.parameters or {}) 4143 for path in model.field_path 4144 ] 4145 return ComponentMappingDefinition( 
            field_path=field_path,  # type: ignore[arg-type] # field_path can be str and InterpolatedString
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            create_or_update=model.create_or_update,
            condition=model.condition,
            parameters=model.parameters or {},
        )

    def create_http_components_resolver(
        self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None
    ) -> Any:
        retriever = self._create_component_from_model(
            model=model.retriever,
            config=config,
            name=f"{stream_name if stream_name else '__http_components_resolver'}",
            primary_key=None,
            stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config),
            transformations=[],
        )

        components_mapping = []
        for component_mapping_definition_model in model.components_mapping:
            if component_mapping_definition_model.condition:
                raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
            components_mapping.append(
                self._create_component_from_model(
                    model=component_mapping_definition_model,
                    value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                        component_mapping_definition_model.value_type
                    ),
                    config=config,
                )
            )

        # NOTE(review): a second, distinct stream slicer is built here (one for the
        # retriever above, one for the resolver) — confirm whether sharing a single
        # instance would be safe before deduplicating the two calls.
        return HttpComponentsResolver(
            retriever=retriever,
            stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config),
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_stream_config(
        model: StreamConfigModel, config: Config, **kwargs: Any
    ) -> StreamConfig:
        # Copy the pointer list (entries may be plain strings or InterpolatedString).
        model_configs_pointer: List[Union[InterpolatedString, str]] = (
            [x for x in model.configs_pointer] if model.configs_pointer else []
        )

        return StreamConfig(
            configs_pointer=model_configs_pointer,
            default_values=model.default_values,
            parameters=model.parameters or {},
        )

    def create_config_components_resolver(
        self,
        model: ConfigComponentsResolverModel,
        config: Config,
    ) -> Any:
        # `stream_config` may be a single model or a list; normalize to a list.
        model_stream_configs = (
            model.stream_config if isinstance(model.stream_config, list) else [model.stream_config]
        )

        stream_configs = [
            self._create_component_from_model(
                stream_config, config=config, parameters=model.parameters or {}
            )
            for stream_config in model_stream_configs
        ]

        components_mapping = [
            self._create_component_from_model(
                model=components_mapping_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    components_mapping_definition_model.value_type
                ),
                config=config,
                parameters=model.parameters,
            )
            for components_mapping_definition_model in model.components_mapping
        ]

        return ConfigComponentsResolver(
            stream_configs=stream_configs,
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )

    def create_parametrized_components_resolver(
        self,
        model: ParametrizedComponentsResolverModel,
        config: Config,
    ) -> ParametrizedComponentsResolver:
        stream_parameters = StreamParametersDefinition(
            list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream
        )

        components_mapping = []
        for components_mapping_definition_model in model.components_mapping:
            # Conditions are only meaningful when resolving against config values.
            if components_mapping_definition_model.condition:
                raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
            components_mapping.append(
                self._create_component_from_model(
                    model=components_mapping_definition_model,
                    value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                        components_mapping_definition_model.value_type
                    ),
                    config=config,
                )
            )
        return ParametrizedComponentsResolver(
            stream_parameters=stream_parameters,
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )

    # Error template used when a paginator is configured with a decoder whose
    # output cannot be introspected for pagination values.
    _UNSUPPORTED_DECODER_ERROR = (
        "Specified decoder of {decoder_type} is not supported for pagination."
        "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead."
        "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`."
    )

    def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool:
        # Pagination needs a structured (JSON/XML) payload to read page tokens from.
        if isinstance(decoder, (JsonDecoder, XmlDecoder)):
            return True
        elif isinstance(decoder, CompositeRawDecoder):
            return self._is_supported_parser_for_pagination(decoder.parser)
        else:
            return False

    def _is_supported_parser_for_pagination(self, parser: Parser) -> bool:
        if isinstance(parser, JsonParser):
            return True
        elif isinstance(parser, GzipParser):
            # Gzip is acceptable as long as the decompressed payload is JSON.
            return isinstance(parser.inner_parser, JsonParser)
        else:
            return False

    def create_http_api_budget(
        self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any
    ) -> HttpAPIBudget:
        policies = [
            self._create_component_from_model(model=policy, config=config)
            for policy in model.policies
        ]

        return HttpAPIBudget(
            policies=policies,
            ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset",
            ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining",
            status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429],
        )

    def create_fixed_window_call_rate_policy(
        self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> FixedWindowCallRatePolicy:
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]

        # Set the initial reset timestamp to 10 days from now.
        # This value will be updated by the first request.
        # NOTE(review): datetime.now() here is naive while other code paths use
        # timezone-aware datetimes — confirm the rate limiter expects naive times.
        return FixedWindowCallRatePolicy(
            next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10),
            period=parse_duration(model.period),
            call_limit=model.call_limit,
            matchers=matchers,
        )

    def create_file_uploader(
        self, model: FileUploaderModel, config: Config, **kwargs: Any
    ) -> FileUploader:
        name = "File Uploader"
        requester = self._create_component_from_model(
            model=model.requester,
            config=config,
            name=name,
            **kwargs,
        )
        download_target_extractor = self._create_component_from_model(
            model=model.download_target_extractor,
            config=config,
            name=name,
            **kwargs,
        )
        emit_connector_builder_messages = self._emit_connector_builder_messages
        # Connector-builder runs must not write to the local filesystem, so a
        # no-op writer is substituted there.
        file_uploader = DefaultFileUploader(
            requester=requester,
            download_target_extractor=download_target_extractor,
            config=config,
            file_writer=NoopFileWriter()
            if emit_connector_builder_messages
            else LocalFileSystemFileWriter(),
            parameters=model.parameters or {},
            filename_extractor=model.filename_extractor if model.filename_extractor else None,
        )

        return (
            ConnectorBuilderFileUploader(file_uploader)
            if emit_connector_builder_messages
            else file_uploader
        )

    def create_moving_window_call_rate_policy(
        self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> MovingWindowCallRatePolicy:
        rates = [
            self._create_component_from_model(model=rate, config=config) for rate in model.rates
        ]
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]
        return MovingWindowCallRatePolicy(
            rates=rates,
            matchers=matchers,
        )

    def create_unlimited_call_rate_policy(
        self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> UnlimitedCallRatePolicy:
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]

        return UnlimitedCallRatePolicy(
            matchers=matchers,
        )

    def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate:
        # `limit` may be an interpolated string; evaluate it, then coerce to int.
        interpolated_limit = InterpolatedString.create(str(model.limit), parameters={})
        return Rate(
            limit=int(interpolated_limit.eval(config=config)),
            interval=parse_duration(model.interval),
        )

    def create_http_request_matcher(
        self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any
    ) -> HttpRequestRegexMatcher:
        weight = model.weight
        if weight is not None:
            if isinstance(weight, str):
                # Interpolated weights arrive as strings; evaluate before coercing.
                weight = int(InterpolatedString.create(weight, parameters={}).eval(config))
            else:
                weight = int(weight)
            if weight < 1:
                raise ValueError(f"weight must be >= 1, got {weight}")
        return HttpRequestRegexMatcher(
            method=model.method,
            url_base=model.url_base,
            url_path_pattern=model.url_path_pattern,
            params=model.params,
            headers=model.headers,
            weight=weight,
        )

    def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None:
        # Stores the budget on the factory so requesters created afterwards share it.
        self._api_budget = self.create_component(
            model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config
        )

    def create_grouping_partition_router(
        self,
        model: GroupingPartitionRouterModel,
        config: Config,
        *,
        stream_name: str,
        **kwargs: Any,
    ) -> GroupingPartitionRouter:
        underlying_router = self._create_component_from_model(
            model=model.underlying_partition_router,
            config=config,
            stream_name=stream_name,
            **kwargs,
        )
        if model.group_size < 1:
            raise ValueError(f"Group size must be greater than 0, got {model.group_size}")

        # Request options in underlying partition routers are not supported for GroupingPartitionRouter
        # because they are specific to individual partitions and cannot be aggregated or handled
        # when grouping, potentially leading to incorrect API calls. Any request customization
        # should be managed at the stream level through the requester's configuration.
        if isinstance(underlying_router, SubstreamPartitionRouter):
            if any(
                parent_config.request_option
                for parent_config in underlying_router.parent_stream_configs
            ):
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        if isinstance(underlying_router, ListPartitionRouter):
            if underlying_router.request_option:
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        return GroupingPartitionRouter(
            group_size=model.group_size,
            underlying_partition_router=underlying_router,
            deduplicate=model.deduplicate if model.deduplicate is not None else True,
            config=config,
        )

    def _ensure_query_properties_to_model(
        self, requester: Union[HttpRequesterModel, CustomRequesterModel]
    ) -> None:
        """
        For some reason, it seems like CustomRequesterModel request_parameters stays as dictionaries which means that
        the other conditions relying on it being QueryPropertiesModel instead of a dict fail. Here, we migrate them to
        proper model.
4459 """ 4460 if not hasattr(requester, "request_parameters"): 4461 return 4462 4463 request_parameters = requester.request_parameters 4464 if request_parameters and isinstance(request_parameters, Dict): 4465 for request_parameter_key in request_parameters.keys(): 4466 request_parameter = request_parameters[request_parameter_key] 4467 if ( 4468 isinstance(request_parameter, Dict) 4469 and request_parameter.get("type") == "QueryProperties" 4470 ): 4471 request_parameters[request_parameter_key] = QueryPropertiesModel.parse_obj( 4472 request_parameter 4473 ) 4474 4475 def _get_catalog_defined_cursor_field( 4476 self, stream_name: str, allow_catalog_defined_cursor_field: bool 4477 ) -> Optional[CursorField]: 4478 if not allow_catalog_defined_cursor_field: 4479 return None 4480 4481 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 4482 4483 # Depending on the operation is being performed, there may not be a configured stream yet. In this 4484 # case we return None which will then use the default cursor field defined on the cursor model. 4485 # We also treat cursor_field: [""] (list with empty string) as no cursor field, since this can 4486 # occur when the platform serializes "no cursor configured" streams incorrectly. 4487 if ( 4488 not configured_stream 4489 or not configured_stream.cursor_field 4490 or not configured_stream.cursor_field[0] 4491 ): 4492 return None 4493 elif len(configured_stream.cursor_field) > 1: 4494 raise ValueError( 4495 f"The `{stream_name}` stream does not support nested cursor_field. Please specify only a single cursor_field for the stream in the configured catalog." 4496 ) 4497 else: 4498 return CursorField( 4499 cursor_field_key=configured_stream.cursor_field[0], 4500 supports_catalog_defined_cursor_field=allow_catalog_defined_cursor_field, 4501 )
681 def __init__( 682 self, 683 limit_pages_fetched_per_slice: Optional[int] = None, 684 limit_slices_fetched: Optional[int] = None, 685 emit_connector_builder_messages: bool = False, 686 disable_retries: bool = False, 687 disable_cache: bool = False, 688 message_repository: Optional[MessageRepository] = None, 689 connector_state_manager: Optional[ConnectorStateManager] = None, 690 max_concurrent_async_job_count: Optional[int] = None, 691 configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, 692 api_budget: Optional[APIBudget] = None, 693 ): 694 self._init_mappings() 695 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 696 self._limit_slices_fetched = limit_slices_fetched 697 self._emit_connector_builder_messages = emit_connector_builder_messages 698 self._disable_retries = disable_retries 699 self._disable_cache = disable_cache 700 self._message_repository = message_repository or InMemoryMessageRepository( 701 self._evaluate_log_level(emit_connector_builder_messages) 702 ) 703 self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream( 704 configured_catalog 705 ) 706 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 707 self._api_budget: Optional[Union[APIBudget]] = api_budget 708 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 709 # placeholder for deprecation warnings 710 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
833 def create_component( 834 self, 835 model_type: Type[BaseModel], 836 component_definition: ComponentDefinition, 837 config: Config, 838 **kwargs: Any, 839 ) -> Any: 840 """ 841 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 842 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 843 creating declarative components from that model. 844 845 :param model_type: The type of declarative component that is being initialized 846 :param component_definition: The mapping that represents a declarative component 847 :param config: The connector config that is provided by the customer 848 :return: The declarative component to be used at runtime 849 """ 850 851 component_type = component_definition.get("type") 852 if component_definition.get("type") != model_type.__name__: 853 raise ValueError( 854 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 855 ) 856 857 declarative_component_model = model_type.parse_obj(component_definition) 858 859 if not isinstance(declarative_component_model, model_type): 860 raise ValueError( 861 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 862 ) 863 864 return self._create_component_from_model( 865 model=declarative_component_model, config=config, **kwargs 866 )
Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating declarative components from that model.
Parameters
- model_type: The type of declarative component that is being initialized
- component_definition: The mapping that represents a declarative component
- config: The connector config that is provided by the customer
Returns
The declarative component to be used at runtime
883 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 884 """ 885 Returns the deprecation warnings that were collected during the creation of components. 886 """ 887 return self._collected_deprecation_logs
Returns the deprecation warnings that were collected during the creation of components.
904 def create_config_migration( 905 self, model: ConfigMigrationModel, config: Config 906 ) -> ConfigMigration: 907 transformations: List[ConfigTransformation] = [ 908 self._create_component_from_model(transformation, config) 909 for transformation in model.transformations 910 ] 911 912 return ConfigMigration( 913 description=model.description, 914 transformations=transformations, 915 )
917 def create_config_add_fields( 918 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 919 ) -> ConfigAddFields: 920 fields = [self._create_component_from_model(field, config) for field in model.fields] 921 return ConfigAddFields( 922 fields=fields, 923 condition=model.condition or "", 924 )
973 @staticmethod 974 def create_added_field_definition( 975 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 976 ) -> AddedFieldDefinition: 977 interpolated_value = InterpolatedString.create( 978 model.value, parameters=model.parameters or {} 979 ) 980 return AddedFieldDefinition( 981 path=model.path, 982 value=interpolated_value, 983 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 984 parameters=model.parameters or {}, 985 )
987 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 988 added_field_definitions = [ 989 self._create_component_from_model( 990 model=added_field_definition_model, 991 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 992 added_field_definition_model.value_type 993 ), 994 config=config, 995 ) 996 for added_field_definition_model in model.fields 997 ] 998 return AddFields( 999 fields=added_field_definitions, 1000 condition=model.condition or "", 1001 parameters=model.parameters or {}, 1002 )
1028 def create_dpath_flatten_fields( 1029 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 1030 ) -> DpathFlattenFields: 1031 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 1032 key_transformation = ( 1033 KeyTransformation( 1034 config=config, 1035 prefix=model.key_transformation.prefix, 1036 suffix=model.key_transformation.suffix, 1037 parameters=model.parameters or {}, 1038 ) 1039 if model.key_transformation is not None 1040 else None 1041 ) 1042 return DpathFlattenFields( 1043 config=config, 1044 field_path=model_field_path, 1045 delete_origin_value=model.delete_origin_value 1046 if model.delete_origin_value is not None 1047 else False, 1048 replace_record=model.replace_record if model.replace_record is not None else False, 1049 key_transformation=key_transformation, 1050 parameters=model.parameters or {}, 1051 )
1065 def create_api_key_authenticator( 1066 self, 1067 model: ApiKeyAuthenticatorModel, 1068 config: Config, 1069 token_provider: Optional[TokenProvider] = None, 1070 **kwargs: Any, 1071 ) -> ApiKeyAuthenticator: 1072 if model.inject_into is None and model.header is None: 1073 raise ValueError( 1074 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1075 ) 1076 1077 if model.inject_into is not None and model.header is not None: 1078 raise ValueError( 1079 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1080 ) 1081 1082 if token_provider is not None and model.api_token != "": 1083 raise ValueError( 1084 "If token_provider is set, api_token is ignored and has to be set to empty string." 1085 ) 1086 1087 request_option = ( 1088 self._create_component_from_model( 1089 model.inject_into, config, parameters=model.parameters or {} 1090 ) 1091 if model.inject_into 1092 else RequestOption( 1093 inject_into=RequestOptionType.header, 1094 field_name=model.header or "", 1095 parameters=model.parameters or {}, 1096 ) 1097 ) 1098 1099 return ApiKeyAuthenticator( 1100 token_provider=( 1101 token_provider 1102 if token_provider is not None 1103 else InterpolatedStringTokenProvider( 1104 api_token=model.api_token or "", 1105 config=config, 1106 parameters=model.parameters or {}, 1107 ) 1108 ), 1109 request_option=request_option, 1110 config=config, 1111 parameters=model.parameters or {}, 1112 )
    def create_legacy_to_per_partition_state_migration(
        self,
        model: LegacyToPerPartitionStateMigrationModel,
        config: Mapping[str, Any],
        declarative_stream: DeclarativeStreamModel,
    ) -> LegacyToPerPartitionStateMigration:
        """
        Build a LegacyToPerPartitionStateMigration for an incremental declarative stream.

        Validates that the stream uses a Simple/Async retriever with a substream-style
        partition router carrying parent stream configs, and that the stream declares
        incremental_sync, before constructing the migration.

        :raises ValueError: if any of those structural prerequisites is missing.
        """
        retriever = declarative_stream.retriever
        if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
            )
        partition_router = retriever.partition_router
        if not isinstance(
            partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
        ):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
            )
        if not hasattr(partition_router, "parent_stream_configs"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
            )

        if not hasattr(declarative_stream, "incremental_sync"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
            )

        return LegacyToPerPartitionStateMigration(
            partition_router,  # type: ignore # was already checked above
            declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
            config,
            declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
        )
1149 def create_session_token_authenticator( 1150 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1151 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1152 decoder = ( 1153 self._create_component_from_model(model=model.decoder, config=config) 1154 if model.decoder 1155 else JsonDecoder(parameters={}) 1156 ) 1157 login_requester = self._create_component_from_model( 1158 model=model.login_requester, 1159 config=config, 1160 name=f"{name}_login_requester", 1161 decoder=decoder, 1162 ) 1163 token_provider = SessionTokenProvider( 1164 login_requester=login_requester, 1165 session_token_path=model.session_token_path, 1166 expiration_duration=parse_duration(model.expiration_duration) 1167 if model.expiration_duration 1168 else None, 1169 parameters=model.parameters or {}, 1170 message_repository=self._message_repository, 1171 decoder=decoder, 1172 ) 1173 if model.request_authentication.type == "Bearer": 1174 return ModelToComponentFactory.create_bearer_authenticator( 1175 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1176 config, 1177 token_provider=token_provider, 1178 ) 1179 else: 1180 # Get the api_token template if specified, default to just the session token 1181 api_token_template = ( 1182 getattr(model.request_authentication, "api_token", None) or "{{ session_token }}" 1183 ) 1184 final_token_provider: TokenProvider = InterpolatedSessionTokenProvider( 1185 config=config, 1186 api_token=api_token_template, 1187 session_token_provider=token_provider, 1188 parameters=model.parameters or {}, 1189 ) 1190 return self.create_api_key_authenticator( 1191 ApiKeyAuthenticatorModel( 1192 type="ApiKeyAuthenticator", 1193 api_token="", 1194 inject_into=model.request_authentication.inject_into, 1195 ), # type: ignore # $parameters and headers default to None 1196 config=config, 1197 token_provider=final_token_provider, 1198 )
1200 @staticmethod 1201 def create_basic_http_authenticator( 1202 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1203 ) -> BasicHttpAuthenticator: 1204 return BasicHttpAuthenticator( 1205 password=model.password or "", 1206 username=model.username, 1207 config=config, 1208 parameters=model.parameters or {}, 1209 )
1211 @staticmethod 1212 def create_bearer_authenticator( 1213 model: BearerAuthenticatorModel, 1214 config: Config, 1215 token_provider: Optional[TokenProvider] = None, 1216 **kwargs: Any, 1217 ) -> BearerAuthenticator: 1218 if token_provider is not None and model.api_token != "": 1219 raise ValueError( 1220 "If token_provider is set, api_token is ignored and has to be set to empty string." 1221 ) 1222 return BearerAuthenticator( 1223 token_provider=( 1224 token_provider 1225 if token_provider is not None 1226 else InterpolatedStringTokenProvider( 1227 api_token=model.api_token or "", 1228 config=config, 1229 parameters=model.parameters or {}, 1230 ) 1231 ), 1232 config=config, 1233 parameters=model.parameters or {}, 1234 )
1236 @staticmethod 1237 def create_dynamic_stream_check_config( 1238 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1239 ) -> DynamicStreamCheckConfig: 1240 return DynamicStreamCheckConfig( 1241 dynamic_stream_name=model.dynamic_stream_name, 1242 stream_count=model.stream_count or 0, 1243 )
1245 def create_check_stream( 1246 self, model: CheckStreamModel, config: Config, **kwargs: Any 1247 ) -> CheckStream: 1248 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1249 raise ValueError( 1250 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1251 ) 1252 1253 dynamic_streams_check_configs = ( 1254 [ 1255 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1256 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1257 ] 1258 if model.dynamic_streams_check_configs 1259 else [] 1260 ) 1261 1262 return CheckStream( 1263 stream_names=model.stream_names or [], 1264 dynamic_streams_check_configs=dynamic_streams_check_configs, 1265 parameters={}, 1266 )
1268 @staticmethod 1269 def create_check_dynamic_stream( 1270 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1271 ) -> CheckDynamicStream: 1272 assert model.use_check_availability is not None # for mypy 1273 1274 use_check_availability = model.use_check_availability 1275 1276 return CheckDynamicStream( 1277 stream_count=model.stream_count, 1278 use_check_availability=use_check_availability, 1279 parameters={}, 1280 )
1282 def create_composite_error_handler( 1283 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1284 ) -> CompositeErrorHandler: 1285 error_handlers = [ 1286 self._create_component_from_model(model=error_handler_model, config=config) 1287 for error_handler_model in model.error_handlers 1288 ] 1289 return CompositeErrorHandler( 1290 error_handlers=error_handlers, parameters=model.parameters or {} 1291 )
    @staticmethod
    def create_concurrency_level(
        model: ConcurrencyLevelModel, config: Config, **kwargs: Any
    ) -> ConcurrencyLevel:
        """Build a ConcurrencyLevel component from its declarative model."""
        return ConcurrencyLevel(
            default_concurrency=model.default_concurrency,
            max_concurrency=model.max_concurrency,
            config=config,
            parameters={},
        )
1304 @staticmethod 1305 def apply_stream_state_migrations( 1306 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1307 ) -> MutableMapping[str, Any]: 1308 if stream_state_migrations: 1309 for state_migration in stream_state_migrations: 1310 if state_migration.should_migrate(stream_state): 1311 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 1312 stream_state = dict(state_migration.migrate(stream_state)) 1313 return stream_state
    def create_concurrent_cursor_from_datetime_based_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: MutableMapping[str, Any],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        runtime_lookback_window: Optional[datetime.timedelta] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        """
        Build a ConcurrentCursor from a DatetimeBasedCursor component definition.

        Resolves the cursor field (catalog-defined when allowed, otherwise the model's
        interpolated cursor_field), slice boundary fields, lookback window, step /
        cursor_granularity, start and end datetimes, and an optional DAY/WEEK/MONTH
        clamping strategy, then assembles the concurrent cursor.

        :raises ValueError: if the definition's type does not match ``model_type``, if
            only one of step / cursor_granularity is set, or on an invalid clamping target.
        """
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
        # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
        # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        model_parameters = datetime_based_cursor_model.parameters or {}

        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        # Fall back to the model's (interpolated) cursor_field when the catalog defines none.
        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                datetime_based_cursor_model.cursor_field,
                parameters=model_parameters,
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        interpolated_partition_field_start = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_start or "start_time",
            parameters=model_parameters,
        )
        interpolated_partition_field_end = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_end or "end_time",
            parameters=model_parameters,
        )

        slice_boundary_fields = (
            interpolated_partition_field_start.eval(config=config),
            interpolated_partition_field_end.eval(config=config),
        )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        # lookback_window stays None unless the model's (interpolated) value evaluates to a duration.
        lookback_window = None
        interpolated_lookback_window = (
            InterpolatedString.create(
                datetime_based_cursor_model.lookback_window,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.lookback_window
            else None
        )
        if interpolated_lookback_window:
            evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
            if evaluated_lookback_window:
                lookback_window = parse_duration(evaluated_lookback_window)

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Adjusts the stream state by applying the runtime lookback window.
        # This is used to ensure correct state handling in case of failed partitions.
        stream_state_value = stream_state.get(cursor_field.cursor_field_key)
        if runtime_lookback_window and stream_state_value:
            new_stream_state = (
                connector_state_converter.parse_timestamp(stream_state_value)
                - runtime_lookback_window
            )
            stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
                new_stream_state
            )

        start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
        if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
            start_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.start_datetime, config=config
            )
        else:
            start_date_runtime_value = datetime_based_cursor_model.start_datetime

        end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]]
        if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel):
            end_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.end_datetime, config=config
            )
        else:
            end_date_runtime_value = datetime_based_cursor_model.end_datetime

        interpolated_start_date = MinMaxDatetime.create(
            interpolated_string_or_min_max_datetime=start_date_runtime_value,
            parameters=datetime_based_cursor_model.parameters,
        )
        interpolated_end_date = (
            None
            if not end_date_runtime_value
            else MinMaxDatetime.create(
                end_date_runtime_value, datetime_based_cursor_model.parameters
            )
        )

        # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
        if not interpolated_start_date.datetime_format:
            interpolated_start_date.datetime_format = datetime_format
        if interpolated_end_date and not interpolated_end_date.datetime_format:
            interpolated_end_date.datetime_format = datetime_format

        start_date = interpolated_start_date.get_datetime(config=config)
        end_date_provider = (
            partial(interpolated_end_date.get_datetime, config)
            if interpolated_end_date
            else connector_state_converter.get_end_provider()
        )

        # step and cursor_granularity must be set together (or not at all).
        if (
            datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity
        ) or (
            not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity
        ):
            raise ValueError(
                f"If step is defined, cursor_granularity should be as well and vice-versa. "
                f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`"
            )

        # When step is not defined, default to a step size from the starting date to the present moment
        step_length = datetime.timedelta.max
        interpolated_step = (
            InterpolatedString.create(
                datetime_based_cursor_model.step,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.step
            else None
        )
        if interpolated_step:
            evaluated_step = interpolated_step.eval(config)
            if evaluated_step:
                step_length = parse_duration(evaluated_step)

        clamping_strategy: ClampingStrategy = NoClamping()
        if datetime_based_cursor_model.clamping:
            # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
            # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime
            # object which we want to keep agnostic of being low-code
            target = InterpolatedString(
                string=datetime_based_cursor_model.clamping.target,
                parameters=model_parameters,
            )
            evaluated_target = target.eval(config=config)
            match evaluated_target:
                case "DAY":
                    clamping_strategy = DayClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        DayClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(seconds=1),
                    )
                case "WEEK":
                    if (
                        not datetime_based_cursor_model.clamping.target_details
                        or "weekday" not in datetime_based_cursor_model.clamping.target_details
                    ):
                        raise ValueError(
                            "Given WEEK clamping, weekday needs to be provided as target_details"
                        )
                    weekday = self._assemble_weekday(
                        datetime_based_cursor_model.clamping.target_details["weekday"]
                    )
                    clamping_strategy = WeekClampingStrategy(weekday)
                    end_date_provider = ClampingEndProvider(
                        WeekClampingStrategy(weekday, is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case "MONTH":
                    clamping_strategy = MonthClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        MonthClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case _:
                    raise ValueError(
                        f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
                    )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=slice_boundary_fields,
            start=start_date,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            end_provider=end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            lookback_window=lookback_window,
            slice_range=step_length,
            cursor_granularity=cursor_granularity,
            clamping_strategy=clamping_strategy,
        )
    def create_concurrent_cursor_from_incrementing_count_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: MutableMapping[str, Any],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        """
        Build a ConcurrentCursor from an IncrementingCountCursor component definition.

        The start value (int or interpolated string) is evaluated and coerced to int,
        the cursor field is resolved (catalog-defined when allowed, otherwise from the
        model), and an IncrementingCountStreamStateConverter drives state handling.

        :raises ValueError: if the definition's type does not match ``model_type`` or
            the parsed model is not an IncrementingCountCursorModel.
        """
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        incrementing_count_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}"
            )

        start_value: Union[int, str, None] = incrementing_count_cursor_model.start_value
        # Pydantic Union type coercion can convert int 0 to string '0' depending on Union order.
        # We need to handle both int and str representations of numeric values.
        # Evaluate the InterpolatedString and convert to int for the ConcurrentCursor.
        if start_value is not None:
            interpolated_start_value = InterpolatedString.create(
                str(start_value),  # Ensure we pass a string to InterpolatedString.create
                parameters=incrementing_count_cursor_model.parameters or {},
            )
            evaluated_start_value: int = int(interpolated_start_value.eval(config=config))
        else:
            evaluated_start_value = 0

        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        # Fall back to the model's (interpolated) cursor_field when the catalog defines none.
        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                incrementing_count_cursor_model.cursor_field,
                parameters=incrementing_count_cursor_model.parameters or {},
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=incrementing_count_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        connector_state_converter = IncrementingCountStreamStateConverter(
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
        )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=None,
            start=evaluated_start_value,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
            end_provider=connector_state_converter.get_end_provider(),  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
        )
    def create_concurrent_cursor_from_perpartition_cursor(
        self,
        state_manager: ConnectorStateManager,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        config: Config,
        stream_state: MutableMapping[str, Any],
        partition_router: PartitionRouter,
        attempt_to_create_cursor_if_not_provided: bool = False,
        **kwargs: Any,
    ) -> ConcurrentPerPartitionCursor:
        """
        Build a ConcurrentPerPartitionCursor from a DatetimeBasedCursor component definition.

        The returned cursor manages a family of datetime-based concurrent cursors, one per
        partition produced by ``partition_router``, via a ConcurrentCursorFactory.

        :param state_manager: manager used to persist and retrieve connector state
        :param model_type: expected Pydantic model class; must match ``component_definition["type"]``
        :param component_definition: raw manifest dict (or model ``__dict__``) of the cursor component
        :param stream_name: name of the stream the cursor belongs to
        :param stream_namespace: optional namespace of the stream
        :param config: user-provided connector configuration
        :param stream_state: incoming state for the stream
        :param partition_router: router producing the partitions to track state for
        :param attempt_to_create_cursor_if_not_provided: forwarded to ConcurrentPerPartitionCursor
        :raises ValueError: when the declared type mismatches or does not parse to a DatetimeBasedCursorModel
        """
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
        # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
        # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
        # NOTE(review): this mutates the caller's dict in place — acceptable today per the FIXME, but
        # worth confirming no caller relies on the definition staying untouched.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        # A catalog-defined cursor field (when allowed) takes precedence over the manifest's cursor_field.
        cursor_field = self._get_catalog_defined_cursor_field(
            stream_name=stream_name,
            allow_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
            or False,
        )

        if not cursor_field:
            interpolated_cursor_field = InterpolatedString.create(
                datetime_based_cursor_model.cursor_field,
                # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
                # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
                # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
                # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
                parameters=datetime_based_cursor_model.parameters or {},
            )
            cursor_field = CursorField(
                cursor_field_key=interpolated_cursor_field.eval(config=config),
                supports_catalog_defined_cursor_field=datetime_based_cursor_model.allow_catalog_defined_cursor_field
                or False,
            )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Create the cursor factory. Each partition gets its own datetime-based concurrent cursor;
        # a NoopMessageRepository is passed, presumably so per-partition child cursors do not emit
        # their own messages — confirm against ConcurrentCursorFactory usage.
        cursor_factory = ConcurrentCursorFactory(
            partial(
                self.create_concurrent_cursor_from_datetime_based_cursor,
                state_manager=state_manager,
                model_type=model_type,
                component_definition=component_definition,
                stream_name=stream_name,
                stream_namespace=stream_namespace,
                config=config,
                message_repository=NoopMessageRepository(),
            )
        )

        # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
        use_global_cursor = isinstance(
            partition_router, GroupingPartitionRouter
        ) or component_definition.get("global_substream_cursor", False)

        # Return the concurrent cursor and state converter
        return ConcurrentPerPartitionCursor(
            cursor_factory=cursor_factory,
            partition_router=partition_router,
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=self._message_repository,  # type: ignore
            connector_state_manager=state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            use_global_cursor=use_global_cursor,
            attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided,
        )
1744 @staticmethod 1745 def create_constant_backoff_strategy( 1746 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1747 ) -> ConstantBackoffStrategy: 1748 return ConstantBackoffStrategy( 1749 backoff_time_in_seconds=model.backoff_time_in_seconds, 1750 config=config, 1751 parameters=model.parameters or {}, 1752 )
1754 def create_cursor_pagination( 1755 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1756 ) -> CursorPaginationStrategy: 1757 if isinstance(decoder, PaginationDecoderDecorator): 1758 inner_decoder = decoder.decoder 1759 else: 1760 inner_decoder = decoder 1761 decoder = PaginationDecoderDecorator(decoder=decoder) 1762 1763 if self._is_supported_decoder_for_pagination(inner_decoder): 1764 decoder_to_use = decoder 1765 else: 1766 raise ValueError( 1767 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1768 ) 1769 1770 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 1771 # If page_size is a string that represents an integer (not an interpolation), convert it back. 1772 page_size = model.page_size 1773 if isinstance(page_size, str) and page_size.isdigit(): 1774 page_size = int(page_size) 1775 1776 return CursorPaginationStrategy( 1777 cursor_value=model.cursor_value, 1778 decoder=decoder_to_use, 1779 page_size=page_size, 1780 stop_condition=model.stop_condition, 1781 config=config, 1782 parameters=model.parameters or {}, 1783 )
    def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any:
        """
        Generically creates a custom component based on the model type and a class_name reference to the custom Python class being
        instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor

        :param model: The Pydantic model of the custom component being created
        :param config: The custom defined connector config
        :return: The declarative component built from the Pydantic model to be used at runtime
        """
        custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name)
        # Type hints of the custom class drive both sub-component type derivation and which
        # constructor arguments are ultimately forwarded.
        component_fields = get_type_hints(custom_component_class)
        model_args = model.dict()
        model_args["config"] = config

        # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions
        # we defer to these arguments over the component's definition
        for key, arg in kwargs.items():
            model_args[key] = arg

        # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not
        # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to
        # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components
        for model_field, model_value in model_args.items():
            # If a custom component field doesn't have a type set, we try to use the type hints to infer the type
            if (
                isinstance(model_value, dict)
                and "type" not in model_value
                and model_field in component_fields
            ):
                derived_type = self._derive_component_type_from_type_hints(
                    component_fields.get(model_field)
                )
                if derived_type:
                    model_value["type"] = derived_type

            if self._is_component(model_value):
                model_args[model_field] = self._create_nested_component(
                    model,
                    model_field,
                    model_value,
                    config,
                    **kwargs,
                )
            elif isinstance(model_value, list):
                # Lists may contain nested component definitions: apply the same type derivation
                # and nested-component instantiation element by element.
                vals = []
                for v in model_value:
                    if isinstance(v, dict) and "type" not in v and model_field in component_fields:
                        derived_type = self._derive_component_type_from_type_hints(
                            component_fields.get(model_field)
                        )
                        if derived_type:
                            v["type"] = derived_type
                    if self._is_component(v):
                        vals.append(
                            self._create_nested_component(
                                model,
                                model_field,
                                v,
                                config,
                                **kwargs,
                            )
                        )
                    else:
                        vals.append(v)
                model_args[model_field] = vals

        # Forward only the arguments the custom class declares via type hints, so additional
        # properties that the class does not accept never reach the constructor.
        kwargs = {
            class_field: model_args[class_field]
            for class_field in component_fields.keys()
            if class_field in model_args
        }
        return custom_component_class(**kwargs)
Generically creates a custom component based on the model type and a `class_name` reference to the custom Python class being instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor.
Parameters
- model: The Pydantic model of the custom component being created
- config: The custom defined connector config
Returns
The declarative component built from the Pydantic model, to be used at runtime.
    def create_default_stream(
        self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any
    ) -> AbstractStream:
        """
        Build a DefaultStream (concurrent engine) from a DeclarativeStream model.

        Wires together the partition router, concurrent cursor, request options provider,
        retriever, transformations, optional file uploader, and schema loader.

        :param model: the DeclarativeStream Pydantic model from the manifest
        :param config: user-provided connector configuration
        :param is_parent: flag received from callers; not read in this body
        """
        primary_key = model.primary_key.__root__ if model.primary_key else None
        self._migrate_state(model, config)

        partition_router = self._build_stream_slicer_from_partition_router(
            model.retriever,
            config,
            stream_name=model.name,
            **kwargs,
        )
        concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
        # The request options provider depends on the incremental sync flavor:
        # datetime-based, incrementing-count, or none.
        if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
            cursor_model: DatetimeBasedCursorModel = model.incremental_sync

            end_time_option = (
                self._create_component_from_model(
                    cursor_model.end_time_option, config, parameters=cursor_model.parameters or {}
                )
                if cursor_model.end_time_option
                else None
            )
            start_time_option = (
                self._create_component_from_model(
                    cursor_model.start_time_option, config, parameters=cursor_model.parameters or {}
                )
                if cursor_model.start_time_option
                else None
            )

            datetime_request_options_provider = DatetimeBasedRequestOptionsProvider(
                start_time_option=start_time_option,
                end_time_option=end_time_option,
                partition_field_start=cursor_model.partition_field_start,
                partition_field_end=cursor_model.partition_field_end,
                config=config,
                parameters=model.parameters or {},
            )
            # Per-partition cursors need the per-partition wrapper around the datetime provider.
            request_options_provider = (
                datetime_request_options_provider
                if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor)
                else PerPartitionRequestOptionsProvider(
                    partition_router, datetime_request_options_provider
                )
            )
        elif model.incremental_sync and isinstance(
            model.incremental_sync, IncrementingCountCursorModel
        ):
            if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor):
                raise ValueError(
                    "PerPartition does not support per partition states because switching to global state is time based"
                )

            cursor_model: IncrementingCountCursorModel = model.incremental_sync  # type: ignore

            start_time_option = (
                self._create_component_from_model(
                    cursor_model.start_value_option,  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
                    config,
                    parameters=cursor_model.parameters or {},
                )
                if cursor_model.start_value_option  # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor
                else None
            )

            # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but
            # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time
            partition_field_start = "start"

            request_options_provider = DatetimeBasedRequestOptionsProvider(
                start_time_option=start_time_option,
                partition_field_start=partition_field_start,
                config=config,
                parameters=model.parameters or {},
            )
        else:
            request_options_provider = None

        transformations = []
        if model.transformations:
            for transformation_model in model.transformations:
                transformations.append(
                    self._create_component_from_model(model=transformation_model, config=config)
                )
        file_uploader = None
        if model.file_uploader:
            file_uploader = self._create_component_from_model(
                model=model.file_uploader, config=config
            )

        # With no real cursor (FinalStateCursor), slicing falls back to the partition router.
        stream_slicer: ConcurrentStreamSlicer = (
            partition_router
            if isinstance(concurrent_cursor, FinalStateCursor)
            else concurrent_cursor
        )

        retriever = self._create_component_from_model(
            model=model.retriever,
            config=config,
            name=model.name,
            primary_key=primary_key,
            request_options_provider=request_options_provider,
            stream_slicer=stream_slicer,
            partition_router=partition_router,
            has_stop_condition_cursor=self._is_stop_condition_on_cursor(model),
            is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model),
            cursor=concurrent_cursor,
            transformations=transformations,
            file_uploader=file_uploader,
            incremental_sync=model.incremental_sync,
        )
        # Async retrievers carry their own slicer; use it for partition generation below.
        if isinstance(retriever, AsyncRetriever):
            stream_slicer = retriever.stream_slicer

        schema_loader: SchemaLoader
        if model.schema_loader and isinstance(model.schema_loader, list):
            nested_schema_loaders = [
                self._create_component_from_model(model=nested_schema_loader, config=config)
                for nested_schema_loader in model.schema_loader
            ]
            schema_loader = CompositeSchemaLoader(
                schema_loaders=nested_schema_loaders, parameters={}
            )
        elif model.schema_loader:
            schema_loader = self._create_component_from_model(
                model=model.schema_loader,  # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models
                config=config,
            )
        else:
            options = model.parameters or {}
            if "name" not in options:
                options["name"] = model.name
            schema_loader = DefaultSchemaLoader(config=config, parameters=options)
        # Cache schema resolution regardless of which loader was chosen.
        schema_loader = CachingSchemaLoaderDecorator(schema_loader)

        stream_name = model.name or ""
        return DefaultStream(
            partition_generator=StreamSlicerPartitionGenerator(
                DeclarativePartitionFactory(
                    stream_name,
                    schema_loader,
                    retriever,
                    self._message_repository,
                ),
                stream_slicer,
                slice_limit=self._limit_slices_fetched,
            ),
            name=stream_name,
            json_schema=schema_loader.get_json_schema,
            primary_key=get_primary_key_from_stream(primary_key),
            cursor_field=(
                concurrent_cursor.cursor_field
                if hasattr(concurrent_cursor, "cursor_field")
                else None
            ),
            logger=logging.getLogger(f"airbyte.{stream_name}"),
            cursor=concurrent_cursor,
            supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader),
        )
2291 def create_default_error_handler( 2292 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2293 ) -> DefaultErrorHandler: 2294 backoff_strategies = [] 2295 if model.backoff_strategies: 2296 for backoff_strategy_model in model.backoff_strategies: 2297 backoff_strategies.append( 2298 self._create_component_from_model(model=backoff_strategy_model, config=config) 2299 ) 2300 2301 response_filters = [] 2302 if model.response_filters: 2303 for response_filter_model in model.response_filters: 2304 response_filters.append( 2305 self._create_component_from_model(model=response_filter_model, config=config) 2306 ) 2307 response_filters.append( 2308 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2309 ) 2310 2311 return DefaultErrorHandler( 2312 backoff_strategies=backoff_strategies, 2313 max_retries=model.max_retries, 2314 response_filters=response_filters, 2315 config=config, 2316 parameters=model.parameters or {}, 2317 )
2319 def create_default_paginator( 2320 self, 2321 model: DefaultPaginatorModel, 2322 config: Config, 2323 *, 2324 url_base: str, 2325 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2326 decoder: Optional[Decoder] = None, 2327 cursor_used_for_stop_condition: Optional[Cursor] = None, 2328 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2329 if decoder: 2330 if self._is_supported_decoder_for_pagination(decoder): 2331 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2332 else: 2333 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2334 else: 2335 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2336 page_size_option = ( 2337 self._create_component_from_model(model=model.page_size_option, config=config) 2338 if model.page_size_option 2339 else None 2340 ) 2341 page_token_option = ( 2342 self._create_component_from_model(model=model.page_token_option, config=config) 2343 if model.page_token_option 2344 else None 2345 ) 2346 pagination_strategy = self._create_component_from_model( 2347 model=model.pagination_strategy, 2348 config=config, 2349 decoder=decoder_to_use, 2350 extractor_model=extractor_model, 2351 ) 2352 if cursor_used_for_stop_condition: 2353 pagination_strategy = StopConditionPaginationStrategyDecorator( 2354 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2355 ) 2356 paginator = DefaultPaginator( 2357 decoder=decoder_to_use, 2358 page_size_option=page_size_option, 2359 page_token_option=page_token_option, 2360 pagination_strategy=pagination_strategy, 2361 url_base=url_base, 2362 config=config, 2363 parameters=model.parameters or {}, 2364 ) 2365 if self._limit_pages_fetched_per_slice: 2366 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2367 return paginator
2369 def create_dpath_extractor( 2370 self, 2371 model: DpathExtractorModel, 2372 config: Config, 2373 decoder: Optional[Decoder] = None, 2374 **kwargs: Any, 2375 ) -> DpathExtractor: 2376 if decoder: 2377 decoder_to_use = decoder 2378 else: 2379 decoder_to_use = JsonDecoder(parameters={}) 2380 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2381 return DpathExtractor( 2382 decoder=decoder_to_use, 2383 field_path=model_field_path, 2384 config=config, 2385 parameters=model.parameters or {}, 2386 )
2407 def create_http_requester( 2408 self, 2409 model: HttpRequesterModel, 2410 config: Config, 2411 decoder: Decoder = JsonDecoder(parameters={}), 2412 query_properties_key: Optional[str] = None, 2413 use_cache: Optional[bool] = None, 2414 *, 2415 name: str, 2416 ) -> HttpRequester: 2417 authenticator = ( 2418 self._create_component_from_model( 2419 model=model.authenticator, 2420 config=config, 2421 url_base=model.url or model.url_base, 2422 name=name, 2423 decoder=decoder, 2424 ) 2425 if model.authenticator 2426 else None 2427 ) 2428 error_handler = ( 2429 self._create_component_from_model(model=model.error_handler, config=config) 2430 if model.error_handler 2431 else DefaultErrorHandler( 2432 backoff_strategies=[], 2433 response_filters=[], 2434 config=config, 2435 parameters=model.parameters or {}, 2436 ) 2437 ) 2438 2439 api_budget = self._api_budget 2440 2441 request_options_provider = InterpolatedRequestOptionsProvider( 2442 request_body=model.request_body, 2443 request_body_data=model.request_body_data, 2444 request_body_json=model.request_body_json, 2445 request_headers=model.request_headers, 2446 request_parameters=model.request_parameters, # type: ignore # QueryProperties have been removed in `create_simple_retriever` 2447 query_properties_key=query_properties_key, 2448 config=config, 2449 parameters=model.parameters or {}, 2450 ) 2451 2452 assert model.use_cache is not None # for mypy 2453 assert model.http_method is not None # for mypy 2454 2455 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2456 2457 return HttpRequester( 2458 name=name, 2459 url=model.url, 2460 url_base=model.url_base, 2461 path=model.path, 2462 authenticator=authenticator, 2463 error_handler=error_handler, 2464 api_budget=api_budget, 2465 http_method=HttpMethod[model.http_method.value], 2466 request_options_provider=request_options_provider, 2467 config=config, 2468 disable_retries=self._disable_retries, 2469 parameters=model.parameters or {}, 
2470 message_repository=self._message_repository, 2471 use_cache=should_use_cache, 2472 decoder=decoder, 2473 stream_response=decoder.is_stream_response() if decoder else False, 2474 )
2476 @staticmethod 2477 def create_http_response_filter( 2478 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2479 ) -> HttpResponseFilter: 2480 if model.action: 2481 action = ResponseAction(model.action.value) 2482 else: 2483 action = None 2484 2485 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2486 2487 http_codes = ( 2488 set(model.http_codes) if model.http_codes else set() 2489 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2490 2491 return HttpResponseFilter( 2492 action=action, 2493 failure_type=failure_type, 2494 error_message=model.error_message or "", 2495 error_message_contains=model.error_message_contains or "", 2496 http_codes=http_codes, 2497 predicate=model.predicate or "", 2498 config=config, 2499 parameters=model.parameters or {}, 2500 )
2508 def create_complex_field_type( 2509 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2510 ) -> ComplexFieldType: 2511 items = ( 2512 self._create_component_from_model(model=model.items, config=config) 2513 if isinstance(model.items, ComplexFieldTypeModel) 2514 else model.items 2515 ) 2516 2517 return ComplexFieldType(field_type=model.field_type, items=items)
2519 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2520 target_type = ( 2521 self._create_component_from_model(model=model.target_type, config=config) 2522 if isinstance(model.target_type, ComplexFieldTypeModel) 2523 else model.target_type 2524 ) 2525 2526 return TypesMap( 2527 target_type=target_type, 2528 current_type=model.current_type, 2529 condition=model.condition if model.condition is not None else "True", 2530 )
2532 def create_schema_type_identifier( 2533 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2534 ) -> SchemaTypeIdentifier: 2535 types_mapping = [] 2536 if model.types_mapping: 2537 types_mapping.extend( 2538 [ 2539 self._create_component_from_model(types_map, config=config) 2540 for types_map in model.types_mapping 2541 ] 2542 ) 2543 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2544 [x for x in model.schema_pointer] if model.schema_pointer else [] 2545 ) 2546 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2547 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2548 [x for x in model.type_pointer] if model.type_pointer else None 2549 ) 2550 2551 return SchemaTypeIdentifier( 2552 schema_pointer=model_schema_pointer, 2553 key_pointer=model_key_pointer, 2554 type_pointer=model_type_pointer, 2555 types_mapping=types_mapping, 2556 parameters=model.parameters or {}, 2557 )
2559 def create_dynamic_schema_loader( 2560 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2561 ) -> DynamicSchemaLoader: 2562 schema_transformations = [] 2563 if model.schema_transformations: 2564 for transformation_model in model.schema_transformations: 2565 schema_transformations.append( 2566 self._create_component_from_model(model=transformation_model, config=config) 2567 ) 2568 name = "dynamic_properties" 2569 retriever = self._create_component_from_model( 2570 model=model.retriever, 2571 config=config, 2572 name=name, 2573 primary_key=None, 2574 partition_router=self._build_stream_slicer_from_partition_router( 2575 model.retriever, config 2576 ), 2577 transformations=[], 2578 use_cache=True, 2579 log_formatter=( 2580 lambda response: format_http_message( 2581 response, 2582 f"Schema loader '{name}' request", 2583 f"Request performed in order to extract schema.", 2584 name, 2585 is_auxiliary=True, 2586 ) 2587 ), 2588 ) 2589 schema_type_identifier = self._create_component_from_model( 2590 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2591 ) 2592 schema_filter = ( 2593 self._create_component_from_model( 2594 model.schema_filter, config=config, parameters=model.parameters or {} 2595 ) 2596 if model.schema_filter is not None 2597 else None 2598 ) 2599 2600 return DynamicSchemaLoader( 2601 retriever=retriever, 2602 config=config, 2603 schema_transformations=schema_transformations, 2604 schema_filter=schema_filter, 2605 schema_type_identifier=schema_type_identifier, 2606 parameters=model.parameters or {}, 2607 )
2627 def create_gzip_decoder( 2628 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2629 ) -> Decoder: 2630 _compressed_response_types = { 2631 "gzip", 2632 "x-gzip", 2633 "gzip, deflate", 2634 "x-gzip, deflate", 2635 "application/zip", 2636 "application/gzip", 2637 "application/x-gzip", 2638 "application/x-zip-compressed", 2639 } 2640 2641 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2642 2643 if self._emit_connector_builder_messages: 2644 # This is very surprising but if the response is not streamed, 2645 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2646 # which uses urllib3 directly and does not uncompress the data. 2647 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2648 2649 return CompositeRawDecoder.by_headers( 2650 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2651 stream_response=True, 2652 fallback_parser=gzip_parser.inner_parser, 2653 )
2702 def create_jwt_authenticator( 2703 self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2704 ) -> JwtAuthenticator: 2705 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2706 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2707 request_option = ( 2708 self._create_component_from_model(model.request_option, config) 2709 if model.request_option 2710 else None 2711 ) 2712 return JwtAuthenticator( 2713 config=config, 2714 parameters=model.parameters or {}, 2715 algorithm=JwtAlgorithm(model.algorithm.value), 2716 secret_key=model.secret_key, 2717 base64_encode_secret_key=model.base64_encode_secret_key, 2718 token_duration=model.token_duration, 2719 header_prefix=model.header_prefix, 2720 kid=jwt_headers.kid, 2721 typ=jwt_headers.typ, 2722 cty=jwt_headers.cty, 2723 iss=jwt_payload.iss, 2724 sub=jwt_payload.sub, 2725 aud=jwt_payload.aud, 2726 additional_jwt_headers=model.additional_jwt_headers, 2727 additional_jwt_payload=model.additional_jwt_payload, 2728 passphrase=model.passphrase, 2729 request_option=request_option, 2730 )
2732 def create_list_partition_router( 2733 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2734 ) -> ListPartitionRouter: 2735 request_option = ( 2736 self._create_component_from_model(model.request_option, config) 2737 if model.request_option 2738 else None 2739 ) 2740 return ListPartitionRouter( 2741 cursor_field=model.cursor_field, 2742 request_option=request_option, 2743 values=model.values, 2744 config=config, 2745 parameters=model.parameters or {}, 2746 )
2748 @staticmethod 2749 def create_min_max_datetime( 2750 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2751 ) -> MinMaxDatetime: 2752 return MinMaxDatetime( 2753 datetime=model.datetime, 2754 datetime_format=model.datetime_format or "", 2755 max_datetime=model.max_datetime or "", 2756 min_datetime=model.min_datetime or "", 2757 parameters=model.parameters or {}, 2758 )
    def create_oauth_authenticator(
        self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any
    ) -> DeclarativeOauth2Authenticator:
        """
        Build an OAuth2 authenticator from its manifest model.

        When ``model.refresh_token_updater`` is set, a single-use refresh-token authenticator is
        returned: every interpolated field is eagerly evaluated against ``config`` here, and the
        updater's ``*_config_path`` values are forwarded (presumably so refreshed tokens are kept
        in sync with the config — confirm against the authenticator's contract). Otherwise a
        standard OAuth2 authenticator is returned with interpolation left to the component.

        :param model: the OAuthAuthenticator Pydantic model from the manifest
        :param config: the user-provided connector config
        """
        profile_assertion = (
            self._create_component_from_model(model.profile_assertion, config=config)
            if model.profile_assertion
            else None
        )

        refresh_token_error_status_codes, refresh_token_error_key, refresh_token_error_values = (
            self._get_refresh_token_error_information(model)
        )
        if model.refresh_token_updater:
            # ignore type error because fixing it would have a lot of dependencies, revisit later
            return DeclarativeSingleUseRefreshTokenOauth2Authenticator(  # type: ignore
                config,
                InterpolatedString.create(
                    model.token_refresh_endpoint,  # type: ignore
                    parameters=model.parameters or {},
                ).eval(config),
                access_token_name=InterpolatedString.create(
                    model.access_token_name or "access_token", parameters=model.parameters or {}
                ).eval(config),
                refresh_token_name=model.refresh_token_updater.refresh_token_name,
                expires_in_name=InterpolatedString.create(
                    model.expires_in_name or "expires_in", parameters=model.parameters or {}
                ).eval(config),
                client_id_name=InterpolatedString.create(
                    model.client_id_name or "client_id", parameters=model.parameters or {}
                ).eval(config),
                client_id=InterpolatedString.create(
                    model.client_id, parameters=model.parameters or {}
                ).eval(config)
                if model.client_id
                else model.client_id,
                client_secret_name=InterpolatedString.create(
                    model.client_secret_name or "client_secret", parameters=model.parameters or {}
                ).eval(config),
                client_secret=InterpolatedString.create(
                    model.client_secret, parameters=model.parameters or {}
                ).eval(config)
                if model.client_secret
                else model.client_secret,
                access_token_config_path=model.refresh_token_updater.access_token_config_path,
                refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path,
                token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path,
                grant_type_name=InterpolatedString.create(
                    model.grant_type_name or "grant_type", parameters=model.parameters or {}
                ).eval(config),
                grant_type=InterpolatedString.create(
                    model.grant_type or "refresh_token", parameters=model.parameters or {}
                ).eval(config),
                refresh_request_body=InterpolatedMapping(
                    model.refresh_request_body or {}, parameters=model.parameters or {}
                ).eval(config),
                refresh_request_headers=InterpolatedMapping(
                    model.refresh_request_headers or {}, parameters=model.parameters or {}
                ).eval(config),
                scopes=model.scopes,
                token_expiry_date_format=model.token_expiry_date_format,
                token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
                message_repository=self._message_repository,
                refresh_token_error_status_codes=refresh_token_error_status_codes,
                refresh_token_error_key=refresh_token_error_key,
                refresh_token_error_values=refresh_token_error_values,
            )
        # ignore type error because fixing it would have a lot of dependencies, revisit later
        return DeclarativeOauth2Authenticator(  # type: ignore
            access_token_name=model.access_token_name or "access_token",
            access_token_value=model.access_token_value,
            client_id_name=model.client_id_name or "client_id",
            client_id=model.client_id,
            client_secret_name=model.client_secret_name or "client_secret",
            client_secret=model.client_secret,
            expires_in_name=model.expires_in_name or "expires_in",
            grant_type_name=model.grant_type_name or "grant_type",
            grant_type=model.grant_type or "refresh_token",
            refresh_request_body=model.refresh_request_body,
            refresh_request_headers=model.refresh_request_headers,
            refresh_token_name=model.refresh_token_name or "refresh_token",
            refresh_token=model.refresh_token,
            scopes=model.scopes,
            token_expiry_date=model.token_expiry_date,
            token_expiry_date_format=model.token_expiry_date_format,
            token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
            token_refresh_endpoint=model.token_refresh_endpoint,
            config=config,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            profile_assertion=profile_assertion,
            use_profile_assertion=model.use_profile_assertion,
            refresh_token_error_status_codes=refresh_token_error_status_codes,
            refresh_token_error_key=refresh_token_error_key,
            refresh_token_error_values=refresh_token_error_values,
        )
2914 def create_offset_increment( 2915 self, 2916 model: OffsetIncrementModel, 2917 config: Config, 2918 decoder: Decoder, 2919 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2920 **kwargs: Any, 2921 ) -> OffsetIncrement: 2922 if isinstance(decoder, PaginationDecoderDecorator): 2923 inner_decoder = decoder.decoder 2924 else: 2925 inner_decoder = decoder 2926 decoder = PaginationDecoderDecorator(decoder=decoder) 2927 2928 if self._is_supported_decoder_for_pagination(inner_decoder): 2929 decoder_to_use = decoder 2930 else: 2931 raise ValueError( 2932 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2933 ) 2934 2935 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2936 # so that it can be shared by OffSetIncrement and RecordSelector. However, due to how we instantiate the 2937 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2938 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2939 # When we have more time to investigate we can look into reusing the same component. 2940 extractor = ( 2941 self._create_component_from_model( 2942 model=extractor_model, config=config, decoder=decoder_to_use 2943 ) 2944 if extractor_model 2945 else None 2946 ) 2947 2948 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 2949 # If page_size is a string that represents an integer (not an interpolation), convert it back. 2950 page_size = model.page_size 2951 if isinstance(page_size, str) and page_size.isdigit(): 2952 page_size = int(page_size) 2953 2954 return OffsetIncrement( 2955 page_size=page_size, 2956 config=config, 2957 decoder=decoder_to_use, 2958 extractor=extractor, 2959 inject_on_first_request=model.inject_on_first_request or False, 2960 parameters=model.parameters or {}, 2961 )
2963 @staticmethod 2964 def create_page_increment( 2965 model: PageIncrementModel, config: Config, **kwargs: Any 2966 ) -> PageIncrement: 2967 # Pydantic v1 Union type coercion can convert int to string depending on Union order. 2968 # If page_size is a string that represents an integer (not an interpolation), convert it back. 2969 page_size = model.page_size 2970 if isinstance(page_size, str) and page_size.isdigit(): 2971 page_size = int(page_size) 2972 2973 return PageIncrement( 2974 page_size=page_size, 2975 config=config, 2976 start_from_page=model.start_from_page or 0, 2977 inject_on_first_request=model.inject_on_first_request or False, 2978 parameters=model.parameters or {}, 2979 )
2981 def create_parent_stream_config( 2982 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 2983 ) -> ParentStreamConfig: 2984 declarative_stream = self._create_component_from_model( 2985 model.stream, 2986 config=config, 2987 is_parent=True, 2988 **kwargs, 2989 ) 2990 request_option = ( 2991 self._create_component_from_model(model.request_option, config=config) 2992 if model.request_option 2993 else None 2994 ) 2995 2996 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2997 raise ValueError( 2998 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2999 ) 3000 3001 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 3002 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 3003 ) 3004 3005 return ParentStreamConfig( 3006 parent_key=model.parent_key, 3007 request_option=request_option, 3008 stream=declarative_stream, 3009 partition_field=model.partition_field, 3010 config=config, 3011 incremental_dependency=model.incremental_dependency or False, 3012 parameters=model.parameters or {}, 3013 extra_fields=model.extra_fields, 3014 lazy_read_pointer=model_lazy_read_pointer, 3015 )
3017 def create_properties_from_endpoint( 3018 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 3019 ) -> PropertiesFromEndpoint: 3020 retriever = self._create_component_from_model( 3021 model=model.retriever, 3022 config=config, 3023 name="dynamic_properties", 3024 primary_key=None, 3025 stream_slicer=None, 3026 transformations=[], 3027 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 3028 ) 3029 return PropertiesFromEndpoint( 3030 property_field_path=model.property_field_path, 3031 retriever=retriever, 3032 config=config, 3033 parameters=model.parameters or {}, 3034 )
3036 def create_property_chunking( 3037 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 3038 ) -> PropertyChunking: 3039 record_merge_strategy = ( 3040 self._create_component_from_model( 3041 model=model.record_merge_strategy, config=config, **kwargs 3042 ) 3043 if model.record_merge_strategy 3044 else None 3045 ) 3046 3047 property_limit_type: PropertyLimitType 3048 match model.property_limit_type: 3049 case PropertyLimitTypeModel.property_count: 3050 property_limit_type = PropertyLimitType.property_count 3051 case PropertyLimitTypeModel.characters: 3052 property_limit_type = PropertyLimitType.characters 3053 case _: 3054 raise ValueError(f"Invalid PropertyLimitType {property_limit_type}") 3055 3056 return PropertyChunking( 3057 property_limit_type=property_limit_type, 3058 property_limit=model.property_limit, 3059 record_merge_strategy=record_merge_strategy, 3060 config=config, 3061 parameters=model.parameters or {}, 3062 )
3064 def create_query_properties( 3065 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 3066 ) -> QueryProperties: 3067 if isinstance(model.property_list, list): 3068 property_list = model.property_list 3069 else: 3070 property_list = self._create_component_from_model( 3071 model=model.property_list, config=config, **kwargs 3072 ) 3073 3074 property_chunking = ( 3075 self._create_component_from_model( 3076 model=model.property_chunking, config=config, **kwargs 3077 ) 3078 if model.property_chunking 3079 else None 3080 ) 3081 3082 property_selector = ( 3083 self._create_component_from_model( 3084 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3085 ) 3086 if model.property_selector 3087 else None 3088 ) 3089 3090 return QueryProperties( 3091 property_list=property_list, 3092 always_include_properties=model.always_include_properties, 3093 property_chunking=property_chunking, 3094 property_selector=property_selector, 3095 config=config, 3096 parameters=model.parameters or {}, 3097 )
3099 def create_json_schema_property_selector( 3100 self, 3101 model: JsonSchemaPropertySelectorModel, 3102 config: Config, 3103 *, 3104 stream_name: str, 3105 **kwargs: Any, 3106 ) -> JsonSchemaPropertySelector: 3107 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3108 3109 transformations = [] 3110 if model.transformations: 3111 for transformation_model in model.transformations: 3112 transformations.append( 3113 self._create_component_from_model(model=transformation_model, config=config) 3114 ) 3115 3116 return JsonSchemaPropertySelector( 3117 configured_stream=configured_stream, 3118 properties_transformations=transformations, 3119 config=config, 3120 parameters=model.parameters or {}, 3121 )
3135 @staticmethod 3136 def create_request_option( 3137 model: RequestOptionModel, config: Config, **kwargs: Any 3138 ) -> RequestOption: 3139 inject_into = RequestOptionType(model.inject_into.value) 3140 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3141 [ 3142 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3143 for segment in model.field_path 3144 ] 3145 if model.field_path 3146 else None 3147 ) 3148 field_name = ( 3149 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3150 if model.field_name 3151 else None 3152 ) 3153 return RequestOption( 3154 field_name=field_name, 3155 field_path=field_path, 3156 inject_into=inject_into, 3157 parameters=kwargs.get("parameters", {}), 3158 )
    def create_record_selector(
        self,
        model: RecordSelectorModel,
        config: Config,
        *,
        name: str,
        transformations: List[RecordTransformation] | None = None,
        decoder: Decoder | None = None,
        client_side_incremental_sync_cursor: Optional[Cursor] = None,
        file_uploader: Optional[DefaultFileUploader] = None,
        **kwargs: Any,
    ) -> RecordSelector:
        """
        Build the RecordSelector that extracts, filters, transforms and normalizes
        records from a response.

        When `client_side_incremental_sync_cursor` is provided, the configured
        record filter is replaced by a ClientSideIncrementalRecordFilterDecorator
        that also drops records older than the cursor, and transformations default
        to running BEFORE filtering (so the cursor field is available to compare).
        """
        extractor = self._create_component_from_model(
            model=model.extractor, decoder=decoder, config=config
        )
        record_filter = (
            self._create_component_from_model(model.record_filter, config=config)
            if model.record_filter
            else None
        )

        # Without client-side incremental sync, transformations run after filtering
        # unless the model explicitly says otherwise.
        transform_before_filtering = (
            False if model.transform_before_filtering is None else model.transform_before_filtering
        )
        if client_side_incremental_sync_cursor:
            # The decorator keeps the original filter condition (if any) and adds
            # cursor-based filtering on top of it.
            record_filter = ClientSideIncrementalRecordFilterDecorator(
                config=config,
                parameters=model.parameters,
                condition=model.record_filter.condition
                if (model.record_filter and hasattr(model.record_filter, "condition"))
                else None,
                cursor=client_side_incremental_sync_cursor,
            )
            transform_before_filtering = (
                True
                if model.transform_before_filtering is None
                else model.transform_before_filtering
            )

        if model.schema_normalization is None:
            # default to no schema normalization if not set
            # NOTE: this mutates the model in place so downstream reads see the default.
            model.schema_normalization = SchemaNormalizationModel.None_

        # Built-in normalizations map to a TypeTransformer; anything else is treated
        # as a custom normalization component.
        schema_normalization = (
            TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization])
            if isinstance(model.schema_normalization, SchemaNormalizationModel)
            else self._create_component_from_model(model.schema_normalization, config=config)  # type: ignore[arg-type] # custom normalization model expected here
        )

        return RecordSelector(
            extractor=extractor,
            name=name,
            config=config,
            record_filter=record_filter,
            transformations=transformations or [],
            file_uploader=file_uploader,
            schema_normalization=schema_normalization,
            parameters=model.parameters or {},
            transform_before_filtering=transform_before_filtering,
        )
3229 def create_selective_authenticator( 3230 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3231 ) -> DeclarativeAuthenticator: 3232 authenticators = { 3233 name: self._create_component_from_model(model=auth, config=config) 3234 for name, auth in model.authenticators.items() 3235 } 3236 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 3237 return SelectiveAuthenticator( # type: ignore[abstract] 3238 config=config, 3239 authenticators=authenticators, 3240 authenticator_selection_path=model.authenticator_selection_path, 3241 **kwargs, 3242 )
3244 @staticmethod 3245 def create_legacy_session_token_authenticator( 3246 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3247 ) -> LegacySessionTokenAuthenticator: 3248 return LegacySessionTokenAuthenticator( 3249 api_url=url_base, 3250 header=model.header, 3251 login_url=model.login_url, 3252 password=model.password or "", 3253 session_token=model.session_token or "", 3254 session_token_response_key=model.session_token_response_key or "", 3255 username=model.username or "", 3256 validate_session_url=model.validate_session_url, 3257 config=config, 3258 parameters=model.parameters or {}, 3259 )
    def create_simple_retriever(
        self,
        model: SimpleRetrieverModel,
        config: Config,
        *,
        name: str,
        primary_key: Optional[Union[str, List[str], List[List[str]]]],
        request_options_provider: Optional[RequestOptionsProvider] = None,
        cursor: Optional[Cursor] = None,
        has_stop_condition_cursor: bool = False,
        is_client_side_incremental_sync: bool = False,
        transformations: List[RecordTransformation],
        file_uploader: Optional[DefaultFileUploader] = None,
        incremental_sync: Optional[
            Union[IncrementingCountCursorModel, DatetimeBasedCursorModel]
        ] = None,
        use_cache: Optional[bool] = None,
        log_formatter: Optional[Callable[[Response], Any]] = None,
        partition_router: Optional[PartitionRouter] = None,
        **kwargs: Any,
    ) -> SimpleRetriever:
        """
        Build a SimpleRetriever (requester + record selector + paginator) from its model.

        Also resolves QueryProperties from the three supported declaration sites
        (request_parameters, the deprecated fetch_properties_from_endpoint, or the
        requester's query_properties field), and returns a LazySimpleRetriever
        instead when a substream partition router uses lazy_read_pointer and there
        is no prior stream state.
        """
        def _get_url(req: Requester) -> str:
            """
            Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever.
            This is needed because the URL is not set until the requester is created.
            """

            _url: str = (
                model.requester.url
                if hasattr(model.requester, "url") and model.requester.url is not None
                else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
            )
            _url_base: str = (
                model.requester.url_base
                if hasattr(model.requester, "url_base") and model.requester.url_base is not None
                else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None)
            )

            return _url or _url_base

        # Streams without incremental sync still need a cursor object for state emission.
        if cursor is None:
            cursor = FinalStateCursor(name, None, self._message_repository)

        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        record_selector = self._create_component_from_model(
            model=model.record_selector,
            name=name,
            config=config,
            decoder=decoder,
            transformations=transformations,
            client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None,
            file_uploader=file_uploader,
        )

        query_properties: Optional[QueryProperties] = None
        query_properties_key: Optional[str] = None
        self._ensure_query_properties_to_model(model.requester)
        if self._has_query_properties_in_request_parameters(model.requester):
            # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple
            # places instead of default to request_parameters which isn't clearly documented
            if (
                hasattr(model.requester, "fetch_properties_from_endpoint")
                and model.requester.fetch_properties_from_endpoint
            ):
                raise ValueError(
                    f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters"
                )

            query_properties_definitions = []
            for key, request_parameter in model.requester.request_parameters.items():  # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters()
                if isinstance(request_parameter, QueryPropertiesModel):
                    query_properties_key = key
                    query_properties_definitions.append(request_parameter)

            if len(query_properties_definitions) > 1:
                raise ValueError(
                    f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages"
                )

            if len(query_properties_definitions) == 1:
                query_properties = self._create_component_from_model(
                    model=query_properties_definitions[0], stream_name=name, config=config
                )

            # Removes QueryProperties components from the interpolated mappings because it has been designed
            # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly
            # instead of through jinja interpolation
            if hasattr(model.requester, "request_parameters") and isinstance(
                model.requester.request_parameters, Mapping
            ):
                model.requester.request_parameters = self._remove_query_properties(
                    model.requester.request_parameters
                )
        elif (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            # todo: Deprecate this condition once dependent connectors migrate to query_properties
            query_properties_definition = QueryPropertiesModel(
                type="QueryProperties",
                property_list=model.requester.fetch_properties_from_endpoint,
                always_include_properties=None,
                property_chunking=None,
            )  # type: ignore # $parameters has a default value

            query_properties = self.create_query_properties(
                model=query_properties_definition,
                stream_name=name,
                config=config,
            )
        elif hasattr(model.requester, "query_properties") and model.requester.query_properties:
            query_properties = self.create_query_properties(
                model=model.requester.query_properties,
                stream_name=name,
                config=config,
            )

        requester = self._create_component_from_model(
            model=model.requester,
            decoder=decoder,
            name=name,
            query_properties_key=query_properties_key,
            use_cache=use_cache,
            config=config,
        )

        # A partition router can double as the request options provider when the
        # caller did not supply a specific one.
        if not request_options_provider:
            request_options_provider = DefaultRequestOptionsProvider(parameters={})
        if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(
            partition_router, PartitionRouter
        ):
            request_options_provider = partition_router

        paginator = (
            self._create_component_from_model(
                model=model.paginator,
                config=config,
                url_base=_get_url(requester),
                extractor_model=model.record_selector.extractor,
                decoder=decoder,
                cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None,
            )
            if model.paginator
            else NoPagination(parameters={})
        )

        ignore_stream_slicer_parameters_on_paginated_requests = (
            model.ignore_stream_slicer_parameters_on_paginated_requests or False
        )

        # Lazy-read path: substream partition router with lazy_read_pointer and no
        # prior state for this stream. Several configurations are rejected because
        # LazySimpleRetriever only supports single-slice, JSON-decoded reads.
        if (
            model.partition_router
            and isinstance(model.partition_router, SubstreamPartitionRouterModel)
            and not bool(self._connector_state_manager.get_stream_state(name, None))
            and any(
                parent_stream_config.lazy_read_pointer
                for parent_stream_config in model.partition_router.parent_stream_configs
            )
        ):
            if incremental_sync:
                if incremental_sync.type != "DatetimeBasedCursor":
                    raise ValueError(
                        f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                    )

                elif incremental_sync.step or incremental_sync.cursor_granularity:
                    raise ValueError(
                        f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
                    )

            if model.decoder and model.decoder.type != "JsonDecoder":
                raise ValueError(
                    f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
                )

            return LazySimpleRetriever(
                name=name,
                paginator=paginator,
                primary_key=primary_key,
                requester=requester,
                record_selector=record_selector,
                stream_slicer=_NO_STREAM_SLICING,
                request_option_provider=request_options_provider,
                config=config,
                ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
                parameters=model.parameters or {},
            )

        if (
            model.record_selector.record_filter
            and model.pagination_reset
            and model.pagination_reset.limits
        ):
            raise ValueError("PaginationResetLimits are not supported while having record filter.")

        return SimpleRetriever(
            name=name,
            paginator=paginator,
            primary_key=primary_key,
            requester=requester,
            record_selector=record_selector,
            stream_slicer=_NO_STREAM_SLICING,
            request_option_provider=request_options_provider,
            config=config,
            ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
            additional_query_properties=query_properties,
            log_formatter=self._get_log_formatter(log_formatter, name),
            pagination_tracker_factory=self._create_pagination_tracker_factory(
                model.pagination_reset, cursor
            ),
            parameters=model.parameters or {},
        )
    def create_state_delegating_stream(
        self,
        model: StateDelegatingStreamModel,
        config: Config,
        **kwargs: Any,
    ) -> DefaultStream:
        """
        Build either the full-refresh or the incremental variant of a
        StateDelegatingStream, depending on whether stream state exists.

        If `api_retention_period` is configured and the persisted cursor is older
        than that period, state is cleared (and a cleared-state message emitted)
        and the full-refresh variant is returned instead.
        """
        if (
            model.full_refresh_stream.name != model.name
            or model.name != model.incremental_stream.name
        ):
            raise ValueError(
                f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
            )

        # Resolve api_retention_period with config context (supports Jinja2 interpolation)
        resolved_retention_period: Optional[str] = None
        if model.api_retention_period:
            interpolated_retention = InterpolatedString.create(
                model.api_retention_period, parameters=model.parameters or {}
            )
            resolved_value = interpolated_retention.eval(config=config)
            if resolved_value:
                resolved_retention_period = str(resolved_value)

        # IncrementingCountCursor has no datetime semantics, so retention-based
        # cursor age validation cannot apply to it.
        if resolved_retention_period:
            for stream_model in (model.full_refresh_stream, model.incremental_stream):
                if isinstance(stream_model.incremental_sync, IncrementingCountCursorModel):
                    raise ValueError(
                        f"Stream '{model.name}' uses IncrementingCountCursor which is not supported "
                        f"with api_retention_period. IncrementingCountCursor does not use datetime-based "
                        f"cursors, so cursor age validation cannot be performed."
                    )

        stream_state = self._connector_state_manager.get_stream_state(model.name, None)

        # No prior state: delegate to the full-refresh variant.
        if not stream_state:
            return self._create_component_from_model(  # type: ignore[no-any-return]
                model.full_refresh_stream, config=config, **kwargs
            )

        incremental_stream: DefaultStream = self._create_component_from_model(
            model.incremental_stream, config=config, **kwargs
        )  # type: ignore[assignment]

        # Only run cursor age validation for streams that are in the configured
        # catalog (or when no catalog was provided, e.g. during discover / connector
        # builder). Streams not selected by the user but instantiated as parent-stream
        # dependencies must not go through this path because it emits state messages
        # that the destination does not know about, causing "Stream not found" crashes.
        stream_is_in_catalog = (
            not self._stream_name_to_configured_stream  # no catalog → validate by default
            or model.name in self._stream_name_to_configured_stream
        )
        if resolved_retention_period and stream_is_in_catalog:
            full_refresh_stream: DefaultStream = self._create_component_from_model(
                model.full_refresh_stream, config=config, **kwargs
            )  # type: ignore[assignment]
            if self._is_cursor_older_than_retention_period(
                stream_state,
                full_refresh_stream.cursor,
                incremental_stream.cursor,
                resolved_retention_period,
                model.name,
            ):
                # Clear state BEFORE constructing the full_refresh_stream so that
                # its cursor starts from start_date instead of the stale cursor.
                self._connector_state_manager.update_state_for_stream(model.name, None, {})
                state_message = self._connector_state_manager.create_state_message(model.name, None)
                self._message_repository.emit_message(state_message)
                return self._create_component_from_model(  # type: ignore[no-any-return]
                    model.full_refresh_stream, config=config, **kwargs
                )

        return incremental_stream
    def create_async_retriever(
        self,
        model: AsyncRetrieverModel,
        config: Config,
        *,
        name: str,
        primary_key: Optional[
            Union[str, List[str], List[List[str]]]
        ],  # this seems to be needed to match create_simple_retriever
        stream_slicer: Optional[StreamSlicer],
        client_side_incremental_sync: Optional[Dict[str, Any]] = None,
        transformations: List[RecordTransformation],
        **kwargs: Any,
    ) -> AsyncRetriever:
        """
        Build an AsyncRetriever for async/bulk job APIs: a job is created, polled
        for status, its results downloaded, and optionally aborted/deleted, each
        via a dedicated requester built from the model.
        """
        if model.download_target_requester and not model.download_target_extractor:
            raise ValueError(
                f"`download_target_extractor` required if using a `download_target_requester`"
            )

        def _get_download_retriever(
            requester: Requester, extractor: RecordExtractor, _decoder: Decoder
        ) -> SimpleRetriever:
            # We create a record selector for the download retriever
            # with no schema normalization and no transformations, neither record filter
            # as all this occurs in the record_selector of the AsyncRetriever
            record_selector = RecordSelector(
                extractor=extractor,
                name=name,
                record_filter=None,
                transformations=[],
                schema_normalization=TypeTransformer(TransformConfig.NoTransform),
                config=config,
                parameters={},
            )
            paginator = (
                self._create_component_from_model(
                    model=model.download_paginator,
                    decoder=_decoder,
                    config=config,
                    url_base="",
                )
                if model.download_paginator
                else NoPagination(parameters={})
            )

            return SimpleRetriever(
                requester=requester,
                record_selector=record_selector,
                primary_key=None,
                name=name,
                paginator=paginator,
                config=config,
                parameters={},
                log_formatter=self._get_log_formatter(None, name),
            )

        def _get_job_timeout() -> datetime.timedelta:
            # polling_job_timeout may be interpolated from config; coerce to int minutes.
            user_defined_timeout: Optional[int] = (
                int(
                    InterpolatedString.create(
                        str(model.polling_job_timeout),
                        parameters={},
                    ).eval(config)
                )
                if model.polling_job_timeout
                else None
            )

            # check for user defined timeout during the test read or 15 minutes
            test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15)
            # default value for non-connector builder is 60 minutes.
            default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60)

            return (
                test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout
            )

        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        record_selector = self._create_component_from_model(
            model=model.record_selector,
            config=config,
            decoder=decoder,
            name=name,
            transformations=transformations,
            client_side_incremental_sync=client_side_incremental_sync,
        )

        # Cap the number of slices during connector-builder test reads.
        stream_slicer = stream_slicer or SinglePartitionRouter(parameters={})
        if self._should_limit_slices_fetched():
            stream_slicer = cast(
                StreamSlicer,
                StreamSlicerTestReadDecorator(
                    wrapped_slicer=stream_slicer,
                    maximum_number_of_slices=self._limit_slices_fetched or 5,
                ),
            )

        creation_requester = self._create_component_from_model(
            model=model.creation_requester,
            decoder=decoder,
            config=config,
            name=f"job creation - {name}",
        )
        polling_requester = self._create_component_from_model(
            model=model.polling_requester,
            decoder=decoder,
            config=config,
            name=f"job polling - {name}",
        )
        job_download_components_name = f"job download - {name}"
        # The download leg may use its own decoder/extractor (e.g. CSV results).
        download_decoder = (
            self._create_component_from_model(model=model.download_decoder, config=config)
            if model.download_decoder
            else JsonDecoder(parameters={})
        )
        download_extractor = (
            self._create_component_from_model(
                model=model.download_extractor,
                config=config,
                decoder=download_decoder,
                parameters=model.parameters,
            )
            if model.download_extractor
            else DpathExtractor(
                [],
                config=config,
                decoder=download_decoder,
                parameters=model.parameters or {},
            )
        )
        download_requester = self._create_component_from_model(
            model=model.download_requester,
            decoder=download_decoder,
            config=config,
            name=job_download_components_name,
        )
        download_retriever = _get_download_retriever(
            download_requester, download_extractor, download_decoder
        )
        abort_requester = (
            self._create_component_from_model(
                model=model.abort_requester,
                decoder=decoder,
                config=config,
                name=f"job abort - {name}",
            )
            if model.abort_requester
            else None
        )
        delete_requester = (
            self._create_component_from_model(
                model=model.delete_requester,
                decoder=decoder,
                config=config,
                name=f"job delete - {name}",
            )
            if model.delete_requester
            else None
        )
        download_target_requester = (
            self._create_component_from_model(
                model=model.download_target_requester,
                decoder=decoder,
                config=config,
                name=f"job extract_url - {name}",
            )
            if model.download_target_requester
            else None
        )
        status_extractor = self._create_component_from_model(
            model=model.status_extractor, decoder=decoder, config=config, name=name
        )
        download_target_extractor = (
            self._create_component_from_model(
                model=model.download_target_extractor,
                decoder=decoder,
                config=config,
                name=name,
            )
            if model.download_target_extractor
            else None
        )

        job_repository: AsyncJobRepository = AsyncHttpJobRepository(
            creation_requester=creation_requester,
            polling_requester=polling_requester,
            download_retriever=download_retriever,
            download_target_requester=download_target_requester,
            abort_requester=abort_requester,
            delete_requester=delete_requester,
            status_extractor=status_extractor,
            status_mapping=self._create_async_job_status_mapping(model.status_mapping, config),
            download_target_extractor=download_target_extractor,
            job_timeout=_get_job_timeout(),
        )

        async_job_partition_router = AsyncJobPartitionRouter(
            job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
                job_repository,
                stream_slices,
                self._job_tracker,
                self._message_repository,
                # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk
                has_bulk_parent=False,
                # set the `job_max_retry` to 1 for the `Connector Builder`` use-case.
                # `None` == default retry is set to 3 attempts, under the hood.
                job_max_retry=1 if self._emit_connector_builder_messages else None,
            ),
            stream_slicer=stream_slicer,
            config=config,
            parameters=model.parameters or {},
        )

        return AsyncRetriever(
            record_selector=record_selector,
            stream_slicer=async_job_partition_router,
            config=config,
            parameters=model.parameters or {},
        )
3938 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3939 config_migrations = [ 3940 self._create_component_from_model(migration, config) 3941 for migration in ( 3942 model.config_normalization_rules.config_migrations 3943 if ( 3944 model.config_normalization_rules 3945 and model.config_normalization_rules.config_migrations 3946 ) 3947 else [] 3948 ) 3949 ] 3950 config_transformations = [ 3951 self._create_component_from_model(transformation, config) 3952 for transformation in ( 3953 model.config_normalization_rules.transformations 3954 if ( 3955 model.config_normalization_rules 3956 and model.config_normalization_rules.transformations 3957 ) 3958 else [] 3959 ) 3960 ] 3961 config_validations = [ 3962 self._create_component_from_model(validation, config) 3963 for validation in ( 3964 model.config_normalization_rules.validations 3965 if ( 3966 model.config_normalization_rules 3967 and model.config_normalization_rules.validations 3968 ) 3969 else [] 3970 ) 3971 ] 3972 3973 return Spec( 3974 connection_specification=model.connection_specification, 3975 documentation_url=model.documentation_url, 3976 advanced_auth=model.advanced_auth, 3977 parameters={}, 3978 config_migrations=config_migrations, 3979 config_transformations=config_transformations, 3980 config_validations=config_validations, 3981 )
3983 def create_substream_partition_router( 3984 self, 3985 model: SubstreamPartitionRouterModel, 3986 config: Config, 3987 *, 3988 stream_name: str, 3989 **kwargs: Any, 3990 ) -> SubstreamPartitionRouter: 3991 parent_stream_configs = [] 3992 if model.parent_stream_configs: 3993 parent_stream_configs.extend( 3994 [ 3995 self.create_parent_stream_config_with_substream_wrapper( 3996 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 3997 ) 3998 for parent_stream_config in model.parent_stream_configs 3999 ] 4000 ) 4001 4002 return SubstreamPartitionRouter( 4003 parent_stream_configs=parent_stream_configs, 4004 parameters=model.parameters or {}, 4005 config=config, 4006 )
4008 def create_parent_stream_config_with_substream_wrapper( 4009 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 4010 ) -> Any: 4011 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 4012 4013 parent_state: Optional[Mapping[str, Any]] = ( 4014 child_state if model.incremental_dependency and child_state else None 4015 ) 4016 connector_state_manager = self._instantiate_parent_stream_state_manager( 4017 child_state, config, model, parent_state 4018 ) 4019 4020 substream_factory = ModelToComponentFactory( 4021 connector_state_manager=connector_state_manager, 4022 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 4023 limit_slices_fetched=self._limit_slices_fetched, 4024 emit_connector_builder_messages=self._emit_connector_builder_messages, 4025 disable_retries=self._disable_retries, 4026 disable_cache=self._disable_cache, 4027 message_repository=StateFilteringMessageRepository( 4028 LogAppenderMessageRepositoryDecorator( 4029 { 4030 "airbyte_cdk": {"stream": {"is_substream": True}}, 4031 "http": {"is_auxiliary": True}, 4032 }, 4033 self._message_repository, 4034 self._evaluate_log_level(self._emit_connector_builder_messages), 4035 ), 4036 ), 4037 api_budget=self._api_budget, 4038 ) 4039 4040 return substream_factory.create_parent_stream_config( 4041 model=model, config=config, stream_name=stream_name, **kwargs 4042 )
4102 @staticmethod 4103 def create_wait_time_from_header( 4104 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 4105 ) -> WaitTimeFromHeaderBackoffStrategy: 4106 return WaitTimeFromHeaderBackoffStrategy( 4107 header=model.header, 4108 parameters=model.parameters or {}, 4109 config=config, 4110 regex=model.regex, 4111 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 4112 if model.max_waiting_time_in_seconds is not None 4113 else None, 4114 )
4116 @staticmethod 4117 def create_wait_until_time_from_header( 4118 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 4119 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 4120 return WaitUntilTimeFromHeaderBackoffStrategy( 4121 header=model.header, 4122 parameters=model.parameters or {}, 4123 config=config, 4124 min_wait=model.min_wait, 4125 regex=model.regex, 4126 )
4134 @staticmethod 4135 def create_components_mapping_definition( 4136 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 4137 ) -> ComponentMappingDefinition: 4138 interpolated_value = InterpolatedString.create( 4139 model.value, parameters=model.parameters or {} 4140 ) 4141 field_path = [ 4142 InterpolatedString.create(path, parameters=model.parameters or {}) 4143 for path in model.field_path 4144 ] 4145 return ComponentMappingDefinition( 4146 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 4147 value=interpolated_value, 4148 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 4149 create_or_update=model.create_or_update, 4150 condition=model.condition, 4151 parameters=model.parameters or {}, 4152 )
4154 def create_http_components_resolver( 4155 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 4156 ) -> Any: 4157 retriever = self._create_component_from_model( 4158 model=model.retriever, 4159 config=config, 4160 name=f"{stream_name if stream_name else '__http_components_resolver'}", 4161 primary_key=None, 4162 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4163 transformations=[], 4164 ) 4165 4166 components_mapping = [] 4167 for component_mapping_definition_model in model.components_mapping: 4168 if component_mapping_definition_model.condition: 4169 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4170 components_mapping.append( 4171 self._create_component_from_model( 4172 model=component_mapping_definition_model, 4173 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4174 component_mapping_definition_model.value_type 4175 ), 4176 config=config, 4177 ) 4178 ) 4179 4180 return HttpComponentsResolver( 4181 retriever=retriever, 4182 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4183 config=config, 4184 components_mapping=components_mapping, 4185 parameters=model.parameters or {}, 4186 )
4188 @staticmethod 4189 def create_stream_config( 4190 model: StreamConfigModel, config: Config, **kwargs: Any 4191 ) -> StreamConfig: 4192 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 4193 [x for x in model.configs_pointer] if model.configs_pointer else [] 4194 ) 4195 4196 return StreamConfig( 4197 configs_pointer=model_configs_pointer, 4198 default_values=model.default_values, 4199 parameters=model.parameters or {}, 4200 )
4202 def create_config_components_resolver( 4203 self, 4204 model: ConfigComponentsResolverModel, 4205 config: Config, 4206 ) -> Any: 4207 model_stream_configs = ( 4208 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 4209 ) 4210 4211 stream_configs = [ 4212 self._create_component_from_model( 4213 stream_config, config=config, parameters=model.parameters or {} 4214 ) 4215 for stream_config in model_stream_configs 4216 ] 4217 4218 components_mapping = [ 4219 self._create_component_from_model( 4220 model=components_mapping_definition_model, 4221 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4222 components_mapping_definition_model.value_type 4223 ), 4224 config=config, 4225 parameters=model.parameters, 4226 ) 4227 for components_mapping_definition_model in model.components_mapping 4228 ] 4229 4230 return ConfigComponentsResolver( 4231 stream_configs=stream_configs, 4232 config=config, 4233 components_mapping=components_mapping, 4234 parameters=model.parameters or {}, 4235 )
4237 def create_parametrized_components_resolver( 4238 self, 4239 model: ParametrizedComponentsResolverModel, 4240 config: Config, 4241 ) -> ParametrizedComponentsResolver: 4242 stream_parameters = StreamParametersDefinition( 4243 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 4244 ) 4245 4246 components_mapping = [] 4247 for components_mapping_definition_model in model.components_mapping: 4248 if components_mapping_definition_model.condition: 4249 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4250 components_mapping.append( 4251 self._create_component_from_model( 4252 model=components_mapping_definition_model, 4253 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4254 components_mapping_definition_model.value_type 4255 ), 4256 config=config, 4257 ) 4258 ) 4259 return ParametrizedComponentsResolver( 4260 stream_parameters=stream_parameters, 4261 config=config, 4262 components_mapping=components_mapping, 4263 parameters=model.parameters or {}, 4264 )
4288 def create_http_api_budget( 4289 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4290 ) -> HttpAPIBudget: 4291 policies = [ 4292 self._create_component_from_model(model=policy, config=config) 4293 for policy in model.policies 4294 ] 4295 4296 return HttpAPIBudget( 4297 policies=policies, 4298 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4299 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4300 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4301 )
4303 def create_fixed_window_call_rate_policy( 4304 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4305 ) -> FixedWindowCallRatePolicy: 4306 matchers = [ 4307 self._create_component_from_model(model=matcher, config=config) 4308 for matcher in model.matchers 4309 ] 4310 4311 # Set the initial reset timestamp to 10 days from now. 4312 # This value will be updated by the first request. 4313 return FixedWindowCallRatePolicy( 4314 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4315 period=parse_duration(model.period), 4316 call_limit=model.call_limit, 4317 matchers=matchers, 4318 )
4320 def create_file_uploader( 4321 self, model: FileUploaderModel, config: Config, **kwargs: Any 4322 ) -> FileUploader: 4323 name = "File Uploader" 4324 requester = self._create_component_from_model( 4325 model=model.requester, 4326 config=config, 4327 name=name, 4328 **kwargs, 4329 ) 4330 download_target_extractor = self._create_component_from_model( 4331 model=model.download_target_extractor, 4332 config=config, 4333 name=name, 4334 **kwargs, 4335 ) 4336 emit_connector_builder_messages = self._emit_connector_builder_messages 4337 file_uploader = DefaultFileUploader( 4338 requester=requester, 4339 download_target_extractor=download_target_extractor, 4340 config=config, 4341 file_writer=NoopFileWriter() 4342 if emit_connector_builder_messages 4343 else LocalFileSystemFileWriter(), 4344 parameters=model.parameters or {}, 4345 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4346 ) 4347 4348 return ( 4349 ConnectorBuilderFileUploader(file_uploader) 4350 if emit_connector_builder_messages 4351 else file_uploader 4352 )
4354 def create_moving_window_call_rate_policy( 4355 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4356 ) -> MovingWindowCallRatePolicy: 4357 rates = [ 4358 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4359 ] 4360 matchers = [ 4361 self._create_component_from_model(model=matcher, config=config) 4362 for matcher in model.matchers 4363 ] 4364 return MovingWindowCallRatePolicy( 4365 rates=rates, 4366 matchers=matchers, 4367 )
4369 def create_unlimited_call_rate_policy( 4370 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4371 ) -> UnlimitedCallRatePolicy: 4372 matchers = [ 4373 self._create_component_from_model(model=matcher, config=config) 4374 for matcher in model.matchers 4375 ] 4376 4377 return UnlimitedCallRatePolicy( 4378 matchers=matchers, 4379 )
4388 def create_http_request_matcher( 4389 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4390 ) -> HttpRequestRegexMatcher: 4391 weight = model.weight 4392 if weight is not None: 4393 if isinstance(weight, str): 4394 weight = int(InterpolatedString.create(weight, parameters={}).eval(config)) 4395 else: 4396 weight = int(weight) 4397 if weight < 1: 4398 raise ValueError(f"weight must be >= 1, got {weight}") 4399 return HttpRequestRegexMatcher( 4400 method=model.method, 4401 url_base=model.url_base, 4402 url_path_pattern=model.url_path_pattern, 4403 params=model.params, 4404 headers=model.headers, 4405 weight=weight, 4406 )
4413 def create_grouping_partition_router( 4414 self, 4415 model: GroupingPartitionRouterModel, 4416 config: Config, 4417 *, 4418 stream_name: str, 4419 **kwargs: Any, 4420 ) -> GroupingPartitionRouter: 4421 underlying_router = self._create_component_from_model( 4422 model=model.underlying_partition_router, 4423 config=config, 4424 stream_name=stream_name, 4425 **kwargs, 4426 ) 4427 if model.group_size < 1: 4428 raise ValueError(f"Group size must be greater than 0, got {model.group_size}") 4429 4430 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4431 # because they are specific to individual partitions and cannot be aggregated or handled 4432 # when grouping, potentially leading to incorrect API calls. Any request customization 4433 # should be managed at the stream level through the requester's configuration. 4434 if isinstance(underlying_router, SubstreamPartitionRouter): 4435 if any( 4436 parent_config.request_option 4437 for parent_config in underlying_router.parent_stream_configs 4438 ): 4439 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4440 4441 if isinstance(underlying_router, ListPartitionRouter): 4442 if underlying_router.request_option: 4443 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4444 4445 return GroupingPartitionRouter( 4446 group_size=model.group_size, 4447 underlying_partition_router=underlying_router, 4448 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4449 config=config, 4450 )