airbyte_cdk.sources.declarative.parsers.model_to_component_factory
#
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
#

from __future__ import annotations

import datetime
import importlib
import inspect
import logging
import re
from functools import partial
from typing import (
    Any,
    Callable,
    Dict,
    List,
    Mapping,
    MutableMapping,
    Optional,
    Type,
    Union,
    cast,
    get_args,
    get_origin,
    get_type_hints,
)

from isodate import parse_duration
from pydantic.v1 import BaseModel
from requests import Response

from airbyte_cdk.connector_builder.models import (
    LogMessage as ConnectorBuilderLogMessage,
)
from airbyte_cdk.legacy.sources.declarative.declarative_stream import DeclarativeStream
from airbyte_cdk.legacy.sources.declarative.incremental import (
    DatetimeBasedCursor,
)
from airbyte_cdk.models import (
    AirbyteStateBlob,
    AirbyteStateMessage,
    AirbyteStateType,
    AirbyteStreamState,
    FailureType,
    Level,
    StreamDescriptor,
)
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator
from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker
from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository
from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus
from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator
from airbyte_cdk.sources.declarative.auth.declarative_authenticator import (
    DeclarativeAuthenticator,
    NoAuth,
)
from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm
from airbyte_cdk.sources.declarative.auth.oauth import (
    DeclarativeSingleUseRefreshTokenOauth2Authenticator,
)
from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator
from airbyte_cdk.sources.declarative.auth.token import (
    ApiKeyAuthenticator,
    BasicHttpAuthenticator,
    BearerAuthenticator,
    LegacySessionTokenAuthenticator,
)
from airbyte_cdk.sources.declarative.auth.token_provider import (
    InterpolatedStringTokenProvider,
    SessionTokenProvider,
    TokenProvider,
)
from airbyte_cdk.sources.declarative.checks import (
    CheckDynamicStream,
    CheckStream,
    DynamicStreamCheckConfig,
)
from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel
from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime
from airbyte_cdk.sources.declarative.decoders import (
    Decoder,
    IterableDecoder,
    JsonDecoder,
    PaginationDecoderDecorator,
    XmlDecoder,
    ZipfileDecoder,
)
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
    CompositeRawDecoder,
    CsvParser,
    GzipParser,
    JsonLineParser,
    JsonParser,
    Parser,
)
from airbyte_cdk.sources.declarative.extractors import (
    DpathExtractor,
    RecordFilter,
    RecordSelector,
    ResponseToFileExtractor,
)
from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor
from airbyte_cdk.sources.declarative.extractors.record_filter import (
    ClientSideIncrementalRecordFilterDecorator,
)
from airbyte_cdk.sources.declarative.incremental import (
    ConcurrentCursorFactory,
    ConcurrentPerPartitionCursor,
)
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping
from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import (
    LegacyToPerPartitionStateMigration,
)
from airbyte_cdk.sources.declarative.models import (
    CustomStateMigration,
)
from airbyte_cdk.sources.declarative.models.base_model_with_deprecations import (
    DEPRECATION_LOGS_TAG,
    BaseModelWithDeprecations,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    AddedFieldDefinition as AddedFieldDefinitionModel,
    AddFields as AddFieldsModel,
    ApiKeyAuthenticator as ApiKeyAuthenticatorModel,
    AsyncJobStatusMap as AsyncJobStatusMapModel,
    AsyncRetriever as AsyncRetrieverModel,
    BasicHttpAuthenticator as BasicHttpAuthenticatorModel,
    BearerAuthenticator as BearerAuthenticatorModel,
    CheckDynamicStream as CheckDynamicStreamModel,
    CheckStream as CheckStreamModel,
    ComplexFieldType as ComplexFieldTypeModel,
    ComponentMappingDefinition as ComponentMappingDefinitionModel,
    CompositeErrorHandler as CompositeErrorHandlerModel,
    ConcurrencyLevel as ConcurrencyLevelModel,
    ConfigAddFields as ConfigAddFieldsModel,
    ConfigComponentsResolver as ConfigComponentsResolverModel,
    ConfigMigration as ConfigMigrationModel,
    ConfigRemapField as ConfigRemapFieldModel,
    ConfigRemoveFields as ConfigRemoveFieldsModel,
    ConstantBackoffStrategy as ConstantBackoffStrategyModel,
    CsvDecoder as CsvDecoderModel,
    CursorPagination as CursorPaginationModel,
    CustomAuthenticator as CustomAuthenticatorModel,
    CustomBackoffStrategy as CustomBackoffStrategyModel,
    CustomConfigTransformation as CustomConfigTransformationModel,
    CustomDecoder as CustomDecoderModel,
    CustomErrorHandler as CustomErrorHandlerModel,
    CustomPaginationStrategy as CustomPaginationStrategyModel,
    CustomPartitionRouter as CustomPartitionRouterModel,
    CustomRecordExtractor as CustomRecordExtractorModel,
    CustomRecordFilter as CustomRecordFilterModel,
    CustomRequester as CustomRequesterModel,
    CustomRetriever as CustomRetrieverModel,
    CustomSchemaLoader as CustomSchemaLoader,
    CustomSchemaNormalization as CustomSchemaNormalizationModel,
    CustomTransformation as CustomTransformationModel,
    CustomValidationStrategy as CustomValidationStrategyModel,
    DatetimeBasedCursor as DatetimeBasedCursorModel,
    DeclarativeStream as DeclarativeStreamModel,
    DefaultErrorHandler as DefaultErrorHandlerModel,
    DefaultPaginator as DefaultPaginatorModel,
    DpathExtractor as DpathExtractorModel,
    DpathFlattenFields as DpathFlattenFieldsModel,
    DpathValidator as DpathValidatorModel,
    DynamicSchemaLoader as DynamicSchemaLoaderModel,
    DynamicStreamCheckConfig as DynamicStreamCheckConfigModel,
    ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
    FileUploader as FileUploaderModel,
    FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel,
    FlattenFields as FlattenFieldsModel,
    GroupByKeyMergeStrategy as GroupByKeyMergeStrategyModel,
    GroupingPartitionRouter as GroupingPartitionRouterModel,
    GzipDecoder as GzipDecoderModel,
    HTTPAPIBudget as HTTPAPIBudgetModel,
    HttpComponentsResolver as HttpComponentsResolverModel,
    HttpRequester as HttpRequesterModel,
    HttpRequestRegexMatcher as HttpRequestRegexMatcherModel,
    HttpResponseFilter as HttpResponseFilterModel,
    IncrementingCountCursor as IncrementingCountCursorModel,
    InlineSchemaLoader as InlineSchemaLoaderModel,
    IterableDecoder as IterableDecoderModel,
    JsonDecoder as JsonDecoderModel,
    JsonFileSchemaLoader as JsonFileSchemaLoaderModel,
    JsonlDecoder as JsonlDecoderModel,
    JwtAuthenticator as JwtAuthenticatorModel,
    JwtHeaders as JwtHeadersModel,
    JwtPayload as JwtPayloadModel,
    KeysReplace as KeysReplaceModel,
    KeysToLower as KeysToLowerModel,
    KeysToSnakeCase as KeysToSnakeCaseModel,
    LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel,
    LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel,
    ListPartitionRouter as ListPartitionRouterModel,
    MinMaxDatetime as MinMaxDatetimeModel,
    MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel,
    NoAuth as NoAuthModel,
    NoPagination as NoPaginationModel,
    OAuthAuthenticator as OAuthAuthenticatorModel,
    OffsetIncrement as OffsetIncrementModel,
    PageIncrement as PageIncrementModel,
    ParametrizedComponentsResolver as ParametrizedComponentsResolverModel,
    ParentStreamConfig as ParentStreamConfigModel,
    PredicateValidator as PredicateValidatorModel,
    PropertiesFromEndpoint as PropertiesFromEndpointModel,
    PropertyChunking as PropertyChunkingModel,
    PropertyLimitType as PropertyLimitTypeModel,
    QueryProperties as QueryPropertiesModel,
    Rate as RateModel,
    RecordFilter as RecordFilterModel,
    RecordSelector as RecordSelectorModel,
    RemoveFields as RemoveFieldsModel,
    RequestOption as RequestOptionModel,
    RequestPath as RequestPathModel,
    ResponseToFileExtractor as ResponseToFileExtractorModel,
    SchemaNormalization as SchemaNormalizationModel,
    SchemaTypeIdentifier as SchemaTypeIdentifierModel,
    SelectiveAuthenticator as SelectiveAuthenticatorModel,
    SessionTokenAuthenticator as SessionTokenAuthenticatorModel,
    SimpleRetriever as SimpleRetrieverModel,
    Spec as SpecModel,
    StateDelegatingStream as StateDelegatingStreamModel,
    StreamConfig as StreamConfigModel,
    SubstreamPartitionRouter as SubstreamPartitionRouterModel,
    TypesMap as TypesMapModel,
    UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel,
    ValidateAdheresToSchema as ValidateAdheresToSchemaModel,
    ValueType,
    WaitTimeFromHeader as WaitTimeFromHeaderModel,
    WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel,
    XmlDecoder as XmlDecoderModel,
    ZipfileDecoder as ZipfileDecoderModel,
)
from airbyte_cdk.sources.declarative.partition_routers import (
    CartesianProductStreamSlicer,
    GroupingPartitionRouter,
    ListPartitionRouter,
    PartitionRouter,
    SinglePartitionRouter,
    SubstreamPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import (
    AsyncJobPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import (
    ParentStreamConfig,
)
from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption
from airbyte_cdk.sources.declarative.requesters.error_handlers import (
    CompositeErrorHandler,
    DefaultErrorHandler,
    HttpResponseFilter,
)
from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import (
    ConstantBackoffStrategy,
    ExponentialBackoffStrategy,
    WaitTimeFromHeaderBackoffStrategy,
    WaitUntilTimeFromHeaderBackoffStrategy,
)
from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository
from airbyte_cdk.sources.declarative.requesters.paginators import (
    DefaultPaginator,
    NoPagination,
    PaginatorTestReadDecorator,
)
from airbyte_cdk.sources.declarative.requesters.paginators.strategies import (
    CursorPaginationStrategy,
    CursorStopCondition,
    OffsetIncrement,
    PageIncrement,
    StopConditionPaginationStrategyDecorator,
)
from airbyte_cdk.sources.declarative.requesters.query_properties import (
    PropertiesFromEndpoint,
    PropertyChunking,
    QueryProperties,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import (
    PropertyLimitType,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import (
    GroupByKey,
)
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType
from airbyte_cdk.sources.declarative.requesters.request_options import (
    DatetimeBasedRequestOptionsProvider,
    DefaultRequestOptionsProvider,
    InterpolatedRequestOptionsProvider,
    RequestOptionsProvider,
)
from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import (
    PerPartitionRequestOptionsProvider,
)
from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester
from airbyte_cdk.sources.declarative.resolvers import (
    ComponentMappingDefinition,
    ConfigComponentsResolver,
    HttpComponentsResolver,
    ParametrizedComponentsResolver,
    StreamConfig,
    StreamParametersDefinition,
)
from airbyte_cdk.sources.declarative.retrievers import (
    AsyncRetriever,
    LazySimpleRetriever,
    SimpleRetriever,
)
from airbyte_cdk.sources.declarative.retrievers.file_uploader import (
    ConnectorBuilderFileUploader,
    DefaultFileUploader,
    FileUploader,
    LocalFileSystemFileWriter,
    NoopFileWriter,
)
from airbyte_cdk.sources.declarative.schema import (
    ComplexFieldType,
    DefaultSchemaLoader,
    DynamicSchemaLoader,
    InlineSchemaLoader,
    JsonFileSchemaLoader,
    SchemaTypeIdentifier,
    TypesMap,
)
from airbyte_cdk.sources.declarative.schema.composite_schema_loader import CompositeSchemaLoader
from airbyte_cdk.sources.declarative.spec import ConfigMigration, Spec
from airbyte_cdk.sources.declarative.stream_slicers import (
    StreamSlicer,
    StreamSlicerTestReadDecorator,
)
from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import (
    DeclarativePartitionFactory,
    StreamSlicerPartitionGenerator,
)
from airbyte_cdk.sources.declarative.transformations import (
    AddFields,
    RecordTransformation,
    RemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition
from airbyte_cdk.sources.declarative.transformations.config_transformations import (
    ConfigAddFields,
    ConfigRemapField,
    ConfigRemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.config_transformations.config_transformation import (
    ConfigTransformation,
)
from airbyte_cdk.sources.declarative.transformations.dpath_flatten_fields import (
    DpathFlattenFields,
    KeyTransformation,
)
from airbyte_cdk.sources.declarative.transformations.flatten_fields import (
    FlattenFields,
)
from airbyte_cdk.sources.declarative.transformations.keys_replace_transformation import (
    KeysReplaceTransformation,
)
from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import (
    KeysToLowerTransformation,
)
from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import (
    KeysToSnakeCaseTransformation,
)
from airbyte_cdk.sources.declarative.validators import (
    DpathValidator,
    PredicateValidator,
    ValidateAdheresToSchema,
)
from airbyte_cdk.sources.http_logger import format_http_message
from airbyte_cdk.sources.message import (
    InMemoryMessageRepository,
    LogAppenderMessageRepositoryDecorator,
    MessageRepository,
    NoopMessageRepository,
)
from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository
from airbyte_cdk.sources.streams.call_rate import (
    APIBudget,
    FixedWindowCallRatePolicy,
    HttpAPIBudget,
    HttpRequestRegexMatcher,
    MovingWindowCallRatePolicy,
    Rate,
    UnlimitedCallRatePolicy,
)
from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
from airbyte_cdk.sources.streams.concurrent.clamping import (
    ClampingEndProvider,
    ClampingStrategy,
    DayClampingStrategy,
    MonthClampingStrategy,
    NoClamping,
    WeekClampingStrategy,
    Weekday,
)
from airbyte_cdk.sources.streams.concurrent.cursor import (
    ConcurrentCursor,
    Cursor,
    CursorField,
    FinalStateCursor,
)
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import (
    StreamSlicer as ConcurrentStreamSlicer,
)
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    CustomFormatConcurrentStreamStateConverter,
    DateTimeStreamStateConverter,
)
from airbyte_cdk.sources.streams.concurrent.state_converters.incrementing_count_stream_state_converter import (
    IncrementingCountStreamStateConverter,
)
from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction
from airbyte_cdk.sources.types import Config
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer

ComponentDefinition = Mapping[str, Any]

SCHEMA_TRANSFORMER_TYPE_MAPPING = {
    SchemaNormalizationModel.None_: TransformConfig.NoTransform,
    SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
}
_NO_STREAM_SLICING = SinglePartitionRouter(parameters={})

# Ideally this should use the value defined in ConcurrentDeclarativeSource, but
# this would be a circular import
MAX_SLICES = 5


class ModelToComponentFactory:
    EPOCH_DATETIME_FORMAT = "%s"

    def __init__(
        self,
        limit_pages_fetched_per_slice: Optional[int] = None,
        limit_slices_fetched: Optional[int] = None,
        emit_connector_builder_messages: bool = False,
        disable_retries: bool = False,
        disable_cache: bool = False,
        message_repository: Optional[MessageRepository] = None,
        connector_state_manager: Optional[ConnectorStateManager] = None,
        max_concurrent_async_job_count: Optional[int] = None,
    ):
        self._init_mappings()
        self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
        self._limit_slices_fetched = limit_slices_fetched
        self._emit_connector_builder_messages = emit_connector_builder_messages
        self._disable_retries = disable_retries
        self._disable_cache = disable_cache
        self._message_repository = message_repository or InMemoryMessageRepository(
            self._evaluate_log_level(emit_connector_builder_messages)
        )
        self._connector_state_manager = connector_state_manager or ConnectorStateManager()
        self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None
        self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1)
        # placeholder for deprecation warnings
        self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []

    def _init_mappings(self) -> None:
        self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = {
            AddedFieldDefinitionModel: self.create_added_field_definition,
            AddFieldsModel: self.create_add_fields,
            ApiKeyAuthenticatorModel: self.create_api_key_authenticator,
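            # All Custom* component models below are routed to the shared create_custom_component constructor.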
            BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
            BearerAuthenticatorModel: self.create_bearer_authenticator,
            CheckStreamModel: self.create_check_stream,
            DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config,
            CheckDynamicStreamModel: self.create_check_dynamic_stream,
            CompositeErrorHandlerModel: self.create_composite_error_handler,
            ConcurrencyLevelModel: self.create_concurrency_level,
            ConfigMigrationModel: self.create_config_migration,
            ConfigAddFieldsModel: self.create_config_add_fields,
            ConfigRemapFieldModel: self.create_config_remap_field,
            ConfigRemoveFieldsModel: self.create_config_remove_fields,
            ConstantBackoffStrategyModel: self.create_constant_backoff_strategy,
            CsvDecoderModel: self.create_csv_decoder,
            CursorPaginationModel: self.create_cursor_pagination,
            CustomAuthenticatorModel: self.create_custom_component,
            CustomBackoffStrategyModel: self.create_custom_component,
            CustomDecoderModel: self.create_custom_component,
            CustomErrorHandlerModel: self.create_custom_component,
            CustomRecordExtractorModel: self.create_custom_component,
            CustomRecordFilterModel: self.create_custom_component,
            CustomRequesterModel: self.create_custom_component,
            CustomRetrieverModel: self.create_custom_component,
            CustomSchemaLoader: self.create_custom_component,
            CustomSchemaNormalizationModel: self.create_custom_component,
            CustomStateMigration: self.create_custom_component,
            CustomPaginationStrategyModel: self.create_custom_component,
            CustomPartitionRouterModel: self.create_custom_component,
            CustomTransformationModel: self.create_custom_component,
            CustomValidationStrategyModel: self.create_custom_component,
            CustomConfigTransformationModel: self.create_custom_component,
            DatetimeBasedCursorModel: self.create_datetime_based_cursor,
            DeclarativeStreamModel: self.create_default_stream,
            DefaultErrorHandlerModel: self.create_default_error_handler,
            DefaultPaginatorModel: self.create_default_paginator,
            DpathExtractorModel: self.create_dpath_extractor,
            DpathValidatorModel: self.create_dpath_validator,
            ResponseToFileExtractorModel: self.create_response_to_file_extractor,
            ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy,
            SessionTokenAuthenticatorModel: self.create_session_token_authenticator,
            GroupByKeyMergeStrategyModel: self.create_group_by_key,
            HttpRequesterModel: self.create_http_requester,
            HttpResponseFilterModel: self.create_http_response_filter,
            InlineSchemaLoaderModel: self.create_inline_schema_loader,
            JsonDecoderModel: self.create_json_decoder,
            JsonlDecoderModel: self.create_jsonl_decoder,
            GzipDecoderModel: self.create_gzip_decoder,
            KeysToLowerModel: self.create_keys_to_lower_transformation,
            KeysToSnakeCaseModel: self.create_keys_to_snake_transformation,
            KeysReplaceModel: self.create_keys_replace_transformation,
            FlattenFieldsModel: self.create_flatten_fields,
            DpathFlattenFieldsModel: self.create_dpath_flatten_fields,
            IterableDecoderModel: self.create_iterable_decoder,
            IncrementingCountCursorModel: self.create_incrementing_count_cursor,
            XmlDecoderModel: self.create_xml_decoder,
            JsonFileSchemaLoaderModel: self.create_json_file_schema_loader,
            DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
            SchemaTypeIdentifierModel: self.create_schema_type_identifier,
            TypesMapModel: self.create_types_map,
            ComplexFieldTypeModel: self.create_complex_field_type,
            JwtAuthenticatorModel: self.create_jwt_authenticator,
            LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
            ListPartitionRouterModel: self.create_list_partition_router,
            MinMaxDatetimeModel: self.create_min_max_datetime,
            NoAuthModel: self.create_no_auth,
            NoPaginationModel: self.create_no_pagination,
            OAuthAuthenticatorModel: self.create_oauth_authenticator,
            OffsetIncrementModel: self.create_offset_increment,
            PageIncrementModel: self.create_page_increment,
            ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper,
            PredicateValidatorModel: self.create_predicate_validator,
            PropertiesFromEndpointModel: self.create_properties_from_endpoint,
            PropertyChunkingModel: self.create_property_chunking,
            QueryPropertiesModel: self.create_query_properties,
            RecordFilterModel: self.create_record_filter,
            RecordSelectorModel: self.create_record_selector,
            RemoveFieldsModel: self.create_remove_fields,
            RequestPathModel: self.create_request_path,
            RequestOptionModel: self.create_request_option,
            LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator,
            SelectiveAuthenticatorModel: self.create_selective_authenticator,
            SimpleRetrieverModel: self.create_simple_retriever,
            StateDelegatingStreamModel: self.create_state_delegating_stream,
            SpecModel: self.create_spec,
            SubstreamPartitionRouterModel: self.create_substream_partition_router,
            ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema,
            WaitTimeFromHeaderModel: self.create_wait_time_from_header,
            WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header,
            AsyncRetrieverModel: self.create_async_retriever,
            HttpComponentsResolverModel: self.create_http_components_resolver,
            ConfigComponentsResolverModel: self.create_config_components_resolver,
            ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver,
            StreamConfigModel: self.create_stream_config,
            ComponentMappingDefinitionModel: self.create_components_mapping_definition,
            ZipfileDecoderModel: self.create_zipfile_decoder,
            HTTPAPIBudgetModel: self.create_http_api_budget,
            FileUploaderModel: self.create_file_uploader,
            FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
            MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
            UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}

    def create_component(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        config: Config,
        **kwargs: Any,
    ) -> Any:
        """
        Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and
        subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then
        creating declarative components from that model.
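
        Example (a hypothetical minimal component definition; the exact fields depend on the component type):

            check_stream = factory.create_component(
                model_type=CheckStreamModel,
                component_definition={"type": "CheckStream", "stream_names": ["users"]},
                config={},
            )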

        :param model_type: The type of declarative component that is being initialized
        :param component_definition: The mapping that represents a declarative component
        :param config: The connector config that is provided by the customer
        :return: The declarative component to be used at runtime
        """

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        declarative_component_model = model_type.parse_obj(component_definition)

        if not isinstance(declarative_component_model, model_type):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}"
            )

        return self._create_component_from_model(
            model=declarative_component_model, config=config, **kwargs
        )

    def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any:
        if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR:
            raise ValueError(
                f"{model.__class__} with attributes {model} is not a valid component type"
            )
        component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__)
        if not component_constructor:
            raise ValueError(f"Could not find constructor for {model.__class__}")

        # collect deprecation warnings for supported models.
        if isinstance(model, BaseModelWithDeprecations):
            self._collect_model_deprecations(model)

        return component_constructor(model=model, config=config, **kwargs)

    def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]:
        """
        Returns the deprecation warnings that were collected during the creation of components.
        """
        return self._collected_deprecation_logs

    def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None:
        """
        Collects deprecation logs from the given model and appends any new logs to the internal collection.

        This method checks if the provided model has deprecation logs (identified by the presence of the
        DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each
        deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not
        already been collected, ensuring that duplicate logs are avoided.

        Args:
            model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs.
        """
        if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None:
            for log in model._deprecation_logs:
                # avoid duplicates for deprecation logs observed.
                if log not in self._collected_deprecation_logs:
                    self._collected_deprecation_logs.append(log)

    def create_config_migration(
        self, model: ConfigMigrationModel, config: Config
    ) -> ConfigMigration:
        transformations: List[ConfigTransformation] = [
            self._create_component_from_model(transformation, config)
            for transformation in model.transformations
        ]

        return ConfigMigration(
            description=model.description,
            transformations=transformations,
        )

    def create_config_add_fields(
        self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigAddFields:
        fields = [self._create_component_from_model(field, config) for field in model.fields]
        return ConfigAddFields(
            fields=fields,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remove_fields(
        model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigRemoveFields:
        return ConfigRemoveFields(
            field_pointers=model.field_pointers,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remap_field(
        model: ConfigRemapFieldModel, config: Config, **kwargs: Any
    ) -> ConfigRemapField:
        mapping = cast(Mapping[str, Any], model.map)
        return ConfigRemapField(
            map=mapping,
            field_path=model.field_path,
            config=config,
        )

    def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return DpathValidator(
            field_path=model.field_path,
            strategy=strategy,
        )

    def create_predicate_validator(
        self, model: PredicateValidatorModel, config: Config
    ) -> PredicateValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return PredicateValidator(
            value=model.value,
            strategy=strategy,
        )

    @staticmethod
    def create_validate_adheres_to_schema(
        model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any
    ) -> ValidateAdheresToSchema:
        base_schema = cast(Mapping[str, Any], model.base_schema)
        return ValidateAdheresToSchema(
            schema=base_schema,
        )

    @staticmethod
    def create_added_field_definition(
        model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
    ) -> AddedFieldDefinition:
        interpolated_value = InterpolatedString.create(
            model.value, parameters=model.parameters or {}
        )
        return AddedFieldDefinition(
            path=model.path,
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            parameters=model.parameters or {},
        )

    def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields:
        added_field_definitions = [
            self._create_component_from_model(
                model=added_field_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    added_field_definition_model.value_type
                ),
                config=config,
            )
            for added_field_definition_model in model.fields
        ]
        return AddFields(
            fields=added_field_definitions,
            condition=model.condition or "",
            parameters=model.parameters or {},
        )

    def create_keys_to_lower_transformation(
        self, model: KeysToLowerModel, config: Config, **kwargs: Any
    ) -> KeysToLowerTransformation:
        return KeysToLowerTransformation()

    def create_keys_to_snake_transformation(
        self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any
    ) -> KeysToSnakeCaseTransformation:
        return KeysToSnakeCaseTransformation()

    def create_keys_replace_transformation(
        self, model: KeysReplaceModel, config: Config, **kwargs: Any
    ) -> KeysReplaceTransformation:
        return KeysReplaceTransformation(
            old=model.old, new=model.new, parameters=model.parameters or {}
        )

    def create_flatten_fields(
        self, model: FlattenFieldsModel, config: Config, **kwargs: Any
    ) -> FlattenFields:
        return FlattenFields(
            flatten_lists=model.flatten_lists if model.flatten_lists is not None else True
        )

    def create_dpath_flatten_fields(
        self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
    ) -> DpathFlattenFields:
        model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path]
        key_transformation = (
            KeyTransformation(
                config=config,
                prefix=model.key_transformation.prefix,
                suffix=model.key_transformation.suffix,
                parameters=model.parameters or {},
            )
            if model.key_transformation is not None
            else None
        )
        return DpathFlattenFields(
            config=config,
            field_path=model_field_path,
            delete_origin_value=model.delete_origin_value
            if model.delete_origin_value is not None
            else False,
            replace_record=model.replace_record if model.replace_record is not None else False,
            key_transformation=key_transformation,
            parameters=model.parameters or {},
        )

    @staticmethod
    def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
        if not value_type:
            return None
        names_to_types = {
            ValueType.string: str,
            ValueType.number: float,
            ValueType.integer: int,
            ValueType.boolean: bool,
        }
        return names_to_types[value_type]

    def create_api_key_authenticator(
        self,
        model: ApiKeyAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> ApiKeyAuthenticator:
        if model.inject_into is None and model.header is None:
            raise ValueError(
                "Expected either inject_into or header to be set for ApiKeyAuthenticator"
            )

        if model.inject_into is not None and model.header is not None:
            raise ValueError(
                "inject_into and header cannot both be set for ApiKeyAuthenticator - remove the deprecated header option"
            )

        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )

        request_option = (
            self._create_component_from_model(
                model.inject_into, config, parameters=model.parameters or {}
            )
            if model.inject_into
            else RequestOption(
                inject_into=RequestOptionType.header,
                field_name=model.header or "",
                parameters=model.parameters or {},
            )
        )

        return ApiKeyAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            request_option=request_option,
            config=config,
            parameters=model.parameters or {},
        )

    def create_legacy_to_per_partition_state_migration(
        self,
        model: LegacyToPerPartitionStateMigrationModel,
        config: Mapping[str, Any],
        declarative_stream: DeclarativeStreamModel,
    ) -> LegacyToPerPartitionStateMigration:
        retriever = declarative_stream.retriever
        if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
            )
        partition_router = retriever.partition_router
        if not isinstance(
            partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
        ):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
            )
        if not hasattr(partition_router, "parent_stream_configs"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
            )

        if not hasattr(declarative_stream, "incremental_sync"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
            )

        return LegacyToPerPartitionStateMigration(
            partition_router,  # type: ignore # was already checked above
            declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
            config,
            declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
        )

    def create_session_token_authenticator(
        self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
    ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        login_requester = self._create_component_from_model(
            model=model.login_requester,
            config=config,
            name=f"{name}_login_requester",
            decoder=decoder,
        )
        token_provider = SessionTokenProvider(
            login_requester=login_requester,
            session_token_path=model.session_token_path,
            expiration_duration=parse_duration(model.expiration_duration)
            if model.expiration_duration
            else None,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            decoder=decoder,
        )
        if model.request_authentication.type == "Bearer":
            return ModelToComponentFactory.create_bearer_authenticator(
                BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
                config,
                token_provider=token_provider,
            )
        else:
            return self.create_api_key_authenticator(
                ApiKeyAuthenticatorModel(
                    type="ApiKeyAuthenticator",
                    api_token="",
                    inject_into=model.request_authentication.inject_into,
                ),  # type: ignore # $parameters and headers default to None
                config=config,
                token_provider=token_provider,
            )

    @staticmethod
    def create_basic_http_authenticator(
        model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
    ) -> BasicHttpAuthenticator:
        return BasicHttpAuthenticator(
            password=model.password or "",
            username=model.username,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_bearer_authenticator(
        model: BearerAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> BearerAuthenticator:
        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )
        return BearerAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_dynamic_stream_check_config(
        model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any
    ) -> DynamicStreamCheckConfig:
        return DynamicStreamCheckConfig(
            dynamic_stream_name=model.dynamic_stream_name,
            stream_count=model.stream_count or 0,
        )

    def create_check_stream(
        self, model: CheckStreamModel, config: Config, **kwargs: Any
    ) -> CheckStream:
        if model.dynamic_streams_check_configs is None and model.stream_names is None:
            raise ValueError(
                "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
            )

        dynamic_streams_check_configs = (
            [
                self._create_component_from_model(model=dynamic_stream_check_config, config=config)
                for dynamic_stream_check_config in model.dynamic_streams_check_configs
            ]
            if model.dynamic_streams_check_configs
            else []
        )

        return CheckStream(
            stream_names=model.stream_names or [],
            dynamic_streams_check_configs=dynamic_streams_check_configs,
            parameters={},
        )

    @staticmethod
    def create_check_dynamic_stream(
        model: CheckDynamicStreamModel, config: Config, **kwargs: Any
    ) -> CheckDynamicStream:
        assert model.use_check_availability is not None  # for mypy

        use_check_availability = model.use_check_availability

        return CheckDynamicStream(
            stream_count=model.stream_count,
            use_check_availability=use_check_availability,
            parameters={},
        )

    def create_composite_error_handler(
        self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any
    ) -> CompositeErrorHandler:
        error_handlers = [
            self._create_component_from_model(model=error_handler_model, config=config)
            for error_handler_model in model.error_handlers
        ]
        return CompositeErrorHandler(
            error_handlers=error_handlers, parameters=model.parameters or {}
        )

    @staticmethod
    def create_concurrency_level(
        model: ConcurrencyLevelModel, config: Config, **kwargs: Any
    ) -> ConcurrencyLevel:
        return ConcurrencyLevel(
            default_concurrency=model.default_concurrency,
            max_concurrency=model.max_concurrency,
            config=config,
            parameters={},
        )

    @staticmethod
    def apply_stream_state_migrations(
        stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any]
    ) -> MutableMapping[str, Any]:
        if stream_state_migrations:
            for state_migration in stream_state_migrations:
                if state_migration.should_migrate(stream_state):
                    # The state variable is expected to be mutable but the migrate method returns an immutable mapping.
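                    # Copying the result into a plain dict keeps the state mutable for any subsequent migration in the list.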
                    stream_state = dict(state_migration.migrate(stream_state))
        return stream_state

    def create_concurrent_cursor_from_datetime_based_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        runtime_lookback_window: Optional[datetime.timedelta] = None,
        stream_state_migrations: Optional[List[Any]] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        # Per-partition incremental streams can dynamically create child cursors which will pass their current
        # state via the stream_state keyword argument. Incremental syncs without parent streams use the
        # incoming state and connector_state_manager that is initialized when the component factory is created
        stream_state = (
            self._connector_state_manager.get_stream_state(stream_name, stream_namespace)
            if "stream_state" not in kwargs
            else kwargs["stream_state"]
        )
        stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state)

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition`
        # instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to
        # the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91).
        # So now we have two cases:
        #   * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
        #   * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS at which point the
        # parameter propagation should happen as part of the ModelToComponentFactory.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        model_parameters = datetime_based_cursor_model.parameters or {}
        interpolated_cursor_field = InterpolatedString.create(
            datetime_based_cursor_model.cursor_field,
            parameters=model_parameters,
        )
        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))

        interpolated_partition_field_start = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_start or "start_time",
            parameters=model_parameters,
        )
        interpolated_partition_field_end = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_end or "end_time",
            parameters=model_parameters,
        )

        slice_boundary_fields = (
            interpolated_partition_field_start.eval(config=config),
            interpolated_partition_field_end.eval(config=config),
        )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        lookback_window = None
        interpolated_lookback_window = (
            InterpolatedString.create(
                datetime_based_cursor_model.lookback_window,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.lookback_window
            else None
        )
        if interpolated_lookback_window:
            evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
            if evaluated_lookback_window:
                lookback_window = parse_duration(evaluated_lookback_window)

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Adjusts the stream state by applying the runtime lookback window.
        # This is used to ensure correct state handling in case of failed partitions.
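        # For example, with a one-hour runtime lookback window a stored cursor of 2024-01-01T10:00:00Z is rewound
        # to 2024-01-01T09:00:00Z before slices are generated (values here are illustrative only).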
1355 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1356 if runtime_lookback_window and stream_state_value: 1357 new_stream_state = ( 1358 connector_state_converter.parse_timestamp(stream_state_value) 1359 - runtime_lookback_window 1360 ) 1361 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1362 new_stream_state 1363 ) 1364 1365 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1366 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1367 start_date_runtime_value = self.create_min_max_datetime( 1368 model=datetime_based_cursor_model.start_datetime, config=config 1369 ) 1370 else: 1371 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1372 1373 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1374 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1375 end_date_runtime_value = self.create_min_max_datetime( 1376 model=datetime_based_cursor_model.end_datetime, config=config 1377 ) 1378 else: 1379 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1380 1381 interpolated_start_date = MinMaxDatetime.create( 1382 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1383 parameters=datetime_based_cursor_model.parameters, 1384 ) 1385 interpolated_end_date = ( 1386 None 1387 if not end_date_runtime_value 1388 else MinMaxDatetime.create( 1389 end_date_runtime_value, datetime_based_cursor_model.parameters 1390 ) 1391 ) 1392 1393 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1394 if not interpolated_start_date.datetime_format: 1395 interpolated_start_date.datetime_format = datetime_format 1396 if interpolated_end_date and not interpolated_end_date.datetime_format: 1397 interpolated_end_date.datetime_format = datetime_format 1398 1399 start_date = interpolated_start_date.get_datetime(config=config) 1400 end_date_provider = ( 1401 partial(interpolated_end_date.get_datetime, config) 1402 if interpolated_end_date 1403 else connector_state_converter.get_end_provider() 1404 ) 1405 1406 if ( 1407 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1408 ) or ( 1409 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1410 ): 1411 raise ValueError( 1412 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1413 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1414 ) 1415 1416 # When step is not defined, default to a step size from the starting date to the present moment 1417 step_length = datetime.timedelta.max 1418 interpolated_step = ( 1419 InterpolatedString.create( 1420 datetime_based_cursor_model.step, 1421 parameters=model_parameters, 1422 ) 1423 if datetime_based_cursor_model.step 1424 else None 1425 ) 1426 if interpolated_step: 1427 evaluated_step = interpolated_step.eval(config) 1428 if evaluated_step: 1429 step_length = parse_duration(evaluated_step) 1430 1431 clamping_strategy: ClampingStrategy = NoClamping() 1432 if datetime_based_cursor_model.clamping: 1433 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1434 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1435 # object which we want to keep agnostic of being low-code 1436 target = InterpolatedString( 1437 string=datetime_based_cursor_model.clamping.target, 1438 parameters=model_parameters, 1439 ) 1440 evaluated_target = target.eval(config=config) 1441 match evaluated_target: 1442 case "DAY": 1443 clamping_strategy = DayClampingStrategy() 1444 end_date_provider = ClampingEndProvider( 1445 DayClampingStrategy(is_ceiling=False), 1446 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1447 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1448 ) 1449 case "WEEK": 1450 if ( 1451 not datetime_based_cursor_model.clamping.target_details 1452 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1453 ): 1454 raise ValueError( 1455 "Given WEEK clamping, weekday needs to be provided as target_details" 1456 ) 1457 weekday = self._assemble_weekday( 1458 datetime_based_cursor_model.clamping.target_details["weekday"] 1459 ) 1460 clamping_strategy = WeekClampingStrategy(weekday) 1461 end_date_provider = ClampingEndProvider( 1462 WeekClampingStrategy(weekday, is_ceiling=False), 1463 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1464 granularity=cursor_granularity or datetime.timedelta(days=1), 1465 ) 1466 case "MONTH": 1467 clamping_strategy = MonthClampingStrategy() 1468 end_date_provider = ClampingEndProvider( 1469 MonthClampingStrategy(is_ceiling=False), 1470 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1471 granularity=cursor_granularity or datetime.timedelta(days=1), 1472 ) 1473 case _: 1474 raise ValueError( 1475 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1476 ) 1477 1478 return ConcurrentCursor( 1479 stream_name=stream_name, 1480 stream_namespace=stream_namespace, 1481 stream_state=stream_state, 1482 message_repository=message_repository or self._message_repository, 1483 connector_state_manager=self._connector_state_manager, 1484 connector_state_converter=connector_state_converter, 1485 cursor_field=cursor_field, 1486 slice_boundary_fields=slice_boundary_fields, 1487 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1488 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1489 lookback_window=lookback_window, 1490 slice_range=step_length, 1491 cursor_granularity=cursor_granularity, 1492 clamping_strategy=clamping_strategy, 1493 ) 1494 1495 def create_concurrent_cursor_from_incrementing_count_cursor( 1496 self, 1497 model_type: Type[BaseModel], 1498 component_definition: ComponentDefinition, 1499 stream_name: str, 1500 stream_namespace: Optional[str], 1501 config: Config, 1502 message_repository: Optional[MessageRepository] = None, 1503 stream_state_migrations: Optional[List[Any]] = None, 1504 **kwargs: Any, 1505 ) -> ConcurrentCursor: 1506 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1507 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1508 # incoming state and connector_state_manager that is initialized when the component factory is created 1509 stream_state = ( 1510 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1511 if "stream_state" not in kwargs 1512 else kwargs["stream_state"] 1513 ) 1514 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1515 1516 component_type = component_definition.get("type") 1517 if component_definition.get("type") != model_type.__name__: 1518 raise ValueError( 1519 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1520 ) 1521 1522 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1523 1524 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1525 raise ValueError( 1526 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1527 ) 1528 1529 interpolated_start_value = ( 1530 InterpolatedString.create( 1531 incrementing_count_cursor_model.start_value, # type: ignore 1532 parameters=incrementing_count_cursor_model.parameters or {}, 1533 ) 1534 if incrementing_count_cursor_model.start_value 1535 else 0 1536 ) 1537 1538 interpolated_cursor_field = InterpolatedString.create( 1539 incrementing_count_cursor_model.cursor_field, 1540 parameters=incrementing_count_cursor_model.parameters or {}, 1541 ) 1542 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1543 1544 connector_state_converter = IncrementingCountStreamStateConverter( 1545 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1546 ) 1547 1548 return ConcurrentCursor( 1549 stream_name=stream_name, 1550 stream_namespace=stream_namespace, 1551 stream_state=stream_state, 1552 message_repository=message_repository or self._message_repository, 1553 connector_state_manager=self._connector_state_manager, 1554 connector_state_converter=connector_state_converter, 1555 cursor_field=cursor_field, 1556 slice_boundary_fields=None, 1557 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1558 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1559 ) 1560 1561 def _assemble_weekday(self, weekday: str) -> Weekday: 1562 match weekday: 1563 case "MONDAY": 1564 return Weekday.MONDAY 1565 case "TUESDAY": 1566 return Weekday.TUESDAY 1567 case "WEDNESDAY": 1568 return Weekday.WEDNESDAY 1569 case "THURSDAY": 1570 return Weekday.THURSDAY 1571 case "FRIDAY": 1572 return Weekday.FRIDAY 1573 case "SATURDAY": 1574 return Weekday.SATURDAY 1575 case "SUNDAY": 1576 return Weekday.SUNDAY 1577 case _: 1578 raise ValueError(f"Unknown weekday {weekday}") 1579 1580 def create_concurrent_cursor_from_perpartition_cursor( 1581 self, 1582 state_manager: ConnectorStateManager, 1583 model_type: Type[BaseModel], 1584 component_definition: ComponentDefinition, 1585 stream_name: str, 1586 stream_namespace: Optional[str], 1587 config: Config, 1588 stream_state: MutableMapping[str, Any], 1589 partition_router: PartitionRouter, 1590 stream_state_migrations: Optional[List[Any]] = None, 1591 attempt_to_create_cursor_if_not_provided: bool = False, 1592 **kwargs: Any, 1593 ) -> ConcurrentPerPartitionCursor: 1594 component_type = component_definition.get("type") 1595 if component_definition.get("type") != model_type.__name__: 1596 raise ValueError( 1597 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1598 ) 1599 1600 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1601 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1602 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1603 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1604 if "$parameters" not in component_definition and "parameters" in component_definition: 1605 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1606 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1607 1608 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1609 raise ValueError( 1610 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1611 ) 1612 1613 interpolated_cursor_field = InterpolatedString.create( 1614 datetime_based_cursor_model.cursor_field, 1615 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). 
So now we have two cases: 1616 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1617 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1618 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1619 parameters=datetime_based_cursor_model.parameters or {}, 1620 ) 1621 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1622 1623 datetime_format = datetime_based_cursor_model.datetime_format 1624 1625 cursor_granularity = ( 1626 parse_duration(datetime_based_cursor_model.cursor_granularity) 1627 if datetime_based_cursor_model.cursor_granularity 1628 else None 1629 ) 1630 1631 connector_state_converter: DateTimeStreamStateConverter 1632 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1633 datetime_format=datetime_format, 1634 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1635 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1636 cursor_granularity=cursor_granularity, 1637 ) 1638 1639 # Create the cursor factory 1640 cursor_factory = ConcurrentCursorFactory( 1641 partial( 1642 self.create_concurrent_cursor_from_datetime_based_cursor, 1643 state_manager=state_manager, 1644 model_type=model_type, 1645 component_definition=component_definition, 1646 stream_name=stream_name, 1647 stream_namespace=stream_namespace, 1648 config=config, 1649 message_repository=NoopMessageRepository(), 1650 # stream_state_migrations=stream_state_migrations, # FIXME is it expected to run migration on per partition state too? 1651 ) 1652 ) 1653 1654 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1655 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1656 use_global_cursor = isinstance( 1657 partition_router, GroupingPartitionRouter 1658 ) or component_definition.get("global_substream_cursor", False) 1659 1660 # Return the concurrent cursor and state converter 1661 return ConcurrentPerPartitionCursor( 1662 cursor_factory=cursor_factory, 1663 partition_router=partition_router, 1664 stream_name=stream_name, 1665 stream_namespace=stream_namespace, 1666 stream_state=stream_state, 1667 message_repository=self._message_repository, # type: ignore 1668 connector_state_manager=state_manager, 1669 connector_state_converter=connector_state_converter, 1670 cursor_field=cursor_field, 1671 use_global_cursor=use_global_cursor, 1672 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1673 ) 1674 1675 @staticmethod 1676 def create_constant_backoff_strategy( 1677 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1678 ) -> ConstantBackoffStrategy: 1679 return ConstantBackoffStrategy( 1680 backoff_time_in_seconds=model.backoff_time_in_seconds, 1681 config=config, 1682 parameters=model.parameters or {}, 1683 ) 1684 1685 def create_cursor_pagination( 1686 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1687 ) -> CursorPaginationStrategy: 1688 if isinstance(decoder, PaginationDecoderDecorator): 1689 inner_decoder = decoder.decoder 1690 else: 1691 inner_decoder = decoder 1692 decoder = PaginationDecoderDecorator(decoder=decoder) 1693 1694 if self._is_supported_decoder_for_pagination(inner_decoder): 1695 decoder_to_use = decoder 1696 else: 1697 raise 
ValueError( 1698 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1699 ) 1700 1701 return CursorPaginationStrategy( 1702 cursor_value=model.cursor_value, 1703 decoder=decoder_to_use, 1704 page_size=model.page_size, 1705 stop_condition=model.stop_condition, 1706 config=config, 1707 parameters=model.parameters or {}, 1708 ) 1709 1710 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1711 """ 1712 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1713 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1714 :param model: The Pydantic model of the custom component being created 1715 :param config: The custom defined connector config 1716 :return: The declarative component built from the Pydantic model to be used at runtime 1717 """ 1718 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1719 component_fields = get_type_hints(custom_component_class) 1720 model_args = model.dict() 1721 model_args["config"] = config 1722 1723 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1724 # we defer to these arguments over the component's definition 1725 for key, arg in kwargs.items(): 1726 model_args[key] = arg 1727 1728 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1729 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1730 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1731 for model_field, model_value in model_args.items(): 1732 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1733 if ( 1734 isinstance(model_value, dict) 1735 and "type" not in model_value 1736 and model_field in component_fields 1737 ): 1738 derived_type = self._derive_component_type_from_type_hints( 1739 component_fields.get(model_field) 1740 ) 1741 if derived_type: 1742 model_value["type"] = derived_type 1743 1744 if self._is_component(model_value): 1745 model_args[model_field] = self._create_nested_component( 1746 model, 1747 model_field, 1748 model_value, 1749 config, 1750 **kwargs, 1751 ) 1752 elif isinstance(model_value, list): 1753 vals = [] 1754 for v in model_value: 1755 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1756 derived_type = self._derive_component_type_from_type_hints( 1757 component_fields.get(model_field) 1758 ) 1759 if derived_type: 1760 v["type"] = derived_type 1761 if self._is_component(v): 1762 vals.append( 1763 self._create_nested_component( 1764 model, 1765 model_field, 1766 v, 1767 config, 1768 **kwargs, 1769 ) 1770 ) 1771 else: 1772 vals.append(v) 1773 model_args[model_field] = vals 1774 1775 kwargs = { 1776 class_field: model_args[class_field] 1777 for class_field in component_fields.keys() 1778 if class_field in model_args 1779 } 1780 return custom_component_class(**kwargs) 1781 1782 @staticmethod 1783 def _get_class_from_fully_qualified_class_name( 1784 full_qualified_class_name: str, 1785 ) -> Any: 1786 """Get a class from its fully qualified name. 
1787 1788 If a custom components module is needed, we assume it is already registered - probably 1789 as `source_declarative_manifest.components` or `components`. 1790 1791 Args: 1792 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1793 1794 Returns: 1795 Any: The class object. 1796 1797 Raises: 1798 ValueError: If the class cannot be loaded. 1799 """ 1800 split = full_qualified_class_name.split(".") 1801 module_name_full = ".".join(split[:-1]) 1802 class_name = split[-1] 1803 1804 try: 1805 module_ref = importlib.import_module(module_name_full) 1806 except ModuleNotFoundError as e: 1807 if split[0] == "source_declarative_manifest": 1808 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1809 try: 1810 import os 1811 1812 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1813 module_ref = importlib.import_module( 1814 module_name_with_source_declarative_manifest 1815 ) 1816 except ModuleNotFoundError: 1817 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1818 else: 1819 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1820 1821 try: 1822 return getattr(module_ref, class_name) 1823 except AttributeError as e: 1824 raise ValueError( 1825 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1826 ) from e 1827 1828 @staticmethod 1829 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1830 interface = field_type 1831 while True: 1832 origin = get_origin(interface) 1833 if origin: 1834 # Unnest types until we reach the raw type 1835 # List[T] -> T 1836 # Optional[List[T]] -> T 1837 args = get_args(interface) 1838 interface = args[0] 1839 else: 1840 break 1841 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1842 return interface.__name__ 1843 return None 1844 1845 @staticmethod 1846 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1847 if not cls: 1848 return False 1849 return cls.__module__ == "builtins" 1850 1851 @staticmethod 1852 def _extract_missing_parameters(error: TypeError) -> List[str]: 1853 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1854 if parameter_search: 1855 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1856 else: 1857 return [] 1858 1859 def _create_nested_component( 1860 self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any 1861 ) -> Any: 1862 type_name = model_value.get("type", None) 1863 if not type_name: 1864 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1865 return model_value 1866 1867 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1868 if model_type: 1869 parsed_model = model_type.parse_obj(model_value) 1870 try: 1871 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1872 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1873 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1874 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1875 # generically in create_custom_component(). 
This block allows developers to specify extra arguments in $parameters that
1876 # are needed by a component and could not be shared.
1877 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__)
1878 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs
1879 model_parameters = model_value.get("$parameters", {})
1880 matching_parameters = {
1881 kwarg: model_parameters[kwarg]
1882 for kwarg in constructor_kwargs
1883 if kwarg in model_parameters
1884 }
1885 matching_kwargs = {
1886 kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs
1887 }
1888 return self._create_component_from_model(
1889 model=parsed_model, config=config, **(matching_parameters | matching_kwargs)
1890 )
1891 except TypeError as error:
1892 missing_parameters = self._extract_missing_parameters(error)
1893 if missing_parameters:
1894 raise ValueError(
1895 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide "
1896 + ", ".join(
1897 (
1898 f"{type_name}.$parameters.{parameter}"
1899 for parameter in missing_parameters
1900 )
1901 )
1902 )
1903 raise TypeError(
1904 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}"
1905 )
1906 else:
1907 raise ValueError(
1908 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'"
1909 )
1910
1911 @staticmethod
1912 def _is_component(model_value: Any) -> bool:
1913 return isinstance(model_value, dict) and model_value.get("type") is not None
1914
1915 def create_datetime_based_cursor(
1916 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any
1917 ) -> DatetimeBasedCursor:
1918 start_datetime: Union[str, MinMaxDatetime] = (
1919 model.start_datetime
1920 if isinstance(model.start_datetime, str)
1921 else self.create_min_max_datetime(model.start_datetime, config)
1922 )
1923 end_datetime: Union[str, MinMaxDatetime, None] = None
1924 if model.is_data_feed and model.end_datetime:
1925 raise ValueError("Data feed does not support end_datetime")
1926 if model.is_data_feed and model.is_client_side_incremental:
1927 raise ValueError(
1928 "`Client side incremental` cannot be applied with `data feed`. Choose only one of them."
1929 ) 1930 if model.end_datetime: 1931 end_datetime = ( 1932 model.end_datetime 1933 if isinstance(model.end_datetime, str) 1934 else self.create_min_max_datetime(model.end_datetime, config) 1935 ) 1936 1937 end_time_option = ( 1938 self._create_component_from_model( 1939 model.end_time_option, config, parameters=model.parameters or {} 1940 ) 1941 if model.end_time_option 1942 else None 1943 ) 1944 start_time_option = ( 1945 self._create_component_from_model( 1946 model.start_time_option, config, parameters=model.parameters or {} 1947 ) 1948 if model.start_time_option 1949 else None 1950 ) 1951 1952 return DatetimeBasedCursor( 1953 cursor_field=model.cursor_field, 1954 cursor_datetime_formats=model.cursor_datetime_formats 1955 if model.cursor_datetime_formats 1956 else [], 1957 cursor_granularity=model.cursor_granularity, 1958 datetime_format=model.datetime_format, 1959 end_datetime=end_datetime, 1960 start_datetime=start_datetime, 1961 step=model.step, 1962 end_time_option=end_time_option, 1963 lookback_window=model.lookback_window, 1964 start_time_option=start_time_option, 1965 partition_field_end=model.partition_field_end, 1966 partition_field_start=model.partition_field_start, 1967 message_repository=self._message_repository, 1968 is_compare_strictly=model.is_compare_strictly, 1969 config=config, 1970 parameters=model.parameters or {}, 1971 ) 1972 1973 def create_default_stream( 1974 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 1975 ) -> AbstractStream: 1976 primary_key = model.primary_key.__root__ if model.primary_key else None 1977 1978 partition_router = self._build_stream_slicer_from_partition_router( 1979 model.retriever, 1980 config, 1981 stream_name=model.name, 1982 **kwargs, 1983 ) 1984 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 1985 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1986 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 1987 1988 end_time_option = ( 1989 self._create_component_from_model( 1990 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1991 ) 1992 if cursor_model.end_time_option 1993 else None 1994 ) 1995 start_time_option = ( 1996 self._create_component_from_model( 1997 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1998 ) 1999 if cursor_model.start_time_option 2000 else None 2001 ) 2002 2003 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 2004 start_time_option=start_time_option, 2005 end_time_option=end_time_option, 2006 partition_field_start=cursor_model.partition_field_start, 2007 partition_field_end=cursor_model.partition_field_end, 2008 config=config, 2009 parameters=model.parameters or {}, 2010 ) 2011 request_options_provider = ( 2012 datetime_request_options_provider 2013 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 2014 else PerPartitionRequestOptionsProvider( 2015 partition_router, datetime_request_options_provider 2016 ) 2017 ) 2018 elif model.incremental_sync and isinstance( 2019 model.incremental_sync, IncrementingCountCursorModel 2020 ): 2021 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 2022 raise ValueError( 2023 "PerPartition does not support per partition states because switching to global state is time based" 2024 ) 2025 2026 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 2027 2028 start_time_option = ( 2029 
self._create_component_from_model( 2030 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2031 config, 2032 parameters=cursor_model.parameters or {}, 2033 ) 2034 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2035 else None 2036 ) 2037 2038 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2039 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2040 partition_field_start = "start" 2041 2042 request_options_provider = DatetimeBasedRequestOptionsProvider( 2043 start_time_option=start_time_option, 2044 partition_field_start=partition_field_start, 2045 config=config, 2046 parameters=model.parameters or {}, 2047 ) 2048 else: 2049 request_options_provider = None 2050 2051 transformations = [] 2052 if model.transformations: 2053 for transformation_model in model.transformations: 2054 transformations.append( 2055 self._create_component_from_model(model=transformation_model, config=config) 2056 ) 2057 file_uploader = None 2058 if model.file_uploader: 2059 file_uploader = self._create_component_from_model( 2060 model=model.file_uploader, config=config 2061 ) 2062 2063 stream_slicer: ConcurrentStreamSlicer = ( 2064 partition_router 2065 if isinstance(concurrent_cursor, FinalStateCursor) 2066 else concurrent_cursor 2067 ) 2068 retriever = self._create_component_from_model( 2069 model=model.retriever, 2070 config=config, 2071 name=model.name, 2072 primary_key=primary_key, 2073 request_options_provider=request_options_provider, 2074 stream_slicer=stream_slicer, 2075 partition_router=partition_router, 2076 stop_condition_cursor=concurrent_cursor 2077 if self._is_stop_condition_on_cursor(model) 2078 else None, 2079 client_side_incremental_sync={"cursor": concurrent_cursor} 2080 if self._is_client_side_filtering_enabled(model) 2081 else None, 2082 transformations=transformations, 2083 file_uploader=file_uploader, 2084 incremental_sync=model.incremental_sync, 2085 ) 2086 if isinstance(retriever, AsyncRetriever): 2087 stream_slicer = retriever.stream_slicer 2088 2089 schema_loader: Union[ 2090 CompositeSchemaLoader, 2091 DefaultSchemaLoader, 2092 DynamicSchemaLoader, 2093 InlineSchemaLoader, 2094 JsonFileSchemaLoader, 2095 ] 2096 if model.schema_loader and isinstance(model.schema_loader, list): 2097 nested_schema_loaders = [ 2098 self._create_component_from_model(model=nested_schema_loader, config=config) 2099 for nested_schema_loader in model.schema_loader 2100 ] 2101 schema_loader = CompositeSchemaLoader( 2102 schema_loaders=nested_schema_loaders, parameters={} 2103 ) 2104 elif model.schema_loader: 2105 schema_loader = self._create_component_from_model( 2106 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2107 config=config, 2108 ) 2109 else: 2110 options = model.parameters or {} 2111 if "name" not in options: 2112 options["name"] = model.name 2113 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2114 2115 stream_name = model.name or "" 2116 return DefaultStream( 2117 partition_generator=StreamSlicerPartitionGenerator( 2118 DeclarativePartitionFactory( 2119 stream_name, 2120 schema_loader, 2121 retriever, 2122 self._message_repository, 2123 ), 2124 stream_slicer, 2125 slice_limit=self._limit_slices_fetched, 2126 ), 2127 name=stream_name, 2128 json_schema=schema_loader.get_json_schema, 2129 
primary_key=get_primary_key_from_stream(primary_key),
2130 cursor_field=concurrent_cursor.cursor_field.cursor_field_key
2131 if hasattr(concurrent_cursor, "cursor_field")
2132 else "", # FIXME we should have the cursor field as part of the interface of cursor
2133 logger=logging.getLogger(f"airbyte.{stream_name}"),
2134 cursor=concurrent_cursor,
2135 supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader),
2136 )
2137
2138 def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool:
2139 return bool(
2140 model.incremental_sync
2141 and hasattr(model.incremental_sync, "is_data_feed")
2142 and model.incremental_sync.is_data_feed
2143 )
2144
2145 def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool:
2146 return bool(
2147 model.incremental_sync
2148 and hasattr(model.incremental_sync, "is_client_side_incremental")
2149 and model.incremental_sync.is_client_side_incremental
2150 )
2151
2152 def _build_stream_slicer_from_partition_router(
2153 self,
2154 model: Union[
2155 AsyncRetrieverModel,
2156 CustomRetrieverModel,
2157 SimpleRetrieverModel,
2158 ],
2159 config: Config,
2160 stream_name: Optional[str] = None,
2161 **kwargs: Any,
2162 ) -> PartitionRouter:
2163 if (
2164 hasattr(model, "partition_router")
2165 and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel))
2166 and model.partition_router
2167 ):
2168 stream_slicer_model = model.partition_router
2169 if isinstance(stream_slicer_model, list):
2170 return CartesianProductStreamSlicer(
2171 [
2172 self._create_component_from_model(
2173 model=slicer, config=config, stream_name=stream_name or ""
2174 )
2175 for slicer in stream_slicer_model
2176 ],
2177 parameters={},
2178 )
2179 elif isinstance(stream_slicer_model, dict):
2180 # the partition router comes from a CustomRetrieverModel and therefore has not been parsed as a model
2181 params = stream_slicer_model.get("$parameters")
2182 if not isinstance(params, dict):
2183 params = {}
2184 stream_slicer_model["$parameters"] = params
2185
2186 if stream_name is not None:
2187 params["stream_name"] = stream_name
2188
2189 return self._create_nested_component( # type: ignore[no-any-return] # There is no guarantee that this will return a stream slicer.
If not, we expect an AttributeError during the call to `stream_slices` 2190 model, 2191 "partition_router", 2192 stream_slicer_model, 2193 config, 2194 **kwargs, 2195 ) 2196 else: 2197 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2198 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2199 ) 2200 return SinglePartitionRouter(parameters={}) 2201 2202 def _build_concurrent_cursor( 2203 self, 2204 model: DeclarativeStreamModel, 2205 stream_slicer: Optional[PartitionRouter], 2206 config: Config, 2207 ) -> Cursor: 2208 stream_name = model.name or "" 2209 stream_state = self._connector_state_manager.get_stream_state( 2210 stream_name=stream_name, namespace=None 2211 ) 2212 2213 if model.state_migrations: 2214 state_transformations = [ 2215 self._create_component_from_model(state_migration, config, declarative_stream=model) 2216 for state_migration in model.state_migrations 2217 ] 2218 else: 2219 state_transformations = [] 2220 2221 if ( 2222 model.incremental_sync 2223 and stream_slicer 2224 and not isinstance(stream_slicer, SinglePartitionRouter) 2225 ): 2226 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2227 state_manager=self._connector_state_manager, 2228 model_type=DatetimeBasedCursorModel, 2229 component_definition=model.incremental_sync.__dict__, 2230 stream_name=stream_name, 2231 stream_namespace=None, 2232 config=config or {}, 2233 stream_state=stream_state, 2234 stream_state_migrations=state_transformations, 2235 partition_router=stream_slicer, 2236 attempt_to_create_cursor_if_not_provided=True, # FIXME can we remove that now? 2237 ) 2238 elif model.incremental_sync: 2239 if type(model.incremental_sync) == IncrementingCountCursorModel: 2240 return self.create_concurrent_cursor_from_incrementing_count_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2241 model_type=IncrementingCountCursorModel, 2242 component_definition=model.incremental_sync.__dict__, 2243 stream_name=stream_name, 2244 stream_namespace=None, 2245 config=config or {}, 2246 stream_state_migrations=state_transformations, 2247 ) 2248 elif type(model.incremental_sync) == DatetimeBasedCursorModel: 2249 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2250 model_type=type(model.incremental_sync), 2251 component_definition=model.incremental_sync.__dict__, 2252 stream_name=stream_name, 2253 stream_namespace=None, 2254 config=config or {}, 2255 stream_state_migrations=state_transformations, 2256 attempt_to_create_cursor_if_not_provided=True, 2257 ) 2258 else: 2259 raise ValueError( 2260 f"Incremental sync of type {type(model.incremental_sync)} is not supported" 2261 ) 2262 return FinalStateCursor(stream_name, None, self._message_repository) 2263 2264 def create_default_error_handler( 2265 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2266 ) -> DefaultErrorHandler: 2267 backoff_strategies = [] 2268 if model.backoff_strategies: 2269 for backoff_strategy_model in model.backoff_strategies: 2270 backoff_strategies.append( 2271 self._create_component_from_model(model=backoff_strategy_model, config=config) 2272 ) 2273 2274 response_filters = [] 2275 if model.response_filters: 2276 for response_filter_model in model.response_filters: 2277 response_filters.append( 2278 self._create_component_from_model(model=response_filter_model, config=config) 2279 ) 2280 response_filters.append( 2281 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2282 ) 2283 2284 return DefaultErrorHandler( 2285 backoff_strategies=backoff_strategies, 2286 max_retries=model.max_retries, 2287 response_filters=response_filters, 2288 config=config, 2289 parameters=model.parameters or {}, 2290 ) 2291 2292 def create_default_paginator( 2293 self, 2294 model: DefaultPaginatorModel, 2295 config: Config, 2296 *, 2297 url_base: str, 2298 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2299 decoder: Optional[Decoder] = None, 2300 cursor_used_for_stop_condition: Optional[Cursor] = None, 2301 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2302 if decoder: 2303 if self._is_supported_decoder_for_pagination(decoder): 2304 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2305 else: 2306 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2307 else: 2308 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2309 page_size_option = ( 2310 self._create_component_from_model(model=model.page_size_option, config=config) 2311 if model.page_size_option 2312 else None 2313 ) 2314 page_token_option = ( 2315 self._create_component_from_model(model=model.page_token_option, config=config) 2316 if model.page_token_option 2317 else None 2318 ) 2319 pagination_strategy = self._create_component_from_model( 2320 model=model.pagination_strategy, 2321 config=config, 2322 decoder=decoder_to_use, 2323 extractor_model=extractor_model, 2324 ) 2325 if cursor_used_for_stop_condition: 2326 pagination_strategy = StopConditionPaginationStrategyDecorator( 2327 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2328 ) 2329 paginator = DefaultPaginator( 2330 decoder=decoder_to_use, 2331 page_size_option=page_size_option, 2332 page_token_option=page_token_option, 2333 pagination_strategy=pagination_strategy, 2334 url_base=url_base, 2335 config=config, 2336 parameters=model.parameters or {}, 2337 ) 2338 if self._limit_pages_fetched_per_slice: 2339 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2340 return paginator 2341 2342 def create_dpath_extractor( 2343 
self, 2344 model: DpathExtractorModel, 2345 config: Config, 2346 decoder: Optional[Decoder] = None, 2347 **kwargs: Any, 2348 ) -> DpathExtractor: 2349 if decoder: 2350 decoder_to_use = decoder 2351 else: 2352 decoder_to_use = JsonDecoder(parameters={}) 2353 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2354 return DpathExtractor( 2355 decoder=decoder_to_use, 2356 field_path=model_field_path, 2357 config=config, 2358 parameters=model.parameters or {}, 2359 ) 2360 2361 @staticmethod 2362 def create_response_to_file_extractor( 2363 model: ResponseToFileExtractorModel, 2364 **kwargs: Any, 2365 ) -> ResponseToFileExtractor: 2366 return ResponseToFileExtractor(parameters=model.parameters or {}) 2367 2368 @staticmethod 2369 def create_exponential_backoff_strategy( 2370 model: ExponentialBackoffStrategyModel, config: Config 2371 ) -> ExponentialBackoffStrategy: 2372 return ExponentialBackoffStrategy( 2373 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2374 ) 2375 2376 @staticmethod 2377 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2378 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2379 2380 def create_http_requester( 2381 self, 2382 model: HttpRequesterModel, 2383 config: Config, 2384 decoder: Decoder = JsonDecoder(parameters={}), 2385 query_properties_key: Optional[str] = None, 2386 use_cache: Optional[bool] = None, 2387 *, 2388 name: str, 2389 ) -> HttpRequester: 2390 authenticator = ( 2391 self._create_component_from_model( 2392 model=model.authenticator, 2393 config=config, 2394 url_base=model.url or model.url_base, 2395 name=name, 2396 decoder=decoder, 2397 ) 2398 if model.authenticator 2399 else None 2400 ) 2401 error_handler = ( 2402 self._create_component_from_model(model=model.error_handler, config=config) 2403 if model.error_handler 2404 else DefaultErrorHandler( 2405 backoff_strategies=[], 2406 response_filters=[], 2407 config=config, 2408 parameters=model.parameters or {}, 2409 ) 2410 ) 2411 2412 api_budget = self._api_budget 2413 2414 # Removes QueryProperties components from the interpolated mappings because it has been designed 2415 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2416 # instead of through jinja interpolation 2417 request_parameters: Optional[Union[str, Mapping[str, str]]] 2418 if isinstance(model.request_parameters, Mapping): 2419 request_parameters = self._remove_query_properties(model.request_parameters) 2420 else: 2421 request_parameters = model.request_parameters 2422 2423 request_options_provider = InterpolatedRequestOptionsProvider( 2424 request_body=model.request_body, 2425 request_body_data=model.request_body_data, 2426 request_body_json=model.request_body_json, 2427 request_headers=model.request_headers, 2428 request_parameters=request_parameters, 2429 query_properties_key=query_properties_key, 2430 config=config, 2431 parameters=model.parameters or {}, 2432 ) 2433 2434 assert model.use_cache is not None # for mypy 2435 assert model.http_method is not None # for mypy 2436 2437 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2438 2439 return HttpRequester( 2440 name=name, 2441 url=model.url, 2442 url_base=model.url_base, 2443 path=model.path, 2444 authenticator=authenticator, 2445 error_handler=error_handler, 2446 api_budget=api_budget, 2447 http_method=HttpMethod[model.http_method.value], 2448 
request_options_provider=request_options_provider, 2449 config=config, 2450 disable_retries=self._disable_retries, 2451 parameters=model.parameters or {}, 2452 message_repository=self._message_repository, 2453 use_cache=should_use_cache, 2454 decoder=decoder, 2455 stream_response=decoder.is_stream_response() if decoder else False, 2456 ) 2457 2458 @staticmethod 2459 def create_http_response_filter( 2460 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2461 ) -> HttpResponseFilter: 2462 if model.action: 2463 action = ResponseAction(model.action.value) 2464 else: 2465 action = None 2466 2467 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2468 2469 http_codes = ( 2470 set(model.http_codes) if model.http_codes else set() 2471 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2472 2473 return HttpResponseFilter( 2474 action=action, 2475 failure_type=failure_type, 2476 error_message=model.error_message or "", 2477 error_message_contains=model.error_message_contains or "", 2478 http_codes=http_codes, 2479 predicate=model.predicate or "", 2480 config=config, 2481 parameters=model.parameters or {}, 2482 ) 2483 2484 @staticmethod 2485 def create_inline_schema_loader( 2486 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2487 ) -> InlineSchemaLoader: 2488 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2489 2490 def create_complex_field_type( 2491 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2492 ) -> ComplexFieldType: 2493 items = ( 2494 self._create_component_from_model(model=model.items, config=config) 2495 if isinstance(model.items, ComplexFieldTypeModel) 2496 else model.items 2497 ) 2498 2499 return ComplexFieldType(field_type=model.field_type, items=items) 2500 2501 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2502 target_type = ( 2503 self._create_component_from_model(model=model.target_type, config=config) 2504 if isinstance(model.target_type, ComplexFieldTypeModel) 2505 else model.target_type 2506 ) 2507 2508 return TypesMap( 2509 target_type=target_type, 2510 current_type=model.current_type, 2511 condition=model.condition if model.condition is not None else "True", 2512 ) 2513 2514 def create_schema_type_identifier( 2515 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2516 ) -> SchemaTypeIdentifier: 2517 types_mapping = [] 2518 if model.types_mapping: 2519 types_mapping.extend( 2520 [ 2521 self._create_component_from_model(types_map, config=config) 2522 for types_map in model.types_mapping 2523 ] 2524 ) 2525 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2526 [x for x in model.schema_pointer] if model.schema_pointer else [] 2527 ) 2528 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2529 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2530 [x for x in model.type_pointer] if model.type_pointer else None 2531 ) 2532 2533 return SchemaTypeIdentifier( 2534 schema_pointer=model_schema_pointer, 2535 key_pointer=model_key_pointer, 2536 type_pointer=model_type_pointer, 2537 types_mapping=types_mapping, 2538 parameters=model.parameters or {}, 2539 ) 2540 2541 def create_dynamic_schema_loader( 2542 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2543 ) -> DynamicSchemaLoader: 2544 schema_transformations = [] 2545 if model.schema_transformations: 2546 for transformation_model in 
model.schema_transformations: 2547 schema_transformations.append( 2548 self._create_component_from_model(model=transformation_model, config=config) 2549 ) 2550 name = "dynamic_properties" 2551 retriever = self._create_component_from_model( 2552 model=model.retriever, 2553 config=config, 2554 name=name, 2555 primary_key=None, 2556 partition_router=self._build_stream_slicer_from_partition_router( 2557 model.retriever, config 2558 ), 2559 transformations=[], 2560 use_cache=True, 2561 log_formatter=( 2562 lambda response: format_http_message( 2563 response, 2564 f"Schema loader '{name}' request", 2565 f"Request performed in order to extract schema.", 2566 name, 2567 is_auxiliary=True, 2568 ) 2569 ), 2570 ) 2571 schema_type_identifier = self._create_component_from_model( 2572 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2573 ) 2574 schema_filter = ( 2575 self._create_component_from_model( 2576 model.schema_filter, config=config, parameters=model.parameters or {} 2577 ) 2578 if model.schema_filter is not None 2579 else None 2580 ) 2581 2582 return DynamicSchemaLoader( 2583 retriever=retriever, 2584 config=config, 2585 schema_transformations=schema_transformations, 2586 schema_filter=schema_filter, 2587 schema_type_identifier=schema_type_identifier, 2588 parameters=model.parameters or {}, 2589 ) 2590 2591 @staticmethod 2592 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2593 return JsonDecoder(parameters={}) 2594 2595 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2596 return CompositeRawDecoder( 2597 parser=ModelToComponentFactory._get_parser(model, config), 2598 stream_response=False if self._emit_connector_builder_messages else True, 2599 ) 2600 2601 def create_jsonl_decoder( 2602 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2603 ) -> Decoder: 2604 return CompositeRawDecoder( 2605 parser=ModelToComponentFactory._get_parser(model, config), 2606 stream_response=False if self._emit_connector_builder_messages else True, 2607 ) 2608 2609 def create_gzip_decoder( 2610 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2611 ) -> Decoder: 2612 _compressed_response_types = { 2613 "gzip", 2614 "x-gzip", 2615 "gzip, deflate", 2616 "x-gzip, deflate", 2617 "application/zip", 2618 "application/gzip", 2619 "application/x-gzip", 2620 "application/x-zip-compressed", 2621 } 2622 2623 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2624 2625 if self._emit_connector_builder_messages: 2626 # This is very surprising but if the response is not streamed, 2627 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2628 # which uses urllib3 directly and does not uncompress the data. 
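# When builder messages are emitted we therefore hand response.content (already uncompressed by requests)
# straight to the inner parser and disable response streaming, skipping the gzip wrapper entirely.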
2629 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2630 2631 return CompositeRawDecoder.by_headers( 2632 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2633 stream_response=True, 2634 fallback_parser=gzip_parser.inner_parser, 2635 ) 2636 2637 # todo: This method should be removed once we deprecate the SimpleRetriever.cursor field and the various 2638 # state methods 2639 @staticmethod 2640 def create_incrementing_count_cursor( 2641 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2642 ) -> DatetimeBasedCursor: 2643 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2644 # we still parse models into components. The issue is that there's no runtime implementation of a 2645 # IncrementingCountCursor. 2646 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 2647 return DatetimeBasedCursor( 2648 cursor_field=model.cursor_field, 2649 datetime_format="%Y-%m-%d", 2650 start_datetime="2024-12-12", 2651 config=config, 2652 parameters={}, 2653 ) 2654 2655 @staticmethod 2656 def create_iterable_decoder( 2657 model: IterableDecoderModel, config: Config, **kwargs: Any 2658 ) -> IterableDecoder: 2659 return IterableDecoder(parameters={}) 2660 2661 @staticmethod 2662 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2663 return XmlDecoder(parameters={}) 2664 2665 def create_zipfile_decoder( 2666 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2667 ) -> ZipfileDecoder: 2668 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2669 2670 @staticmethod 2671 def _get_parser(model: BaseModel, config: Config) -> Parser: 2672 if isinstance(model, JsonDecoderModel): 2673 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2674 return JsonParser() 2675 elif isinstance(model, JsonlDecoderModel): 2676 return JsonLineParser() 2677 elif isinstance(model, CsvDecoderModel): 2678 return CsvParser( 2679 encoding=model.encoding, 2680 delimiter=model.delimiter, 2681 set_values_to_none=model.set_values_to_none, 2682 ) 2683 elif isinstance(model, GzipDecoderModel): 2684 return GzipParser( 2685 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2686 ) 2687 elif isinstance( 2688 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2689 ): 2690 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2691 2692 raise ValueError(f"Unknown decoder type {model}") 2693 2694 @staticmethod 2695 def create_json_file_schema_loader( 2696 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2697 ) -> JsonFileSchemaLoader: 2698 return JsonFileSchemaLoader( 2699 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2700 ) 2701 2702 @staticmethod 2703 def create_jwt_authenticator( 2704 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2705 ) -> JwtAuthenticator: 2706 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2707 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2708 return JwtAuthenticator( 2709 config=config, 2710 parameters=model.parameters or {}, 2711 algorithm=JwtAlgorithm(model.algorithm.value), 2712 secret_key=model.secret_key, 2713 
base64_encode_secret_key=model.base64_encode_secret_key, 2714 token_duration=model.token_duration, 2715 header_prefix=model.header_prefix, 2716 kid=jwt_headers.kid, 2717 typ=jwt_headers.typ, 2718 cty=jwt_headers.cty, 2719 iss=jwt_payload.iss, 2720 sub=jwt_payload.sub, 2721 aud=jwt_payload.aud, 2722 additional_jwt_headers=model.additional_jwt_headers, 2723 additional_jwt_payload=model.additional_jwt_payload, 2724 ) 2725 2726 def create_list_partition_router( 2727 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2728 ) -> ListPartitionRouter: 2729 request_option = ( 2730 self._create_component_from_model(model.request_option, config) 2731 if model.request_option 2732 else None 2733 ) 2734 return ListPartitionRouter( 2735 cursor_field=model.cursor_field, 2736 request_option=request_option, 2737 values=model.values, 2738 config=config, 2739 parameters=model.parameters or {}, 2740 ) 2741 2742 @staticmethod 2743 def create_min_max_datetime( 2744 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2745 ) -> MinMaxDatetime: 2746 return MinMaxDatetime( 2747 datetime=model.datetime, 2748 datetime_format=model.datetime_format or "", 2749 max_datetime=model.max_datetime or "", 2750 min_datetime=model.min_datetime or "", 2751 parameters=model.parameters or {}, 2752 ) 2753 2754 @staticmethod 2755 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2756 return NoAuth(parameters=model.parameters or {}) 2757 2758 @staticmethod 2759 def create_no_pagination( 2760 model: NoPaginationModel, config: Config, **kwargs: Any 2761 ) -> NoPagination: 2762 return NoPagination(parameters={}) 2763 2764 def create_oauth_authenticator( 2765 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2766 ) -> DeclarativeOauth2Authenticator: 2767 profile_assertion = ( 2768 self._create_component_from_model(model.profile_assertion, config=config) 2769 if model.profile_assertion 2770 else None 2771 ) 2772 2773 if model.refresh_token_updater: 2774 # ignore type error because fixing it would have a lot of dependencies, revisit later 2775 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2776 config, 2777 InterpolatedString.create( 2778 model.token_refresh_endpoint, # type: ignore 2779 parameters=model.parameters or {}, 2780 ).eval(config), 2781 access_token_name=InterpolatedString.create( 2782 model.access_token_name or "access_token", parameters=model.parameters or {} 2783 ).eval(config), 2784 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2785 expires_in_name=InterpolatedString.create( 2786 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2787 ).eval(config), 2788 client_id_name=InterpolatedString.create( 2789 model.client_id_name or "client_id", parameters=model.parameters or {} 2790 ).eval(config), 2791 client_id=InterpolatedString.create( 2792 model.client_id, parameters=model.parameters or {} 2793 ).eval(config) 2794 if model.client_id 2795 else model.client_id, 2796 client_secret_name=InterpolatedString.create( 2797 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2798 ).eval(config), 2799 client_secret=InterpolatedString.create( 2800 model.client_secret, parameters=model.parameters or {} 2801 ).eval(config) 2802 if model.client_secret 2803 else model.client_secret, 2804 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2805 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2806 
token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2807 grant_type_name=InterpolatedString.create( 2808 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2809 ).eval(config), 2810 grant_type=InterpolatedString.create( 2811 model.grant_type or "refresh_token", parameters=model.parameters or {} 2812 ).eval(config), 2813 refresh_request_body=InterpolatedMapping( 2814 model.refresh_request_body or {}, parameters=model.parameters or {} 2815 ).eval(config), 2816 refresh_request_headers=InterpolatedMapping( 2817 model.refresh_request_headers or {}, parameters=model.parameters or {} 2818 ).eval(config), 2819 scopes=model.scopes, 2820 token_expiry_date_format=model.token_expiry_date_format, 2821 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2822 message_repository=self._message_repository, 2823 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2824 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2825 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2826 ) 2827 # ignore type error because fixing it would have a lot of dependencies, revisit later 2828 return DeclarativeOauth2Authenticator( # type: ignore 2829 access_token_name=model.access_token_name or "access_token", 2830 access_token_value=model.access_token_value, 2831 client_id_name=model.client_id_name or "client_id", 2832 client_id=model.client_id, 2833 client_secret_name=model.client_secret_name or "client_secret", 2834 client_secret=model.client_secret, 2835 expires_in_name=model.expires_in_name or "expires_in", 2836 grant_type_name=model.grant_type_name or "grant_type", 2837 grant_type=model.grant_type or "refresh_token", 2838 refresh_request_body=model.refresh_request_body, 2839 refresh_request_headers=model.refresh_request_headers, 2840 refresh_token_name=model.refresh_token_name or "refresh_token", 2841 refresh_token=model.refresh_token, 2842 scopes=model.scopes, 2843 token_expiry_date=model.token_expiry_date, 2844 token_expiry_date_format=model.token_expiry_date_format, 2845 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2846 token_refresh_endpoint=model.token_refresh_endpoint, 2847 config=config, 2848 parameters=model.parameters or {}, 2849 message_repository=self._message_repository, 2850 profile_assertion=profile_assertion, 2851 use_profile_assertion=model.use_profile_assertion, 2852 ) 2853 2854 def create_offset_increment( 2855 self, 2856 model: OffsetIncrementModel, 2857 config: Config, 2858 decoder: Decoder, 2859 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2860 **kwargs: Any, 2861 ) -> OffsetIncrement: 2862 if isinstance(decoder, PaginationDecoderDecorator): 2863 inner_decoder = decoder.decoder 2864 else: 2865 inner_decoder = decoder 2866 decoder = PaginationDecoderDecorator(decoder=decoder) 2867 2868 if self._is_supported_decoder_for_pagination(inner_decoder): 2869 decoder_to_use = decoder 2870 else: 2871 raise ValueError( 2872 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2873 ) 2874 2875 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2876 # so that it can be shared by OffSetIncrement and RecordSelector. 
However, due to how we instantiate the 2877 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2878 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2879 # When we have more time to investigate we can look into reusing the same component. 2880 extractor = ( 2881 self._create_component_from_model( 2882 model=extractor_model, config=config, decoder=decoder_to_use 2883 ) 2884 if extractor_model 2885 else None 2886 ) 2887 2888 return OffsetIncrement( 2889 page_size=model.page_size, 2890 config=config, 2891 decoder=decoder_to_use, 2892 extractor=extractor, 2893 inject_on_first_request=model.inject_on_first_request or False, 2894 parameters=model.parameters or {}, 2895 ) 2896 2897 @staticmethod 2898 def create_page_increment( 2899 model: PageIncrementModel, config: Config, **kwargs: Any 2900 ) -> PageIncrement: 2901 return PageIncrement( 2902 page_size=model.page_size, 2903 config=config, 2904 start_from_page=model.start_from_page or 0, 2905 inject_on_first_request=model.inject_on_first_request or False, 2906 parameters=model.parameters or {}, 2907 ) 2908 2909 def create_parent_stream_config( 2910 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 2911 ) -> ParentStreamConfig: 2912 declarative_stream = self._create_component_from_model( 2913 model.stream, 2914 config=config, 2915 is_parent=True, 2916 **kwargs, 2917 ) 2918 request_option = ( 2919 self._create_component_from_model(model.request_option, config=config) 2920 if model.request_option 2921 else None 2922 ) 2923 2924 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2925 raise ValueError( 2926 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 
2927 ) 2928 2929 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2930 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2931 ) 2932 2933 return ParentStreamConfig( 2934 parent_key=model.parent_key, 2935 request_option=request_option, 2936 stream=declarative_stream, 2937 partition_field=model.partition_field, 2938 config=config, 2939 incremental_dependency=model.incremental_dependency or False, 2940 parameters=model.parameters or {}, 2941 extra_fields=model.extra_fields, 2942 lazy_read_pointer=model_lazy_read_pointer, 2943 ) 2944 2945 def create_properties_from_endpoint( 2946 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2947 ) -> PropertiesFromEndpoint: 2948 retriever = self._create_component_from_model( 2949 model=model.retriever, 2950 config=config, 2951 name="dynamic_properties", 2952 primary_key=None, 2953 stream_slicer=None, 2954 transformations=[], 2955 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to be different 2956 ) 2957 return PropertiesFromEndpoint( 2958 property_field_path=model.property_field_path, 2959 retriever=retriever, 2960 config=config, 2961 parameters=model.parameters or {}, 2962 ) 2963 2964 def create_property_chunking( 2965 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2966 ) -> PropertyChunking: 2967 record_merge_strategy = ( 2968 self._create_component_from_model( 2969 model=model.record_merge_strategy, config=config, **kwargs 2970 ) 2971 if model.record_merge_strategy 2972 else None 2973 ) 2974 2975 property_limit_type: PropertyLimitType 2976 match model.property_limit_type: 2977 case PropertyLimitTypeModel.property_count: 2978 property_limit_type = PropertyLimitType.property_count 2979 case PropertyLimitTypeModel.characters: 2980 property_limit_type = PropertyLimitType.characters 2981 case _: 2982 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}") 2983 2984 return PropertyChunking( 2985 property_limit_type=property_limit_type, 2986 property_limit=model.property_limit, 2987 record_merge_strategy=record_merge_strategy, 2988 config=config, 2989 parameters=model.parameters or {}, 2990 ) 2991 2992 def create_query_properties( 2993 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2994 ) -> QueryProperties: 2995 if isinstance(model.property_list, list): 2996 property_list = model.property_list 2997 else: 2998 property_list = self._create_component_from_model( 2999 model=model.property_list, config=config, **kwargs 3000 ) 3001 3002 property_chunking = ( 3003 self._create_component_from_model( 3004 model=model.property_chunking, config=config, **kwargs 3005 ) 3006 if model.property_chunking 3007 else None 3008 ) 3009 3010 return QueryProperties( 3011 property_list=property_list, 3012 always_include_properties=model.always_include_properties, 3013 property_chunking=property_chunking, 3014 config=config, 3015 parameters=model.parameters or {}, 3016 ) 3017 3018 @staticmethod 3019 def create_record_filter( 3020 model: RecordFilterModel, config: Config, **kwargs: Any 3021 ) -> RecordFilter: 3022 return RecordFilter( 3023 condition=model.condition or "", config=config, parameters=model.parameters or {} 3024 ) 3025 3026 @staticmethod 3027 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 3028 return RequestPath(parameters={}) 3029 3030 @staticmethod 3031 def
create_request_option( 3032 model: RequestOptionModel, config: Config, **kwargs: Any 3033 ) -> RequestOption: 3034 inject_into = RequestOptionType(model.inject_into.value) 3035 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3036 [ 3037 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3038 for segment in model.field_path 3039 ] 3040 if model.field_path 3041 else None 3042 ) 3043 field_name = ( 3044 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3045 if model.field_name 3046 else None 3047 ) 3048 return RequestOption( 3049 field_name=field_name, 3050 field_path=field_path, 3051 inject_into=inject_into, 3052 parameters=kwargs.get("parameters", {}), 3053 ) 3054 3055 def create_record_selector( 3056 self, 3057 model: RecordSelectorModel, 3058 config: Config, 3059 *, 3060 name: str, 3061 transformations: List[RecordTransformation] | None = None, 3062 decoder: Decoder | None = None, 3063 client_side_incremental_sync: Dict[str, Any] | None = None, 3064 file_uploader: Optional[DefaultFileUploader] = None, 3065 **kwargs: Any, 3066 ) -> RecordSelector: 3067 extractor = self._create_component_from_model( 3068 model=model.extractor, decoder=decoder, config=config 3069 ) 3070 record_filter = ( 3071 self._create_component_from_model(model.record_filter, config=config) 3072 if model.record_filter 3073 else None 3074 ) 3075 3076 transform_before_filtering = ( 3077 False if model.transform_before_filtering is None else model.transform_before_filtering 3078 ) 3079 if client_side_incremental_sync: 3080 record_filter = ClientSideIncrementalRecordFilterDecorator( 3081 config=config, 3082 parameters=model.parameters, 3083 condition=model.record_filter.condition 3084 if (model.record_filter and hasattr(model.record_filter, "condition")) 3085 else None, 3086 **client_side_incremental_sync, 3087 ) 3088 transform_before_filtering = ( 3089 True 3090 if model.transform_before_filtering is None 3091 else model.transform_before_filtering 3092 ) 3093 3094 if model.schema_normalization is None: 3095 # default to no schema normalization if not set 3096 model.schema_normalization = SchemaNormalizationModel.None_ 3097 3098 schema_normalization = ( 3099 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3100 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3101 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3102 ) 3103 3104 return RecordSelector( 3105 extractor=extractor, 3106 name=name, 3107 config=config, 3108 record_filter=record_filter, 3109 transformations=transformations or [], 3110 file_uploader=file_uploader, 3111 schema_normalization=schema_normalization, 3112 parameters=model.parameters or {}, 3113 transform_before_filtering=transform_before_filtering, 3114 ) 3115 3116 @staticmethod 3117 def create_remove_fields( 3118 model: RemoveFieldsModel, config: Config, **kwargs: Any 3119 ) -> RemoveFields: 3120 return RemoveFields( 3121 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3122 ) 3123 3124 def create_selective_authenticator( 3125 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3126 ) -> DeclarativeAuthenticator: 3127 authenticators = { 3128 name: self._create_component_from_model(model=auth, config=config) 3129 for name, auth in model.authenticators.items() 3130 } 3131 # SelectiveAuthenticator will return instance of 
DeclarativeAuthenticator or raise ValueError error 3132 return SelectiveAuthenticator( # type: ignore[abstract] 3133 config=config, 3134 authenticators=authenticators, 3135 authenticator_selection_path=model.authenticator_selection_path, 3136 **kwargs, 3137 ) 3138 3139 @staticmethod 3140 def create_legacy_session_token_authenticator( 3141 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3142 ) -> LegacySessionTokenAuthenticator: 3143 return LegacySessionTokenAuthenticator( 3144 api_url=url_base, 3145 header=model.header, 3146 login_url=model.login_url, 3147 password=model.password or "", 3148 session_token=model.session_token or "", 3149 session_token_response_key=model.session_token_response_key or "", 3150 username=model.username or "", 3151 validate_session_url=model.validate_session_url, 3152 config=config, 3153 parameters=model.parameters or {}, 3154 ) 3155 3156 def create_simple_retriever( 3157 self, 3158 model: SimpleRetrieverModel, 3159 config: Config, 3160 *, 3161 name: str, 3162 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3163 request_options_provider: Optional[RequestOptionsProvider] = None, 3164 stop_condition_cursor: Optional[Cursor] = None, 3165 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3166 transformations: List[RecordTransformation], 3167 file_uploader: Optional[DefaultFileUploader] = None, 3168 incremental_sync: Optional[ 3169 Union[IncrementingCountCursorModel, DatetimeBasedCursorModel] 3170 ] = None, 3171 use_cache: Optional[bool] = None, 3172 log_formatter: Optional[Callable[[Response], Any]] = None, 3173 partition_router: Optional[PartitionRouter] = None, 3174 **kwargs: Any, 3175 ) -> SimpleRetriever: 3176 def _get_url(req: Requester) -> str: 3177 """ 3178 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3179 This is needed because the URL is not set until the requester is created. 
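For example (illustrative values only): a requester model that declares url: "https://api.example.com/v1/items" resolves to that value directly, while a requester that only declares url_base falls back to req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None).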
3180 """ 3181 3182 _url: str = ( 3183 model.requester.url 3184 if hasattr(model.requester, "url") and model.requester.url is not None 3185 else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) 3186 ) 3187 _url_base: str = ( 3188 model.requester.url_base 3189 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3190 else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None) 3191 ) 3192 3193 return _url or _url_base 3194 3195 decoder = ( 3196 self._create_component_from_model(model=model.decoder, config=config) 3197 if model.decoder 3198 else JsonDecoder(parameters={}) 3199 ) 3200 record_selector = self._create_component_from_model( 3201 model=model.record_selector, 3202 name=name, 3203 config=config, 3204 decoder=decoder, 3205 transformations=transformations, 3206 client_side_incremental_sync=client_side_incremental_sync, 3207 file_uploader=file_uploader, 3208 ) 3209 3210 query_properties: Optional[QueryProperties] = None 3211 query_properties_key: Optional[str] = None 3212 if self._query_properties_in_request_parameters(model.requester): 3213 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3214 # places instead of default to request_parameters which isn't clearly documented 3215 if ( 3216 hasattr(model.requester, "fetch_properties_from_endpoint") 3217 and model.requester.fetch_properties_from_endpoint 3218 ): 3219 raise ValueError( 3220 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3221 ) 3222 3223 query_properties_definitions = [] 3224 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3225 if isinstance(request_parameter, QueryPropertiesModel): 3226 query_properties_key = key 3227 query_properties_definitions.append(request_parameter) 3228 3229 if len(query_properties_definitions) > 1: 3230 raise ValueError( 3231 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3232 ) 3233 3234 if len(query_properties_definitions) == 1: 3235 query_properties = self._create_component_from_model( 3236 model=query_properties_definitions[0], config=config 3237 ) 3238 elif ( 3239 hasattr(model.requester, "fetch_properties_from_endpoint") 3240 and model.requester.fetch_properties_from_endpoint 3241 ): 3242 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3243 query_properties_definition = QueryPropertiesModel( 3244 type="QueryProperties", 3245 property_list=model.requester.fetch_properties_from_endpoint, 3246 always_include_properties=None, 3247 property_chunking=None, 3248 ) # type: ignore # $parameters has a default value 3249 3250 query_properties = self.create_query_properties( 3251 model=query_properties_definition, 3252 config=config, 3253 ) 3254 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3255 query_properties = self.create_query_properties( 3256 model=model.requester.query_properties, 3257 config=config, 3258 ) 3259 3260 requester = self._create_component_from_model( 3261 model=model.requester, 3262 decoder=decoder, 3263 name=name, 3264 query_properties_key=query_properties_key, 3265 use_cache=use_cache, 3266 config=config, 3267 ) 3268 3269 if not 
request_options_provider: 3270 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3271 if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance( 3272 partition_router, PartitionRouter 3273 ): 3274 request_options_provider = partition_router 3275 3276 paginator = ( 3277 self._create_component_from_model( 3278 model=model.paginator, 3279 config=config, 3280 url_base=_get_url(requester), 3281 extractor_model=model.record_selector.extractor, 3282 decoder=decoder, 3283 cursor_used_for_stop_condition=stop_condition_cursor or None, 3284 ) 3285 if model.paginator 3286 else NoPagination(parameters={}) 3287 ) 3288 3289 ignore_stream_slicer_parameters_on_paginated_requests = ( 3290 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3291 ) 3292 3293 if ( 3294 model.partition_router 3295 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3296 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3297 and any( 3298 parent_stream_config.lazy_read_pointer 3299 for parent_stream_config in model.partition_router.parent_stream_configs 3300 ) 3301 ): 3302 if incremental_sync: 3303 if incremental_sync.type != "DatetimeBasedCursor": 3304 raise ValueError( 3305 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3306 ) 3307 3308 elif incremental_sync.step or incremental_sync.cursor_granularity: 3309 raise ValueError( 3310 f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3311 ) 3312 3313 if model.decoder and model.decoder.type != "JsonDecoder": 3314 raise ValueError( 3315 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}." 3316 ) 3317 3318 return LazySimpleRetriever( 3319 name=name, 3320 paginator=paginator, 3321 primary_key=primary_key, 3322 requester=requester, 3323 record_selector=record_selector, 3324 stream_slicer=_NO_STREAM_SLICING, 3325 request_option_provider=request_options_provider, 3326 cursor=None, 3327 config=config, 3328 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3329 parameters=model.parameters or {}, 3330 ) 3331 3332 return SimpleRetriever( 3333 name=name, 3334 paginator=paginator, 3335 primary_key=primary_key, 3336 requester=requester, 3337 record_selector=record_selector, 3338 stream_slicer=_NO_STREAM_SLICING, 3339 request_option_provider=request_options_provider, 3340 cursor=None, 3341 config=config, 3342 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3343 additional_query_properties=query_properties, 3344 log_formatter=self._get_log_formatter(log_formatter, name), 3345 parameters=model.parameters or {}, 3346 ) 3347 3348 def _get_log_formatter( 3349 self, log_formatter: Callable[[Response], Any] | None, name: str 3350 ) -> Callable[[Response], Any] | None: 3351 if self._should_limit_slices_fetched(): 3352 return ( 3353 ( 3354 lambda response: format_http_message( 3355 response, 3356 f"Stream '{name}' request", 3357 f"Request performed in order to extract records for stream '{name}'", 3358 name, 3359 ) 3360 ) 3361 if not log_formatter 3362 else log_formatter 3363 ) 3364 return None 3365 3366 def _should_limit_slices_fetched(self) -> bool: 3367 """ 3368 Returns True if the number of slices fetched should be limited, False otherwise. 3369 This is used to limit the number of slices fetched during tests. 
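In practice this is True for connector builder test reads (where _emit_connector_builder_messages is set) or when an explicit _limit_slices_fetched value was passed to the factory.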
3370 """ 3371 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3372 3373 @staticmethod 3374 def _query_properties_in_request_parameters( 3375 requester: Union[HttpRequesterModel, CustomRequesterModel], 3376 ) -> bool: 3377 if not hasattr(requester, "request_parameters"): 3378 return False 3379 request_parameters = requester.request_parameters 3380 if request_parameters and isinstance(request_parameters, Mapping): 3381 for request_parameter in request_parameters.values(): 3382 if isinstance(request_parameter, QueryPropertiesModel): 3383 return True 3384 return False 3385 3386 @staticmethod 3387 def _remove_query_properties( 3388 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3389 ) -> Mapping[str, str]: 3390 return { 3391 parameter_field: request_parameter 3392 for parameter_field, request_parameter in request_parameters.items() 3393 if not isinstance(request_parameter, QueryPropertiesModel) 3394 } 3395 3396 def create_state_delegating_stream( 3397 self, 3398 model: StateDelegatingStreamModel, 3399 config: Config, 3400 has_parent_state: Optional[bool] = None, 3401 **kwargs: Any, 3402 ) -> DeclarativeStream: 3403 if ( 3404 model.full_refresh_stream.name != model.name 3405 or model.name != model.incremental_stream.name 3406 ): 3407 raise ValueError( 3408 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 3409 ) 3410 3411 stream_model = self._get_state_delegating_stream_model( 3412 False if has_parent_state is None else has_parent_state, model 3413 ) 3414 3415 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is alwyas DeclarativeStreamModel 3416 3417 def _get_state_delegating_stream_model( 3418 self, has_parent_state: bool, model: StateDelegatingStreamModel 3419 ) -> DeclarativeStreamModel: 3420 return ( 3421 model.incremental_stream 3422 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3423 else model.full_refresh_stream 3424 ) 3425 3426 def _create_async_job_status_mapping( 3427 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3428 ) -> Mapping[str, AsyncJobStatus]: 3429 api_status_to_cdk_status = {} 3430 for cdk_status, api_statuses in model.dict().items(): 3431 if cdk_status == "type": 3432 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3433 continue 3434 3435 for status in api_statuses: 3436 if status in api_status_to_cdk_status: 3437 raise ValueError( 3438 f"API status {status} is already set for CDK status {cdk_status}. 
Please ensure API statuses are only provided once" 3439 ) 3440 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3441 return api_status_to_cdk_status 3442 3443 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3444 match status: 3445 case "running": 3446 return AsyncJobStatus.RUNNING 3447 case "completed": 3448 return AsyncJobStatus.COMPLETED 3449 case "failed": 3450 return AsyncJobStatus.FAILED 3451 case "timeout": 3452 return AsyncJobStatus.TIMED_OUT 3453 case _: 3454 raise ValueError(f"Unsupported CDK status {status}") 3455 3456 def create_async_retriever( 3457 self, 3458 model: AsyncRetrieverModel, 3459 config: Config, 3460 *, 3461 name: str, 3462 primary_key: Optional[ 3463 Union[str, List[str], List[List[str]]] 3464 ], # this seems to be needed to match create_simple_retriever 3465 stream_slicer: Optional[StreamSlicer], 3466 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3467 transformations: List[RecordTransformation], 3468 **kwargs: Any, 3469 ) -> AsyncRetriever: 3470 if model.download_target_requester and not model.download_target_extractor: 3471 raise ValueError( 3472 f"`download_target_extractor` required if using a `download_target_requester`" 3473 ) 3474 3475 def _get_download_retriever( 3476 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3477 ) -> SimpleRetriever: 3478 # We create a record selector for the download retriever 3479 # with no schema normalization and no transformations, neither record filter 3480 # as all this occurs in the record_selector of the AsyncRetriever 3481 record_selector = RecordSelector( 3482 extractor=extractor, 3483 name=name, 3484 record_filter=None, 3485 transformations=[], 3486 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3487 config=config, 3488 parameters={}, 3489 ) 3490 paginator = ( 3491 self._create_component_from_model( 3492 model=model.download_paginator, 3493 decoder=_decoder, 3494 config=config, 3495 url_base="", 3496 ) 3497 if model.download_paginator 3498 else NoPagination(parameters={}) 3499 ) 3500 3501 return SimpleRetriever( 3502 requester=requester, 3503 record_selector=record_selector, 3504 primary_key=None, 3505 name=name, 3506 paginator=paginator, 3507 config=config, 3508 parameters={}, 3509 log_formatter=self._get_log_formatter(None, name), 3510 ) 3511 3512 def _get_job_timeout() -> datetime.timedelta: 3513 user_defined_timeout: Optional[int] = ( 3514 int( 3515 InterpolatedString.create( 3516 str(model.polling_job_timeout), 3517 parameters={}, 3518 ).eval(config) 3519 ) 3520 if model.polling_job_timeout 3521 else None 3522 ) 3523 3524 # check for user defined timeout during the test read or 15 minutes 3525 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3526 # default value for non-connector builder is 60 minutes. 
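# e.g. (illustrative) a manifest that sets polling_job_timeout: 30 results in a 30-minute timeout for both test reads and regular syncs, overriding the 15/60 minute defaults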
3527 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3528 3529 return ( 3530 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3531 ) 3532 3533 decoder = ( 3534 self._create_component_from_model(model=model.decoder, config=config) 3535 if model.decoder 3536 else JsonDecoder(parameters={}) 3537 ) 3538 record_selector = self._create_component_from_model( 3539 model=model.record_selector, 3540 config=config, 3541 decoder=decoder, 3542 name=name, 3543 transformations=transformations, 3544 client_side_incremental_sync=client_side_incremental_sync, 3545 ) 3546 3547 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3548 if self._should_limit_slices_fetched(): 3549 stream_slicer = cast( 3550 StreamSlicer, 3551 StreamSlicerTestReadDecorator( 3552 wrapped_slicer=stream_slicer, 3553 maximum_number_of_slices=self._limit_slices_fetched or 5, 3554 ), 3555 ) 3556 3557 creation_requester = self._create_component_from_model( 3558 model=model.creation_requester, 3559 decoder=decoder, 3560 config=config, 3561 name=f"job creation - {name}", 3562 ) 3563 polling_requester = self._create_component_from_model( 3564 model=model.polling_requester, 3565 decoder=decoder, 3566 config=config, 3567 name=f"job polling - {name}", 3568 ) 3569 job_download_components_name = f"job download - {name}" 3570 download_decoder = ( 3571 self._create_component_from_model(model=model.download_decoder, config=config) 3572 if model.download_decoder 3573 else JsonDecoder(parameters={}) 3574 ) 3575 download_extractor = ( 3576 self._create_component_from_model( 3577 model=model.download_extractor, 3578 config=config, 3579 decoder=download_decoder, 3580 parameters=model.parameters, 3581 ) 3582 if model.download_extractor 3583 else DpathExtractor( 3584 [], 3585 config=config, 3586 decoder=download_decoder, 3587 parameters=model.parameters or {}, 3588 ) 3589 ) 3590 download_requester = self._create_component_from_model( 3591 model=model.download_requester, 3592 decoder=download_decoder, 3593 config=config, 3594 name=job_download_components_name, 3595 ) 3596 download_retriever = _get_download_retriever( 3597 download_requester, download_extractor, download_decoder 3598 ) 3599 abort_requester = ( 3600 self._create_component_from_model( 3601 model=model.abort_requester, 3602 decoder=decoder, 3603 config=config, 3604 name=f"job abort - {name}", 3605 ) 3606 if model.abort_requester 3607 else None 3608 ) 3609 delete_requester = ( 3610 self._create_component_from_model( 3611 model=model.delete_requester, 3612 decoder=decoder, 3613 config=config, 3614 name=f"job delete - {name}", 3615 ) 3616 if model.delete_requester 3617 else None 3618 ) 3619 download_target_requester = ( 3620 self._create_component_from_model( 3621 model=model.download_target_requester, 3622 decoder=decoder, 3623 config=config, 3624 name=f"job extract_url - {name}", 3625 ) 3626 if model.download_target_requester 3627 else None 3628 ) 3629 status_extractor = self._create_component_from_model( 3630 model=model.status_extractor, decoder=decoder, config=config, name=name 3631 ) 3632 download_target_extractor = ( 3633 self._create_component_from_model( 3634 model=model.download_target_extractor, 3635 decoder=decoder, 3636 config=config, 3637 name=name, 3638 ) 3639 if model.download_target_extractor 3640 else None 3641 ) 3642 3643 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3644 creation_requester=creation_requester, 3645 polling_requester=polling_requester, 3646 
download_retriever=download_retriever, 3647 download_target_requester=download_target_requester, 3648 abort_requester=abort_requester, 3649 delete_requester=delete_requester, 3650 status_extractor=status_extractor, 3651 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3652 download_target_extractor=download_target_extractor, 3653 job_timeout=_get_job_timeout(), 3654 ) 3655 3656 async_job_partition_router = AsyncJobPartitionRouter( 3657 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3658 job_repository, 3659 stream_slices, 3660 self._job_tracker, 3661 self._message_repository, 3662 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3663 has_bulk_parent=False, 3664 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3665 # `None` == default retry is set to 3 attempts, under the hood. 3666 job_max_retry=1 if self._emit_connector_builder_messages else None, 3667 ), 3668 stream_slicer=stream_slicer, 3669 config=config, 3670 parameters=model.parameters or {}, 3671 ) 3672 3673 return AsyncRetriever( 3674 record_selector=record_selector, 3675 stream_slicer=async_job_partition_router, 3676 config=config, 3677 parameters=model.parameters or {}, 3678 ) 3679 3680 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3681 config_migrations = [ 3682 self._create_component_from_model(migration, config) 3683 for migration in ( 3684 model.config_normalization_rules.config_migrations 3685 if ( 3686 model.config_normalization_rules 3687 and model.config_normalization_rules.config_migrations 3688 ) 3689 else [] 3690 ) 3691 ] 3692 config_transformations = [ 3693 self._create_component_from_model(transformation, config) 3694 for transformation in ( 3695 model.config_normalization_rules.transformations 3696 if ( 3697 model.config_normalization_rules 3698 and model.config_normalization_rules.transformations 3699 ) 3700 else [] 3701 ) 3702 ] 3703 config_validations = [ 3704 self._create_component_from_model(validation, config) 3705 for validation in ( 3706 model.config_normalization_rules.validations 3707 if ( 3708 model.config_normalization_rules 3709 and model.config_normalization_rules.validations 3710 ) 3711 else [] 3712 ) 3713 ] 3714 3715 return Spec( 3716 connection_specification=model.connection_specification, 3717 documentation_url=model.documentation_url, 3718 advanced_auth=model.advanced_auth, 3719 parameters={}, 3720 config_migrations=config_migrations, 3721 config_transformations=config_transformations, 3722 config_validations=config_validations, 3723 ) 3724 3725 def create_substream_partition_router( 3726 self, 3727 model: SubstreamPartitionRouterModel, 3728 config: Config, 3729 *, 3730 stream_name: str, 3731 **kwargs: Any, 3732 ) -> SubstreamPartitionRouter: 3733 parent_stream_configs = [] 3734 if model.parent_stream_configs: 3735 parent_stream_configs.extend( 3736 [ 3737 self.create_parent_stream_config_with_substream_wrapper( 3738 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 3739 ) 3740 for parent_stream_config in model.parent_stream_configs 3741 ] 3742 ) 3743 3744 return SubstreamPartitionRouter( 3745 parent_stream_configs=parent_stream_configs, 3746 parameters=model.parameters or {}, 3747 config=config, 3748 ) 3749 3750 def create_parent_stream_config_with_substream_wrapper( 3751 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 3752 ) -> Any: 3753 # getting the parent state 
3754 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 3755 3756 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3757 has_parent_state = bool( 3758 self._connector_state_manager.get_stream_state(stream_name, None) 3759 if model.incremental_dependency 3760 else False 3761 ) 3762 connector_state_manager = self._instantiate_parent_stream_state_manager( 3763 child_state, config, model, has_parent_state 3764 ) 3765 3766 substream_factory = ModelToComponentFactory( 3767 connector_state_manager=connector_state_manager, 3768 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3769 limit_slices_fetched=self._limit_slices_fetched, 3770 emit_connector_builder_messages=self._emit_connector_builder_messages, 3771 disable_retries=self._disable_retries, 3772 disable_cache=self._disable_cache, 3773 message_repository=StateFilteringMessageRepository( 3774 LogAppenderMessageRepositoryDecorator( 3775 { 3776 "airbyte_cdk": {"stream": {"is_substream": True}}, 3777 "http": {"is_auxiliary": True}, 3778 }, 3779 self._message_repository, 3780 self._evaluate_log_level(self._emit_connector_builder_messages), 3781 ), 3782 ), 3783 ) 3784 3785 return substream_factory.create_parent_stream_config( 3786 model=model, config=config, stream_name=stream_name, **kwargs 3787 ) 3788 3789 def _instantiate_parent_stream_state_manager( 3790 self, 3791 child_state: MutableMapping[str, Any], 3792 config: Config, 3793 model: ParentStreamConfigModel, 3794 has_parent_state: bool, 3795 ) -> ConnectorStateManager: 3796 """ 3797 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 3798 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 3799 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 3800 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 3801 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 3802 incremental_dependency is set. 
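As an illustration (state shapes simplified, field names hypothetical): a child state such as {"parent_state": {"parent_stream": {"updated_at": "2024-01-01"}}, ...} provides the parent state directly, while a legacy flat child state such as {"updated_at": "2024-01-01"} is re-wrapped into an AirbyteStateMessage for the parent stream using the parent's cursor field.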
3803 """ 3804 if model.incremental_dependency and child_state: 3805 parent_stream_name = model.stream.name or "" 3806 parent_state = ConcurrentPerPartitionCursor.get_parent_state( 3807 child_state, parent_stream_name 3808 ) 3809 3810 if not parent_state: 3811 # there are two migration cases: state value from child stream or from global state 3812 parent_state = ConcurrentPerPartitionCursor.get_global_state( 3813 child_state, parent_stream_name 3814 ) 3815 3816 if not parent_state and not isinstance(parent_state, dict): 3817 cursor_values = child_state.values() 3818 if cursor_values: 3819 incremental_sync_model: Union[ 3820 DatetimeBasedCursorModel, 3821 IncrementingCountCursorModel, 3822 ] = ( 3823 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 3824 if isinstance(model.stream, DeclarativeStreamModel) 3825 else self._get_state_delegating_stream_model( 3826 has_parent_state, model.stream 3827 ).incremental_sync 3828 ) 3829 cursor_field = InterpolatedString.create( 3830 incremental_sync_model.cursor_field, 3831 parameters=incremental_sync_model.parameters or {}, 3832 ).eval(config) 3833 parent_state = AirbyteStateMessage( 3834 type=AirbyteStateType.STREAM, 3835 stream=AirbyteStreamState( 3836 stream_descriptor=StreamDescriptor( 3837 name=parent_stream_name, namespace=None 3838 ), 3839 stream_state=AirbyteStateBlob( 3840 {cursor_field: list(cursor_values)[0]} 3841 ), 3842 ), 3843 ) 3844 return ConnectorStateManager([parent_state] if parent_state else []) 3845 3846 return ConnectorStateManager([]) 3847 3848 @staticmethod 3849 def create_wait_time_from_header( 3850 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3851 ) -> WaitTimeFromHeaderBackoffStrategy: 3852 return WaitTimeFromHeaderBackoffStrategy( 3853 header=model.header, 3854 parameters=model.parameters or {}, 3855 config=config, 3856 regex=model.regex, 3857 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3858 if model.max_waiting_time_in_seconds is not None 3859 else None, 3860 ) 3861 3862 @staticmethod 3863 def create_wait_until_time_from_header( 3864 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3865 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3866 return WaitUntilTimeFromHeaderBackoffStrategy( 3867 header=model.header, 3868 parameters=model.parameters or {}, 3869 config=config, 3870 min_wait=model.min_wait, 3871 regex=model.regex, 3872 ) 3873 3874 def get_message_repository(self) -> MessageRepository: 3875 return self._message_repository 3876 3877 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3878 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3879 3880 @staticmethod 3881 def create_components_mapping_definition( 3882 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3883 ) -> ComponentMappingDefinition: 3884 interpolated_value = InterpolatedString.create( 3885 model.value, parameters=model.parameters or {} 3886 ) 3887 field_path = [ 3888 InterpolatedString.create(path, parameters=model.parameters or {}) 3889 for path in model.field_path 3890 ] 3891 return ComponentMappingDefinition( 3892 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3893 value=interpolated_value, 3894 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3895 create_or_update=model.create_or_update, 3896 condition=model.condition, 3897 
parameters=model.parameters or {}, 3898 ) 3899 3900 def create_http_components_resolver( 3901 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 3902 ) -> Any: 3903 retriever = self._create_component_from_model( 3904 model=model.retriever, 3905 config=config, 3906 name=f"{stream_name if stream_name else '__http_components_resolver'}", 3907 primary_key=None, 3908 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 3909 transformations=[], 3910 ) 3911 3912 components_mapping = [] 3913 for component_mapping_definition_model in model.components_mapping: 3914 if component_mapping_definition_model.condition: 3915 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 3916 components_mapping.append( 3917 self._create_component_from_model( 3918 model=component_mapping_definition_model, 3919 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3920 component_mapping_definition_model.value_type 3921 ), 3922 config=config, 3923 ) 3924 ) 3925 3926 return HttpComponentsResolver( 3927 retriever=retriever, 3928 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 3929 config=config, 3930 components_mapping=components_mapping, 3931 parameters=model.parameters or {}, 3932 ) 3933 3934 @staticmethod 3935 def create_stream_config( 3936 model: StreamConfigModel, config: Config, **kwargs: Any 3937 ) -> StreamConfig: 3938 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3939 [x for x in model.configs_pointer] if model.configs_pointer else [] 3940 ) 3941 3942 return StreamConfig( 3943 configs_pointer=model_configs_pointer, 3944 default_values=model.default_values, 3945 parameters=model.parameters or {}, 3946 ) 3947 3948 def create_config_components_resolver( 3949 self, 3950 model: ConfigComponentsResolverModel, 3951 config: Config, 3952 ) -> Any: 3953 model_stream_configs = ( 3954 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3955 ) 3956 3957 stream_configs = [ 3958 self._create_component_from_model( 3959 stream_config, config=config, parameters=model.parameters or {} 3960 ) 3961 for stream_config in model_stream_configs 3962 ] 3963 3964 components_mapping = [ 3965 self._create_component_from_model( 3966 model=components_mapping_definition_model, 3967 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3968 components_mapping_definition_model.value_type 3969 ), 3970 config=config, 3971 parameters=model.parameters, 3972 ) 3973 for components_mapping_definition_model in model.components_mapping 3974 ] 3975 3976 return ConfigComponentsResolver( 3977 stream_configs=stream_configs, 3978 config=config, 3979 components_mapping=components_mapping, 3980 parameters=model.parameters or {}, 3981 ) 3982 3983 def create_parametrized_components_resolver( 3984 self, 3985 model: ParametrizedComponentsResolverModel, 3986 config: Config, 3987 ) -> ParametrizedComponentsResolver: 3988 stream_parameters = StreamParametersDefinition( 3989 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 3990 ) 3991 3992 components_mapping = [] 3993 for components_mapping_definition_model in model.components_mapping: 3994 if components_mapping_definition_model.condition: 3995 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 3996 components_mapping.append( 3997 self._create_component_from_model( 3998 model=components_mapping_definition_model, 3999 
value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4000 components_mapping_definition_model.value_type 4001 ), 4002 config=config, 4003 ) 4004 ) 4005 return ParametrizedComponentsResolver( 4006 stream_parameters=stream_parameters, 4007 config=config, 4008 components_mapping=components_mapping, 4009 parameters=model.parameters or {}, 4010 ) 4011 4012 _UNSUPPORTED_DECODER_ERROR = ( 4013 "Specified decoder of {decoder_type} is not supported for pagination." 4014 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4015 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4016 ) 4017 4018 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4019 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4020 return True 4021 elif isinstance(decoder, CompositeRawDecoder): 4022 return self._is_supported_parser_for_pagination(decoder.parser) 4023 else: 4024 return False 4025 4026 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4027 if isinstance(parser, JsonParser): 4028 return True 4029 elif isinstance(parser, GzipParser): 4030 return isinstance(parser.inner_parser, JsonParser) 4031 else: 4032 return False 4033 4034 def create_http_api_budget( 4035 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4036 ) -> HttpAPIBudget: 4037 policies = [ 4038 self._create_component_from_model(model=policy, config=config) 4039 for policy in model.policies 4040 ] 4041 4042 return HttpAPIBudget( 4043 policies=policies, 4044 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4045 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4046 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4047 ) 4048 4049 def create_fixed_window_call_rate_policy( 4050 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4051 ) -> FixedWindowCallRatePolicy: 4052 matchers = [ 4053 self._create_component_from_model(model=matcher, config=config) 4054 for matcher in model.matchers 4055 ] 4056 4057 # Set the initial reset timestamp to 10 days from now. 4058 # This value will be updated by the first request. 
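# An illustrative manifest fragment for this policy (example values, not taken from a real connector):
# {"type": "FixedWindowCallRatePolicy", "period": "PT1H", "call_limit": 1000, "matchers": [{"type": "HttpRequestRegexMatcher", "url_path_pattern": "/items"}]}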
4059 return FixedWindowCallRatePolicy( 4060 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4061 period=parse_duration(model.period), 4062 call_limit=model.call_limit, 4063 matchers=matchers, 4064 ) 4065 4066 def create_file_uploader( 4067 self, model: FileUploaderModel, config: Config, **kwargs: Any 4068 ) -> FileUploader: 4069 name = "File Uploader" 4070 requester = self._create_component_from_model( 4071 model=model.requester, 4072 config=config, 4073 name=name, 4074 **kwargs, 4075 ) 4076 download_target_extractor = self._create_component_from_model( 4077 model=model.download_target_extractor, 4078 config=config, 4079 name=name, 4080 **kwargs, 4081 ) 4082 emit_connector_builder_messages = self._emit_connector_builder_messages 4083 file_uploader = DefaultFileUploader( 4084 requester=requester, 4085 download_target_extractor=download_target_extractor, 4086 config=config, 4087 file_writer=NoopFileWriter() 4088 if emit_connector_builder_messages 4089 else LocalFileSystemFileWriter(), 4090 parameters=model.parameters or {}, 4091 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4092 ) 4093 4094 return ( 4095 ConnectorBuilderFileUploader(file_uploader) 4096 if emit_connector_builder_messages 4097 else file_uploader 4098 ) 4099 4100 def create_moving_window_call_rate_policy( 4101 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4102 ) -> MovingWindowCallRatePolicy: 4103 rates = [ 4104 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4105 ] 4106 matchers = [ 4107 self._create_component_from_model(model=matcher, config=config) 4108 for matcher in model.matchers 4109 ] 4110 return MovingWindowCallRatePolicy( 4111 rates=rates, 4112 matchers=matchers, 4113 ) 4114 4115 def create_unlimited_call_rate_policy( 4116 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4117 ) -> UnlimitedCallRatePolicy: 4118 matchers = [ 4119 self._create_component_from_model(model=matcher, config=config) 4120 for matcher in model.matchers 4121 ] 4122 4123 return UnlimitedCallRatePolicy( 4124 matchers=matchers, 4125 ) 4126 4127 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4128 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4129 return Rate( 4130 limit=int(interpolated_limit.eval(config=config)), 4131 interval=parse_duration(model.interval), 4132 ) 4133 4134 def create_http_request_matcher( 4135 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4136 ) -> HttpRequestRegexMatcher: 4137 return HttpRequestRegexMatcher( 4138 method=model.method, 4139 url_base=model.url_base, 4140 url_path_pattern=model.url_path_pattern, 4141 params=model.params, 4142 headers=model.headers, 4143 ) 4144 4145 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4146 self._api_budget = self.create_component( 4147 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4148 ) 4149 4150 def create_grouping_partition_router( 4151 self, 4152 model: GroupingPartitionRouterModel, 4153 config: Config, 4154 *, 4155 stream_name: str, 4156 **kwargs: Any, 4157 ) -> GroupingPartitionRouter: 4158 underlying_router = self._create_component_from_model( 4159 model=model.underlying_partition_router, 4160 config=config, 4161 stream_name=stream_name, 4162 **kwargs, 4163 ) 4164 if model.group_size < 1: 4165 raise ValueError(f"Group size must be greater than 0, got 
{model.group_size}") 4166 4167 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4168 # because they are specific to individual partitions and cannot be aggregated or handled 4169 # when grouping, potentially leading to incorrect API calls. Any request customization 4170 # should be managed at the stream level through the requester's configuration. 4171 if isinstance(underlying_router, SubstreamPartitionRouter): 4172 if any( 4173 parent_config.request_option 4174 for parent_config in underlying_router.parent_stream_configs 4175 ): 4176 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4177 4178 if isinstance(underlying_router, ListPartitionRouter): 4179 if underlying_router.request_option: 4180 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4181 4182 return GroupingPartitionRouter( 4183 group_size=model.group_size, 4184 underlying_partition_router=underlying_router, 4185 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4186 config=config, 4187 )
649class ModelToComponentFactory: 650 EPOCH_DATETIME_FORMAT = "%s" 651 652 def __init__( 653 self, 654 limit_pages_fetched_per_slice: Optional[int] = None, 655 limit_slices_fetched: Optional[int] = None, 656 emit_connector_builder_messages: bool = False, 657 disable_retries: bool = False, 658 disable_cache: bool = False, 659 message_repository: Optional[MessageRepository] = None, 660 connector_state_manager: Optional[ConnectorStateManager] = None, 661 max_concurrent_async_job_count: Optional[int] = None, 662 ): 663 self._init_mappings() 664 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 665 self._limit_slices_fetched = limit_slices_fetched 666 self._emit_connector_builder_messages = emit_connector_builder_messages 667 self._disable_retries = disable_retries 668 self._disable_cache = disable_cache 669 self._message_repository = message_repository or InMemoryMessageRepository( 670 self._evaluate_log_level(emit_connector_builder_messages) 671 ) 672 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 673 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 674 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 675 # placeholder for deprecation warnings 676 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 677 678 def _init_mappings(self) -> None: 679 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 680 AddedFieldDefinitionModel: self.create_added_field_definition, 681 AddFieldsModel: self.create_add_fields, 682 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 683 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 684 BearerAuthenticatorModel: self.create_bearer_authenticator, 685 CheckStreamModel: self.create_check_stream, 686 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 687 CheckDynamicStreamModel: self.create_check_dynamic_stream, 688 CompositeErrorHandlerModel: self.create_composite_error_handler, 689 ConcurrencyLevelModel: self.create_concurrency_level, 690 ConfigMigrationModel: self.create_config_migration, 691 ConfigAddFieldsModel: self.create_config_add_fields, 692 ConfigRemapFieldModel: self.create_config_remap_field, 693 ConfigRemoveFieldsModel: self.create_config_remove_fields, 694 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 695 CsvDecoderModel: self.create_csv_decoder, 696 CursorPaginationModel: self.create_cursor_pagination, 697 CustomAuthenticatorModel: self.create_custom_component, 698 CustomBackoffStrategyModel: self.create_custom_component, 699 CustomDecoderModel: self.create_custom_component, 700 CustomErrorHandlerModel: self.create_custom_component, 701 CustomRecordExtractorModel: self.create_custom_component, 702 CustomRecordFilterModel: self.create_custom_component, 703 CustomRequesterModel: self.create_custom_component, 704 CustomRetrieverModel: self.create_custom_component, 705 CustomSchemaLoader: self.create_custom_component, 706 CustomSchemaNormalizationModel: self.create_custom_component, 707 CustomStateMigration: self.create_custom_component, 708 CustomPaginationStrategyModel: self.create_custom_component, 709 CustomPartitionRouterModel: self.create_custom_component, 710 CustomTransformationModel: self.create_custom_component, 711 CustomValidationStrategyModel: self.create_custom_component, 712 CustomConfigTransformationModel: self.create_custom_component, 713 DatetimeBasedCursorModel: self.create_datetime_based_cursor, 714 
DeclarativeStreamModel: self.create_default_stream, 715 DefaultErrorHandlerModel: self.create_default_error_handler, 716 DefaultPaginatorModel: self.create_default_paginator, 717 DpathExtractorModel: self.create_dpath_extractor, 718 DpathValidatorModel: self.create_dpath_validator, 719 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 720 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 721 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 722 GroupByKeyMergeStrategyModel: self.create_group_by_key, 723 HttpRequesterModel: self.create_http_requester, 724 HttpResponseFilterModel: self.create_http_response_filter, 725 InlineSchemaLoaderModel: self.create_inline_schema_loader, 726 JsonDecoderModel: self.create_json_decoder, 727 JsonlDecoderModel: self.create_jsonl_decoder, 728 GzipDecoderModel: self.create_gzip_decoder, 729 KeysToLowerModel: self.create_keys_to_lower_transformation, 730 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 731 KeysReplaceModel: self.create_keys_replace_transformation, 732 FlattenFieldsModel: self.create_flatten_fields, 733 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 734 IterableDecoderModel: self.create_iterable_decoder, 735 IncrementingCountCursorModel: self.create_incrementing_count_cursor, 736 XmlDecoderModel: self.create_xml_decoder, 737 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 738 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 739 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 740 TypesMapModel: self.create_types_map, 741 ComplexFieldTypeModel: self.create_complex_field_type, 742 JwtAuthenticatorModel: self.create_jwt_authenticator, 743 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 744 ListPartitionRouterModel: self.create_list_partition_router, 745 MinMaxDatetimeModel: self.create_min_max_datetime, 746 NoAuthModel: self.create_no_auth, 747 NoPaginationModel: self.create_no_pagination, 748 OAuthAuthenticatorModel: self.create_oauth_authenticator, 749 OffsetIncrementModel: self.create_offset_increment, 750 PageIncrementModel: self.create_page_increment, 751 ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper, 752 PredicateValidatorModel: self.create_predicate_validator, 753 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 754 PropertyChunkingModel: self.create_property_chunking, 755 QueryPropertiesModel: self.create_query_properties, 756 RecordFilterModel: self.create_record_filter, 757 RecordSelectorModel: self.create_record_selector, 758 RemoveFieldsModel: self.create_remove_fields, 759 RequestPathModel: self.create_request_path, 760 RequestOptionModel: self.create_request_option, 761 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 762 SelectiveAuthenticatorModel: self.create_selective_authenticator, 763 SimpleRetrieverModel: self.create_simple_retriever, 764 StateDelegatingStreamModel: self.create_state_delegating_stream, 765 SpecModel: self.create_spec, 766 SubstreamPartitionRouterModel: self.create_substream_partition_router, 767 ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema, 768 WaitTimeFromHeaderModel: self.create_wait_time_from_header, 769 WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, 770 AsyncRetrieverModel: self.create_async_retriever, 771 HttpComponentsResolverModel: self.create_http_components_resolver, 772 
ConfigComponentsResolverModel: self.create_config_components_resolver, 773 ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver, 774 StreamConfigModel: self.create_stream_config, 775 ComponentMappingDefinitionModel: self.create_components_mapping_definition, 776 ZipfileDecoderModel: self.create_zipfile_decoder, 777 HTTPAPIBudgetModel: self.create_http_api_budget, 778 FileUploaderModel: self.create_file_uploader, 779 FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, 780 MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, 781 UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, 782 RateModel: self.create_rate, 783 HttpRequestRegexMatcherModel: self.create_http_request_matcher, 784 GroupingPartitionRouterModel: self.create_grouping_partition_router, 785 } 786 787 # Needed for the case where we need to perform a second parse on the fields of a custom component 788 self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR} 789 790 def create_component( 791 self, 792 model_type: Type[BaseModel], 793 component_definition: ComponentDefinition, 794 config: Config, 795 **kwargs: Any, 796 ) -> Any: 797 """ 798 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 799 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 800 declarative components from that model. 801 802 :param model_type: The type of declarative component that is being initialized 803 :param component_definition: The mapping that represents a declarative component 804 :param config: The connector config that is provided by the customer 805 :return: The declarative component to be used at runtime 806 """ 807 808 component_type = component_definition.get("type") 809 if component_definition.get("type") != model_type.__name__: 810 raise ValueError( 811 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 812 ) 813 814 declarative_component_model = model_type.parse_obj(component_definition) 815 816 if not isinstance(declarative_component_model, model_type): 817 raise ValueError( 818 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 819 ) 820 821 return self._create_component_from_model( 822 model=declarative_component_model, config=config, **kwargs 823 ) 824 825 def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: 826 if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: 827 raise ValueError( 828 f"{model.__class__} with attributes {model} is not a valid component type" 829 ) 830 component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) 831 if not component_constructor: 832 raise ValueError(f"Could not find constructor for {model.__class__}") 833 834 # collect deprecation warnings for supported models. 835 if isinstance(model, BaseModelWithDeprecations): 836 self._collect_model_deprecations(model) 837 838 return component_constructor(model=model, config=config, **kwargs) 839 840 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 841 """ 842 Returns the deprecation warnings that were collected during the creation of components.
843 """ 844 return self._collected_deprecation_logs 845 846 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 847 """ 848 Collects deprecation logs from the given model and appends any new logs to the internal collection. 849 850 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 851 852 Args: 853 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 854 """ 855 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 856 for log in model._deprecation_logs: 857 # avoid duplicates for deprecation logs observed. 858 if log not in self._collected_deprecation_logs: 859 self._collected_deprecation_logs.append(log) 860 861 def create_config_migration( 862 self, model: ConfigMigrationModel, config: Config 863 ) -> ConfigMigration: 864 transformations: List[ConfigTransformation] = [ 865 self._create_component_from_model(transformation, config) 866 for transformation in model.transformations 867 ] 868 869 return ConfigMigration( 870 description=model.description, 871 transformations=transformations, 872 ) 873 874 def create_config_add_fields( 875 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 876 ) -> ConfigAddFields: 877 fields = [self._create_component_from_model(field, config) for field in model.fields] 878 return ConfigAddFields( 879 fields=fields, 880 condition=model.condition or "", 881 ) 882 883 @staticmethod 884 def create_config_remove_fields( 885 model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any 886 ) -> ConfigRemoveFields: 887 return ConfigRemoveFields( 888 field_pointers=model.field_pointers, 889 condition=model.condition or "", 890 ) 891 892 @staticmethod 893 def create_config_remap_field( 894 model: ConfigRemapFieldModel, config: Config, **kwargs: Any 895 ) -> ConfigRemapField: 896 mapping = cast(Mapping[str, Any], model.map) 897 return ConfigRemapField( 898 map=mapping, 899 field_path=model.field_path, 900 config=config, 901 ) 902 903 def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator: 904 strategy = self._create_component_from_model(model.validation_strategy, config) 905 906 return DpathValidator( 907 field_path=model.field_path, 908 strategy=strategy, 909 ) 910 911 def create_predicate_validator( 912 self, model: PredicateValidatorModel, config: Config 913 ) -> PredicateValidator: 914 strategy = self._create_component_from_model(model.validation_strategy, config) 915 916 return PredicateValidator( 917 value=model.value, 918 strategy=strategy, 919 ) 920 921 @staticmethod 922 def create_validate_adheres_to_schema( 923 model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any 924 ) -> ValidateAdheresToSchema: 925 base_schema = cast(Mapping[str, Any], model.base_schema) 926 return ValidateAdheresToSchema( 927 schema=base_schema, 928 ) 929 930 @staticmethod 931 def create_added_field_definition( 932 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 933 ) -> AddedFieldDefinition: 934 interpolated_value = InterpolatedString.create( 935 model.value, parameters=model.parameters or {} 936 ) 937 return AddedFieldDefinition( 938 path=model.path, 939 value=interpolated_value, 940 
value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 941 parameters=model.parameters or {}, 942 ) 943 944 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 945 added_field_definitions = [ 946 self._create_component_from_model( 947 model=added_field_definition_model, 948 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 949 added_field_definition_model.value_type 950 ), 951 config=config, 952 ) 953 for added_field_definition_model in model.fields 954 ] 955 return AddFields( 956 fields=added_field_definitions, 957 condition=model.condition or "", 958 parameters=model.parameters or {}, 959 ) 960 961 def create_keys_to_lower_transformation( 962 self, model: KeysToLowerModel, config: Config, **kwargs: Any 963 ) -> KeysToLowerTransformation: 964 return KeysToLowerTransformation() 965 966 def create_keys_to_snake_transformation( 967 self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any 968 ) -> KeysToSnakeCaseTransformation: 969 return KeysToSnakeCaseTransformation() 970 971 def create_keys_replace_transformation( 972 self, model: KeysReplaceModel, config: Config, **kwargs: Any 973 ) -> KeysReplaceTransformation: 974 return KeysReplaceTransformation( 975 old=model.old, new=model.new, parameters=model.parameters or {} 976 ) 977 978 def create_flatten_fields( 979 self, model: FlattenFieldsModel, config: Config, **kwargs: Any 980 ) -> FlattenFields: 981 return FlattenFields( 982 flatten_lists=model.flatten_lists if model.flatten_lists is not None else True 983 ) 984 985 def create_dpath_flatten_fields( 986 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 987 ) -> DpathFlattenFields: 988 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 989 key_transformation = ( 990 KeyTransformation( 991 config=config, 992 prefix=model.key_transformation.prefix, 993 suffix=model.key_transformation.suffix, 994 parameters=model.parameters or {}, 995 ) 996 if model.key_transformation is not None 997 else None 998 ) 999 return DpathFlattenFields( 1000 config=config, 1001 field_path=model_field_path, 1002 delete_origin_value=model.delete_origin_value 1003 if model.delete_origin_value is not None 1004 else False, 1005 replace_record=model.replace_record if model.replace_record is not None else False, 1006 key_transformation=key_transformation, 1007 parameters=model.parameters or {}, 1008 ) 1009 1010 @staticmethod 1011 def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]: 1012 if not value_type: 1013 return None 1014 names_to_types = { 1015 ValueType.string: str, 1016 ValueType.number: float, 1017 ValueType.integer: int, 1018 ValueType.boolean: bool, 1019 } 1020 return names_to_types[value_type] 1021 1022 def create_api_key_authenticator( 1023 self, 1024 model: ApiKeyAuthenticatorModel, 1025 config: Config, 1026 token_provider: Optional[TokenProvider] = None, 1027 **kwargs: Any, 1028 ) -> ApiKeyAuthenticator: 1029 if model.inject_into is None and model.header is None: 1030 raise ValueError( 1031 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1032 ) 1033 1034 if model.inject_into is not None and model.header is not None: 1035 raise ValueError( 1036 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1037 ) 1038 1039 if token_provider is not None and model.api_token != "": 1040 raise ValueError( 1041 "If token_provider is set, api_token is ignored 
and has to be set to empty string." 1042 ) 1043 1044 request_option = ( 1045 self._create_component_from_model( 1046 model.inject_into, config, parameters=model.parameters or {} 1047 ) 1048 if model.inject_into 1049 else RequestOption( 1050 inject_into=RequestOptionType.header, 1051 field_name=model.header or "", 1052 parameters=model.parameters or {}, 1053 ) 1054 ) 1055 1056 return ApiKeyAuthenticator( 1057 token_provider=( 1058 token_provider 1059 if token_provider is not None 1060 else InterpolatedStringTokenProvider( 1061 api_token=model.api_token or "", 1062 config=config, 1063 parameters=model.parameters or {}, 1064 ) 1065 ), 1066 request_option=request_option, 1067 config=config, 1068 parameters=model.parameters or {}, 1069 ) 1070 1071 def create_legacy_to_per_partition_state_migration( 1072 self, 1073 model: LegacyToPerPartitionStateMigrationModel, 1074 config: Mapping[str, Any], 1075 declarative_stream: DeclarativeStreamModel, 1076 ) -> LegacyToPerPartitionStateMigration: 1077 retriever = declarative_stream.retriever 1078 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1079 raise ValueError( 1080 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1081 ) 1082 partition_router = retriever.partition_router 1083 if not isinstance( 1084 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1085 ): 1086 raise ValueError( 1087 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1088 ) 1089 if not hasattr(partition_router, "parent_stream_configs"): 1090 raise ValueError( 1091 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1092 ) 1093 1094 if not hasattr(declarative_stream, "incremental_sync"): 1095 raise ValueError( 1096 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1097 ) 1098 1099 return LegacyToPerPartitionStateMigration( 1100 partition_router, # type: ignore # was already checked above 1101 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 
1102 config, 1103 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1104 ) 1105 1106 def create_session_token_authenticator( 1107 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1108 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1109 decoder = ( 1110 self._create_component_from_model(model=model.decoder, config=config) 1111 if model.decoder 1112 else JsonDecoder(parameters={}) 1113 ) 1114 login_requester = self._create_component_from_model( 1115 model=model.login_requester, 1116 config=config, 1117 name=f"{name}_login_requester", 1118 decoder=decoder, 1119 ) 1120 token_provider = SessionTokenProvider( 1121 login_requester=login_requester, 1122 session_token_path=model.session_token_path, 1123 expiration_duration=parse_duration(model.expiration_duration) 1124 if model.expiration_duration 1125 else None, 1126 parameters=model.parameters or {}, 1127 message_repository=self._message_repository, 1128 decoder=decoder, 1129 ) 1130 if model.request_authentication.type == "Bearer": 1131 return ModelToComponentFactory.create_bearer_authenticator( 1132 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1133 config, 1134 token_provider=token_provider, 1135 ) 1136 else: 1137 return self.create_api_key_authenticator( 1138 ApiKeyAuthenticatorModel( 1139 type="ApiKeyAuthenticator", 1140 api_token="", 1141 inject_into=model.request_authentication.inject_into, 1142 ), # type: ignore # $parameters and headers default to None 1143 config=config, 1144 token_provider=token_provider, 1145 ) 1146 1147 @staticmethod 1148 def create_basic_http_authenticator( 1149 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1150 ) -> BasicHttpAuthenticator: 1151 return BasicHttpAuthenticator( 1152 password=model.password or "", 1153 username=model.username, 1154 config=config, 1155 parameters=model.parameters or {}, 1156 ) 1157 1158 @staticmethod 1159 def create_bearer_authenticator( 1160 model: BearerAuthenticatorModel, 1161 config: Config, 1162 token_provider: Optional[TokenProvider] = None, 1163 **kwargs: Any, 1164 ) -> BearerAuthenticator: 1165 if token_provider is not None and model.api_token != "": 1166 raise ValueError( 1167 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1168 ) 1169 return BearerAuthenticator( 1170 token_provider=( 1171 token_provider 1172 if token_provider is not None 1173 else InterpolatedStringTokenProvider( 1174 api_token=model.api_token or "", 1175 config=config, 1176 parameters=model.parameters or {}, 1177 ) 1178 ), 1179 config=config, 1180 parameters=model.parameters or {}, 1181 ) 1182 1183 @staticmethod 1184 def create_dynamic_stream_check_config( 1185 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1186 ) -> DynamicStreamCheckConfig: 1187 return DynamicStreamCheckConfig( 1188 dynamic_stream_name=model.dynamic_stream_name, 1189 stream_count=model.stream_count or 0, 1190 ) 1191 1192 def create_check_stream( 1193 self, model: CheckStreamModel, config: Config, **kwargs: Any 1194 ) -> CheckStream: 1195 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1196 raise ValueError( 1197 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1198 ) 1199 1200 dynamic_streams_check_configs = ( 1201 [ 1202 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1203 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1204 ] 1205 if model.dynamic_streams_check_configs 1206 else [] 1207 ) 1208 1209 return CheckStream( 1210 stream_names=model.stream_names or [], 1211 dynamic_streams_check_configs=dynamic_streams_check_configs, 1212 parameters={}, 1213 ) 1214 1215 @staticmethod 1216 def create_check_dynamic_stream( 1217 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1218 ) -> CheckDynamicStream: 1219 assert model.use_check_availability is not None # for mypy 1220 1221 use_check_availability = model.use_check_availability 1222 1223 return CheckDynamicStream( 1224 stream_count=model.stream_count, 1225 use_check_availability=use_check_availability, 1226 parameters={}, 1227 ) 1228 1229 def create_composite_error_handler( 1230 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1231 ) -> CompositeErrorHandler: 1232 error_handlers = [ 1233 self._create_component_from_model(model=error_handler_model, config=config) 1234 for error_handler_model in model.error_handlers 1235 ] 1236 return CompositeErrorHandler( 1237 error_handlers=error_handlers, parameters=model.parameters or {} 1238 ) 1239 1240 @staticmethod 1241 def create_concurrency_level( 1242 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1243 ) -> ConcurrencyLevel: 1244 return ConcurrencyLevel( 1245 default_concurrency=model.default_concurrency, 1246 max_concurrency=model.max_concurrency, 1247 config=config, 1248 parameters={}, 1249 ) 1250 1251 @staticmethod 1252 def apply_stream_state_migrations( 1253 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1254 ) -> MutableMapping[str, Any]: 1255 if stream_state_migrations: 1256 for state_migration in stream_state_migrations: 1257 if state_migration.should_migrate(stream_state): 1258 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
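# Illustrative sketch of the contract this loop relies on (hypothetical values, not from this module):
#   state_migration.should_migrate({"updated_at": "2024-01-01"}) -> True
#   state_migration.migrate({"updated_at": "2024-01-01"})        -> the migrated, possibly immutable mapping
# The dict() call below re-wraps that result so later migrations can keep treating stream_state as mutable.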
1259 stream_state = dict(state_migration.migrate(stream_state)) 1260 return stream_state 1261 1262 def create_concurrent_cursor_from_datetime_based_cursor( 1263 self, 1264 model_type: Type[BaseModel], 1265 component_definition: ComponentDefinition, 1266 stream_name: str, 1267 stream_namespace: Optional[str], 1268 config: Config, 1269 message_repository: Optional[MessageRepository] = None, 1270 runtime_lookback_window: Optional[datetime.timedelta] = None, 1271 stream_state_migrations: Optional[List[Any]] = None, 1272 **kwargs: Any, 1273 ) -> ConcurrentCursor: 1274 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1275 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1276 # incoming state and connector_state_manager that is initialized when the component factory is created 1277 stream_state = ( 1278 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1279 if "stream_state" not in kwargs 1280 else kwargs["stream_state"] 1281 ) 1282 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1283 1284 component_type = component_definition.get("type") 1285 if component_definition.get("type") != model_type.__name__: 1286 raise ValueError( 1287 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1288 ) 1289 1290 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1291 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1292 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1293 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 
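# Illustrative example of the two shapes described above (field values are hypothetical):
#   from the manifest:   {"type": "DatetimeBasedCursor", "$parameters": {"cursor_field": "updated_at"}, ...}
#   from model.__dict__: {"type": "DatetimeBasedCursor", "parameters": {"cursor_field": "updated_at"}, ...}
# The copy below republishes the value under "$parameters" so parse_obj() sees it under the alias it expects.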
1294 if "$parameters" not in component_definition and "parameters" in component_definition: 1295 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1296 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1297 1298 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1299 raise ValueError( 1300 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1301 ) 1302 1303 model_parameters = datetime_based_cursor_model.parameters or {} 1304 interpolated_cursor_field = InterpolatedString.create( 1305 datetime_based_cursor_model.cursor_field, 1306 parameters=model_parameters, 1307 ) 1308 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1309 1310 interpolated_partition_field_start = InterpolatedString.create( 1311 datetime_based_cursor_model.partition_field_start or "start_time", 1312 parameters=model_parameters, 1313 ) 1314 interpolated_partition_field_end = InterpolatedString.create( 1315 datetime_based_cursor_model.partition_field_end or "end_time", 1316 parameters=model_parameters, 1317 ) 1318 1319 slice_boundary_fields = ( 1320 interpolated_partition_field_start.eval(config=config), 1321 interpolated_partition_field_end.eval(config=config), 1322 ) 1323 1324 datetime_format = datetime_based_cursor_model.datetime_format 1325 1326 cursor_granularity = ( 1327 parse_duration(datetime_based_cursor_model.cursor_granularity) 1328 if datetime_based_cursor_model.cursor_granularity 1329 else None 1330 ) 1331 1332 lookback_window = None 1333 interpolated_lookback_window = ( 1334 InterpolatedString.create( 1335 datetime_based_cursor_model.lookback_window, 1336 parameters=model_parameters, 1337 ) 1338 if datetime_based_cursor_model.lookback_window 1339 else None 1340 ) 1341 if interpolated_lookback_window: 1342 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1343 if evaluated_lookback_window: 1344 lookback_window = parse_duration(evaluated_lookback_window) 1345 1346 connector_state_converter: DateTimeStreamStateConverter 1347 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1348 datetime_format=datetime_format, 1349 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1350 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1351 cursor_granularity=cursor_granularity, 1352 ) 1353 1354 # Adjusts the stream state by applying the runtime lookback window. 1355 # This is used to ensure correct state handling in case of failed partitions. 
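# Illustrative example (hypothetical cursor field and values): with stream_state {"updated_at": "2024-01-10T00:00:00Z"}
# and runtime_lookback_window=timedelta(days=1), the stored value is rewritten to "2024-01-09T00:00:00Z",
# so the re-created cursor re-reads the window that a previously failed partition may not have finished.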
1356 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1357 if runtime_lookback_window and stream_state_value: 1358 new_stream_state = ( 1359 connector_state_converter.parse_timestamp(stream_state_value) 1360 - runtime_lookback_window 1361 ) 1362 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1363 new_stream_state 1364 ) 1365 1366 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1367 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1368 start_date_runtime_value = self.create_min_max_datetime( 1369 model=datetime_based_cursor_model.start_datetime, config=config 1370 ) 1371 else: 1372 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1373 1374 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1375 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1376 end_date_runtime_value = self.create_min_max_datetime( 1377 model=datetime_based_cursor_model.end_datetime, config=config 1378 ) 1379 else: 1380 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1381 1382 interpolated_start_date = MinMaxDatetime.create( 1383 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1384 parameters=datetime_based_cursor_model.parameters, 1385 ) 1386 interpolated_end_date = ( 1387 None 1388 if not end_date_runtime_value 1389 else MinMaxDatetime.create( 1390 end_date_runtime_value, datetime_based_cursor_model.parameters 1391 ) 1392 ) 1393 1394 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1395 if not interpolated_start_date.datetime_format: 1396 interpolated_start_date.datetime_format = datetime_format 1397 if interpolated_end_date and not interpolated_end_date.datetime_format: 1398 interpolated_end_date.datetime_format = datetime_format 1399 1400 start_date = interpolated_start_date.get_datetime(config=config) 1401 end_date_provider = ( 1402 partial(interpolated_end_date.get_datetime, config) 1403 if interpolated_end_date 1404 else connector_state_converter.get_end_provider() 1405 ) 1406 1407 if ( 1408 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1409 ) or ( 1410 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1411 ): 1412 raise ValueError( 1413 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1414 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1415 ) 1416 1417 # When step is not defined, default to a step size from the starting date to the present moment 1418 step_length = datetime.timedelta.max 1419 interpolated_step = ( 1420 InterpolatedString.create( 1421 datetime_based_cursor_model.step, 1422 parameters=model_parameters, 1423 ) 1424 if datetime_based_cursor_model.step 1425 else None 1426 ) 1427 if interpolated_step: 1428 evaluated_step = interpolated_step.eval(config) 1429 if evaluated_step: 1430 step_length = parse_duration(evaluated_step) 1431 1432 clamping_strategy: ClampingStrategy = NoClamping() 1433 if datetime_based_cursor_model.clamping: 1434 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1435 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1436 # object which we want to keep agnostic of being low-code 1437 target = InterpolatedString( 1438 string=datetime_based_cursor_model.clamping.target, 1439 parameters=model_parameters, 1440 ) 1441 evaluated_target = target.eval(config=config) 1442 match evaluated_target: 1443 case "DAY": 1444 clamping_strategy = DayClampingStrategy() 1445 end_date_provider = ClampingEndProvider( 1446 DayClampingStrategy(is_ceiling=False), 1447 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1448 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1449 ) 1450 case "WEEK": 1451 if ( 1452 not datetime_based_cursor_model.clamping.target_details 1453 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1454 ): 1455 raise ValueError( 1456 "Given WEEK clamping, weekday needs to be provided as target_details" 1457 ) 1458 weekday = self._assemble_weekday( 1459 datetime_based_cursor_model.clamping.target_details["weekday"] 1460 ) 1461 clamping_strategy = WeekClampingStrategy(weekday) 1462 end_date_provider = ClampingEndProvider( 1463 WeekClampingStrategy(weekday, is_ceiling=False), 1464 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1465 granularity=cursor_granularity or datetime.timedelta(days=1), 1466 ) 1467 case "MONTH": 1468 clamping_strategy = MonthClampingStrategy() 1469 end_date_provider = ClampingEndProvider( 1470 MonthClampingStrategy(is_ceiling=False), 1471 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1472 granularity=cursor_granularity or datetime.timedelta(days=1), 1473 ) 1474 case _: 1475 raise ValueError( 1476 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1477 ) 1478 1479 return ConcurrentCursor( 1480 stream_name=stream_name, 1481 stream_namespace=stream_namespace, 1482 stream_state=stream_state, 1483 message_repository=message_repository or self._message_repository, 1484 connector_state_manager=self._connector_state_manager, 1485 connector_state_converter=connector_state_converter, 1486 cursor_field=cursor_field, 1487 slice_boundary_fields=slice_boundary_fields, 1488 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1489 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1490 lookback_window=lookback_window, 1491 slice_range=step_length, 1492 cursor_granularity=cursor_granularity, 1493 clamping_strategy=clamping_strategy, 1494 ) 1495 1496 def create_concurrent_cursor_from_incrementing_count_cursor( 1497 self, 1498 model_type: Type[BaseModel], 1499 component_definition: ComponentDefinition, 1500 stream_name: str, 1501 stream_namespace: Optional[str], 1502 config: Config, 1503 message_repository: Optional[MessageRepository] = None, 1504 stream_state_migrations: Optional[List[Any]] = None, 1505 **kwargs: Any, 1506 ) -> ConcurrentCursor: 1507 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1508 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1509 # incoming state and connector_state_manager that is initialized when the component factory is created 1510 stream_state = ( 1511 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1512 if "stream_state" not in kwargs 1513 else kwargs["stream_state"] 1514 ) 1515 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1516 1517 component_type = component_definition.get("type") 1518 if component_definition.get("type") != model_type.__name__: 1519 raise ValueError( 1520 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1521 ) 1522 1523 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1524 1525 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1526 raise ValueError( 1527 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1528 ) 1529 1530 interpolated_start_value = ( 1531 InterpolatedString.create( 1532 incrementing_count_cursor_model.start_value, # type: ignore 1533 parameters=incrementing_count_cursor_model.parameters or {}, 1534 ) 1535 if incrementing_count_cursor_model.start_value 1536 else 0 1537 ) 1538 1539 interpolated_cursor_field = InterpolatedString.create( 1540 incrementing_count_cursor_model.cursor_field, 1541 parameters=incrementing_count_cursor_model.parameters or {}, 1542 ) 1543 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1544 1545 connector_state_converter = IncrementingCountStreamStateConverter( 1546 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1547 ) 1548 1549 return ConcurrentCursor( 1550 stream_name=stream_name, 1551 stream_namespace=stream_namespace, 1552 stream_state=stream_state, 1553 message_repository=message_repository or self._message_repository, 1554 connector_state_manager=self._connector_state_manager, 1555 connector_state_converter=connector_state_converter, 1556 cursor_field=cursor_field, 1557 slice_boundary_fields=None, 1558 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1559 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1560 ) 1561 1562 def _assemble_weekday(self, weekday: str) -> Weekday: 1563 match weekday: 1564 case "MONDAY": 1565 return Weekday.MONDAY 1566 case "TUESDAY": 1567 return Weekday.TUESDAY 1568 case "WEDNESDAY": 1569 return Weekday.WEDNESDAY 1570 case "THURSDAY": 1571 return Weekday.THURSDAY 1572 case "FRIDAY": 1573 return Weekday.FRIDAY 1574 case "SATURDAY": 1575 return Weekday.SATURDAY 1576 case "SUNDAY": 1577 return Weekday.SUNDAY 1578 case _: 1579 raise ValueError(f"Unknown weekday {weekday}") 1580 1581 def create_concurrent_cursor_from_perpartition_cursor( 1582 self, 1583 state_manager: ConnectorStateManager, 1584 model_type: Type[BaseModel], 1585 component_definition: ComponentDefinition, 1586 stream_name: str, 1587 stream_namespace: Optional[str], 1588 config: Config, 1589 stream_state: MutableMapping[str, Any], 1590 partition_router: PartitionRouter, 1591 stream_state_migrations: Optional[List[Any]] = None, 1592 attempt_to_create_cursor_if_not_provided: bool = False, 1593 **kwargs: Any, 1594 ) -> ConcurrentPerPartitionCursor: 1595 component_type = component_definition.get("type") 1596 if component_definition.get("type") != model_type.__name__: 1597 raise ValueError( 1598 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1599 ) 1600 1601 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1602 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1603 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1604 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1605 if "$parameters" not in component_definition and "parameters" in component_definition: 1606 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1607 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1608 1609 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1610 raise ValueError( 1611 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1612 ) 1613 1614 interpolated_cursor_field = InterpolatedString.create( 1615 datetime_based_cursor_model.cursor_field, 1616 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). 
So now we have two cases: 1617 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1618 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1619 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1620 parameters=datetime_based_cursor_model.parameters or {}, 1621 ) 1622 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1623 1624 datetime_format = datetime_based_cursor_model.datetime_format 1625 1626 cursor_granularity = ( 1627 parse_duration(datetime_based_cursor_model.cursor_granularity) 1628 if datetime_based_cursor_model.cursor_granularity 1629 else None 1630 ) 1631 1632 connector_state_converter: DateTimeStreamStateConverter 1633 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1634 datetime_format=datetime_format, 1635 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1636 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1637 cursor_granularity=cursor_granularity, 1638 ) 1639 1640 # Create the cursor factory 1641 cursor_factory = ConcurrentCursorFactory( 1642 partial( 1643 self.create_concurrent_cursor_from_datetime_based_cursor, 1644 state_manager=state_manager, 1645 model_type=model_type, 1646 component_definition=component_definition, 1647 stream_name=stream_name, 1648 stream_namespace=stream_namespace, 1649 config=config, 1650 message_repository=NoopMessageRepository(), 1651 # stream_state_migrations=stream_state_migrations, # FIXME is it expected to run migration on per partition state too? 1652 ) 1653 ) 1654 1655 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1656 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1657 use_global_cursor = isinstance( 1658 partition_router, GroupingPartitionRouter 1659 ) or component_definition.get("global_substream_cursor", False) 1660 1661 # Return the concurrent cursor and state converter 1662 return ConcurrentPerPartitionCursor( 1663 cursor_factory=cursor_factory, 1664 partition_router=partition_router, 1665 stream_name=stream_name, 1666 stream_namespace=stream_namespace, 1667 stream_state=stream_state, 1668 message_repository=self._message_repository, # type: ignore 1669 connector_state_manager=state_manager, 1670 connector_state_converter=connector_state_converter, 1671 cursor_field=cursor_field, 1672 use_global_cursor=use_global_cursor, 1673 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1674 ) 1675 1676 @staticmethod 1677 def create_constant_backoff_strategy( 1678 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1679 ) -> ConstantBackoffStrategy: 1680 return ConstantBackoffStrategy( 1681 backoff_time_in_seconds=model.backoff_time_in_seconds, 1682 config=config, 1683 parameters=model.parameters or {}, 1684 ) 1685 1686 def create_cursor_pagination( 1687 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1688 ) -> CursorPaginationStrategy: 1689 if isinstance(decoder, PaginationDecoderDecorator): 1690 inner_decoder = decoder.decoder 1691 else: 1692 inner_decoder = decoder 1693 decoder = PaginationDecoderDecorator(decoder=decoder) 1694 1695 if self._is_supported_decoder_for_pagination(inner_decoder): 1696 decoder_to_use = decoder 1697 else: 1698 raise 
ValueError( 1699 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1700 ) 1701 1702 return CursorPaginationStrategy( 1703 cursor_value=model.cursor_value, 1704 decoder=decoder_to_use, 1705 page_size=model.page_size, 1706 stop_condition=model.stop_condition, 1707 config=config, 1708 parameters=model.parameters or {}, 1709 ) 1710 1711 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1712 """ 1713 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1714 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1715 :param model: The Pydantic model of the custom component being created 1716 :param config: The custom defined connector config 1717 :return: The declarative component built from the Pydantic model to be used at runtime 1718 """ 1719 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1720 component_fields = get_type_hints(custom_component_class) 1721 model_args = model.dict() 1722 model_args["config"] = config 1723 1724 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1725 # we defer to these arguments over the component's definition 1726 for key, arg in kwargs.items(): 1727 model_args[key] = arg 1728 1729 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1730 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1731 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1732 for model_field, model_value in model_args.items(): 1733 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1734 if ( 1735 isinstance(model_value, dict) 1736 and "type" not in model_value 1737 and model_field in component_fields 1738 ): 1739 derived_type = self._derive_component_type_from_type_hints( 1740 component_fields.get(model_field) 1741 ) 1742 if derived_type: 1743 model_value["type"] = derived_type 1744 1745 if self._is_component(model_value): 1746 model_args[model_field] = self._create_nested_component( 1747 model, 1748 model_field, 1749 model_value, 1750 config, 1751 **kwargs, 1752 ) 1753 elif isinstance(model_value, list): 1754 vals = [] 1755 for v in model_value: 1756 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1757 derived_type = self._derive_component_type_from_type_hints( 1758 component_fields.get(model_field) 1759 ) 1760 if derived_type: 1761 v["type"] = derived_type 1762 if self._is_component(v): 1763 vals.append( 1764 self._create_nested_component( 1765 model, 1766 model_field, 1767 v, 1768 config, 1769 **kwargs, 1770 ) 1771 ) 1772 else: 1773 vals.append(v) 1774 model_args[model_field] = vals 1775 1776 kwargs = { 1777 class_field: model_args[class_field] 1778 for class_field in component_fields.keys() 1779 if class_field in model_args 1780 } 1781 return custom_component_class(**kwargs) 1782 1783 @staticmethod 1784 def _get_class_from_fully_qualified_class_name( 1785 full_qualified_class_name: str, 1786 ) -> Any: 1787 """Get a class from its fully qualified name. 
1788 1789 If a custom components module is needed, we assume it is already registered - probably 1790 as `source_declarative_manifest.components` or `components`. 1791 1792 Args: 1793 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 1794 1795 Returns: 1796 Any: The class object. 1797 1798 Raises: 1799 ValueError: If the class cannot be loaded. 1800 """ 1801 split = full_qualified_class_name.split(".") 1802 module_name_full = ".".join(split[:-1]) 1803 class_name = split[-1] 1804 1805 try: 1806 module_ref = importlib.import_module(module_name_full) 1807 except ModuleNotFoundError as e: 1808 if split[0] == "source_declarative_manifest": 1809 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1810 try: 1811 import os 1812 1813 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1814 module_ref = importlib.import_module( 1815 module_name_with_source_declarative_manifest 1816 ) 1817 except ModuleNotFoundError: 1818 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1819 else: 1820 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1821 1822 try: 1823 return getattr(module_ref, class_name) 1824 except AttributeError as e: 1825 raise ValueError( 1826 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1827 ) from e 1828 1829 @staticmethod 1830 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1831 interface = field_type 1832 while True: 1833 origin = get_origin(interface) 1834 if origin: 1835 # Unnest types until we reach the raw type 1836 # List[T] -> T 1837 # Optional[List[T]] -> T 1838 args = get_args(interface) 1839 interface = args[0] 1840 else: 1841 break 1842 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1843 return interface.__name__ 1844 return None 1845 1846 @staticmethod 1847 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1848 if not cls: 1849 return False 1850 return cls.__module__ == "builtins" 1851 1852 @staticmethod 1853 def _extract_missing_parameters(error: TypeError) -> List[str]: 1854 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1855 if parameter_search: 1856 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1857 else: 1858 return [] 1859 1860 def _create_nested_component( 1861 self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any 1862 ) -> Any: 1863 type_name = model_value.get("type", None) 1864 if not type_name: 1865 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1866 return model_value 1867 1868 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1869 if model_type: 1870 parsed_model = model_type.parse_obj(model_value) 1871 try: 1872 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1873 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1874 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1875 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1876 # generically in create_custom_component(). 
This block allows developers to specify extra arguments in $parameters that 1877 # are needed by a component and could not be shared. 1878 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1879 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1880 model_parameters = model_value.get("$parameters", {}) 1881 matching_parameters = { 1882 kwarg: model_parameters[kwarg] 1883 for kwarg in constructor_kwargs 1884 if kwarg in model_parameters 1885 } 1886 matching_kwargs = { 1887 kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs 1888 } 1889 return self._create_component_from_model( 1890 model=parsed_model, config=config, **(matching_parameters | matching_kwargs) 1891 ) 1892 except TypeError as error: 1893 missing_parameters = self._extract_missing_parameters(error) 1894 if missing_parameters: 1895 raise ValueError( 1896 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1897 + ", ".join( 1898 ( 1899 f"{type_name}.$parameters.{parameter}" 1900 for parameter in missing_parameters 1901 ) 1902 ) 1903 ) 1904 raise TypeError( 1905 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1906 ) 1907 else: 1908 raise ValueError( 1909 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1910 ) 1911 1912 @staticmethod 1913 def _is_component(model_value: Any) -> bool: 1914 return isinstance(model_value, dict) and model_value.get("type") is not None 1915 1916 def create_datetime_based_cursor( 1917 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1918 ) -> DatetimeBasedCursor: 1919 start_datetime: Union[str, MinMaxDatetime] = ( 1920 model.start_datetime 1921 if isinstance(model.start_datetime, str) 1922 else self.create_min_max_datetime(model.start_datetime, config) 1923 ) 1924 end_datetime: Union[str, MinMaxDatetime, None] = None 1925 if model.is_data_feed and model.end_datetime: 1926 raise ValueError("Data feed does not support end_datetime") 1927 if model.is_data_feed and model.is_client_side_incremental: 1928 raise ValueError( 1929 "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 
1930 ) 1931 if model.end_datetime: 1932 end_datetime = ( 1933 model.end_datetime 1934 if isinstance(model.end_datetime, str) 1935 else self.create_min_max_datetime(model.end_datetime, config) 1936 ) 1937 1938 end_time_option = ( 1939 self._create_component_from_model( 1940 model.end_time_option, config, parameters=model.parameters or {} 1941 ) 1942 if model.end_time_option 1943 else None 1944 ) 1945 start_time_option = ( 1946 self._create_component_from_model( 1947 model.start_time_option, config, parameters=model.parameters or {} 1948 ) 1949 if model.start_time_option 1950 else None 1951 ) 1952 1953 return DatetimeBasedCursor( 1954 cursor_field=model.cursor_field, 1955 cursor_datetime_formats=model.cursor_datetime_formats 1956 if model.cursor_datetime_formats 1957 else [], 1958 cursor_granularity=model.cursor_granularity, 1959 datetime_format=model.datetime_format, 1960 end_datetime=end_datetime, 1961 start_datetime=start_datetime, 1962 step=model.step, 1963 end_time_option=end_time_option, 1964 lookback_window=model.lookback_window, 1965 start_time_option=start_time_option, 1966 partition_field_end=model.partition_field_end, 1967 partition_field_start=model.partition_field_start, 1968 message_repository=self._message_repository, 1969 is_compare_strictly=model.is_compare_strictly, 1970 config=config, 1971 parameters=model.parameters or {}, 1972 ) 1973 1974 def create_default_stream( 1975 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 1976 ) -> AbstractStream: 1977 primary_key = model.primary_key.__root__ if model.primary_key else None 1978 1979 partition_router = self._build_stream_slicer_from_partition_router( 1980 model.retriever, 1981 config, 1982 stream_name=model.name, 1983 **kwargs, 1984 ) 1985 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 1986 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1987 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 1988 1989 end_time_option = ( 1990 self._create_component_from_model( 1991 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1992 ) 1993 if cursor_model.end_time_option 1994 else None 1995 ) 1996 start_time_option = ( 1997 self._create_component_from_model( 1998 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1999 ) 2000 if cursor_model.start_time_option 2001 else None 2002 ) 2003 2004 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 2005 start_time_option=start_time_option, 2006 end_time_option=end_time_option, 2007 partition_field_start=cursor_model.partition_field_start, 2008 partition_field_end=cursor_model.partition_field_end, 2009 config=config, 2010 parameters=model.parameters or {}, 2011 ) 2012 request_options_provider = ( 2013 datetime_request_options_provider 2014 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 2015 else PerPartitionRequestOptionsProvider( 2016 partition_router, datetime_request_options_provider 2017 ) 2018 ) 2019 elif model.incremental_sync and isinstance( 2020 model.incremental_sync, IncrementingCountCursorModel 2021 ): 2022 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 2023 raise ValueError( 2024 "PerPartition does not support per partition states because switching to global state is time based" 2025 ) 2026 2027 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 2028 2029 start_time_option = ( 2030 
self._create_component_from_model( 2031 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2032 config, 2033 parameters=cursor_model.parameters or {}, 2034 ) 2035 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2036 else None 2037 ) 2038 2039 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2040 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2041 partition_field_start = "start" 2042 2043 request_options_provider = DatetimeBasedRequestOptionsProvider( 2044 start_time_option=start_time_option, 2045 partition_field_start=partition_field_start, 2046 config=config, 2047 parameters=model.parameters or {}, 2048 ) 2049 else: 2050 request_options_provider = None 2051 2052 transformations = [] 2053 if model.transformations: 2054 for transformation_model in model.transformations: 2055 transformations.append( 2056 self._create_component_from_model(model=transformation_model, config=config) 2057 ) 2058 file_uploader = None 2059 if model.file_uploader: 2060 file_uploader = self._create_component_from_model( 2061 model=model.file_uploader, config=config 2062 ) 2063 2064 stream_slicer: ConcurrentStreamSlicer = ( 2065 partition_router 2066 if isinstance(concurrent_cursor, FinalStateCursor) 2067 else concurrent_cursor 2068 ) 2069 retriever = self._create_component_from_model( 2070 model=model.retriever, 2071 config=config, 2072 name=model.name, 2073 primary_key=primary_key, 2074 request_options_provider=request_options_provider, 2075 stream_slicer=stream_slicer, 2076 partition_router=partition_router, 2077 stop_condition_cursor=concurrent_cursor 2078 if self._is_stop_condition_on_cursor(model) 2079 else None, 2080 client_side_incremental_sync={"cursor": concurrent_cursor} 2081 if self._is_client_side_filtering_enabled(model) 2082 else None, 2083 transformations=transformations, 2084 file_uploader=file_uploader, 2085 incremental_sync=model.incremental_sync, 2086 ) 2087 if isinstance(retriever, AsyncRetriever): 2088 stream_slicer = retriever.stream_slicer 2089 2090 schema_loader: Union[ 2091 CompositeSchemaLoader, 2092 DefaultSchemaLoader, 2093 DynamicSchemaLoader, 2094 InlineSchemaLoader, 2095 JsonFileSchemaLoader, 2096 ] 2097 if model.schema_loader and isinstance(model.schema_loader, list): 2098 nested_schema_loaders = [ 2099 self._create_component_from_model(model=nested_schema_loader, config=config) 2100 for nested_schema_loader in model.schema_loader 2101 ] 2102 schema_loader = CompositeSchemaLoader( 2103 schema_loaders=nested_schema_loaders, parameters={} 2104 ) 2105 elif model.schema_loader: 2106 schema_loader = self._create_component_from_model( 2107 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2108 config=config, 2109 ) 2110 else: 2111 options = model.parameters or {} 2112 if "name" not in options: 2113 options["name"] = model.name 2114 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2115 2116 stream_name = model.name or "" 2117 return DefaultStream( 2118 partition_generator=StreamSlicerPartitionGenerator( 2119 DeclarativePartitionFactory( 2120 stream_name, 2121 schema_loader, 2122 retriever, 2123 self._message_repository, 2124 ), 2125 stream_slicer, 2126 slice_limit=self._limit_slices_fetched, 2127 ), 2128 name=stream_name, 2129 json_schema=schema_loader.get_json_schema, 2130 
primary_key=get_primary_key_from_stream(primary_key), 2131 cursor_field=concurrent_cursor.cursor_field.cursor_field_key 2132 if hasattr(concurrent_cursor, "cursor_field") 2133 else "", # FIXME we should have the cursor field has part of the interface of cursor, 2134 logger=logging.getLogger(f"airbyte.{stream_name}"), 2135 cursor=concurrent_cursor, 2136 supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), 2137 ) 2138 2139 def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool: 2140 return bool( 2141 model.incremental_sync 2142 and hasattr(model.incremental_sync, "is_data_feed") 2143 and model.incremental_sync.is_data_feed 2144 ) 2145 2146 def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool: 2147 return bool( 2148 model.incremental_sync 2149 and hasattr(model.incremental_sync, "is_client_side_incremental") 2150 and model.incremental_sync.is_client_side_incremental 2151 ) 2152 2153 def _build_stream_slicer_from_partition_router( 2154 self, 2155 model: Union[ 2156 AsyncRetrieverModel, 2157 CustomRetrieverModel, 2158 SimpleRetrieverModel, 2159 ], 2160 config: Config, 2161 stream_name: Optional[str] = None, 2162 **kwargs: Any, 2163 ) -> PartitionRouter: 2164 if ( 2165 hasattr(model, "partition_router") 2166 and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel)) 2167 and model.partition_router 2168 ): 2169 stream_slicer_model = model.partition_router 2170 if isinstance(stream_slicer_model, list): 2171 return CartesianProductStreamSlicer( 2172 [ 2173 self._create_component_from_model( 2174 model=slicer, config=config, stream_name=stream_name or "" 2175 ) 2176 for slicer in stream_slicer_model 2177 ], 2178 parameters={}, 2179 ) 2180 elif isinstance(stream_slicer_model, dict): 2181 # partition router comes from CustomRetrieverModel therefore has not been parsed as a model 2182 params = stream_slicer_model.get("$parameters") 2183 if not isinstance(params, dict): 2184 params = {} 2185 stream_slicer_model["$parameters"] = params 2186 2187 if stream_name is not None: 2188 params["stream_name"] = stream_name 2189 2190 return self._create_nested_component( # type: ignore[no-any-return] # There is no guarantee that this will return a stream slicer. 
If not, we expect an AttributeError during the call to `stream_slices` 2191 model, 2192 "partition_router", 2193 stream_slicer_model, 2194 config, 2195 **kwargs, 2196 ) 2197 else: 2198 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2199 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2200 ) 2201 return SinglePartitionRouter(parameters={}) 2202 2203 def _build_concurrent_cursor( 2204 self, 2205 model: DeclarativeStreamModel, 2206 stream_slicer: Optional[PartitionRouter], 2207 config: Config, 2208 ) -> Cursor: 2209 stream_name = model.name or "" 2210 stream_state = self._connector_state_manager.get_stream_state( 2211 stream_name=stream_name, namespace=None 2212 ) 2213 2214 if model.state_migrations: 2215 state_transformations = [ 2216 self._create_component_from_model(state_migration, config, declarative_stream=model) 2217 for state_migration in model.state_migrations 2218 ] 2219 else: 2220 state_transformations = [] 2221 2222 if ( 2223 model.incremental_sync 2224 and stream_slicer 2225 and not isinstance(stream_slicer, SinglePartitionRouter) 2226 ): 2227 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2228 state_manager=self._connector_state_manager, 2229 model_type=DatetimeBasedCursorModel, 2230 component_definition=model.incremental_sync.__dict__, 2231 stream_name=stream_name, 2232 stream_namespace=None, 2233 config=config or {}, 2234 stream_state=stream_state, 2235 stream_state_migrations=state_transformations, 2236 partition_router=stream_slicer, 2237 attempt_to_create_cursor_if_not_provided=True, # FIXME can we remove that now? 2238 ) 2239 elif model.incremental_sync: 2240 if type(model.incremental_sync) == IncrementingCountCursorModel: 2241 return self.create_concurrent_cursor_from_incrementing_count_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2242 model_type=IncrementingCountCursorModel, 2243 component_definition=model.incremental_sync.__dict__, 2244 stream_name=stream_name, 2245 stream_namespace=None, 2246 config=config or {}, 2247 stream_state_migrations=state_transformations, 2248 ) 2249 elif type(model.incremental_sync) == DatetimeBasedCursorModel: 2250 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2251 model_type=type(model.incremental_sync), 2252 component_definition=model.incremental_sync.__dict__, 2253 stream_name=stream_name, 2254 stream_namespace=None, 2255 config=config or {}, 2256 stream_state_migrations=state_transformations, 2257 attempt_to_create_cursor_if_not_provided=True, 2258 ) 2259 else: 2260 raise ValueError( 2261 f"Incremental sync of type {type(model.incremental_sync)} is not supported" 2262 ) 2263 return FinalStateCursor(stream_name, None, self._message_repository) 2264 2265 def create_default_error_handler( 2266 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2267 ) -> DefaultErrorHandler: 2268 backoff_strategies = [] 2269 if model.backoff_strategies: 2270 for backoff_strategy_model in model.backoff_strategies: 2271 backoff_strategies.append( 2272 self._create_component_from_model(model=backoff_strategy_model, config=config) 2273 ) 2274 2275 response_filters = [] 2276 if model.response_filters: 2277 for response_filter_model in model.response_filters: 2278 response_filters.append( 2279 self._create_component_from_model(model=response_filter_model, config=config) 2280 ) 2281 response_filters.append( 2282 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2283 ) 2284 2285 return DefaultErrorHandler( 2286 backoff_strategies=backoff_strategies, 2287 max_retries=model.max_retries, 2288 response_filters=response_filters, 2289 config=config, 2290 parameters=model.parameters or {}, 2291 ) 2292 2293 def create_default_paginator( 2294 self, 2295 model: DefaultPaginatorModel, 2296 config: Config, 2297 *, 2298 url_base: str, 2299 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2300 decoder: Optional[Decoder] = None, 2301 cursor_used_for_stop_condition: Optional[Cursor] = None, 2302 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2303 if decoder: 2304 if self._is_supported_decoder_for_pagination(decoder): 2305 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2306 else: 2307 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2308 else: 2309 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2310 page_size_option = ( 2311 self._create_component_from_model(model=model.page_size_option, config=config) 2312 if model.page_size_option 2313 else None 2314 ) 2315 page_token_option = ( 2316 self._create_component_from_model(model=model.page_token_option, config=config) 2317 if model.page_token_option 2318 else None 2319 ) 2320 pagination_strategy = self._create_component_from_model( 2321 model=model.pagination_strategy, 2322 config=config, 2323 decoder=decoder_to_use, 2324 extractor_model=extractor_model, 2325 ) 2326 if cursor_used_for_stop_condition: 2327 pagination_strategy = StopConditionPaginationStrategyDecorator( 2328 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2329 ) 2330 paginator = DefaultPaginator( 2331 decoder=decoder_to_use, 2332 page_size_option=page_size_option, 2333 page_token_option=page_token_option, 2334 pagination_strategy=pagination_strategy, 2335 url_base=url_base, 2336 config=config, 2337 parameters=model.parameters or {}, 2338 ) 2339 if self._limit_pages_fetched_per_slice: 2340 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2341 return paginator 2342 2343 def create_dpath_extractor( 2344 
self, 2345 model: DpathExtractorModel, 2346 config: Config, 2347 decoder: Optional[Decoder] = None, 2348 **kwargs: Any, 2349 ) -> DpathExtractor: 2350 if decoder: 2351 decoder_to_use = decoder 2352 else: 2353 decoder_to_use = JsonDecoder(parameters={}) 2354 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2355 return DpathExtractor( 2356 decoder=decoder_to_use, 2357 field_path=model_field_path, 2358 config=config, 2359 parameters=model.parameters or {}, 2360 ) 2361 2362 @staticmethod 2363 def create_response_to_file_extractor( 2364 model: ResponseToFileExtractorModel, 2365 **kwargs: Any, 2366 ) -> ResponseToFileExtractor: 2367 return ResponseToFileExtractor(parameters=model.parameters or {}) 2368 2369 @staticmethod 2370 def create_exponential_backoff_strategy( 2371 model: ExponentialBackoffStrategyModel, config: Config 2372 ) -> ExponentialBackoffStrategy: 2373 return ExponentialBackoffStrategy( 2374 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2375 ) 2376 2377 @staticmethod 2378 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2379 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2380 2381 def create_http_requester( 2382 self, 2383 model: HttpRequesterModel, 2384 config: Config, 2385 decoder: Decoder = JsonDecoder(parameters={}), 2386 query_properties_key: Optional[str] = None, 2387 use_cache: Optional[bool] = None, 2388 *, 2389 name: str, 2390 ) -> HttpRequester: 2391 authenticator = ( 2392 self._create_component_from_model( 2393 model=model.authenticator, 2394 config=config, 2395 url_base=model.url or model.url_base, 2396 name=name, 2397 decoder=decoder, 2398 ) 2399 if model.authenticator 2400 else None 2401 ) 2402 error_handler = ( 2403 self._create_component_from_model(model=model.error_handler, config=config) 2404 if model.error_handler 2405 else DefaultErrorHandler( 2406 backoff_strategies=[], 2407 response_filters=[], 2408 config=config, 2409 parameters=model.parameters or {}, 2410 ) 2411 ) 2412 2413 api_budget = self._api_budget 2414 2415 # Removes QueryProperties components from the interpolated mappings because it has been designed 2416 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2417 # instead of through jinja interpolation 2418 request_parameters: Optional[Union[str, Mapping[str, str]]] 2419 if isinstance(model.request_parameters, Mapping): 2420 request_parameters = self._remove_query_properties(model.request_parameters) 2421 else: 2422 request_parameters = model.request_parameters 2423 2424 request_options_provider = InterpolatedRequestOptionsProvider( 2425 request_body=model.request_body, 2426 request_body_data=model.request_body_data, 2427 request_body_json=model.request_body_json, 2428 request_headers=model.request_headers, 2429 request_parameters=request_parameters, 2430 query_properties_key=query_properties_key, 2431 config=config, 2432 parameters=model.parameters or {}, 2433 ) 2434 2435 assert model.use_cache is not None # for mypy 2436 assert model.http_method is not None # for mypy 2437 2438 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2439 2440 return HttpRequester( 2441 name=name, 2442 url=model.url, 2443 url_base=model.url_base, 2444 path=model.path, 2445 authenticator=authenticator, 2446 error_handler=error_handler, 2447 api_budget=api_budget, 2448 http_method=HttpMethod[model.http_method.value], 2449 
request_options_provider=request_options_provider, 2450 config=config, 2451 disable_retries=self._disable_retries, 2452 parameters=model.parameters or {}, 2453 message_repository=self._message_repository, 2454 use_cache=should_use_cache, 2455 decoder=decoder, 2456 stream_response=decoder.is_stream_response() if decoder else False, 2457 ) 2458 2459 @staticmethod 2460 def create_http_response_filter( 2461 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2462 ) -> HttpResponseFilter: 2463 if model.action: 2464 action = ResponseAction(model.action.value) 2465 else: 2466 action = None 2467 2468 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2469 2470 http_codes = ( 2471 set(model.http_codes) if model.http_codes else set() 2472 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2473 2474 return HttpResponseFilter( 2475 action=action, 2476 failure_type=failure_type, 2477 error_message=model.error_message or "", 2478 error_message_contains=model.error_message_contains or "", 2479 http_codes=http_codes, 2480 predicate=model.predicate or "", 2481 config=config, 2482 parameters=model.parameters or {}, 2483 ) 2484 2485 @staticmethod 2486 def create_inline_schema_loader( 2487 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2488 ) -> InlineSchemaLoader: 2489 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2490 2491 def create_complex_field_type( 2492 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2493 ) -> ComplexFieldType: 2494 items = ( 2495 self._create_component_from_model(model=model.items, config=config) 2496 if isinstance(model.items, ComplexFieldTypeModel) 2497 else model.items 2498 ) 2499 2500 return ComplexFieldType(field_type=model.field_type, items=items) 2501 2502 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2503 target_type = ( 2504 self._create_component_from_model(model=model.target_type, config=config) 2505 if isinstance(model.target_type, ComplexFieldTypeModel) 2506 else model.target_type 2507 ) 2508 2509 return TypesMap( 2510 target_type=target_type, 2511 current_type=model.current_type, 2512 condition=model.condition if model.condition is not None else "True", 2513 ) 2514 2515 def create_schema_type_identifier( 2516 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2517 ) -> SchemaTypeIdentifier: 2518 types_mapping = [] 2519 if model.types_mapping: 2520 types_mapping.extend( 2521 [ 2522 self._create_component_from_model(types_map, config=config) 2523 for types_map in model.types_mapping 2524 ] 2525 ) 2526 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2527 [x for x in model.schema_pointer] if model.schema_pointer else [] 2528 ) 2529 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2530 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2531 [x for x in model.type_pointer] if model.type_pointer else None 2532 ) 2533 2534 return SchemaTypeIdentifier( 2535 schema_pointer=model_schema_pointer, 2536 key_pointer=model_key_pointer, 2537 type_pointer=model_type_pointer, 2538 types_mapping=types_mapping, 2539 parameters=model.parameters or {}, 2540 ) 2541 2542 def create_dynamic_schema_loader( 2543 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2544 ) -> DynamicSchemaLoader: 2545 schema_transformations = [] 2546 if model.schema_transformations: 2547 for transformation_model in 
model.schema_transformations: 2548 schema_transformations.append( 2549 self._create_component_from_model(model=transformation_model, config=config) 2550 ) 2551 name = "dynamic_properties" 2552 retriever = self._create_component_from_model( 2553 model=model.retriever, 2554 config=config, 2555 name=name, 2556 primary_key=None, 2557 partition_router=self._build_stream_slicer_from_partition_router( 2558 model.retriever, config 2559 ), 2560 transformations=[], 2561 use_cache=True, 2562 log_formatter=( 2563 lambda response: format_http_message( 2564 response, 2565 f"Schema loader '{name}' request", 2566 f"Request performed in order to extract schema.", 2567 name, 2568 is_auxiliary=True, 2569 ) 2570 ), 2571 ) 2572 schema_type_identifier = self._create_component_from_model( 2573 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2574 ) 2575 schema_filter = ( 2576 self._create_component_from_model( 2577 model.schema_filter, config=config, parameters=model.parameters or {} 2578 ) 2579 if model.schema_filter is not None 2580 else None 2581 ) 2582 2583 return DynamicSchemaLoader( 2584 retriever=retriever, 2585 config=config, 2586 schema_transformations=schema_transformations, 2587 schema_filter=schema_filter, 2588 schema_type_identifier=schema_type_identifier, 2589 parameters=model.parameters or {}, 2590 ) 2591 2592 @staticmethod 2593 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2594 return JsonDecoder(parameters={}) 2595 2596 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2597 return CompositeRawDecoder( 2598 parser=ModelToComponentFactory._get_parser(model, config), 2599 stream_response=False if self._emit_connector_builder_messages else True, 2600 ) 2601 2602 def create_jsonl_decoder( 2603 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2604 ) -> Decoder: 2605 return CompositeRawDecoder( 2606 parser=ModelToComponentFactory._get_parser(model, config), 2607 stream_response=False if self._emit_connector_builder_messages else True, 2608 ) 2609 2610 def create_gzip_decoder( 2611 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2612 ) -> Decoder: 2613 _compressed_response_types = { 2614 "gzip", 2615 "x-gzip", 2616 "gzip, deflate", 2617 "x-gzip, deflate", 2618 "application/zip", 2619 "application/gzip", 2620 "application/x-gzip", 2621 "application/x-zip-compressed", 2622 } 2623 2624 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2625 2626 if self._emit_connector_builder_messages: 2627 # This is very surprising but if the response is not streamed, 2628 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2629 # which uses urllib3 directly and does not uncompress the data. 
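# Illustrative sketch of the behavior described above (hypothetical `response` object; these lines are
# not executed here). For a payload served with `Content-Encoding: gzip`:
#     response.content     # requests/urllib3 transparently decompress, so the inner parser (e.g. JSON) suffices
#     response.raw.read()  # yields the still-compressed bytes, so a GzipParser wrapper is required
# This is why the non-streaming connector-builder branch below hands `gzip_parser.inner_parser` to the
# decoder directly, while the streaming branch further down keeps the GzipParser and dispatches on headers.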
2630 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2631 2632 return CompositeRawDecoder.by_headers( 2633 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2634 stream_response=True, 2635 fallback_parser=gzip_parser.inner_parser, 2636 ) 2637 2638 # todo: This method should be removed once we deprecate the SimpleRetriever.cursor field and the various 2639 # state methods 2640 @staticmethod 2641 def create_incrementing_count_cursor( 2642 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2643 ) -> DatetimeBasedCursor: 2644 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2645 # we still parse models into components. The issue is that there's no runtime implementation of a 2646 # IncrementingCountCursor. 2647 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 2648 return DatetimeBasedCursor( 2649 cursor_field=model.cursor_field, 2650 datetime_format="%Y-%m-%d", 2651 start_datetime="2024-12-12", 2652 config=config, 2653 parameters={}, 2654 ) 2655 2656 @staticmethod 2657 def create_iterable_decoder( 2658 model: IterableDecoderModel, config: Config, **kwargs: Any 2659 ) -> IterableDecoder: 2660 return IterableDecoder(parameters={}) 2661 2662 @staticmethod 2663 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2664 return XmlDecoder(parameters={}) 2665 2666 def create_zipfile_decoder( 2667 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2668 ) -> ZipfileDecoder: 2669 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2670 2671 @staticmethod 2672 def _get_parser(model: BaseModel, config: Config) -> Parser: 2673 if isinstance(model, JsonDecoderModel): 2674 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2675 return JsonParser() 2676 elif isinstance(model, JsonlDecoderModel): 2677 return JsonLineParser() 2678 elif isinstance(model, CsvDecoderModel): 2679 return CsvParser( 2680 encoding=model.encoding, 2681 delimiter=model.delimiter, 2682 set_values_to_none=model.set_values_to_none, 2683 ) 2684 elif isinstance(model, GzipDecoderModel): 2685 return GzipParser( 2686 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2687 ) 2688 elif isinstance( 2689 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2690 ): 2691 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2692 2693 raise ValueError(f"Unknown decoder type {model}") 2694 2695 @staticmethod 2696 def create_json_file_schema_loader( 2697 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2698 ) -> JsonFileSchemaLoader: 2699 return JsonFileSchemaLoader( 2700 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2701 ) 2702 2703 @staticmethod 2704 def create_jwt_authenticator( 2705 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2706 ) -> JwtAuthenticator: 2707 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2708 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2709 return JwtAuthenticator( 2710 config=config, 2711 parameters=model.parameters or {}, 2712 algorithm=JwtAlgorithm(model.algorithm.value), 2713 secret_key=model.secret_key, 2714 
base64_encode_secret_key=model.base64_encode_secret_key, 2715 token_duration=model.token_duration, 2716 header_prefix=model.header_prefix, 2717 kid=jwt_headers.kid, 2718 typ=jwt_headers.typ, 2719 cty=jwt_headers.cty, 2720 iss=jwt_payload.iss, 2721 sub=jwt_payload.sub, 2722 aud=jwt_payload.aud, 2723 additional_jwt_headers=model.additional_jwt_headers, 2724 additional_jwt_payload=model.additional_jwt_payload, 2725 ) 2726 2727 def create_list_partition_router( 2728 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2729 ) -> ListPartitionRouter: 2730 request_option = ( 2731 self._create_component_from_model(model.request_option, config) 2732 if model.request_option 2733 else None 2734 ) 2735 return ListPartitionRouter( 2736 cursor_field=model.cursor_field, 2737 request_option=request_option, 2738 values=model.values, 2739 config=config, 2740 parameters=model.parameters or {}, 2741 ) 2742 2743 @staticmethod 2744 def create_min_max_datetime( 2745 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2746 ) -> MinMaxDatetime: 2747 return MinMaxDatetime( 2748 datetime=model.datetime, 2749 datetime_format=model.datetime_format or "", 2750 max_datetime=model.max_datetime or "", 2751 min_datetime=model.min_datetime or "", 2752 parameters=model.parameters or {}, 2753 ) 2754 2755 @staticmethod 2756 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2757 return NoAuth(parameters=model.parameters or {}) 2758 2759 @staticmethod 2760 def create_no_pagination( 2761 model: NoPaginationModel, config: Config, **kwargs: Any 2762 ) -> NoPagination: 2763 return NoPagination(parameters={}) 2764 2765 def create_oauth_authenticator( 2766 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2767 ) -> DeclarativeOauth2Authenticator: 2768 profile_assertion = ( 2769 self._create_component_from_model(model.profile_assertion, config=config) 2770 if model.profile_assertion 2771 else None 2772 ) 2773 2774 if model.refresh_token_updater: 2775 # ignore type error because fixing it would have a lot of dependencies, revisit later 2776 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2777 config, 2778 InterpolatedString.create( 2779 model.token_refresh_endpoint, # type: ignore 2780 parameters=model.parameters or {}, 2781 ).eval(config), 2782 access_token_name=InterpolatedString.create( 2783 model.access_token_name or "access_token", parameters=model.parameters or {} 2784 ).eval(config), 2785 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2786 expires_in_name=InterpolatedString.create( 2787 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2788 ).eval(config), 2789 client_id_name=InterpolatedString.create( 2790 model.client_id_name or "client_id", parameters=model.parameters or {} 2791 ).eval(config), 2792 client_id=InterpolatedString.create( 2793 model.client_id, parameters=model.parameters or {} 2794 ).eval(config) 2795 if model.client_id 2796 else model.client_id, 2797 client_secret_name=InterpolatedString.create( 2798 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2799 ).eval(config), 2800 client_secret=InterpolatedString.create( 2801 model.client_secret, parameters=model.parameters or {} 2802 ).eval(config) 2803 if model.client_secret 2804 else model.client_secret, 2805 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2806 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2807 
token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2808 grant_type_name=InterpolatedString.create( 2809 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2810 ).eval(config), 2811 grant_type=InterpolatedString.create( 2812 model.grant_type or "refresh_token", parameters=model.parameters or {} 2813 ).eval(config), 2814 refresh_request_body=InterpolatedMapping( 2815 model.refresh_request_body or {}, parameters=model.parameters or {} 2816 ).eval(config), 2817 refresh_request_headers=InterpolatedMapping( 2818 model.refresh_request_headers or {}, parameters=model.parameters or {} 2819 ).eval(config), 2820 scopes=model.scopes, 2821 token_expiry_date_format=model.token_expiry_date_format, 2822 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2823 message_repository=self._message_repository, 2824 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2825 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2826 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2827 ) 2828 # ignore type error because fixing it would have a lot of dependencies, revisit later 2829 return DeclarativeOauth2Authenticator( # type: ignore 2830 access_token_name=model.access_token_name or "access_token", 2831 access_token_value=model.access_token_value, 2832 client_id_name=model.client_id_name or "client_id", 2833 client_id=model.client_id, 2834 client_secret_name=model.client_secret_name or "client_secret", 2835 client_secret=model.client_secret, 2836 expires_in_name=model.expires_in_name or "expires_in", 2837 grant_type_name=model.grant_type_name or "grant_type", 2838 grant_type=model.grant_type or "refresh_token", 2839 refresh_request_body=model.refresh_request_body, 2840 refresh_request_headers=model.refresh_request_headers, 2841 refresh_token_name=model.refresh_token_name or "refresh_token", 2842 refresh_token=model.refresh_token, 2843 scopes=model.scopes, 2844 token_expiry_date=model.token_expiry_date, 2845 token_expiry_date_format=model.token_expiry_date_format, 2846 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2847 token_refresh_endpoint=model.token_refresh_endpoint, 2848 config=config, 2849 parameters=model.parameters or {}, 2850 message_repository=self._message_repository, 2851 profile_assertion=profile_assertion, 2852 use_profile_assertion=model.use_profile_assertion, 2853 ) 2854 2855 def create_offset_increment( 2856 self, 2857 model: OffsetIncrementModel, 2858 config: Config, 2859 decoder: Decoder, 2860 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2861 **kwargs: Any, 2862 ) -> OffsetIncrement: 2863 if isinstance(decoder, PaginationDecoderDecorator): 2864 inner_decoder = decoder.decoder 2865 else: 2866 inner_decoder = decoder 2867 decoder = PaginationDecoderDecorator(decoder=decoder) 2868 2869 if self._is_supported_decoder_for_pagination(inner_decoder): 2870 decoder_to_use = decoder 2871 else: 2872 raise ValueError( 2873 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2874 ) 2875 2876 # Ideally we would instantiate the runtime extractor from highest most level (in this case the SimpleRetriever) 2877 # so that it can be shared by OffSetIncrement and RecordSelector. 
However, due to how we instantiate the 2878 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2879 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2880 # When we have more time to investigate we can look into reusing the same component. 2881 extractor = ( 2882 self._create_component_from_model( 2883 model=extractor_model, config=config, decoder=decoder_to_use 2884 ) 2885 if extractor_model 2886 else None 2887 ) 2888 2889 return OffsetIncrement( 2890 page_size=model.page_size, 2891 config=config, 2892 decoder=decoder_to_use, 2893 extractor=extractor, 2894 inject_on_first_request=model.inject_on_first_request or False, 2895 parameters=model.parameters or {}, 2896 ) 2897 2898 @staticmethod 2899 def create_page_increment( 2900 model: PageIncrementModel, config: Config, **kwargs: Any 2901 ) -> PageIncrement: 2902 return PageIncrement( 2903 page_size=model.page_size, 2904 config=config, 2905 start_from_page=model.start_from_page or 0, 2906 inject_on_first_request=model.inject_on_first_request or False, 2907 parameters=model.parameters or {}, 2908 ) 2909 2910 def create_parent_stream_config( 2911 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 2912 ) -> ParentStreamConfig: 2913 declarative_stream = self._create_component_from_model( 2914 model.stream, 2915 config=config, 2916 is_parent=True, 2917 **kwargs, 2918 ) 2919 request_option = ( 2920 self._create_component_from_model(model.request_option, config=config) 2921 if model.request_option 2922 else None 2923 ) 2924 2925 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2926 raise ValueError( 2927 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 
2928 ) 2929 2930 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2931 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2932 ) 2933 2934 return ParentStreamConfig( 2935 parent_key=model.parent_key, 2936 request_option=request_option, 2937 stream=declarative_stream, 2938 partition_field=model.partition_field, 2939 config=config, 2940 incremental_dependency=model.incremental_dependency or False, 2941 parameters=model.parameters or {}, 2942 extra_fields=model.extra_fields, 2943 lazy_read_pointer=model_lazy_read_pointer, 2944 ) 2945 2946 def create_properties_from_endpoint( 2947 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2948 ) -> PropertiesFromEndpoint: 2949 retriever = self._create_component_from_model( 2950 model=model.retriever, 2951 config=config, 2952 name="dynamic_properties", 2953 primary_key=None, 2954 stream_slicer=None, 2955 transformations=[], 2956 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to differ 2957 ) 2958 return PropertiesFromEndpoint( 2959 property_field_path=model.property_field_path, 2960 retriever=retriever, 2961 config=config, 2962 parameters=model.parameters or {}, 2963 ) 2964 2965 def create_property_chunking( 2966 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2967 ) -> PropertyChunking: 2968 record_merge_strategy = ( 2969 self._create_component_from_model( 2970 model=model.record_merge_strategy, config=config, **kwargs 2971 ) 2972 if model.record_merge_strategy 2973 else None 2974 ) 2975 2976 property_limit_type: PropertyLimitType 2977 match model.property_limit_type: 2978 case PropertyLimitTypeModel.property_count: 2979 property_limit_type = PropertyLimitType.property_count 2980 case PropertyLimitTypeModel.characters: 2981 property_limit_type = PropertyLimitType.characters 2982 case _: 2983 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}") 2984 2985 return PropertyChunking( 2986 property_limit_type=property_limit_type, 2987 property_limit=model.property_limit, 2988 record_merge_strategy=record_merge_strategy, 2989 config=config, 2990 parameters=model.parameters or {}, 2991 ) 2992 2993 def create_query_properties( 2994 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2995 ) -> QueryProperties: 2996 if isinstance(model.property_list, list): 2997 property_list = model.property_list 2998 else: 2999 property_list = self._create_component_from_model( 3000 model=model.property_list, config=config, **kwargs 3001 ) 3002 3003 property_chunking = ( 3004 self._create_component_from_model( 3005 model=model.property_chunking, config=config, **kwargs 3006 ) 3007 if model.property_chunking 3008 else None 3009 ) 3010 3011 return QueryProperties( 3012 property_list=property_list, 3013 always_include_properties=model.always_include_properties, 3014 property_chunking=property_chunking, 3015 config=config, 3016 parameters=model.parameters or {}, 3017 ) 3018 3019 @staticmethod 3020 def create_record_filter( 3021 model: RecordFilterModel, config: Config, **kwargs: Any 3022 ) -> RecordFilter: 3023 return RecordFilter( 3024 condition=model.condition or "", config=config, parameters=model.parameters or {} 3025 ) 3026 3027 @staticmethod 3028 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 3029 return RequestPath(parameters={}) 3030 3031 @staticmethod 3032 def
create_request_option( 3033 model: RequestOptionModel, config: Config, **kwargs: Any 3034 ) -> RequestOption: 3035 inject_into = RequestOptionType(model.inject_into.value) 3036 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3037 [ 3038 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3039 for segment in model.field_path 3040 ] 3041 if model.field_path 3042 else None 3043 ) 3044 field_name = ( 3045 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3046 if model.field_name 3047 else None 3048 ) 3049 return RequestOption( 3050 field_name=field_name, 3051 field_path=field_path, 3052 inject_into=inject_into, 3053 parameters=kwargs.get("parameters", {}), 3054 ) 3055 3056 def create_record_selector( 3057 self, 3058 model: RecordSelectorModel, 3059 config: Config, 3060 *, 3061 name: str, 3062 transformations: List[RecordTransformation] | None = None, 3063 decoder: Decoder | None = None, 3064 client_side_incremental_sync: Dict[str, Any] | None = None, 3065 file_uploader: Optional[DefaultFileUploader] = None, 3066 **kwargs: Any, 3067 ) -> RecordSelector: 3068 extractor = self._create_component_from_model( 3069 model=model.extractor, decoder=decoder, config=config 3070 ) 3071 record_filter = ( 3072 self._create_component_from_model(model.record_filter, config=config) 3073 if model.record_filter 3074 else None 3075 ) 3076 3077 transform_before_filtering = ( 3078 False if model.transform_before_filtering is None else model.transform_before_filtering 3079 ) 3080 if client_side_incremental_sync: 3081 record_filter = ClientSideIncrementalRecordFilterDecorator( 3082 config=config, 3083 parameters=model.parameters, 3084 condition=model.record_filter.condition 3085 if (model.record_filter and hasattr(model.record_filter, "condition")) 3086 else None, 3087 **client_side_incremental_sync, 3088 ) 3089 transform_before_filtering = ( 3090 True 3091 if model.transform_before_filtering is None 3092 else model.transform_before_filtering 3093 ) 3094 3095 if model.schema_normalization is None: 3096 # default to no schema normalization if not set 3097 model.schema_normalization = SchemaNormalizationModel.None_ 3098 3099 schema_normalization = ( 3100 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3101 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3102 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3103 ) 3104 3105 return RecordSelector( 3106 extractor=extractor, 3107 name=name, 3108 config=config, 3109 record_filter=record_filter, 3110 transformations=transformations or [], 3111 file_uploader=file_uploader, 3112 schema_normalization=schema_normalization, 3113 parameters=model.parameters or {}, 3114 transform_before_filtering=transform_before_filtering, 3115 ) 3116 3117 @staticmethod 3118 def create_remove_fields( 3119 model: RemoveFieldsModel, config: Config, **kwargs: Any 3120 ) -> RemoveFields: 3121 return RemoveFields( 3122 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3123 ) 3124 3125 def create_selective_authenticator( 3126 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3127 ) -> DeclarativeAuthenticator: 3128 authenticators = { 3129 name: self._create_component_from_model(model=auth, config=config) 3130 for name, auth in model.authenticators.items() 3131 } 3132 # SelectiveAuthenticator will return instance of 
DeclarativeAuthenticator or raise a ValueError 3133 return SelectiveAuthenticator( # type: ignore[abstract] 3134 config=config, 3135 authenticators=authenticators, 3136 authenticator_selection_path=model.authenticator_selection_path, 3137 **kwargs, 3138 ) 3139 3140 @staticmethod 3141 def create_legacy_session_token_authenticator( 3142 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3143 ) -> LegacySessionTokenAuthenticator: 3144 return LegacySessionTokenAuthenticator( 3145 api_url=url_base, 3146 header=model.header, 3147 login_url=model.login_url, 3148 password=model.password or "", 3149 session_token=model.session_token or "", 3150 session_token_response_key=model.session_token_response_key or "", 3151 username=model.username or "", 3152 validate_session_url=model.validate_session_url, 3153 config=config, 3154 parameters=model.parameters or {}, 3155 ) 3156 3157 def create_simple_retriever( 3158 self, 3159 model: SimpleRetrieverModel, 3160 config: Config, 3161 *, 3162 name: str, 3163 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3164 request_options_provider: Optional[RequestOptionsProvider] = None, 3165 stop_condition_cursor: Optional[Cursor] = None, 3166 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3167 transformations: List[RecordTransformation], 3168 file_uploader: Optional[DefaultFileUploader] = None, 3169 incremental_sync: Optional[ 3170 Union[IncrementingCountCursorModel, DatetimeBasedCursorModel] 3171 ] = None, 3172 use_cache: Optional[bool] = None, 3173 log_formatter: Optional[Callable[[Response], Any]] = None, 3174 partition_router: Optional[PartitionRouter] = None, 3175 **kwargs: Any, 3176 ) -> SimpleRetriever: 3177 def _get_url(req: Requester) -> str: 3178 """ 3179 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3180 This is needed because the URL is not set until the requester is created.
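            Illustrative precedence (assumed manifest values, not taken from a real connector):

                model.requester.url      = "https://api.example.com/v1/items"   # full URL, optional
                model.requester.url_base = "https://api.example.com/v1"

            Because the closure returns `_url or _url_base`, a defined full `url` (or the value returned by
            `req.get_url(...)`) takes precedence, and the paginator only falls back to the url_base when
            that value is empty.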
3181 """ 3182 3183 _url: str = ( 3184 model.requester.url 3185 if hasattr(model.requester, "url") and model.requester.url is not None 3186 else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) 3187 ) 3188 _url_base: str = ( 3189 model.requester.url_base 3190 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3191 else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None) 3192 ) 3193 3194 return _url or _url_base 3195 3196 decoder = ( 3197 self._create_component_from_model(model=model.decoder, config=config) 3198 if model.decoder 3199 else JsonDecoder(parameters={}) 3200 ) 3201 record_selector = self._create_component_from_model( 3202 model=model.record_selector, 3203 name=name, 3204 config=config, 3205 decoder=decoder, 3206 transformations=transformations, 3207 client_side_incremental_sync=client_side_incremental_sync, 3208 file_uploader=file_uploader, 3209 ) 3210 3211 query_properties: Optional[QueryProperties] = None 3212 query_properties_key: Optional[str] = None 3213 if self._query_properties_in_request_parameters(model.requester): 3214 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3215 # places instead of default to request_parameters which isn't clearly documented 3216 if ( 3217 hasattr(model.requester, "fetch_properties_from_endpoint") 3218 and model.requester.fetch_properties_from_endpoint 3219 ): 3220 raise ValueError( 3221 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3222 ) 3223 3224 query_properties_definitions = [] 3225 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3226 if isinstance(request_parameter, QueryPropertiesModel): 3227 query_properties_key = key 3228 query_properties_definitions.append(request_parameter) 3229 3230 if len(query_properties_definitions) > 1: 3231 raise ValueError( 3232 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3233 ) 3234 3235 if len(query_properties_definitions) == 1: 3236 query_properties = self._create_component_from_model( 3237 model=query_properties_definitions[0], config=config 3238 ) 3239 elif ( 3240 hasattr(model.requester, "fetch_properties_from_endpoint") 3241 and model.requester.fetch_properties_from_endpoint 3242 ): 3243 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3244 query_properties_definition = QueryPropertiesModel( 3245 type="QueryProperties", 3246 property_list=model.requester.fetch_properties_from_endpoint, 3247 always_include_properties=None, 3248 property_chunking=None, 3249 ) # type: ignore # $parameters has a default value 3250 3251 query_properties = self.create_query_properties( 3252 model=query_properties_definition, 3253 config=config, 3254 ) 3255 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3256 query_properties = self.create_query_properties( 3257 model=model.requester.query_properties, 3258 config=config, 3259 ) 3260 3261 requester = self._create_component_from_model( 3262 model=model.requester, 3263 decoder=decoder, 3264 name=name, 3265 query_properties_key=query_properties_key, 3266 use_cache=use_cache, 3267 config=config, 3268 ) 3269 3270 if not 
request_options_provider: 3271 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3272 if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance( 3273 partition_router, PartitionRouter 3274 ): 3275 request_options_provider = partition_router 3276 3277 paginator = ( 3278 self._create_component_from_model( 3279 model=model.paginator, 3280 config=config, 3281 url_base=_get_url(requester), 3282 extractor_model=model.record_selector.extractor, 3283 decoder=decoder, 3284 cursor_used_for_stop_condition=stop_condition_cursor or None, 3285 ) 3286 if model.paginator 3287 else NoPagination(parameters={}) 3288 ) 3289 3290 ignore_stream_slicer_parameters_on_paginated_requests = ( 3291 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3292 ) 3293 3294 if ( 3295 model.partition_router 3296 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3297 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3298 and any( 3299 parent_stream_config.lazy_read_pointer 3300 for parent_stream_config in model.partition_router.parent_stream_configs 3301 ) 3302 ): 3303 if incremental_sync: 3304 if incremental_sync.type != "DatetimeBasedCursor": 3305 raise ValueError( 3306 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3307 ) 3308 3309 elif incremental_sync.step or incremental_sync.cursor_granularity: 3310 raise ValueError( 3311 f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3312 ) 3313 3314 if model.decoder and model.decoder.type != "JsonDecoder": 3315 raise ValueError( 3316 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}." 3317 ) 3318 3319 return LazySimpleRetriever( 3320 name=name, 3321 paginator=paginator, 3322 primary_key=primary_key, 3323 requester=requester, 3324 record_selector=record_selector, 3325 stream_slicer=_NO_STREAM_SLICING, 3326 request_option_provider=request_options_provider, 3327 cursor=None, 3328 config=config, 3329 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3330 parameters=model.parameters or {}, 3331 ) 3332 3333 return SimpleRetriever( 3334 name=name, 3335 paginator=paginator, 3336 primary_key=primary_key, 3337 requester=requester, 3338 record_selector=record_selector, 3339 stream_slicer=_NO_STREAM_SLICING, 3340 request_option_provider=request_options_provider, 3341 cursor=None, 3342 config=config, 3343 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3344 additional_query_properties=query_properties, 3345 log_formatter=self._get_log_formatter(log_formatter, name), 3346 parameters=model.parameters or {}, 3347 ) 3348 3349 def _get_log_formatter( 3350 self, log_formatter: Callable[[Response], Any] | None, name: str 3351 ) -> Callable[[Response], Any] | None: 3352 if self._should_limit_slices_fetched(): 3353 return ( 3354 ( 3355 lambda response: format_http_message( 3356 response, 3357 f"Stream '{name}' request", 3358 f"Request performed in order to extract records for stream '{name}'", 3359 name, 3360 ) 3361 ) 3362 if not log_formatter 3363 else log_formatter 3364 ) 3365 return None 3366 3367 def _should_limit_slices_fetched(self) -> bool: 3368 """ 3369 Returns True if the number of slices fetched should be limited, False otherwise. 3370 This is used to limit the number of slices fetched during tests. 
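            Illustrative outcomes of the expression below (hypothetical factory settings):
                _limit_slices_fetched=5,    _emit_connector_builder_messages=False -> True
                _limit_slices_fetched=None, _emit_connector_builder_messages=True  -> True
                _limit_slices_fetched=None, _emit_connector_builder_messages=False -> False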
3371 """ 3372 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3373 3374 @staticmethod 3375 def _query_properties_in_request_parameters( 3376 requester: Union[HttpRequesterModel, CustomRequesterModel], 3377 ) -> bool: 3378 if not hasattr(requester, "request_parameters"): 3379 return False 3380 request_parameters = requester.request_parameters 3381 if request_parameters and isinstance(request_parameters, Mapping): 3382 for request_parameter in request_parameters.values(): 3383 if isinstance(request_parameter, QueryPropertiesModel): 3384 return True 3385 return False 3386 3387 @staticmethod 3388 def _remove_query_properties( 3389 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3390 ) -> Mapping[str, str]: 3391 return { 3392 parameter_field: request_parameter 3393 for parameter_field, request_parameter in request_parameters.items() 3394 if not isinstance(request_parameter, QueryPropertiesModel) 3395 } 3396 3397 def create_state_delegating_stream( 3398 self, 3399 model: StateDelegatingStreamModel, 3400 config: Config, 3401 has_parent_state: Optional[bool] = None, 3402 **kwargs: Any, 3403 ) -> DeclarativeStream: 3404 if ( 3405 model.full_refresh_stream.name != model.name 3406 or model.name != model.incremental_stream.name 3407 ): 3408 raise ValueError( 3409 f"The full_refresh_stream and incremental_stream of a StateDelegatingStream must have the same name as the stream itself. Got {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 3410 ) 3411 3412 stream_model = self._get_state_delegating_stream_model( 3413 False if has_parent_state is None else has_parent_state, model 3414 ) 3415 3416 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is always a DeclarativeStreamModel 3417 3418 def _get_state_delegating_stream_model( 3419 self, has_parent_state: bool, model: StateDelegatingStreamModel 3420 ) -> DeclarativeStreamModel: 3421 return ( 3422 model.incremental_stream 3423 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3424 else model.full_refresh_stream 3425 ) 3426 3427 def _create_async_job_status_mapping( 3428 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3429 ) -> Mapping[str, AsyncJobStatus]: 3430 api_status_to_cdk_status = {} 3431 for cdk_status, api_statuses in model.dict().items(): 3432 if cdk_status == "type": 3433 # This is an element of the dict because of the typing of the CDK, but it is not a CDK status 3434 continue 3435 3436 for status in api_statuses: 3437 if status in api_status_to_cdk_status: 3438 raise ValueError( 3439 f"API status {status} is already set for CDK status {cdk_status}.
Please ensure API statuses are only provided once" 3440 ) 3441 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3442 return api_status_to_cdk_status 3443 3444 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3445 match status: 3446 case "running": 3447 return AsyncJobStatus.RUNNING 3448 case "completed": 3449 return AsyncJobStatus.COMPLETED 3450 case "failed": 3451 return AsyncJobStatus.FAILED 3452 case "timeout": 3453 return AsyncJobStatus.TIMED_OUT 3454 case _: 3455 raise ValueError(f"Unsupported CDK status {status}") 3456 3457 def create_async_retriever( 3458 self, 3459 model: AsyncRetrieverModel, 3460 config: Config, 3461 *, 3462 name: str, 3463 primary_key: Optional[ 3464 Union[str, List[str], List[List[str]]] 3465 ], # this seems to be needed to match create_simple_retriever 3466 stream_slicer: Optional[StreamSlicer], 3467 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3468 transformations: List[RecordTransformation], 3469 **kwargs: Any, 3470 ) -> AsyncRetriever: 3471 if model.download_target_requester and not model.download_target_extractor: 3472 raise ValueError( 3473 f"`download_target_extractor` required if using a `download_target_requester`" 3474 ) 3475 3476 def _get_download_retriever( 3477 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3478 ) -> SimpleRetriever: 3479 # We create a record selector for the download retriever 3480 # with no schema normalization and no transformations, neither record filter 3481 # as all this occurs in the record_selector of the AsyncRetriever 3482 record_selector = RecordSelector( 3483 extractor=extractor, 3484 name=name, 3485 record_filter=None, 3486 transformations=[], 3487 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3488 config=config, 3489 parameters={}, 3490 ) 3491 paginator = ( 3492 self._create_component_from_model( 3493 model=model.download_paginator, 3494 decoder=_decoder, 3495 config=config, 3496 url_base="", 3497 ) 3498 if model.download_paginator 3499 else NoPagination(parameters={}) 3500 ) 3501 3502 return SimpleRetriever( 3503 requester=requester, 3504 record_selector=record_selector, 3505 primary_key=None, 3506 name=name, 3507 paginator=paginator, 3508 config=config, 3509 parameters={}, 3510 log_formatter=self._get_log_formatter(None, name), 3511 ) 3512 3513 def _get_job_timeout() -> datetime.timedelta: 3514 user_defined_timeout: Optional[int] = ( 3515 int( 3516 InterpolatedString.create( 3517 str(model.polling_job_timeout), 3518 parameters={}, 3519 ).eval(config) 3520 ) 3521 if model.polling_job_timeout 3522 else None 3523 ) 3524 3525 # check for user defined timeout during the test read or 15 minutes 3526 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3527 # default value for non-connector builder is 60 minutes. 
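# Illustrative outcomes of the selection below (hypothetical polling_job_timeout values):
#     polling_job_timeout=30,    connector-builder read -> datetime.timedelta(minutes=30)
#     polling_job_timeout unset, connector-builder read -> datetime.timedelta(minutes=15)
#     polling_job_timeout unset, regular sync           -> datetime.timedelta(minutes=60)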
3528 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3529 3530 return ( 3531 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3532 ) 3533 3534 decoder = ( 3535 self._create_component_from_model(model=model.decoder, config=config) 3536 if model.decoder 3537 else JsonDecoder(parameters={}) 3538 ) 3539 record_selector = self._create_component_from_model( 3540 model=model.record_selector, 3541 config=config, 3542 decoder=decoder, 3543 name=name, 3544 transformations=transformations, 3545 client_side_incremental_sync=client_side_incremental_sync, 3546 ) 3547 3548 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3549 if self._should_limit_slices_fetched(): 3550 stream_slicer = cast( 3551 StreamSlicer, 3552 StreamSlicerTestReadDecorator( 3553 wrapped_slicer=stream_slicer, 3554 maximum_number_of_slices=self._limit_slices_fetched or 5, 3555 ), 3556 ) 3557 3558 creation_requester = self._create_component_from_model( 3559 model=model.creation_requester, 3560 decoder=decoder, 3561 config=config, 3562 name=f"job creation - {name}", 3563 ) 3564 polling_requester = self._create_component_from_model( 3565 model=model.polling_requester, 3566 decoder=decoder, 3567 config=config, 3568 name=f"job polling - {name}", 3569 ) 3570 job_download_components_name = f"job download - {name}" 3571 download_decoder = ( 3572 self._create_component_from_model(model=model.download_decoder, config=config) 3573 if model.download_decoder 3574 else JsonDecoder(parameters={}) 3575 ) 3576 download_extractor = ( 3577 self._create_component_from_model( 3578 model=model.download_extractor, 3579 config=config, 3580 decoder=download_decoder, 3581 parameters=model.parameters, 3582 ) 3583 if model.download_extractor 3584 else DpathExtractor( 3585 [], 3586 config=config, 3587 decoder=download_decoder, 3588 parameters=model.parameters or {}, 3589 ) 3590 ) 3591 download_requester = self._create_component_from_model( 3592 model=model.download_requester, 3593 decoder=download_decoder, 3594 config=config, 3595 name=job_download_components_name, 3596 ) 3597 download_retriever = _get_download_retriever( 3598 download_requester, download_extractor, download_decoder 3599 ) 3600 abort_requester = ( 3601 self._create_component_from_model( 3602 model=model.abort_requester, 3603 decoder=decoder, 3604 config=config, 3605 name=f"job abort - {name}", 3606 ) 3607 if model.abort_requester 3608 else None 3609 ) 3610 delete_requester = ( 3611 self._create_component_from_model( 3612 model=model.delete_requester, 3613 decoder=decoder, 3614 config=config, 3615 name=f"job delete - {name}", 3616 ) 3617 if model.delete_requester 3618 else None 3619 ) 3620 download_target_requester = ( 3621 self._create_component_from_model( 3622 model=model.download_target_requester, 3623 decoder=decoder, 3624 config=config, 3625 name=f"job extract_url - {name}", 3626 ) 3627 if model.download_target_requester 3628 else None 3629 ) 3630 status_extractor = self._create_component_from_model( 3631 model=model.status_extractor, decoder=decoder, config=config, name=name 3632 ) 3633 download_target_extractor = ( 3634 self._create_component_from_model( 3635 model=model.download_target_extractor, 3636 decoder=decoder, 3637 config=config, 3638 name=name, 3639 ) 3640 if model.download_target_extractor 3641 else None 3642 ) 3643 3644 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3645 creation_requester=creation_requester, 3646 polling_requester=polling_requester, 3647 
download_retriever=download_retriever, 3648 download_target_requester=download_target_requester, 3649 abort_requester=abort_requester, 3650 delete_requester=delete_requester, 3651 status_extractor=status_extractor, 3652 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3653 download_target_extractor=download_target_extractor, 3654 job_timeout=_get_job_timeout(), 3655 ) 3656 3657 async_job_partition_router = AsyncJobPartitionRouter( 3658 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3659 job_repository, 3660 stream_slices, 3661 self._job_tracker, 3662 self._message_repository, 3663 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3664 has_bulk_parent=False, 3665 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3666 # `None` == default retry is set to 3 attempts, under the hood. 3667 job_max_retry=1 if self._emit_connector_builder_messages else None, 3668 ), 3669 stream_slicer=stream_slicer, 3670 config=config, 3671 parameters=model.parameters or {}, 3672 ) 3673 3674 return AsyncRetriever( 3675 record_selector=record_selector, 3676 stream_slicer=async_job_partition_router, 3677 config=config, 3678 parameters=model.parameters or {}, 3679 ) 3680 3681 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3682 config_migrations = [ 3683 self._create_component_from_model(migration, config) 3684 for migration in ( 3685 model.config_normalization_rules.config_migrations 3686 if ( 3687 model.config_normalization_rules 3688 and model.config_normalization_rules.config_migrations 3689 ) 3690 else [] 3691 ) 3692 ] 3693 config_transformations = [ 3694 self._create_component_from_model(transformation, config) 3695 for transformation in ( 3696 model.config_normalization_rules.transformations 3697 if ( 3698 model.config_normalization_rules 3699 and model.config_normalization_rules.transformations 3700 ) 3701 else [] 3702 ) 3703 ] 3704 config_validations = [ 3705 self._create_component_from_model(validation, config) 3706 for validation in ( 3707 model.config_normalization_rules.validations 3708 if ( 3709 model.config_normalization_rules 3710 and model.config_normalization_rules.validations 3711 ) 3712 else [] 3713 ) 3714 ] 3715 3716 return Spec( 3717 connection_specification=model.connection_specification, 3718 documentation_url=model.documentation_url, 3719 advanced_auth=model.advanced_auth, 3720 parameters={}, 3721 config_migrations=config_migrations, 3722 config_transformations=config_transformations, 3723 config_validations=config_validations, 3724 ) 3725 3726 def create_substream_partition_router( 3727 self, 3728 model: SubstreamPartitionRouterModel, 3729 config: Config, 3730 *, 3731 stream_name: str, 3732 **kwargs: Any, 3733 ) -> SubstreamPartitionRouter: 3734 parent_stream_configs = [] 3735 if model.parent_stream_configs: 3736 parent_stream_configs.extend( 3737 [ 3738 self.create_parent_stream_config_with_substream_wrapper( 3739 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 3740 ) 3741 for parent_stream_config in model.parent_stream_configs 3742 ] 3743 ) 3744 3745 return SubstreamPartitionRouter( 3746 parent_stream_configs=parent_stream_configs, 3747 parameters=model.parameters or {}, 3748 config=config, 3749 ) 3750 3751 def create_parent_stream_config_with_substream_wrapper( 3752 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 3753 ) -> Any: 3754 # getting the parent state 
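# Illustrative shape of the child state read below (hypothetical stream and cursor values, loosely
# following the ConcurrentPerPartitionCursor format):
#     {
#         "states": [{"partition": {"id": "p1"}, "cursor": {"updated_at": "2024-01-01"}}],
#         "parent_state": {"my_parent_stream": {"updated_at": "2024-01-01"}},
#     }
# _instantiate_parent_stream_state_manager later extracts the parent portion of this state so that the
# substream factory created below can seed the parent stream's cursor with it.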
3755 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 3756 3757 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3758 has_parent_state = bool( 3759 self._connector_state_manager.get_stream_state(stream_name, None) 3760 if model.incremental_dependency 3761 else False 3762 ) 3763 connector_state_manager = self._instantiate_parent_stream_state_manager( 3764 child_state, config, model, has_parent_state 3765 ) 3766 3767 substream_factory = ModelToComponentFactory( 3768 connector_state_manager=connector_state_manager, 3769 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3770 limit_slices_fetched=self._limit_slices_fetched, 3771 emit_connector_builder_messages=self._emit_connector_builder_messages, 3772 disable_retries=self._disable_retries, 3773 disable_cache=self._disable_cache, 3774 message_repository=StateFilteringMessageRepository( 3775 LogAppenderMessageRepositoryDecorator( 3776 { 3777 "airbyte_cdk": {"stream": {"is_substream": True}}, 3778 "http": {"is_auxiliary": True}, 3779 }, 3780 self._message_repository, 3781 self._evaluate_log_level(self._emit_connector_builder_messages), 3782 ), 3783 ), 3784 ) 3785 3786 return substream_factory.create_parent_stream_config( 3787 model=model, config=config, stream_name=stream_name, **kwargs 3788 ) 3789 3790 def _instantiate_parent_stream_state_manager( 3791 self, 3792 child_state: MutableMapping[str, Any], 3793 config: Config, 3794 model: ParentStreamConfigModel, 3795 has_parent_state: bool, 3796 ) -> ConnectorStateManager: 3797 """ 3798 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 3799 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 3800 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 3801 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 3802 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 3803 incremental_dependency is set. 
3804 """ 3805 if model.incremental_dependency and child_state: 3806 parent_stream_name = model.stream.name or "" 3807 parent_state = ConcurrentPerPartitionCursor.get_parent_state( 3808 child_state, parent_stream_name 3809 ) 3810 3811 if not parent_state: 3812 # there are two migration cases: state value from child stream or from global state 3813 parent_state = ConcurrentPerPartitionCursor.get_global_state( 3814 child_state, parent_stream_name 3815 ) 3816 3817 if not parent_state and not isinstance(parent_state, dict): 3818 cursor_values = child_state.values() 3819 if cursor_values: 3820 incremental_sync_model: Union[ 3821 DatetimeBasedCursorModel, 3822 IncrementingCountCursorModel, 3823 ] = ( 3824 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 3825 if isinstance(model.stream, DeclarativeStreamModel) 3826 else self._get_state_delegating_stream_model( 3827 has_parent_state, model.stream 3828 ).incremental_sync 3829 ) 3830 cursor_field = InterpolatedString.create( 3831 incremental_sync_model.cursor_field, 3832 parameters=incremental_sync_model.parameters or {}, 3833 ).eval(config) 3834 parent_state = AirbyteStateMessage( 3835 type=AirbyteStateType.STREAM, 3836 stream=AirbyteStreamState( 3837 stream_descriptor=StreamDescriptor( 3838 name=parent_stream_name, namespace=None 3839 ), 3840 stream_state=AirbyteStateBlob( 3841 {cursor_field: list(cursor_values)[0]} 3842 ), 3843 ), 3844 ) 3845 return ConnectorStateManager([parent_state] if parent_state else []) 3846 3847 return ConnectorStateManager([]) 3848 3849 @staticmethod 3850 def create_wait_time_from_header( 3851 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3852 ) -> WaitTimeFromHeaderBackoffStrategy: 3853 return WaitTimeFromHeaderBackoffStrategy( 3854 header=model.header, 3855 parameters=model.parameters or {}, 3856 config=config, 3857 regex=model.regex, 3858 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3859 if model.max_waiting_time_in_seconds is not None 3860 else None, 3861 ) 3862 3863 @staticmethod 3864 def create_wait_until_time_from_header( 3865 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3866 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3867 return WaitUntilTimeFromHeaderBackoffStrategy( 3868 header=model.header, 3869 parameters=model.parameters or {}, 3870 config=config, 3871 min_wait=model.min_wait, 3872 regex=model.regex, 3873 ) 3874 3875 def get_message_repository(self) -> MessageRepository: 3876 return self._message_repository 3877 3878 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3879 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3880 3881 @staticmethod 3882 def create_components_mapping_definition( 3883 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3884 ) -> ComponentMappingDefinition: 3885 interpolated_value = InterpolatedString.create( 3886 model.value, parameters=model.parameters or {} 3887 ) 3888 field_path = [ 3889 InterpolatedString.create(path, parameters=model.parameters or {}) 3890 for path in model.field_path 3891 ] 3892 return ComponentMappingDefinition( 3893 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3894 value=interpolated_value, 3895 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3896 create_or_update=model.create_or_update, 3897 condition=model.condition, 3898 
parameters=model.parameters or {}, 3899 ) 3900 3901 def create_http_components_resolver( 3902 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 3903 ) -> Any: 3904 retriever = self._create_component_from_model( 3905 model=model.retriever, 3906 config=config, 3907 name=f"{stream_name if stream_name else '__http_components_resolver'}", 3908 primary_key=None, 3909 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 3910 transformations=[], 3911 ) 3912 3913 components_mapping = [] 3914 for component_mapping_definition_model in model.components_mapping: 3915 if component_mapping_definition_model.condition: 3916 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 3917 components_mapping.append( 3918 self._create_component_from_model( 3919 model=component_mapping_definition_model, 3920 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3921 component_mapping_definition_model.value_type 3922 ), 3923 config=config, 3924 ) 3925 ) 3926 3927 return HttpComponentsResolver( 3928 retriever=retriever, 3929 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 3930 config=config, 3931 components_mapping=components_mapping, 3932 parameters=model.parameters or {}, 3933 ) 3934 3935 @staticmethod 3936 def create_stream_config( 3937 model: StreamConfigModel, config: Config, **kwargs: Any 3938 ) -> StreamConfig: 3939 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3940 [x for x in model.configs_pointer] if model.configs_pointer else [] 3941 ) 3942 3943 return StreamConfig( 3944 configs_pointer=model_configs_pointer, 3945 default_values=model.default_values, 3946 parameters=model.parameters or {}, 3947 ) 3948 3949 def create_config_components_resolver( 3950 self, 3951 model: ConfigComponentsResolverModel, 3952 config: Config, 3953 ) -> Any: 3954 model_stream_configs = ( 3955 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 3956 ) 3957 3958 stream_configs = [ 3959 self._create_component_from_model( 3960 stream_config, config=config, parameters=model.parameters or {} 3961 ) 3962 for stream_config in model_stream_configs 3963 ] 3964 3965 components_mapping = [ 3966 self._create_component_from_model( 3967 model=components_mapping_definition_model, 3968 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3969 components_mapping_definition_model.value_type 3970 ), 3971 config=config, 3972 parameters=model.parameters, 3973 ) 3974 for components_mapping_definition_model in model.components_mapping 3975 ] 3976 3977 return ConfigComponentsResolver( 3978 stream_configs=stream_configs, 3979 config=config, 3980 components_mapping=components_mapping, 3981 parameters=model.parameters or {}, 3982 ) 3983 3984 def create_parametrized_components_resolver( 3985 self, 3986 model: ParametrizedComponentsResolverModel, 3987 config: Config, 3988 ) -> ParametrizedComponentsResolver: 3989 stream_parameters = StreamParametersDefinition( 3990 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 3991 ) 3992 3993 components_mapping = [] 3994 for components_mapping_definition_model in model.components_mapping: 3995 if components_mapping_definition_model.condition: 3996 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 3997 components_mapping.append( 3998 self._create_component_from_model( 3999 model=components_mapping_definition_model, 4000 
value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4001 components_mapping_definition_model.value_type 4002 ), 4003 config=config, 4004 ) 4005 ) 4006 return ParametrizedComponentsResolver( 4007 stream_parameters=stream_parameters, 4008 config=config, 4009 components_mapping=components_mapping, 4010 parameters=model.parameters or {}, 4011 ) 4012 4013 _UNSUPPORTED_DECODER_ERROR = ( 4014 "Specified decoder of {decoder_type} is not supported for pagination." 4015 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4016 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4017 ) 4018 4019 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4020 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4021 return True 4022 elif isinstance(decoder, CompositeRawDecoder): 4023 return self._is_supported_parser_for_pagination(decoder.parser) 4024 else: 4025 return False 4026 4027 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4028 if isinstance(parser, JsonParser): 4029 return True 4030 elif isinstance(parser, GzipParser): 4031 return isinstance(parser.inner_parser, JsonParser) 4032 else: 4033 return False 4034 4035 def create_http_api_budget( 4036 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4037 ) -> HttpAPIBudget: 4038 policies = [ 4039 self._create_component_from_model(model=policy, config=config) 4040 for policy in model.policies 4041 ] 4042 4043 return HttpAPIBudget( 4044 policies=policies, 4045 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4046 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4047 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4048 ) 4049 4050 def create_fixed_window_call_rate_policy( 4051 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4052 ) -> FixedWindowCallRatePolicy: 4053 matchers = [ 4054 self._create_component_from_model(model=matcher, config=config) 4055 for matcher in model.matchers 4056 ] 4057 4058 # Set the initial reset timestamp to 10 days from now. 4059 # This value will be updated by the first request. 
4060 return FixedWindowCallRatePolicy( 4061 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4062 period=parse_duration(model.period), 4063 call_limit=model.call_limit, 4064 matchers=matchers, 4065 ) 4066 4067 def create_file_uploader( 4068 self, model: FileUploaderModel, config: Config, **kwargs: Any 4069 ) -> FileUploader: 4070 name = "File Uploader" 4071 requester = self._create_component_from_model( 4072 model=model.requester, 4073 config=config, 4074 name=name, 4075 **kwargs, 4076 ) 4077 download_target_extractor = self._create_component_from_model( 4078 model=model.download_target_extractor, 4079 config=config, 4080 name=name, 4081 **kwargs, 4082 ) 4083 emit_connector_builder_messages = self._emit_connector_builder_messages 4084 file_uploader = DefaultFileUploader( 4085 requester=requester, 4086 download_target_extractor=download_target_extractor, 4087 config=config, 4088 file_writer=NoopFileWriter() 4089 if emit_connector_builder_messages 4090 else LocalFileSystemFileWriter(), 4091 parameters=model.parameters or {}, 4092 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4093 ) 4094 4095 return ( 4096 ConnectorBuilderFileUploader(file_uploader) 4097 if emit_connector_builder_messages 4098 else file_uploader 4099 ) 4100 4101 def create_moving_window_call_rate_policy( 4102 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4103 ) -> MovingWindowCallRatePolicy: 4104 rates = [ 4105 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4106 ] 4107 matchers = [ 4108 self._create_component_from_model(model=matcher, config=config) 4109 for matcher in model.matchers 4110 ] 4111 return MovingWindowCallRatePolicy( 4112 rates=rates, 4113 matchers=matchers, 4114 ) 4115 4116 def create_unlimited_call_rate_policy( 4117 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4118 ) -> UnlimitedCallRatePolicy: 4119 matchers = [ 4120 self._create_component_from_model(model=matcher, config=config) 4121 for matcher in model.matchers 4122 ] 4123 4124 return UnlimitedCallRatePolicy( 4125 matchers=matchers, 4126 ) 4127 4128 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4129 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4130 return Rate( 4131 limit=int(interpolated_limit.eval(config=config)), 4132 interval=parse_duration(model.interval), 4133 ) 4134 4135 def create_http_request_matcher( 4136 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4137 ) -> HttpRequestRegexMatcher: 4138 return HttpRequestRegexMatcher( 4139 method=model.method, 4140 url_base=model.url_base, 4141 url_path_pattern=model.url_path_pattern, 4142 params=model.params, 4143 headers=model.headers, 4144 ) 4145 4146 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4147 self._api_budget = self.create_component( 4148 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4149 ) 4150 4151 def create_grouping_partition_router( 4152 self, 4153 model: GroupingPartitionRouterModel, 4154 config: Config, 4155 *, 4156 stream_name: str, 4157 **kwargs: Any, 4158 ) -> GroupingPartitionRouter: 4159 underlying_router = self._create_component_from_model( 4160 model=model.underlying_partition_router, 4161 config=config, 4162 stream_name=stream_name, 4163 **kwargs, 4164 ) 4165 if model.group_size < 1: 4166 raise ValueError(f"Group size must be greater than 0, got 
{model.group_size}") 4167 4168 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4169 # because they are specific to individual partitions and cannot be aggregated or handled 4170 # when grouping, potentially leading to incorrect API calls. Any request customization 4171 # should be managed at the stream level through the requester's configuration. 4172 if isinstance(underlying_router, SubstreamPartitionRouter): 4173 if any( 4174 parent_config.request_option 4175 for parent_config in underlying_router.parent_stream_configs 4176 ): 4177 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4178 4179 if isinstance(underlying_router, ListPartitionRouter): 4180 if underlying_router.request_option: 4181 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4182 4183 return GroupingPartitionRouter( 4184 group_size=model.group_size, 4185 underlying_partition_router=underlying_router, 4186 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4187 config=config, 4188 )
652 def __init__( 653 self, 654 limit_pages_fetched_per_slice: Optional[int] = None, 655 limit_slices_fetched: Optional[int] = None, 656 emit_connector_builder_messages: bool = False, 657 disable_retries: bool = False, 658 disable_cache: bool = False, 659 message_repository: Optional[MessageRepository] = None, 660 connector_state_manager: Optional[ConnectorStateManager] = None, 661 max_concurrent_async_job_count: Optional[int] = None, 662 ): 663 self._init_mappings() 664 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 665 self._limit_slices_fetched = limit_slices_fetched 666 self._emit_connector_builder_messages = emit_connector_builder_messages 667 self._disable_retries = disable_retries 668 self._disable_cache = disable_cache 669 self._message_repository = message_repository or InMemoryMessageRepository( 670 self._evaluate_log_level(emit_connector_builder_messages) 671 ) 672 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 673 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 674 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 675 # placeholder for deprecation warnings 676 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
790 def create_component( 791 self, 792 model_type: Type[BaseModel], 793 component_definition: ComponentDefinition, 794 config: Config, 795 **kwargs: Any, 796 ) -> Any: 797 """ 798 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 799 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 800 creating declarative components from that model. 801 802 :param model_type: The type of declarative component that is being initialized 803 :param component_definition: The mapping that represents a declarative component 804 :param config: The connector config that is provided by the customer 805 :return: The declarative component to be used at runtime 806 """ 807 808 component_type = component_definition.get("type") 809 if component_definition.get("type") != model_type.__name__: 810 raise ValueError( 811 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 812 ) 813 814 declarative_component_model = model_type.parse_obj(component_definition) 815 816 if not isinstance(declarative_component_model, model_type): 817 raise ValueError( 818 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 819 ) 820 821 return self._create_component_from_model( 822 model=declarative_component_model, config=config, **kwargs 823 )
Takes a given Pydantic model type and a Mapping representing a component definition, and creates the declarative component and subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating declarative components from that model.
Parameters
- model_type: The type of declarative component that is being initialized
- component_definition: The mapping that represents a declarative component
- config: The connector config that is provided by the customer
Returns
The declarative component to be used at runtime
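A minimal usage sketch of the flow described above. The sample definition, the empty config, and the import alias are assumptions (the alias mirrors how this factory imports the generated schema models); this is not a prescribed invocation.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    CheckStream as CheckStreamModel,  # assumed alias, matching this factory's own import style
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()

# Hypothetical manifest snippet: `type` must equal the model class name.
definition = {"type": "CheckStream", "stream_names": ["users"]}

# The mapping is parsed into the Pydantic model, then dispatched to the
# matching create_* method to build the runtime component.
checker = factory.create_component(
    model_type=CheckStreamModel,
    component_definition=definition,
    config={},  # connector config provided by the user
)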
840 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 841 """ 842 Returns the deprecation warnings that were collected during the creation of components. 843 """ 844 return self._collected_deprecation_logs
Returns the deprecation warnings that were collected during the creation of components.
861 def create_config_migration( 862 self, model: ConfigMigrationModel, config: Config 863 ) -> ConfigMigration: 864 transformations: List[ConfigTransformation] = [ 865 self._create_component_from_model(transformation, config) 866 for transformation in model.transformations 867 ] 868 869 return ConfigMigration( 870 description=model.description, 871 transformations=transformations, 872 )
874 def create_config_add_fields( 875 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 876 ) -> ConfigAddFields: 877 fields = [self._create_component_from_model(field, config) for field in model.fields] 878 return ConfigAddFields( 879 fields=fields, 880 condition=model.condition or "", 881 )
930 @staticmethod 931 def create_added_field_definition( 932 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 933 ) -> AddedFieldDefinition: 934 interpolated_value = InterpolatedString.create( 935 model.value, parameters=model.parameters or {} 936 ) 937 return AddedFieldDefinition( 938 path=model.path, 939 value=interpolated_value, 940 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 941 parameters=model.parameters or {}, 942 )
944 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 945 added_field_definitions = [ 946 self._create_component_from_model( 947 model=added_field_definition_model, 948 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 949 added_field_definition_model.value_type 950 ), 951 config=config, 952 ) 953 for added_field_definition_model in model.fields 954 ] 955 return AddFields( 956 fields=added_field_definitions, 957 condition=model.condition or "", 958 parameters=model.parameters or {}, 959 )
985 def create_dpath_flatten_fields( 986 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 987 ) -> DpathFlattenFields: 988 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 989 key_transformation = ( 990 KeyTransformation( 991 config=config, 992 prefix=model.key_transformation.prefix, 993 suffix=model.key_transformation.suffix, 994 parameters=model.parameters or {}, 995 ) 996 if model.key_transformation is not None 997 else None 998 ) 999 return DpathFlattenFields( 1000 config=config, 1001 field_path=model_field_path, 1002 delete_origin_value=model.delete_origin_value 1003 if model.delete_origin_value is not None 1004 else False, 1005 replace_record=model.replace_record if model.replace_record is not None else False, 1006 key_transformation=key_transformation, 1007 parameters=model.parameters or {}, 1008 )
1022 def create_api_key_authenticator( 1023 self, 1024 model: ApiKeyAuthenticatorModel, 1025 config: Config, 1026 token_provider: Optional[TokenProvider] = None, 1027 **kwargs: Any, 1028 ) -> ApiKeyAuthenticator: 1029 if model.inject_into is None and model.header is None: 1030 raise ValueError( 1031 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1032 ) 1033 1034 if model.inject_into is not None and model.header is not None: 1035 raise ValueError( 1036 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1037 ) 1038 1039 if token_provider is not None and model.api_token != "": 1040 raise ValueError( 1041 "If token_provider is set, api_token is ignored and has to be set to empty string." 1042 ) 1043 1044 request_option = ( 1045 self._create_component_from_model( 1046 model.inject_into, config, parameters=model.parameters or {} 1047 ) 1048 if model.inject_into 1049 else RequestOption( 1050 inject_into=RequestOptionType.header, 1051 field_name=model.header or "", 1052 parameters=model.parameters or {}, 1053 ) 1054 ) 1055 1056 return ApiKeyAuthenticator( 1057 token_provider=( 1058 token_provider 1059 if token_provider is not None 1060 else InterpolatedStringTokenProvider( 1061 api_token=model.api_token or "", 1062 config=config, 1063 parameters=model.parameters or {}, 1064 ) 1065 ), 1066 request_option=request_option, 1067 config=config, 1068 parameters=model.parameters or {}, 1069 )
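To make the validation rules above concrete, a hedged sketch follows; the token template, header name, and the model import alias are assumptions. Only one of the deprecated `header` shortcut or the `inject_into` request option may be provided, and neither being set is also rejected.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    ApiKeyAuthenticator as ApiKeyAuthenticatorModel,  # assumed alias
)
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()

# Preferred form: an explicit RequestOption describing where to inject the token.
model = ApiKeyAuthenticatorModel.parse_obj(
    {
        "type": "ApiKeyAuthenticator",
        "api_token": "{{ config['api_key'] }}",
        "inject_into": {
            "type": "RequestOption",
            "inject_into": "header",
            "field_name": "X-API-KEY",
        },
    }
)
authenticator = factory.create_api_key_authenticator(model, config={"api_key": "..."})

# Setting both `header` and `inject_into`, or neither, raises a ValueError;
# passing a `token_provider` additionally requires api_token to be "".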
1071 def create_legacy_to_per_partition_state_migration( 1072 self, 1073 model: LegacyToPerPartitionStateMigrationModel, 1074 config: Mapping[str, Any], 1075 declarative_stream: DeclarativeStreamModel, 1076 ) -> LegacyToPerPartitionStateMigration: 1077 retriever = declarative_stream.retriever 1078 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1079 raise ValueError( 1080 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1081 ) 1082 partition_router = retriever.partition_router 1083 if not isinstance( 1084 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1085 ): 1086 raise ValueError( 1087 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1088 ) 1089 if not hasattr(partition_router, "parent_stream_configs"): 1090 raise ValueError( 1091 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1092 ) 1093 1094 if not hasattr(declarative_stream, "incremental_sync"): 1095 raise ValueError( 1096 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1097 ) 1098 1099 return LegacyToPerPartitionStateMigration( 1100 partition_router, # type: ignore # was already checked above 1101 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 1102 config, 1103 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1104 )
1106 def create_session_token_authenticator( 1107 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1108 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1109 decoder = ( 1110 self._create_component_from_model(model=model.decoder, config=config) 1111 if model.decoder 1112 else JsonDecoder(parameters={}) 1113 ) 1114 login_requester = self._create_component_from_model( 1115 model=model.login_requester, 1116 config=config, 1117 name=f"{name}_login_requester", 1118 decoder=decoder, 1119 ) 1120 token_provider = SessionTokenProvider( 1121 login_requester=login_requester, 1122 session_token_path=model.session_token_path, 1123 expiration_duration=parse_duration(model.expiration_duration) 1124 if model.expiration_duration 1125 else None, 1126 parameters=model.parameters or {}, 1127 message_repository=self._message_repository, 1128 decoder=decoder, 1129 ) 1130 if model.request_authentication.type == "Bearer": 1131 return ModelToComponentFactory.create_bearer_authenticator( 1132 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1133 config, 1134 token_provider=token_provider, 1135 ) 1136 else: 1137 return self.create_api_key_authenticator( 1138 ApiKeyAuthenticatorModel( 1139 type="ApiKeyAuthenticator", 1140 api_token="", 1141 inject_into=model.request_authentication.inject_into, 1142 ), # type: ignore # $parameters and headers default to None 1143 config=config, 1144 token_provider=token_provider, 1145 )
1147 @staticmethod 1148 def create_basic_http_authenticator( 1149 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1150 ) -> BasicHttpAuthenticator: 1151 return BasicHttpAuthenticator( 1152 password=model.password or "", 1153 username=model.username, 1154 config=config, 1155 parameters=model.parameters or {}, 1156 )
1158 @staticmethod 1159 def create_bearer_authenticator( 1160 model: BearerAuthenticatorModel, 1161 config: Config, 1162 token_provider: Optional[TokenProvider] = None, 1163 **kwargs: Any, 1164 ) -> BearerAuthenticator: 1165 if token_provider is not None and model.api_token != "": 1166 raise ValueError( 1167 "If token_provider is set, api_token is ignored and has to be set to empty string." 1168 ) 1169 return BearerAuthenticator( 1170 token_provider=( 1171 token_provider 1172 if token_provider is not None 1173 else InterpolatedStringTokenProvider( 1174 api_token=model.api_token or "", 1175 config=config, 1176 parameters=model.parameters or {}, 1177 ) 1178 ), 1179 config=config, 1180 parameters=model.parameters or {}, 1181 )
1183 @staticmethod 1184 def create_dynamic_stream_check_config( 1185 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1186 ) -> DynamicStreamCheckConfig: 1187 return DynamicStreamCheckConfig( 1188 dynamic_stream_name=model.dynamic_stream_name, 1189 stream_count=model.stream_count or 0, 1190 )
1192 def create_check_stream( 1193 self, model: CheckStreamModel, config: Config, **kwargs: Any 1194 ) -> CheckStream: 1195 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1196 raise ValueError( 1197 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1198 ) 1199 1200 dynamic_streams_check_configs = ( 1201 [ 1202 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1203 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1204 ] 1205 if model.dynamic_streams_check_configs 1206 else [] 1207 ) 1208 1209 return CheckStream( 1210 stream_names=model.stream_names or [], 1211 dynamic_streams_check_configs=dynamic_streams_check_configs, 1212 parameters={}, 1213 )
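The two accepted shapes for a CheckStream definition follow directly from the validation above; this sketch uses assumed stream names and shows plain mappings rather than a full manifest.

# Static check: probe one or more named streams.
static_check = {"type": "CheckStream", "stream_names": ["users", "orders"]}

# Dynamic check: probe streams produced by a dynamic stream definition instead.
dynamic_check = {
    "type": "CheckStream",
    "dynamic_streams_check_configs": [
        {
            "type": "DynamicStreamCheckConfig",
            "dynamic_stream_name": "custom_reports",
            "stream_count": 1,
        }
    ],
}

# Providing neither key raises a ValueError in create_check_stream.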
1215 @staticmethod 1216 def create_check_dynamic_stream( 1217 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1218 ) -> CheckDynamicStream: 1219 assert model.use_check_availability is not None # for mypy 1220 1221 use_check_availability = model.use_check_availability 1222 1223 return CheckDynamicStream( 1224 stream_count=model.stream_count, 1225 use_check_availability=use_check_availability, 1226 parameters={}, 1227 )
1229 def create_composite_error_handler( 1230 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1231 ) -> CompositeErrorHandler: 1232 error_handlers = [ 1233 self._create_component_from_model(model=error_handler_model, config=config) 1234 for error_handler_model in model.error_handlers 1235 ] 1236 return CompositeErrorHandler( 1237 error_handlers=error_handlers, parameters=model.parameters or {} 1238 )
1240 @staticmethod 1241 def create_concurrency_level( 1242 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1243 ) -> ConcurrencyLevel: 1244 return ConcurrencyLevel( 1245 default_concurrency=model.default_concurrency, 1246 max_concurrency=model.max_concurrency, 1247 config=config, 1248 parameters={}, 1249 )
1251 @staticmethod 1252 def apply_stream_state_migrations( 1253 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1254 ) -> MutableMapping[str, Any]: 1255 if stream_state_migrations: 1256 for state_migration in stream_state_migrations: 1257 if state_migration.should_migrate(stream_state): 1258 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 1259 stream_state = dict(state_migration.migrate(stream_state)) 1260 return stream_state
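The migrations passed to apply_stream_state_migrations only need to expose should_migrate and migrate; the class below is a hypothetical illustration of that duck-typed contract, not a CDK component.

from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)


class RenameCursorKeyMigration:
    """Hypothetical migration that renames a legacy cursor key."""

    def should_migrate(self, stream_state):
        return "updated" in stream_state

    def migrate(self, stream_state):
        return {"updated_at": stream_state["updated"]}


migrated = ModelToComponentFactory.apply_stream_state_migrations(
    [RenameCursorKeyMigration()],
    {"updated": "2024-01-01T00:00:00Z"},
)
# migrated == {"updated_at": "2024-01-01T00:00:00Z"}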
1262 def create_concurrent_cursor_from_datetime_based_cursor( 1263 self, 1264 model_type: Type[BaseModel], 1265 component_definition: ComponentDefinition, 1266 stream_name: str, 1267 stream_namespace: Optional[str], 1268 config: Config, 1269 message_repository: Optional[MessageRepository] = None, 1270 runtime_lookback_window: Optional[datetime.timedelta] = None, 1271 stream_state_migrations: Optional[List[Any]] = None, 1272 **kwargs: Any, 1273 ) -> ConcurrentCursor: 1274 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1275 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1276 # incoming state and connector_state_manager that is initialized when the component factory is created 1277 stream_state = ( 1278 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1279 if "stream_state" not in kwargs 1280 else kwargs["stream_state"] 1281 ) 1282 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1283 1284 component_type = component_definition.get("type") 1285 if component_definition.get("type") != model_type.__name__: 1286 raise ValueError( 1287 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1288 ) 1289 1290 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1291 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1292 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1293 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 
1294 if "$parameters" not in component_definition and "parameters" in component_definition: 1295 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1296 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1297 1298 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1299 raise ValueError( 1300 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1301 ) 1302 1303 model_parameters = datetime_based_cursor_model.parameters or {} 1304 interpolated_cursor_field = InterpolatedString.create( 1305 datetime_based_cursor_model.cursor_field, 1306 parameters=model_parameters, 1307 ) 1308 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1309 1310 interpolated_partition_field_start = InterpolatedString.create( 1311 datetime_based_cursor_model.partition_field_start or "start_time", 1312 parameters=model_parameters, 1313 ) 1314 interpolated_partition_field_end = InterpolatedString.create( 1315 datetime_based_cursor_model.partition_field_end or "end_time", 1316 parameters=model_parameters, 1317 ) 1318 1319 slice_boundary_fields = ( 1320 interpolated_partition_field_start.eval(config=config), 1321 interpolated_partition_field_end.eval(config=config), 1322 ) 1323 1324 datetime_format = datetime_based_cursor_model.datetime_format 1325 1326 cursor_granularity = ( 1327 parse_duration(datetime_based_cursor_model.cursor_granularity) 1328 if datetime_based_cursor_model.cursor_granularity 1329 else None 1330 ) 1331 1332 lookback_window = None 1333 interpolated_lookback_window = ( 1334 InterpolatedString.create( 1335 datetime_based_cursor_model.lookback_window, 1336 parameters=model_parameters, 1337 ) 1338 if datetime_based_cursor_model.lookback_window 1339 else None 1340 ) 1341 if interpolated_lookback_window: 1342 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1343 if evaluated_lookback_window: 1344 lookback_window = parse_duration(evaluated_lookback_window) 1345 1346 connector_state_converter: DateTimeStreamStateConverter 1347 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1348 datetime_format=datetime_format, 1349 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1350 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1351 cursor_granularity=cursor_granularity, 1352 ) 1353 1354 # Adjusts the stream state by applying the runtime lookback window. 1355 # This is used to ensure correct state handling in case of failed partitions. 
1356 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1357 if runtime_lookback_window and stream_state_value: 1358 new_stream_state = ( 1359 connector_state_converter.parse_timestamp(stream_state_value) 1360 - runtime_lookback_window 1361 ) 1362 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1363 new_stream_state 1364 ) 1365 1366 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1367 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1368 start_date_runtime_value = self.create_min_max_datetime( 1369 model=datetime_based_cursor_model.start_datetime, config=config 1370 ) 1371 else: 1372 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1373 1374 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1375 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1376 end_date_runtime_value = self.create_min_max_datetime( 1377 model=datetime_based_cursor_model.end_datetime, config=config 1378 ) 1379 else: 1380 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1381 1382 interpolated_start_date = MinMaxDatetime.create( 1383 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1384 parameters=datetime_based_cursor_model.parameters, 1385 ) 1386 interpolated_end_date = ( 1387 None 1388 if not end_date_runtime_value 1389 else MinMaxDatetime.create( 1390 end_date_runtime_value, datetime_based_cursor_model.parameters 1391 ) 1392 ) 1393 1394 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1395 if not interpolated_start_date.datetime_format: 1396 interpolated_start_date.datetime_format = datetime_format 1397 if interpolated_end_date and not interpolated_end_date.datetime_format: 1398 interpolated_end_date.datetime_format = datetime_format 1399 1400 start_date = interpolated_start_date.get_datetime(config=config) 1401 end_date_provider = ( 1402 partial(interpolated_end_date.get_datetime, config) 1403 if interpolated_end_date 1404 else connector_state_converter.get_end_provider() 1405 ) 1406 1407 if ( 1408 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1409 ) or ( 1410 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1411 ): 1412 raise ValueError( 1413 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1414 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1415 ) 1416 1417 # When step is not defined, default to a step size from the starting date to the present moment 1418 step_length = datetime.timedelta.max 1419 interpolated_step = ( 1420 InterpolatedString.create( 1421 datetime_based_cursor_model.step, 1422 parameters=model_parameters, 1423 ) 1424 if datetime_based_cursor_model.step 1425 else None 1426 ) 1427 if interpolated_step: 1428 evaluated_step = interpolated_step.eval(config) 1429 if evaluated_step: 1430 step_length = parse_duration(evaluated_step) 1431 1432 clamping_strategy: ClampingStrategy = NoClamping() 1433 if datetime_based_cursor_model.clamping: 1434 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1435 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1436 # object which we want to keep agnostic of being low-code 1437 target = InterpolatedString( 1438 string=datetime_based_cursor_model.clamping.target, 1439 parameters=model_parameters, 1440 ) 1441 evaluated_target = target.eval(config=config) 1442 match evaluated_target: 1443 case "DAY": 1444 clamping_strategy = DayClampingStrategy() 1445 end_date_provider = ClampingEndProvider( 1446 DayClampingStrategy(is_ceiling=False), 1447 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1448 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1449 ) 1450 case "WEEK": 1451 if ( 1452 not datetime_based_cursor_model.clamping.target_details 1453 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1454 ): 1455 raise ValueError( 1456 "Given WEEK clamping, weekday needs to be provided as target_details" 1457 ) 1458 weekday = self._assemble_weekday( 1459 datetime_based_cursor_model.clamping.target_details["weekday"] 1460 ) 1461 clamping_strategy = WeekClampingStrategy(weekday) 1462 end_date_provider = ClampingEndProvider( 1463 WeekClampingStrategy(weekday, is_ceiling=False), 1464 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1465 granularity=cursor_granularity or datetime.timedelta(days=1), 1466 ) 1467 case "MONTH": 1468 clamping_strategy = MonthClampingStrategy() 1469 end_date_provider = ClampingEndProvider( 1470 MonthClampingStrategy(is_ceiling=False), 1471 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1472 granularity=cursor_granularity or datetime.timedelta(days=1), 1473 ) 1474 case _: 1475 raise ValueError( 1476 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1477 ) 1478 1479 return ConcurrentCursor( 1480 stream_name=stream_name, 1481 stream_namespace=stream_namespace, 1482 stream_state=stream_state, 1483 message_repository=message_repository or self._message_repository, 1484 connector_state_manager=self._connector_state_manager, 1485 connector_state_converter=connector_state_converter, 1486 cursor_field=cursor_field, 1487 slice_boundary_fields=slice_boundary_fields, 1488 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1489 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1490 lookback_window=lookback_window, 1491 slice_range=step_length, 1492 cursor_granularity=cursor_granularity, 1493 clamping_strategy=clamping_strategy, 1494 )
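To ground the FIXME note near the top of this method, these are the two ComponentDefinition shapes it has to accept; the cursor fields and values are illustrative assumptions, and only the location of the parameters key differs.

# Shape 1: definition built from model.__dict__, so parameters live under "parameters".
from_model_dict = {
    "type": "DatetimeBasedCursor",
    "cursor_field": "updated_at",
    "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
    "start_datetime": "{{ config['start_date'] }}",
    "parameters": {"cursor_field": "updated_at"},
}

# Shape 2: definition taken straight from the manifest, so parameters live under "$parameters".
from_manifest = {
    "type": "DatetimeBasedCursor",
    "cursor_field": "updated_at",
    "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
    "start_datetime": "{{ config['start_date'] }}",
    "$parameters": {"cursor_field": "updated_at"},
}

# The method copies "parameters" to "$parameters" before parse_obj so that
# interpolation parameters are available in both cases.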
1496 def create_concurrent_cursor_from_incrementing_count_cursor( 1497 self, 1498 model_type: Type[BaseModel], 1499 component_definition: ComponentDefinition, 1500 stream_name: str, 1501 stream_namespace: Optional[str], 1502 config: Config, 1503 message_repository: Optional[MessageRepository] = None, 1504 stream_state_migrations: Optional[List[Any]] = None, 1505 **kwargs: Any, 1506 ) -> ConcurrentCursor: 1507 # Per-partition incremental streams can dynamically create child cursors which will pass their current 1508 # state via the stream_state keyword argument. Incremental syncs without parent streams use the 1509 # incoming state and connector_state_manager that is initialized when the component factory is created 1510 stream_state = ( 1511 self._connector_state_manager.get_stream_state(stream_name, stream_namespace) 1512 if "stream_state" not in kwargs 1513 else kwargs["stream_state"] 1514 ) 1515 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1516 1517 component_type = component_definition.get("type") 1518 if component_definition.get("type") != model_type.__name__: 1519 raise ValueError( 1520 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1521 ) 1522 1523 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1524 1525 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1526 raise ValueError( 1527 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1528 ) 1529 1530 interpolated_start_value = ( 1531 InterpolatedString.create( 1532 incrementing_count_cursor_model.start_value, # type: ignore 1533 parameters=incrementing_count_cursor_model.parameters or {}, 1534 ) 1535 if incrementing_count_cursor_model.start_value 1536 else 0 1537 ) 1538 1539 interpolated_cursor_field = InterpolatedString.create( 1540 incrementing_count_cursor_model.cursor_field, 1541 parameters=incrementing_count_cursor_model.parameters or {}, 1542 ) 1543 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1544 1545 connector_state_converter = IncrementingCountStreamStateConverter( 1546 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1547 ) 1548 1549 return ConcurrentCursor( 1550 stream_name=stream_name, 1551 stream_namespace=stream_namespace, 1552 stream_state=stream_state, 1553 message_repository=message_repository or self._message_repository, 1554 connector_state_manager=self._connector_state_manager, 1555 connector_state_converter=connector_state_converter, 1556 cursor_field=cursor_field, 1557 slice_boundary_fields=None, 1558 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1559 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1560 )
1581 def create_concurrent_cursor_from_perpartition_cursor( 1582 self, 1583 state_manager: ConnectorStateManager, 1584 model_type: Type[BaseModel], 1585 component_definition: ComponentDefinition, 1586 stream_name: str, 1587 stream_namespace: Optional[str], 1588 config: Config, 1589 stream_state: MutableMapping[str, Any], 1590 partition_router: PartitionRouter, 1591 stream_state_migrations: Optional[List[Any]] = None, 1592 attempt_to_create_cursor_if_not_provided: bool = False, 1593 **kwargs: Any, 1594 ) -> ConcurrentPerPartitionCursor: 1595 component_type = component_definition.get("type") 1596 if component_definition.get("type") != model_type.__name__: 1597 raise ValueError( 1598 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1599 ) 1600 1601 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1602 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1603 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1604 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1605 if "$parameters" not in component_definition and "parameters" in component_definition: 1606 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1607 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1608 1609 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1610 raise ValueError( 1611 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1612 ) 1613 1614 interpolated_cursor_field = InterpolatedString.create( 1615 datetime_based_cursor_model.cursor_field, 1616 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1617 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1618 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1619 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 
1620 parameters=datetime_based_cursor_model.parameters or {}, 1621 ) 1622 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1623 1624 datetime_format = datetime_based_cursor_model.datetime_format 1625 1626 cursor_granularity = ( 1627 parse_duration(datetime_based_cursor_model.cursor_granularity) 1628 if datetime_based_cursor_model.cursor_granularity 1629 else None 1630 ) 1631 1632 connector_state_converter: DateTimeStreamStateConverter 1633 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1634 datetime_format=datetime_format, 1635 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1636 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1637 cursor_granularity=cursor_granularity, 1638 ) 1639 1640 # Create the cursor factory 1641 cursor_factory = ConcurrentCursorFactory( 1642 partial( 1643 self.create_concurrent_cursor_from_datetime_based_cursor, 1644 state_manager=state_manager, 1645 model_type=model_type, 1646 component_definition=component_definition, 1647 stream_name=stream_name, 1648 stream_namespace=stream_namespace, 1649 config=config, 1650 message_repository=NoopMessageRepository(), 1651 # stream_state_migrations=stream_state_migrations, # FIXME is it expected to run migration on per partition state too? 1652 ) 1653 ) 1654 1655 stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state) 1656 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1657 use_global_cursor = isinstance( 1658 partition_router, GroupingPartitionRouter 1659 ) or component_definition.get("global_substream_cursor", False) 1660 1661 # Return the concurrent cursor and state converter 1662 return ConcurrentPerPartitionCursor( 1663 cursor_factory=cursor_factory, 1664 partition_router=partition_router, 1665 stream_name=stream_name, 1666 stream_namespace=stream_namespace, 1667 stream_state=stream_state, 1668 message_repository=self._message_repository, # type: ignore 1669 connector_state_manager=state_manager, 1670 connector_state_converter=connector_state_converter, 1671 cursor_field=cursor_field, 1672 use_global_cursor=use_global_cursor, 1673 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1674 )
1676 @staticmethod 1677 def create_constant_backoff_strategy( 1678 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1679 ) -> ConstantBackoffStrategy: 1680 return ConstantBackoffStrategy( 1681 backoff_time_in_seconds=model.backoff_time_in_seconds, 1682 config=config, 1683 parameters=model.parameters or {}, 1684 )
1686 def create_cursor_pagination( 1687 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1688 ) -> CursorPaginationStrategy: 1689 if isinstance(decoder, PaginationDecoderDecorator): 1690 inner_decoder = decoder.decoder 1691 else: 1692 inner_decoder = decoder 1693 decoder = PaginationDecoderDecorator(decoder=decoder) 1694 1695 if self._is_supported_decoder_for_pagination(inner_decoder): 1696 decoder_to_use = decoder 1697 else: 1698 raise ValueError( 1699 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1700 ) 1701 1702 return CursorPaginationStrategy( 1703 cursor_value=model.cursor_value, 1704 decoder=decoder_to_use, 1705 page_size=model.page_size, 1706 stop_condition=model.stop_condition, 1707 config=config, 1708 parameters=model.parameters or {}, 1709 )
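Only a narrow set of decoders is accepted here, per _is_supported_decoder_for_pagination. A minimal sketch of the wrapping behaviour; the JsonDecoder and PaginationDecoderDecorator calls mirror ones used elsewhere in this module, and anything beyond that is an assumption.

from airbyte_cdk.sources.declarative.decoders import JsonDecoder, PaginationDecoderDecorator

# Supported: JsonDecoder, XmlDecoder, or a CompositeRawDecoder whose innermost
# parser is a JsonParser (optionally wrapped in a GzipParser). Anything else
# raises _UNSUPPORTED_DECODER_ERROR.
decoder = JsonDecoder(parameters={})

# If the caller's decoder is not already a PaginationDecoderDecorator, the
# method wraps it before handing it to CursorPaginationStrategy.
decoder_to_use = PaginationDecoderDecorator(decoder=decoder)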
1711 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1712 """ 1713 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1714 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1715 :param model: The Pydantic model of the custom component being created 1716 :param config: The custom defined connector config 1717 :return: The declarative component built from the Pydantic model to be used at runtime 1718 """ 1719 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1720 component_fields = get_type_hints(custom_component_class) 1721 model_args = model.dict() 1722 model_args["config"] = config 1723 1724 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1725 # we defer to these arguments over the component's definition 1726 for key, arg in kwargs.items(): 1727 model_args[key] = arg 1728 1729 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1730 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1731 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1732 for model_field, model_value in model_args.items(): 1733 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1734 if ( 1735 isinstance(model_value, dict) 1736 and "type" not in model_value 1737 and model_field in component_fields 1738 ): 1739 derived_type = self._derive_component_type_from_type_hints( 1740 component_fields.get(model_field) 1741 ) 1742 if derived_type: 1743 model_value["type"] = derived_type 1744 1745 if self._is_component(model_value): 1746 model_args[model_field] = self._create_nested_component( 1747 model, 1748 model_field, 1749 model_value, 1750 config, 1751 **kwargs, 1752 ) 1753 elif isinstance(model_value, list): 1754 vals = [] 1755 for v in model_value: 1756 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1757 derived_type = self._derive_component_type_from_type_hints( 1758 component_fields.get(model_field) 1759 ) 1760 if derived_type: 1761 v["type"] = derived_type 1762 if self._is_component(v): 1763 vals.append( 1764 self._create_nested_component( 1765 model, 1766 model_field, 1767 v, 1768 config, 1769 **kwargs, 1770 ) 1771 ) 1772 else: 1773 vals.append(v) 1774 model_args[model_field] = vals 1775 1776 kwargs = { 1777 class_field: model_args[class_field] 1778 for class_field in component_fields.keys() 1779 if class_field in model_args 1780 } 1781 return custom_component_class(**kwargs)
Generically creates a custom component based on the model type and a class_name reference to the custom Python class being instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor.
Parameters
- model: The Pydantic model of the custom component being created
- config: The custom defined connector config
Returns
The declarative component built from the Pydantic model to be used at runtime
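As an illustration of the behaviour described above, a hedged sketch follows; the class, module path, and definition are hypothetical. The type hints declared on the custom class decide which of the definition's extra properties are forwarded to the constructor, and dict-valued sub-fields are recursively built into components first.

from dataclasses import dataclass
from typing import Any, Iterable, Mapping

import requests

from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor


@dataclass
class MyCustomExtractor(RecordExtractor):
    # Fields declared here (config, field_path) are the ones create_custom_component
    # picks out of the component definition and passes to the constructor.
    config: Mapping[str, Any]
    field_path: str

    def extract_records(self, response: requests.Response) -> Iterable[Mapping[str, Any]]:
        # Simplified for the sketch: read a single top-level key from the JSON body.
        return response.json().get(self.field_path, [])


# Hypothetical manifest-side definition referencing the class by its import path:
definition = {
    "type": "CustomRecordExtractor",
    "class_name": "source_example.components.MyCustomExtractor",
    "field_path": "items",
}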
1916 def create_datetime_based_cursor( 1917 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1918 ) -> DatetimeBasedCursor: 1919 start_datetime: Union[str, MinMaxDatetime] = ( 1920 model.start_datetime 1921 if isinstance(model.start_datetime, str) 1922 else self.create_min_max_datetime(model.start_datetime, config) 1923 ) 1924 end_datetime: Union[str, MinMaxDatetime, None] = None 1925 if model.is_data_feed and model.end_datetime: 1926 raise ValueError("Data feed does not support end_datetime") 1927 if model.is_data_feed and model.is_client_side_incremental: 1928 raise ValueError( 1929 "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 1930 ) 1931 if model.end_datetime: 1932 end_datetime = ( 1933 model.end_datetime 1934 if isinstance(model.end_datetime, str) 1935 else self.create_min_max_datetime(model.end_datetime, config) 1936 ) 1937 1938 end_time_option = ( 1939 self._create_component_from_model( 1940 model.end_time_option, config, parameters=model.parameters or {} 1941 ) 1942 if model.end_time_option 1943 else None 1944 ) 1945 start_time_option = ( 1946 self._create_component_from_model( 1947 model.start_time_option, config, parameters=model.parameters or {} 1948 ) 1949 if model.start_time_option 1950 else None 1951 ) 1952 1953 return DatetimeBasedCursor( 1954 cursor_field=model.cursor_field, 1955 cursor_datetime_formats=model.cursor_datetime_formats 1956 if model.cursor_datetime_formats 1957 else [], 1958 cursor_granularity=model.cursor_granularity, 1959 datetime_format=model.datetime_format, 1960 end_datetime=end_datetime, 1961 start_datetime=start_datetime, 1962 step=model.step, 1963 end_time_option=end_time_option, 1964 lookback_window=model.lookback_window, 1965 start_time_option=start_time_option, 1966 partition_field_end=model.partition_field_end, 1967 partition_field_start=model.partition_field_start, 1968 message_repository=self._message_repository, 1969 is_compare_strictly=model.is_compare_strictly, 1970 config=config, 1971 parameters=model.parameters or {}, 1972 )
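A sketch of a definition the method above would accept; every key mirrors a field read in this method, but the concrete values are assumptions. Note that `is_data_feed` cannot be combined with `end_datetime` or `is_client_side_incremental`.

cursor_definition = {
    "type": "DatetimeBasedCursor",
    "cursor_field": "updated_at",
    "cursor_datetime_formats": ["%Y-%m-%dT%H:%M:%SZ"],
    "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
    "start_datetime": {
        "type": "MinMaxDatetime",
        "datetime": "{{ config['start_date'] }}",
        "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
    },
    "step": "P30D",
    "cursor_granularity": "PT1S",
    "lookback_window": "P5D",
    "start_time_option": {
        "type": "RequestOption",
        "inject_into": "request_parameter",
        "field_name": "updated_since",
    },
}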
1974 def create_default_stream( 1975 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 1976 ) -> AbstractStream: 1977 primary_key = model.primary_key.__root__ if model.primary_key else None 1978 1979 partition_router = self._build_stream_slicer_from_partition_router( 1980 model.retriever, 1981 config, 1982 stream_name=model.name, 1983 **kwargs, 1984 ) 1985 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 1986 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 1987 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 1988 1989 end_time_option = ( 1990 self._create_component_from_model( 1991 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 1992 ) 1993 if cursor_model.end_time_option 1994 else None 1995 ) 1996 start_time_option = ( 1997 self._create_component_from_model( 1998 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 1999 ) 2000 if cursor_model.start_time_option 2001 else None 2002 ) 2003 2004 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 2005 start_time_option=start_time_option, 2006 end_time_option=end_time_option, 2007 partition_field_start=cursor_model.partition_field_start, 2008 partition_field_end=cursor_model.partition_field_end, 2009 config=config, 2010 parameters=model.parameters or {}, 2011 ) 2012 request_options_provider = ( 2013 datetime_request_options_provider 2014 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 2015 else PerPartitionRequestOptionsProvider( 2016 partition_router, datetime_request_options_provider 2017 ) 2018 ) 2019 elif model.incremental_sync and isinstance( 2020 model.incremental_sync, IncrementingCountCursorModel 2021 ): 2022 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 2023 raise ValueError( 2024 "PerPartition does not support per partition states because switching to global state is time based" 2025 ) 2026 2027 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 2028 2029 start_time_option = ( 2030 self._create_component_from_model( 2031 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2032 config, 2033 parameters=cursor_model.parameters or {}, 2034 ) 2035 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2036 else None 2037 ) 2038 2039 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2040 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2041 partition_field_start = "start" 2042 2043 request_options_provider = DatetimeBasedRequestOptionsProvider( 2044 start_time_option=start_time_option, 2045 partition_field_start=partition_field_start, 2046 config=config, 2047 parameters=model.parameters or {}, 2048 ) 2049 else: 2050 request_options_provider = None 2051 2052 transformations = [] 2053 if model.transformations: 2054 for transformation_model in model.transformations: 2055 transformations.append( 2056 self._create_component_from_model(model=transformation_model, config=config) 2057 ) 2058 file_uploader = None 2059 if model.file_uploader: 2060 file_uploader = self._create_component_from_model( 2061 model=model.file_uploader, config=config 2062 ) 2063 2064 stream_slicer: ConcurrentStreamSlicer = ( 2065 partition_router 2066 if isinstance(concurrent_cursor, FinalStateCursor) 2067 
else concurrent_cursor 2068 ) 2069 retriever = self._create_component_from_model( 2070 model=model.retriever, 2071 config=config, 2072 name=model.name, 2073 primary_key=primary_key, 2074 request_options_provider=request_options_provider, 2075 stream_slicer=stream_slicer, 2076 partition_router=partition_router, 2077 stop_condition_cursor=concurrent_cursor 2078 if self._is_stop_condition_on_cursor(model) 2079 else None, 2080 client_side_incremental_sync={"cursor": concurrent_cursor} 2081 if self._is_client_side_filtering_enabled(model) 2082 else None, 2083 transformations=transformations, 2084 file_uploader=file_uploader, 2085 incremental_sync=model.incremental_sync, 2086 ) 2087 if isinstance(retriever, AsyncRetriever): 2088 stream_slicer = retriever.stream_slicer 2089 2090 schema_loader: Union[ 2091 CompositeSchemaLoader, 2092 DefaultSchemaLoader, 2093 DynamicSchemaLoader, 2094 InlineSchemaLoader, 2095 JsonFileSchemaLoader, 2096 ] 2097 if model.schema_loader and isinstance(model.schema_loader, list): 2098 nested_schema_loaders = [ 2099 self._create_component_from_model(model=nested_schema_loader, config=config) 2100 for nested_schema_loader in model.schema_loader 2101 ] 2102 schema_loader = CompositeSchemaLoader( 2103 schema_loaders=nested_schema_loaders, parameters={} 2104 ) 2105 elif model.schema_loader: 2106 schema_loader = self._create_component_from_model( 2107 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2108 config=config, 2109 ) 2110 else: 2111 options = model.parameters or {} 2112 if "name" not in options: 2113 options["name"] = model.name 2114 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2115 2116 stream_name = model.name or "" 2117 return DefaultStream( 2118 partition_generator=StreamSlicerPartitionGenerator( 2119 DeclarativePartitionFactory( 2120 stream_name, 2121 schema_loader, 2122 retriever, 2123 self._message_repository, 2124 ), 2125 stream_slicer, 2126 slice_limit=self._limit_slices_fetched, 2127 ), 2128 name=stream_name, 2129 json_schema=schema_loader.get_json_schema, 2130 primary_key=get_primary_key_from_stream(primary_key), 2131 cursor_field=concurrent_cursor.cursor_field.cursor_field_key 2132 if hasattr(concurrent_cursor, "cursor_field") 2133 else "", # FIXME we should have the cursor field as part of the cursor interface 2134 logger=logging.getLogger(f"airbyte.{stream_name}"), 2135 cursor=concurrent_cursor, 2136 supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), 2137 )
2265 def create_default_error_handler( 2266 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2267 ) -> DefaultErrorHandler: 2268 backoff_strategies = [] 2269 if model.backoff_strategies: 2270 for backoff_strategy_model in model.backoff_strategies: 2271 backoff_strategies.append( 2272 self._create_component_from_model(model=backoff_strategy_model, config=config) 2273 ) 2274 2275 response_filters = [] 2276 if model.response_filters: 2277 for response_filter_model in model.response_filters: 2278 response_filters.append( 2279 self._create_component_from_model(model=response_filter_model, config=config) 2280 ) 2281 response_filters.append( 2282 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2283 ) 2284 2285 return DefaultErrorHandler( 2286 backoff_strategies=backoff_strategies, 2287 max_retries=model.max_retries, 2288 response_filters=response_filters, 2289 config=config, 2290 parameters=model.parameters or {}, 2291 )
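As a hedged sketch, a model declaring a single response filter that retries on HTTP 500 yields roughly the following direct construction (the filter values are hypothetical; the kwargs mirror the factory calls above, and a catch-all HttpResponseFilter is appended by the factory):

handler = DefaultErrorHandler(
    backoff_strategies=[],
    max_retries=3,
    response_filters=[
        HttpResponseFilter(
            action=ResponseAction.RETRY,
            failure_type=None,
            error_message="",
            error_message_contains="",
            http_codes={500},
            predicate="",
            config=config,
            parameters={},
        ),
        HttpResponseFilter(config=config, parameters={}),  # catch-all default filter appended by the factory
    ],
    config=config,
    parameters={},
)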
2293 def create_default_paginator( 2294 self, 2295 model: DefaultPaginatorModel, 2296 config: Config, 2297 *, 2298 url_base: str, 2299 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2300 decoder: Optional[Decoder] = None, 2301 cursor_used_for_stop_condition: Optional[Cursor] = None, 2302 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2303 if decoder: 2304 if self._is_supported_decoder_for_pagination(decoder): 2305 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2306 else: 2307 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2308 else: 2309 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2310 page_size_option = ( 2311 self._create_component_from_model(model=model.page_size_option, config=config) 2312 if model.page_size_option 2313 else None 2314 ) 2315 page_token_option = ( 2316 self._create_component_from_model(model=model.page_token_option, config=config) 2317 if model.page_token_option 2318 else None 2319 ) 2320 pagination_strategy = self._create_component_from_model( 2321 model=model.pagination_strategy, 2322 config=config, 2323 decoder=decoder_to_use, 2324 extractor_model=extractor_model, 2325 ) 2326 if cursor_used_for_stop_condition: 2327 pagination_strategy = StopConditionPaginationStrategyDecorator( 2328 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2329 ) 2330 paginator = DefaultPaginator( 2331 decoder=decoder_to_use, 2332 page_size_option=page_size_option, 2333 page_token_option=page_token_option, 2334 pagination_strategy=pagination_strategy, 2335 url_base=url_base, 2336 config=config, 2337 parameters=model.parameters or {}, 2338 ) 2339 if self._limit_pages_fetched_per_slice: 2340 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2341 return paginator
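A hedged usage sketch of the test-read wrapping (factory, paginator_model and the URL are hypothetical): when the factory was constructed with limit_pages_fetched_per_slice set, the returned paginator is wrapped so pagination stops after that many pages per stream slice.

paginator = factory.create_default_paginator(
    model=paginator_model,               # hypothetical DefaultPaginatorModel
    config=config,
    url_base="https://api.example.com",  # hypothetical API base URL
)
# isinstance(paginator, PaginatorTestReadDecorator) is True only when a page limit was configured.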
2343 def create_dpath_extractor( 2344 self, 2345 model: DpathExtractorModel, 2346 config: Config, 2347 decoder: Optional[Decoder] = None, 2348 **kwargs: Any, 2349 ) -> DpathExtractor: 2350 if decoder: 2351 decoder_to_use = decoder 2352 else: 2353 decoder_to_use = JsonDecoder(parameters={}) 2354 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2355 return DpathExtractor( 2356 decoder=decoder_to_use, 2357 field_path=model_field_path, 2358 config=config, 2359 parameters=model.parameters or {}, 2360 )
2381 def create_http_requester( 2382 self, 2383 model: HttpRequesterModel, 2384 config: Config, 2385 decoder: Decoder = JsonDecoder(parameters={}), 2386 query_properties_key: Optional[str] = None, 2387 use_cache: Optional[bool] = None, 2388 *, 2389 name: str, 2390 ) -> HttpRequester: 2391 authenticator = ( 2392 self._create_component_from_model( 2393 model=model.authenticator, 2394 config=config, 2395 url_base=model.url or model.url_base, 2396 name=name, 2397 decoder=decoder, 2398 ) 2399 if model.authenticator 2400 else None 2401 ) 2402 error_handler = ( 2403 self._create_component_from_model(model=model.error_handler, config=config) 2404 if model.error_handler 2405 else DefaultErrorHandler( 2406 backoff_strategies=[], 2407 response_filters=[], 2408 config=config, 2409 parameters=model.parameters or {}, 2410 ) 2411 ) 2412 2413 api_budget = self._api_budget 2414 2415 # Removes QueryProperties components from the interpolated mappings because it has been designed 2416 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 2417 # instead of through jinja interpolation 2418 request_parameters: Optional[Union[str, Mapping[str, str]]] 2419 if isinstance(model.request_parameters, Mapping): 2420 request_parameters = self._remove_query_properties(model.request_parameters) 2421 else: 2422 request_parameters = model.request_parameters 2423 2424 request_options_provider = InterpolatedRequestOptionsProvider( 2425 request_body=model.request_body, 2426 request_body_data=model.request_body_data, 2427 request_body_json=model.request_body_json, 2428 request_headers=model.request_headers, 2429 request_parameters=request_parameters, 2430 query_properties_key=query_properties_key, 2431 config=config, 2432 parameters=model.parameters or {}, 2433 ) 2434 2435 assert model.use_cache is not None # for mypy 2436 assert model.http_method is not None # for mypy 2437 2438 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2439 2440 return HttpRequester( 2441 name=name, 2442 url=model.url, 2443 url_base=model.url_base, 2444 path=model.path, 2445 authenticator=authenticator, 2446 error_handler=error_handler, 2447 api_budget=api_budget, 2448 http_method=HttpMethod[model.http_method.value], 2449 request_options_provider=request_options_provider, 2450 config=config, 2451 disable_retries=self._disable_retries, 2452 parameters=model.parameters or {}, 2453 message_repository=self._message_repository, 2454 use_cache=should_use_cache, 2455 decoder=decoder, 2456 stream_response=decoder.is_stream_response() if decoder else False, 2457 )
2459 @staticmethod 2460 def create_http_response_filter( 2461 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2462 ) -> HttpResponseFilter: 2463 if model.action: 2464 action = ResponseAction(model.action.value) 2465 else: 2466 action = None 2467 2468 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2469 2470 http_codes = ( 2471 set(model.http_codes) if model.http_codes else set() 2472 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2473 2474 return HttpResponseFilter( 2475 action=action, 2476 failure_type=failure_type, 2477 error_message=model.error_message or "", 2478 error_message_contains=model.error_message_contains or "", 2479 http_codes=http_codes, 2480 predicate=model.predicate or "", 2481 config=config, 2482 parameters=model.parameters or {}, 2483 )
2491 def create_complex_field_type( 2492 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2493 ) -> ComplexFieldType: 2494 items = ( 2495 self._create_component_from_model(model=model.items, config=config) 2496 if isinstance(model.items, ComplexFieldTypeModel) 2497 else model.items 2498 ) 2499 2500 return ComplexFieldType(field_type=model.field_type, items=items)
2502 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2503 target_type = ( 2504 self._create_component_from_model(model=model.target_type, config=config) 2505 if isinstance(model.target_type, ComplexFieldTypeModel) 2506 else model.target_type 2507 ) 2508 2509 return TypesMap( 2510 target_type=target_type, 2511 current_type=model.current_type, 2512 condition=model.condition if model.condition is not None else "True", 2513 )
2515 def create_schema_type_identifier( 2516 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2517 ) -> SchemaTypeIdentifier: 2518 types_mapping = [] 2519 if model.types_mapping: 2520 types_mapping.extend( 2521 [ 2522 self._create_component_from_model(types_map, config=config) 2523 for types_map in model.types_mapping 2524 ] 2525 ) 2526 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2527 [x for x in model.schema_pointer] if model.schema_pointer else [] 2528 ) 2529 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2530 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2531 [x for x in model.type_pointer] if model.type_pointer else None 2532 ) 2533 2534 return SchemaTypeIdentifier( 2535 schema_pointer=model_schema_pointer, 2536 key_pointer=model_key_pointer, 2537 type_pointer=model_type_pointer, 2538 types_mapping=types_mapping, 2539 parameters=model.parameters or {}, 2540 )
2542 def create_dynamic_schema_loader( 2543 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2544 ) -> DynamicSchemaLoader: 2545 schema_transformations = [] 2546 if model.schema_transformations: 2547 for transformation_model in model.schema_transformations: 2548 schema_transformations.append( 2549 self._create_component_from_model(model=transformation_model, config=config) 2550 ) 2551 name = "dynamic_properties" 2552 retriever = self._create_component_from_model( 2553 model=model.retriever, 2554 config=config, 2555 name=name, 2556 primary_key=None, 2557 partition_router=self._build_stream_slicer_from_partition_router( 2558 model.retriever, config 2559 ), 2560 transformations=[], 2561 use_cache=True, 2562 log_formatter=( 2563 lambda response: format_http_message( 2564 response, 2565 f"Schema loader '{name}' request", 2566 f"Request performed in order to extract schema.", 2567 name, 2568 is_auxiliary=True, 2569 ) 2570 ), 2571 ) 2572 schema_type_identifier = self._create_component_from_model( 2573 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2574 ) 2575 schema_filter = ( 2576 self._create_component_from_model( 2577 model.schema_filter, config=config, parameters=model.parameters or {} 2578 ) 2579 if model.schema_filter is not None 2580 else None 2581 ) 2582 2583 return DynamicSchemaLoader( 2584 retriever=retriever, 2585 config=config, 2586 schema_transformations=schema_transformations, 2587 schema_filter=schema_filter, 2588 schema_type_identifier=schema_type_identifier, 2589 parameters=model.parameters or {}, 2590 )
2610 def create_gzip_decoder( 2611 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2612 ) -> Decoder: 2613 _compressed_response_types = { 2614 "gzip", 2615 "x-gzip", 2616 "gzip, deflate", 2617 "x-gzip, deflate", 2618 "application/zip", 2619 "application/gzip", 2620 "application/x-gzip", 2621 "application/x-zip-compressed", 2622 } 2623 2624 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2625 2626 if self._emit_connector_builder_messages: 2627 # This is surprising, but if the response is not streamed, 2628 # CompositeRawDecoder calls response.content and the requests library uncompresses the data, as opposed to response.raw, 2629 # which uses urllib3 directly and does not uncompress the data. 2630 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2631 2632 return CompositeRawDecoder.by_headers( 2633 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2634 stream_response=True, 2635 fallback_parser=gzip_parser.inner_parser, 2636 )
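A short hedged sketch of the two code paths above (factory and gzip_decoder_model are hypothetical):

decoder = factory.create_gzip_decoder(model=gzip_decoder_model, config=config)
# Connector builder: CompositeRawDecoder(gzip_parser.inner_parser, False) -- requests has already
# decompressed response.content, so only the inner parser (e.g. JSON lines) is applied.
# Regular sync: CompositeRawDecoder.by_headers(...) streams response.raw and applies the GzipParser only
# when Content-Encoding or Content-Type advertises a compressed payload, falling back to the inner parser.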
2640 @staticmethod 2641 def create_incrementing_count_cursor( 2642 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2643 ) -> DatetimeBasedCursor: 2644 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2645 # we still parse models into components. The issue is that there's no runtime implementation of a 2646 # IncrementingCountCursor. 2647 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 2648 return DatetimeBasedCursor( 2649 cursor_field=model.cursor_field, 2650 datetime_format="%Y-%m-%d", 2651 start_datetime="2024-12-12", 2652 config=config, 2653 parameters={}, 2654 )
2703 @staticmethod 2704 def create_jwt_authenticator( 2705 model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2706 ) -> JwtAuthenticator: 2707 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2708 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2709 return JwtAuthenticator( 2710 config=config, 2711 parameters=model.parameters or {}, 2712 algorithm=JwtAlgorithm(model.algorithm.value), 2713 secret_key=model.secret_key, 2714 base64_encode_secret_key=model.base64_encode_secret_key, 2715 token_duration=model.token_duration, 2716 header_prefix=model.header_prefix, 2717 kid=jwt_headers.kid, 2718 typ=jwt_headers.typ, 2719 cty=jwt_headers.cty, 2720 iss=jwt_payload.iss, 2721 sub=jwt_payload.sub, 2722 aud=jwt_payload.aud, 2723 additional_jwt_headers=model.additional_jwt_headers, 2724 additional_jwt_payload=model.additional_jwt_payload, 2725 )
2727 def create_list_partition_router( 2728 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2729 ) -> ListPartitionRouter: 2730 request_option = ( 2731 self._create_component_from_model(model.request_option, config) 2732 if model.request_option 2733 else None 2734 ) 2735 return ListPartitionRouter( 2736 cursor_field=model.cursor_field, 2737 request_option=request_option, 2738 values=model.values, 2739 config=config, 2740 parameters=model.parameters or {}, 2741 )
2743 @staticmethod 2744 def create_min_max_datetime( 2745 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2746 ) -> MinMaxDatetime: 2747 return MinMaxDatetime( 2748 datetime=model.datetime, 2749 datetime_format=model.datetime_format or "", 2750 max_datetime=model.max_datetime or "", 2751 min_datetime=model.min_datetime or "", 2752 parameters=model.parameters or {}, 2753 )
2765 def create_oauth_authenticator( 2766 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2767 ) -> DeclarativeOauth2Authenticator: 2768 profile_assertion = ( 2769 self._create_component_from_model(model.profile_assertion, config=config) 2770 if model.profile_assertion 2771 else None 2772 ) 2773 2774 if model.refresh_token_updater: 2775 # ignore type error because fixing it would have a lot of dependencies, revisit later 2776 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2777 config, 2778 InterpolatedString.create( 2779 model.token_refresh_endpoint, # type: ignore 2780 parameters=model.parameters or {}, 2781 ).eval(config), 2782 access_token_name=InterpolatedString.create( 2783 model.access_token_name or "access_token", parameters=model.parameters or {} 2784 ).eval(config), 2785 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2786 expires_in_name=InterpolatedString.create( 2787 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2788 ).eval(config), 2789 client_id_name=InterpolatedString.create( 2790 model.client_id_name or "client_id", parameters=model.parameters or {} 2791 ).eval(config), 2792 client_id=InterpolatedString.create( 2793 model.client_id, parameters=model.parameters or {} 2794 ).eval(config) 2795 if model.client_id 2796 else model.client_id, 2797 client_secret_name=InterpolatedString.create( 2798 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2799 ).eval(config), 2800 client_secret=InterpolatedString.create( 2801 model.client_secret, parameters=model.parameters or {} 2802 ).eval(config) 2803 if model.client_secret 2804 else model.client_secret, 2805 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2806 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2807 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2808 grant_type_name=InterpolatedString.create( 2809 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2810 ).eval(config), 2811 grant_type=InterpolatedString.create( 2812 model.grant_type or "refresh_token", parameters=model.parameters or {} 2813 ).eval(config), 2814 refresh_request_body=InterpolatedMapping( 2815 model.refresh_request_body or {}, parameters=model.parameters or {} 2816 ).eval(config), 2817 refresh_request_headers=InterpolatedMapping( 2818 model.refresh_request_headers or {}, parameters=model.parameters or {} 2819 ).eval(config), 2820 scopes=model.scopes, 2821 token_expiry_date_format=model.token_expiry_date_format, 2822 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2823 message_repository=self._message_repository, 2824 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2825 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2826 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2827 ) 2828 # ignore type error because fixing it would have a lot of dependencies, revisit later 2829 return DeclarativeOauth2Authenticator( # type: ignore 2830 access_token_name=model.access_token_name or "access_token", 2831 access_token_value=model.access_token_value, 2832 client_id_name=model.client_id_name or "client_id", 2833 client_id=model.client_id, 2834 client_secret_name=model.client_secret_name or "client_secret", 2835 client_secret=model.client_secret, 2836 expires_in_name=model.expires_in_name or 
"expires_in", 2837 grant_type_name=model.grant_type_name or "grant_type", 2838 grant_type=model.grant_type or "refresh_token", 2839 refresh_request_body=model.refresh_request_body, 2840 refresh_request_headers=model.refresh_request_headers, 2841 refresh_token_name=model.refresh_token_name or "refresh_token", 2842 refresh_token=model.refresh_token, 2843 scopes=model.scopes, 2844 token_expiry_date=model.token_expiry_date, 2845 token_expiry_date_format=model.token_expiry_date_format, 2846 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2847 token_refresh_endpoint=model.token_refresh_endpoint, 2848 config=config, 2849 parameters=model.parameters or {}, 2850 message_repository=self._message_repository, 2851 profile_assertion=profile_assertion, 2852 use_profile_assertion=model.use_profile_assertion, 2853 )
2855 def create_offset_increment( 2856 self, 2857 model: OffsetIncrementModel, 2858 config: Config, 2859 decoder: Decoder, 2860 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2861 **kwargs: Any, 2862 ) -> OffsetIncrement: 2863 if isinstance(decoder, PaginationDecoderDecorator): 2864 inner_decoder = decoder.decoder 2865 else: 2866 inner_decoder = decoder 2867 decoder = PaginationDecoderDecorator(decoder=decoder) 2868 2869 if self._is_supported_decoder_for_pagination(inner_decoder): 2870 decoder_to_use = decoder 2871 else: 2872 raise ValueError( 2873 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2874 ) 2875 2876 # Ideally we would instantiate the runtime extractor from the highest level (in this case the SimpleRetriever) 2877 # so that it can be shared by OffsetIncrement and RecordSelector. However, due to how we instantiate the 2878 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2879 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2880 # When we have more time to investigate we can look into reusing the same component. 2881 extractor = ( 2882 self._create_component_from_model( 2883 model=extractor_model, config=config, decoder=decoder_to_use 2884 ) 2885 if extractor_model 2886 else None 2887 ) 2888 2889 return OffsetIncrement( 2890 page_size=model.page_size, 2891 config=config, 2892 decoder=decoder_to_use, 2893 extractor=extractor, 2894 inject_on_first_request=model.inject_on_first_request or False, 2895 parameters=model.parameters or {}, 2896 )
2898 @staticmethod 2899 def create_page_increment( 2900 model: PageIncrementModel, config: Config, **kwargs: Any 2901 ) -> PageIncrement: 2902 return PageIncrement( 2903 page_size=model.page_size, 2904 config=config, 2905 start_from_page=model.start_from_page or 0, 2906 inject_on_first_request=model.inject_on_first_request or False, 2907 parameters=model.parameters or {}, 2908 )
2910 def create_parent_stream_config( 2911 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 2912 ) -> ParentStreamConfig: 2913 declarative_stream = self._create_component_from_model( 2914 model.stream, 2915 config=config, 2916 is_parent=True, 2917 **kwargs, 2918 ) 2919 request_option = ( 2920 self._create_component_from_model(model.request_option, config=config) 2921 if model.request_option 2922 else None 2923 ) 2924 2925 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2926 raise ValueError( 2927 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2928 ) 2929 2930 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2931 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2932 ) 2933 2934 return ParentStreamConfig( 2935 parent_key=model.parent_key, 2936 request_option=request_option, 2937 stream=declarative_stream, 2938 partition_field=model.partition_field, 2939 config=config, 2940 incremental_dependency=model.incremental_dependency or False, 2941 parameters=model.parameters or {}, 2942 extra_fields=model.extra_fields, 2943 lazy_read_pointer=model_lazy_read_pointer, 2944 )
2946 def create_properties_from_endpoint( 2947 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2948 ) -> PropertiesFromEndpoint: 2949 retriever = self._create_component_from_model( 2950 model=model.retriever, 2951 config=config, 2952 name="dynamic_properties", 2953 primary_key=None, 2954 stream_slicer=None, 2955 transformations=[], 2956 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to differ 2957 ) 2958 return PropertiesFromEndpoint( 2959 property_field_path=model.property_field_path, 2960 retriever=retriever, 2961 config=config, 2962 parameters=model.parameters or {}, 2963 )
2965 def create_property_chunking( 2966 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2967 ) -> PropertyChunking: 2968 record_merge_strategy = ( 2969 self._create_component_from_model( 2970 model=model.record_merge_strategy, config=config, **kwargs 2971 ) 2972 if model.record_merge_strategy 2973 else None 2974 ) 2975 2976 property_limit_type: PropertyLimitType 2977 match model.property_limit_type: 2978 case PropertyLimitTypeModel.property_count: 2979 property_limit_type = PropertyLimitType.property_count 2980 case PropertyLimitTypeModel.characters: 2981 property_limit_type = PropertyLimitType.characters 2982 case _: 2983 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}") # use the model value; the local name is never assigned in this branch 2984 2985 return PropertyChunking( 2986 property_limit_type=property_limit_type, 2987 property_limit=model.property_limit, 2988 record_merge_strategy=record_merge_strategy, 2989 config=config, 2990 parameters=model.parameters or {}, 2991 )
2993 def create_query_properties( 2994 self, model: QueryPropertiesModel, config: Config, **kwargs: Any 2995 ) -> QueryProperties: 2996 if isinstance(model.property_list, list): 2997 property_list = model.property_list 2998 else: 2999 property_list = self._create_component_from_model( 3000 model=model.property_list, config=config, **kwargs 3001 ) 3002 3003 property_chunking = ( 3004 self._create_component_from_model( 3005 model=model.property_chunking, config=config, **kwargs 3006 ) 3007 if model.property_chunking 3008 else None 3009 ) 3010 3011 return QueryProperties( 3012 property_list=property_list, 3013 always_include_properties=model.always_include_properties, 3014 property_chunking=property_chunking, 3015 config=config, 3016 parameters=model.parameters or {}, 3017 )
3031 @staticmethod 3032 def create_request_option( 3033 model: RequestOptionModel, config: Config, **kwargs: Any 3034 ) -> RequestOption: 3035 inject_into = RequestOptionType(model.inject_into.value) 3036 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3037 [ 3038 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3039 for segment in model.field_path 3040 ] 3041 if model.field_path 3042 else None 3043 ) 3044 field_name = ( 3045 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3046 if model.field_name 3047 else None 3048 ) 3049 return RequestOption( 3050 field_name=field_name, 3051 field_path=field_path, 3052 inject_into=inject_into, 3053 parameters=kwargs.get("parameters", {}), 3054 )
3056 def create_record_selector( 3057 self, 3058 model: RecordSelectorModel, 3059 config: Config, 3060 *, 3061 name: str, 3062 transformations: List[RecordTransformation] | None = None, 3063 decoder: Decoder | None = None, 3064 client_side_incremental_sync: Dict[str, Any] | None = None, 3065 file_uploader: Optional[DefaultFileUploader] = None, 3066 **kwargs: Any, 3067 ) -> RecordSelector: 3068 extractor = self._create_component_from_model( 3069 model=model.extractor, decoder=decoder, config=config 3070 ) 3071 record_filter = ( 3072 self._create_component_from_model(model.record_filter, config=config) 3073 if model.record_filter 3074 else None 3075 ) 3076 3077 transform_before_filtering = ( 3078 False if model.transform_before_filtering is None else model.transform_before_filtering 3079 ) 3080 if client_side_incremental_sync: 3081 record_filter = ClientSideIncrementalRecordFilterDecorator( 3082 config=config, 3083 parameters=model.parameters, 3084 condition=model.record_filter.condition 3085 if (model.record_filter and hasattr(model.record_filter, "condition")) 3086 else None, 3087 **client_side_incremental_sync, 3088 ) 3089 transform_before_filtering = ( 3090 True 3091 if model.transform_before_filtering is None 3092 else model.transform_before_filtering 3093 ) 3094 3095 if model.schema_normalization is None: 3096 # default to no schema normalization if not set 3097 model.schema_normalization = SchemaNormalizationModel.None_ 3098 3099 schema_normalization = ( 3100 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3101 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3102 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3103 ) 3104 3105 return RecordSelector( 3106 extractor=extractor, 3107 name=name, 3108 config=config, 3109 record_filter=record_filter, 3110 transformations=transformations or [], 3111 file_uploader=file_uploader, 3112 schema_normalization=schema_normalization, 3113 parameters=model.parameters or {}, 3114 transform_before_filtering=transform_before_filtering, 3115 )
3125 def create_selective_authenticator( 3126 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3127 ) -> DeclarativeAuthenticator: 3128 authenticators = { 3129 name: self._create_component_from_model(model=auth, config=config) 3130 for name, auth in model.authenticators.items() 3131 } 3132 # SelectiveAuthenticator will return an instance of DeclarativeAuthenticator or raise a ValueError 3133 return SelectiveAuthenticator( # type: ignore[abstract] 3134 config=config, 3135 authenticators=authenticators, 3136 authenticator_selection_path=model.authenticator_selection_path, 3137 **kwargs, 3138 )
3140 @staticmethod 3141 def create_legacy_session_token_authenticator( 3142 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3143 ) -> LegacySessionTokenAuthenticator: 3144 return LegacySessionTokenAuthenticator( 3145 api_url=url_base, 3146 header=model.header, 3147 login_url=model.login_url, 3148 password=model.password or "", 3149 session_token=model.session_token or "", 3150 session_token_response_key=model.session_token_response_key or "", 3151 username=model.username or "", 3152 validate_session_url=model.validate_session_url, 3153 config=config, 3154 parameters=model.parameters or {}, 3155 )
3157 def create_simple_retriever( 3158 self, 3159 model: SimpleRetrieverModel, 3160 config: Config, 3161 *, 3162 name: str, 3163 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3164 request_options_provider: Optional[RequestOptionsProvider] = None, 3165 stop_condition_cursor: Optional[Cursor] = None, 3166 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3167 transformations: List[RecordTransformation], 3168 file_uploader: Optional[DefaultFileUploader] = None, 3169 incremental_sync: Optional[ 3170 Union[IncrementingCountCursorModel, DatetimeBasedCursorModel] 3171 ] = None, 3172 use_cache: Optional[bool] = None, 3173 log_formatter: Optional[Callable[[Response], Any]] = None, 3174 partition_router: Optional[PartitionRouter] = None, 3175 **kwargs: Any, 3176 ) -> SimpleRetriever: 3177 def _get_url(req: Requester) -> str: 3178 """ 3179 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3180 This is needed because the URL is not set until the requester is created. 3181 """ 3182 3183 _url: str = ( 3184 model.requester.url 3185 if hasattr(model.requester, "url") and model.requester.url is not None 3186 else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) 3187 ) 3188 _url_base: str = ( 3189 model.requester.url_base 3190 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3191 else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None) 3192 ) 3193 3194 return _url or _url_base 3195 3196 decoder = ( 3197 self._create_component_from_model(model=model.decoder, config=config) 3198 if model.decoder 3199 else JsonDecoder(parameters={}) 3200 ) 3201 record_selector = self._create_component_from_model( 3202 model=model.record_selector, 3203 name=name, 3204 config=config, 3205 decoder=decoder, 3206 transformations=transformations, 3207 client_side_incremental_sync=client_side_incremental_sync, 3208 file_uploader=file_uploader, 3209 ) 3210 3211 query_properties: Optional[QueryProperties] = None 3212 query_properties_key: Optional[str] = None 3213 if self._query_properties_in_request_parameters(model.requester): 3214 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3215 # places instead of default to request_parameters which isn't clearly documented 3216 if ( 3217 hasattr(model.requester, "fetch_properties_from_endpoint") 3218 and model.requester.fetch_properties_from_endpoint 3219 ): 3220 raise ValueError( 3221 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3222 ) 3223 3224 query_properties_definitions = [] 3225 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _query_properties_in_request_parameters() 3226 if isinstance(request_parameter, QueryPropertiesModel): 3227 query_properties_key = key 3228 query_properties_definitions.append(request_parameter) 3229 3230 if len(query_properties_definitions) > 1: 3231 raise ValueError( 3232 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3233 ) 3234 3235 if len(query_properties_definitions) == 1: 3236 query_properties = self._create_component_from_model( 3237 model=query_properties_definitions[0], config=config 3238 ) 3239 elif ( 3240 hasattr(model.requester, 
"fetch_properties_from_endpoint") 3241 and model.requester.fetch_properties_from_endpoint 3242 ): 3243 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3244 query_properties_definition = QueryPropertiesModel( 3245 type="QueryProperties", 3246 property_list=model.requester.fetch_properties_from_endpoint, 3247 always_include_properties=None, 3248 property_chunking=None, 3249 ) # type: ignore # $parameters has a default value 3250 3251 query_properties = self.create_query_properties( 3252 model=query_properties_definition, 3253 config=config, 3254 ) 3255 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3256 query_properties = self.create_query_properties( 3257 model=model.requester.query_properties, 3258 config=config, 3259 ) 3260 3261 requester = self._create_component_from_model( 3262 model=model.requester, 3263 decoder=decoder, 3264 name=name, 3265 query_properties_key=query_properties_key, 3266 use_cache=use_cache, 3267 config=config, 3268 ) 3269 3270 if not request_options_provider: 3271 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3272 if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance( 3273 partition_router, PartitionRouter 3274 ): 3275 request_options_provider = partition_router 3276 3277 paginator = ( 3278 self._create_component_from_model( 3279 model=model.paginator, 3280 config=config, 3281 url_base=_get_url(requester), 3282 extractor_model=model.record_selector.extractor, 3283 decoder=decoder, 3284 cursor_used_for_stop_condition=stop_condition_cursor or None, 3285 ) 3286 if model.paginator 3287 else NoPagination(parameters={}) 3288 ) 3289 3290 ignore_stream_slicer_parameters_on_paginated_requests = ( 3291 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3292 ) 3293 3294 if ( 3295 model.partition_router 3296 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3297 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3298 and any( 3299 parent_stream_config.lazy_read_pointer 3300 for parent_stream_config in model.partition_router.parent_stream_configs 3301 ) 3302 ): 3303 if incremental_sync: 3304 if incremental_sync.type != "DatetimeBasedCursor": 3305 raise ValueError( 3306 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3307 ) 3308 3309 elif incremental_sync.step or incremental_sync.cursor_granularity: 3310 raise ValueError( 3311 f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 3312 ) 3313 3314 if model.decoder and model.decoder.type != "JsonDecoder": 3315 raise ValueError( 3316 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}." 
3317 ) 3318 3319 return LazySimpleRetriever( 3320 name=name, 3321 paginator=paginator, 3322 primary_key=primary_key, 3323 requester=requester, 3324 record_selector=record_selector, 3325 stream_slicer=_NO_STREAM_SLICING, 3326 request_option_provider=request_options_provider, 3327 cursor=None, 3328 config=config, 3329 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3330 parameters=model.parameters or {}, 3331 ) 3332 3333 return SimpleRetriever( 3334 name=name, 3335 paginator=paginator, 3336 primary_key=primary_key, 3337 requester=requester, 3338 record_selector=record_selector, 3339 stream_slicer=_NO_STREAM_SLICING, 3340 request_option_provider=request_options_provider, 3341 cursor=None, 3342 config=config, 3343 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3344 additional_query_properties=query_properties, 3345 log_formatter=self._get_log_formatter(log_formatter, name), 3346 parameters=model.parameters or {}, 3347 )
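For clarity, a hedged illustration of the request_parameters validation above (the "fields" key and property list are hypothetical):

request_parameters = {
    "fields": QueryPropertiesModel(
        type="QueryProperties",
        property_list=["id", "name"],
        always_include_properties=None,
        property_chunking=None,
    ),
    "page_size": "100",
}
# Accepted: exactly one QueryProperties entry; "fields" becomes query_properties_key and is stripped from
# the interpolated request parameters passed to the HttpRequester. Declaring two QueryProperties entries,
# or combining one with fetch_properties_from_endpoint on the same requester, raises ValueError.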
3397 def create_state_delegating_stream( 3398 self, 3399 model: StateDelegatingStreamModel, 3400 config: Config, 3401 has_parent_state: Optional[bool] = None, 3402 **kwargs: Any, 3403 ) -> DeclarativeStream: 3404 if ( 3405 model.full_refresh_stream.name != model.name 3406 or model.name != model.incremental_stream.name 3407 ): 3408 raise ValueError( 3409 f"The state_delegating_stream, full_refresh_stream and incremental_stream must have the same name. Instead got {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 3410 ) 3411 3412 stream_model = self._get_state_delegating_stream_model( 3413 False if has_parent_state is None else has_parent_state, model 3414 ) 3415 3416 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is always a DeclarativeStreamModel
3457 def create_async_retriever( 3458 self, 3459 model: AsyncRetrieverModel, 3460 config: Config, 3461 *, 3462 name: str, 3463 primary_key: Optional[ 3464 Union[str, List[str], List[List[str]]] 3465 ], # this seems to be needed to match create_simple_retriever 3466 stream_slicer: Optional[StreamSlicer], 3467 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3468 transformations: List[RecordTransformation], 3469 **kwargs: Any, 3470 ) -> AsyncRetriever: 3471 if model.download_target_requester and not model.download_target_extractor: 3472 raise ValueError( 3473 f"`download_target_extractor` required if using a `download_target_requester`" 3474 ) 3475 3476 def _get_download_retriever( 3477 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3478 ) -> SimpleRetriever: 3479 # We create a record selector for the download retriever 3480 # with no schema normalization and no transformations, neither record filter 3481 # as all this occurs in the record_selector of the AsyncRetriever 3482 record_selector = RecordSelector( 3483 extractor=extractor, 3484 name=name, 3485 record_filter=None, 3486 transformations=[], 3487 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3488 config=config, 3489 parameters={}, 3490 ) 3491 paginator = ( 3492 self._create_component_from_model( 3493 model=model.download_paginator, 3494 decoder=_decoder, 3495 config=config, 3496 url_base="", 3497 ) 3498 if model.download_paginator 3499 else NoPagination(parameters={}) 3500 ) 3501 3502 return SimpleRetriever( 3503 requester=requester, 3504 record_selector=record_selector, 3505 primary_key=None, 3506 name=name, 3507 paginator=paginator, 3508 config=config, 3509 parameters={}, 3510 log_formatter=self._get_log_formatter(None, name), 3511 ) 3512 3513 def _get_job_timeout() -> datetime.timedelta: 3514 user_defined_timeout: Optional[int] = ( 3515 int( 3516 InterpolatedString.create( 3517 str(model.polling_job_timeout), 3518 parameters={}, 3519 ).eval(config) 3520 ) 3521 if model.polling_job_timeout 3522 else None 3523 ) 3524 3525 # check for user defined timeout during the test read or 15 minutes 3526 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3527 # default value for non-connector builder is 60 minutes. 
3528 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3529 3530 return ( 3531 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3532 ) 3533 3534 decoder = ( 3535 self._create_component_from_model(model=model.decoder, config=config) 3536 if model.decoder 3537 else JsonDecoder(parameters={}) 3538 ) 3539 record_selector = self._create_component_from_model( 3540 model=model.record_selector, 3541 config=config, 3542 decoder=decoder, 3543 name=name, 3544 transformations=transformations, 3545 client_side_incremental_sync=client_side_incremental_sync, 3546 ) 3547 3548 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3549 if self._should_limit_slices_fetched(): 3550 stream_slicer = cast( 3551 StreamSlicer, 3552 StreamSlicerTestReadDecorator( 3553 wrapped_slicer=stream_slicer, 3554 maximum_number_of_slices=self._limit_slices_fetched or 5, 3555 ), 3556 ) 3557 3558 creation_requester = self._create_component_from_model( 3559 model=model.creation_requester, 3560 decoder=decoder, 3561 config=config, 3562 name=f"job creation - {name}", 3563 ) 3564 polling_requester = self._create_component_from_model( 3565 model=model.polling_requester, 3566 decoder=decoder, 3567 config=config, 3568 name=f"job polling - {name}", 3569 ) 3570 job_download_components_name = f"job download - {name}" 3571 download_decoder = ( 3572 self._create_component_from_model(model=model.download_decoder, config=config) 3573 if model.download_decoder 3574 else JsonDecoder(parameters={}) 3575 ) 3576 download_extractor = ( 3577 self._create_component_from_model( 3578 model=model.download_extractor, 3579 config=config, 3580 decoder=download_decoder, 3581 parameters=model.parameters, 3582 ) 3583 if model.download_extractor 3584 else DpathExtractor( 3585 [], 3586 config=config, 3587 decoder=download_decoder, 3588 parameters=model.parameters or {}, 3589 ) 3590 ) 3591 download_requester = self._create_component_from_model( 3592 model=model.download_requester, 3593 decoder=download_decoder, 3594 config=config, 3595 name=job_download_components_name, 3596 ) 3597 download_retriever = _get_download_retriever( 3598 download_requester, download_extractor, download_decoder 3599 ) 3600 abort_requester = ( 3601 self._create_component_from_model( 3602 model=model.abort_requester, 3603 decoder=decoder, 3604 config=config, 3605 name=f"job abort - {name}", 3606 ) 3607 if model.abort_requester 3608 else None 3609 ) 3610 delete_requester = ( 3611 self._create_component_from_model( 3612 model=model.delete_requester, 3613 decoder=decoder, 3614 config=config, 3615 name=f"job delete - {name}", 3616 ) 3617 if model.delete_requester 3618 else None 3619 ) 3620 download_target_requester = ( 3621 self._create_component_from_model( 3622 model=model.download_target_requester, 3623 decoder=decoder, 3624 config=config, 3625 name=f"job extract_url - {name}", 3626 ) 3627 if model.download_target_requester 3628 else None 3629 ) 3630 status_extractor = self._create_component_from_model( 3631 model=model.status_extractor, decoder=decoder, config=config, name=name 3632 ) 3633 download_target_extractor = ( 3634 self._create_component_from_model( 3635 model=model.download_target_extractor, 3636 decoder=decoder, 3637 config=config, 3638 name=name, 3639 ) 3640 if model.download_target_extractor 3641 else None 3642 ) 3643 3644 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3645 creation_requester=creation_requester, 3646 polling_requester=polling_requester, 3647 
download_retriever=download_retriever, 3648 download_target_requester=download_target_requester, 3649 abort_requester=abort_requester, 3650 delete_requester=delete_requester, 3651 status_extractor=status_extractor, 3652 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3653 download_target_extractor=download_target_extractor, 3654 job_timeout=_get_job_timeout(), 3655 ) 3656 3657 async_job_partition_router = AsyncJobPartitionRouter( 3658 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3659 job_repository, 3660 stream_slices, 3661 self._job_tracker, 3662 self._message_repository, 3663 # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk 3664 has_bulk_parent=False, 3665 # set the `job_max_retry` to 1 for the `Connector Builder` use-case. 3666 # `None` == default retry is set to 3 attempts, under the hood. 3667 job_max_retry=1 if self._emit_connector_builder_messages else None, 3668 ), 3669 stream_slicer=stream_slicer, 3670 config=config, 3671 parameters=model.parameters or {}, 3672 ) 3673 3674 return AsyncRetriever( 3675 record_selector=record_selector, 3676 stream_slicer=async_job_partition_router, 3677 config=config, 3678 parameters=model.parameters or {}, 3679 )
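A brief hedged sketch of the timeout selection performed by _get_job_timeout above (the 15 and 60 minute defaults come from the code; emit_connector_builder_messages stands in for self._emit_connector_builder_messages):

# With polling_job_timeout unset: 15 minutes per async job during connector builder test reads,
# 60 minutes during a regular sync; a user-defined polling_job_timeout, interpolated against the
# config, overrides both values.
job_timeout = datetime.timedelta(minutes=15 if emit_connector_builder_messages else 60)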
3681 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3682 config_migrations = [ 3683 self._create_component_from_model(migration, config) 3684 for migration in ( 3685 model.config_normalization_rules.config_migrations 3686 if ( 3687 model.config_normalization_rules 3688 and model.config_normalization_rules.config_migrations 3689 ) 3690 else [] 3691 ) 3692 ] 3693 config_transformations = [ 3694 self._create_component_from_model(transformation, config) 3695 for transformation in ( 3696 model.config_normalization_rules.transformations 3697 if ( 3698 model.config_normalization_rules 3699 and model.config_normalization_rules.transformations 3700 ) 3701 else [] 3702 ) 3703 ] 3704 config_validations = [ 3705 self._create_component_from_model(validation, config) 3706 for validation in ( 3707 model.config_normalization_rules.validations 3708 if ( 3709 model.config_normalization_rules 3710 and model.config_normalization_rules.validations 3711 ) 3712 else [] 3713 ) 3714 ] 3715 3716 return Spec( 3717 connection_specification=model.connection_specification, 3718 documentation_url=model.documentation_url, 3719 advanced_auth=model.advanced_auth, 3720 parameters={}, 3721 config_migrations=config_migrations, 3722 config_transformations=config_transformations, 3723 config_validations=config_validations, 3724 )
3726 def create_substream_partition_router( 3727 self, 3728 model: SubstreamPartitionRouterModel, 3729 config: Config, 3730 *, 3731 stream_name: str, 3732 **kwargs: Any, 3733 ) -> SubstreamPartitionRouter: 3734 parent_stream_configs = [] 3735 if model.parent_stream_configs: 3736 parent_stream_configs.extend( 3737 [ 3738 self.create_parent_stream_config_with_substream_wrapper( 3739 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 3740 ) 3741 for parent_stream_config in model.parent_stream_configs 3742 ] 3743 ) 3744 3745 return SubstreamPartitionRouter( 3746 parent_stream_configs=parent_stream_configs, 3747 parameters=model.parameters or {}, 3748 config=config, 3749 )
3751 def create_parent_stream_config_with_substream_wrapper( 3752 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 3753 ) -> Any: 3754 # getting the parent state 3755 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 3756 3757 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3758 has_parent_state = bool( 3759 self._connector_state_manager.get_stream_state(stream_name, None) 3760 if model.incremental_dependency 3761 else False 3762 ) 3763 connector_state_manager = self._instantiate_parent_stream_state_manager( 3764 child_state, config, model, has_parent_state 3765 ) 3766 3767 substream_factory = ModelToComponentFactory( 3768 connector_state_manager=connector_state_manager, 3769 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3770 limit_slices_fetched=self._limit_slices_fetched, 3771 emit_connector_builder_messages=self._emit_connector_builder_messages, 3772 disable_retries=self._disable_retries, 3773 disable_cache=self._disable_cache, 3774 message_repository=StateFilteringMessageRepository( 3775 LogAppenderMessageRepositoryDecorator( 3776 { 3777 "airbyte_cdk": {"stream": {"is_substream": True}}, 3778 "http": {"is_auxiliary": True}, 3779 }, 3780 self._message_repository, 3781 self._evaluate_log_level(self._emit_connector_builder_messages), 3782 ), 3783 ), 3784 ) 3785 3786 return substream_factory.create_parent_stream_config( 3787 model=model, config=config, stream_name=stream_name, **kwargs 3788 )
3849 @staticmethod 3850 def create_wait_time_from_header( 3851 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3852 ) -> WaitTimeFromHeaderBackoffStrategy: 3853 return WaitTimeFromHeaderBackoffStrategy( 3854 header=model.header, 3855 parameters=model.parameters or {}, 3856 config=config, 3857 regex=model.regex, 3858 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3859 if model.max_waiting_time_in_seconds is not None 3860 else None, 3861 )
3863 @staticmethod 3864 def create_wait_until_time_from_header( 3865 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3866 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3867 return WaitUntilTimeFromHeaderBackoffStrategy( 3868 header=model.header, 3869 parameters=model.parameters or {}, 3870 config=config, 3871 min_wait=model.min_wait, 3872 regex=model.regex, 3873 )
3881 @staticmethod 3882 def create_components_mapping_definition( 3883 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3884 ) -> ComponentMappingDefinition: 3885 interpolated_value = InterpolatedString.create( 3886 model.value, parameters=model.parameters or {} 3887 ) 3888 field_path = [ 3889 InterpolatedString.create(path, parameters=model.parameters or {}) 3890 for path in model.field_path 3891 ] 3892 return ComponentMappingDefinition( 3893 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 3894 value=interpolated_value, 3895 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 3896 create_or_update=model.create_or_update, 3897 condition=model.condition, 3898 parameters=model.parameters or {}, 3899 )
3901 def create_http_components_resolver( 3902 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 3903 ) -> Any: 3904 retriever = self._create_component_from_model( 3905 model=model.retriever, 3906 config=config, 3907 name=f"{stream_name if stream_name else '__http_components_resolver'}", 3908 primary_key=None, 3909 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 3910 transformations=[], 3911 ) 3912 3913 components_mapping = [] 3914 for component_mapping_definition_model in model.components_mapping: 3915 if component_mapping_definition_model.condition: 3916 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 3917 components_mapping.append( 3918 self._create_component_from_model( 3919 model=component_mapping_definition_model, 3920 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 3921 component_mapping_definition_model.value_type 3922 ), 3923 config=config, 3924 ) 3925 ) 3926 3927 return HttpComponentsResolver( 3928 retriever=retriever, 3929 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 3930 config=config, 3931 components_mapping=components_mapping, 3932 parameters=model.parameters or {}, 3933 )
3935 @staticmethod 3936 def create_stream_config( 3937 model: StreamConfigModel, config: Config, **kwargs: Any 3938 ) -> StreamConfig: 3939 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 3940 [x for x in model.configs_pointer] if model.configs_pointer else [] 3941 ) 3942 3943 return StreamConfig( 3944 configs_pointer=model_configs_pointer, 3945 default_values=model.default_values, 3946 parameters=model.parameters or {}, 3947 )
    def create_config_components_resolver(
        self,
        model: ConfigComponentsResolverModel,
        config: Config,
    ) -> Any:
        model_stream_configs = (
            model.stream_config if isinstance(model.stream_config, list) else [model.stream_config]
        )

        stream_configs = [
            self._create_component_from_model(
                stream_config, config=config, parameters=model.parameters or {}
            )
            for stream_config in model_stream_configs
        ]

        components_mapping = [
            self._create_component_from_model(
                model=components_mapping_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    components_mapping_definition_model.value_type
                ),
                config=config,
                parameters=model.parameters,
            )
            for components_mapping_definition_model in model.components_mapping
        ]

        return ConfigComponentsResolver(
            stream_configs=stream_configs,
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )
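    # Illustrative sketch only (not executed): a config-driven resolver where stream
    # definitions are read from the connector config, assuming `factory` is an instance
    # of this class. The pointer, mapping, and interpolation variable are assumptions.
    #
    #   resolver_model = ConfigComponentsResolverModel(
    #       type="ConfigComponentsResolver",
    #       stream_config=StreamConfigModel(type="StreamConfig", configs_pointer=["custom_streams"]),
    #       components_mapping=[
    #           ComponentMappingDefinitionModel(
    #               type="ComponentMappingDefinition",
    #               field_path=["name"],
    #               value="{{ components_values['name'] }}",
    #           )
    #       ],
    #   )
    #   resolver = factory.create_config_components_resolver(resolver_model, config)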
    def create_parametrized_components_resolver(
        self,
        model: ParametrizedComponentsResolverModel,
        config: Config,
    ) -> ParametrizedComponentsResolver:
        stream_parameters = StreamParametersDefinition(
            list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream
        )

        components_mapping = []
        for components_mapping_definition_model in model.components_mapping:
            if components_mapping_definition_model.condition:
                raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
            components_mapping.append(
                self._create_component_from_model(
                    model=components_mapping_definition_model,
                    value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                        components_mapping_definition_model.value_type
                    ),
                    config=config,
                )
            )
        return ParametrizedComponentsResolver(
            stream_parameters=stream_parameters,
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )
    def create_http_api_budget(
        self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any
    ) -> HttpAPIBudget:
        policies = [
            self._create_component_from_model(model=policy, config=config)
            for policy in model.policies
        ]

        return HttpAPIBudget(
            policies=policies,
            ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset",
            ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining",
            status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429],
        )
    def create_fixed_window_call_rate_policy(
        self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> FixedWindowCallRatePolicy:
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]

        # Set the initial reset timestamp to 10 days from now.
        # This value will be updated by the first request.
        return FixedWindowCallRatePolicy(
            next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10),
            period=parse_duration(model.period),
            call_limit=model.call_limit,
            matchers=matchers,
        )
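    # Illustrative sketch only (not executed): an API budget combining the two factory
    # methods above, assuming `factory` is an instance of this class. `period` must be an
    # ISO 8601 duration because it is parsed with isodate.parse_duration; the concrete
    # limits below are assumptions.
    #
    #   policy_model = FixedWindowCallRatePolicyModel(
    #       type="FixedWindowCallRatePolicy",
    #       period="PT1H",
    #       call_limit=1000,
    #       matchers=[],
    #   )
    #   budget_model = HTTPAPIBudgetModel(type="HTTPAPIBudget", policies=[policy_model])
    #   api_budget = factory.create_http_api_budget(budget_model, config={})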
    def create_file_uploader(
        self, model: FileUploaderModel, config: Config, **kwargs: Any
    ) -> FileUploader:
        name = "File Uploader"
        requester = self._create_component_from_model(
            model=model.requester,
            config=config,
            name=name,
            **kwargs,
        )
        download_target_extractor = self._create_component_from_model(
            model=model.download_target_extractor,
            config=config,
            name=name,
            **kwargs,
        )
        emit_connector_builder_messages = self._emit_connector_builder_messages
        file_uploader = DefaultFileUploader(
            requester=requester,
            download_target_extractor=download_target_extractor,
            config=config,
            file_writer=NoopFileWriter()
            if emit_connector_builder_messages
            else LocalFileSystemFileWriter(),
            parameters=model.parameters or {},
            filename_extractor=model.filename_extractor or None,
        )

        return (
            ConnectorBuilderFileUploader(file_uploader)
            if emit_connector_builder_messages
            else file_uploader
        )
    def create_moving_window_call_rate_policy(
        self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> MovingWindowCallRatePolicy:
        rates = [
            self._create_component_from_model(model=rate, config=config) for rate in model.rates
        ]
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]
        return MovingWindowCallRatePolicy(
            rates=rates,
            matchers=matchers,
        )
    def create_unlimited_call_rate_policy(
        self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> UnlimitedCallRatePolicy:
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]

        return UnlimitedCallRatePolicy(
            matchers=matchers,
        )
    def create_http_request_matcher(
        self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any
    ) -> HttpRequestRegexMatcher:
        return HttpRequestRegexMatcher(
            method=model.method,
            url_base=model.url_base,
            url_path_pattern=model.url_path_pattern,
            params=model.params,
            headers=model.headers,
        )
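    # Illustrative sketch only (not executed): a matcher scoping a rate limit policy to a
    # single endpoint, assuming `factory` is an instance of this class. The endpoint
    # details below are assumptions.
    #
    #   matcher_model = HttpRequestRegexMatcherModel(
    #       type="HttpRequestRegexMatcher",
    #       method="GET",
    #       url_base="https://api.example.com",
    #       url_path_pattern="^/v1/items",
    #   )
    #   matcher = factory.create_http_request_matcher(matcher_model, config={})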
    def create_grouping_partition_router(
        self,
        model: GroupingPartitionRouterModel,
        config: Config,
        *,
        stream_name: str,
        **kwargs: Any,
    ) -> GroupingPartitionRouter:
        underlying_router = self._create_component_from_model(
            model=model.underlying_partition_router,
            config=config,
            stream_name=stream_name,
            **kwargs,
        )
        if model.group_size < 1:
            raise ValueError(f"Group size must be greater than 0, got {model.group_size}")

        # Request options in underlying partition routers are not supported for GroupingPartitionRouter
        # because they are specific to individual partitions and cannot be aggregated or handled
        # when grouping, potentially leading to incorrect API calls. Any request customization
        # should be managed at the stream level through the requester's configuration.
        if isinstance(underlying_router, SubstreamPartitionRouter):
            if any(
                parent_config.request_option
                for parent_config in underlying_router.parent_stream_configs
            ):
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        if isinstance(underlying_router, ListPartitionRouter):
            if underlying_router.request_option:
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        return GroupingPartitionRouter(
            group_size=model.group_size,
            underlying_partition_router=underlying_router,
            deduplicate=model.deduplicate if model.deduplicate is not None else True,
            config=config,
        )
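    # Illustrative sketch only (not executed): grouping the partitions of a list router
    # into batches of 10, assuming `factory` is an instance of this class. group_size must
    # be at least 1, deduplicate defaults to True when omitted, and the wrapped router must
    # not declare request options; the partition values below are assumptions.
    #
    #   grouping_model = GroupingPartitionRouterModel(
    #       type="GroupingPartitionRouter",
    #       group_size=10,
    #       underlying_partition_router=ListPartitionRouterModel(
    #           type="ListPartitionRouter",
    #           cursor_field="section",
    #           values=["a", "b", "c"],
    #       ),
    #   )
    #   router = factory.create_grouping_partition_router(
    #       grouping_model, config={}, stream_name="example_stream"
    #   )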