airbyte_cdk.sources.declarative.parsers.model_to_component_factory
#
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
#

from __future__ import annotations

import datetime
import importlib
import inspect
import logging
import re
from functools import partial
from typing import (
    Any, Callable, Dict, List, Mapping, MutableMapping, Optional, Type, Union,
    cast, get_args, get_origin, get_type_hints,
)

from airbyte_protocol_dataclasses.models import ConfiguredAirbyteStream
from isodate import parse_duration
from pydantic.v1 import BaseModel
from requests import Response

from airbyte_cdk.connector_builder.models import LogMessage as ConnectorBuilderLogMessage
from airbyte_cdk.legacy.sources.declarative.declarative_stream import DeclarativeStream
from airbyte_cdk.legacy.sources.declarative.incremental import DatetimeBasedCursor
from airbyte_cdk.models import (
    AirbyteStateBlob, AirbyteStateMessage, AirbyteStateType, AirbyteStreamState,
    ConfiguredAirbyteCatalog, FailureType, Level, StreamDescriptor,
)
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator
from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker
from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository
from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus
from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator, JwtAuthenticator
from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator, NoAuth
from airbyte_cdk.sources.declarative.auth.jwt import JwtAlgorithm
from airbyte_cdk.sources.declarative.auth.oauth import DeclarativeSingleUseRefreshTokenOauth2Authenticator
from airbyte_cdk.sources.declarative.auth.selective_authenticator import SelectiveAuthenticator
from airbyte_cdk.sources.declarative.auth.token import (
    ApiKeyAuthenticator, BasicHttpAuthenticator, BearerAuthenticator, LegacySessionTokenAuthenticator,
)
from airbyte_cdk.sources.declarative.auth.token_provider import (
    InterpolatedStringTokenProvider, SessionTokenProvider, TokenProvider,
)
from airbyte_cdk.sources.declarative.checks import CheckDynamicStream, CheckStream, DynamicStreamCheckConfig
from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel
from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime
from airbyte_cdk.sources.declarative.decoders import (
    Decoder, IterableDecoder, JsonDecoder, PaginationDecoderDecorator, XmlDecoder, ZipfileDecoder,
)
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
    CompositeRawDecoder, CsvParser, GzipParser, JsonLineParser, JsonParser, Parser,
)
from airbyte_cdk.sources.declarative.extractors import (
    DpathExtractor, RecordFilter, RecordSelector, ResponseToFileExtractor,
)
from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor
from airbyte_cdk.sources.declarative.extractors.record_filter import ClientSideIncrementalRecordFilterDecorator
from airbyte_cdk.sources.declarative.incremental import ConcurrentCursorFactory, ConcurrentPerPartitionCursor
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping
from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import (
    LegacyToPerPartitionStateMigration,
)
from airbyte_cdk.sources.declarative.models import CustomStateMigration, PaginationResetLimits
from airbyte_cdk.sources.declarative.models.base_model_with_deprecations import (
    DEPRECATION_LOGS_TAG, BaseModelWithDeprecations,
)
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    Action1 as PaginationResetActionModel,
    AddedFieldDefinition as AddedFieldDefinitionModel,
    AddFields as AddFieldsModel,
    ApiKeyAuthenticator as ApiKeyAuthenticatorModel,
    AsyncJobStatusMap as AsyncJobStatusMapModel,
    AsyncRetriever as AsyncRetrieverModel,
    BasicHttpAuthenticator as BasicHttpAuthenticatorModel,
    BearerAuthenticator as BearerAuthenticatorModel,
    CheckDynamicStream as CheckDynamicStreamModel,
    CheckStream as CheckStreamModel,
    ComplexFieldType as ComplexFieldTypeModel,
    ComponentMappingDefinition as ComponentMappingDefinitionModel,
    CompositeErrorHandler as CompositeErrorHandlerModel,
    ConcurrencyLevel as ConcurrencyLevelModel,
    ConfigAddFields as ConfigAddFieldsModel,
    ConfigComponentsResolver as ConfigComponentsResolverModel,
    ConfigMigration as ConfigMigrationModel,
    ConfigRemapField as ConfigRemapFieldModel,
    ConfigRemoveFields as ConfigRemoveFieldsModel,
    ConstantBackoffStrategy as ConstantBackoffStrategyModel,
    CsvDecoder as CsvDecoderModel,
    CursorPagination as CursorPaginationModel,
    CustomAuthenticator as CustomAuthenticatorModel,
    CustomBackoffStrategy as CustomBackoffStrategyModel,
    CustomConfigTransformation as CustomConfigTransformationModel,
    CustomDecoder as CustomDecoderModel,
    CustomErrorHandler as CustomErrorHandlerModel,
    CustomPaginationStrategy as CustomPaginationStrategyModel,
    CustomPartitionRouter as CustomPartitionRouterModel,
    CustomRecordExtractor as CustomRecordExtractorModel,
    CustomRecordFilter as CustomRecordFilterModel,
    CustomRequester as CustomRequesterModel,
    CustomRetriever as CustomRetrieverModel,
    CustomSchemaLoader as CustomSchemaLoader,
    CustomSchemaNormalization as CustomSchemaNormalizationModel,
    CustomTransformation as CustomTransformationModel,
    CustomValidationStrategy as CustomValidationStrategyModel,
    DatetimeBasedCursor as DatetimeBasedCursorModel,
    DeclarativeStream as DeclarativeStreamModel,
    DefaultErrorHandler as DefaultErrorHandlerModel,
    DefaultPaginator as DefaultPaginatorModel,
    DpathExtractor as DpathExtractorModel,
    DpathFlattenFields as DpathFlattenFieldsModel,
    DpathValidator as DpathValidatorModel,
    DynamicSchemaLoader as DynamicSchemaLoaderModel,
    DynamicStreamCheckConfig as DynamicStreamCheckConfigModel,
    ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
    FileUploader as FileUploaderModel,
    FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel,
    FlattenFields as FlattenFieldsModel,
    GroupByKeyMergeStrategy as GroupByKeyMergeStrategyModel,
    GroupingPartitionRouter as GroupingPartitionRouterModel,
    GzipDecoder as GzipDecoderModel,
    HTTPAPIBudget as HTTPAPIBudgetModel,
    HttpComponentsResolver as HttpComponentsResolverModel,
    HttpRequester as HttpRequesterModel,
    HttpRequestRegexMatcher as HttpRequestRegexMatcherModel,
    HttpResponseFilter as HttpResponseFilterModel,
    IncrementingCountCursor as IncrementingCountCursorModel,
    InlineSchemaLoader as InlineSchemaLoaderModel,
    IterableDecoder as IterableDecoderModel,
    JsonDecoder as JsonDecoderModel,
    JsonFileSchemaLoader as JsonFileSchemaLoaderModel,
    JsonlDecoder as JsonlDecoderModel,
    JsonSchemaPropertySelector as JsonSchemaPropertySelectorModel,
    JwtAuthenticator as JwtAuthenticatorModel,
    JwtHeaders as JwtHeadersModel,
    JwtPayload as JwtPayloadModel,
    KeysReplace as KeysReplaceModel,
    KeysToLower as KeysToLowerModel,
    KeysToSnakeCase as KeysToSnakeCaseModel,
    LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel,
    LegacyToPerPartitionStateMigration as LegacyToPerPartitionStateMigrationModel,
    ListPartitionRouter as ListPartitionRouterModel,
    MinMaxDatetime as MinMaxDatetimeModel,
    MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel,
    NoAuth as NoAuthModel,
    NoPagination as NoPaginationModel,
    OAuthAuthenticator as OAuthAuthenticatorModel,
    OffsetIncrement as OffsetIncrementModel,
    PageIncrement as PageIncrementModel,
    PaginationReset as PaginationResetModel,
    ParametrizedComponentsResolver as ParametrizedComponentsResolverModel,
    ParentStreamConfig as ParentStreamConfigModel,
    PredicateValidator as PredicateValidatorModel,
    PropertiesFromEndpoint as PropertiesFromEndpointModel,
    PropertyChunking as PropertyChunkingModel,
    PropertyLimitType as PropertyLimitTypeModel,
    QueryProperties as QueryPropertiesModel,
    Rate as RateModel,
    RecordFilter as RecordFilterModel,
    RecordSelector as RecordSelectorModel,
    RemoveFields as RemoveFieldsModel,
    RequestOption as RequestOptionModel,
    RequestPath as RequestPathModel,
    ResponseToFileExtractor as ResponseToFileExtractorModel,
    SchemaNormalization as SchemaNormalizationModel,
    SchemaTypeIdentifier as SchemaTypeIdentifierModel,
    SelectiveAuthenticator as SelectiveAuthenticatorModel,
    SessionTokenAuthenticator as SessionTokenAuthenticatorModel,
    SimpleRetriever as SimpleRetrieverModel,
    Spec as SpecModel,
    StateDelegatingStream as StateDelegatingStreamModel,
    StreamConfig as StreamConfigModel,
    SubstreamPartitionRouter as SubstreamPartitionRouterModel,
    TypesMap as TypesMapModel,
    UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel,
    ValidateAdheresToSchema as ValidateAdheresToSchemaModel,
    ValueType,
    WaitTimeFromHeader as WaitTimeFromHeaderModel,
    WaitUntilTimeFromHeader as WaitUntilTimeFromHeaderModel,
    XmlDecoder as XmlDecoderModel,
    ZipfileDecoder as ZipfileDecoderModel,
)
from airbyte_cdk.sources.declarative.partition_routers import (
    CartesianProductStreamSlicer, GroupingPartitionRouter, ListPartitionRouter, PartitionRouter,
    SinglePartitionRouter, SubstreamPartitionRouter,
)
from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import AsyncJobPartitionRouter
from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ParentStreamConfig
from airbyte_cdk.sources.declarative.requesters import HttpRequester, RequestOption
from airbyte_cdk.sources.declarative.requesters.error_handlers import (
    CompositeErrorHandler, DefaultErrorHandler, HttpResponseFilter,
)
from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies import (
    ConstantBackoffStrategy, ExponentialBackoffStrategy, WaitTimeFromHeaderBackoffStrategy,
    WaitUntilTimeFromHeaderBackoffStrategy,
)
from airbyte_cdk.sources.declarative.requesters.http_job_repository import AsyncHttpJobRepository
from airbyte_cdk.sources.declarative.requesters.paginators import (
    DefaultPaginator, NoPagination, PaginatorTestReadDecorator,
)
from airbyte_cdk.sources.declarative.requesters.paginators.strategies import (
    CursorPaginationStrategy, CursorStopCondition, OffsetIncrement, PageIncrement,
    StopConditionPaginationStrategyDecorator,
)
from airbyte_cdk.sources.declarative.requesters.query_properties import (
    PropertiesFromEndpoint, PropertyChunking, QueryProperties,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import PropertyLimitType
from airbyte_cdk.sources.declarative.requesters.query_properties.property_selector import (
    JsonSchemaPropertySelector,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import GroupByKey
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType
from airbyte_cdk.sources.declarative.requesters.request_options import (
    DatetimeBasedRequestOptionsProvider, DefaultRequestOptionsProvider,
    InterpolatedRequestOptionsProvider, RequestOptionsProvider,
)
from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import (
    PerPartitionRequestOptionsProvider,
)
from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester
from airbyte_cdk.sources.declarative.resolvers import (
    ComponentMappingDefinition, ConfigComponentsResolver, HttpComponentsResolver,
    ParametrizedComponentsResolver, StreamConfig, StreamParametersDefinition,
)
from airbyte_cdk.sources.declarative.retrievers import AsyncRetriever, LazySimpleRetriever, SimpleRetriever
from airbyte_cdk.sources.declarative.retrievers.file_uploader import (
    ConnectorBuilderFileUploader, DefaultFileUploader, FileUploader, LocalFileSystemFileWriter,
    NoopFileWriter,
)
from airbyte_cdk.sources.declarative.retrievers.pagination_tracker import PaginationTracker
from airbyte_cdk.sources.declarative.schema import (
    ComplexFieldType, DefaultSchemaLoader, DynamicSchemaLoader, InlineSchemaLoader,
    JsonFileSchemaLoader, SchemaLoader, SchemaTypeIdentifier, TypesMap,
)
from airbyte_cdk.sources.declarative.schema.caching_schema_loader_decorator import CachingSchemaLoaderDecorator
from airbyte_cdk.sources.declarative.schema.composite_schema_loader import CompositeSchemaLoader
from airbyte_cdk.sources.declarative.spec import ConfigMigration, Spec
from airbyte_cdk.sources.declarative.stream_slicers import StreamSlicer, StreamSlicerTestReadDecorator
from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import (
    DeclarativePartitionFactory, StreamSlicerPartitionGenerator,
)
from airbyte_cdk.sources.declarative.transformations import AddFields, RecordTransformation, RemoveFields
from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition
from airbyte_cdk.sources.declarative.transformations.config_transformations import (
    ConfigAddFields, ConfigRemapField, ConfigRemoveFields,
)
from airbyte_cdk.sources.declarative.transformations.config_transformations.config_transformation import (
    ConfigTransformation,
)
from airbyte_cdk.sources.declarative.transformations.dpath_flatten_fields import (
    DpathFlattenFields, KeyTransformation,
)
from airbyte_cdk.sources.declarative.transformations.flatten_fields import FlattenFields
from airbyte_cdk.sources.declarative.transformations.keys_replace_transformation import KeysReplaceTransformation
from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import KeysToLowerTransformation
from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import (
    KeysToSnakeCaseTransformation,
)
from airbyte_cdk.sources.declarative.validators import DpathValidator, PredicateValidator, ValidateAdheresToSchema
from airbyte_cdk.sources.http_logger import format_http_message
from airbyte_cdk.sources.message import (
    InMemoryMessageRepository, LogAppenderMessageRepositoryDecorator, MessageRepository,
    NoopMessageRepository,
)
from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository
from airbyte_cdk.sources.streams.call_rate import (
    APIBudget, FixedWindowCallRatePolicy, HttpAPIBudget, HttpRequestRegexMatcher,
    MovingWindowCallRatePolicy, Rate, UnlimitedCallRatePolicy,
)
from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
from airbyte_cdk.sources.streams.concurrent.clamping import (
    ClampingEndProvider, ClampingStrategy, DayClampingStrategy, MonthClampingStrategy, NoClamping,
    WeekClampingStrategy, Weekday,
)
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, Cursor, CursorField, FinalStateCursor
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import (
    StreamSlicer as ConcurrentStreamSlicer,
)
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
    CustomFormatConcurrentStreamStateConverter, DateTimeStreamStateConverter,
)
from airbyte_cdk.sources.streams.concurrent.state_converters.incrementing_count_stream_state_converter import (
    IncrementingCountStreamStateConverter,
)
from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction
from airbyte_cdk.sources.types import Config
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer

ComponentDefinition = Mapping[str, Any]

SCHEMA_TRANSFORMER_TYPE_MAPPING = {
    SchemaNormalizationModel.None_: TransformConfig.NoTransform,
    SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
}
_NO_STREAM_SLICING = SinglePartitionRouter(parameters={})

# Ideally this should use the value defined in ConcurrentDeclarativeSource, but
# this would be a circular import
MAX_SLICES = 5

LOGGER = logging.getLogger("airbyte.model_to_component_factory")
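# Illustrative sketch (hypothetical manifest fragment): a ComponentDefinition is simply the mapping parsed out
# of a manifest, keyed by "type" so the factory can match it to the Pydantic model of the same name, e.g.
#
#     {"type": "RecordFilter", "condition": "{{ record['status'] == 'active' }}"}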
class ModelToComponentFactory:
    EPOCH_DATETIME_FORMAT = "%s"

    def __init__(
        self,
        limit_pages_fetched_per_slice: Optional[int] = None,
        limit_slices_fetched: Optional[int] = None,
        emit_connector_builder_messages: bool = False,
        disable_retries: bool = False,
        disable_cache: bool = False,
        message_repository: Optional[MessageRepository] = None,
        connector_state_manager: Optional[ConnectorStateManager] = None,
        max_concurrent_async_job_count: Optional[int] = None,
        configured_catalog: Optional[ConfiguredAirbyteCatalog] = None,
    ):
        self._init_mappings()
        self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
        self._limit_slices_fetched = limit_slices_fetched
        self._emit_connector_builder_messages = emit_connector_builder_messages
        self._disable_retries = disable_retries
        self._disable_cache = disable_cache
        self._message_repository = message_repository or InMemoryMessageRepository(
            self._evaluate_log_level(emit_connector_builder_messages)
        )
        self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream(
            configured_catalog
        )
        self._connector_state_manager = connector_state_manager or ConnectorStateManager()
        self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None
        self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1)
        # placeholder for deprecation warnings
        self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []

    def _init_mappings(self) -> None:
        self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = {
            AddedFieldDefinitionModel: self.create_added_field_definition,
            AddFieldsModel: self.create_add_fields,
            ApiKeyAuthenticatorModel: self.create_api_key_authenticator,
            BasicHttpAuthenticatorModel: self.create_basic_http_authenticator,
            BearerAuthenticatorModel: self.create_bearer_authenticator,
            CheckStreamModel: self.create_check_stream,
            DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config,
            CheckDynamicStreamModel: self.create_check_dynamic_stream,
            CompositeErrorHandlerModel: self.create_composite_error_handler,
            ConcurrencyLevelModel: self.create_concurrency_level,
            ConfigMigrationModel: self.create_config_migration,
            ConfigAddFieldsModel: self.create_config_add_fields,
            ConfigRemapFieldModel: self.create_config_remap_field,
            ConfigRemoveFieldsModel: self.create_config_remove_fields,
            ConstantBackoffStrategyModel: self.create_constant_backoff_strategy,
            CsvDecoderModel: self.create_csv_decoder,
            CursorPaginationModel: self.create_cursor_pagination,
            CustomAuthenticatorModel: self.create_custom_component,
            CustomBackoffStrategyModel: self.create_custom_component,
            CustomDecoderModel: self.create_custom_component,
            CustomErrorHandlerModel: self.create_custom_component,
            CustomRecordExtractorModel: self.create_custom_component,
            CustomRecordFilterModel: self.create_custom_component,
            CustomRequesterModel: self.create_custom_component,
            CustomRetrieverModel: self.create_custom_component,
            CustomSchemaLoader: self.create_custom_component,
            CustomSchemaNormalizationModel: self.create_custom_component,
            CustomStateMigration: self.create_custom_component,
            CustomPaginationStrategyModel: self.create_custom_component,
            CustomPartitionRouterModel: self.create_custom_component,
            CustomTransformationModel: self.create_custom_component,
            CustomValidationStrategyModel: self.create_custom_component,
            CustomConfigTransformationModel: self.create_custom_component,
            DatetimeBasedCursorModel: self.create_datetime_based_cursor,
            DeclarativeStreamModel: self.create_default_stream,
            DefaultErrorHandlerModel: self.create_default_error_handler,
            DefaultPaginatorModel: self.create_default_paginator,
            DpathExtractorModel: self.create_dpath_extractor,
            DpathValidatorModel: self.create_dpath_validator,
            ResponseToFileExtractorModel: self.create_response_to_file_extractor,
            ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy,
            SessionTokenAuthenticatorModel: self.create_session_token_authenticator,
            GroupByKeyMergeStrategyModel: self.create_group_by_key,
            HttpRequesterModel: self.create_http_requester,
            HttpResponseFilterModel: self.create_http_response_filter,
            InlineSchemaLoaderModel: self.create_inline_schema_loader,
            JsonDecoderModel: self.create_json_decoder,
            JsonlDecoderModel: self.create_jsonl_decoder,
            JsonSchemaPropertySelectorModel: self.create_json_schema_property_selector,
            GzipDecoderModel: self.create_gzip_decoder,
            KeysToLowerModel: self.create_keys_to_lower_transformation,
            KeysToSnakeCaseModel: self.create_keys_to_snake_transformation,
            KeysReplaceModel: self.create_keys_replace_transformation,
            FlattenFieldsModel: self.create_flatten_fields,
            DpathFlattenFieldsModel: self.create_dpath_flatten_fields,
            IterableDecoderModel: self.create_iterable_decoder,
            IncrementingCountCursorModel: self.create_incrementing_count_cursor,
            XmlDecoderModel: self.create_xml_decoder,
            JsonFileSchemaLoaderModel: self.create_json_file_schema_loader,
            DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
            SchemaTypeIdentifierModel: self.create_schema_type_identifier,
            TypesMapModel: self.create_types_map,
            ComplexFieldTypeModel: self.create_complex_field_type,
            JwtAuthenticatorModel: self.create_jwt_authenticator,
            LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
            ListPartitionRouterModel: self.create_list_partition_router,
            MinMaxDatetimeModel: self.create_min_max_datetime,
            NoAuthModel: self.create_no_auth,
            NoPaginationModel: self.create_no_pagination,
            OAuthAuthenticatorModel: self.create_oauth_authenticator,
            OffsetIncrementModel: self.create_offset_increment,
            PageIncrementModel: self.create_page_increment,
            ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper,
            PredicateValidatorModel: self.create_predicate_validator,
            PropertiesFromEndpointModel: self.create_properties_from_endpoint,
            PropertyChunkingModel: self.create_property_chunking,
            QueryPropertiesModel: self.create_query_properties,
            RecordFilterModel: self.create_record_filter,
            RecordSelectorModel: self.create_record_selector,
            RemoveFieldsModel: self.create_remove_fields,
            RequestPathModel: self.create_request_path,
            RequestOptionModel: self.create_request_option,
            LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator,
            SelectiveAuthenticatorModel: self.create_selective_authenticator,
            SimpleRetrieverModel: self.create_simple_retriever,
            StateDelegatingStreamModel: self.create_state_delegating_stream,
            SpecModel: self.create_spec,
            SubstreamPartitionRouterModel: self.create_substream_partition_router,
            ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema,
            WaitTimeFromHeaderModel: self.create_wait_time_from_header,
            WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header,
            AsyncRetrieverModel: self.create_async_retriever,
            HttpComponentsResolverModel: self.create_http_components_resolver,
            ConfigComponentsResolverModel: self.create_config_components_resolver,
            ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver,
            StreamConfigModel: self.create_stream_config,
            ComponentMappingDefinitionModel: self.create_components_mapping_definition,
            ZipfileDecoderModel: self.create_zipfile_decoder,
            HTTPAPIBudgetModel: self.create_http_api_budget,
            FileUploaderModel: self.create_file_uploader,
            FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
            MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
            UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
            RateModel: self.create_rate,
            HttpRequestRegexMatcherModel: self.create_http_request_matcher,
            GroupingPartitionRouterModel: self.create_grouping_partition_router,
        }

        # Needed for the case where we need to perform a second parse on the fields of a custom component
        self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR}

    @staticmethod
    def _create_stream_name_to_configured_stream(
        configured_catalog: Optional[ConfiguredAirbyteCatalog],
    ) -> Mapping[str, ConfiguredAirbyteStream]:
        return (
            {stream.stream.name: stream for stream in configured_catalog.streams}
            if configured_catalog
            else {}
        )

    def create_component(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        config: Config,
        **kwargs: Any,
    ) -> Any:
        """
        Takes a given Pydantic model type and a Mapping representing a component definition, and creates the
        declarative component and its subcomponents that will be used at runtime. This is done by first parsing
        the mapping into a Pydantic model and then creating the declarative components from that model.

        :param model_type: The type of declarative component that is being initialized
        :param component_definition: The mapping that represents a declarative component
        :param config: The connector config that is provided by the customer
        :return: The declarative component to be used at runtime
        """

        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        declarative_component_model = model_type.parse_obj(component_definition)

        if not isinstance(declarative_component_model, model_type):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}"
            )

        return self._create_component_from_model(
            model=declarative_component_model, config=config, **kwargs
        )

    def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any:
        if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR:
            raise ValueError(
                f"{model.__class__} with attributes {model} is not a valid component type"
            )
        component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__)
        if not component_constructor:
            raise ValueError(f"Could not find constructor for {model.__class__}")

        # collect deprecation warnings for supported models.
        if isinstance(model, BaseModelWithDeprecations):
            self._collect_model_deprecations(model)

        return component_constructor(model=model, config=config, **kwargs)

    def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]:
        """
        Returns the deprecation warnings that were collected during the creation of components.
        """
        return self._collected_deprecation_logs
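    # Illustrative usage sketch, assuming a hypothetical manifest snippet (the definition and config literals
    # below are made up; the factory only requires that the "type" value matches the model class name):
    #
    #     factory = ModelToComponentFactory()
    #     extractor = factory.create_component(
    #         model_type=DpathExtractorModel,
    #         component_definition={
    #             "type": "DpathExtractor",
    #             "field_path": ["data"],
    #             "$parameters": {"name": "example_stream"},
    #         },
    #         config={"api_key": "..."},
    #     )
    #     # The definition is parsed into a DpathExtractorModel, then built into a runtime DpathExtractor by
    #     # the constructor registered for that model in PYDANTIC_MODEL_TO_CONSTRUCTOR.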
    def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None:
        """
        Collects deprecation logs from the given model and appends any new logs to the internal collection.

        This method checks if the provided model has deprecation logs (identified by the presence of the
        DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each
        deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not
        already been collected, ensuring that duplicate logs are avoided.

        Args:
            model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs.
        """
        if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None:
            for log in model._deprecation_logs:
                # avoid duplicates for deprecation logs observed.
                if log not in self._collected_deprecation_logs:
                    self._collected_deprecation_logs.append(log)

    def create_config_migration(
        self, model: ConfigMigrationModel, config: Config
    ) -> ConfigMigration:
        transformations: List[ConfigTransformation] = [
            self._create_component_from_model(transformation, config)
            for transformation in model.transformations
        ]

        return ConfigMigration(
            description=model.description,
            transformations=transformations,
        )

    def create_config_add_fields(
        self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigAddFields:
        fields = [self._create_component_from_model(field, config) for field in model.fields]
        return ConfigAddFields(
            fields=fields,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remove_fields(
        model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any
    ) -> ConfigRemoveFields:
        return ConfigRemoveFields(
            field_pointers=model.field_pointers,
            condition=model.condition or "",
        )

    @staticmethod
    def create_config_remap_field(
        model: ConfigRemapFieldModel, config: Config, **kwargs: Any
    ) -> ConfigRemapField:
        mapping = cast(Mapping[str, Any], model.map)
        return ConfigRemapField(
            map=mapping,
            field_path=model.field_path,
            config=config,
        )

    def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return DpathValidator(
            field_path=model.field_path,
            strategy=strategy,
        )

    def create_predicate_validator(
        self, model: PredicateValidatorModel, config: Config
    ) -> PredicateValidator:
        strategy = self._create_component_from_model(model.validation_strategy, config)

        return PredicateValidator(
            value=model.value,
            strategy=strategy,
        )

    @staticmethod
    def create_validate_adheres_to_schema(
        model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any
    ) -> ValidateAdheresToSchema:
        base_schema = cast(Mapping[str, Any], model.base_schema)
        return ValidateAdheresToSchema(
            schema=base_schema,
        )

    @staticmethod
    def create_added_field_definition(
        model: AddedFieldDefinitionModel, config: Config, **kwargs: Any
    ) -> AddedFieldDefinition:
        interpolated_value = InterpolatedString.create(
            model.value, parameters=model.parameters or {}
        )
        return AddedFieldDefinition(
            path=model.path,
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            parameters=model.parameters or {},
        )
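    # Illustrative manifest fragment (hypothetical, for illustration only): an AddFields transformation whose
    # single AddedFieldDefinition writes an interpolated record value to the path ["copied_id"]:
    #
    #     {
    #         "type": "AddFields",
    #         "fields": [
    #             {"type": "AddedFieldDefinition", "path": ["copied_id"], "value": "{{ record['id'] }}"}
    #         ],
    #     }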
    def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields:
        added_field_definitions = [
            self._create_component_from_model(
                model=added_field_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    added_field_definition_model.value_type
                ),
                config=config,
            )
            for added_field_definition_model in model.fields
        ]
        return AddFields(
            fields=added_field_definitions,
            condition=model.condition or "",
            parameters=model.parameters or {},
        )

    def create_keys_to_lower_transformation(
        self, model: KeysToLowerModel, config: Config, **kwargs: Any
    ) -> KeysToLowerTransformation:
        return KeysToLowerTransformation()

    def create_keys_to_snake_transformation(
        self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any
    ) -> KeysToSnakeCaseTransformation:
        return KeysToSnakeCaseTransformation()

    def create_keys_replace_transformation(
        self, model: KeysReplaceModel, config: Config, **kwargs: Any
    ) -> KeysReplaceTransformation:
        return KeysReplaceTransformation(
            old=model.old, new=model.new, parameters=model.parameters or {}
        )

    def create_flatten_fields(
        self, model: FlattenFieldsModel, config: Config, **kwargs: Any
    ) -> FlattenFields:
        return FlattenFields(
            flatten_lists=model.flatten_lists if model.flatten_lists is not None else True
        )

    def create_dpath_flatten_fields(
        self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any
    ) -> DpathFlattenFields:
        model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path]
        key_transformation = (
            KeyTransformation(
                config=config,
                prefix=model.key_transformation.prefix,
                suffix=model.key_transformation.suffix,
                parameters=model.parameters or {},
            )
            if model.key_transformation is not None
            else None
        )
        return DpathFlattenFields(
            config=config,
            field_path=model_field_path,
            delete_origin_value=model.delete_origin_value
            if model.delete_origin_value is not None
            else False,
            replace_record=model.replace_record if model.replace_record is not None else False,
            key_transformation=key_transformation,
            parameters=model.parameters or {},
        )

    @staticmethod
    def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
        if not value_type:
            return None
        names_to_types = {
            ValueType.string: str,
            ValueType.number: float,
            ValueType.integer: int,
            ValueType.boolean: bool,
        }
        return names_to_types[value_type]

    def create_api_key_authenticator(
        self,
        model: ApiKeyAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> ApiKeyAuthenticator:
        if model.inject_into is None and model.header is None:
            raise ValueError(
                "Expected either inject_into or header to be set for ApiKeyAuthenticator"
            )

        if model.inject_into is not None and model.header is not None:
            raise ValueError(
                "inject_into and header cannot both be set for ApiKeyAuthenticator - remove the deprecated header option"
            )

        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )

        request_option = (
            self._create_component_from_model(
                model.inject_into, config, parameters=model.parameters or {}
            )
            if model.inject_into
            else RequestOption(
                inject_into=RequestOptionType.header,
                field_name=model.header or "",
                parameters=model.parameters or {},
            )
        )

        return ApiKeyAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            request_option=request_option,
            config=config,
            parameters=model.parameters or {},
        )
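    # Illustrative manifest fragment (hypothetical): an ApiKeyAuthenticator that injects its token through
    # `inject_into`; the deprecated top-level `header` option is mutually exclusive with it, as enforced above:
    #
    #     {
    #         "type": "ApiKeyAuthenticator",
    #         "api_token": "{{ config['api_key'] }}",
    #         "inject_into": {"type": "RequestOption", "inject_into": "header", "field_name": "X-API-KEY"},
    #     }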
    def create_legacy_to_per_partition_state_migration(
        self,
        model: LegacyToPerPartitionStateMigrationModel,
        config: Mapping[str, Any],
        declarative_stream: DeclarativeStreamModel,
    ) -> LegacyToPerPartitionStateMigration:
        retriever = declarative_stream.retriever
        if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}"
            )
        partition_router = retriever.partition_router
        if not isinstance(
            partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel)
        ):
            raise ValueError(
                f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}"
            )
        if not hasattr(partition_router, "parent_stream_configs"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
            )

        if not hasattr(declarative_stream, "incremental_sync"):
            raise ValueError(
                "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
            )

        return LegacyToPerPartitionStateMigration(
            partition_router,  # type: ignore # was already checked above
            declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
            config,
            declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
        )

    def create_session_token_authenticator(
        self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
    ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]:
        decoder = (
            self._create_component_from_model(model=model.decoder, config=config)
            if model.decoder
            else JsonDecoder(parameters={})
        )
        login_requester = self._create_component_from_model(
            model=model.login_requester,
            config=config,
            name=f"{name}_login_requester",
            decoder=decoder,
        )
        token_provider = SessionTokenProvider(
            login_requester=login_requester,
            session_token_path=model.session_token_path,
            expiration_duration=parse_duration(model.expiration_duration)
            if model.expiration_duration
            else None,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            decoder=decoder,
        )
        if model.request_authentication.type == "Bearer":
            return ModelToComponentFactory.create_bearer_authenticator(
                BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
                config,
                token_provider=token_provider,
            )
        else:
            return self.create_api_key_authenticator(
                ApiKeyAuthenticatorModel(
                    type="ApiKeyAuthenticator",
                    api_token="",
                    inject_into=model.request_authentication.inject_into,
                ),  # type: ignore # $parameters and headers default to None
                config=config,
                token_provider=token_provider,
            )

    @staticmethod
    def create_basic_http_authenticator(
        model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any
    ) -> BasicHttpAuthenticator:
        return BasicHttpAuthenticator(
            password=model.password or "",
            username=model.username,
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_bearer_authenticator(
        model: BearerAuthenticatorModel,
        config: Config,
        token_provider: Optional[TokenProvider] = None,
        **kwargs: Any,
    ) -> BearerAuthenticator:
        if token_provider is not None and model.api_token != "":
            raise ValueError(
                "If token_provider is set, api_token is ignored and has to be set to empty string."
            )
        return BearerAuthenticator(
            token_provider=(
                token_provider
                if token_provider is not None
                else InterpolatedStringTokenProvider(
                    api_token=model.api_token or "",
                    config=config,
                    parameters=model.parameters or {},
                )
            ),
            config=config,
            parameters=model.parameters or {},
        )

    @staticmethod
    def create_dynamic_stream_check_config(
        model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any
    ) -> DynamicStreamCheckConfig:
        return DynamicStreamCheckConfig(
            dynamic_stream_name=model.dynamic_stream_name,
            stream_count=model.stream_count or 0,
        )

    def create_check_stream(
        self, model: CheckStreamModel, config: Config, **kwargs: Any
    ) -> CheckStream:
        if model.dynamic_streams_check_configs is None and model.stream_names is None:
            raise ValueError(
                "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream"
            )

        dynamic_streams_check_configs = (
            [
                self._create_component_from_model(model=dynamic_stream_check_config, config=config)
                for dynamic_stream_check_config in model.dynamic_streams_check_configs
            ]
            if model.dynamic_streams_check_configs
            else []
        )

        return CheckStream(
            stream_names=model.stream_names or [],
            dynamic_streams_check_configs=dynamic_streams_check_configs,
            parameters={},
        )

    @staticmethod
    def create_check_dynamic_stream(
        model: CheckDynamicStreamModel, config: Config, **kwargs: Any
    ) -> CheckDynamicStream:
        assert model.use_check_availability is not None  # for mypy

        use_check_availability = model.use_check_availability

        return CheckDynamicStream(
            stream_count=model.stream_count,
            use_check_availability=use_check_availability,
            parameters={},
        )

    def create_composite_error_handler(
        self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any
    ) -> CompositeErrorHandler:
        error_handlers = [
            self._create_component_from_model(model=error_handler_model, config=config)
            for error_handler_model in model.error_handlers
        ]
        return CompositeErrorHandler(
            error_handlers=error_handlers, parameters=model.parameters or {}
        )

    @staticmethod
    def create_concurrency_level(
        model: ConcurrencyLevelModel, config: Config, **kwargs: Any
    ) -> ConcurrencyLevel:
        return ConcurrencyLevel(
            default_concurrency=model.default_concurrency,
            max_concurrency=model.max_concurrency,
            config=config,
            parameters={},
        )

    @staticmethod
    def apply_stream_state_migrations(
        stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any]
    ) -> MutableMapping[str, Any]:
        if stream_state_migrations:
            for state_migration in stream_state_migrations:
                if state_migration.should_migrate(stream_state):
                    # The state variable is expected to be mutable but the migrate method returns an immutable mapping.
                    stream_state = dict(state_migration.migrate(stream_state))
        return stream_state
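    # Illustrative sketch of the contract apply_stream_state_migrations relies on (hypothetical migration, not
    # a real one): each migration exposes should_migrate(state) -> bool and migrate(state) -> Mapping, and the
    # migrated result is copied into a plain dict so later migrations can keep mutating it:
    #
    #     class ExampleMigration:
    #         def should_migrate(self, stream_state):
    #             return "cursor" in stream_state
    #
    #         def migrate(self, stream_state):
    #             return {"updated_at": stream_state["cursor"]}
    #
    #     state = ModelToComponentFactory.apply_stream_state_migrations(
    #         [ExampleMigration()], {"cursor": "2024-01-01"}
    #     )
    #     # state == {"updated_at": "2024-01-01"}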
    def create_concurrent_cursor_from_datetime_based_cursor(
        self,
        model_type: Type[BaseModel],
        component_definition: ComponentDefinition,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: MutableMapping[str, Any],
        config: Config,
        message_repository: Optional[MessageRepository] = None,
        runtime_lookback_window: Optional[datetime.timedelta] = None,
        **kwargs: Any,
    ) -> ConcurrentCursor:
        component_type = component_definition.get("type")
        if component_definition.get("type") != model_type.__name__:
            raise ValueError(
                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
            )

        # FIXME: the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition`
        # instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to
        # the models (see for example
        # https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91).
        # So now we have two cases:
        # * The ComponentDefinition comes from model.__dict__, in which case we have `parameters`
        # * The ComponentDefinition comes from the manifest as a dict, in which case we have `$parameters`
        # We should change those interfaces to use the model once we clean up the code in CDS, at which point the
        # parameter propagation should happen as part of the ModelToComponentFactory.
        if "$parameters" not in component_definition and "parameters" in component_definition:
            component_definition["$parameters"] = component_definition.get("parameters")  # type: ignore # This is a dict
        datetime_based_cursor_model = model_type.parse_obj(component_definition)

        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
            raise ValueError(
                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
            )

        model_parameters = datetime_based_cursor_model.parameters or {}
        interpolated_cursor_field = InterpolatedString.create(
            datetime_based_cursor_model.cursor_field,
            parameters=model_parameters,
        )
        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))

        interpolated_partition_field_start = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_start or "start_time",
            parameters=model_parameters,
        )
        interpolated_partition_field_end = InterpolatedString.create(
            datetime_based_cursor_model.partition_field_end or "end_time",
            parameters=model_parameters,
        )

        slice_boundary_fields = (
            interpolated_partition_field_start.eval(config=config),
            interpolated_partition_field_end.eval(config=config),
        )

        datetime_format = datetime_based_cursor_model.datetime_format

        cursor_granularity = (
            parse_duration(datetime_based_cursor_model.cursor_granularity)
            if datetime_based_cursor_model.cursor_granularity
            else None
        )

        lookback_window = None
        interpolated_lookback_window = (
            InterpolatedString.create(
                datetime_based_cursor_model.lookback_window,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.lookback_window
            else None
        )
        if interpolated_lookback_window:
            evaluated_lookback_window = interpolated_lookback_window.eval(config=config)
            if evaluated_lookback_window:
                lookback_window = parse_duration(evaluated_lookback_window)

        connector_state_converter: DateTimeStreamStateConverter
        connector_state_converter = CustomFormatConcurrentStreamStateConverter(
            datetime_format=datetime_format,
            input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
            is_sequential_state=True,  # ConcurrentPerPartitionCursor only works with sequential state
            cursor_granularity=cursor_granularity,
        )

        # Adjusts the stream state by applying the runtime lookback window.
        # This is used to ensure correct state handling in case of failed partitions.
        stream_state_value = stream_state.get(cursor_field.cursor_field_key)
        if runtime_lookback_window and stream_state_value:
            new_stream_state = (
                connector_state_converter.parse_timestamp(stream_state_value)
                - runtime_lookback_window
            )
            stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
                new_stream_state
            )

        start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
        if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
            start_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.start_datetime, config=config
            )
        else:
            start_date_runtime_value = datetime_based_cursor_model.start_datetime

        end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]]
        if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel):
            end_date_runtime_value = self.create_min_max_datetime(
                model=datetime_based_cursor_model.end_datetime, config=config
            )
        else:
            end_date_runtime_value = datetime_based_cursor_model.end_datetime

        interpolated_start_date = MinMaxDatetime.create(
            interpolated_string_or_min_max_datetime=start_date_runtime_value,
            parameters=datetime_based_cursor_model.parameters,
        )
        interpolated_end_date = (
            None
            if not end_date_runtime_value
            else MinMaxDatetime.create(
                end_date_runtime_value, datetime_based_cursor_model.parameters
            )
        )

        # If the datetime format is not specified, then the start/end datetimes should inherit it from the stream slicer
        if not interpolated_start_date.datetime_format:
            interpolated_start_date.datetime_format = datetime_format
        if interpolated_end_date and not interpolated_end_date.datetime_format:
            interpolated_end_date.datetime_format = datetime_format

        start_date = interpolated_start_date.get_datetime(config=config)
        end_date_provider = (
            partial(interpolated_end_date.get_datetime, config)
            if interpolated_end_date
            else connector_state_converter.get_end_provider()
        )

        if (
            datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity
        ) or (
            not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity
        ):
            raise ValueError(
                f"If step is defined, cursor_granularity should be as well and vice-versa. "
                f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`"
            )

        # When step is not defined, default to a step size from the starting date to the present moment
        step_length = datetime.timedelta.max
        interpolated_step = (
            InterpolatedString.create(
                datetime_based_cursor_model.step,
                parameters=model_parameters,
            )
            if datetime_based_cursor_model.step
            else None
        )
        if interpolated_step:
            evaluated_step = interpolated_step.eval(config)
            if evaluated_step:
                step_length = parse_duration(evaluated_step)

        clamping_strategy: ClampingStrategy = NoClamping()
        if datetime_based_cursor_model.clamping:
            # While it is undesirable to interpolate within the model factory (as opposed to at runtime),
            # it is still better than shifting the interpolation low-code concept into the ConcurrentCursor
            # runtime object, which we want to keep agnostic of being low-code.
            target = InterpolatedString(
                string=datetime_based_cursor_model.clamping.target,
                parameters=model_parameters,
            )
            evaluated_target = target.eval(config=config)
            match evaluated_target:
                case "DAY":
                    clamping_strategy = DayClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        DayClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(seconds=1),
                    )
                case "WEEK":
                    if (
                        not datetime_based_cursor_model.clamping.target_details
                        or "weekday" not in datetime_based_cursor_model.clamping.target_details
                    ):
                        raise ValueError(
                            "Given WEEK clamping, weekday needs to be provided as target_details"
                        )
                    weekday = self._assemble_weekday(
                        datetime_based_cursor_model.clamping.target_details["weekday"]
                    )
                    clamping_strategy = WeekClampingStrategy(weekday)
                    end_date_provider = ClampingEndProvider(
                        WeekClampingStrategy(weekday, is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case "MONTH":
                    clamping_strategy = MonthClampingStrategy()
                    end_date_provider = ClampingEndProvider(
                        MonthClampingStrategy(is_ceiling=False),
                        end_date_provider,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
                        granularity=cursor_granularity or datetime.timedelta(days=1),
                    )
                case _:
                    raise ValueError(
                        f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
                    )

        return ConcurrentCursor(
            stream_name=stream_name,
            stream_namespace=stream_namespace,
            stream_state=stream_state,
            message_repository=message_repository or self._message_repository,
            connector_state_manager=self._connector_state_manager,
            connector_state_converter=connector_state_converter,
            cursor_field=cursor_field,
            slice_boundary_fields=slice_boundary_fields,
            start=start_date,  # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests.
Confirmed functionality is working in practice 1515 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1516 lookback_window=lookback_window, 1517 slice_range=step_length, 1518 cursor_granularity=cursor_granularity, 1519 clamping_strategy=clamping_strategy, 1520 ) 1521 1522 def create_concurrent_cursor_from_incrementing_count_cursor( 1523 self, 1524 model_type: Type[BaseModel], 1525 component_definition: ComponentDefinition, 1526 stream_name: str, 1527 stream_namespace: Optional[str], 1528 stream_state: MutableMapping[str, Any], 1529 config: Config, 1530 message_repository: Optional[MessageRepository] = None, 1531 **kwargs: Any, 1532 ) -> ConcurrentCursor: 1533 component_type = component_definition.get("type") 1534 if component_definition.get("type") != model_type.__name__: 1535 raise ValueError( 1536 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1537 ) 1538 1539 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1540 1541 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1542 raise ValueError( 1543 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1544 ) 1545 1546 interpolated_start_value = ( 1547 InterpolatedString.create( 1548 incrementing_count_cursor_model.start_value, # type: ignore 1549 parameters=incrementing_count_cursor_model.parameters or {}, 1550 ) 1551 if incrementing_count_cursor_model.start_value 1552 else 0 1553 ) 1554 1555 interpolated_cursor_field = InterpolatedString.create( 1556 incrementing_count_cursor_model.cursor_field, 1557 parameters=incrementing_count_cursor_model.parameters or {}, 1558 ) 1559 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1560 1561 connector_state_converter = IncrementingCountStreamStateConverter( 1562 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1563 ) 1564 1565 return ConcurrentCursor( 1566 stream_name=stream_name, 1567 stream_namespace=stream_namespace, 1568 stream_state=stream_state, 1569 message_repository=message_repository or self._message_repository, 1570 connector_state_manager=self._connector_state_manager, 1571 connector_state_converter=connector_state_converter, 1572 cursor_field=cursor_field, 1573 slice_boundary_fields=None, 1574 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1575 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1576 ) 1577 1578 def _assemble_weekday(self, weekday: str) -> Weekday: 1579 match weekday: 1580 case "MONDAY": 1581 return Weekday.MONDAY 1582 case "TUESDAY": 1583 return Weekday.TUESDAY 1584 case "WEDNESDAY": 1585 return Weekday.WEDNESDAY 1586 case "THURSDAY": 1587 return Weekday.THURSDAY 1588 case "FRIDAY": 1589 return Weekday.FRIDAY 1590 case "SATURDAY": 1591 return Weekday.SATURDAY 1592 case "SUNDAY": 1593 return Weekday.SUNDAY 1594 case _: 1595 raise ValueError(f"Unknown weekday {weekday}") 1596 1597 def create_concurrent_cursor_from_perpartition_cursor( 1598 self, 1599 state_manager: ConnectorStateManager, 1600 model_type: Type[BaseModel], 1601 component_definition: ComponentDefinition, 1602 stream_name: str, 1603 stream_namespace: Optional[str], 1604 config: Config, 1605 stream_state: MutableMapping[str, Any], 1606 partition_router: PartitionRouter, 1607 attempt_to_create_cursor_if_not_provided: bool = False, 1608 **kwargs: Any, 1609 ) -> ConcurrentPerPartitionCursor: 1610 component_type = component_definition.get("type") 1611 if component_definition.get("type") != model_type.__name__: 1612 raise ValueError( 1613 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1614 ) 1615 1616 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1617 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1618 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1619 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1620 if "$parameters" not in component_definition and "parameters" in component_definition: 1621 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1622 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1623 1624 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1625 raise ValueError( 1626 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1627 ) 1628 1629 interpolated_cursor_field = InterpolatedString.create( 1630 datetime_based_cursor_model.cursor_field, 1631 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). 
So now we have two cases: 1632 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1633 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1634 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1635 parameters=datetime_based_cursor_model.parameters or {}, 1636 ) 1637 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1638 1639 datetime_format = datetime_based_cursor_model.datetime_format 1640 1641 cursor_granularity = ( 1642 parse_duration(datetime_based_cursor_model.cursor_granularity) 1643 if datetime_based_cursor_model.cursor_granularity 1644 else None 1645 ) 1646 1647 connector_state_converter: DateTimeStreamStateConverter 1648 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1649 datetime_format=datetime_format, 1650 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1651 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1652 cursor_granularity=cursor_granularity, 1653 ) 1654 1655 # Create the cursor factory 1656 cursor_factory = ConcurrentCursorFactory( 1657 partial( 1658 self.create_concurrent_cursor_from_datetime_based_cursor, 1659 state_manager=state_manager, 1660 model_type=model_type, 1661 component_definition=component_definition, 1662 stream_name=stream_name, 1663 stream_namespace=stream_namespace, 1664 config=config, 1665 message_repository=NoopMessageRepository(), 1666 ) 1667 ) 1668 1669 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1670 use_global_cursor = isinstance( 1671 partition_router, GroupingPartitionRouter 1672 ) or component_definition.get("global_substream_cursor", False) 1673 1674 # Return the concurrent cursor and state converter 1675 return ConcurrentPerPartitionCursor( 1676 cursor_factory=cursor_factory, 1677 partition_router=partition_router, 1678 stream_name=stream_name, 1679 stream_namespace=stream_namespace, 1680 stream_state=stream_state, 1681 message_repository=self._message_repository, # type: ignore 1682 connector_state_manager=state_manager, 1683 connector_state_converter=connector_state_converter, 1684 cursor_field=cursor_field, 1685 use_global_cursor=use_global_cursor, 1686 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1687 ) 1688 1689 @staticmethod 1690 def create_constant_backoff_strategy( 1691 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1692 ) -> ConstantBackoffStrategy: 1693 return ConstantBackoffStrategy( 1694 backoff_time_in_seconds=model.backoff_time_in_seconds, 1695 config=config, 1696 parameters=model.parameters or {}, 1697 ) 1698 1699 def create_cursor_pagination( 1700 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1701 ) -> CursorPaginationStrategy: 1702 if isinstance(decoder, PaginationDecoderDecorator): 1703 inner_decoder = decoder.decoder 1704 else: 1705 inner_decoder = decoder 1706 decoder = PaginationDecoderDecorator(decoder=decoder) 1707 1708 if self._is_supported_decoder_for_pagination(inner_decoder): 1709 decoder_to_use = decoder 1710 else: 1711 raise ValueError( 1712 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1713 ) 1714 1715 return CursorPaginationStrategy( 1716 cursor_value=model.cursor_value, 1717 decoder=decoder_to_use, 1718 
page_size=model.page_size, 1719 stop_condition=model.stop_condition, 1720 config=config, 1721 parameters=model.parameters or {}, 1722 ) 1723 1724 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1725 """ 1726 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1727 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1728 :param model: The Pydantic model of the custom component being created 1729 :param config: The custom defined connector config 1730 :return: The declarative component built from the Pydantic model to be used at runtime 1731 """ 1732 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1733 component_fields = get_type_hints(custom_component_class) 1734 model_args = model.dict() 1735 model_args["config"] = config 1736 1737 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1738 # we defer to these arguments over the component's definition 1739 for key, arg in kwargs.items(): 1740 model_args[key] = arg 1741 1742 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1743 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1744 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1745 for model_field, model_value in model_args.items(): 1746 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1747 if ( 1748 isinstance(model_value, dict) 1749 and "type" not in model_value 1750 and model_field in component_fields 1751 ): 1752 derived_type = self._derive_component_type_from_type_hints( 1753 component_fields.get(model_field) 1754 ) 1755 if derived_type: 1756 model_value["type"] = derived_type 1757 1758 if self._is_component(model_value): 1759 model_args[model_field] = self._create_nested_component( 1760 model, 1761 model_field, 1762 model_value, 1763 config, 1764 **kwargs, 1765 ) 1766 elif isinstance(model_value, list): 1767 vals = [] 1768 for v in model_value: 1769 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1770 derived_type = self._derive_component_type_from_type_hints( 1771 component_fields.get(model_field) 1772 ) 1773 if derived_type: 1774 v["type"] = derived_type 1775 if self._is_component(v): 1776 vals.append( 1777 self._create_nested_component( 1778 model, 1779 model_field, 1780 v, 1781 config, 1782 **kwargs, 1783 ) 1784 ) 1785 else: 1786 vals.append(v) 1787 model_args[model_field] = vals 1788 1789 kwargs = { 1790 class_field: model_args[class_field] 1791 for class_field in component_fields.keys() 1792 if class_field in model_args 1793 } 1794 return custom_component_class(**kwargs) 1795 1796 @staticmethod 1797 def _get_class_from_fully_qualified_class_name( 1798 full_qualified_class_name: str, 1799 ) -> Any: 1800 """Get a class from its fully qualified name. 1801 1802 If a custom components module is needed, we assume it is already registered - probably 1803 as `source_declarative_manifest.components` or `components`. 1804 1805 Args: 1806 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 
1807 1808 Returns: 1809 Any: The class object. 1810 1811 Raises: 1812 ValueError: If the class cannot be loaded. 1813 """ 1814 split = full_qualified_class_name.split(".") 1815 module_name_full = ".".join(split[:-1]) 1816 class_name = split[-1] 1817 1818 try: 1819 module_ref = importlib.import_module(module_name_full) 1820 except ModuleNotFoundError as e: 1821 if split[0] == "source_declarative_manifest": 1822 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1823 try: 1824 import os 1825 1826 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1827 module_ref = importlib.import_module( 1828 module_name_with_source_declarative_manifest 1829 ) 1830 except ModuleNotFoundError: 1831 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1832 else: 1833 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1834 1835 try: 1836 return getattr(module_ref, class_name) 1837 except AttributeError as e: 1838 raise ValueError( 1839 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1840 ) from e 1841 1842 @staticmethod 1843 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1844 interface = field_type 1845 while True: 1846 origin = get_origin(interface) 1847 if origin: 1848 # Unnest types until we reach the raw type 1849 # List[T] -> T 1850 # Optional[List[T]] -> T 1851 args = get_args(interface) 1852 interface = args[0] 1853 else: 1854 break 1855 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1856 return interface.__name__ 1857 return None 1858 1859 @staticmethod 1860 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1861 if not cls: 1862 return False 1863 return cls.__module__ == "builtins" 1864 1865 @staticmethod 1866 def _extract_missing_parameters(error: TypeError) -> List[str]: 1867 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1868 if parameter_search: 1869 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1870 else: 1871 return [] 1872 1873 def _create_nested_component( 1874 self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any 1875 ) -> Any: 1876 type_name = model_value.get("type", None) 1877 if not type_name: 1878 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1879 return model_value 1880 1881 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1882 if model_type: 1883 parsed_model = model_type.parse_obj(model_value) 1884 try: 1885 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1886 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1887 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1888 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1889 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1890 # are needed by a component and could not be shared. 
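# Illustrative only (hypothetical manifest snippet): a DefaultPaginator nested inside a custom
# retriever could be handed the url_base it would otherwise inherit from the HttpRequester like so:
#   paginator:
#     type: DefaultPaginator
#     $parameters:
#       url_base: "https://api.example.com/v1"
# The lookup below forwards only those $parameters entries that match the keyword-only arguments of
# the component's constructor.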
1891 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1892 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1893 model_parameters = model_value.get("$parameters", {}) 1894 matching_parameters = { 1895 kwarg: model_parameters[kwarg] 1896 for kwarg in constructor_kwargs 1897 if kwarg in model_parameters 1898 } 1899 matching_kwargs = { 1900 kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs 1901 } 1902 return self._create_component_from_model( 1903 model=parsed_model, config=config, **(matching_parameters | matching_kwargs) 1904 ) 1905 except TypeError as error: 1906 missing_parameters = self._extract_missing_parameters(error) 1907 if missing_parameters: 1908 raise ValueError( 1909 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1910 + ", ".join( 1911 ( 1912 f"{type_name}.$parameters.{parameter}" 1913 for parameter in missing_parameters 1914 ) 1915 ) 1916 ) 1917 raise TypeError( 1918 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1919 ) 1920 else: 1921 raise ValueError( 1922 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1923 ) 1924 1925 @staticmethod 1926 def _is_component(model_value: Any) -> bool: 1927 return isinstance(model_value, dict) and model_value.get("type") is not None 1928 1929 def create_datetime_based_cursor( 1930 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1931 ) -> DatetimeBasedCursor: 1932 start_datetime: Union[str, MinMaxDatetime] = ( 1933 model.start_datetime 1934 if isinstance(model.start_datetime, str) 1935 else self.create_min_max_datetime(model.start_datetime, config) 1936 ) 1937 end_datetime: Union[str, MinMaxDatetime, None] = None 1938 if model.is_data_feed and model.end_datetime: 1939 raise ValueError("Data feed does not support end_datetime") 1940 if model.is_data_feed and model.is_client_side_incremental: 1941 raise ValueError( 1942 "`Client side incremental` cannot be applied with `data feed`. Choose only one of them."
1943 ) 1944 if model.end_datetime: 1945 end_datetime = ( 1946 model.end_datetime 1947 if isinstance(model.end_datetime, str) 1948 else self.create_min_max_datetime(model.end_datetime, config) 1949 ) 1950 1951 end_time_option = ( 1952 self._create_component_from_model( 1953 model.end_time_option, config, parameters=model.parameters or {} 1954 ) 1955 if model.end_time_option 1956 else None 1957 ) 1958 start_time_option = ( 1959 self._create_component_from_model( 1960 model.start_time_option, config, parameters=model.parameters or {} 1961 ) 1962 if model.start_time_option 1963 else None 1964 ) 1965 1966 return DatetimeBasedCursor( 1967 cursor_field=model.cursor_field, 1968 cursor_datetime_formats=model.cursor_datetime_formats 1969 if model.cursor_datetime_formats 1970 else [], 1971 cursor_granularity=model.cursor_granularity, 1972 datetime_format=model.datetime_format, 1973 end_datetime=end_datetime, 1974 start_datetime=start_datetime, 1975 step=model.step, 1976 end_time_option=end_time_option, 1977 lookback_window=model.lookback_window, 1978 start_time_option=start_time_option, 1979 partition_field_end=model.partition_field_end, 1980 partition_field_start=model.partition_field_start, 1981 message_repository=self._message_repository, 1982 is_compare_strictly=model.is_compare_strictly, 1983 config=config, 1984 parameters=model.parameters or {}, 1985 ) 1986 1987 def create_default_stream( 1988 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 1989 ) -> AbstractStream: 1990 primary_key = model.primary_key.__root__ if model.primary_key else None 1991 self._migrate_state(model, config) 1992 1993 partition_router = self._build_stream_slicer_from_partition_router( 1994 model.retriever, 1995 config, 1996 stream_name=model.name, 1997 **kwargs, 1998 ) 1999 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 2000 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 2001 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 2002 2003 end_time_option = ( 2004 self._create_component_from_model( 2005 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 2006 ) 2007 if cursor_model.end_time_option 2008 else None 2009 ) 2010 start_time_option = ( 2011 self._create_component_from_model( 2012 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 2013 ) 2014 if cursor_model.start_time_option 2015 else None 2016 ) 2017 2018 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 2019 start_time_option=start_time_option, 2020 end_time_option=end_time_option, 2021 partition_field_start=cursor_model.partition_field_start, 2022 partition_field_end=cursor_model.partition_field_end, 2023 config=config, 2024 parameters=model.parameters or {}, 2025 ) 2026 request_options_provider = ( 2027 datetime_request_options_provider 2028 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 2029 else PerPartitionRequestOptionsProvider( 2030 partition_router, datetime_request_options_provider 2031 ) 2032 ) 2033 elif model.incremental_sync and isinstance( 2034 model.incremental_sync, IncrementingCountCursorModel 2035 ): 2036 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 2037 raise ValueError( 2038 "PerPartition does not support per partition states because switching to global state is time based" 2039 ) 2040 2041 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 2042 2043 
start_time_option = ( 2044 self._create_component_from_model( 2045 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2046 config, 2047 parameters=cursor_model.parameters or {}, 2048 ) 2049 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2050 else None 2051 ) 2052 2053 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2054 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2055 partition_field_start = "start" 2056 2057 request_options_provider = DatetimeBasedRequestOptionsProvider( 2058 start_time_option=start_time_option, 2059 partition_field_start=partition_field_start, 2060 config=config, 2061 parameters=model.parameters or {}, 2062 ) 2063 else: 2064 request_options_provider = None 2065 2066 transformations = [] 2067 if model.transformations: 2068 for transformation_model in model.transformations: 2069 transformations.append( 2070 self._create_component_from_model(model=transformation_model, config=config) 2071 ) 2072 file_uploader = None 2073 if model.file_uploader: 2074 file_uploader = self._create_component_from_model( 2075 model=model.file_uploader, config=config 2076 ) 2077 2078 stream_slicer: ConcurrentStreamSlicer = ( 2079 partition_router 2080 if isinstance(concurrent_cursor, FinalStateCursor) 2081 else concurrent_cursor 2082 ) 2083 2084 retriever = self._create_component_from_model( 2085 model=model.retriever, 2086 config=config, 2087 name=model.name, 2088 primary_key=primary_key, 2089 request_options_provider=request_options_provider, 2090 stream_slicer=stream_slicer, 2091 partition_router=partition_router, 2092 has_stop_condition_cursor=self._is_stop_condition_on_cursor(model), 2093 is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model), 2094 cursor=concurrent_cursor, 2095 transformations=transformations, 2096 file_uploader=file_uploader, 2097 incremental_sync=model.incremental_sync, 2098 ) 2099 if isinstance(retriever, AsyncRetriever): 2100 stream_slicer = retriever.stream_slicer 2101 2102 schema_loader: SchemaLoader 2103 if model.schema_loader and isinstance(model.schema_loader, list): 2104 nested_schema_loaders = [ 2105 self._create_component_from_model(model=nested_schema_loader, config=config) 2106 for nested_schema_loader in model.schema_loader 2107 ] 2108 schema_loader = CompositeSchemaLoader( 2109 schema_loaders=nested_schema_loaders, parameters={} 2110 ) 2111 elif model.schema_loader: 2112 schema_loader = self._create_component_from_model( 2113 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2114 config=config, 2115 ) 2116 else: 2117 options = model.parameters or {} 2118 if "name" not in options: 2119 options["name"] = model.name 2120 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2121 schema_loader = CachingSchemaLoaderDecorator(schema_loader) 2122 2123 stream_name = model.name or "" 2124 return DefaultStream( 2125 partition_generator=StreamSlicerPartitionGenerator( 2126 DeclarativePartitionFactory( 2127 stream_name, 2128 schema_loader, 2129 retriever, 2130 self._message_repository, 2131 ), 2132 stream_slicer, 2133 slice_limit=self._limit_slices_fetched, 2134 ), 2135 name=stream_name, 2136 json_schema=schema_loader.get_json_schema, 2137 primary_key=get_primary_key_from_stream(primary_key), 2138 
cursor_field=concurrent_cursor.cursor_field.cursor_field_key 2139 if hasattr(concurrent_cursor, "cursor_field") 2140 else "", # FIXME we should have the cursor field as part of the interface of the cursor 2141 logger=logging.getLogger(f"airbyte.{stream_name}"), 2142 cursor=concurrent_cursor, 2143 supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), 2144 ) 2145 2146 def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: 2147 stream_name = model.name or "" 2148 stream_state = self._connector_state_manager.get_stream_state( 2149 stream_name=stream_name, namespace=None 2150 ) 2151 if model.state_migrations: 2152 state_transformations = [ 2153 self._create_component_from_model(state_migration, config, declarative_stream=model) 2154 for state_migration in model.state_migrations 2155 ] 2156 else: 2157 state_transformations = [] 2158 stream_state = self.apply_stream_state_migrations(state_transformations, stream_state) 2159 self._connector_state_manager.update_state_for_stream( 2160 stream_name=stream_name, namespace=None, value=stream_state 2161 ) 2162 2163 def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool: 2164 return bool( 2165 model.incremental_sync 2166 and hasattr(model.incremental_sync, "is_data_feed") 2167 and model.incremental_sync.is_data_feed 2168 ) 2169 2170 def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool: 2171 return bool( 2172 model.incremental_sync 2173 and hasattr(model.incremental_sync, "is_client_side_incremental") 2174 and model.incremental_sync.is_client_side_incremental 2175 ) 2176 2177 def _build_stream_slicer_from_partition_router( 2178 self, 2179 model: Union[ 2180 AsyncRetrieverModel, 2181 CustomRetrieverModel, 2182 SimpleRetrieverModel, 2183 ], 2184 config: Config, 2185 stream_name: Optional[str] = None, 2186 **kwargs: Any, 2187 ) -> PartitionRouter: 2188 if ( 2189 hasattr(model, "partition_router") 2190 and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel)) 2191 and model.partition_router 2192 ): 2193 stream_slicer_model = model.partition_router 2194 if isinstance(stream_slicer_model, list): 2195 return CartesianProductStreamSlicer( 2196 [ 2197 self._create_component_from_model( 2198 model=slicer, config=config, stream_name=stream_name or "" 2199 ) 2200 for slicer in stream_slicer_model 2201 ], 2202 parameters={}, 2203 ) 2204 elif isinstance(stream_slicer_model, dict): 2205 # partition router comes from CustomRetrieverModel and therefore has not been parsed as a model 2206 params = stream_slicer_model.get("$parameters") 2207 if not isinstance(params, dict): 2208 params = {} 2209 stream_slicer_model["$parameters"] = params 2210 2211 if stream_name is not None: 2212 params["stream_name"] = stream_name 2213 2214 return self._create_nested_component( # type: ignore[no-any-return] # There is no guarantee that this will return a stream slicer.
If not, we expect an AttributeError during the call to `stream_slices` 2215 model, 2216 "partition_router", 2217 stream_slicer_model, 2218 config, 2219 **kwargs, 2220 ) 2221 else: 2222 return self._create_component_from_model( # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router 2223 model=stream_slicer_model, config=config, stream_name=stream_name or "" 2224 ) 2225 return SinglePartitionRouter(parameters={}) 2226 2227 def _build_concurrent_cursor( 2228 self, 2229 model: DeclarativeStreamModel, 2230 stream_slicer: Optional[PartitionRouter], 2231 config: Config, 2232 ) -> Cursor: 2233 stream_name = model.name or "" 2234 stream_state = self._connector_state_manager.get_stream_state(stream_name, None) 2235 2236 if ( 2237 model.incremental_sync 2238 and stream_slicer 2239 and not isinstance(stream_slicer, SinglePartitionRouter) 2240 ): 2241 if isinstance(model.incremental_sync, IncrementingCountCursorModel): 2242 # We don't currently support usage of partition routing and IncrementingCountCursor at the 2243 # same time because we didn't solve for design questions like what the lookback window would 2244 # be as well as global cursor fall backs. We have not seen customers that have needed both 2245 # at the same time yet and are currently punting on this until we need to solve it. 2246 raise ValueError( 2247 f"The low-code framework does not currently support usage of a PartitionRouter and an IncrementingCountCursor at the same time. Please specify only one of these options for stream {stream_name}." 2248 ) 2249 return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2250 state_manager=self._connector_state_manager, 2251 model_type=DatetimeBasedCursorModel, 2252 component_definition=model.incremental_sync.__dict__, 2253 stream_name=stream_name, 2254 stream_state=stream_state, 2255 stream_namespace=None, 2256 config=config or {}, 2257 partition_router=stream_slicer, 2258 attempt_to_create_cursor_if_not_provided=True, # FIXME can we remove that now? 2259 ) 2260 elif model.incremental_sync: 2261 if type(model.incremental_sync) == IncrementingCountCursorModel: 2262 return self.create_concurrent_cursor_from_incrementing_count_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2263 model_type=IncrementingCountCursorModel, 2264 component_definition=model.incremental_sync.__dict__, 2265 stream_name=stream_name, 2266 stream_namespace=None, 2267 stream_state=stream_state, 2268 config=config or {}, 2269 ) 2270 elif type(model.incremental_sync) == DatetimeBasedCursorModel: 2271 return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. 
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2272 model_type=type(model.incremental_sync), 2273 component_definition=model.incremental_sync.__dict__, 2274 stream_name=stream_name, 2275 stream_namespace=None, 2276 stream_state=stream_state, 2277 config=config or {}, 2278 attempt_to_create_cursor_if_not_provided=True, 2279 ) 2280 else: 2281 raise ValueError( 2282 f"Incremental sync of type {type(model.incremental_sync)} is not supported" 2283 ) 2284 return FinalStateCursor(stream_name, None, self._message_repository) 2285 2286 def create_default_error_handler( 2287 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2288 ) -> DefaultErrorHandler: 2289 backoff_strategies = [] 2290 if model.backoff_strategies: 2291 for backoff_strategy_model in model.backoff_strategies: 2292 backoff_strategies.append( 2293 self._create_component_from_model(model=backoff_strategy_model, config=config) 2294 ) 2295 2296 response_filters = [] 2297 if model.response_filters: 2298 for response_filter_model in model.response_filters: 2299 response_filters.append( 2300 self._create_component_from_model(model=response_filter_model, config=config) 2301 ) 2302 response_filters.append( 2303 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2304 ) 2305 2306 return DefaultErrorHandler( 2307 backoff_strategies=backoff_strategies, 2308 max_retries=model.max_retries, 2309 response_filters=response_filters, 2310 config=config, 2311 parameters=model.parameters or {}, 2312 ) 2313 2314 def create_default_paginator( 2315 self, 2316 model: DefaultPaginatorModel, 2317 config: Config, 2318 *, 2319 url_base: str, 2320 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2321 decoder: Optional[Decoder] = None, 2322 cursor_used_for_stop_condition: Optional[Cursor] = None, 2323 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2324 if decoder: 2325 if self._is_supported_decoder_for_pagination(decoder): 2326 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2327 else: 2328 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2329 else: 2330 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2331 page_size_option = ( 2332 self._create_component_from_model(model=model.page_size_option, config=config) 2333 if model.page_size_option 2334 else None 2335 ) 2336 page_token_option = ( 2337 self._create_component_from_model(model=model.page_token_option, config=config) 2338 if model.page_token_option 2339 else None 2340 ) 2341 pagination_strategy = self._create_component_from_model( 2342 model=model.pagination_strategy, 2343 config=config, 2344 decoder=decoder_to_use, 2345 extractor_model=extractor_model, 2346 ) 2347 if cursor_used_for_stop_condition: 2348 pagination_strategy = StopConditionPaginationStrategyDecorator( 2349 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2350 ) 2351 paginator = DefaultPaginator( 2352 decoder=decoder_to_use, 2353 page_size_option=page_size_option, 2354 page_token_option=page_token_option, 2355 pagination_strategy=pagination_strategy, 2356 url_base=url_base, 2357 config=config, 2358 parameters=model.parameters or {}, 2359 ) 2360 if self._limit_pages_fetched_per_slice: 2361 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2362 return paginator 2363 2364 def create_dpath_extractor( 2365 self, 2366 model: 
DpathExtractorModel, 2367 config: Config, 2368 decoder: Optional[Decoder] = None, 2369 **kwargs: Any, 2370 ) -> DpathExtractor: 2371 if decoder: 2372 decoder_to_use = decoder 2373 else: 2374 decoder_to_use = JsonDecoder(parameters={}) 2375 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2376 return DpathExtractor( 2377 decoder=decoder_to_use, 2378 field_path=model_field_path, 2379 config=config, 2380 parameters=model.parameters or {}, 2381 ) 2382 2383 @staticmethod 2384 def create_response_to_file_extractor( 2385 model: ResponseToFileExtractorModel, 2386 **kwargs: Any, 2387 ) -> ResponseToFileExtractor: 2388 return ResponseToFileExtractor(parameters=model.parameters or {}) 2389 2390 @staticmethod 2391 def create_exponential_backoff_strategy( 2392 model: ExponentialBackoffStrategyModel, config: Config 2393 ) -> ExponentialBackoffStrategy: 2394 return ExponentialBackoffStrategy( 2395 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2396 ) 2397 2398 @staticmethod 2399 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2400 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2401 2402 def create_http_requester( 2403 self, 2404 model: HttpRequesterModel, 2405 config: Config, 2406 decoder: Decoder = JsonDecoder(parameters={}), 2407 query_properties_key: Optional[str] = None, 2408 use_cache: Optional[bool] = None, 2409 *, 2410 name: str, 2411 ) -> HttpRequester: 2412 authenticator = ( 2413 self._create_component_from_model( 2414 model=model.authenticator, 2415 config=config, 2416 url_base=model.url or model.url_base, 2417 name=name, 2418 decoder=decoder, 2419 ) 2420 if model.authenticator 2421 else None 2422 ) 2423 error_handler = ( 2424 self._create_component_from_model(model=model.error_handler, config=config) 2425 if model.error_handler 2426 else DefaultErrorHandler( 2427 backoff_strategies=[], 2428 response_filters=[], 2429 config=config, 2430 parameters=model.parameters or {}, 2431 ) 2432 ) 2433 2434 api_budget = self._api_budget 2435 2436 request_options_provider = InterpolatedRequestOptionsProvider( 2437 request_body=model.request_body, 2438 request_body_data=model.request_body_data, 2439 request_body_json=model.request_body_json, 2440 request_headers=model.request_headers, 2441 request_parameters=model.request_parameters, # type: ignore # QueryProperties have been removed in `create_simple_retriever` 2442 query_properties_key=query_properties_key, 2443 config=config, 2444 parameters=model.parameters or {}, 2445 ) 2446 2447 assert model.use_cache is not None # for mypy 2448 assert model.http_method is not None # for mypy 2449 2450 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2451 2452 return HttpRequester( 2453 name=name, 2454 url=model.url, 2455 url_base=model.url_base, 2456 path=model.path, 2457 authenticator=authenticator, 2458 error_handler=error_handler, 2459 api_budget=api_budget, 2460 http_method=HttpMethod[model.http_method.value], 2461 request_options_provider=request_options_provider, 2462 config=config, 2463 disable_retries=self._disable_retries, 2464 parameters=model.parameters or {}, 2465 message_repository=self._message_repository, 2466 use_cache=should_use_cache, 2467 decoder=decoder, 2468 stream_response=decoder.is_stream_response() if decoder else False, 2469 ) 2470 2471 @staticmethod 2472 def create_http_response_filter( 2473 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2474 ) -> 
HttpResponseFilter: 2475 if model.action: 2476 action = ResponseAction(model.action.value) 2477 else: 2478 action = None 2479 2480 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2481 2482 http_codes = ( 2483 set(model.http_codes) if model.http_codes else set() 2484 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2485 2486 return HttpResponseFilter( 2487 action=action, 2488 failure_type=failure_type, 2489 error_message=model.error_message or "", 2490 error_message_contains=model.error_message_contains or "", 2491 http_codes=http_codes, 2492 predicate=model.predicate or "", 2493 config=config, 2494 parameters=model.parameters or {}, 2495 ) 2496 2497 @staticmethod 2498 def create_inline_schema_loader( 2499 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2500 ) -> InlineSchemaLoader: 2501 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2502 2503 def create_complex_field_type( 2504 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2505 ) -> ComplexFieldType: 2506 items = ( 2507 self._create_component_from_model(model=model.items, config=config) 2508 if isinstance(model.items, ComplexFieldTypeModel) 2509 else model.items 2510 ) 2511 2512 return ComplexFieldType(field_type=model.field_type, items=items) 2513 2514 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2515 target_type = ( 2516 self._create_component_from_model(model=model.target_type, config=config) 2517 if isinstance(model.target_type, ComplexFieldTypeModel) 2518 else model.target_type 2519 ) 2520 2521 return TypesMap( 2522 target_type=target_type, 2523 current_type=model.current_type, 2524 condition=model.condition if model.condition is not None else "True", 2525 ) 2526 2527 def create_schema_type_identifier( 2528 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2529 ) -> SchemaTypeIdentifier: 2530 types_mapping = [] 2531 if model.types_mapping: 2532 types_mapping.extend( 2533 [ 2534 self._create_component_from_model(types_map, config=config) 2535 for types_map in model.types_mapping 2536 ] 2537 ) 2538 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2539 [x for x in model.schema_pointer] if model.schema_pointer else [] 2540 ) 2541 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2542 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2543 [x for x in model.type_pointer] if model.type_pointer else None 2544 ) 2545 2546 return SchemaTypeIdentifier( 2547 schema_pointer=model_schema_pointer, 2548 key_pointer=model_key_pointer, 2549 type_pointer=model_type_pointer, 2550 types_mapping=types_mapping, 2551 parameters=model.parameters or {}, 2552 ) 2553 2554 def create_dynamic_schema_loader( 2555 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2556 ) -> DynamicSchemaLoader: 2557 schema_transformations = [] 2558 if model.schema_transformations: 2559 for transformation_model in model.schema_transformations: 2560 schema_transformations.append( 2561 self._create_component_from_model(model=transformation_model, config=config) 2562 ) 2563 name = "dynamic_properties" 2564 retriever = self._create_component_from_model( 2565 model=model.retriever, 2566 config=config, 2567 name=name, 2568 primary_key=None, 2569 partition_router=self._build_stream_slicer_from_partition_router( 2570 model.retriever, config 2571 ), 2572 transformations=[], 2573 use_cache=True, 2574 
log_formatter=( 2575 lambda response: format_http_message( 2576 response, 2577 f"Schema loader '{name}' request", 2578 f"Request performed in order to extract schema.", 2579 name, 2580 is_auxiliary=True, 2581 ) 2582 ), 2583 ) 2584 schema_type_identifier = self._create_component_from_model( 2585 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2586 ) 2587 schema_filter = ( 2588 self._create_component_from_model( 2589 model.schema_filter, config=config, parameters=model.parameters or {} 2590 ) 2591 if model.schema_filter is not None 2592 else None 2593 ) 2594 2595 return DynamicSchemaLoader( 2596 retriever=retriever, 2597 config=config, 2598 schema_transformations=schema_transformations, 2599 schema_filter=schema_filter, 2600 schema_type_identifier=schema_type_identifier, 2601 parameters=model.parameters or {}, 2602 ) 2603 2604 @staticmethod 2605 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2606 return JsonDecoder(parameters={}) 2607 2608 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2609 return CompositeRawDecoder( 2610 parser=ModelToComponentFactory._get_parser(model, config), 2611 stream_response=False if self._emit_connector_builder_messages else True, 2612 ) 2613 2614 def create_jsonl_decoder( 2615 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2616 ) -> Decoder: 2617 return CompositeRawDecoder( 2618 parser=ModelToComponentFactory._get_parser(model, config), 2619 stream_response=False if self._emit_connector_builder_messages else True, 2620 ) 2621 2622 def create_gzip_decoder( 2623 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2624 ) -> Decoder: 2625 _compressed_response_types = { 2626 "gzip", 2627 "x-gzip", 2628 "gzip, deflate", 2629 "x-gzip, deflate", 2630 "application/zip", 2631 "application/gzip", 2632 "application/x-gzip", 2633 "application/x-zip-compressed", 2634 } 2635 2636 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2637 2638 if self._emit_connector_builder_messages: 2639 # This is very surprising but if the response is not streamed, 2640 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2641 # which uses urllib3 directly and does not uncompress the data. 2642 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2643 2644 return CompositeRawDecoder.by_headers( 2645 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2646 stream_response=True, 2647 fallback_parser=gzip_parser.inner_parser, 2648 ) 2649 2650 # todo: This method should be removed once we deprecate the SimpleRetriever.cursor field and the various 2651 # state methods 2652 @staticmethod 2653 def create_incrementing_count_cursor( 2654 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2655 ) -> DatetimeBasedCursor: 2656 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2657 # we still parse models into components. The issue is that there's no runtime implementation of a 2658 # IncrementingCountCursor. 2659 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 
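# For reference, a declared IncrementingCountCursor typically looks like the following hypothetical
# manifest snippet (field names taken from IncrementingCountCursorModel as used above):
#   incremental_sync:
#     type: IncrementingCountCursor
#     cursor_field: "id"
#     start_value: 0
# The concurrent code path builds the real cursor through
# create_concurrent_cursor_from_incrementing_count_cursor; the DatetimeBasedCursor returned below is
# only a placeholder so that model-to-component parsing and checks can complete.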
2660 return DatetimeBasedCursor( 2661 cursor_field=model.cursor_field, 2662 datetime_format="%Y-%m-%d", 2663 start_datetime="2024-12-12", 2664 config=config, 2665 parameters={}, 2666 ) 2667 2668 @staticmethod 2669 def create_iterable_decoder( 2670 model: IterableDecoderModel, config: Config, **kwargs: Any 2671 ) -> IterableDecoder: 2672 return IterableDecoder(parameters={}) 2673 2674 @staticmethod 2675 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2676 return XmlDecoder(parameters={}) 2677 2678 def create_zipfile_decoder( 2679 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2680 ) -> ZipfileDecoder: 2681 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2682 2683 @staticmethod 2684 def _get_parser(model: BaseModel, config: Config) -> Parser: 2685 if isinstance(model, JsonDecoderModel): 2686 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2687 return JsonParser() 2688 elif isinstance(model, JsonlDecoderModel): 2689 return JsonLineParser() 2690 elif isinstance(model, CsvDecoderModel): 2691 return CsvParser( 2692 encoding=model.encoding, 2693 delimiter=model.delimiter, 2694 set_values_to_none=model.set_values_to_none, 2695 ) 2696 elif isinstance(model, GzipDecoderModel): 2697 return GzipParser( 2698 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2699 ) 2700 elif isinstance( 2701 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2702 ): 2703 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2704 2705 raise ValueError(f"Unknown decoder type {model}") 2706 2707 @staticmethod 2708 def create_json_file_schema_loader( 2709 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2710 ) -> JsonFileSchemaLoader: 2711 return JsonFileSchemaLoader( 2712 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2713 ) 2714 2715 def create_jwt_authenticator( 2716 self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2717 ) -> JwtAuthenticator: 2718 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2719 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2720 request_option = ( 2721 self._create_component_from_model(model.request_option, config) 2722 if model.request_option 2723 else None 2724 ) 2725 return JwtAuthenticator( 2726 config=config, 2727 parameters=model.parameters or {}, 2728 algorithm=JwtAlgorithm(model.algorithm.value), 2729 secret_key=model.secret_key, 2730 base64_encode_secret_key=model.base64_encode_secret_key, 2731 token_duration=model.token_duration, 2732 header_prefix=model.header_prefix, 2733 kid=jwt_headers.kid, 2734 typ=jwt_headers.typ, 2735 cty=jwt_headers.cty, 2736 iss=jwt_payload.iss, 2737 sub=jwt_payload.sub, 2738 aud=jwt_payload.aud, 2739 additional_jwt_headers=model.additional_jwt_headers, 2740 additional_jwt_payload=model.additional_jwt_payload, 2741 passphrase=model.passphrase, 2742 request_option=request_option, 2743 ) 2744 2745 def create_list_partition_router( 2746 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2747 ) -> ListPartitionRouter: 2748 request_option = ( 2749 self._create_component_from_model(model.request_option, config) 2750 if model.request_option 2751 else None 2752 ) 2753 return ListPartitionRouter( 2754 cursor_field=model.cursor_field, 2755 
request_option=request_option, 2756 values=model.values, 2757 config=config, 2758 parameters=model.parameters or {}, 2759 ) 2760 2761 @staticmethod 2762 def create_min_max_datetime( 2763 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2764 ) -> MinMaxDatetime: 2765 return MinMaxDatetime( 2766 datetime=model.datetime, 2767 datetime_format=model.datetime_format or "", 2768 max_datetime=model.max_datetime or "", 2769 min_datetime=model.min_datetime or "", 2770 parameters=model.parameters or {}, 2771 ) 2772 2773 @staticmethod 2774 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2775 return NoAuth(parameters=model.parameters or {}) 2776 2777 @staticmethod 2778 def create_no_pagination( 2779 model: NoPaginationModel, config: Config, **kwargs: Any 2780 ) -> NoPagination: 2781 return NoPagination(parameters={}) 2782 2783 def create_oauth_authenticator( 2784 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2785 ) -> DeclarativeOauth2Authenticator: 2786 profile_assertion = ( 2787 self._create_component_from_model(model.profile_assertion, config=config) 2788 if model.profile_assertion 2789 else None 2790 ) 2791 2792 if model.refresh_token_updater: 2793 # ignore type error because fixing it would have a lot of dependencies, revisit later 2794 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2795 config, 2796 InterpolatedString.create( 2797 model.token_refresh_endpoint, # type: ignore 2798 parameters=model.parameters or {}, 2799 ).eval(config), 2800 access_token_name=InterpolatedString.create( 2801 model.access_token_name or "access_token", parameters=model.parameters or {} 2802 ).eval(config), 2803 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2804 expires_in_name=InterpolatedString.create( 2805 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2806 ).eval(config), 2807 client_id_name=InterpolatedString.create( 2808 model.client_id_name or "client_id", parameters=model.parameters or {} 2809 ).eval(config), 2810 client_id=InterpolatedString.create( 2811 model.client_id, parameters=model.parameters or {} 2812 ).eval(config) 2813 if model.client_id 2814 else model.client_id, 2815 client_secret_name=InterpolatedString.create( 2816 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2817 ).eval(config), 2818 client_secret=InterpolatedString.create( 2819 model.client_secret, parameters=model.parameters or {} 2820 ).eval(config) 2821 if model.client_secret 2822 else model.client_secret, 2823 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2824 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2825 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2826 grant_type_name=InterpolatedString.create( 2827 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2828 ).eval(config), 2829 grant_type=InterpolatedString.create( 2830 model.grant_type or "refresh_token", parameters=model.parameters or {} 2831 ).eval(config), 2832 refresh_request_body=InterpolatedMapping( 2833 model.refresh_request_body or {}, parameters=model.parameters or {} 2834 ).eval(config), 2835 refresh_request_headers=InterpolatedMapping( 2836 model.refresh_request_headers or {}, parameters=model.parameters or {} 2837 ).eval(config), 2838 scopes=model.scopes, 2839 token_expiry_date_format=model.token_expiry_date_format, 2840 
token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2841 message_repository=self._message_repository, 2842 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2843 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2844 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2845 ) 2846 # ignore type error because fixing it would have a lot of dependencies, revisit later 2847 return DeclarativeOauth2Authenticator( # type: ignore 2848 access_token_name=model.access_token_name or "access_token", 2849 access_token_value=model.access_token_value, 2850 client_id_name=model.client_id_name or "client_id", 2851 client_id=model.client_id, 2852 client_secret_name=model.client_secret_name or "client_secret", 2853 client_secret=model.client_secret, 2854 expires_in_name=model.expires_in_name or "expires_in", 2855 grant_type_name=model.grant_type_name or "grant_type", 2856 grant_type=model.grant_type or "refresh_token", 2857 refresh_request_body=model.refresh_request_body, 2858 refresh_request_headers=model.refresh_request_headers, 2859 refresh_token_name=model.refresh_token_name or "refresh_token", 2860 refresh_token=model.refresh_token, 2861 scopes=model.scopes, 2862 token_expiry_date=model.token_expiry_date, 2863 token_expiry_date_format=model.token_expiry_date_format, 2864 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2865 token_refresh_endpoint=model.token_refresh_endpoint, 2866 config=config, 2867 parameters=model.parameters or {}, 2868 message_repository=self._message_repository, 2869 profile_assertion=profile_assertion, 2870 use_profile_assertion=model.use_profile_assertion, 2871 ) 2872 2873 def create_offset_increment( 2874 self, 2875 model: OffsetIncrementModel, 2876 config: Config, 2877 decoder: Decoder, 2878 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2879 **kwargs: Any, 2880 ) -> OffsetIncrement: 2881 if isinstance(decoder, PaginationDecoderDecorator): 2882 inner_decoder = decoder.decoder 2883 else: 2884 inner_decoder = decoder 2885 decoder = PaginationDecoderDecorator(decoder=decoder) 2886 2887 if self._is_supported_decoder_for_pagination(inner_decoder): 2888 decoder_to_use = decoder 2889 else: 2890 raise ValueError( 2891 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2892 ) 2893 2894 # Ideally we would instantiate the runtime extractor from the highest level (in this case the SimpleRetriever) 2895 # so that it can be shared by OffsetIncrement and RecordSelector. However, due to how we instantiate the 2896 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2897 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2898 # When we have more time to investigate we can look into reusing the same component.
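# As a rough illustration of the note above (argument names are approximate, not exact
# signatures): an extractor model such as a DpathExtractor over the record field path, e.g.
#     DpathExtractor(field_path=["results"], config=config, decoder=decoder_to_use, parameters={})
# is instantiated twice, once here for the pagination strategy and once in create_record_selector,
# so that OffsetIncrement can count the records of each page independently of the RecordSelector.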
2899 extractor = ( 2900 self._create_component_from_model( 2901 model=extractor_model, config=config, decoder=decoder_to_use 2902 ) 2903 if extractor_model 2904 else None 2905 ) 2906 2907 return OffsetIncrement( 2908 page_size=model.page_size, 2909 config=config, 2910 decoder=decoder_to_use, 2911 extractor=extractor, 2912 inject_on_first_request=model.inject_on_first_request or False, 2913 parameters=model.parameters or {}, 2914 ) 2915 2916 @staticmethod 2917 def create_page_increment( 2918 model: PageIncrementModel, config: Config, **kwargs: Any 2919 ) -> PageIncrement: 2920 return PageIncrement( 2921 page_size=model.page_size, 2922 config=config, 2923 start_from_page=model.start_from_page or 0, 2924 inject_on_first_request=model.inject_on_first_request or False, 2925 parameters=model.parameters or {}, 2926 ) 2927 2928 def create_parent_stream_config( 2929 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 2930 ) -> ParentStreamConfig: 2931 declarative_stream = self._create_component_from_model( 2932 model.stream, 2933 config=config, 2934 is_parent=True, 2935 **kwargs, 2936 ) 2937 request_option = ( 2938 self._create_component_from_model(model.request_option, config=config) 2939 if model.request_option 2940 else None 2941 ) 2942 2943 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2944 raise ValueError( 2945 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2946 ) 2947 2948 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2949 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2950 ) 2951 2952 return ParentStreamConfig( 2953 parent_key=model.parent_key, 2954 request_option=request_option, 2955 stream=declarative_stream, 2956 partition_field=model.partition_field, 2957 config=config, 2958 incremental_dependency=model.incremental_dependency or False, 2959 parameters=model.parameters or {}, 2960 extra_fields=model.extra_fields, 2961 lazy_read_pointer=model_lazy_read_pointer, 2962 ) 2963 2964 def create_properties_from_endpoint( 2965 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2966 ) -> PropertiesFromEndpoint: 2967 retriever = self._create_component_from_model( 2968 model=model.retriever, 2969 config=config, 2970 name="dynamic_properties", 2971 primary_key=None, 2972 stream_slicer=None, 2973 transformations=[], 2974 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to differ 2975 ) 2976 return PropertiesFromEndpoint( 2977 property_field_path=model.property_field_path, 2978 retriever=retriever, 2979 config=config, 2980 parameters=model.parameters or {}, 2981 ) 2982 2983 def create_property_chunking( 2984 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2985 ) -> PropertyChunking: 2986 record_merge_strategy = ( 2987 self._create_component_from_model( 2988 model=model.record_merge_strategy, config=config, **kwargs 2989 ) 2990 if model.record_merge_strategy 2991 else None 2992 ) 2993 2994 property_limit_type: PropertyLimitType 2995 match model.property_limit_type: 2996 case PropertyLimitTypeModel.property_count: 2997 property_limit_type = PropertyLimitType.property_count 2998 case PropertyLimitTypeModel.characters: 2999 property_limit_type = PropertyLimitType.characters 3000 case _: 3001 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}")
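# Hedged example of the mapping above: a manifest with property_limit_type: property_count and
# property_limit: 50 is expected to make QueryProperties split, say, 120 requested properties into
# chunks of at most 50 per request, while property_limit_type: characters should instead bound the
# combined length of the property names placed into a single request (behavior inferred from the
# model names rather than from the PropertyChunking implementation itself).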
3002 3003 return PropertyChunking( 3004 property_limit_type=property_limit_type, 3005 property_limit=model.property_limit, 3006 record_merge_strategy=record_merge_strategy, 3007 config=config, 3008 parameters=model.parameters or {}, 3009 ) 3010 3011 def create_query_properties( 3012 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 3013 ) -> QueryProperties: 3014 if isinstance(model.property_list, list): 3015 property_list = model.property_list 3016 else: 3017 property_list = self._create_component_from_model( 3018 model=model.property_list, config=config, **kwargs 3019 ) 3020 3021 property_chunking = ( 3022 self._create_component_from_model( 3023 model=model.property_chunking, config=config, **kwargs 3024 ) 3025 if model.property_chunking 3026 else None 3027 ) 3028 3029 property_selector = ( 3030 self._create_component_from_model( 3031 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3032 ) 3033 if model.property_selector 3034 else None 3035 ) 3036 3037 return QueryProperties( 3038 property_list=property_list, 3039 always_include_properties=model.always_include_properties, 3040 property_chunking=property_chunking, 3041 property_selector=property_selector, 3042 config=config, 3043 parameters=model.parameters or {}, 3044 ) 3045 3046 def create_json_schema_property_selector( 3047 self, 3048 model: JsonSchemaPropertySelectorModel, 3049 config: Config, 3050 *, 3051 stream_name: str, 3052 **kwargs: Any, 3053 ) -> JsonSchemaPropertySelector: 3054 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3055 3056 transformations = [] 3057 if model.transformations: 3058 for transformation_model in model.transformations: 3059 transformations.append( 3060 self._create_component_from_model(model=transformation_model, config=config) 3061 ) 3062 3063 return JsonSchemaPropertySelector( 3064 configured_stream=configured_stream, 3065 properties_transformations=transformations, 3066 config=config, 3067 parameters=model.parameters or {}, 3068 ) 3069 3070 @staticmethod 3071 def create_record_filter( 3072 model: RecordFilterModel, config: Config, **kwargs: Any 3073 ) -> RecordFilter: 3074 return RecordFilter( 3075 condition=model.condition or "", config=config, parameters=model.parameters or {} 3076 ) 3077 3078 @staticmethod 3079 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 3080 return RequestPath(parameters={}) 3081 3082 @staticmethod 3083 def create_request_option( 3084 model: RequestOptionModel, config: Config, **kwargs: Any 3085 ) -> RequestOption: 3086 inject_into = RequestOptionType(model.inject_into.value) 3087 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3088 [ 3089 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3090 for segment in model.field_path 3091 ] 3092 if model.field_path 3093 else None 3094 ) 3095 field_name = ( 3096 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3097 if model.field_name 3098 else None 3099 ) 3100 return RequestOption( 3101 field_name=field_name, 3102 field_path=field_path, 3103 inject_into=inject_into, 3104 parameters=kwargs.get("parameters", {}), 3105 ) 3106 3107 def create_record_selector( 3108 self, 3109 model: RecordSelectorModel, 3110 config: Config, 3111 *, 3112 name: str, 3113 transformations: List[RecordTransformation] | None = None, 3114 decoder: Decoder | None = None, 3115 client_side_incremental_sync_cursor: Optional[Cursor] = None, 3116 
file_uploader: Optional[DefaultFileUploader] = None, 3117 **kwargs: Any, 3118 ) -> RecordSelector: 3119 extractor = self._create_component_from_model( 3120 model=model.extractor, decoder=decoder, config=config 3121 ) 3122 record_filter = ( 3123 self._create_component_from_model(model.record_filter, config=config) 3124 if model.record_filter 3125 else None 3126 ) 3127 3128 transform_before_filtering = ( 3129 False if model.transform_before_filtering is None else model.transform_before_filtering 3130 ) 3131 if client_side_incremental_sync_cursor: 3132 record_filter = ClientSideIncrementalRecordFilterDecorator( 3133 config=config, 3134 parameters=model.parameters, 3135 condition=model.record_filter.condition 3136 if (model.record_filter and hasattr(model.record_filter, "condition")) 3137 else None, 3138 cursor=client_side_incremental_sync_cursor, 3139 ) 3140 transform_before_filtering = ( 3141 True 3142 if model.transform_before_filtering is None 3143 else model.transform_before_filtering 3144 ) 3145 3146 if model.schema_normalization is None: 3147 # default to no schema normalization if not set 3148 model.schema_normalization = SchemaNormalizationModel.None_ 3149 3150 schema_normalization = ( 3151 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3152 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3153 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3154 ) 3155 3156 return RecordSelector( 3157 extractor=extractor, 3158 name=name, 3159 config=config, 3160 record_filter=record_filter, 3161 transformations=transformations or [], 3162 file_uploader=file_uploader, 3163 schema_normalization=schema_normalization, 3164 parameters=model.parameters or {}, 3165 transform_before_filtering=transform_before_filtering, 3166 ) 3167 3168 @staticmethod 3169 def create_remove_fields( 3170 model: RemoveFieldsModel, config: Config, **kwargs: Any 3171 ) -> RemoveFields: 3172 return RemoveFields( 3173 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3174 ) 3175 3176 def create_selective_authenticator( 3177 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3178 ) -> DeclarativeAuthenticator: 3179 authenticators = { 3180 name: self._create_component_from_model(model=auth, config=config) 3181 for name, auth in model.authenticators.items() 3182 } 3183 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 3184 return SelectiveAuthenticator( # type: ignore[abstract] 3185 config=config, 3186 authenticators=authenticators, 3187 authenticator_selection_path=model.authenticator_selection_path, 3188 **kwargs, 3189 ) 3190 3191 @staticmethod 3192 def create_legacy_session_token_authenticator( 3193 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3194 ) -> LegacySessionTokenAuthenticator: 3195 return LegacySessionTokenAuthenticator( 3196 api_url=url_base, 3197 header=model.header, 3198 login_url=model.login_url, 3199 password=model.password or "", 3200 session_token=model.session_token or "", 3201 session_token_response_key=model.session_token_response_key or "", 3202 username=model.username or "", 3203 validate_session_url=model.validate_session_url, 3204 config=config, 3205 parameters=model.parameters or {}, 3206 ) 3207 3208 def create_simple_retriever( 3209 self, 3210 model: SimpleRetrieverModel, 3211 config: Config, 3212 *, 3213 
name: str, 3214 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3215 request_options_provider: Optional[RequestOptionsProvider] = None, 3216 cursor: Optional[Cursor] = None, 3217 has_stop_condition_cursor: bool = False, 3218 is_client_side_incremental_sync: bool = False, 3219 transformations: List[RecordTransformation], 3220 file_uploader: Optional[DefaultFileUploader] = None, 3221 incremental_sync: Optional[ 3222 Union[IncrementingCountCursorModel, DatetimeBasedCursorModel] 3223 ] = None, 3224 use_cache: Optional[bool] = None, 3225 log_formatter: Optional[Callable[[Response], Any]] = None, 3226 partition_router: Optional[PartitionRouter] = None, 3227 **kwargs: Any, 3228 ) -> SimpleRetriever: 3229 def _get_url(req: Requester) -> str: 3230 """ 3231 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3232 This is needed because the URL is not set until the requester is created. 3233 """ 3234 3235 _url: str = ( 3236 model.requester.url 3237 if hasattr(model.requester, "url") and model.requester.url is not None 3238 else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) 3239 ) 3240 _url_base: str = ( 3241 model.requester.url_base 3242 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3243 else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None) 3244 ) 3245 3246 return _url or _url_base 3247 3248 if cursor is None: 3249 cursor = FinalStateCursor(name, None, self._message_repository) 3250 3251 decoder = ( 3252 self._create_component_from_model(model=model.decoder, config=config) 3253 if model.decoder 3254 else JsonDecoder(parameters={}) 3255 ) 3256 record_selector = self._create_component_from_model( 3257 model=model.record_selector, 3258 name=name, 3259 config=config, 3260 decoder=decoder, 3261 transformations=transformations, 3262 client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None, 3263 file_uploader=file_uploader, 3264 ) 3265 3266 query_properties: Optional[QueryProperties] = None 3267 query_properties_key: Optional[str] = None 3268 self._ensure_query_properties_to_model(model.requester) 3269 if self._has_query_properties_in_request_parameters(model.requester): 3270 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3271 # places instead of default to request_parameters which isn't clearly documented 3272 if ( 3273 hasattr(model.requester, "fetch_properties_from_endpoint") 3274 and model.requester.fetch_properties_from_endpoint 3275 ): 3276 raise ValueError( 3277 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3278 ) 3279 3280 query_properties_definitions = [] 3281 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters() 3282 if isinstance(request_parameter, QueryPropertiesModel): 3283 query_properties_key = key 3284 query_properties_definitions.append(request_parameter) 3285 3286 if len(query_properties_definitions) > 1: 3287 raise ValueError( 3288 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3289 ) 3290 3291 if len(query_properties_definitions) == 1: 3292 query_properties = self._create_component_from_model( 3293 
model=query_properties_definitions[0], stream_name=name, config=config 3294 ) 3295 3296 # Removes QueryProperties components from the interpolated mappings because they are designed 3297 # to be used by the SimpleRetriever and are resolved by the provider directly from the slice 3298 # instead of through Jinja interpolation 3299 if hasattr(model.requester, "request_parameters") and isinstance( 3300 model.requester.request_parameters, Mapping 3301 ): 3302 model.requester.request_parameters = self._remove_query_properties( 3303 model.requester.request_parameters 3304 ) 3305 elif ( 3306 hasattr(model.requester, "fetch_properties_from_endpoint") 3307 and model.requester.fetch_properties_from_endpoint 3308 ): 3309 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3310 query_properties_definition = QueryPropertiesModel( 3311 type="QueryProperties", 3312 property_list=model.requester.fetch_properties_from_endpoint, 3313 always_include_properties=None, 3314 property_chunking=None, 3315 ) # type: ignore # $parameters has a default value 3316 3317 query_properties = self.create_query_properties( 3318 model=query_properties_definition, 3319 stream_name=name, 3320 config=config, 3321 ) 3322 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3323 query_properties = self.create_query_properties( 3324 model=model.requester.query_properties, 3325 stream_name=name, 3326 config=config, 3327 ) 3328 3329 requester = self._create_component_from_model( 3330 model=model.requester, 3331 decoder=decoder, 3332 name=name, 3333 query_properties_key=query_properties_key, 3334 use_cache=use_cache, 3335 config=config, 3336 ) 3337 3338 if not request_options_provider: 3339 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3340 if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance( 3341 partition_router, PartitionRouter 3342 ): 3343 request_options_provider = partition_router 3344 3345 paginator = ( 3346 self._create_component_from_model( 3347 model=model.paginator, 3348 config=config, 3349 url_base=_get_url(requester), 3350 extractor_model=model.record_selector.extractor, 3351 decoder=decoder, 3352 cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None, 3353 ) 3354 if model.paginator 3355 else NoPagination(parameters={}) 3356 ) 3357 3358 ignore_stream_slicer_parameters_on_paginated_requests = ( 3359 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3360 ) 3361 3362 if ( 3363 model.partition_router 3364 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3365 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3366 and any( 3367 parent_stream_config.lazy_read_pointer 3368 for parent_stream_config in model.partition_router.parent_stream_configs 3369 ) 3370 ): 3371 if incremental_sync: 3372 if incremental_sync.type != "DatetimeBasedCursor": 3373 raise ValueError( 3374 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3375 ) 3376 3377 elif incremental_sync.step or incremental_sync.cursor_granularity: 3378 raise ValueError( 3379 f"Found more than one slice per parent. LazySimpleRetriever only supports a single slice read for stream - {name}." 3380 ) 3381 3382 if model.decoder and model.decoder.type != "JsonDecoder": 3383 raise ValueError( 3384 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3385 ) 3386 3387 return LazySimpleRetriever( 3388 name=name, 3389 paginator=paginator, 3390 primary_key=primary_key, 3391 requester=requester, 3392 record_selector=record_selector, 3393 stream_slicer=_NO_STREAM_SLICING, 3394 request_option_provider=request_options_provider, 3395 cursor=None, 3396 config=config, 3397 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3398 parameters=model.parameters or {}, 3399 ) 3400 3401 if ( 3402 model.record_selector.record_filter 3403 and model.pagination_reset 3404 and model.pagination_reset.limits 3405 ): 3406 raise ValueError("PaginationResetLimits are not supported while having record filter.") 3407 3408 return SimpleRetriever( 3409 name=name, 3410 paginator=paginator, 3411 primary_key=primary_key, 3412 requester=requester, 3413 record_selector=record_selector, 3414 stream_slicer=_NO_STREAM_SLICING, 3415 request_option_provider=request_options_provider, 3416 cursor=None, 3417 config=config, 3418 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3419 additional_query_properties=query_properties, 3420 log_formatter=self._get_log_formatter(log_formatter, name), 3421 pagination_tracker_factory=self._create_pagination_tracker_factory( 3422 model.pagination_reset, cursor 3423 ), 3424 parameters=model.parameters or {}, 3425 ) 3426 3427 def _create_pagination_tracker_factory( 3428 self, model: Optional[PaginationResetModel], cursor: Cursor 3429 ) -> Callable[[], PaginationTracker]: 3430 if model is None: 3431 return lambda: PaginationTracker() 3432 3433 # Until we figure out a way to use any cursor for PaginationTracker, we will have to have this cursor selector logic 3434 cursor_factory: Callable[[], Optional[ConcurrentCursor]] = lambda: None 3435 if model.action == PaginationResetActionModel.RESET: 3436 # in that case, we will let cursor_factory to return None even if the stream has a cursor 3437 pass 3438 elif model.action == PaginationResetActionModel.SPLIT_USING_CURSOR: 3439 if isinstance(cursor, ConcurrentCursor): 3440 cursor_factory = lambda: cursor.copy_without_state() # type: ignore # the if condition validates that it is a ConcurrentCursor 3441 elif isinstance(cursor, ConcurrentPerPartitionCursor): 3442 cursor_factory = lambda: cursor._cursor_factory.create( # type: ignore # if this becomes a problem, we would need to extract the cursor_factory instantiation logic and make it accessible here 3443 {}, datetime.timedelta(0) 3444 ) 3445 elif not isinstance(cursor, FinalStateCursor): 3446 LOGGER.warning( 3447 "Unknown cursor for PaginationTracker. 
Pagination resets might not work properly" 3448 ) 3449 else: 3450 raise ValueError(f"Unknown PaginationReset action: {model.action}") 3451 3452 limit = model.limits.number_of_records if model and model.limits else None 3453 return lambda: PaginationTracker(cursor_factory(), limit) 3454 3455 def _get_log_formatter( 3456 self, log_formatter: Callable[[Response], Any] | None, name: str 3457 ) -> Callable[[Response], Any] | None: 3458 if self._should_limit_slices_fetched(): 3459 return ( 3460 ( 3461 lambda response: format_http_message( 3462 response, 3463 f"Stream '{name}' request", 3464 f"Request performed in order to extract records for stream '{name}'", 3465 name, 3466 ) 3467 ) 3468 if not log_formatter 3469 else log_formatter 3470 ) 3471 return None 3472 3473 def _should_limit_slices_fetched(self) -> bool: 3474 """ 3475 Returns True if the number of slices fetched should be limited, False otherwise. 3476 This is used to limit the number of slices fetched during tests. 3477 """ 3478 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3479 3480 @staticmethod 3481 def _has_query_properties_in_request_parameters( 3482 requester: Union[HttpRequesterModel, CustomRequesterModel], 3483 ) -> bool: 3484 if not hasattr(requester, "request_parameters"): 3485 return False 3486 request_parameters = requester.request_parameters 3487 if request_parameters and isinstance(request_parameters, Mapping): 3488 for request_parameter in request_parameters.values(): 3489 if isinstance(request_parameter, QueryPropertiesModel): 3490 return True 3491 return False 3492 3493 @staticmethod 3494 def _remove_query_properties( 3495 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3496 ) -> Mapping[str, str]: 3497 return { 3498 parameter_field: request_parameter 3499 for parameter_field, request_parameter in request_parameters.items() 3500 if not isinstance(request_parameter, QueryPropertiesModel) 3501 } 3502 3503 def create_state_delegating_stream( 3504 self, 3505 model: StateDelegatingStreamModel, 3506 config: Config, 3507 has_parent_state: Optional[bool] = None, 3508 **kwargs: Any, 3509 ) -> DeclarativeStream: 3510 if ( 3511 model.full_refresh_stream.name != model.name 3512 or model.name != model.incremental_stream.name 3513 ): 3514 raise ValueError( 3515 f"The state_delegating_stream, full_refresh_stream and incremental_stream must have equal names. Instead found {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
3516 ) 3517 3518 stream_model = self._get_state_delegating_stream_model( 3519 False if has_parent_state is None else has_parent_state, model 3520 ) 3521 3522 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is alwyas DeclarativeStreamModel 3523 3524 def _get_state_delegating_stream_model( 3525 self, has_parent_state: bool, model: StateDelegatingStreamModel 3526 ) -> DeclarativeStreamModel: 3527 return ( 3528 model.incremental_stream 3529 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3530 else model.full_refresh_stream 3531 ) 3532 3533 def _create_async_job_status_mapping( 3534 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3535 ) -> Mapping[str, AsyncJobStatus]: 3536 api_status_to_cdk_status = {} 3537 for cdk_status, api_statuses in model.dict().items(): 3538 if cdk_status == "type": 3539 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3540 continue 3541 3542 for status in api_statuses: 3543 if status in api_status_to_cdk_status: 3544 raise ValueError( 3545 f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once" 3546 ) 3547 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3548 return api_status_to_cdk_status 3549 3550 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3551 match status: 3552 case "running": 3553 return AsyncJobStatus.RUNNING 3554 case "completed": 3555 return AsyncJobStatus.COMPLETED 3556 case "failed": 3557 return AsyncJobStatus.FAILED 3558 case "timeout": 3559 return AsyncJobStatus.TIMED_OUT 3560 case _: 3561 raise ValueError(f"Unsupported CDK status {status}") 3562 3563 def create_async_retriever( 3564 self, 3565 model: AsyncRetrieverModel, 3566 config: Config, 3567 *, 3568 name: str, 3569 primary_key: Optional[ 3570 Union[str, List[str], List[List[str]]] 3571 ], # this seems to be needed to match create_simple_retriever 3572 stream_slicer: Optional[StreamSlicer], 3573 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3574 transformations: List[RecordTransformation], 3575 **kwargs: Any, 3576 ) -> AsyncRetriever: 3577 if model.download_target_requester and not model.download_target_extractor: 3578 raise ValueError( 3579 f"`download_target_extractor` required if using a `download_target_requester`" 3580 ) 3581 3582 def _get_download_retriever( 3583 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3584 ) -> SimpleRetriever: 3585 # We create a record selector for the download retriever 3586 # with no schema normalization and no transformations, neither record filter 3587 # as all this occurs in the record_selector of the AsyncRetriever 3588 record_selector = RecordSelector( 3589 extractor=extractor, 3590 name=name, 3591 record_filter=None, 3592 transformations=[], 3593 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3594 config=config, 3595 parameters={}, 3596 ) 3597 paginator = ( 3598 self._create_component_from_model( 3599 model=model.download_paginator, 3600 decoder=_decoder, 3601 config=config, 3602 url_base="", 3603 ) 3604 if model.download_paginator 3605 else NoPagination(parameters={}) 3606 ) 3607 3608 return SimpleRetriever( 3609 requester=requester, 3610 record_selector=record_selector, 3611 primary_key=None, 3612 name=name, 3613 paginator=paginator, 3614 config=config, 3615 parameters={}, 3616 
log_formatter=self._get_log_formatter(None, name), 3617 ) 3618 3619 def _get_job_timeout() -> datetime.timedelta: 3620 user_defined_timeout: Optional[int] = ( 3621 int( 3622 InterpolatedString.create( 3623 str(model.polling_job_timeout), 3624 parameters={}, 3625 ).eval(config) 3626 ) 3627 if model.polling_job_timeout 3628 else None 3629 ) 3630 3631 # check for user defined timeout during the test read or 15 minutes 3632 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3633 # default value for non-connector builder is 60 minutes. 3634 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3635 3636 return ( 3637 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3638 ) 3639 3640 decoder = ( 3641 self._create_component_from_model(model=model.decoder, config=config) 3642 if model.decoder 3643 else JsonDecoder(parameters={}) 3644 ) 3645 record_selector = self._create_component_from_model( 3646 model=model.record_selector, 3647 config=config, 3648 decoder=decoder, 3649 name=name, 3650 transformations=transformations, 3651 client_side_incremental_sync=client_side_incremental_sync, 3652 ) 3653 3654 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3655 if self._should_limit_slices_fetched(): 3656 stream_slicer = cast( 3657 StreamSlicer, 3658 StreamSlicerTestReadDecorator( 3659 wrapped_slicer=stream_slicer, 3660 maximum_number_of_slices=self._limit_slices_fetched or 5, 3661 ), 3662 ) 3663 3664 creation_requester = self._create_component_from_model( 3665 model=model.creation_requester, 3666 decoder=decoder, 3667 config=config, 3668 name=f"job creation - {name}", 3669 ) 3670 polling_requester = self._create_component_from_model( 3671 model=model.polling_requester, 3672 decoder=decoder, 3673 config=config, 3674 name=f"job polling - {name}", 3675 ) 3676 job_download_components_name = f"job download - {name}" 3677 download_decoder = ( 3678 self._create_component_from_model(model=model.download_decoder, config=config) 3679 if model.download_decoder 3680 else JsonDecoder(parameters={}) 3681 ) 3682 download_extractor = ( 3683 self._create_component_from_model( 3684 model=model.download_extractor, 3685 config=config, 3686 decoder=download_decoder, 3687 parameters=model.parameters, 3688 ) 3689 if model.download_extractor 3690 else DpathExtractor( 3691 [], 3692 config=config, 3693 decoder=download_decoder, 3694 parameters=model.parameters or {}, 3695 ) 3696 ) 3697 download_requester = self._create_component_from_model( 3698 model=model.download_requester, 3699 decoder=download_decoder, 3700 config=config, 3701 name=job_download_components_name, 3702 ) 3703 download_retriever = _get_download_retriever( 3704 download_requester, download_extractor, download_decoder 3705 ) 3706 abort_requester = ( 3707 self._create_component_from_model( 3708 model=model.abort_requester, 3709 decoder=decoder, 3710 config=config, 3711 name=f"job abort - {name}", 3712 ) 3713 if model.abort_requester 3714 else None 3715 ) 3716 delete_requester = ( 3717 self._create_component_from_model( 3718 model=model.delete_requester, 3719 decoder=decoder, 3720 config=config, 3721 name=f"job delete - {name}", 3722 ) 3723 if model.delete_requester 3724 else None 3725 ) 3726 download_target_requester = ( 3727 self._create_component_from_model( 3728 model=model.download_target_requester, 3729 decoder=decoder, 3730 config=config, 3731 name=f"job extract_url - {name}", 3732 ) 3733 if model.download_target_requester 3734 else None 3735 ) 3736 
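# Summary of the assembly above and below (a simplified reading of this method, not additional
# behavior): creation_requester starts the async job, polling_requester checks it,
# status_extractor plus status_mapping translate API statuses into AsyncJobStatus values, and the
# download retriever, optionally via download_target_requester / download_target_extractor,
# fetches the results; abort_requester and delete_requester are optional. For example, a
# status_mapping model such as (the API status strings here are hypothetical)
#     {"running": ["pending", "in_progress"], "completed": ["done"], "failed": ["error"], "timeout": ["timed_out"]}
# is inverted by _create_async_job_status_mapping into
#     {"pending": AsyncJobStatus.RUNNING, "in_progress": AsyncJobStatus.RUNNING, "done": AsyncJobStatus.COMPLETED, ...}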
status_extractor = self._create_component_from_model( 3737 model=model.status_extractor, decoder=decoder, config=config, name=name 3738 ) 3739 download_target_extractor = ( 3740 self._create_component_from_model( 3741 model=model.download_target_extractor, 3742 decoder=decoder, 3743 config=config, 3744 name=name, 3745 ) 3746 if model.download_target_extractor 3747 else None 3748 ) 3749 3750 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3751 creation_requester=creation_requester, 3752 polling_requester=polling_requester, 3753 download_retriever=download_retriever, 3754 download_target_requester=download_target_requester, 3755 abort_requester=abort_requester, 3756 delete_requester=delete_requester, 3757 status_extractor=status_extractor, 3758 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3759 download_target_extractor=download_target_extractor, 3760 job_timeout=_get_job_timeout(), 3761 ) 3762 3763 async_job_partition_router = AsyncJobPartitionRouter( 3764 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3765 job_repository, 3766 stream_slices, 3767 self._job_tracker, 3768 self._message_repository, 3769 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3770 has_bulk_parent=False, 3771 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3772 # `None` == default retry is set to 3 attempts, under the hood. 3773 job_max_retry=1 if self._emit_connector_builder_messages else None, 3774 ), 3775 stream_slicer=stream_slicer, 3776 config=config, 3777 parameters=model.parameters or {}, 3778 ) 3779 3780 return AsyncRetriever( 3781 record_selector=record_selector, 3782 stream_slicer=async_job_partition_router, 3783 config=config, 3784 parameters=model.parameters or {}, 3785 ) 3786 3787 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3788 config_migrations = [ 3789 self._create_component_from_model(migration, config) 3790 for migration in ( 3791 model.config_normalization_rules.config_migrations 3792 if ( 3793 model.config_normalization_rules 3794 and model.config_normalization_rules.config_migrations 3795 ) 3796 else [] 3797 ) 3798 ] 3799 config_transformations = [ 3800 self._create_component_from_model(transformation, config) 3801 for transformation in ( 3802 model.config_normalization_rules.transformations 3803 if ( 3804 model.config_normalization_rules 3805 and model.config_normalization_rules.transformations 3806 ) 3807 else [] 3808 ) 3809 ] 3810 config_validations = [ 3811 self._create_component_from_model(validation, config) 3812 for validation in ( 3813 model.config_normalization_rules.validations 3814 if ( 3815 model.config_normalization_rules 3816 and model.config_normalization_rules.validations 3817 ) 3818 else [] 3819 ) 3820 ] 3821 3822 return Spec( 3823 connection_specification=model.connection_specification, 3824 documentation_url=model.documentation_url, 3825 advanced_auth=model.advanced_auth, 3826 parameters={}, 3827 config_migrations=config_migrations, 3828 config_transformations=config_transformations, 3829 config_validations=config_validations, 3830 ) 3831 3832 def create_substream_partition_router( 3833 self, 3834 model: SubstreamPartitionRouterModel, 3835 config: Config, 3836 *, 3837 stream_name: str, 3838 **kwargs: Any, 3839 ) -> SubstreamPartitionRouter: 3840 parent_stream_configs = [] 3841 if model.parent_stream_configs: 3842 parent_stream_configs.extend( 3843 [ 3844 
self.create_parent_stream_config_with_substream_wrapper( 3845 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 3846 ) 3847 for parent_stream_config in model.parent_stream_configs 3848 ] 3849 ) 3850 3851 return SubstreamPartitionRouter( 3852 parent_stream_configs=parent_stream_configs, 3853 parameters=model.parameters or {}, 3854 config=config, 3855 ) 3856 3857 def create_parent_stream_config_with_substream_wrapper( 3858 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 3859 ) -> Any: 3860 # getting the parent state 3861 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 3862 3863 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3864 has_parent_state = bool( 3865 self._connector_state_manager.get_stream_state(stream_name, None) 3866 if model.incremental_dependency 3867 else False 3868 ) 3869 connector_state_manager = self._instantiate_parent_stream_state_manager( 3870 child_state, config, model, has_parent_state 3871 ) 3872 3873 substream_factory = ModelToComponentFactory( 3874 connector_state_manager=connector_state_manager, 3875 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3876 limit_slices_fetched=self._limit_slices_fetched, 3877 emit_connector_builder_messages=self._emit_connector_builder_messages, 3878 disable_retries=self._disable_retries, 3879 disable_cache=self._disable_cache, 3880 message_repository=StateFilteringMessageRepository( 3881 LogAppenderMessageRepositoryDecorator( 3882 { 3883 "airbyte_cdk": {"stream": {"is_substream": True}}, 3884 "http": {"is_auxiliary": True}, 3885 }, 3886 self._message_repository, 3887 self._evaluate_log_level(self._emit_connector_builder_messages), 3888 ), 3889 ), 3890 ) 3891 3892 return substream_factory.create_parent_stream_config( 3893 model=model, config=config, stream_name=stream_name, **kwargs 3894 ) 3895 3896 def _instantiate_parent_stream_state_manager( 3897 self, 3898 child_state: MutableMapping[str, Any], 3899 config: Config, 3900 model: ParentStreamConfigModel, 3901 has_parent_state: bool, 3902 ) -> ConnectorStateManager: 3903 """ 3904 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 3905 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 3906 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 3907 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 3908 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 3909 incremental_dependency is set. 3910 """ 3911 if model.incremental_dependency and child_state: 3912 parent_stream_name = model.stream.name or "" 3913 parent_state = ConcurrentPerPartitionCursor.get_parent_state( 3914 child_state, parent_stream_name 3915 ) 3916 3917 if not parent_state: 3918 # there are two migration cases: state value from child stream or from global state 3919 parent_state = ConcurrentPerPartitionCursor.get_global_state( 3920 child_state, parent_stream_name 3921 ) 3922 3923 if not parent_state and not isinstance(parent_state, dict): 3924 cursor_values = child_state.values() 3925 if cursor_values and len(cursor_values) == 1: 3926 # We assume the child state is a pair `{<cursor_field>: <cursor_value>}` and we will use the 3927 # cursor value as a parent state. 
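# Hedged example of this fallback (values are hypothetical): given a legacy child state of
#     {"updated_at": "2024-01-01T00:00:00Z"}
# and a parent stream whose incremental_sync cursor_field evaluates to "updated_at", the single
# cursor value is wrapped below into a per-stream AirbyteStateMessage whose stream_state is
#     {"updated_at": "2024-01-01T00:00:00Z"}
# so that the parent stream can resume from the child's legacy cursor value.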
3928 incremental_sync_model: Union[ 3929 DatetimeBasedCursorModel, 3930 IncrementingCountCursorModel, 3931 ] = ( 3932 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 3933 if isinstance(model.stream, DeclarativeStreamModel) 3934 else self._get_state_delegating_stream_model( 3935 has_parent_state, model.stream 3936 ).incremental_sync 3937 ) 3938 cursor_field = InterpolatedString.create( 3939 incremental_sync_model.cursor_field, 3940 parameters=incremental_sync_model.parameters or {}, 3941 ).eval(config) 3942 parent_state = AirbyteStateMessage( 3943 type=AirbyteStateType.STREAM, 3944 stream=AirbyteStreamState( 3945 stream_descriptor=StreamDescriptor( 3946 name=parent_stream_name, namespace=None 3947 ), 3948 stream_state=AirbyteStateBlob( 3949 {cursor_field: list(cursor_values)[0]} 3950 ), 3951 ), 3952 ) 3953 return ConnectorStateManager([parent_state] if parent_state else []) 3954 3955 return ConnectorStateManager([]) 3956 3957 @staticmethod 3958 def create_wait_time_from_header( 3959 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3960 ) -> WaitTimeFromHeaderBackoffStrategy: 3961 return WaitTimeFromHeaderBackoffStrategy( 3962 header=model.header, 3963 parameters=model.parameters or {}, 3964 config=config, 3965 regex=model.regex, 3966 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3967 if model.max_waiting_time_in_seconds is not None 3968 else None, 3969 ) 3970 3971 @staticmethod 3972 def create_wait_until_time_from_header( 3973 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3974 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3975 return WaitUntilTimeFromHeaderBackoffStrategy( 3976 header=model.header, 3977 parameters=model.parameters or {}, 3978 config=config, 3979 min_wait=model.min_wait, 3980 regex=model.regex, 3981 ) 3982 3983 def get_message_repository(self) -> MessageRepository: 3984 return self._message_repository 3985 3986 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3987 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3988 3989 @staticmethod 3990 def create_components_mapping_definition( 3991 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3992 ) -> ComponentMappingDefinition: 3993 interpolated_value = InterpolatedString.create( 3994 model.value, parameters=model.parameters or {} 3995 ) 3996 field_path = [ 3997 InterpolatedString.create(path, parameters=model.parameters or {}) 3998 for path in model.field_path 3999 ] 4000 return ComponentMappingDefinition( 4001 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 4002 value=interpolated_value, 4003 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 4004 create_or_update=model.create_or_update, 4005 condition=model.condition, 4006 parameters=model.parameters or {}, 4007 ) 4008 4009 def create_http_components_resolver( 4010 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 4011 ) -> Any: 4012 retriever = self._create_component_from_model( 4013 model=model.retriever, 4014 config=config, 4015 name=f"{stream_name if stream_name else '__http_components_resolver'}", 4016 primary_key=None, 4017 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4018 transformations=[], 4019 ) 4020 4021 components_mapping = [] 4022 for 
component_mapping_definition_model in model.components_mapping: 4023 if component_mapping_definition_model.condition: 4024 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4025 components_mapping.append( 4026 self._create_component_from_model( 4027 model=component_mapping_definition_model, 4028 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4029 component_mapping_definition_model.value_type 4030 ), 4031 config=config, 4032 ) 4033 ) 4034 4035 return HttpComponentsResolver( 4036 retriever=retriever, 4037 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4038 config=config, 4039 components_mapping=components_mapping, 4040 parameters=model.parameters or {}, 4041 ) 4042 4043 @staticmethod 4044 def create_stream_config( 4045 model: StreamConfigModel, config: Config, **kwargs: Any 4046 ) -> StreamConfig: 4047 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 4048 [x for x in model.configs_pointer] if model.configs_pointer else [] 4049 ) 4050 4051 return StreamConfig( 4052 configs_pointer=model_configs_pointer, 4053 default_values=model.default_values, 4054 parameters=model.parameters or {}, 4055 ) 4056 4057 def create_config_components_resolver( 4058 self, 4059 model: ConfigComponentsResolverModel, 4060 config: Config, 4061 ) -> Any: 4062 model_stream_configs = ( 4063 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 4064 ) 4065 4066 stream_configs = [ 4067 self._create_component_from_model( 4068 stream_config, config=config, parameters=model.parameters or {} 4069 ) 4070 for stream_config in model_stream_configs 4071 ] 4072 4073 components_mapping = [ 4074 self._create_component_from_model( 4075 model=components_mapping_definition_model, 4076 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4077 components_mapping_definition_model.value_type 4078 ), 4079 config=config, 4080 parameters=model.parameters, 4081 ) 4082 for components_mapping_definition_model in model.components_mapping 4083 ] 4084 4085 return ConfigComponentsResolver( 4086 stream_configs=stream_configs, 4087 config=config, 4088 components_mapping=components_mapping, 4089 parameters=model.parameters or {}, 4090 ) 4091 4092 def create_parametrized_components_resolver( 4093 self, 4094 model: ParametrizedComponentsResolverModel, 4095 config: Config, 4096 ) -> ParametrizedComponentsResolver: 4097 stream_parameters = StreamParametersDefinition( 4098 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 4099 ) 4100 4101 components_mapping = [] 4102 for components_mapping_definition_model in model.components_mapping: 4103 if components_mapping_definition_model.condition: 4104 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4105 components_mapping.append( 4106 self._create_component_from_model( 4107 model=components_mapping_definition_model, 4108 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4109 components_mapping_definition_model.value_type 4110 ), 4111 config=config, 4112 ) 4113 ) 4114 return ParametrizedComponentsResolver( 4115 stream_parameters=stream_parameters, 4116 config=config, 4117 components_mapping=components_mapping, 4118 parameters=model.parameters or {}, 4119 ) 4120 4121 _UNSUPPORTED_DECODER_ERROR = ( 4122 "Specified decoder of {decoder_type} is not supported for pagination." 
4123 " Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4124 " If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4125 ) 4126 4127 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4128 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4129 return True 4130 elif isinstance(decoder, CompositeRawDecoder): 4131 return self._is_supported_parser_for_pagination(decoder.parser) 4132 else: 4133 return False 4134 4135 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4136 if isinstance(parser, JsonParser): 4137 return True 4138 elif isinstance(parser, GzipParser): 4139 return isinstance(parser.inner_parser, JsonParser) 4140 else: 4141 return False 4142 4143 def create_http_api_budget( 4144 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4145 ) -> HttpAPIBudget: 4146 policies = [ 4147 self._create_component_from_model(model=policy, config=config) 4148 for policy in model.policies 4149 ] 4150 4151 return HttpAPIBudget( 4152 policies=policies, 4153 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4154 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4155 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4156 ) 4157 4158 def create_fixed_window_call_rate_policy( 4159 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4160 ) -> FixedWindowCallRatePolicy: 4161 matchers = [ 4162 self._create_component_from_model(model=matcher, config=config) 4163 for matcher in model.matchers 4164 ] 4165 4166 # Set the initial reset timestamp to 10 days from now. 4167 # This value will be updated by the first request.
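# Hedged example: a policy defined with period: "PT1M" and call_limit: 60 allows roughly 60
# matched calls per one-minute window (isodate.parse_duration accepts ISO 8601 durations such as
# "PT1M" or "P1D"); the placeholder next_reset_ts below is then updated by the first request, per
# the note above.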
4168 return FixedWindowCallRatePolicy( 4169 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4170 period=parse_duration(model.period), 4171 call_limit=model.call_limit, 4172 matchers=matchers, 4173 ) 4174 4175 def create_file_uploader( 4176 self, model: FileUploaderModel, config: Config, **kwargs: Any 4177 ) -> FileUploader: 4178 name = "File Uploader" 4179 requester = self._create_component_from_model( 4180 model=model.requester, 4181 config=config, 4182 name=name, 4183 **kwargs, 4184 ) 4185 download_target_extractor = self._create_component_from_model( 4186 model=model.download_target_extractor, 4187 config=config, 4188 name=name, 4189 **kwargs, 4190 ) 4191 emit_connector_builder_messages = self._emit_connector_builder_messages 4192 file_uploader = DefaultFileUploader( 4193 requester=requester, 4194 download_target_extractor=download_target_extractor, 4195 config=config, 4196 file_writer=NoopFileWriter() 4197 if emit_connector_builder_messages 4198 else LocalFileSystemFileWriter(), 4199 parameters=model.parameters or {}, 4200 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4201 ) 4202 4203 return ( 4204 ConnectorBuilderFileUploader(file_uploader) 4205 if emit_connector_builder_messages 4206 else file_uploader 4207 ) 4208 4209 def create_moving_window_call_rate_policy( 4210 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4211 ) -> MovingWindowCallRatePolicy: 4212 rates = [ 4213 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4214 ] 4215 matchers = [ 4216 self._create_component_from_model(model=matcher, config=config) 4217 for matcher in model.matchers 4218 ] 4219 return MovingWindowCallRatePolicy( 4220 rates=rates, 4221 matchers=matchers, 4222 ) 4223 4224 def create_unlimited_call_rate_policy( 4225 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4226 ) -> UnlimitedCallRatePolicy: 4227 matchers = [ 4228 self._create_component_from_model(model=matcher, config=config) 4229 for matcher in model.matchers 4230 ] 4231 4232 return UnlimitedCallRatePolicy( 4233 matchers=matchers, 4234 ) 4235 4236 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4237 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4238 return Rate( 4239 limit=int(interpolated_limit.eval(config=config)), 4240 interval=parse_duration(model.interval), 4241 ) 4242 4243 def create_http_request_matcher( 4244 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4245 ) -> HttpRequestRegexMatcher: 4246 return HttpRequestRegexMatcher( 4247 method=model.method, 4248 url_base=model.url_base, 4249 url_path_pattern=model.url_path_pattern, 4250 params=model.params, 4251 headers=model.headers, 4252 ) 4253 4254 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4255 self._api_budget = self.create_component( 4256 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4257 ) 4258 4259 def create_grouping_partition_router( 4260 self, 4261 model: GroupingPartitionRouterModel, 4262 config: Config, 4263 *, 4264 stream_name: str, 4265 **kwargs: Any, 4266 ) -> GroupingPartitionRouter: 4267 underlying_router = self._create_component_from_model( 4268 model=model.underlying_partition_router, 4269 config=config, 4270 stream_name=stream_name, 4271 **kwargs, 4272 ) 4273 if model.group_size < 1: 4274 raise ValueError(f"Group size must be greater than 0, got 
{model.group_size}") 4275 4276 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4277 # because they are specific to individual partitions and cannot be aggregated or handled 4278 # when grouping, potentially leading to incorrect API calls. Any request customization 4279 # should be managed at the stream level through the requester's configuration. 4280 if isinstance(underlying_router, SubstreamPartitionRouter): 4281 if any( 4282 parent_config.request_option 4283 for parent_config in underlying_router.parent_stream_configs 4284 ): 4285 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4286 4287 if isinstance(underlying_router, ListPartitionRouter): 4288 if underlying_router.request_option: 4289 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4290 4291 return GroupingPartitionRouter( 4292 group_size=model.group_size, 4293 underlying_partition_router=underlying_router, 4294 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4295 config=config, 4296 ) 4297 4298 def _ensure_query_properties_to_model( 4299 self, requester: Union[HttpRequesterModel, CustomRequesterModel] 4300 ) -> None: 4301 """ 4302 For some reason, it seems like CustomRequesterModel request_parameters stays as dictionaries which means that 4303 the other conditions relying on it being QueryPropertiesModel instead of a dict fail. Here, we migrate them to 4304 proper model. 4305 """ 4306 if not hasattr(requester, "request_parameters"): 4307 return 4308 4309 request_parameters = requester.request_parameters 4310 if request_parameters and isinstance(request_parameters, Dict): 4311 for request_parameter_key in request_parameters.keys(): 4312 request_parameter = request_parameters[request_parameter_key] 4313 if ( 4314 isinstance(request_parameter, Dict) 4315 and request_parameter.get("type") == "QueryProperties" 4316 ): 4317 request_parameters[request_parameter_key] = QueryPropertiesModel.parse_obj( 4318 request_parameter 4319 )
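# Usage sketch (an illustrative approximation, not the canonical entry point; the factory is
# normally driven by the declarative manifest source): a single component can also be built
# directly from a manifest snippet, e.g.
#
#     factory = ModelToComponentFactory()
#     request_option = factory.create_component(
#         model_type=RequestOptionModel,
#         component_definition={
#             "type": "RequestOption",
#             "inject_into": "request_parameter",
#             "field_name": "page_size",
#         },
#         config={},
#     )
#
# The component_definition keys follow the schema implied by create_request_option above; treat
# the exact field names as an approximation.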
671class ModelToComponentFactory: 672 EPOCH_DATETIME_FORMAT = "%s" 673 674 def __init__( 675 self, 676 limit_pages_fetched_per_slice: Optional[int] = None, 677 limit_slices_fetched: Optional[int] = None, 678 emit_connector_builder_messages: bool = False, 679 disable_retries: bool = False, 680 disable_cache: bool = False, 681 message_repository: Optional[MessageRepository] = None, 682 connector_state_manager: Optional[ConnectorStateManager] = None, 683 max_concurrent_async_job_count: Optional[int] = None, 684 configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, 685 ): 686 self._init_mappings() 687 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 688 self._limit_slices_fetched = limit_slices_fetched 689 self._emit_connector_builder_messages = emit_connector_builder_messages 690 self._disable_retries = disable_retries 691 self._disable_cache = disable_cache 692 self._message_repository = message_repository or InMemoryMessageRepository( 693 self._evaluate_log_level(emit_connector_builder_messages) 694 ) 695 self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream( 696 configured_catalog 697 ) 698 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 699 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 700 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 701 # placeholder for deprecation warnings 702 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] 703 704 def _init_mappings(self) -> None: 705 self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { 706 AddedFieldDefinitionModel: self.create_added_field_definition, 707 AddFieldsModel: self.create_add_fields, 708 ApiKeyAuthenticatorModel: self.create_api_key_authenticator, 709 BasicHttpAuthenticatorModel: self.create_basic_http_authenticator, 710 BearerAuthenticatorModel: self.create_bearer_authenticator, 711 CheckStreamModel: self.create_check_stream, 712 DynamicStreamCheckConfigModel: self.create_dynamic_stream_check_config, 713 CheckDynamicStreamModel: self.create_check_dynamic_stream, 714 CompositeErrorHandlerModel: self.create_composite_error_handler, 715 ConcurrencyLevelModel: self.create_concurrency_level, 716 ConfigMigrationModel: self.create_config_migration, 717 ConfigAddFieldsModel: self.create_config_add_fields, 718 ConfigRemapFieldModel: self.create_config_remap_field, 719 ConfigRemoveFieldsModel: self.create_config_remove_fields, 720 ConstantBackoffStrategyModel: self.create_constant_backoff_strategy, 721 CsvDecoderModel: self.create_csv_decoder, 722 CursorPaginationModel: self.create_cursor_pagination, 723 CustomAuthenticatorModel: self.create_custom_component, 724 CustomBackoffStrategyModel: self.create_custom_component, 725 CustomDecoderModel: self.create_custom_component, 726 CustomErrorHandlerModel: self.create_custom_component, 727 CustomRecordExtractorModel: self.create_custom_component, 728 CustomRecordFilterModel: self.create_custom_component, 729 CustomRequesterModel: self.create_custom_component, 730 CustomRetrieverModel: self.create_custom_component, 731 CustomSchemaLoader: self.create_custom_component, 732 CustomSchemaNormalizationModel: self.create_custom_component, 733 CustomStateMigration: self.create_custom_component, 734 CustomPaginationStrategyModel: self.create_custom_component, 735 CustomPartitionRouterModel: self.create_custom_component, 736 CustomTransformationModel: self.create_custom_component, 737 
CustomValidationStrategyModel: self.create_custom_component, 738 CustomConfigTransformationModel: self.create_custom_component, 739 DatetimeBasedCursorModel: self.create_datetime_based_cursor, 740 DeclarativeStreamModel: self.create_default_stream, 741 DefaultErrorHandlerModel: self.create_default_error_handler, 742 DefaultPaginatorModel: self.create_default_paginator, 743 DpathExtractorModel: self.create_dpath_extractor, 744 DpathValidatorModel: self.create_dpath_validator, 745 ResponseToFileExtractorModel: self.create_response_to_file_extractor, 746 ExponentialBackoffStrategyModel: self.create_exponential_backoff_strategy, 747 SessionTokenAuthenticatorModel: self.create_session_token_authenticator, 748 GroupByKeyMergeStrategyModel: self.create_group_by_key, 749 HttpRequesterModel: self.create_http_requester, 750 HttpResponseFilterModel: self.create_http_response_filter, 751 InlineSchemaLoaderModel: self.create_inline_schema_loader, 752 JsonDecoderModel: self.create_json_decoder, 753 JsonlDecoderModel: self.create_jsonl_decoder, 754 JsonSchemaPropertySelectorModel: self.create_json_schema_property_selector, 755 GzipDecoderModel: self.create_gzip_decoder, 756 KeysToLowerModel: self.create_keys_to_lower_transformation, 757 KeysToSnakeCaseModel: self.create_keys_to_snake_transformation, 758 KeysReplaceModel: self.create_keys_replace_transformation, 759 FlattenFieldsModel: self.create_flatten_fields, 760 DpathFlattenFieldsModel: self.create_dpath_flatten_fields, 761 IterableDecoderModel: self.create_iterable_decoder, 762 IncrementingCountCursorModel: self.create_incrementing_count_cursor, 763 XmlDecoderModel: self.create_xml_decoder, 764 JsonFileSchemaLoaderModel: self.create_json_file_schema_loader, 765 DynamicSchemaLoaderModel: self.create_dynamic_schema_loader, 766 SchemaTypeIdentifierModel: self.create_schema_type_identifier, 767 TypesMapModel: self.create_types_map, 768 ComplexFieldTypeModel: self.create_complex_field_type, 769 JwtAuthenticatorModel: self.create_jwt_authenticator, 770 LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration, 771 ListPartitionRouterModel: self.create_list_partition_router, 772 MinMaxDatetimeModel: self.create_min_max_datetime, 773 NoAuthModel: self.create_no_auth, 774 NoPaginationModel: self.create_no_pagination, 775 OAuthAuthenticatorModel: self.create_oauth_authenticator, 776 OffsetIncrementModel: self.create_offset_increment, 777 PageIncrementModel: self.create_page_increment, 778 ParentStreamConfigModel: self.create_parent_stream_config_with_substream_wrapper, 779 PredicateValidatorModel: self.create_predicate_validator, 780 PropertiesFromEndpointModel: self.create_properties_from_endpoint, 781 PropertyChunkingModel: self.create_property_chunking, 782 QueryPropertiesModel: self.create_query_properties, 783 RecordFilterModel: self.create_record_filter, 784 RecordSelectorModel: self.create_record_selector, 785 RemoveFieldsModel: self.create_remove_fields, 786 RequestPathModel: self.create_request_path, 787 RequestOptionModel: self.create_request_option, 788 LegacySessionTokenAuthenticatorModel: self.create_legacy_session_token_authenticator, 789 SelectiveAuthenticatorModel: self.create_selective_authenticator, 790 SimpleRetrieverModel: self.create_simple_retriever, 791 StateDelegatingStreamModel: self.create_state_delegating_stream, 792 SpecModel: self.create_spec, 793 SubstreamPartitionRouterModel: self.create_substream_partition_router, 794 ValidateAdheresToSchemaModel: self.create_validate_adheres_to_schema, 795 
WaitTimeFromHeaderModel: self.create_wait_time_from_header, 796 WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header, 797 AsyncRetrieverModel: self.create_async_retriever, 798 HttpComponentsResolverModel: self.create_http_components_resolver, 799 ConfigComponentsResolverModel: self.create_config_components_resolver, 800 ParametrizedComponentsResolverModel: self.create_parametrized_components_resolver, 801 StreamConfigModel: self.create_stream_config, 802 ComponentMappingDefinitionModel: self.create_components_mapping_definition, 803 ZipfileDecoderModel: self.create_zipfile_decoder, 804 HTTPAPIBudgetModel: self.create_http_api_budget, 805 FileUploaderModel: self.create_file_uploader, 806 FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, 807 MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, 808 UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, 809 RateModel: self.create_rate, 810 HttpRequestRegexMatcherModel: self.create_http_request_matcher, 811 GroupingPartitionRouterModel: self.create_grouping_partition_router, 812 } 813 814 # Needed for the case where we need to perform a second parse on the fields of a custom component 815 self.TYPE_NAME_TO_MODEL = {cls.__name__: cls for cls in self.PYDANTIC_MODEL_TO_CONSTRUCTOR} 816 817 @staticmethod 818 def _create_stream_name_to_configured_stream( 819 configured_catalog: Optional[ConfiguredAirbyteCatalog], 820 ) -> Mapping[str, ConfiguredAirbyteStream]: 821 return ( 822 {stream.stream.name: stream for stream in configured_catalog.streams} 823 if configured_catalog 824 else {} 825 ) 826 827 def create_component( 828 self, 829 model_type: Type[BaseModel], 830 component_definition: ComponentDefinition, 831 config: Config, 832 **kwargs: Any, 833 ) -> Any: 834 """ 835 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 836 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 837 declarative components from that model.
838 839 :param model_type: The type of declarative component that is being initialized 840 :param component_definition: The mapping that represents a declarative component 841 :param config: The connector config that is provided by the customer 842 :return: The declarative component to be used at runtime 843 """ 844 845 component_type = component_definition.get("type") 846 if component_definition.get("type") != model_type.__name__: 847 raise ValueError( 848 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 849 ) 850 851 declarative_component_model = model_type.parse_obj(component_definition) 852 853 if not isinstance(declarative_component_model, model_type): 854 raise ValueError( 855 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 856 ) 857 858 return self._create_component_from_model( 859 model=declarative_component_model, config=config, **kwargs 860 ) 861 862 def _create_component_from_model(self, model: BaseModel, config: Config, **kwargs: Any) -> Any: 863 if model.__class__ not in self.PYDANTIC_MODEL_TO_CONSTRUCTOR: 864 raise ValueError( 865 f"{model.__class__} with attributes {model} is not a valid component type" 866 ) 867 component_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(model.__class__) 868 if not component_constructor: 869 raise ValueError(f"Could not find constructor for {model.__class__}") 870 871 # collect deprecation warnings for supported models. 872 if isinstance(model, BaseModelWithDeprecations): 873 self._collect_model_deprecations(model) 874 875 return component_constructor(model=model, config=config, **kwargs) 876 877 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 878 """ 879 Returns the deprecation warnings that were collected during the creation of components. 880 """ 881 return self._collected_deprecation_logs 882 883 def _collect_model_deprecations(self, model: BaseModelWithDeprecations) -> None: 884 """ 885 Collects deprecation logs from the given model and appends any new logs to the internal collection. 886 887 This method checks if the provided model has deprecation logs (identified by the presence of the DEPRECATION_LOGS_TAG attribute and a non-None `_deprecation_logs` property). It iterates through each deprecation log in the model and appends it to the `_collected_deprecation_logs` list if it has not already been collected, ensuring that duplicate logs are avoided. 888 889 Args: 890 model (BaseModelWithDeprecations): The model instance from which to collect deprecation logs. 891 """ 892 if hasattr(model, DEPRECATION_LOGS_TAG) and model._deprecation_logs is not None: 893 for log in model._deprecation_logs: 894 # avoid duplicates for deprecation logs observed. 
895 if log not in self._collected_deprecation_logs: 896 self._collected_deprecation_logs.append(log) 897 898 def create_config_migration( 899 self, model: ConfigMigrationModel, config: Config 900 ) -> ConfigMigration: 901 transformations: List[ConfigTransformation] = [ 902 self._create_component_from_model(transformation, config) 903 for transformation in model.transformations 904 ] 905 906 return ConfigMigration( 907 description=model.description, 908 transformations=transformations, 909 ) 910 911 def create_config_add_fields( 912 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 913 ) -> ConfigAddFields: 914 fields = [self._create_component_from_model(field, config) for field in model.fields] 915 return ConfigAddFields( 916 fields=fields, 917 condition=model.condition or "", 918 ) 919 920 @staticmethod 921 def create_config_remove_fields( 922 model: ConfigRemoveFieldsModel, config: Config, **kwargs: Any 923 ) -> ConfigRemoveFields: 924 return ConfigRemoveFields( 925 field_pointers=model.field_pointers, 926 condition=model.condition or "", 927 ) 928 929 @staticmethod 930 def create_config_remap_field( 931 model: ConfigRemapFieldModel, config: Config, **kwargs: Any 932 ) -> ConfigRemapField: 933 mapping = cast(Mapping[str, Any], model.map) 934 return ConfigRemapField( 935 map=mapping, 936 field_path=model.field_path, 937 config=config, 938 ) 939 940 def create_dpath_validator(self, model: DpathValidatorModel, config: Config) -> DpathValidator: 941 strategy = self._create_component_from_model(model.validation_strategy, config) 942 943 return DpathValidator( 944 field_path=model.field_path, 945 strategy=strategy, 946 ) 947 948 def create_predicate_validator( 949 self, model: PredicateValidatorModel, config: Config 950 ) -> PredicateValidator: 951 strategy = self._create_component_from_model(model.validation_strategy, config) 952 953 return PredicateValidator( 954 value=model.value, 955 strategy=strategy, 956 ) 957 958 @staticmethod 959 def create_validate_adheres_to_schema( 960 model: ValidateAdheresToSchemaModel, config: Config, **kwargs: Any 961 ) -> ValidateAdheresToSchema: 962 base_schema = cast(Mapping[str, Any], model.base_schema) 963 return ValidateAdheresToSchema( 964 schema=base_schema, 965 ) 966 967 @staticmethod 968 def create_added_field_definition( 969 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 970 ) -> AddedFieldDefinition: 971 interpolated_value = InterpolatedString.create( 972 model.value, parameters=model.parameters or {} 973 ) 974 return AddedFieldDefinition( 975 path=model.path, 976 value=interpolated_value, 977 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 978 parameters=model.parameters or {}, 979 ) 980 981 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 982 added_field_definitions = [ 983 self._create_component_from_model( 984 model=added_field_definition_model, 985 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 986 added_field_definition_model.value_type 987 ), 988 config=config, 989 ) 990 for added_field_definition_model in model.fields 991 ] 992 return AddFields( 993 fields=added_field_definitions, 994 condition=model.condition or "", 995 parameters=model.parameters or {}, 996 ) 997 998 def create_keys_to_lower_transformation( 999 self, model: KeysToLowerModel, config: Config, **kwargs: Any 1000 ) -> KeysToLowerTransformation: 1001 return KeysToLowerTransformation() 1002 1003 def create_keys_to_snake_transformation( 1004 self, 
model: KeysToSnakeCaseModel, config: Config, **kwargs: Any 1005 ) -> KeysToSnakeCaseTransformation: 1006 return KeysToSnakeCaseTransformation() 1007 1008 def create_keys_replace_transformation( 1009 self, model: KeysReplaceModel, config: Config, **kwargs: Any 1010 ) -> KeysReplaceTransformation: 1011 return KeysReplaceTransformation( 1012 old=model.old, new=model.new, parameters=model.parameters or {} 1013 ) 1014 1015 def create_flatten_fields( 1016 self, model: FlattenFieldsModel, config: Config, **kwargs: Any 1017 ) -> FlattenFields: 1018 return FlattenFields( 1019 flatten_lists=model.flatten_lists if model.flatten_lists is not None else True 1020 ) 1021 1022 def create_dpath_flatten_fields( 1023 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 1024 ) -> DpathFlattenFields: 1025 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 1026 key_transformation = ( 1027 KeyTransformation( 1028 config=config, 1029 prefix=model.key_transformation.prefix, 1030 suffix=model.key_transformation.suffix, 1031 parameters=model.parameters or {}, 1032 ) 1033 if model.key_transformation is not None 1034 else None 1035 ) 1036 return DpathFlattenFields( 1037 config=config, 1038 field_path=model_field_path, 1039 delete_origin_value=model.delete_origin_value 1040 if model.delete_origin_value is not None 1041 else False, 1042 replace_record=model.replace_record if model.replace_record is not None else False, 1043 key_transformation=key_transformation, 1044 parameters=model.parameters or {}, 1045 ) 1046 1047 @staticmethod 1048 def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]: 1049 if not value_type: 1050 return None 1051 names_to_types = { 1052 ValueType.string: str, 1053 ValueType.number: float, 1054 ValueType.integer: int, 1055 ValueType.boolean: bool, 1056 } 1057 return names_to_types[value_type] 1058 1059 def create_api_key_authenticator( 1060 self, 1061 model: ApiKeyAuthenticatorModel, 1062 config: Config, 1063 token_provider: Optional[TokenProvider] = None, 1064 **kwargs: Any, 1065 ) -> ApiKeyAuthenticator: 1066 if model.inject_into is None and model.header is None: 1067 raise ValueError( 1068 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1069 ) 1070 1071 if model.inject_into is not None and model.header is not None: 1072 raise ValueError( 1073 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1074 ) 1075 1076 if token_provider is not None and model.api_token != "": 1077 raise ValueError( 1078 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1079 ) 1080 1081 request_option = ( 1082 self._create_component_from_model( 1083 model.inject_into, config, parameters=model.parameters or {} 1084 ) 1085 if model.inject_into 1086 else RequestOption( 1087 inject_into=RequestOptionType.header, 1088 field_name=model.header or "", 1089 parameters=model.parameters or {}, 1090 ) 1091 ) 1092 1093 return ApiKeyAuthenticator( 1094 token_provider=( 1095 token_provider 1096 if token_provider is not None 1097 else InterpolatedStringTokenProvider( 1098 api_token=model.api_token or "", 1099 config=config, 1100 parameters=model.parameters or {}, 1101 ) 1102 ), 1103 request_option=request_option, 1104 config=config, 1105 parameters=model.parameters or {}, 1106 ) 1107 1108 def create_legacy_to_per_partition_state_migration( 1109 self, 1110 model: LegacyToPerPartitionStateMigrationModel, 1111 config: Mapping[str, Any], 1112 declarative_stream: DeclarativeStreamModel, 1113 ) -> LegacyToPerPartitionStateMigration: 1114 retriever = declarative_stream.retriever 1115 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1116 raise ValueError( 1117 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1118 ) 1119 partition_router = retriever.partition_router 1120 if not isinstance( 1121 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1122 ): 1123 raise ValueError( 1124 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1125 ) 1126 if not hasattr(partition_router, "parent_stream_configs"): 1127 raise ValueError( 1128 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1129 ) 1130 1131 if not hasattr(declarative_stream, "incremental_sync"): 1132 raise ValueError( 1133 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1134 ) 1135 1136 return LegacyToPerPartitionStateMigration( 1137 partition_router, # type: ignore # was already checked above 1138 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 
1139 config, 1140 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1141 ) 1142 1143 def create_session_token_authenticator( 1144 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1145 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1146 decoder = ( 1147 self._create_component_from_model(model=model.decoder, config=config) 1148 if model.decoder 1149 else JsonDecoder(parameters={}) 1150 ) 1151 login_requester = self._create_component_from_model( 1152 model=model.login_requester, 1153 config=config, 1154 name=f"{name}_login_requester", 1155 decoder=decoder, 1156 ) 1157 token_provider = SessionTokenProvider( 1158 login_requester=login_requester, 1159 session_token_path=model.session_token_path, 1160 expiration_duration=parse_duration(model.expiration_duration) 1161 if model.expiration_duration 1162 else None, 1163 parameters=model.parameters or {}, 1164 message_repository=self._message_repository, 1165 decoder=decoder, 1166 ) 1167 if model.request_authentication.type == "Bearer": 1168 return ModelToComponentFactory.create_bearer_authenticator( 1169 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1170 config, 1171 token_provider=token_provider, 1172 ) 1173 else: 1174 return self.create_api_key_authenticator( 1175 ApiKeyAuthenticatorModel( 1176 type="ApiKeyAuthenticator", 1177 api_token="", 1178 inject_into=model.request_authentication.inject_into, 1179 ), # type: ignore # $parameters and headers default to None 1180 config=config, 1181 token_provider=token_provider, 1182 ) 1183 1184 @staticmethod 1185 def create_basic_http_authenticator( 1186 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1187 ) -> BasicHttpAuthenticator: 1188 return BasicHttpAuthenticator( 1189 password=model.password or "", 1190 username=model.username, 1191 config=config, 1192 parameters=model.parameters or {}, 1193 ) 1194 1195 @staticmethod 1196 def create_bearer_authenticator( 1197 model: BearerAuthenticatorModel, 1198 config: Config, 1199 token_provider: Optional[TokenProvider] = None, 1200 **kwargs: Any, 1201 ) -> BearerAuthenticator: 1202 if token_provider is not None and model.api_token != "": 1203 raise ValueError( 1204 "If token_provider is set, api_token is ignored and has to be set to empty string." 
1205 ) 1206 return BearerAuthenticator( 1207 token_provider=( 1208 token_provider 1209 if token_provider is not None 1210 else InterpolatedStringTokenProvider( 1211 api_token=model.api_token or "", 1212 config=config, 1213 parameters=model.parameters or {}, 1214 ) 1215 ), 1216 config=config, 1217 parameters=model.parameters or {}, 1218 ) 1219 1220 @staticmethod 1221 def create_dynamic_stream_check_config( 1222 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1223 ) -> DynamicStreamCheckConfig: 1224 return DynamicStreamCheckConfig( 1225 dynamic_stream_name=model.dynamic_stream_name, 1226 stream_count=model.stream_count or 0, 1227 ) 1228 1229 def create_check_stream( 1230 self, model: CheckStreamModel, config: Config, **kwargs: Any 1231 ) -> CheckStream: 1232 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1233 raise ValueError( 1234 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1235 ) 1236 1237 dynamic_streams_check_configs = ( 1238 [ 1239 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1240 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1241 ] 1242 if model.dynamic_streams_check_configs 1243 else [] 1244 ) 1245 1246 return CheckStream( 1247 stream_names=model.stream_names or [], 1248 dynamic_streams_check_configs=dynamic_streams_check_configs, 1249 parameters={}, 1250 ) 1251 1252 @staticmethod 1253 def create_check_dynamic_stream( 1254 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1255 ) -> CheckDynamicStream: 1256 assert model.use_check_availability is not None # for mypy 1257 1258 use_check_availability = model.use_check_availability 1259 1260 return CheckDynamicStream( 1261 stream_count=model.stream_count, 1262 use_check_availability=use_check_availability, 1263 parameters={}, 1264 ) 1265 1266 def create_composite_error_handler( 1267 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1268 ) -> CompositeErrorHandler: 1269 error_handlers = [ 1270 self._create_component_from_model(model=error_handler_model, config=config) 1271 for error_handler_model in model.error_handlers 1272 ] 1273 return CompositeErrorHandler( 1274 error_handlers=error_handlers, parameters=model.parameters or {} 1275 ) 1276 1277 @staticmethod 1278 def create_concurrency_level( 1279 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1280 ) -> ConcurrencyLevel: 1281 return ConcurrencyLevel( 1282 default_concurrency=model.default_concurrency, 1283 max_concurrency=model.max_concurrency, 1284 config=config, 1285 parameters={}, 1286 ) 1287 1288 @staticmethod 1289 def apply_stream_state_migrations( 1290 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1291 ) -> MutableMapping[str, Any]: 1292 if stream_state_migrations: 1293 for state_migration in stream_state_migrations: 1294 if state_migration.should_migrate(stream_state): 1295 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 
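# Editor's note (illustrative, not part of the upstream source): the loop above only assumes that
# each entry in stream_state_migrations exposes should_migrate() and migrate(). A minimal
# hypothetical migration is sketched below; it also shows why the result is re-wrapped in dict()
# on the next line - migrate() is allowed to hand back an immutable Mapping.
#
#     from types import MappingProxyType
#
#     class RenameLegacyCursorKeyMigration:
#         def should_migrate(self, stream_state: Mapping[str, Any]) -> bool:
#             return "updated" in stream_state
#
#         def migrate(self, stream_state: Mapping[str, Any]) -> Mapping[str, Any]:
#             return MappingProxyType({"updated_at": stream_state["updated"]})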
1296 stream_state = dict(state_migration.migrate(stream_state)) 1297 return stream_state 1298 1299 def create_concurrent_cursor_from_datetime_based_cursor( 1300 self, 1301 model_type: Type[BaseModel], 1302 component_definition: ComponentDefinition, 1303 stream_name: str, 1304 stream_namespace: Optional[str], 1305 stream_state: MutableMapping[str, Any], 1306 config: Config, 1307 message_repository: Optional[MessageRepository] = None, 1308 runtime_lookback_window: Optional[datetime.timedelta] = None, 1309 **kwargs: Any, 1310 ) -> ConcurrentCursor: 1311 component_type = component_definition.get("type") 1312 if component_definition.get("type") != model_type.__name__: 1313 raise ValueError( 1314 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1315 ) 1316 1317 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1318 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1319 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1320 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1321 if "$parameters" not in component_definition and "parameters" in component_definition: 1322 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1323 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1324 1325 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1326 raise ValueError( 1327 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1328 ) 1329 1330 model_parameters = datetime_based_cursor_model.parameters or {} 1331 interpolated_cursor_field = InterpolatedString.create( 1332 datetime_based_cursor_model.cursor_field, 1333 parameters=model_parameters, 1334 ) 1335 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1336 1337 interpolated_partition_field_start = InterpolatedString.create( 1338 datetime_based_cursor_model.partition_field_start or "start_time", 1339 parameters=model_parameters, 1340 ) 1341 interpolated_partition_field_end = InterpolatedString.create( 1342 datetime_based_cursor_model.partition_field_end or "end_time", 1343 parameters=model_parameters, 1344 ) 1345 1346 slice_boundary_fields = ( 1347 interpolated_partition_field_start.eval(config=config), 1348 interpolated_partition_field_end.eval(config=config), 1349 ) 1350 1351 datetime_format = datetime_based_cursor_model.datetime_format 1352 1353 cursor_granularity = ( 1354 parse_duration(datetime_based_cursor_model.cursor_granularity) 1355 if datetime_based_cursor_model.cursor_granularity 1356 else None 1357 ) 1358 1359 lookback_window = None 1360 interpolated_lookback_window = ( 1361 InterpolatedString.create( 1362 datetime_based_cursor_model.lookback_window, 1363 parameters=model_parameters, 1364 ) 1365 if datetime_based_cursor_model.lookback_window 1366 else None 1367 ) 1368 if interpolated_lookback_window: 1369 
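# Editor's note (illustrative, not part of the upstream source): a hedged example of the
# evaluation happening here. A manifest value such as
#     lookback_window: "P{{ config['lookback_days'] }}D"
# would interpolate against a config like {"lookback_days": 3} into "P3D", which
# isodate.parse_duration() then turns into datetime.timedelta(days=3). The config key
# `lookback_days` is hypothetical.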
evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1370 if evaluated_lookback_window: 1371 lookback_window = parse_duration(evaluated_lookback_window) 1372 1373 connector_state_converter: DateTimeStreamStateConverter 1374 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1375 datetime_format=datetime_format, 1376 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1377 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1378 cursor_granularity=cursor_granularity, 1379 ) 1380 1381 # Adjusts the stream state by applying the runtime lookback window. 1382 # This is used to ensure correct state handling in case of failed partitions. 1383 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1384 if runtime_lookback_window and stream_state_value: 1385 new_stream_state = ( 1386 connector_state_converter.parse_timestamp(stream_state_value) 1387 - runtime_lookback_window 1388 ) 1389 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1390 new_stream_state 1391 ) 1392 1393 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1394 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1395 start_date_runtime_value = self.create_min_max_datetime( 1396 model=datetime_based_cursor_model.start_datetime, config=config 1397 ) 1398 else: 1399 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1400 1401 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1402 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1403 end_date_runtime_value = self.create_min_max_datetime( 1404 model=datetime_based_cursor_model.end_datetime, config=config 1405 ) 1406 else: 1407 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1408 1409 interpolated_start_date = MinMaxDatetime.create( 1410 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1411 parameters=datetime_based_cursor_model.parameters, 1412 ) 1413 interpolated_end_date = ( 1414 None 1415 if not end_date_runtime_value 1416 else MinMaxDatetime.create( 1417 end_date_runtime_value, datetime_based_cursor_model.parameters 1418 ) 1419 ) 1420 1421 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1422 if not interpolated_start_date.datetime_format: 1423 interpolated_start_date.datetime_format = datetime_format 1424 if interpolated_end_date and not interpolated_end_date.datetime_format: 1425 interpolated_end_date.datetime_format = datetime_format 1426 1427 start_date = interpolated_start_date.get_datetime(config=config) 1428 end_date_provider = ( 1429 partial(interpolated_end_date.get_datetime, config) 1430 if interpolated_end_date 1431 else connector_state_converter.get_end_provider() 1432 ) 1433 1434 if ( 1435 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1436 ) or ( 1437 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1438 ): 1439 raise ValueError( 1440 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1441 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1442 ) 1443 1444 # When step is not defined, default to a step size from the starting date to the present moment 1445 step_length = datetime.timedelta.max 1446 interpolated_step = ( 1447 InterpolatedString.create( 1448 datetime_based_cursor_model.step, 1449 parameters=model_parameters, 1450 ) 1451 if datetime_based_cursor_model.step 1452 else None 1453 ) 1454 if interpolated_step: 1455 evaluated_step = interpolated_step.eval(config) 1456 if evaluated_step: 1457 step_length = parse_duration(evaluated_step) 1458 1459 clamping_strategy: ClampingStrategy = NoClamping() 1460 if datetime_based_cursor_model.clamping: 1461 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1462 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1463 # object which we want to keep agnostic of being low-code 1464 target = InterpolatedString( 1465 string=datetime_based_cursor_model.clamping.target, 1466 parameters=model_parameters, 1467 ) 1468 evaluated_target = target.eval(config=config) 1469 match evaluated_target: 1470 case "DAY": 1471 clamping_strategy = DayClampingStrategy() 1472 end_date_provider = ClampingEndProvider( 1473 DayClampingStrategy(is_ceiling=False), 1474 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1475 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1476 ) 1477 case "WEEK": 1478 if ( 1479 not datetime_based_cursor_model.clamping.target_details 1480 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1481 ): 1482 raise ValueError( 1483 "Given WEEK clamping, weekday needs to be provided as target_details" 1484 ) 1485 weekday = self._assemble_weekday( 1486 datetime_based_cursor_model.clamping.target_details["weekday"] 1487 ) 1488 clamping_strategy = WeekClampingStrategy(weekday) 1489 end_date_provider = ClampingEndProvider( 1490 WeekClampingStrategy(weekday, is_ceiling=False), 1491 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1492 granularity=cursor_granularity or datetime.timedelta(days=1), 1493 ) 1494 case "MONTH": 1495 clamping_strategy = MonthClampingStrategy() 1496 end_date_provider = ClampingEndProvider( 1497 MonthClampingStrategy(is_ceiling=False), 1498 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1499 granularity=cursor_granularity or datetime.timedelta(days=1), 1500 ) 1501 case _: 1502 raise ValueError( 1503 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1504 ) 1505 1506 return ConcurrentCursor( 1507 stream_name=stream_name, 1508 stream_namespace=stream_namespace, 1509 stream_state=stream_state, 1510 message_repository=message_repository or self._message_repository, 1511 connector_state_manager=self._connector_state_manager, 1512 connector_state_converter=connector_state_converter, 1513 cursor_field=cursor_field, 1514 slice_boundary_fields=slice_boundary_fields, 1515 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1516 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1517 lookback_window=lookback_window, 1518 slice_range=step_length, 1519 cursor_granularity=cursor_granularity, 1520 clamping_strategy=clamping_strategy, 1521 ) 1522 1523 def create_concurrent_cursor_from_incrementing_count_cursor( 1524 self, 1525 model_type: Type[BaseModel], 1526 component_definition: ComponentDefinition, 1527 stream_name: str, 1528 stream_namespace: Optional[str], 1529 stream_state: MutableMapping[str, Any], 1530 config: Config, 1531 message_repository: Optional[MessageRepository] = None, 1532 **kwargs: Any, 1533 ) -> ConcurrentCursor: 1534 component_type = component_definition.get("type") 1535 if component_definition.get("type") != model_type.__name__: 1536 raise ValueError( 1537 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1538 ) 1539 1540 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1541 1542 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1543 raise ValueError( 1544 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1545 ) 1546 1547 interpolated_start_value = ( 1548 InterpolatedString.create( 1549 incrementing_count_cursor_model.start_value, # type: ignore 1550 parameters=incrementing_count_cursor_model.parameters or {}, 1551 ) 1552 if incrementing_count_cursor_model.start_value 1553 else 0 1554 ) 1555 1556 interpolated_cursor_field = InterpolatedString.create( 1557 incrementing_count_cursor_model.cursor_field, 1558 parameters=incrementing_count_cursor_model.parameters or {}, 1559 ) 1560 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1561 1562 connector_state_converter = IncrementingCountStreamStateConverter( 1563 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1564 ) 1565 1566 return ConcurrentCursor( 1567 stream_name=stream_name, 1568 stream_namespace=stream_namespace, 1569 stream_state=stream_state, 1570 message_repository=message_repository or self._message_repository, 1571 connector_state_manager=self._connector_state_manager, 1572 connector_state_converter=connector_state_converter, 1573 cursor_field=cursor_field, 1574 slice_boundary_fields=None, 1575 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1576 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1577 ) 1578 1579 def _assemble_weekday(self, weekday: str) -> Weekday: 1580 match weekday: 1581 case "MONDAY": 1582 return Weekday.MONDAY 1583 case "TUESDAY": 1584 return Weekday.TUESDAY 1585 case "WEDNESDAY": 1586 return Weekday.WEDNESDAY 1587 case "THURSDAY": 1588 return Weekday.THURSDAY 1589 case "FRIDAY": 1590 return Weekday.FRIDAY 1591 case "SATURDAY": 1592 return Weekday.SATURDAY 1593 case "SUNDAY": 1594 return Weekday.SUNDAY 1595 case _: 1596 raise ValueError(f"Unknown weekday {weekday}") 1597 1598 def create_concurrent_cursor_from_perpartition_cursor( 1599 self, 1600 state_manager: ConnectorStateManager, 1601 model_type: Type[BaseModel], 1602 component_definition: ComponentDefinition, 1603 stream_name: str, 1604 stream_namespace: Optional[str], 1605 config: Config, 1606 stream_state: MutableMapping[str, Any], 1607 partition_router: PartitionRouter, 1608 attempt_to_create_cursor_if_not_provided: bool = False, 1609 **kwargs: Any, 1610 ) -> ConcurrentPerPartitionCursor: 1611 component_type = component_definition.get("type") 1612 if component_definition.get("type") != model_type.__name__: 1613 raise ValueError( 1614 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1615 ) 1616 1617 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1618 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1619 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1620 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1621 if "$parameters" not in component_definition and "parameters" in component_definition: 1622 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1623 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1624 1625 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1626 raise ValueError( 1627 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1628 ) 1629 1630 interpolated_cursor_field = InterpolatedString.create( 1631 datetime_based_cursor_model.cursor_field, 1632 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). 
So now we have two cases: 1633 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1634 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1635 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1636 parameters=datetime_based_cursor_model.parameters or {}, 1637 ) 1638 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1639 1640 datetime_format = datetime_based_cursor_model.datetime_format 1641 1642 cursor_granularity = ( 1643 parse_duration(datetime_based_cursor_model.cursor_granularity) 1644 if datetime_based_cursor_model.cursor_granularity 1645 else None 1646 ) 1647 1648 connector_state_converter: DateTimeStreamStateConverter 1649 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1650 datetime_format=datetime_format, 1651 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1652 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1653 cursor_granularity=cursor_granularity, 1654 ) 1655 1656 # Create the cursor factory 1657 cursor_factory = ConcurrentCursorFactory( 1658 partial( 1659 self.create_concurrent_cursor_from_datetime_based_cursor, 1660 state_manager=state_manager, 1661 model_type=model_type, 1662 component_definition=component_definition, 1663 stream_name=stream_name, 1664 stream_namespace=stream_namespace, 1665 config=config, 1666 message_repository=NoopMessageRepository(), 1667 ) 1668 ) 1669 1670 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1671 use_global_cursor = isinstance( 1672 partition_router, GroupingPartitionRouter 1673 ) or component_definition.get("global_substream_cursor", False) 1674 1675 # Return the concurrent cursor and state converter 1676 return ConcurrentPerPartitionCursor( 1677 cursor_factory=cursor_factory, 1678 partition_router=partition_router, 1679 stream_name=stream_name, 1680 stream_namespace=stream_namespace, 1681 stream_state=stream_state, 1682 message_repository=self._message_repository, # type: ignore 1683 connector_state_manager=state_manager, 1684 connector_state_converter=connector_state_converter, 1685 cursor_field=cursor_field, 1686 use_global_cursor=use_global_cursor, 1687 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1688 ) 1689 1690 @staticmethod 1691 def create_constant_backoff_strategy( 1692 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1693 ) -> ConstantBackoffStrategy: 1694 return ConstantBackoffStrategy( 1695 backoff_time_in_seconds=model.backoff_time_in_seconds, 1696 config=config, 1697 parameters=model.parameters or {}, 1698 ) 1699 1700 def create_cursor_pagination( 1701 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1702 ) -> CursorPaginationStrategy: 1703 if isinstance(decoder, PaginationDecoderDecorator): 1704 inner_decoder = decoder.decoder 1705 else: 1706 inner_decoder = decoder 1707 decoder = PaginationDecoderDecorator(decoder=decoder) 1708 1709 if self._is_supported_decoder_for_pagination(inner_decoder): 1710 decoder_to_use = decoder 1711 else: 1712 raise ValueError( 1713 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1714 ) 1715 1716 return CursorPaginationStrategy( 1717 cursor_value=model.cursor_value, 1718 decoder=decoder_to_use, 1719 
page_size=model.page_size, 1720 stop_condition=model.stop_condition, 1721 config=config, 1722 parameters=model.parameters or {}, 1723 ) 1724 1725 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1726 """ 1727 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1728 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1729 :param model: The Pydantic model of the custom component being created 1730 :param config: The custom defined connector config 1731 :return: The declarative component built from the Pydantic model to be used at runtime 1732 """ 1733 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1734 component_fields = get_type_hints(custom_component_class) 1735 model_args = model.dict() 1736 model_args["config"] = config 1737 1738 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1739 # we defer to these arguments over the component's definition 1740 for key, arg in kwargs.items(): 1741 model_args[key] = arg 1742 1743 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1744 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1745 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1746 for model_field, model_value in model_args.items(): 1747 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1748 if ( 1749 isinstance(model_value, dict) 1750 and "type" not in model_value 1751 and model_field in component_fields 1752 ): 1753 derived_type = self._derive_component_type_from_type_hints( 1754 component_fields.get(model_field) 1755 ) 1756 if derived_type: 1757 model_value["type"] = derived_type 1758 1759 if self._is_component(model_value): 1760 model_args[model_field] = self._create_nested_component( 1761 model, 1762 model_field, 1763 model_value, 1764 config, 1765 **kwargs, 1766 ) 1767 elif isinstance(model_value, list): 1768 vals = [] 1769 for v in model_value: 1770 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1771 derived_type = self._derive_component_type_from_type_hints( 1772 component_fields.get(model_field) 1773 ) 1774 if derived_type: 1775 v["type"] = derived_type 1776 if self._is_component(v): 1777 vals.append( 1778 self._create_nested_component( 1779 model, 1780 model_field, 1781 v, 1782 config, 1783 **kwargs, 1784 ) 1785 ) 1786 else: 1787 vals.append(v) 1788 model_args[model_field] = vals 1789 1790 kwargs = { 1791 class_field: model_args[class_field] 1792 for class_field in component_fields.keys() 1793 if class_field in model_args 1794 } 1795 return custom_component_class(**kwargs) 1796 1797 @staticmethod 1798 def _get_class_from_fully_qualified_class_name( 1799 full_qualified_class_name: str, 1800 ) -> Any: 1801 """Get a class from its fully qualified name. 1802 1803 If a custom components module is needed, we assume it is already registered - probably 1804 as `source_declarative_manifest.components` or `components`. 1805 1806 Args: 1807 full_qualified_class_name (str): The fully qualified name of the class (e.g., "module.ClassName"). 
1808 1809 Returns: 1810 Any: The class object. 1811 1812 Raises: 1813 ValueError: If the class cannot be loaded. 1814 """ 1815 split = full_qualified_class_name.split(".") 1816 module_name_full = ".".join(split[:-1]) 1817 class_name = split[-1] 1818 1819 try: 1820 module_ref = importlib.import_module(module_name_full) 1821 except ModuleNotFoundError as e: 1822 if split[0] == "source_declarative_manifest": 1823 # During testing, the modules containing the custom components are not moved to source_declarative_manifest. In order to run the test, add the source folder to your PYTHONPATH or add it runtime using sys.path.append 1824 try: 1825 import os 1826 1827 module_name_with_source_declarative_manifest = ".".join(split[1:-1]) 1828 module_ref = importlib.import_module( 1829 module_name_with_source_declarative_manifest 1830 ) 1831 except ModuleNotFoundError: 1832 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1833 else: 1834 raise ValueError(f"Could not load module `{module_name_full}`.") from e 1835 1836 try: 1837 return getattr(module_ref, class_name) 1838 except AttributeError as e: 1839 raise ValueError( 1840 f"Could not load class `{class_name}` from module `{module_name_full}`.", 1841 ) from e 1842 1843 @staticmethod 1844 def _derive_component_type_from_type_hints(field_type: Any) -> Optional[str]: 1845 interface = field_type 1846 while True: 1847 origin = get_origin(interface) 1848 if origin: 1849 # Unnest types until we reach the raw type 1850 # List[T] -> T 1851 # Optional[List[T]] -> T 1852 args = get_args(interface) 1853 interface = args[0] 1854 else: 1855 break 1856 if isinstance(interface, type) and not ModelToComponentFactory.is_builtin_type(interface): 1857 return interface.__name__ 1858 return None 1859 1860 @staticmethod 1861 def is_builtin_type(cls: Optional[Type[Any]]) -> bool: 1862 if not cls: 1863 return False 1864 return cls.__module__ == "builtins" 1865 1866 @staticmethod 1867 def _extract_missing_parameters(error: TypeError) -> List[str]: 1868 parameter_search = re.search(r"keyword-only.*:\s(.*)", str(error)) 1869 if parameter_search: 1870 return re.findall(r"\'(.+?)\'", parameter_search.group(1)) 1871 else: 1872 return [] 1873 1874 def _create_nested_component( 1875 self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any 1876 ) -> Any: 1877 type_name = model_value.get("type", None) 1878 if not type_name: 1879 # If no type is specified, we can assume this is a dictionary object which can be returned instead of a subcomponent 1880 return model_value 1881 1882 model_type = self.TYPE_NAME_TO_MODEL.get(type_name, None) 1883 if model_type: 1884 parsed_model = model_type.parse_obj(model_value) 1885 try: 1886 # To improve usability of the language, certain fields are shared between components. This can come in the form of 1887 # a parent component passing some of its fields to a child component or the parent extracting fields from other child 1888 # components and passing it to others. One example is the DefaultPaginator referencing the HttpRequester url_base 1889 # while constructing a SimpleRetriever. However, custom components don't support this behavior because they are created 1890 # generically in create_custom_component(). This block allows developers to specify extra arguments in $parameters that 1891 # are needed by a component and could not be shared. 
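# Editor's note (illustrative, not part of the upstream source): a hedged example of the mechanism
# described in the comment above. A component nested inside a custom component can receive
# constructor-only arguments through its $parameters block, since the usual parent-to-child field
# sharing is not available there. Field values below are hypothetical.
#
#     model_value = {
#         "type": "DefaultPaginator",
#         "pagination_strategy": {"type": "PageIncrement", "page_size": 100},
#         "$parameters": {"url_base": "https://api.example.com/v1"},
#     }
#
# The code below would then match "url_base" against the constructor's keyword-only arguments
# (as collected via inspect.getfullargspec) and forward it when the nested component is created.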
1892 model_constructor = self.PYDANTIC_MODEL_TO_CONSTRUCTOR.get(parsed_model.__class__) 1893 constructor_kwargs = inspect.getfullargspec(model_constructor).kwonlyargs 1894 model_parameters = model_value.get("$parameters", {}) 1895 matching_parameters = { 1896 kwarg: model_parameters[kwarg] 1897 for kwarg in constructor_kwargs 1898 if kwarg in model_parameters 1899 } 1900 matching_kwargs = { 1901 kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs 1902 } 1903 return self._create_component_from_model( 1904 model=parsed_model, config=config, **(matching_parameters | matching_kwargs) 1905 ) 1906 except TypeError as error: 1907 missing_parameters = self._extract_missing_parameters(error) 1908 if missing_parameters: 1909 raise ValueError( 1910 f"Error creating component '{type_name}' with parent custom component {model.class_name}: Please provide " 1911 + ", ".join( 1912 ( 1913 f"{type_name}.$parameters.{parameter}" 1914 for parameter in missing_parameters 1915 ) 1916 ) 1917 ) 1918 raise TypeError( 1919 f"Error creating component '{type_name}' with parent custom component {model.class_name}: {error}" 1920 ) 1921 else: 1922 raise ValueError( 1923 f"Error creating custom component {model.class_name}. Subcomponent creation has not been implemented for '{type_name}'" 1924 ) 1925 1926 @staticmethod 1927 def _is_component(model_value: Any) -> bool: 1928 return isinstance(model_value, dict) and model_value.get("type") is not None 1929 1930 def create_datetime_based_cursor( 1931 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1932 ) -> DatetimeBasedCursor: 1933 start_datetime: Union[str, MinMaxDatetime] = ( 1934 model.start_datetime 1935 if isinstance(model.start_datetime, str) 1936 else self.create_min_max_datetime(model.start_datetime, config) 1937 ) 1938 end_datetime: Union[str, MinMaxDatetime, None] = None 1939 if model.is_data_feed and model.end_datetime: 1940 raise ValueError("Data feed does not support end_datetime") 1941 if model.is_data_feed and model.is_client_side_incremental: 1942 raise ValueError( 1943 "`Client side incremental` cannot be applied with `data feed`. Choose only one of them."
1944 ) 1945 if model.end_datetime: 1946 end_datetime = ( 1947 model.end_datetime 1948 if isinstance(model.end_datetime, str) 1949 else self.create_min_max_datetime(model.end_datetime, config) 1950 ) 1951 1952 end_time_option = ( 1953 self._create_component_from_model( 1954 model.end_time_option, config, parameters=model.parameters or {} 1955 ) 1956 if model.end_time_option 1957 else None 1958 ) 1959 start_time_option = ( 1960 self._create_component_from_model( 1961 model.start_time_option, config, parameters=model.parameters or {} 1962 ) 1963 if model.start_time_option 1964 else None 1965 ) 1966 1967 return DatetimeBasedCursor( 1968 cursor_field=model.cursor_field, 1969 cursor_datetime_formats=model.cursor_datetime_formats 1970 if model.cursor_datetime_formats 1971 else [], 1972 cursor_granularity=model.cursor_granularity, 1973 datetime_format=model.datetime_format, 1974 end_datetime=end_datetime, 1975 start_datetime=start_datetime, 1976 step=model.step, 1977 end_time_option=end_time_option, 1978 lookback_window=model.lookback_window, 1979 start_time_option=start_time_option, 1980 partition_field_end=model.partition_field_end, 1981 partition_field_start=model.partition_field_start, 1982 message_repository=self._message_repository, 1983 is_compare_strictly=model.is_compare_strictly, 1984 config=config, 1985 parameters=model.parameters or {}, 1986 ) 1987 1988 def create_default_stream( 1989 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 1990 ) -> AbstractStream: 1991 primary_key = model.primary_key.__root__ if model.primary_key else None 1992 self._migrate_state(model, config) 1993 1994 partition_router = self._build_stream_slicer_from_partition_router( 1995 model.retriever, 1996 config, 1997 stream_name=model.name, 1998 **kwargs, 1999 ) 2000 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 2001 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 2002 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 2003 2004 end_time_option = ( 2005 self._create_component_from_model( 2006 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 2007 ) 2008 if cursor_model.end_time_option 2009 else None 2010 ) 2011 start_time_option = ( 2012 self._create_component_from_model( 2013 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 2014 ) 2015 if cursor_model.start_time_option 2016 else None 2017 ) 2018 2019 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 2020 start_time_option=start_time_option, 2021 end_time_option=end_time_option, 2022 partition_field_start=cursor_model.partition_field_start, 2023 partition_field_end=cursor_model.partition_field_end, 2024 config=config, 2025 parameters=model.parameters or {}, 2026 ) 2027 request_options_provider = ( 2028 datetime_request_options_provider 2029 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 2030 else PerPartitionRequestOptionsProvider( 2031 partition_router, datetime_request_options_provider 2032 ) 2033 ) 2034 elif model.incremental_sync and isinstance( 2035 model.incremental_sync, IncrementingCountCursorModel 2036 ): 2037 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 2038 raise ValueError( 2039 "PerPartition does not support per partition states because switching to global state is time based" 2040 ) 2041 2042 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 2043 2044 
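# Editor's note (illustrative, not part of the upstream source): a hedged sketch of an
# incremental_sync definition that would take this branch, assuming the usual
# IncrementingCountCursor shape. The cursor field, start value, and request parameter name
# are hypothetical.
#
#     incremental_sync = {
#         "type": "IncrementingCountCursor",
#         "cursor_field": "id",
#         "start_value": 0,
#         "start_value_option": {
#             "type": "RequestOption",
#             "inject_into": "request_parameter",
#             "field_name": "since_id",
#         },
#     }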
start_time_option = ( 2045 self._create_component_from_model( 2046 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2047 config, 2048 parameters=cursor_model.parameters or {}, 2049 ) 2050 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2051 else None 2052 ) 2053 2054 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2055 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2056 partition_field_start = "start" 2057 2058 request_options_provider = DatetimeBasedRequestOptionsProvider( 2059 start_time_option=start_time_option, 2060 partition_field_start=partition_field_start, 2061 config=config, 2062 parameters=model.parameters or {}, 2063 ) 2064 else: 2065 request_options_provider = None 2066 2067 transformations = [] 2068 if model.transformations: 2069 for transformation_model in model.transformations: 2070 transformations.append( 2071 self._create_component_from_model(model=transformation_model, config=config) 2072 ) 2073 file_uploader = None 2074 if model.file_uploader: 2075 file_uploader = self._create_component_from_model( 2076 model=model.file_uploader, config=config 2077 ) 2078 2079 stream_slicer: ConcurrentStreamSlicer = ( 2080 partition_router 2081 if isinstance(concurrent_cursor, FinalStateCursor) 2082 else concurrent_cursor 2083 ) 2084 2085 retriever = self._create_component_from_model( 2086 model=model.retriever, 2087 config=config, 2088 name=model.name, 2089 primary_key=primary_key, 2090 request_options_provider=request_options_provider, 2091 stream_slicer=stream_slicer, 2092 partition_router=partition_router, 2093 has_stop_condition_cursor=self._is_stop_condition_on_cursor(model), 2094 is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model), 2095 cursor=concurrent_cursor, 2096 transformations=transformations, 2097 file_uploader=file_uploader, 2098 incremental_sync=model.incremental_sync, 2099 ) 2100 if isinstance(retriever, AsyncRetriever): 2101 stream_slicer = retriever.stream_slicer 2102 2103 schema_loader: SchemaLoader 2104 if model.schema_loader and isinstance(model.schema_loader, list): 2105 nested_schema_loaders = [ 2106 self._create_component_from_model(model=nested_schema_loader, config=config) 2107 for nested_schema_loader in model.schema_loader 2108 ] 2109 schema_loader = CompositeSchemaLoader( 2110 schema_loaders=nested_schema_loaders, parameters={} 2111 ) 2112 elif model.schema_loader: 2113 schema_loader = self._create_component_from_model( 2114 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2115 config=config, 2116 ) 2117 else: 2118 options = model.parameters or {} 2119 if "name" not in options: 2120 options["name"] = model.name 2121 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2122 schema_loader = CachingSchemaLoaderDecorator(schema_loader) 2123 2124 stream_name = model.name or "" 2125 return DefaultStream( 2126 partition_generator=StreamSlicerPartitionGenerator( 2127 DeclarativePartitionFactory( 2128 stream_name, 2129 schema_loader, 2130 retriever, 2131 self._message_repository, 2132 ), 2133 stream_slicer, 2134 slice_limit=self._limit_slices_fetched, 2135 ), 2136 name=stream_name, 2137 json_schema=schema_loader.get_json_schema, 2138 primary_key=get_primary_key_from_stream(primary_key), 2139 
            cursor_field=concurrent_cursor.cursor_field.cursor_field_key
            if hasattr(concurrent_cursor, "cursor_field")
            else "",  # FIXME we should have the cursor field as part of the interface of the cursor
            logger=logging.getLogger(f"airbyte.{stream_name}"),
            cursor=concurrent_cursor,
            supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader),
        )

    def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None:
        stream_name = model.name or ""
        stream_state = self._connector_state_manager.get_stream_state(
            stream_name=stream_name, namespace=None
        )
        if model.state_migrations:
            state_transformations = [
                self._create_component_from_model(state_migration, config, declarative_stream=model)
                for state_migration in model.state_migrations
            ]
        else:
            state_transformations = []
        stream_state = self.apply_stream_state_migrations(state_transformations, stream_state)
        self._connector_state_manager.update_state_for_stream(
            stream_name=stream_name, namespace=None, value=stream_state
        )

    def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool:
        return bool(
            model.incremental_sync
            and hasattr(model.incremental_sync, "is_data_feed")
            and model.incremental_sync.is_data_feed
        )

    def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool:
        return bool(
            model.incremental_sync
            and hasattr(model.incremental_sync, "is_client_side_incremental")
            and model.incremental_sync.is_client_side_incremental
        )

    def _build_stream_slicer_from_partition_router(
        self,
        model: Union[
            AsyncRetrieverModel,
            CustomRetrieverModel,
            SimpleRetrieverModel,
        ],
        config: Config,
        stream_name: Optional[str] = None,
        **kwargs: Any,
    ) -> PartitionRouter:
        if (
            hasattr(model, "partition_router")
            and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel))
            and model.partition_router
        ):
            stream_slicer_model = model.partition_router
            if isinstance(stream_slicer_model, list):
                return CartesianProductStreamSlicer(
                    [
                        self._create_component_from_model(
                            model=slicer, config=config, stream_name=stream_name or ""
                        )
                        for slicer in stream_slicer_model
                    ],
                    parameters={},
                )
            elif isinstance(stream_slicer_model, dict):
                # The partition router comes from a CustomRetrieverModel and therefore has not been parsed as a model
                params = stream_slicer_model.get("$parameters")
                if not isinstance(params, dict):
                    params = {}
                stream_slicer_model["$parameters"] = params

                if stream_name is not None:
                    params["stream_name"] = stream_name

                return self._create_nested_component(  # type: ignore[no-any-return] # There is no guarantee that this will return a stream slicer.
                    # If not, we expect an AttributeError during the call to `stream_slices`
                    model,
                    "partition_router",
                    stream_slicer_model,
                    config,
                    **kwargs,
                )
            else:
                return self._create_component_from_model(  # type: ignore[no-any-return] # A PartitionRouter will be created since stream_slicer_model is model.partition_router
                    model=stream_slicer_model, config=config, stream_name=stream_name or ""
                )
        return SinglePartitionRouter(parameters={})

    def _build_concurrent_cursor(
        self,
        model: DeclarativeStreamModel,
        stream_slicer: Optional[PartitionRouter],
        config: Config,
    ) -> Cursor:
        stream_name = model.name or ""
        stream_state = self._connector_state_manager.get_stream_state(stream_name, None)

        if (
            model.incremental_sync
            and stream_slicer
            and not isinstance(stream_slicer, SinglePartitionRouter)
        ):
            if isinstance(model.incremental_sync, IncrementingCountCursorModel):
                # We don't currently support using partition routing and IncrementingCountCursor at the
                # same time because we have not resolved design questions such as what the lookback window
                # should be or how falling back to a global cursor would work. We have not yet seen
                # customers who need both at the same time, so we are punting on this until we have to
                # solve it.
                raise ValueError(
                    f"The low-code framework does not currently support usage of a PartitionRouter and an IncrementingCountCursor at the same time. Please specify only one of these options for stream {stream_name}."
                )
            return self.create_concurrent_cursor_from_perpartition_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
                state_manager=self._connector_state_manager,
                model_type=DatetimeBasedCursorModel,
                component_definition=model.incremental_sync.__dict__,
                stream_name=stream_name,
                stream_state=stream_state,
                stream_namespace=None,
                config=config or {},
                partition_router=stream_slicer,
                attempt_to_create_cursor_if_not_provided=True,  # FIXME can we remove that now?
            )
        elif model.incremental_sync:
            if type(model.incremental_sync) == IncrementingCountCursorModel:
                return self.create_concurrent_cursor_from_incrementing_count_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
                    model_type=IncrementingCountCursorModel,
                    component_definition=model.incremental_sync.__dict__,
                    stream_name=stream_name,
                    stream_namespace=None,
                    stream_state=stream_state,
                    config=config or {},
                )
            elif type(model.incremental_sync) == DatetimeBasedCursorModel:
                return self.create_concurrent_cursor_from_datetime_based_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer.
However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing 2273 model_type=type(model.incremental_sync), 2274 component_definition=model.incremental_sync.__dict__, 2275 stream_name=stream_name, 2276 stream_namespace=None, 2277 stream_state=stream_state, 2278 config=config or {}, 2279 attempt_to_create_cursor_if_not_provided=True, 2280 ) 2281 else: 2282 raise ValueError( 2283 f"Incremental sync of type {type(model.incremental_sync)} is not supported" 2284 ) 2285 return FinalStateCursor(stream_name, None, self._message_repository) 2286 2287 def create_default_error_handler( 2288 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2289 ) -> DefaultErrorHandler: 2290 backoff_strategies = [] 2291 if model.backoff_strategies: 2292 for backoff_strategy_model in model.backoff_strategies: 2293 backoff_strategies.append( 2294 self._create_component_from_model(model=backoff_strategy_model, config=config) 2295 ) 2296 2297 response_filters = [] 2298 if model.response_filters: 2299 for response_filter_model in model.response_filters: 2300 response_filters.append( 2301 self._create_component_from_model(model=response_filter_model, config=config) 2302 ) 2303 response_filters.append( 2304 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2305 ) 2306 2307 return DefaultErrorHandler( 2308 backoff_strategies=backoff_strategies, 2309 max_retries=model.max_retries, 2310 response_filters=response_filters, 2311 config=config, 2312 parameters=model.parameters or {}, 2313 ) 2314 2315 def create_default_paginator( 2316 self, 2317 model: DefaultPaginatorModel, 2318 config: Config, 2319 *, 2320 url_base: str, 2321 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2322 decoder: Optional[Decoder] = None, 2323 cursor_used_for_stop_condition: Optional[Cursor] = None, 2324 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2325 if decoder: 2326 if self._is_supported_decoder_for_pagination(decoder): 2327 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2328 else: 2329 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2330 else: 2331 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2332 page_size_option = ( 2333 self._create_component_from_model(model=model.page_size_option, config=config) 2334 if model.page_size_option 2335 else None 2336 ) 2337 page_token_option = ( 2338 self._create_component_from_model(model=model.page_token_option, config=config) 2339 if model.page_token_option 2340 else None 2341 ) 2342 pagination_strategy = self._create_component_from_model( 2343 model=model.pagination_strategy, 2344 config=config, 2345 decoder=decoder_to_use, 2346 extractor_model=extractor_model, 2347 ) 2348 if cursor_used_for_stop_condition: 2349 pagination_strategy = StopConditionPaginationStrategyDecorator( 2350 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2351 ) 2352 paginator = DefaultPaginator( 2353 decoder=decoder_to_use, 2354 page_size_option=page_size_option, 2355 page_token_option=page_token_option, 2356 pagination_strategy=pagination_strategy, 2357 url_base=url_base, 2358 config=config, 2359 parameters=model.parameters or {}, 2360 ) 2361 if self._limit_pages_fetched_per_slice: 2362 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2363 return paginator 2364 2365 def create_dpath_extractor( 2366 self, 2367 model: 
DpathExtractorModel, 2368 config: Config, 2369 decoder: Optional[Decoder] = None, 2370 **kwargs: Any, 2371 ) -> DpathExtractor: 2372 if decoder: 2373 decoder_to_use = decoder 2374 else: 2375 decoder_to_use = JsonDecoder(parameters={}) 2376 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2377 return DpathExtractor( 2378 decoder=decoder_to_use, 2379 field_path=model_field_path, 2380 config=config, 2381 parameters=model.parameters or {}, 2382 ) 2383 2384 @staticmethod 2385 def create_response_to_file_extractor( 2386 model: ResponseToFileExtractorModel, 2387 **kwargs: Any, 2388 ) -> ResponseToFileExtractor: 2389 return ResponseToFileExtractor(parameters=model.parameters or {}) 2390 2391 @staticmethod 2392 def create_exponential_backoff_strategy( 2393 model: ExponentialBackoffStrategyModel, config: Config 2394 ) -> ExponentialBackoffStrategy: 2395 return ExponentialBackoffStrategy( 2396 factor=model.factor or 5, parameters=model.parameters or {}, config=config 2397 ) 2398 2399 @staticmethod 2400 def create_group_by_key(model: GroupByKeyMergeStrategyModel, config: Config) -> GroupByKey: 2401 return GroupByKey(model.key, config=config, parameters=model.parameters or {}) 2402 2403 def create_http_requester( 2404 self, 2405 model: HttpRequesterModel, 2406 config: Config, 2407 decoder: Decoder = JsonDecoder(parameters={}), 2408 query_properties_key: Optional[str] = None, 2409 use_cache: Optional[bool] = None, 2410 *, 2411 name: str, 2412 ) -> HttpRequester: 2413 authenticator = ( 2414 self._create_component_from_model( 2415 model=model.authenticator, 2416 config=config, 2417 url_base=model.url or model.url_base, 2418 name=name, 2419 decoder=decoder, 2420 ) 2421 if model.authenticator 2422 else None 2423 ) 2424 error_handler = ( 2425 self._create_component_from_model(model=model.error_handler, config=config) 2426 if model.error_handler 2427 else DefaultErrorHandler( 2428 backoff_strategies=[], 2429 response_filters=[], 2430 config=config, 2431 parameters=model.parameters or {}, 2432 ) 2433 ) 2434 2435 api_budget = self._api_budget 2436 2437 request_options_provider = InterpolatedRequestOptionsProvider( 2438 request_body=model.request_body, 2439 request_body_data=model.request_body_data, 2440 request_body_json=model.request_body_json, 2441 request_headers=model.request_headers, 2442 request_parameters=model.request_parameters, # type: ignore # QueryProperties have been removed in `create_simple_retriever` 2443 query_properties_key=query_properties_key, 2444 config=config, 2445 parameters=model.parameters or {}, 2446 ) 2447 2448 assert model.use_cache is not None # for mypy 2449 assert model.http_method is not None # for mypy 2450 2451 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2452 2453 return HttpRequester( 2454 name=name, 2455 url=model.url, 2456 url_base=model.url_base, 2457 path=model.path, 2458 authenticator=authenticator, 2459 error_handler=error_handler, 2460 api_budget=api_budget, 2461 http_method=HttpMethod[model.http_method.value], 2462 request_options_provider=request_options_provider, 2463 config=config, 2464 disable_retries=self._disable_retries, 2465 parameters=model.parameters or {}, 2466 message_repository=self._message_repository, 2467 use_cache=should_use_cache, 2468 decoder=decoder, 2469 stream_response=decoder.is_stream_response() if decoder else False, 2470 ) 2471 2472 @staticmethod 2473 def create_http_response_filter( 2474 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2475 ) -> 
HttpResponseFilter: 2476 if model.action: 2477 action = ResponseAction(model.action.value) 2478 else: 2479 action = None 2480 2481 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2482 2483 http_codes = ( 2484 set(model.http_codes) if model.http_codes else set() 2485 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2486 2487 return HttpResponseFilter( 2488 action=action, 2489 failure_type=failure_type, 2490 error_message=model.error_message or "", 2491 error_message_contains=model.error_message_contains or "", 2492 http_codes=http_codes, 2493 predicate=model.predicate or "", 2494 config=config, 2495 parameters=model.parameters or {}, 2496 ) 2497 2498 @staticmethod 2499 def create_inline_schema_loader( 2500 model: InlineSchemaLoaderModel, config: Config, **kwargs: Any 2501 ) -> InlineSchemaLoader: 2502 return InlineSchemaLoader(schema=model.schema_ or {}, parameters={}) 2503 2504 def create_complex_field_type( 2505 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2506 ) -> ComplexFieldType: 2507 items = ( 2508 self._create_component_from_model(model=model.items, config=config) 2509 if isinstance(model.items, ComplexFieldTypeModel) 2510 else model.items 2511 ) 2512 2513 return ComplexFieldType(field_type=model.field_type, items=items) 2514 2515 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2516 target_type = ( 2517 self._create_component_from_model(model=model.target_type, config=config) 2518 if isinstance(model.target_type, ComplexFieldTypeModel) 2519 else model.target_type 2520 ) 2521 2522 return TypesMap( 2523 target_type=target_type, 2524 current_type=model.current_type, 2525 condition=model.condition if model.condition is not None else "True", 2526 ) 2527 2528 def create_schema_type_identifier( 2529 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2530 ) -> SchemaTypeIdentifier: 2531 types_mapping = [] 2532 if model.types_mapping: 2533 types_mapping.extend( 2534 [ 2535 self._create_component_from_model(types_map, config=config) 2536 for types_map in model.types_mapping 2537 ] 2538 ) 2539 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2540 [x for x in model.schema_pointer] if model.schema_pointer else [] 2541 ) 2542 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2543 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2544 [x for x in model.type_pointer] if model.type_pointer else None 2545 ) 2546 2547 return SchemaTypeIdentifier( 2548 schema_pointer=model_schema_pointer, 2549 key_pointer=model_key_pointer, 2550 type_pointer=model_type_pointer, 2551 types_mapping=types_mapping, 2552 parameters=model.parameters or {}, 2553 ) 2554 2555 def create_dynamic_schema_loader( 2556 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2557 ) -> DynamicSchemaLoader: 2558 schema_transformations = [] 2559 if model.schema_transformations: 2560 for transformation_model in model.schema_transformations: 2561 schema_transformations.append( 2562 self._create_component_from_model(model=transformation_model, config=config) 2563 ) 2564 name = "dynamic_properties" 2565 retriever = self._create_component_from_model( 2566 model=model.retriever, 2567 config=config, 2568 name=name, 2569 primary_key=None, 2570 partition_router=self._build_stream_slicer_from_partition_router( 2571 model.retriever, config 2572 ), 2573 transformations=[], 2574 use_cache=True, 2575 
log_formatter=( 2576 lambda response: format_http_message( 2577 response, 2578 f"Schema loader '{name}' request", 2579 f"Request performed in order to extract schema.", 2580 name, 2581 is_auxiliary=True, 2582 ) 2583 ), 2584 ) 2585 schema_type_identifier = self._create_component_from_model( 2586 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2587 ) 2588 schema_filter = ( 2589 self._create_component_from_model( 2590 model.schema_filter, config=config, parameters=model.parameters or {} 2591 ) 2592 if model.schema_filter is not None 2593 else None 2594 ) 2595 2596 return DynamicSchemaLoader( 2597 retriever=retriever, 2598 config=config, 2599 schema_transformations=schema_transformations, 2600 schema_filter=schema_filter, 2601 schema_type_identifier=schema_type_identifier, 2602 parameters=model.parameters or {}, 2603 ) 2604 2605 @staticmethod 2606 def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2607 return JsonDecoder(parameters={}) 2608 2609 def create_csv_decoder(self, model: CsvDecoderModel, config: Config, **kwargs: Any) -> Decoder: 2610 return CompositeRawDecoder( 2611 parser=ModelToComponentFactory._get_parser(model, config), 2612 stream_response=False if self._emit_connector_builder_messages else True, 2613 ) 2614 2615 def create_jsonl_decoder( 2616 self, model: JsonlDecoderModel, config: Config, **kwargs: Any 2617 ) -> Decoder: 2618 return CompositeRawDecoder( 2619 parser=ModelToComponentFactory._get_parser(model, config), 2620 stream_response=False if self._emit_connector_builder_messages else True, 2621 ) 2622 2623 def create_gzip_decoder( 2624 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2625 ) -> Decoder: 2626 _compressed_response_types = { 2627 "gzip", 2628 "x-gzip", 2629 "gzip, deflate", 2630 "x-gzip, deflate", 2631 "application/zip", 2632 "application/gzip", 2633 "application/x-gzip", 2634 "application/x-zip-compressed", 2635 } 2636 2637 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2638 2639 if self._emit_connector_builder_messages: 2640 # This is very surprising but if the response is not streamed, 2641 # CompositeRawDecoder calls response.content and the requests library actually uncompress the data as opposed to response.raw, 2642 # which uses urllib3 directly and does not uncompress the data. 2643 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2644 2645 return CompositeRawDecoder.by_headers( 2646 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2647 stream_response=True, 2648 fallback_parser=gzip_parser.inner_parser, 2649 ) 2650 2651 # todo: This method should be removed once we deprecate the SimpleRetriever.cursor field and the various 2652 # state methods 2653 @staticmethod 2654 def create_incrementing_count_cursor( 2655 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2656 ) -> DatetimeBasedCursor: 2657 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2658 # we still parse models into components. The issue is that there's no runtime implementation of a 2659 # IncrementingCountCursor. 2660 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 
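        # Descriptive note: the datetime_format and start_datetime passed below are arbitrary
        # placeholder values. They are never consulted at runtime because, as described above, this
        # stub only exists so that model parsing succeeds; the cursor actually used for syncing is
        # built by create_concurrent_cursor_from_incrementing_count_cursor via _build_concurrent_cursor.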
2661 return DatetimeBasedCursor( 2662 cursor_field=model.cursor_field, 2663 datetime_format="%Y-%m-%d", 2664 start_datetime="2024-12-12", 2665 config=config, 2666 parameters={}, 2667 ) 2668 2669 @staticmethod 2670 def create_iterable_decoder( 2671 model: IterableDecoderModel, config: Config, **kwargs: Any 2672 ) -> IterableDecoder: 2673 return IterableDecoder(parameters={}) 2674 2675 @staticmethod 2676 def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder: 2677 return XmlDecoder(parameters={}) 2678 2679 def create_zipfile_decoder( 2680 self, model: ZipfileDecoderModel, config: Config, **kwargs: Any 2681 ) -> ZipfileDecoder: 2682 return ZipfileDecoder(parser=ModelToComponentFactory._get_parser(model.decoder, config)) 2683 2684 @staticmethod 2685 def _get_parser(model: BaseModel, config: Config) -> Parser: 2686 if isinstance(model, JsonDecoderModel): 2687 # Note that the logic is a bit different from the JsonDecoder as there is some legacy that is maintained to return {} on error cases 2688 return JsonParser() 2689 elif isinstance(model, JsonlDecoderModel): 2690 return JsonLineParser() 2691 elif isinstance(model, CsvDecoderModel): 2692 return CsvParser( 2693 encoding=model.encoding, 2694 delimiter=model.delimiter, 2695 set_values_to_none=model.set_values_to_none, 2696 ) 2697 elif isinstance(model, GzipDecoderModel): 2698 return GzipParser( 2699 inner_parser=ModelToComponentFactory._get_parser(model.decoder, config) 2700 ) 2701 elif isinstance( 2702 model, (CustomDecoderModel, IterableDecoderModel, XmlDecoderModel, ZipfileDecoderModel) 2703 ): 2704 raise ValueError(f"Decoder type {model} does not have parser associated to it") 2705 2706 raise ValueError(f"Unknown decoder type {model}") 2707 2708 @staticmethod 2709 def create_json_file_schema_loader( 2710 model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any 2711 ) -> JsonFileSchemaLoader: 2712 return JsonFileSchemaLoader( 2713 file_path=model.file_path or "", config=config, parameters=model.parameters or {} 2714 ) 2715 2716 def create_jwt_authenticator( 2717 self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2718 ) -> JwtAuthenticator: 2719 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2720 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2721 request_option = ( 2722 self._create_component_from_model(model.request_option, config) 2723 if model.request_option 2724 else None 2725 ) 2726 return JwtAuthenticator( 2727 config=config, 2728 parameters=model.parameters or {}, 2729 algorithm=JwtAlgorithm(model.algorithm.value), 2730 secret_key=model.secret_key, 2731 base64_encode_secret_key=model.base64_encode_secret_key, 2732 token_duration=model.token_duration, 2733 header_prefix=model.header_prefix, 2734 kid=jwt_headers.kid, 2735 typ=jwt_headers.typ, 2736 cty=jwt_headers.cty, 2737 iss=jwt_payload.iss, 2738 sub=jwt_payload.sub, 2739 aud=jwt_payload.aud, 2740 additional_jwt_headers=model.additional_jwt_headers, 2741 additional_jwt_payload=model.additional_jwt_payload, 2742 passphrase=model.passphrase, 2743 request_option=request_option, 2744 ) 2745 2746 def create_list_partition_router( 2747 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2748 ) -> ListPartitionRouter: 2749 request_option = ( 2750 self._create_component_from_model(model.request_option, config) 2751 if model.request_option 2752 else None 2753 ) 2754 return ListPartitionRouter( 2755 cursor_field=model.cursor_field, 2756 
request_option=request_option, 2757 values=model.values, 2758 config=config, 2759 parameters=model.parameters or {}, 2760 ) 2761 2762 @staticmethod 2763 def create_min_max_datetime( 2764 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2765 ) -> MinMaxDatetime: 2766 return MinMaxDatetime( 2767 datetime=model.datetime, 2768 datetime_format=model.datetime_format or "", 2769 max_datetime=model.max_datetime or "", 2770 min_datetime=model.min_datetime or "", 2771 parameters=model.parameters or {}, 2772 ) 2773 2774 @staticmethod 2775 def create_no_auth(model: NoAuthModel, config: Config, **kwargs: Any) -> NoAuth: 2776 return NoAuth(parameters=model.parameters or {}) 2777 2778 @staticmethod 2779 def create_no_pagination( 2780 model: NoPaginationModel, config: Config, **kwargs: Any 2781 ) -> NoPagination: 2782 return NoPagination(parameters={}) 2783 2784 def create_oauth_authenticator( 2785 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2786 ) -> DeclarativeOauth2Authenticator: 2787 profile_assertion = ( 2788 self._create_component_from_model(model.profile_assertion, config=config) 2789 if model.profile_assertion 2790 else None 2791 ) 2792 2793 if model.refresh_token_updater: 2794 # ignore type error because fixing it would have a lot of dependencies, revisit later 2795 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2796 config, 2797 InterpolatedString.create( 2798 model.token_refresh_endpoint, # type: ignore 2799 parameters=model.parameters or {}, 2800 ).eval(config), 2801 access_token_name=InterpolatedString.create( 2802 model.access_token_name or "access_token", parameters=model.parameters or {} 2803 ).eval(config), 2804 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2805 expires_in_name=InterpolatedString.create( 2806 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2807 ).eval(config), 2808 client_id_name=InterpolatedString.create( 2809 model.client_id_name or "client_id", parameters=model.parameters or {} 2810 ).eval(config), 2811 client_id=InterpolatedString.create( 2812 model.client_id, parameters=model.parameters or {} 2813 ).eval(config) 2814 if model.client_id 2815 else model.client_id, 2816 client_secret_name=InterpolatedString.create( 2817 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2818 ).eval(config), 2819 client_secret=InterpolatedString.create( 2820 model.client_secret, parameters=model.parameters or {} 2821 ).eval(config) 2822 if model.client_secret 2823 else model.client_secret, 2824 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2825 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2826 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2827 grant_type_name=InterpolatedString.create( 2828 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2829 ).eval(config), 2830 grant_type=InterpolatedString.create( 2831 model.grant_type or "refresh_token", parameters=model.parameters or {} 2832 ).eval(config), 2833 refresh_request_body=InterpolatedMapping( 2834 model.refresh_request_body or {}, parameters=model.parameters or {} 2835 ).eval(config), 2836 refresh_request_headers=InterpolatedMapping( 2837 model.refresh_request_headers or {}, parameters=model.parameters or {} 2838 ).eval(config), 2839 scopes=model.scopes, 2840 token_expiry_date_format=model.token_expiry_date_format, 2841 
                token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
                message_repository=self._message_repository,
                refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes,
                refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key,
                refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values,
            )
        # ignore type error because fixing it would have a lot of dependencies, revisit later
        return DeclarativeOauth2Authenticator(  # type: ignore
            access_token_name=model.access_token_name or "access_token",
            access_token_value=model.access_token_value,
            client_id_name=model.client_id_name or "client_id",
            client_id=model.client_id,
            client_secret_name=model.client_secret_name or "client_secret",
            client_secret=model.client_secret,
            expires_in_name=model.expires_in_name or "expires_in",
            grant_type_name=model.grant_type_name or "grant_type",
            grant_type=model.grant_type or "refresh_token",
            refresh_request_body=model.refresh_request_body,
            refresh_request_headers=model.refresh_request_headers,
            refresh_token_name=model.refresh_token_name or "refresh_token",
            refresh_token=model.refresh_token,
            scopes=model.scopes,
            token_expiry_date=model.token_expiry_date,
            token_expiry_date_format=model.token_expiry_date_format,
            token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
            token_refresh_endpoint=model.token_refresh_endpoint,
            config=config,
            parameters=model.parameters or {},
            message_repository=self._message_repository,
            profile_assertion=profile_assertion,
            use_profile_assertion=model.use_profile_assertion,
        )

    def create_offset_increment(
        self,
        model: OffsetIncrementModel,
        config: Config,
        decoder: Decoder,
        extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None,
        **kwargs: Any,
    ) -> OffsetIncrement:
        if isinstance(decoder, PaginationDecoderDecorator):
            inner_decoder = decoder.decoder
        else:
            inner_decoder = decoder
            decoder = PaginationDecoderDecorator(decoder=decoder)

        if self._is_supported_decoder_for_pagination(inner_decoder):
            decoder_to_use = decoder
        else:
            raise ValueError(
                self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder))
            )

        # Ideally we would instantiate the runtime extractor at the highest level (in this case the SimpleRetriever)
        # so that it can be shared by OffsetIncrement and RecordSelector. However, because the decoder is wrapped
        # in various decorators here but not in create_record_selector, it is simpler to retain the existing
        # behavior and keep two separate extractors with identical behavior, since they are built from the same
        # extractor model. When we have more time to investigate, we can look into reusing the same component.
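        # Illustrative sketch (hypothetical values, not from this file): a manifest pagination strategy
        # handled by this factory method might look like
        #   pagination_strategy:
        #     type: OffsetIncrement
        #     page_size: 100
        #     inject_on_first_request: false
        # page_size and inject_on_first_request map directly onto the OffsetIncrement arguments below.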
2900 extractor = ( 2901 self._create_component_from_model( 2902 model=extractor_model, config=config, decoder=decoder_to_use 2903 ) 2904 if extractor_model 2905 else None 2906 ) 2907 2908 return OffsetIncrement( 2909 page_size=model.page_size, 2910 config=config, 2911 decoder=decoder_to_use, 2912 extractor=extractor, 2913 inject_on_first_request=model.inject_on_first_request or False, 2914 parameters=model.parameters or {}, 2915 ) 2916 2917 @staticmethod 2918 def create_page_increment( 2919 model: PageIncrementModel, config: Config, **kwargs: Any 2920 ) -> PageIncrement: 2921 return PageIncrement( 2922 page_size=model.page_size, 2923 config=config, 2924 start_from_page=model.start_from_page or 0, 2925 inject_on_first_request=model.inject_on_first_request or False, 2926 parameters=model.parameters or {}, 2927 ) 2928 2929 def create_parent_stream_config( 2930 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 2931 ) -> ParentStreamConfig: 2932 declarative_stream = self._create_component_from_model( 2933 model.stream, 2934 config=config, 2935 is_parent=True, 2936 **kwargs, 2937 ) 2938 request_option = ( 2939 self._create_component_from_model(model.request_option, config=config) 2940 if model.request_option 2941 else None 2942 ) 2943 2944 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2945 raise ValueError( 2946 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2947 ) 2948 2949 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2950 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2951 ) 2952 2953 return ParentStreamConfig( 2954 parent_key=model.parent_key, 2955 request_option=request_option, 2956 stream=declarative_stream, 2957 partition_field=model.partition_field, 2958 config=config, 2959 incremental_dependency=model.incremental_dependency or False, 2960 parameters=model.parameters or {}, 2961 extra_fields=model.extra_fields, 2962 lazy_read_pointer=model_lazy_read_pointer, 2963 ) 2964 2965 def create_properties_from_endpoint( 2966 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2967 ) -> PropertiesFromEndpoint: 2968 retriever = self._create_component_from_model( 2969 model=model.retriever, 2970 config=config, 2971 name="dynamic_properties", 2972 primary_key=None, 2973 stream_slicer=None, 2974 transformations=[], 2975 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to different 2976 ) 2977 return PropertiesFromEndpoint( 2978 property_field_path=model.property_field_path, 2979 retriever=retriever, 2980 config=config, 2981 parameters=model.parameters or {}, 2982 ) 2983 2984 def create_property_chunking( 2985 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2986 ) -> PropertyChunking: 2987 record_merge_strategy = ( 2988 self._create_component_from_model( 2989 model=model.record_merge_strategy, config=config, **kwargs 2990 ) 2991 if model.record_merge_strategy 2992 else None 2993 ) 2994 2995 property_limit_type: PropertyLimitType 2996 match model.property_limit_type: 2997 case PropertyLimitTypeModel.property_count: 2998 property_limit_type = PropertyLimitType.property_count 2999 case PropertyLimitTypeModel.characters: 3000 property_limit_type = PropertyLimitType.characters 3001 case _: 3002 raise ValueError(f"Invalid PropertyLimitType {property_limit_type}") 
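        # Illustrative sketch (hypothetical values, not from this file): the PropertyChunking model
        # handled here is typically declared as
        #   property_chunking:
        #     type: PropertyChunking
        #     property_limit_type: property_count
        #     property_limit: 100
        # where property_limit_type is mapped onto PropertyLimitType by the match statement above.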
3003 3004 return PropertyChunking( 3005 property_limit_type=property_limit_type, 3006 property_limit=model.property_limit, 3007 record_merge_strategy=record_merge_strategy, 3008 config=config, 3009 parameters=model.parameters or {}, 3010 ) 3011 3012 def create_query_properties( 3013 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 3014 ) -> QueryProperties: 3015 if isinstance(model.property_list, list): 3016 property_list = model.property_list 3017 else: 3018 property_list = self._create_component_from_model( 3019 model=model.property_list, config=config, **kwargs 3020 ) 3021 3022 property_chunking = ( 3023 self._create_component_from_model( 3024 model=model.property_chunking, config=config, **kwargs 3025 ) 3026 if model.property_chunking 3027 else None 3028 ) 3029 3030 property_selector = ( 3031 self._create_component_from_model( 3032 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3033 ) 3034 if model.property_selector 3035 else None 3036 ) 3037 3038 return QueryProperties( 3039 property_list=property_list, 3040 always_include_properties=model.always_include_properties, 3041 property_chunking=property_chunking, 3042 property_selector=property_selector, 3043 config=config, 3044 parameters=model.parameters or {}, 3045 ) 3046 3047 def create_json_schema_property_selector( 3048 self, 3049 model: JsonSchemaPropertySelectorModel, 3050 config: Config, 3051 *, 3052 stream_name: str, 3053 **kwargs: Any, 3054 ) -> JsonSchemaPropertySelector: 3055 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3056 3057 transformations = [] 3058 if model.transformations: 3059 for transformation_model in model.transformations: 3060 transformations.append( 3061 self._create_component_from_model(model=transformation_model, config=config) 3062 ) 3063 3064 return JsonSchemaPropertySelector( 3065 configured_stream=configured_stream, 3066 properties_transformations=transformations, 3067 config=config, 3068 parameters=model.parameters or {}, 3069 ) 3070 3071 @staticmethod 3072 def create_record_filter( 3073 model: RecordFilterModel, config: Config, **kwargs: Any 3074 ) -> RecordFilter: 3075 return RecordFilter( 3076 condition=model.condition or "", config=config, parameters=model.parameters or {} 3077 ) 3078 3079 @staticmethod 3080 def create_request_path(model: RequestPathModel, config: Config, **kwargs: Any) -> RequestPath: 3081 return RequestPath(parameters={}) 3082 3083 @staticmethod 3084 def create_request_option( 3085 model: RequestOptionModel, config: Config, **kwargs: Any 3086 ) -> RequestOption: 3087 inject_into = RequestOptionType(model.inject_into.value) 3088 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3089 [ 3090 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3091 for segment in model.field_path 3092 ] 3093 if model.field_path 3094 else None 3095 ) 3096 field_name = ( 3097 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3098 if model.field_name 3099 else None 3100 ) 3101 return RequestOption( 3102 field_name=field_name, 3103 field_path=field_path, 3104 inject_into=inject_into, 3105 parameters=kwargs.get("parameters", {}), 3106 ) 3107 3108 def create_record_selector( 3109 self, 3110 model: RecordSelectorModel, 3111 config: Config, 3112 *, 3113 name: str, 3114 transformations: List[RecordTransformation] | None = None, 3115 decoder: Decoder | None = None, 3116 client_side_incremental_sync_cursor: Optional[Cursor] = None, 3117 
file_uploader: Optional[DefaultFileUploader] = None, 3118 **kwargs: Any, 3119 ) -> RecordSelector: 3120 extractor = self._create_component_from_model( 3121 model=model.extractor, decoder=decoder, config=config 3122 ) 3123 record_filter = ( 3124 self._create_component_from_model(model.record_filter, config=config) 3125 if model.record_filter 3126 else None 3127 ) 3128 3129 transform_before_filtering = ( 3130 False if model.transform_before_filtering is None else model.transform_before_filtering 3131 ) 3132 if client_side_incremental_sync_cursor: 3133 record_filter = ClientSideIncrementalRecordFilterDecorator( 3134 config=config, 3135 parameters=model.parameters, 3136 condition=model.record_filter.condition 3137 if (model.record_filter and hasattr(model.record_filter, "condition")) 3138 else None, 3139 cursor=client_side_incremental_sync_cursor, 3140 ) 3141 transform_before_filtering = ( 3142 True 3143 if model.transform_before_filtering is None 3144 else model.transform_before_filtering 3145 ) 3146 3147 if model.schema_normalization is None: 3148 # default to no schema normalization if not set 3149 model.schema_normalization = SchemaNormalizationModel.None_ 3150 3151 schema_normalization = ( 3152 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3153 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3154 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3155 ) 3156 3157 return RecordSelector( 3158 extractor=extractor, 3159 name=name, 3160 config=config, 3161 record_filter=record_filter, 3162 transformations=transformations or [], 3163 file_uploader=file_uploader, 3164 schema_normalization=schema_normalization, 3165 parameters=model.parameters or {}, 3166 transform_before_filtering=transform_before_filtering, 3167 ) 3168 3169 @staticmethod 3170 def create_remove_fields( 3171 model: RemoveFieldsModel, config: Config, **kwargs: Any 3172 ) -> RemoveFields: 3173 return RemoveFields( 3174 field_pointers=model.field_pointers, condition=model.condition or "", parameters={} 3175 ) 3176 3177 def create_selective_authenticator( 3178 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3179 ) -> DeclarativeAuthenticator: 3180 authenticators = { 3181 name: self._create_component_from_model(model=auth, config=config) 3182 for name, auth in model.authenticators.items() 3183 } 3184 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 3185 return SelectiveAuthenticator( # type: ignore[abstract] 3186 config=config, 3187 authenticators=authenticators, 3188 authenticator_selection_path=model.authenticator_selection_path, 3189 **kwargs, 3190 ) 3191 3192 @staticmethod 3193 def create_legacy_session_token_authenticator( 3194 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3195 ) -> LegacySessionTokenAuthenticator: 3196 return LegacySessionTokenAuthenticator( 3197 api_url=url_base, 3198 header=model.header, 3199 login_url=model.login_url, 3200 password=model.password or "", 3201 session_token=model.session_token or "", 3202 session_token_response_key=model.session_token_response_key or "", 3203 username=model.username or "", 3204 validate_session_url=model.validate_session_url, 3205 config=config, 3206 parameters=model.parameters or {}, 3207 ) 3208 3209 def create_simple_retriever( 3210 self, 3211 model: SimpleRetrieverModel, 3212 config: Config, 3213 *, 3214 
name: str, 3215 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3216 request_options_provider: Optional[RequestOptionsProvider] = None, 3217 cursor: Optional[Cursor] = None, 3218 has_stop_condition_cursor: bool = False, 3219 is_client_side_incremental_sync: bool = False, 3220 transformations: List[RecordTransformation], 3221 file_uploader: Optional[DefaultFileUploader] = None, 3222 incremental_sync: Optional[ 3223 Union[IncrementingCountCursorModel, DatetimeBasedCursorModel] 3224 ] = None, 3225 use_cache: Optional[bool] = None, 3226 log_formatter: Optional[Callable[[Response], Any]] = None, 3227 partition_router: Optional[PartitionRouter] = None, 3228 **kwargs: Any, 3229 ) -> SimpleRetriever: 3230 def _get_url(req: Requester) -> str: 3231 """ 3232 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3233 This is needed because the URL is not set until the requester is created. 3234 """ 3235 3236 _url: str = ( 3237 model.requester.url 3238 if hasattr(model.requester, "url") and model.requester.url is not None 3239 else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) 3240 ) 3241 _url_base: str = ( 3242 model.requester.url_base 3243 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3244 else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None) 3245 ) 3246 3247 return _url or _url_base 3248 3249 if cursor is None: 3250 cursor = FinalStateCursor(name, None, self._message_repository) 3251 3252 decoder = ( 3253 self._create_component_from_model(model=model.decoder, config=config) 3254 if model.decoder 3255 else JsonDecoder(parameters={}) 3256 ) 3257 record_selector = self._create_component_from_model( 3258 model=model.record_selector, 3259 name=name, 3260 config=config, 3261 decoder=decoder, 3262 transformations=transformations, 3263 client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None, 3264 file_uploader=file_uploader, 3265 ) 3266 3267 query_properties: Optional[QueryProperties] = None 3268 query_properties_key: Optional[str] = None 3269 self._ensure_query_properties_to_model(model.requester) 3270 if self._has_query_properties_in_request_parameters(model.requester): 3271 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3272 # places instead of default to request_parameters which isn't clearly documented 3273 if ( 3274 hasattr(model.requester, "fetch_properties_from_endpoint") 3275 and model.requester.fetch_properties_from_endpoint 3276 ): 3277 raise ValueError( 3278 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3279 ) 3280 3281 query_properties_definitions = [] 3282 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters() 3283 if isinstance(request_parameter, QueryPropertiesModel): 3284 query_properties_key = key 3285 query_properties_definitions.append(request_parameter) 3286 3287 if len(query_properties_definitions) > 1: 3288 raise ValueError( 3289 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3290 ) 3291 3292 if len(query_properties_definitions) == 1: 3293 query_properties = self._create_component_from_model( 3294 
                    model=query_properties_definitions[0], stream_name=name, config=config
                )

            # Remove QueryProperties components from the interpolated mappings because they are designed
            # to be used by the SimpleRetriever and are resolved directly from the slice rather than
            # through Jinja interpolation
            if hasattr(model.requester, "request_parameters") and isinstance(
                model.requester.request_parameters, Mapping
            ):
                model.requester.request_parameters = self._remove_query_properties(
                    model.requester.request_parameters
                )
        elif (
            hasattr(model.requester, "fetch_properties_from_endpoint")
            and model.requester.fetch_properties_from_endpoint
        ):
            # todo: Deprecate this condition once dependent connectors migrate to query_properties
            query_properties_definition = QueryPropertiesModel(
                type="QueryProperties",
                property_list=model.requester.fetch_properties_from_endpoint,
                always_include_properties=None,
                property_chunking=None,
            )  # type: ignore # $parameters has a default value

            query_properties = self.create_query_properties(
                model=query_properties_definition,
                stream_name=name,
                config=config,
            )
        elif hasattr(model.requester, "query_properties") and model.requester.query_properties:
            query_properties = self.create_query_properties(
                model=model.requester.query_properties,
                stream_name=name,
                config=config,
            )

        requester = self._create_component_from_model(
            model=model.requester,
            decoder=decoder,
            name=name,
            query_properties_key=query_properties_key,
            use_cache=use_cache,
            config=config,
        )

        if not request_options_provider:
            request_options_provider = DefaultRequestOptionsProvider(parameters={})
        if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance(
            partition_router, PartitionRouter
        ):
            request_options_provider = partition_router

        paginator = (
            self._create_component_from_model(
                model=model.paginator,
                config=config,
                url_base=_get_url(requester),
                extractor_model=model.record_selector.extractor,
                decoder=decoder,
                cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None,
            )
            if model.paginator
            else NoPagination(parameters={})
        )

        ignore_stream_slicer_parameters_on_paginated_requests = (
            model.ignore_stream_slicer_parameters_on_paginated_requests or False
        )

        if (
            model.partition_router
            and isinstance(model.partition_router, SubstreamPartitionRouterModel)
            and not bool(self._connector_state_manager.get_stream_state(name, None))
            and any(
                parent_stream_config.lazy_read_pointer
                for parent_stream_config in model.partition_router.parent_stream_configs
            )
        ):
            if incremental_sync:
                if incremental_sync.type != "DatetimeBasedCursor":
                    raise ValueError(
                        f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}."
                    )

                elif incremental_sync.step or incremental_sync.cursor_granularity:
                    raise ValueError(
                        f"Found more than one slice per parent. LazySimpleRetriever only supports a single slice read for stream - {name}."
                    )

            if model.decoder and model.decoder.type != "JsonDecoder":
                raise ValueError(
                    f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
3386 ) 3387 3388 return LazySimpleRetriever( 3389 name=name, 3390 paginator=paginator, 3391 primary_key=primary_key, 3392 requester=requester, 3393 record_selector=record_selector, 3394 stream_slicer=_NO_STREAM_SLICING, 3395 request_option_provider=request_options_provider, 3396 cursor=None, 3397 config=config, 3398 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3399 parameters=model.parameters or {}, 3400 ) 3401 3402 if ( 3403 model.record_selector.record_filter 3404 and model.pagination_reset 3405 and model.pagination_reset.limits 3406 ): 3407 raise ValueError("PaginationResetLimits are not supported while having record filter.") 3408 3409 return SimpleRetriever( 3410 name=name, 3411 paginator=paginator, 3412 primary_key=primary_key, 3413 requester=requester, 3414 record_selector=record_selector, 3415 stream_slicer=_NO_STREAM_SLICING, 3416 request_option_provider=request_options_provider, 3417 cursor=None, 3418 config=config, 3419 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3420 additional_query_properties=query_properties, 3421 log_formatter=self._get_log_formatter(log_formatter, name), 3422 pagination_tracker_factory=self._create_pagination_tracker_factory( 3423 model.pagination_reset, cursor 3424 ), 3425 parameters=model.parameters or {}, 3426 ) 3427 3428 def _create_pagination_tracker_factory( 3429 self, model: Optional[PaginationResetModel], cursor: Cursor 3430 ) -> Callable[[], PaginationTracker]: 3431 if model is None: 3432 return lambda: PaginationTracker() 3433 3434 # Until we figure out a way to use any cursor for PaginationTracker, we will have to have this cursor selector logic 3435 cursor_factory: Callable[[], Optional[ConcurrentCursor]] = lambda: None 3436 if model.action == PaginationResetActionModel.RESET: 3437 # in that case, we will let cursor_factory to return None even if the stream has a cursor 3438 pass 3439 elif model.action == PaginationResetActionModel.SPLIT_USING_CURSOR: 3440 if isinstance(cursor, ConcurrentCursor): 3441 cursor_factory = lambda: cursor.copy_without_state() # type: ignore # the if condition validates that it is a ConcurrentCursor 3442 elif isinstance(cursor, ConcurrentPerPartitionCursor): 3443 cursor_factory = lambda: cursor._cursor_factory.create( # type: ignore # if this becomes a problem, we would need to extract the cursor_factory instantiation logic and make it accessible here 3444 {}, datetime.timedelta(0) 3445 ) 3446 elif not isinstance(cursor, FinalStateCursor): 3447 LOGGER.warning( 3448 "Unknown cursor for PaginationTracker. 
Pagination resets might not work properly" 3449 ) 3450 else: 3451 raise ValueError(f"Unknown PaginationReset action: {model.action}") 3452 3453 limit = model.limits.number_of_records if model and model.limits else None 3454 return lambda: PaginationTracker(cursor_factory(), limit) 3455 3456 def _get_log_formatter( 3457 self, log_formatter: Callable[[Response], Any] | None, name: str 3458 ) -> Callable[[Response], Any] | None: 3459 if self._should_limit_slices_fetched(): 3460 return ( 3461 ( 3462 lambda response: format_http_message( 3463 response, 3464 f"Stream '{name}' request", 3465 f"Request performed in order to extract records for stream '{name}'", 3466 name, 3467 ) 3468 ) 3469 if not log_formatter 3470 else log_formatter 3471 ) 3472 return None 3473 3474 def _should_limit_slices_fetched(self) -> bool: 3475 """ 3476 Returns True if the number of slices fetched should be limited, False otherwise. 3477 This is used to limit the number of slices fetched during tests. 3478 """ 3479 return bool(self._limit_slices_fetched or self._emit_connector_builder_messages) 3480 3481 @staticmethod 3482 def _has_query_properties_in_request_parameters( 3483 requester: Union[HttpRequesterModel, CustomRequesterModel], 3484 ) -> bool: 3485 if not hasattr(requester, "request_parameters"): 3486 return False 3487 request_parameters = requester.request_parameters 3488 if request_parameters and isinstance(request_parameters, Mapping): 3489 for request_parameter in request_parameters.values(): 3490 if isinstance(request_parameter, QueryPropertiesModel): 3491 return True 3492 return False 3493 3494 @staticmethod 3495 def _remove_query_properties( 3496 request_parameters: Mapping[str, Union[str, QueryPropertiesModel]], 3497 ) -> Mapping[str, str]: 3498 return { 3499 parameter_field: request_parameter 3500 for parameter_field, request_parameter in request_parameters.items() 3501 if not isinstance(request_parameter, QueryPropertiesModel) 3502 } 3503 3504 def create_state_delegating_stream( 3505 self, 3506 model: StateDelegatingStreamModel, 3507 config: Config, 3508 has_parent_state: Optional[bool] = None, 3509 **kwargs: Any, 3510 ) -> DeclarativeStream: 3511 if ( 3512 model.full_refresh_stream.name != model.name 3513 or model.name != model.incremental_stream.name 3514 ): 3515 raise ValueError( 3516 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 
3517 ) 3518 3519 stream_model = self._get_state_delegating_stream_model( 3520 False if has_parent_state is None else has_parent_state, model 3521 ) 3522 3523 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is alwyas DeclarativeStreamModel 3524 3525 def _get_state_delegating_stream_model( 3526 self, has_parent_state: bool, model: StateDelegatingStreamModel 3527 ) -> DeclarativeStreamModel: 3528 return ( 3529 model.incremental_stream 3530 if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state 3531 else model.full_refresh_stream 3532 ) 3533 3534 def _create_async_job_status_mapping( 3535 self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any 3536 ) -> Mapping[str, AsyncJobStatus]: 3537 api_status_to_cdk_status = {} 3538 for cdk_status, api_statuses in model.dict().items(): 3539 if cdk_status == "type": 3540 # This is an element of the dict because of the typing of the CDK but it is not a CDK status 3541 continue 3542 3543 for status in api_statuses: 3544 if status in api_status_to_cdk_status: 3545 raise ValueError( 3546 f"API status {status} is already set for CDK status {cdk_status}. Please ensure API statuses are only provided once" 3547 ) 3548 api_status_to_cdk_status[status] = self._get_async_job_status(cdk_status) 3549 return api_status_to_cdk_status 3550 3551 def _get_async_job_status(self, status: str) -> AsyncJobStatus: 3552 match status: 3553 case "running": 3554 return AsyncJobStatus.RUNNING 3555 case "completed": 3556 return AsyncJobStatus.COMPLETED 3557 case "failed": 3558 return AsyncJobStatus.FAILED 3559 case "timeout": 3560 return AsyncJobStatus.TIMED_OUT 3561 case _: 3562 raise ValueError(f"Unsupported CDK status {status}") 3563 3564 def create_async_retriever( 3565 self, 3566 model: AsyncRetrieverModel, 3567 config: Config, 3568 *, 3569 name: str, 3570 primary_key: Optional[ 3571 Union[str, List[str], List[List[str]]] 3572 ], # this seems to be needed to match create_simple_retriever 3573 stream_slicer: Optional[StreamSlicer], 3574 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3575 transformations: List[RecordTransformation], 3576 **kwargs: Any, 3577 ) -> AsyncRetriever: 3578 if model.download_target_requester and not model.download_target_extractor: 3579 raise ValueError( 3580 f"`download_target_extractor` required if using a `download_target_requester`" 3581 ) 3582 3583 def _get_download_retriever( 3584 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3585 ) -> SimpleRetriever: 3586 # We create a record selector for the download retriever 3587 # with no schema normalization and no transformations, neither record filter 3588 # as all this occurs in the record_selector of the AsyncRetriever 3589 record_selector = RecordSelector( 3590 extractor=extractor, 3591 name=name, 3592 record_filter=None, 3593 transformations=[], 3594 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3595 config=config, 3596 parameters={}, 3597 ) 3598 paginator = ( 3599 self._create_component_from_model( 3600 model=model.download_paginator, 3601 decoder=_decoder, 3602 config=config, 3603 url_base="", 3604 ) 3605 if model.download_paginator 3606 else NoPagination(parameters={}) 3607 ) 3608 3609 return SimpleRetriever( 3610 requester=requester, 3611 record_selector=record_selector, 3612 primary_key=None, 3613 name=name, 3614 paginator=paginator, 3615 config=config, 3616 parameters={}, 3617 
log_formatter=self._get_log_formatter(None, name), 3618 ) 3619 3620 def _get_job_timeout() -> datetime.timedelta: 3621 user_defined_timeout: Optional[int] = ( 3622 int( 3623 InterpolatedString.create( 3624 str(model.polling_job_timeout), 3625 parameters={}, 3626 ).eval(config) 3627 ) 3628 if model.polling_job_timeout 3629 else None 3630 ) 3631 3632 # check for user defined timeout during the test read or 15 minutes 3633 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3634 # default value for non-connector builder is 60 minutes. 3635 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3636 3637 return ( 3638 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3639 ) 3640 3641 decoder = ( 3642 self._create_component_from_model(model=model.decoder, config=config) 3643 if model.decoder 3644 else JsonDecoder(parameters={}) 3645 ) 3646 record_selector = self._create_component_from_model( 3647 model=model.record_selector, 3648 config=config, 3649 decoder=decoder, 3650 name=name, 3651 transformations=transformations, 3652 client_side_incremental_sync=client_side_incremental_sync, 3653 ) 3654 3655 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3656 if self._should_limit_slices_fetched(): 3657 stream_slicer = cast( 3658 StreamSlicer, 3659 StreamSlicerTestReadDecorator( 3660 wrapped_slicer=stream_slicer, 3661 maximum_number_of_slices=self._limit_slices_fetched or 5, 3662 ), 3663 ) 3664 3665 creation_requester = self._create_component_from_model( 3666 model=model.creation_requester, 3667 decoder=decoder, 3668 config=config, 3669 name=f"job creation - {name}", 3670 ) 3671 polling_requester = self._create_component_from_model( 3672 model=model.polling_requester, 3673 decoder=decoder, 3674 config=config, 3675 name=f"job polling - {name}", 3676 ) 3677 job_download_components_name = f"job download - {name}" 3678 download_decoder = ( 3679 self._create_component_from_model(model=model.download_decoder, config=config) 3680 if model.download_decoder 3681 else JsonDecoder(parameters={}) 3682 ) 3683 download_extractor = ( 3684 self._create_component_from_model( 3685 model=model.download_extractor, 3686 config=config, 3687 decoder=download_decoder, 3688 parameters=model.parameters, 3689 ) 3690 if model.download_extractor 3691 else DpathExtractor( 3692 [], 3693 config=config, 3694 decoder=download_decoder, 3695 parameters=model.parameters or {}, 3696 ) 3697 ) 3698 download_requester = self._create_component_from_model( 3699 model=model.download_requester, 3700 decoder=download_decoder, 3701 config=config, 3702 name=job_download_components_name, 3703 ) 3704 download_retriever = _get_download_retriever( 3705 download_requester, download_extractor, download_decoder 3706 ) 3707 abort_requester = ( 3708 self._create_component_from_model( 3709 model=model.abort_requester, 3710 decoder=decoder, 3711 config=config, 3712 name=f"job abort - {name}", 3713 ) 3714 if model.abort_requester 3715 else None 3716 ) 3717 delete_requester = ( 3718 self._create_component_from_model( 3719 model=model.delete_requester, 3720 decoder=decoder, 3721 config=config, 3722 name=f"job delete - {name}", 3723 ) 3724 if model.delete_requester 3725 else None 3726 ) 3727 download_target_requester = ( 3728 self._create_component_from_model( 3729 model=model.download_target_requester, 3730 decoder=decoder, 3731 config=config, 3732 name=f"job extract_url - {name}", 3733 ) 3734 if model.download_target_requester 3735 else None 3736 ) 3737 
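        # Descriptive note on the wiring below: creation_requester starts the async job,
        # polling_requester checks its status (interpreted through status_extractor and
        # status_mapping), download_retriever fetches the results (optionally resolving download
        # URLs via download_target_requester / download_target_extractor), and the optional
        # abort / delete requesters are used for cleanup. AsyncHttpJobRepository ties these
        # together for the AsyncJobOrchestrator created per set of stream slices.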
status_extractor = self._create_component_from_model( 3738 model=model.status_extractor, decoder=decoder, config=config, name=name 3739 ) 3740 download_target_extractor = ( 3741 self._create_component_from_model( 3742 model=model.download_target_extractor, 3743 decoder=decoder, 3744 config=config, 3745 name=name, 3746 ) 3747 if model.download_target_extractor 3748 else None 3749 ) 3750 3751 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3752 creation_requester=creation_requester, 3753 polling_requester=polling_requester, 3754 download_retriever=download_retriever, 3755 download_target_requester=download_target_requester, 3756 abort_requester=abort_requester, 3757 delete_requester=delete_requester, 3758 status_extractor=status_extractor, 3759 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3760 download_target_extractor=download_target_extractor, 3761 job_timeout=_get_job_timeout(), 3762 ) 3763 3764 async_job_partition_router = AsyncJobPartitionRouter( 3765 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3766 job_repository, 3767 stream_slices, 3768 self._job_tracker, 3769 self._message_repository, 3770 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk 3771 has_bulk_parent=False, 3772 # set the `job_max_retry` to 1 for the `Connector Builder`` use-case. 3773 # `None` == default retry is set to 3 attempts, under the hood. 3774 job_max_retry=1 if self._emit_connector_builder_messages else None, 3775 ), 3776 stream_slicer=stream_slicer, 3777 config=config, 3778 parameters=model.parameters or {}, 3779 ) 3780 3781 return AsyncRetriever( 3782 record_selector=record_selector, 3783 stream_slicer=async_job_partition_router, 3784 config=config, 3785 parameters=model.parameters or {}, 3786 ) 3787 3788 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3789 config_migrations = [ 3790 self._create_component_from_model(migration, config) 3791 for migration in ( 3792 model.config_normalization_rules.config_migrations 3793 if ( 3794 model.config_normalization_rules 3795 and model.config_normalization_rules.config_migrations 3796 ) 3797 else [] 3798 ) 3799 ] 3800 config_transformations = [ 3801 self._create_component_from_model(transformation, config) 3802 for transformation in ( 3803 model.config_normalization_rules.transformations 3804 if ( 3805 model.config_normalization_rules 3806 and model.config_normalization_rules.transformations 3807 ) 3808 else [] 3809 ) 3810 ] 3811 config_validations = [ 3812 self._create_component_from_model(validation, config) 3813 for validation in ( 3814 model.config_normalization_rules.validations 3815 if ( 3816 model.config_normalization_rules 3817 and model.config_normalization_rules.validations 3818 ) 3819 else [] 3820 ) 3821 ] 3822 3823 return Spec( 3824 connection_specification=model.connection_specification, 3825 documentation_url=model.documentation_url, 3826 advanced_auth=model.advanced_auth, 3827 parameters={}, 3828 config_migrations=config_migrations, 3829 config_transformations=config_transformations, 3830 config_validations=config_validations, 3831 ) 3832 3833 def create_substream_partition_router( 3834 self, 3835 model: SubstreamPartitionRouterModel, 3836 config: Config, 3837 *, 3838 stream_name: str, 3839 **kwargs: Any, 3840 ) -> SubstreamPartitionRouter: 3841 parent_stream_configs = [] 3842 if model.parent_stream_configs: 3843 parent_stream_configs.extend( 3844 [ 3845 
self.create_parent_stream_config_with_substream_wrapper( 3846 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 3847 ) 3848 for parent_stream_config in model.parent_stream_configs 3849 ] 3850 ) 3851 3852 return SubstreamPartitionRouter( 3853 parent_stream_configs=parent_stream_configs, 3854 parameters=model.parameters or {}, 3855 config=config, 3856 ) 3857 3858 def create_parent_stream_config_with_substream_wrapper( 3859 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 3860 ) -> Any: 3861 # getting the parent state 3862 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 3863 3864 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3865 has_parent_state = bool( 3866 self._connector_state_manager.get_stream_state(stream_name, None) 3867 if model.incremental_dependency 3868 else False 3869 ) 3870 connector_state_manager = self._instantiate_parent_stream_state_manager( 3871 child_state, config, model, has_parent_state 3872 ) 3873 3874 substream_factory = ModelToComponentFactory( 3875 connector_state_manager=connector_state_manager, 3876 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3877 limit_slices_fetched=self._limit_slices_fetched, 3878 emit_connector_builder_messages=self._emit_connector_builder_messages, 3879 disable_retries=self._disable_retries, 3880 disable_cache=self._disable_cache, 3881 message_repository=StateFilteringMessageRepository( 3882 LogAppenderMessageRepositoryDecorator( 3883 { 3884 "airbyte_cdk": {"stream": {"is_substream": True}}, 3885 "http": {"is_auxiliary": True}, 3886 }, 3887 self._message_repository, 3888 self._evaluate_log_level(self._emit_connector_builder_messages), 3889 ), 3890 ), 3891 ) 3892 3893 return substream_factory.create_parent_stream_config( 3894 model=model, config=config, stream_name=stream_name, **kwargs 3895 ) 3896 3897 def _instantiate_parent_stream_state_manager( 3898 self, 3899 child_state: MutableMapping[str, Any], 3900 config: Config, 3901 model: ParentStreamConfigModel, 3902 has_parent_state: bool, 3903 ) -> ConnectorStateManager: 3904 """ 3905 With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the 3906 `set_initial_state` flow that existed for the declarative cursors. This state is taken from 3907 self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account 3908 for the MessageRepository being different). So we need to pass a ConnectorStateManager to the 3909 ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if 3910 incremental_dependency is set. 3911 """ 3912 if model.incremental_dependency and child_state: 3913 parent_stream_name = model.stream.name or "" 3914 parent_state = ConcurrentPerPartitionCursor.get_parent_state( 3915 child_state, parent_stream_name 3916 ) 3917 3918 if not parent_state: 3919 # there are two migration cases: state value from child stream or from global state 3920 parent_state = ConcurrentPerPartitionCursor.get_global_state( 3921 child_state, parent_stream_name 3922 ) 3923 3924 if not parent_state and not isinstance(parent_state, dict): 3925 cursor_values = child_state.values() 3926 if cursor_values and len(cursor_values) == 1: 3927 # We assume the child state is a pair `{<cursor_field>: <cursor_value>}` and we will use the 3928 # cursor value as a parent state. 
3929 incremental_sync_model: Union[ 3930 DatetimeBasedCursorModel, 3931 IncrementingCountCursorModel, 3932 ] = ( 3933 model.stream.incremental_sync # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream 3934 if isinstance(model.stream, DeclarativeStreamModel) 3935 else self._get_state_delegating_stream_model( 3936 has_parent_state, model.stream 3937 ).incremental_sync 3938 ) 3939 cursor_field = InterpolatedString.create( 3940 incremental_sync_model.cursor_field, 3941 parameters=incremental_sync_model.parameters or {}, 3942 ).eval(config) 3943 parent_state = AirbyteStateMessage( 3944 type=AirbyteStateType.STREAM, 3945 stream=AirbyteStreamState( 3946 stream_descriptor=StreamDescriptor( 3947 name=parent_stream_name, namespace=None 3948 ), 3949 stream_state=AirbyteStateBlob( 3950 {cursor_field: list(cursor_values)[0]} 3951 ), 3952 ), 3953 ) 3954 return ConnectorStateManager([parent_state] if parent_state else []) 3955 3956 return ConnectorStateManager([]) 3957 3958 @staticmethod 3959 def create_wait_time_from_header( 3960 model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any 3961 ) -> WaitTimeFromHeaderBackoffStrategy: 3962 return WaitTimeFromHeaderBackoffStrategy( 3963 header=model.header, 3964 parameters=model.parameters or {}, 3965 config=config, 3966 regex=model.regex, 3967 max_waiting_time_in_seconds=model.max_waiting_time_in_seconds 3968 if model.max_waiting_time_in_seconds is not None 3969 else None, 3970 ) 3971 3972 @staticmethod 3973 def create_wait_until_time_from_header( 3974 model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any 3975 ) -> WaitUntilTimeFromHeaderBackoffStrategy: 3976 return WaitUntilTimeFromHeaderBackoffStrategy( 3977 header=model.header, 3978 parameters=model.parameters or {}, 3979 config=config, 3980 min_wait=model.min_wait, 3981 regex=model.regex, 3982 ) 3983 3984 def get_message_repository(self) -> MessageRepository: 3985 return self._message_repository 3986 3987 def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level: 3988 return Level.DEBUG if emit_connector_builder_messages else Level.INFO 3989 3990 @staticmethod 3991 def create_components_mapping_definition( 3992 model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any 3993 ) -> ComponentMappingDefinition: 3994 interpolated_value = InterpolatedString.create( 3995 model.value, parameters=model.parameters or {} 3996 ) 3997 field_path = [ 3998 InterpolatedString.create(path, parameters=model.parameters or {}) 3999 for path in model.field_path 4000 ] 4001 return ComponentMappingDefinition( 4002 field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString 4003 value=interpolated_value, 4004 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 4005 create_or_update=model.create_or_update, 4006 condition=model.condition, 4007 parameters=model.parameters or {}, 4008 ) 4009 4010 def create_http_components_resolver( 4011 self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None 4012 ) -> Any: 4013 retriever = self._create_component_from_model( 4014 model=model.retriever, 4015 config=config, 4016 name=f"{stream_name if stream_name else '__http_components_resolver'}", 4017 primary_key=None, 4018 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4019 transformations=[], 4020 ) 4021 4022 components_mapping = [] 4023 for 
component_mapping_definition_model in model.components_mapping: 4024 if component_mapping_definition_model.condition: 4025 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4026 components_mapping.append( 4027 self._create_component_from_model( 4028 model=component_mapping_definition_model, 4029 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4030 component_mapping_definition_model.value_type 4031 ), 4032 config=config, 4033 ) 4034 ) 4035 4036 return HttpComponentsResolver( 4037 retriever=retriever, 4038 stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config), 4039 config=config, 4040 components_mapping=components_mapping, 4041 parameters=model.parameters or {}, 4042 ) 4043 4044 @staticmethod 4045 def create_stream_config( 4046 model: StreamConfigModel, config: Config, **kwargs: Any 4047 ) -> StreamConfig: 4048 model_configs_pointer: List[Union[InterpolatedString, str]] = ( 4049 [x for x in model.configs_pointer] if model.configs_pointer else [] 4050 ) 4051 4052 return StreamConfig( 4053 configs_pointer=model_configs_pointer, 4054 default_values=model.default_values, 4055 parameters=model.parameters or {}, 4056 ) 4057 4058 def create_config_components_resolver( 4059 self, 4060 model: ConfigComponentsResolverModel, 4061 config: Config, 4062 ) -> Any: 4063 model_stream_configs = ( 4064 model.stream_config if isinstance(model.stream_config, list) else [model.stream_config] 4065 ) 4066 4067 stream_configs = [ 4068 self._create_component_from_model( 4069 stream_config, config=config, parameters=model.parameters or {} 4070 ) 4071 for stream_config in model_stream_configs 4072 ] 4073 4074 components_mapping = [ 4075 self._create_component_from_model( 4076 model=components_mapping_definition_model, 4077 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4078 components_mapping_definition_model.value_type 4079 ), 4080 config=config, 4081 parameters=model.parameters, 4082 ) 4083 for components_mapping_definition_model in model.components_mapping 4084 ] 4085 4086 return ConfigComponentsResolver( 4087 stream_configs=stream_configs, 4088 config=config, 4089 components_mapping=components_mapping, 4090 parameters=model.parameters or {}, 4091 ) 4092 4093 def create_parametrized_components_resolver( 4094 self, 4095 model: ParametrizedComponentsResolverModel, 4096 config: Config, 4097 ) -> ParametrizedComponentsResolver: 4098 stream_parameters = StreamParametersDefinition( 4099 list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream 4100 ) 4101 4102 components_mapping = [] 4103 for components_mapping_definition_model in model.components_mapping: 4104 if components_mapping_definition_model.condition: 4105 raise ValueError("`condition` is only supported for `ConfigComponentsResolver`") 4106 components_mapping.append( 4107 self._create_component_from_model( 4108 model=components_mapping_definition_model, 4109 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 4110 components_mapping_definition_model.value_type 4111 ), 4112 config=config, 4113 ) 4114 ) 4115 return ParametrizedComponentsResolver( 4116 stream_parameters=stream_parameters, 4117 config=config, 4118 components_mapping=components_mapping, 4119 parameters=model.parameters or {}, 4120 ) 4121 4122 _UNSUPPORTED_DECODER_ERROR = ( 4123 "Specified decoder of {decoder_type} is not supported for pagination." 
4124 "Please set as `JsonDecoder`, `XmlDecoder`, or a `CompositeRawDecoder` with an inner_parser of `JsonParser` or `GzipParser` instead." 4125 "If using `GzipParser`, please ensure that the lowest level inner_parser is a `JsonParser`." 4126 ) 4127 4128 def _is_supported_decoder_for_pagination(self, decoder: Decoder) -> bool: 4129 if isinstance(decoder, (JsonDecoder, XmlDecoder)): 4130 return True 4131 elif isinstance(decoder, CompositeRawDecoder): 4132 return self._is_supported_parser_for_pagination(decoder.parser) 4133 else: 4134 return False 4135 4136 def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: 4137 if isinstance(parser, JsonParser): 4138 return True 4139 elif isinstance(parser, GzipParser): 4140 return isinstance(parser.inner_parser, JsonParser) 4141 else: 4142 return False 4143 4144 def create_http_api_budget( 4145 self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any 4146 ) -> HttpAPIBudget: 4147 policies = [ 4148 self._create_component_from_model(model=policy, config=config) 4149 for policy in model.policies 4150 ] 4151 4152 return HttpAPIBudget( 4153 policies=policies, 4154 ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", 4155 ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", 4156 status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429], 4157 ) 4158 4159 def create_fixed_window_call_rate_policy( 4160 self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any 4161 ) -> FixedWindowCallRatePolicy: 4162 matchers = [ 4163 self._create_component_from_model(model=matcher, config=config) 4164 for matcher in model.matchers 4165 ] 4166 4167 # Set the initial reset timestamp to 10 days from now. 4168 # This value will be updated by the first request. 
4169 return FixedWindowCallRatePolicy( 4170 next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10), 4171 period=parse_duration(model.period), 4172 call_limit=model.call_limit, 4173 matchers=matchers, 4174 ) 4175 4176 def create_file_uploader( 4177 self, model: FileUploaderModel, config: Config, **kwargs: Any 4178 ) -> FileUploader: 4179 name = "File Uploader" 4180 requester = self._create_component_from_model( 4181 model=model.requester, 4182 config=config, 4183 name=name, 4184 **kwargs, 4185 ) 4186 download_target_extractor = self._create_component_from_model( 4187 model=model.download_target_extractor, 4188 config=config, 4189 name=name, 4190 **kwargs, 4191 ) 4192 emit_connector_builder_messages = self._emit_connector_builder_messages 4193 file_uploader = DefaultFileUploader( 4194 requester=requester, 4195 download_target_extractor=download_target_extractor, 4196 config=config, 4197 file_writer=NoopFileWriter() 4198 if emit_connector_builder_messages 4199 else LocalFileSystemFileWriter(), 4200 parameters=model.parameters or {}, 4201 filename_extractor=model.filename_extractor if model.filename_extractor else None, 4202 ) 4203 4204 return ( 4205 ConnectorBuilderFileUploader(file_uploader) 4206 if emit_connector_builder_messages 4207 else file_uploader 4208 ) 4209 4210 def create_moving_window_call_rate_policy( 4211 self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any 4212 ) -> MovingWindowCallRatePolicy: 4213 rates = [ 4214 self._create_component_from_model(model=rate, config=config) for rate in model.rates 4215 ] 4216 matchers = [ 4217 self._create_component_from_model(model=matcher, config=config) 4218 for matcher in model.matchers 4219 ] 4220 return MovingWindowCallRatePolicy( 4221 rates=rates, 4222 matchers=matchers, 4223 ) 4224 4225 def create_unlimited_call_rate_policy( 4226 self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any 4227 ) -> UnlimitedCallRatePolicy: 4228 matchers = [ 4229 self._create_component_from_model(model=matcher, config=config) 4230 for matcher in model.matchers 4231 ] 4232 4233 return UnlimitedCallRatePolicy( 4234 matchers=matchers, 4235 ) 4236 4237 def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: 4238 interpolated_limit = InterpolatedString.create(str(model.limit), parameters={}) 4239 return Rate( 4240 limit=int(interpolated_limit.eval(config=config)), 4241 interval=parse_duration(model.interval), 4242 ) 4243 4244 def create_http_request_matcher( 4245 self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any 4246 ) -> HttpRequestRegexMatcher: 4247 return HttpRequestRegexMatcher( 4248 method=model.method, 4249 url_base=model.url_base, 4250 url_path_pattern=model.url_path_pattern, 4251 params=model.params, 4252 headers=model.headers, 4253 ) 4254 4255 def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: 4256 self._api_budget = self.create_component( 4257 model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config 4258 ) 4259 4260 def create_grouping_partition_router( 4261 self, 4262 model: GroupingPartitionRouterModel, 4263 config: Config, 4264 *, 4265 stream_name: str, 4266 **kwargs: Any, 4267 ) -> GroupingPartitionRouter: 4268 underlying_router = self._create_component_from_model( 4269 model=model.underlying_partition_router, 4270 config=config, 4271 stream_name=stream_name, 4272 **kwargs, 4273 ) 4274 if model.group_size < 1: 4275 raise ValueError(f"Group size must be greater than 0, got 
{model.group_size}") 4276 4277 # Request options in underlying partition routers are not supported for GroupingPartitionRouter 4278 # because they are specific to individual partitions and cannot be aggregated or handled 4279 # when grouping, potentially leading to incorrect API calls. Any request customization 4280 # should be managed at the stream level through the requester's configuration. 4281 if isinstance(underlying_router, SubstreamPartitionRouter): 4282 if any( 4283 parent_config.request_option 4284 for parent_config in underlying_router.parent_stream_configs 4285 ): 4286 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4287 4288 if isinstance(underlying_router, ListPartitionRouter): 4289 if underlying_router.request_option: 4290 raise ValueError("Request options are not supported for GroupingPartitionRouter.") 4291 4292 return GroupingPartitionRouter( 4293 group_size=model.group_size, 4294 underlying_partition_router=underlying_router, 4295 deduplicate=model.deduplicate if model.deduplicate is not None else True, 4296 config=config, 4297 ) 4298 4299 def _ensure_query_properties_to_model( 4300 self, requester: Union[HttpRequesterModel, CustomRequesterModel] 4301 ) -> None: 4302 """ 4303 For some reason, it seems like CustomRequesterModel request_parameters stays as dictionaries which means that 4304 the other conditions relying on it being QueryPropertiesModel instead of a dict fail. Here, we migrate them to 4305 proper model. 4306 """ 4307 if not hasattr(requester, "request_parameters"): 4308 return 4309 4310 request_parameters = requester.request_parameters 4311 if request_parameters and isinstance(request_parameters, Dict): 4312 for request_parameter_key in request_parameters.keys(): 4313 request_parameter = request_parameters[request_parameter_key] 4314 if ( 4315 isinstance(request_parameter, Dict) 4316 and request_parameter.get("type") == "QueryProperties" 4317 ): 4318 request_parameters[request_parameter_key] = QueryPropertiesModel.parse_obj( 4319 request_parameter 4320 )
674 def __init__( 675 self, 676 limit_pages_fetched_per_slice: Optional[int] = None, 677 limit_slices_fetched: Optional[int] = None, 678 emit_connector_builder_messages: bool = False, 679 disable_retries: bool = False, 680 disable_cache: bool = False, 681 message_repository: Optional[MessageRepository] = None, 682 connector_state_manager: Optional[ConnectorStateManager] = None, 683 max_concurrent_async_job_count: Optional[int] = None, 684 configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, 685 ): 686 self._init_mappings() 687 self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice 688 self._limit_slices_fetched = limit_slices_fetched 689 self._emit_connector_builder_messages = emit_connector_builder_messages 690 self._disable_retries = disable_retries 691 self._disable_cache = disable_cache 692 self._message_repository = message_repository or InMemoryMessageRepository( 693 self._evaluate_log_level(emit_connector_builder_messages) 694 ) 695 self._stream_name_to_configured_stream = self._create_stream_name_to_configured_stream( 696 configured_catalog 697 ) 698 self._connector_state_manager = connector_state_manager or ConnectorStateManager() 699 self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None 700 self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) 701 # placeholder for deprecation warnings 702 self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = []
827 def create_component( 828 self, 829 model_type: Type[BaseModel], 830 component_definition: ComponentDefinition, 831 config: Config, 832 **kwargs: Any, 833 ) -> Any: 834 """ 835 Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and 836 subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating 837 declarative components from that model. 838 839 :param model_type: The type of declarative component that is being initialized 840 :param component_definition: The mapping that represents a declarative component 841 :param config: The connector config that is provided by the customer 842 :return: The declarative component to be used at runtime 843 """ 844 845 component_type = component_definition.get("type") 846 if component_definition.get("type") != model_type.__name__: 847 raise ValueError( 848 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 849 ) 850 851 declarative_component_model = model_type.parse_obj(component_definition) 852 853 if not isinstance(declarative_component_model, model_type): 854 raise ValueError( 855 f"Expected {model_type.__name__} component, but received {declarative_component_model.__class__.__name__}" 856 ) 857 858 return self._create_component_from_model( 859 model=declarative_component_model, config=config, **kwargs 860 )
Takes a given Pydantic model type and Mapping representing a component definition and creates a declarative component and subcomponents which will be used at runtime. This is done by first parsing the mapping into a Pydantic model and then creating declarative components from that model.
Parameters
- model_type: The type of declarative component that is being initialized
- component_definition: The mapping that represents a declarative component
- config: The connector config that is provided by the customer
Returns
The declarative component to be used at runtime
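A minimal usage sketch, not part of the source: the manifest snippet is invented, and the import path of the generated CheckStream Pydantic model (airbyte_cdk.sources.declarative.models.declarative_component_schema) is an assumption about the surrounding CDK layout.

from airbyte_cdk.sources.declarative.models.declarative_component_schema import CheckStream
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

factory = ModelToComponentFactory()

# The manifest "type" must equal the Pydantic model's class name ("CheckStream" here),
# otherwise create_component raises a ValueError before parsing.
component_definition = {"type": "CheckStream", "stream_names": ["customers"]}

check_stream = factory.create_component(
    model_type=CheckStream,
    component_definition=component_definition,
    config={},  # connector config provided by the user
)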
877 def get_model_deprecations(self) -> List[ConnectorBuilderLogMessage]: 878 """ 879 Returns the deprecation warnings that were collected during the creation of components. 880 """ 881 return self._collected_deprecation_logs
Returns the deprecation warnings that were collected during the creation of components.
898 def create_config_migration( 899 self, model: ConfigMigrationModel, config: Config 900 ) -> ConfigMigration: 901 transformations: List[ConfigTransformation] = [ 902 self._create_component_from_model(transformation, config) 903 for transformation in model.transformations 904 ] 905 906 return ConfigMigration( 907 description=model.description, 908 transformations=transformations, 909 )
911 def create_config_add_fields( 912 self, model: ConfigAddFieldsModel, config: Config, **kwargs: Any 913 ) -> ConfigAddFields: 914 fields = [self._create_component_from_model(field, config) for field in model.fields] 915 return ConfigAddFields( 916 fields=fields, 917 condition=model.condition or "", 918 )
967 @staticmethod 968 def create_added_field_definition( 969 model: AddedFieldDefinitionModel, config: Config, **kwargs: Any 970 ) -> AddedFieldDefinition: 971 interpolated_value = InterpolatedString.create( 972 model.value, parameters=model.parameters or {} 973 ) 974 return AddedFieldDefinition( 975 path=model.path, 976 value=interpolated_value, 977 value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type), 978 parameters=model.parameters or {}, 979 )
981 def create_add_fields(self, model: AddFieldsModel, config: Config, **kwargs: Any) -> AddFields: 982 added_field_definitions = [ 983 self._create_component_from_model( 984 model=added_field_definition_model, 985 value_type=ModelToComponentFactory._json_schema_type_name_to_type( 986 added_field_definition_model.value_type 987 ), 988 config=config, 989 ) 990 for added_field_definition_model in model.fields 991 ] 992 return AddFields( 993 fields=added_field_definitions, 994 condition=model.condition or "", 995 parameters=model.parameters or {}, 996 )
1022 def create_dpath_flatten_fields( 1023 self, model: DpathFlattenFieldsModel, config: Config, **kwargs: Any 1024 ) -> DpathFlattenFields: 1025 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 1026 key_transformation = ( 1027 KeyTransformation( 1028 config=config, 1029 prefix=model.key_transformation.prefix, 1030 suffix=model.key_transformation.suffix, 1031 parameters=model.parameters or {}, 1032 ) 1033 if model.key_transformation is not None 1034 else None 1035 ) 1036 return DpathFlattenFields( 1037 config=config, 1038 field_path=model_field_path, 1039 delete_origin_value=model.delete_origin_value 1040 if model.delete_origin_value is not None 1041 else False, 1042 replace_record=model.replace_record if model.replace_record is not None else False, 1043 key_transformation=key_transformation, 1044 parameters=model.parameters or {}, 1045 )
1059 def create_api_key_authenticator( 1060 self, 1061 model: ApiKeyAuthenticatorModel, 1062 config: Config, 1063 token_provider: Optional[TokenProvider] = None, 1064 **kwargs: Any, 1065 ) -> ApiKeyAuthenticator: 1066 if model.inject_into is None and model.header is None: 1067 raise ValueError( 1068 "Expected either inject_into or header to be set for ApiKeyAuthenticator" 1069 ) 1070 1071 if model.inject_into is not None and model.header is not None: 1072 raise ValueError( 1073 "inject_into and header cannot be set both for ApiKeyAuthenticator - remove the deprecated header option" 1074 ) 1075 1076 if token_provider is not None and model.api_token != "": 1077 raise ValueError( 1078 "If token_provider is set, api_token is ignored and has to be set to empty string." 1079 ) 1080 1081 request_option = ( 1082 self._create_component_from_model( 1083 model.inject_into, config, parameters=model.parameters or {} 1084 ) 1085 if model.inject_into 1086 else RequestOption( 1087 inject_into=RequestOptionType.header, 1088 field_name=model.header or "", 1089 parameters=model.parameters or {}, 1090 ) 1091 ) 1092 1093 return ApiKeyAuthenticator( 1094 token_provider=( 1095 token_provider 1096 if token_provider is not None 1097 else InterpolatedStringTokenProvider( 1098 api_token=model.api_token or "", 1099 config=config, 1100 parameters=model.parameters or {}, 1101 ) 1102 ), 1103 request_option=request_option, 1104 config=config, 1105 parameters=model.parameters or {}, 1106 )
1108 def create_legacy_to_per_partition_state_migration( 1109 self, 1110 model: LegacyToPerPartitionStateMigrationModel, 1111 config: Mapping[str, Any], 1112 declarative_stream: DeclarativeStreamModel, 1113 ) -> LegacyToPerPartitionStateMigration: 1114 retriever = declarative_stream.retriever 1115 if not isinstance(retriever, (SimpleRetrieverModel, AsyncRetrieverModel)): 1116 raise ValueError( 1117 f"LegacyToPerPartitionStateMigrations can only be applied on a DeclarativeStream with a SimpleRetriever or AsyncRetriever. Got {type(retriever)}" 1118 ) 1119 partition_router = retriever.partition_router 1120 if not isinstance( 1121 partition_router, (SubstreamPartitionRouterModel, CustomPartitionRouterModel) 1122 ): 1123 raise ValueError( 1124 f"LegacyToPerPartitionStateMigrations can only be applied on a SimpleRetriever with a Substream partition router. Got {type(partition_router)}" 1125 ) 1126 if not hasattr(partition_router, "parent_stream_configs"): 1127 raise ValueError( 1128 "LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration." 1129 ) 1130 1131 if not hasattr(declarative_stream, "incremental_sync"): 1132 raise ValueError( 1133 "LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration." 1134 ) 1135 1136 return LegacyToPerPartitionStateMigration( 1137 partition_router, # type: ignore # was already checked above 1138 declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. 1139 config, 1140 declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] 1141 )
1143 def create_session_token_authenticator( 1144 self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any 1145 ) -> Union[ApiKeyAuthenticator, BearerAuthenticator]: 1146 decoder = ( 1147 self._create_component_from_model(model=model.decoder, config=config) 1148 if model.decoder 1149 else JsonDecoder(parameters={}) 1150 ) 1151 login_requester = self._create_component_from_model( 1152 model=model.login_requester, 1153 config=config, 1154 name=f"{name}_login_requester", 1155 decoder=decoder, 1156 ) 1157 token_provider = SessionTokenProvider( 1158 login_requester=login_requester, 1159 session_token_path=model.session_token_path, 1160 expiration_duration=parse_duration(model.expiration_duration) 1161 if model.expiration_duration 1162 else None, 1163 parameters=model.parameters or {}, 1164 message_repository=self._message_repository, 1165 decoder=decoder, 1166 ) 1167 if model.request_authentication.type == "Bearer": 1168 return ModelToComponentFactory.create_bearer_authenticator( 1169 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""), # type: ignore # $parameters has a default value 1170 config, 1171 token_provider=token_provider, 1172 ) 1173 else: 1174 return self.create_api_key_authenticator( 1175 ApiKeyAuthenticatorModel( 1176 type="ApiKeyAuthenticator", 1177 api_token="", 1178 inject_into=model.request_authentication.inject_into, 1179 ), # type: ignore # $parameters and headers default to None 1180 config=config, 1181 token_provider=token_provider, 1182 )
1184 @staticmethod 1185 def create_basic_http_authenticator( 1186 model: BasicHttpAuthenticatorModel, config: Config, **kwargs: Any 1187 ) -> BasicHttpAuthenticator: 1188 return BasicHttpAuthenticator( 1189 password=model.password or "", 1190 username=model.username, 1191 config=config, 1192 parameters=model.parameters or {}, 1193 )
1195 @staticmethod 1196 def create_bearer_authenticator( 1197 model: BearerAuthenticatorModel, 1198 config: Config, 1199 token_provider: Optional[TokenProvider] = None, 1200 **kwargs: Any, 1201 ) -> BearerAuthenticator: 1202 if token_provider is not None and model.api_token != "": 1203 raise ValueError( 1204 "If token_provider is set, api_token is ignored and has to be set to empty string." 1205 ) 1206 return BearerAuthenticator( 1207 token_provider=( 1208 token_provider 1209 if token_provider is not None 1210 else InterpolatedStringTokenProvider( 1211 api_token=model.api_token or "", 1212 config=config, 1213 parameters=model.parameters or {}, 1214 ) 1215 ), 1216 config=config, 1217 parameters=model.parameters or {}, 1218 )
1220 @staticmethod 1221 def create_dynamic_stream_check_config( 1222 model: DynamicStreamCheckConfigModel, config: Config, **kwargs: Any 1223 ) -> DynamicStreamCheckConfig: 1224 return DynamicStreamCheckConfig( 1225 dynamic_stream_name=model.dynamic_stream_name, 1226 stream_count=model.stream_count or 0, 1227 )
1229 def create_check_stream( 1230 self, model: CheckStreamModel, config: Config, **kwargs: Any 1231 ) -> CheckStream: 1232 if model.dynamic_streams_check_configs is None and model.stream_names is None: 1233 raise ValueError( 1234 "Expected either stream_names or dynamic_streams_check_configs to be set for CheckStream" 1235 ) 1236 1237 dynamic_streams_check_configs = ( 1238 [ 1239 self._create_component_from_model(model=dynamic_stream_check_config, config=config) 1240 for dynamic_stream_check_config in model.dynamic_streams_check_configs 1241 ] 1242 if model.dynamic_streams_check_configs 1243 else [] 1244 ) 1245 1246 return CheckStream( 1247 stream_names=model.stream_names or [], 1248 dynamic_streams_check_configs=dynamic_streams_check_configs, 1249 parameters={}, 1250 )
1252 @staticmethod 1253 def create_check_dynamic_stream( 1254 model: CheckDynamicStreamModel, config: Config, **kwargs: Any 1255 ) -> CheckDynamicStream: 1256 assert model.use_check_availability is not None # for mypy 1257 1258 use_check_availability = model.use_check_availability 1259 1260 return CheckDynamicStream( 1261 stream_count=model.stream_count, 1262 use_check_availability=use_check_availability, 1263 parameters={}, 1264 )
1266 def create_composite_error_handler( 1267 self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any 1268 ) -> CompositeErrorHandler: 1269 error_handlers = [ 1270 self._create_component_from_model(model=error_handler_model, config=config) 1271 for error_handler_model in model.error_handlers 1272 ] 1273 return CompositeErrorHandler( 1274 error_handlers=error_handlers, parameters=model.parameters or {} 1275 )
1277 @staticmethod 1278 def create_concurrency_level( 1279 model: ConcurrencyLevelModel, config: Config, **kwargs: Any 1280 ) -> ConcurrencyLevel: 1281 return ConcurrencyLevel( 1282 default_concurrency=model.default_concurrency, 1283 max_concurrency=model.max_concurrency, 1284 config=config, 1285 parameters={}, 1286 )
1288 @staticmethod 1289 def apply_stream_state_migrations( 1290 stream_state_migrations: List[Any] | None, stream_state: MutableMapping[str, Any] 1291 ) -> MutableMapping[str, Any]: 1292 if stream_state_migrations: 1293 for state_migration in stream_state_migrations: 1294 if state_migration.should_migrate(stream_state): 1295 # The state variable is expected to be mutable but the migrate method returns an immutable mapping. 1296 stream_state = dict(state_migration.migrate(stream_state)) 1297 return stream_state
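For illustration, a minimal sketch of how a state migration plugs into apply_stream_state_migrations. The RenameCursorKeyMigration class and the state shape are hypothetical; only the should_migrate/migrate protocol and the static method call are taken from the code above.

from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
    ModelToComponentFactory,
)

class RenameCursorKeyMigration:
    """Illustrative migration: renames a legacy cursor key in the stream state."""

    def should_migrate(self, stream_state):
        return "updated" in stream_state

    def migrate(self, stream_state):
        return {"updated_at": stream_state["updated"]}

state = {"updated": "2024-01-01T00:00:00Z"}
migrated = ModelToComponentFactory.apply_stream_state_migrations(
    [RenameCursorKeyMigration()], state
)
# migrated == {"updated_at": "2024-01-01T00:00:00Z"}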
1299 def create_concurrent_cursor_from_datetime_based_cursor( 1300 self, 1301 model_type: Type[BaseModel], 1302 component_definition: ComponentDefinition, 1303 stream_name: str, 1304 stream_namespace: Optional[str], 1305 stream_state: MutableMapping[str, Any], 1306 config: Config, 1307 message_repository: Optional[MessageRepository] = None, 1308 runtime_lookback_window: Optional[datetime.timedelta] = None, 1309 **kwargs: Any, 1310 ) -> ConcurrentCursor: 1311 component_type = component_definition.get("type") 1312 if component_definition.get("type") != model_type.__name__: 1313 raise ValueError( 1314 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1315 ) 1316 1317 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1318 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1319 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1320 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1321 if "$parameters" not in component_definition and "parameters" in component_definition: 1322 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1323 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1324 1325 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1326 raise ValueError( 1327 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1328 ) 1329 1330 model_parameters = datetime_based_cursor_model.parameters or {} 1331 interpolated_cursor_field = InterpolatedString.create( 1332 datetime_based_cursor_model.cursor_field, 1333 parameters=model_parameters, 1334 ) 1335 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1336 1337 interpolated_partition_field_start = InterpolatedString.create( 1338 datetime_based_cursor_model.partition_field_start or "start_time", 1339 parameters=model_parameters, 1340 ) 1341 interpolated_partition_field_end = InterpolatedString.create( 1342 datetime_based_cursor_model.partition_field_end or "end_time", 1343 parameters=model_parameters, 1344 ) 1345 1346 slice_boundary_fields = ( 1347 interpolated_partition_field_start.eval(config=config), 1348 interpolated_partition_field_end.eval(config=config), 1349 ) 1350 1351 datetime_format = datetime_based_cursor_model.datetime_format 1352 1353 cursor_granularity = ( 1354 parse_duration(datetime_based_cursor_model.cursor_granularity) 1355 if datetime_based_cursor_model.cursor_granularity 1356 else None 1357 ) 1358 1359 lookback_window = None 1360 interpolated_lookback_window = ( 1361 InterpolatedString.create( 1362 datetime_based_cursor_model.lookback_window, 1363 parameters=model_parameters, 1364 ) 1365 if datetime_based_cursor_model.lookback_window 1366 else None 1367 ) 1368 if interpolated_lookback_window: 1369 evaluated_lookback_window = interpolated_lookback_window.eval(config=config) 1370 if 
evaluated_lookback_window: 1371 lookback_window = parse_duration(evaluated_lookback_window) 1372 1373 connector_state_converter: DateTimeStreamStateConverter 1374 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1375 datetime_format=datetime_format, 1376 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1377 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1378 cursor_granularity=cursor_granularity, 1379 ) 1380 1381 # Adjusts the stream state by applying the runtime lookback window. 1382 # This is used to ensure correct state handling in case of failed partitions. 1383 stream_state_value = stream_state.get(cursor_field.cursor_field_key) 1384 if runtime_lookback_window and stream_state_value: 1385 new_stream_state = ( 1386 connector_state_converter.parse_timestamp(stream_state_value) 1387 - runtime_lookback_window 1388 ) 1389 stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format( 1390 new_stream_state 1391 ) 1392 1393 start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime] 1394 if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel): 1395 start_date_runtime_value = self.create_min_max_datetime( 1396 model=datetime_based_cursor_model.start_datetime, config=config 1397 ) 1398 else: 1399 start_date_runtime_value = datetime_based_cursor_model.start_datetime 1400 1401 end_date_runtime_value: Optional[Union[InterpolatedString, str, MinMaxDatetime]] 1402 if isinstance(datetime_based_cursor_model.end_datetime, MinMaxDatetimeModel): 1403 end_date_runtime_value = self.create_min_max_datetime( 1404 model=datetime_based_cursor_model.end_datetime, config=config 1405 ) 1406 else: 1407 end_date_runtime_value = datetime_based_cursor_model.end_datetime 1408 1409 interpolated_start_date = MinMaxDatetime.create( 1410 interpolated_string_or_min_max_datetime=start_date_runtime_value, 1411 parameters=datetime_based_cursor_model.parameters, 1412 ) 1413 interpolated_end_date = ( 1414 None 1415 if not end_date_runtime_value 1416 else MinMaxDatetime.create( 1417 end_date_runtime_value, datetime_based_cursor_model.parameters 1418 ) 1419 ) 1420 1421 # If datetime format is not specified then start/end datetime should inherit it from the stream slicer 1422 if not interpolated_start_date.datetime_format: 1423 interpolated_start_date.datetime_format = datetime_format 1424 if interpolated_end_date and not interpolated_end_date.datetime_format: 1425 interpolated_end_date.datetime_format = datetime_format 1426 1427 start_date = interpolated_start_date.get_datetime(config=config) 1428 end_date_provider = ( 1429 partial(interpolated_end_date.get_datetime, config) 1430 if interpolated_end_date 1431 else connector_state_converter.get_end_provider() 1432 ) 1433 1434 if ( 1435 datetime_based_cursor_model.step and not datetime_based_cursor_model.cursor_granularity 1436 ) or ( 1437 not datetime_based_cursor_model.step and datetime_based_cursor_model.cursor_granularity 1438 ): 1439 raise ValueError( 1440 f"If step is defined, cursor_granularity should be as well and vice-versa. 
" 1441 f"Right now, step is `{datetime_based_cursor_model.step}` and cursor_granularity is `{datetime_based_cursor_model.cursor_granularity}`" 1442 ) 1443 1444 # When step is not defined, default to a step size from the starting date to the present moment 1445 step_length = datetime.timedelta.max 1446 interpolated_step = ( 1447 InterpolatedString.create( 1448 datetime_based_cursor_model.step, 1449 parameters=model_parameters, 1450 ) 1451 if datetime_based_cursor_model.step 1452 else None 1453 ) 1454 if interpolated_step: 1455 evaluated_step = interpolated_step.eval(config) 1456 if evaluated_step: 1457 step_length = parse_duration(evaluated_step) 1458 1459 clamping_strategy: ClampingStrategy = NoClamping() 1460 if datetime_based_cursor_model.clamping: 1461 # While it is undesirable to interpolate within the model factory (as opposed to at runtime), 1462 # it is still better than shifting interpolation low-code concept into the ConcurrentCursor runtime 1463 # object which we want to keep agnostic of being low-code 1464 target = InterpolatedString( 1465 string=datetime_based_cursor_model.clamping.target, 1466 parameters=model_parameters, 1467 ) 1468 evaluated_target = target.eval(config=config) 1469 match evaluated_target: 1470 case "DAY": 1471 clamping_strategy = DayClampingStrategy() 1472 end_date_provider = ClampingEndProvider( 1473 DayClampingStrategy(is_ceiling=False), 1474 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1475 granularity=cursor_granularity or datetime.timedelta(seconds=1), 1476 ) 1477 case "WEEK": 1478 if ( 1479 not datetime_based_cursor_model.clamping.target_details 1480 or "weekday" not in datetime_based_cursor_model.clamping.target_details 1481 ): 1482 raise ValueError( 1483 "Given WEEK clamping, weekday needs to be provided as target_details" 1484 ) 1485 weekday = self._assemble_weekday( 1486 datetime_based_cursor_model.clamping.target_details["weekday"] 1487 ) 1488 clamping_strategy = WeekClampingStrategy(weekday) 1489 end_date_provider = ClampingEndProvider( 1490 WeekClampingStrategy(weekday, is_ceiling=False), 1491 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1492 granularity=cursor_granularity or datetime.timedelta(days=1), 1493 ) 1494 case "MONTH": 1495 clamping_strategy = MonthClampingStrategy() 1496 end_date_provider = ClampingEndProvider( 1497 MonthClampingStrategy(is_ceiling=False), 1498 end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1499 granularity=cursor_granularity or datetime.timedelta(days=1), 1500 ) 1501 case _: 1502 raise ValueError( 1503 f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH" 1504 ) 1505 1506 return ConcurrentCursor( 1507 stream_name=stream_name, 1508 stream_namespace=stream_namespace, 1509 stream_state=stream_state, 1510 message_repository=message_repository or self._message_repository, 1511 connector_state_manager=self._connector_state_manager, 1512 connector_state_converter=connector_state_converter, 1513 cursor_field=cursor_field, 1514 slice_boundary_fields=slice_boundary_fields, 1515 start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice 1516 end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1517 lookback_window=lookback_window, 1518 slice_range=step_length, 1519 cursor_granularity=cursor_granularity, 1520 clamping_strategy=clamping_strategy, 1521 )
1523 def create_concurrent_cursor_from_incrementing_count_cursor( 1524 self, 1525 model_type: Type[BaseModel], 1526 component_definition: ComponentDefinition, 1527 stream_name: str, 1528 stream_namespace: Optional[str], 1529 stream_state: MutableMapping[str, Any], 1530 config: Config, 1531 message_repository: Optional[MessageRepository] = None, 1532 **kwargs: Any, 1533 ) -> ConcurrentCursor: 1534 component_type = component_definition.get("type") 1535 if component_definition.get("type") != model_type.__name__: 1536 raise ValueError( 1537 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1538 ) 1539 1540 incrementing_count_cursor_model = model_type.parse_obj(component_definition) 1541 1542 if not isinstance(incrementing_count_cursor_model, IncrementingCountCursorModel): 1543 raise ValueError( 1544 f"Expected {model_type.__name__} component, but received {incrementing_count_cursor_model.__class__.__name__}" 1545 ) 1546 1547 interpolated_start_value = ( 1548 InterpolatedString.create( 1549 incrementing_count_cursor_model.start_value, # type: ignore 1550 parameters=incrementing_count_cursor_model.parameters or {}, 1551 ) 1552 if incrementing_count_cursor_model.start_value 1553 else 0 1554 ) 1555 1556 interpolated_cursor_field = InterpolatedString.create( 1557 incrementing_count_cursor_model.cursor_field, 1558 parameters=incrementing_count_cursor_model.parameters or {}, 1559 ) 1560 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1561 1562 connector_state_converter = IncrementingCountStreamStateConverter( 1563 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1564 ) 1565 1566 return ConcurrentCursor( 1567 stream_name=stream_name, 1568 stream_namespace=stream_namespace, 1569 stream_state=stream_state, 1570 message_repository=message_repository or self._message_repository, 1571 connector_state_manager=self._connector_state_manager, 1572 connector_state_converter=connector_state_converter, 1573 cursor_field=cursor_field, 1574 slice_boundary_fields=None, 1575 start=interpolated_start_value, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1576 end_provider=connector_state_converter.get_end_provider(), # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice 1577 )
1598 def create_concurrent_cursor_from_perpartition_cursor( 1599 self, 1600 state_manager: ConnectorStateManager, 1601 model_type: Type[BaseModel], 1602 component_definition: ComponentDefinition, 1603 stream_name: str, 1604 stream_namespace: Optional[str], 1605 config: Config, 1606 stream_state: MutableMapping[str, Any], 1607 partition_router: PartitionRouter, 1608 attempt_to_create_cursor_if_not_provided: bool = False, 1609 **kwargs: Any, 1610 ) -> ConcurrentPerPartitionCursor: 1611 component_type = component_definition.get("type") 1612 if component_definition.get("type") != model_type.__name__: 1613 raise ValueError( 1614 f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead" 1615 ) 1616 1617 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1618 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1619 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1620 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 1621 if "$parameters" not in component_definition and "parameters" in component_definition: 1622 component_definition["$parameters"] = component_definition.get("parameters") # type: ignore # This is a dict 1623 datetime_based_cursor_model = model_type.parse_obj(component_definition) 1624 1625 if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel): 1626 raise ValueError( 1627 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}" 1628 ) 1629 1630 interpolated_cursor_field = InterpolatedString.create( 1631 datetime_based_cursor_model.cursor_field, 1632 # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases: 1633 # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters` 1634 # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters` 1635 # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory. 
1636 parameters=datetime_based_cursor_model.parameters or {}, 1637 ) 1638 cursor_field = CursorField(interpolated_cursor_field.eval(config=config)) 1639 1640 datetime_format = datetime_based_cursor_model.datetime_format 1641 1642 cursor_granularity = ( 1643 parse_duration(datetime_based_cursor_model.cursor_granularity) 1644 if datetime_based_cursor_model.cursor_granularity 1645 else None 1646 ) 1647 1648 connector_state_converter: DateTimeStreamStateConverter 1649 connector_state_converter = CustomFormatConcurrentStreamStateConverter( 1650 datetime_format=datetime_format, 1651 input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats, 1652 is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state 1653 cursor_granularity=cursor_granularity, 1654 ) 1655 1656 # Create the cursor factory 1657 cursor_factory = ConcurrentCursorFactory( 1658 partial( 1659 self.create_concurrent_cursor_from_datetime_based_cursor, 1660 state_manager=state_manager, 1661 model_type=model_type, 1662 component_definition=component_definition, 1663 stream_name=stream_name, 1664 stream_namespace=stream_namespace, 1665 config=config, 1666 message_repository=NoopMessageRepository(), 1667 ) 1668 ) 1669 1670 # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state 1671 use_global_cursor = isinstance( 1672 partition_router, GroupingPartitionRouter 1673 ) or component_definition.get("global_substream_cursor", False) 1674 1675 # Return the concurrent cursor and state converter 1676 return ConcurrentPerPartitionCursor( 1677 cursor_factory=cursor_factory, 1678 partition_router=partition_router, 1679 stream_name=stream_name, 1680 stream_namespace=stream_namespace, 1681 stream_state=stream_state, 1682 message_repository=self._message_repository, # type: ignore 1683 connector_state_manager=state_manager, 1684 connector_state_converter=connector_state_converter, 1685 cursor_field=cursor_field, 1686 use_global_cursor=use_global_cursor, 1687 attempt_to_create_cursor_if_not_provided=attempt_to_create_cursor_if_not_provided, 1688 )
1690 @staticmethod 1691 def create_constant_backoff_strategy( 1692 model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any 1693 ) -> ConstantBackoffStrategy: 1694 return ConstantBackoffStrategy( 1695 backoff_time_in_seconds=model.backoff_time_in_seconds, 1696 config=config, 1697 parameters=model.parameters or {}, 1698 )
1700 def create_cursor_pagination( 1701 self, model: CursorPaginationModel, config: Config, decoder: Decoder, **kwargs: Any 1702 ) -> CursorPaginationStrategy: 1703 if isinstance(decoder, PaginationDecoderDecorator): 1704 inner_decoder = decoder.decoder 1705 else: 1706 inner_decoder = decoder 1707 decoder = PaginationDecoderDecorator(decoder=decoder) 1708 1709 if self._is_supported_decoder_for_pagination(inner_decoder): 1710 decoder_to_use = decoder 1711 else: 1712 raise ValueError( 1713 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 1714 ) 1715 1716 return CursorPaginationStrategy( 1717 cursor_value=model.cursor_value, 1718 decoder=decoder_to_use, 1719 page_size=model.page_size, 1720 stop_condition=model.stop_condition, 1721 config=config, 1722 parameters=model.parameters or {}, 1723 )
1725 def create_custom_component(self, model: Any, config: Config, **kwargs: Any) -> Any: 1726 """ 1727 Generically creates a custom component based on the model type and a class_name reference to the custom Python class being 1728 instantiated. Only the model's additional properties that match the custom class definition are passed to the constructor 1729 :param model: The Pydantic model of the custom component being created 1730 :param config: The custom defined connector config 1731 :return: The declarative component built from the Pydantic model to be used at runtime 1732 """ 1733 custom_component_class = self._get_class_from_fully_qualified_class_name(model.class_name) 1734 component_fields = get_type_hints(custom_component_class) 1735 model_args = model.dict() 1736 model_args["config"] = config 1737 1738 # There are cases where a parent component will pass arguments to a child component via kwargs. When there are field collisions 1739 # we defer to these arguments over the component's definition 1740 for key, arg in kwargs.items(): 1741 model_args[key] = arg 1742 1743 # Pydantic is unable to parse a custom component's fields that are subcomponents into models because their fields and types are not 1744 # defined in the schema. The fields and types are defined within the Python class implementation. Pydantic can only parse down to 1745 # the custom component and this code performs a second parse to convert the sub-fields first into models, then declarative components 1746 for model_field, model_value in model_args.items(): 1747 # If a custom component field doesn't have a type set, we try to use the type hints to infer the type 1748 if ( 1749 isinstance(model_value, dict) 1750 and "type" not in model_value 1751 and model_field in component_fields 1752 ): 1753 derived_type = self._derive_component_type_from_type_hints( 1754 component_fields.get(model_field) 1755 ) 1756 if derived_type: 1757 model_value["type"] = derived_type 1758 1759 if self._is_component(model_value): 1760 model_args[model_field] = self._create_nested_component( 1761 model, 1762 model_field, 1763 model_value, 1764 config, 1765 **kwargs, 1766 ) 1767 elif isinstance(model_value, list): 1768 vals = [] 1769 for v in model_value: 1770 if isinstance(v, dict) and "type" not in v and model_field in component_fields: 1771 derived_type = self._derive_component_type_from_type_hints( 1772 component_fields.get(model_field) 1773 ) 1774 if derived_type: 1775 v["type"] = derived_type 1776 if self._is_component(v): 1777 vals.append( 1778 self._create_nested_component( 1779 model, 1780 model_field, 1781 v, 1782 config, 1783 **kwargs, 1784 ) 1785 ) 1786 else: 1787 vals.append(v) 1788 model_args[model_field] = vals 1789 1790 kwargs = { 1791 class_field: model_args[class_field] 1792 for class_field in component_fields.keys() 1793 if class_field in model_args 1794 } 1795 return custom_component_class(**kwargs)
1930 def create_datetime_based_cursor( 1931 self, model: DatetimeBasedCursorModel, config: Config, **kwargs: Any 1932 ) -> DatetimeBasedCursor: 1933 start_datetime: Union[str, MinMaxDatetime] = ( 1934 model.start_datetime 1935 if isinstance(model.start_datetime, str) 1936 else self.create_min_max_datetime(model.start_datetime, config) 1937 ) 1938 end_datetime: Union[str, MinMaxDatetime, None] = None 1939 if model.is_data_feed and model.end_datetime: 1940 raise ValueError("Data feed does not support end_datetime") 1941 if model.is_data_feed and model.is_client_side_incremental: 1942 raise ValueError( 1943 "`Client side incremental` cannot be applied with `data feed`. Choose only 1 from them." 1944 ) 1945 if model.end_datetime: 1946 end_datetime = ( 1947 model.end_datetime 1948 if isinstance(model.end_datetime, str) 1949 else self.create_min_max_datetime(model.end_datetime, config) 1950 ) 1951 1952 end_time_option = ( 1953 self._create_component_from_model( 1954 model.end_time_option, config, parameters=model.parameters or {} 1955 ) 1956 if model.end_time_option 1957 else None 1958 ) 1959 start_time_option = ( 1960 self._create_component_from_model( 1961 model.start_time_option, config, parameters=model.parameters or {} 1962 ) 1963 if model.start_time_option 1964 else None 1965 ) 1966 1967 return DatetimeBasedCursor( 1968 cursor_field=model.cursor_field, 1969 cursor_datetime_formats=model.cursor_datetime_formats 1970 if model.cursor_datetime_formats 1971 else [], 1972 cursor_granularity=model.cursor_granularity, 1973 datetime_format=model.datetime_format, 1974 end_datetime=end_datetime, 1975 start_datetime=start_datetime, 1976 step=model.step, 1977 end_time_option=end_time_option, 1978 lookback_window=model.lookback_window, 1979 start_time_option=start_time_option, 1980 partition_field_end=model.partition_field_end, 1981 partition_field_start=model.partition_field_start, 1982 message_repository=self._message_repository, 1983 is_compare_strictly=model.is_compare_strictly, 1984 config=config, 1985 parameters=model.parameters or {}, 1986 )
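For context, a hypothetical manifest fragment (written as a Python dict) of the shape that parses into the DatetimeBasedCursorModel consumed here; field names follow the declarative component schema, values are illustrative.

datetime_cursor_definition = {
    "type": "DatetimeBasedCursor",
    "cursor_field": "updated_at",
    "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
    "start_datetime": {"type": "MinMaxDatetime", "datetime": "{{ config['start_date'] }}"},
    "step": "P1D",  # slice the date range into one-day windows
    "cursor_granularity": "PT1S",
    "start_time_option": {
        "type": "RequestOption",
        "inject_into": "request_parameter",
        "field_name": "updated_since",
    },
}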
1988 def create_default_stream( 1989 self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any 1990 ) -> AbstractStream: 1991 primary_key = model.primary_key.__root__ if model.primary_key else None 1992 self._migrate_state(model, config) 1993 1994 partition_router = self._build_stream_slicer_from_partition_router( 1995 model.retriever, 1996 config, 1997 stream_name=model.name, 1998 **kwargs, 1999 ) 2000 concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config) 2001 if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel): 2002 cursor_model: DatetimeBasedCursorModel = model.incremental_sync 2003 2004 end_time_option = ( 2005 self._create_component_from_model( 2006 cursor_model.end_time_option, config, parameters=cursor_model.parameters or {} 2007 ) 2008 if cursor_model.end_time_option 2009 else None 2010 ) 2011 start_time_option = ( 2012 self._create_component_from_model( 2013 cursor_model.start_time_option, config, parameters=cursor_model.parameters or {} 2014 ) 2015 if cursor_model.start_time_option 2016 else None 2017 ) 2018 2019 datetime_request_options_provider = DatetimeBasedRequestOptionsProvider( 2020 start_time_option=start_time_option, 2021 end_time_option=end_time_option, 2022 partition_field_start=cursor_model.partition_field_start, 2023 partition_field_end=cursor_model.partition_field_end, 2024 config=config, 2025 parameters=model.parameters or {}, 2026 ) 2027 request_options_provider = ( 2028 datetime_request_options_provider 2029 if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor) 2030 else PerPartitionRequestOptionsProvider( 2031 partition_router, datetime_request_options_provider 2032 ) 2033 ) 2034 elif model.incremental_sync and isinstance( 2035 model.incremental_sync, IncrementingCountCursorModel 2036 ): 2037 if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor): 2038 raise ValueError( 2039 "PerPartition does not support per partition states because switching to global state is time based" 2040 ) 2041 2042 cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore 2043 2044 start_time_option = ( 2045 self._create_component_from_model( 2046 cursor_model.start_value_option, # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2047 config, 2048 parameters=cursor_model.parameters or {}, 2049 ) 2050 if cursor_model.start_value_option # type: ignore # mypy still thinks cursor_model of type DatetimeBasedCursor 2051 else None 2052 ) 2053 2054 # The concurrent engine defaults the start/end fields on the slice to "start" and "end", but 2055 # the default DatetimeBasedRequestOptionsProvider() sets them to start_time/end_time 2056 partition_field_start = "start" 2057 2058 request_options_provider = DatetimeBasedRequestOptionsProvider( 2059 start_time_option=start_time_option, 2060 partition_field_start=partition_field_start, 2061 config=config, 2062 parameters=model.parameters or {}, 2063 ) 2064 else: 2065 request_options_provider = None 2066 2067 transformations = [] 2068 if model.transformations: 2069 for transformation_model in model.transformations: 2070 transformations.append( 2071 self._create_component_from_model(model=transformation_model, config=config) 2072 ) 2073 file_uploader = None 2074 if model.file_uploader: 2075 file_uploader = self._create_component_from_model( 2076 model=model.file_uploader, config=config 2077 ) 2078 2079 stream_slicer: ConcurrentStreamSlicer = ( 2080 partition_router 2081 if 
isinstance(concurrent_cursor, FinalStateCursor) 2082 else concurrent_cursor 2083 ) 2084 2085 retriever = self._create_component_from_model( 2086 model=model.retriever, 2087 config=config, 2088 name=model.name, 2089 primary_key=primary_key, 2090 request_options_provider=request_options_provider, 2091 stream_slicer=stream_slicer, 2092 partition_router=partition_router, 2093 has_stop_condition_cursor=self._is_stop_condition_on_cursor(model), 2094 is_client_side_incremental_sync=self._is_client_side_filtering_enabled(model), 2095 cursor=concurrent_cursor, 2096 transformations=transformations, 2097 file_uploader=file_uploader, 2098 incremental_sync=model.incremental_sync, 2099 ) 2100 if isinstance(retriever, AsyncRetriever): 2101 stream_slicer = retriever.stream_slicer 2102 2103 schema_loader: SchemaLoader 2104 if model.schema_loader and isinstance(model.schema_loader, list): 2105 nested_schema_loaders = [ 2106 self._create_component_from_model(model=nested_schema_loader, config=config) 2107 for nested_schema_loader in model.schema_loader 2108 ] 2109 schema_loader = CompositeSchemaLoader( 2110 schema_loaders=nested_schema_loaders, parameters={} 2111 ) 2112 elif model.schema_loader: 2113 schema_loader = self._create_component_from_model( 2114 model=model.schema_loader, # type: ignore # If defined, schema_loader is guaranteed not to be a list and will be one of the existing base models 2115 config=config, 2116 ) 2117 else: 2118 options = model.parameters or {} 2119 if "name" not in options: 2120 options["name"] = model.name 2121 schema_loader = DefaultSchemaLoader(config=config, parameters=options) 2122 schema_loader = CachingSchemaLoaderDecorator(schema_loader) 2123 2124 stream_name = model.name or "" 2125 return DefaultStream( 2126 partition_generator=StreamSlicerPartitionGenerator( 2127 DeclarativePartitionFactory( 2128 stream_name, 2129 schema_loader, 2130 retriever, 2131 self._message_repository, 2132 ), 2133 stream_slicer, 2134 slice_limit=self._limit_slices_fetched, 2135 ), 2136 name=stream_name, 2137 json_schema=schema_loader.get_json_schema, 2138 primary_key=get_primary_key_from_stream(primary_key), 2139 cursor_field=concurrent_cursor.cursor_field.cursor_field_key 2140 if hasattr(concurrent_cursor, "cursor_field") 2141 else "", # FIXME we should have the cursor field has part of the interface of cursor, 2142 logger=logging.getLogger(f"airbyte.{stream_name}"), 2143 cursor=concurrent_cursor, 2144 supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), 2145 )
2287 def create_default_error_handler( 2288 self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any 2289 ) -> DefaultErrorHandler: 2290 backoff_strategies = [] 2291 if model.backoff_strategies: 2292 for backoff_strategy_model in model.backoff_strategies: 2293 backoff_strategies.append( 2294 self._create_component_from_model(model=backoff_strategy_model, config=config) 2295 ) 2296 2297 response_filters = [] 2298 if model.response_filters: 2299 for response_filter_model in model.response_filters: 2300 response_filters.append( 2301 self._create_component_from_model(model=response_filter_model, config=config) 2302 ) 2303 response_filters.append( 2304 HttpResponseFilter(config=config, parameters=model.parameters or {}) 2305 ) 2306 2307 return DefaultErrorHandler( 2308 backoff_strategies=backoff_strategies, 2309 max_retries=model.max_retries, 2310 response_filters=response_filters, 2311 config=config, 2312 parameters=model.parameters or {}, 2313 )
2315 def create_default_paginator( 2316 self, 2317 model: DefaultPaginatorModel, 2318 config: Config, 2319 *, 2320 url_base: str, 2321 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2322 decoder: Optional[Decoder] = None, 2323 cursor_used_for_stop_condition: Optional[Cursor] = None, 2324 ) -> Union[DefaultPaginator, PaginatorTestReadDecorator]: 2325 if decoder: 2326 if self._is_supported_decoder_for_pagination(decoder): 2327 decoder_to_use = PaginationDecoderDecorator(decoder=decoder) 2328 else: 2329 raise ValueError(self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(decoder))) 2330 else: 2331 decoder_to_use = PaginationDecoderDecorator(decoder=JsonDecoder(parameters={})) 2332 page_size_option = ( 2333 self._create_component_from_model(model=model.page_size_option, config=config) 2334 if model.page_size_option 2335 else None 2336 ) 2337 page_token_option = ( 2338 self._create_component_from_model(model=model.page_token_option, config=config) 2339 if model.page_token_option 2340 else None 2341 ) 2342 pagination_strategy = self._create_component_from_model( 2343 model=model.pagination_strategy, 2344 config=config, 2345 decoder=decoder_to_use, 2346 extractor_model=extractor_model, 2347 ) 2348 if cursor_used_for_stop_condition: 2349 pagination_strategy = StopConditionPaginationStrategyDecorator( 2350 pagination_strategy, CursorStopCondition(cursor_used_for_stop_condition) 2351 ) 2352 paginator = DefaultPaginator( 2353 decoder=decoder_to_use, 2354 page_size_option=page_size_option, 2355 page_token_option=page_token_option, 2356 pagination_strategy=pagination_strategy, 2357 url_base=url_base, 2358 config=config, 2359 parameters=model.parameters or {}, 2360 ) 2361 if self._limit_pages_fetched_per_slice: 2362 return PaginatorTestReadDecorator(paginator, self._limit_pages_fetched_per_slice) 2363 return paginator
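A hypothetical DefaultPaginator definition of the kind this method receives: page_size_option and page_token_option become RequestOption components, and the nested strategy is built through create_cursor_pagination above (field names follow the declarative schema, values are illustrative).

paginator_definition = {
    "type": "DefaultPaginator",
    "page_size_option": {
        "type": "RequestOption",
        "inject_into": "request_parameter",
        "field_name": "limit",
    },
    "page_token_option": {
        "type": "RequestOption",
        "inject_into": "request_parameter",
        "field_name": "after",
    },
    "pagination_strategy": {
        "type": "CursorPagination",
        "cursor_value": "{{ response.get('next_page_token') }}",
        "page_size": 100,
    },
}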
2365 def create_dpath_extractor( 2366 self, 2367 model: DpathExtractorModel, 2368 config: Config, 2369 decoder: Optional[Decoder] = None, 2370 **kwargs: Any, 2371 ) -> DpathExtractor: 2372 if decoder: 2373 decoder_to_use = decoder 2374 else: 2375 decoder_to_use = JsonDecoder(parameters={}) 2376 model_field_path: List[Union[InterpolatedString, str]] = [x for x in model.field_path] 2377 return DpathExtractor( 2378 decoder=decoder_to_use, 2379 field_path=model_field_path, 2380 config=config, 2381 parameters=model.parameters or {}, 2382 )
2403 def create_http_requester( 2404 self, 2405 model: HttpRequesterModel, 2406 config: Config, 2407 decoder: Decoder = JsonDecoder(parameters={}), 2408 query_properties_key: Optional[str] = None, 2409 use_cache: Optional[bool] = None, 2410 *, 2411 name: str, 2412 ) -> HttpRequester: 2413 authenticator = ( 2414 self._create_component_from_model( 2415 model=model.authenticator, 2416 config=config, 2417 url_base=model.url or model.url_base, 2418 name=name, 2419 decoder=decoder, 2420 ) 2421 if model.authenticator 2422 else None 2423 ) 2424 error_handler = ( 2425 self._create_component_from_model(model=model.error_handler, config=config) 2426 if model.error_handler 2427 else DefaultErrorHandler( 2428 backoff_strategies=[], 2429 response_filters=[], 2430 config=config, 2431 parameters=model.parameters or {}, 2432 ) 2433 ) 2434 2435 api_budget = self._api_budget 2436 2437 request_options_provider = InterpolatedRequestOptionsProvider( 2438 request_body=model.request_body, 2439 request_body_data=model.request_body_data, 2440 request_body_json=model.request_body_json, 2441 request_headers=model.request_headers, 2442 request_parameters=model.request_parameters, # type: ignore # QueryProperties have been removed in `create_simple_retriever` 2443 query_properties_key=query_properties_key, 2444 config=config, 2445 parameters=model.parameters or {}, 2446 ) 2447 2448 assert model.use_cache is not None # for mypy 2449 assert model.http_method is not None # for mypy 2450 2451 should_use_cache = (model.use_cache or bool(use_cache)) and not self._disable_cache 2452 2453 return HttpRequester( 2454 name=name, 2455 url=model.url, 2456 url_base=model.url_base, 2457 path=model.path, 2458 authenticator=authenticator, 2459 error_handler=error_handler, 2460 api_budget=api_budget, 2461 http_method=HttpMethod[model.http_method.value], 2462 request_options_provider=request_options_provider, 2463 config=config, 2464 disable_retries=self._disable_retries, 2465 parameters=model.parameters or {}, 2466 message_repository=self._message_repository, 2467 use_cache=should_use_cache, 2468 decoder=decoder, 2469 stream_response=decoder.is_stream_response() if decoder else False, 2470 )
2472 @staticmethod 2473 def create_http_response_filter( 2474 model: HttpResponseFilterModel, config: Config, **kwargs: Any 2475 ) -> HttpResponseFilter: 2476 if model.action: 2477 action = ResponseAction(model.action.value) 2478 else: 2479 action = None 2480 2481 failure_type = FailureType(model.failure_type.value) if model.failure_type else None 2482 2483 http_codes = ( 2484 set(model.http_codes) if model.http_codes else set() 2485 ) # JSON schema notation has no set data type. The schema enforces an array of unique elements 2486 2487 return HttpResponseFilter( 2488 action=action, 2489 failure_type=failure_type, 2490 error_message=model.error_message or "", 2491 error_message_contains=model.error_message_contains or "", 2492 http_codes=http_codes, 2493 predicate=model.predicate or "", 2494 config=config, 2495 parameters=model.parameters or {}, 2496 )
2504 def create_complex_field_type( 2505 self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any 2506 ) -> ComplexFieldType: 2507 items = ( 2508 self._create_component_from_model(model=model.items, config=config) 2509 if isinstance(model.items, ComplexFieldTypeModel) 2510 else model.items 2511 ) 2512 2513 return ComplexFieldType(field_type=model.field_type, items=items)
2515 def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap: 2516 target_type = ( 2517 self._create_component_from_model(model=model.target_type, config=config) 2518 if isinstance(model.target_type, ComplexFieldTypeModel) 2519 else model.target_type 2520 ) 2521 2522 return TypesMap( 2523 target_type=target_type, 2524 current_type=model.current_type, 2525 condition=model.condition if model.condition is not None else "True", 2526 )
2528 def create_schema_type_identifier( 2529 self, model: SchemaTypeIdentifierModel, config: Config, **kwargs: Any 2530 ) -> SchemaTypeIdentifier: 2531 types_mapping = [] 2532 if model.types_mapping: 2533 types_mapping.extend( 2534 [ 2535 self._create_component_from_model(types_map, config=config) 2536 for types_map in model.types_mapping 2537 ] 2538 ) 2539 model_schema_pointer: List[Union[InterpolatedString, str]] = ( 2540 [x for x in model.schema_pointer] if model.schema_pointer else [] 2541 ) 2542 model_key_pointer: List[Union[InterpolatedString, str]] = [x for x in model.key_pointer] 2543 model_type_pointer: Optional[List[Union[InterpolatedString, str]]] = ( 2544 [x for x in model.type_pointer] if model.type_pointer else None 2545 ) 2546 2547 return SchemaTypeIdentifier( 2548 schema_pointer=model_schema_pointer, 2549 key_pointer=model_key_pointer, 2550 type_pointer=model_type_pointer, 2551 types_mapping=types_mapping, 2552 parameters=model.parameters or {}, 2553 )
2555 def create_dynamic_schema_loader( 2556 self, model: DynamicSchemaLoaderModel, config: Config, **kwargs: Any 2557 ) -> DynamicSchemaLoader: 2558 schema_transformations = [] 2559 if model.schema_transformations: 2560 for transformation_model in model.schema_transformations: 2561 schema_transformations.append( 2562 self._create_component_from_model(model=transformation_model, config=config) 2563 ) 2564 name = "dynamic_properties" 2565 retriever = self._create_component_from_model( 2566 model=model.retriever, 2567 config=config, 2568 name=name, 2569 primary_key=None, 2570 partition_router=self._build_stream_slicer_from_partition_router( 2571 model.retriever, config 2572 ), 2573 transformations=[], 2574 use_cache=True, 2575 log_formatter=( 2576 lambda response: format_http_message( 2577 response, 2578 f"Schema loader '{name}' request", 2579 f"Request performed in order to extract schema.", 2580 name, 2581 is_auxiliary=True, 2582 ) 2583 ), 2584 ) 2585 schema_type_identifier = self._create_component_from_model( 2586 model.schema_type_identifier, config=config, parameters=model.parameters or {} 2587 ) 2588 schema_filter = ( 2589 self._create_component_from_model( 2590 model.schema_filter, config=config, parameters=model.parameters or {} 2591 ) 2592 if model.schema_filter is not None 2593 else None 2594 ) 2595 2596 return DynamicSchemaLoader( 2597 retriever=retriever, 2598 config=config, 2599 schema_transformations=schema_transformations, 2600 schema_filter=schema_filter, 2601 schema_type_identifier=schema_type_identifier, 2602 parameters=model.parameters or {}, 2603 )
2623 def create_gzip_decoder( 2624 self, model: GzipDecoderModel, config: Config, **kwargs: Any 2625 ) -> Decoder: 2626 _compressed_response_types = { 2627 "gzip", 2628 "x-gzip", 2629 "gzip, deflate", 2630 "x-gzip, deflate", 2631 "application/zip", 2632 "application/gzip", 2633 "application/x-gzip", 2634 "application/x-zip-compressed", 2635 } 2636 2637 gzip_parser: GzipParser = ModelToComponentFactory._get_parser(model, config) # type: ignore # based on the model, we know this will be a GzipParser 2638 2639 if self._emit_connector_builder_messages: 2640 # This is surprising, but if the response is not streamed, 2641 # CompositeRawDecoder calls response.content and the requests library uncompresses the data, as opposed to response.raw, 2642 # which uses urllib3 directly and does not uncompress the data. 2643 return CompositeRawDecoder(gzip_parser.inner_parser, False) 2644 2645 return CompositeRawDecoder.by_headers( 2646 [({"Content-Encoding", "Content-Type"}, _compressed_response_types, gzip_parser)], 2647 stream_response=True, 2648 fallback_parser=gzip_parser.inner_parser, 2649 )
2653 @staticmethod 2654 def create_incrementing_count_cursor( 2655 model: IncrementingCountCursorModel, config: Config, **kwargs: Any 2656 ) -> DatetimeBasedCursor: 2657 # This should not actually get used anywhere at runtime, but needed to add this to pass checks since 2658 # we still parse models into components. The issue is that there's no runtime implementation of a 2659 # IncrementingCountCursor. 2660 # A known and expected issue with this stub is running a check with the declared IncrementingCountCursor because it is run without ConcurrentCursor. 2661 return DatetimeBasedCursor( 2662 cursor_field=model.cursor_field, 2663 datetime_format="%Y-%m-%d", 2664 start_datetime="2024-12-12", 2665 config=config, 2666 parameters={}, 2667 )
2716 def create_jwt_authenticator( 2717 self, model: JwtAuthenticatorModel, config: Config, **kwargs: Any 2718 ) -> JwtAuthenticator: 2719 jwt_headers = model.jwt_headers or JwtHeadersModel(kid=None, typ="JWT", cty=None) 2720 jwt_payload = model.jwt_payload or JwtPayloadModel(iss=None, sub=None, aud=None) 2721 request_option = ( 2722 self._create_component_from_model(model.request_option, config) 2723 if model.request_option 2724 else None 2725 ) 2726 return JwtAuthenticator( 2727 config=config, 2728 parameters=model.parameters or {}, 2729 algorithm=JwtAlgorithm(model.algorithm.value), 2730 secret_key=model.secret_key, 2731 base64_encode_secret_key=model.base64_encode_secret_key, 2732 token_duration=model.token_duration, 2733 header_prefix=model.header_prefix, 2734 kid=jwt_headers.kid, 2735 typ=jwt_headers.typ, 2736 cty=jwt_headers.cty, 2737 iss=jwt_payload.iss, 2738 sub=jwt_payload.sub, 2739 aud=jwt_payload.aud, 2740 additional_jwt_headers=model.additional_jwt_headers, 2741 additional_jwt_payload=model.additional_jwt_payload, 2742 passphrase=model.passphrase, 2743 request_option=request_option, 2744 )
2746 def create_list_partition_router( 2747 self, model: ListPartitionRouterModel, config: Config, **kwargs: Any 2748 ) -> ListPartitionRouter: 2749 request_option = ( 2750 self._create_component_from_model(model.request_option, config) 2751 if model.request_option 2752 else None 2753 ) 2754 return ListPartitionRouter( 2755 cursor_field=model.cursor_field, 2756 request_option=request_option, 2757 values=model.values, 2758 config=config, 2759 parameters=model.parameters or {}, 2760 )
2762 @staticmethod 2763 def create_min_max_datetime( 2764 model: MinMaxDatetimeModel, config: Config, **kwargs: Any 2765 ) -> MinMaxDatetime: 2766 return MinMaxDatetime( 2767 datetime=model.datetime, 2768 datetime_format=model.datetime_format or "", 2769 max_datetime=model.max_datetime or "", 2770 min_datetime=model.min_datetime or "", 2771 parameters=model.parameters or {}, 2772 )
2784 def create_oauth_authenticator( 2785 self, model: OAuthAuthenticatorModel, config: Config, **kwargs: Any 2786 ) -> DeclarativeOauth2Authenticator: 2787 profile_assertion = ( 2788 self._create_component_from_model(model.profile_assertion, config=config) 2789 if model.profile_assertion 2790 else None 2791 ) 2792 2793 if model.refresh_token_updater: 2794 # ignore type error because fixing it would have a lot of dependencies, revisit later 2795 return DeclarativeSingleUseRefreshTokenOauth2Authenticator( # type: ignore 2796 config, 2797 InterpolatedString.create( 2798 model.token_refresh_endpoint, # type: ignore 2799 parameters=model.parameters or {}, 2800 ).eval(config), 2801 access_token_name=InterpolatedString.create( 2802 model.access_token_name or "access_token", parameters=model.parameters or {} 2803 ).eval(config), 2804 refresh_token_name=model.refresh_token_updater.refresh_token_name, 2805 expires_in_name=InterpolatedString.create( 2806 model.expires_in_name or "expires_in", parameters=model.parameters or {} 2807 ).eval(config), 2808 client_id_name=InterpolatedString.create( 2809 model.client_id_name or "client_id", parameters=model.parameters or {} 2810 ).eval(config), 2811 client_id=InterpolatedString.create( 2812 model.client_id, parameters=model.parameters or {} 2813 ).eval(config) 2814 if model.client_id 2815 else model.client_id, 2816 client_secret_name=InterpolatedString.create( 2817 model.client_secret_name or "client_secret", parameters=model.parameters or {} 2818 ).eval(config), 2819 client_secret=InterpolatedString.create( 2820 model.client_secret, parameters=model.parameters or {} 2821 ).eval(config) 2822 if model.client_secret 2823 else model.client_secret, 2824 access_token_config_path=model.refresh_token_updater.access_token_config_path, 2825 refresh_token_config_path=model.refresh_token_updater.refresh_token_config_path, 2826 token_expiry_date_config_path=model.refresh_token_updater.token_expiry_date_config_path, 2827 grant_type_name=InterpolatedString.create( 2828 model.grant_type_name or "grant_type", parameters=model.parameters or {} 2829 ).eval(config), 2830 grant_type=InterpolatedString.create( 2831 model.grant_type or "refresh_token", parameters=model.parameters or {} 2832 ).eval(config), 2833 refresh_request_body=InterpolatedMapping( 2834 model.refresh_request_body or {}, parameters=model.parameters or {} 2835 ).eval(config), 2836 refresh_request_headers=InterpolatedMapping( 2837 model.refresh_request_headers or {}, parameters=model.parameters or {} 2838 ).eval(config), 2839 scopes=model.scopes, 2840 token_expiry_date_format=model.token_expiry_date_format, 2841 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2842 message_repository=self._message_repository, 2843 refresh_token_error_status_codes=model.refresh_token_updater.refresh_token_error_status_codes, 2844 refresh_token_error_key=model.refresh_token_updater.refresh_token_error_key, 2845 refresh_token_error_values=model.refresh_token_updater.refresh_token_error_values, 2846 ) 2847 # ignore type error because fixing it would have a lot of dependencies, revisit later 2848 return DeclarativeOauth2Authenticator( # type: ignore 2849 access_token_name=model.access_token_name or "access_token", 2850 access_token_value=model.access_token_value, 2851 client_id_name=model.client_id_name or "client_id", 2852 client_id=model.client_id, 2853 client_secret_name=model.client_secret_name or "client_secret", 2854 client_secret=model.client_secret, 2855 expires_in_name=model.expires_in_name or 
"expires_in", 2856 grant_type_name=model.grant_type_name or "grant_type", 2857 grant_type=model.grant_type or "refresh_token", 2858 refresh_request_body=model.refresh_request_body, 2859 refresh_request_headers=model.refresh_request_headers, 2860 refresh_token_name=model.refresh_token_name or "refresh_token", 2861 refresh_token=model.refresh_token, 2862 scopes=model.scopes, 2863 token_expiry_date=model.token_expiry_date, 2864 token_expiry_date_format=model.token_expiry_date_format, 2865 token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format), 2866 token_refresh_endpoint=model.token_refresh_endpoint, 2867 config=config, 2868 parameters=model.parameters or {}, 2869 message_repository=self._message_repository, 2870 profile_assertion=profile_assertion, 2871 use_profile_assertion=model.use_profile_assertion, 2872 )
2874 def create_offset_increment( 2875 self, 2876 model: OffsetIncrementModel, 2877 config: Config, 2878 decoder: Decoder, 2879 extractor_model: Optional[Union[CustomRecordExtractorModel, DpathExtractorModel]] = None, 2880 **kwargs: Any, 2881 ) -> OffsetIncrement: 2882 if isinstance(decoder, PaginationDecoderDecorator): 2883 inner_decoder = decoder.decoder 2884 else: 2885 inner_decoder = decoder 2886 decoder = PaginationDecoderDecorator(decoder=decoder) 2887 2888 if self._is_supported_decoder_for_pagination(inner_decoder): 2889 decoder_to_use = decoder 2890 else: 2891 raise ValueError( 2892 self._UNSUPPORTED_DECODER_ERROR.format(decoder_type=type(inner_decoder)) 2893 ) 2894 2895 # Ideally we would instantiate the runtime extractor at the highest level (in this case the SimpleRetriever) 2896 # so that it can be shared by OffsetIncrement and RecordSelector. However, due to how we instantiate the 2897 # decoder with various decorators here, but not in create_record_selector, it is simpler to retain existing 2898 # behavior by having two separate extractors with identical behavior since they use the same extractor model. 2899 # When we have more time to investigate we can look into reusing the same component. 2900 extractor = ( 2901 self._create_component_from_model( 2902 model=extractor_model, config=config, decoder=decoder_to_use 2903 ) 2904 if extractor_model 2905 else None 2906 ) 2907 2908 return OffsetIncrement( 2909 page_size=model.page_size, 2910 config=config, 2911 decoder=decoder_to_use, 2912 extractor=extractor, 2913 inject_on_first_request=model.inject_on_first_request or False, 2914 parameters=model.parameters or {}, 2915 )
2917 @staticmethod 2918 def create_page_increment( 2919 model: PageIncrementModel, config: Config, **kwargs: Any 2920 ) -> PageIncrement: 2921 return PageIncrement( 2922 page_size=model.page_size, 2923 config=config, 2924 start_from_page=model.start_from_page or 0, 2925 inject_on_first_request=model.inject_on_first_request or False, 2926 parameters=model.parameters or {}, 2927 )
2929 def create_parent_stream_config( 2930 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 2931 ) -> ParentStreamConfig: 2932 declarative_stream = self._create_component_from_model( 2933 model.stream, 2934 config=config, 2935 is_parent=True, 2936 **kwargs, 2937 ) 2938 request_option = ( 2939 self._create_component_from_model(model.request_option, config=config) 2940 if model.request_option 2941 else None 2942 ) 2943 2944 if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): 2945 raise ValueError( 2946 "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." 2947 ) 2948 2949 model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( 2950 [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] 2951 ) 2952 2953 return ParentStreamConfig( 2954 parent_key=model.parent_key, 2955 request_option=request_option, 2956 stream=declarative_stream, 2957 partition_field=model.partition_field, 2958 config=config, 2959 incremental_dependency=model.incremental_dependency or False, 2960 parameters=model.parameters or {}, 2961 extra_fields=model.extra_fields, 2962 lazy_read_pointer=model_lazy_read_pointer, 2963 )
2965 def create_properties_from_endpoint( 2966 self, model: PropertiesFromEndpointModel, config: Config, **kwargs: Any 2967 ) -> PropertiesFromEndpoint: 2968 retriever = self._create_component_from_model( 2969 model=model.retriever, 2970 config=config, 2971 name="dynamic_properties", 2972 primary_key=None, 2973 stream_slicer=None, 2974 transformations=[], 2975 use_cache=True, # Enable caching on the HttpRequester/HttpClient because the properties endpoint will be called for every slice being processed, and it is highly unlikely for the response to differ 2976 ) 2977 return PropertiesFromEndpoint( 2978 property_field_path=model.property_field_path, 2979 retriever=retriever, 2980 config=config, 2981 parameters=model.parameters or {}, 2982 )
2984 def create_property_chunking( 2985 self, model: PropertyChunkingModel, config: Config, **kwargs: Any 2986 ) -> PropertyChunking: 2987 record_merge_strategy = ( 2988 self._create_component_from_model( 2989 model=model.record_merge_strategy, config=config, **kwargs 2990 ) 2991 if model.record_merge_strategy 2992 else None 2993 ) 2994 2995 property_limit_type: PropertyLimitType 2996 match model.property_limit_type: 2997 case PropertyLimitTypeModel.property_count: 2998 property_limit_type = PropertyLimitType.property_count 2999 case PropertyLimitTypeModel.characters: 3000 property_limit_type = PropertyLimitType.characters 3001 case _: 3002 raise ValueError(f"Invalid PropertyLimitType {model.property_limit_type}") 3003 3004 return PropertyChunking( 3005 property_limit_type=property_limit_type, 3006 property_limit=model.property_limit, 3007 record_merge_strategy=record_merge_strategy, 3008 config=config, 3009 parameters=model.parameters or {}, 3010 )
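Two hypothetical PropertyChunking definitions showing both limit types handled by the match statement above (values are illustrative):

property_chunking_by_count = {
    "type": "PropertyChunking",
    "property_limit_type": "property_count",  # maps to PropertyLimitType.property_count
    "property_limit": 15,
}

property_chunking_by_characters = {
    "type": "PropertyChunking",
    "property_limit_type": "characters",  # maps to PropertyLimitType.characters
    "property_limit": 2000,
}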
3012 def create_query_properties( 3013 self, model: QueryPropertiesModel, config: Config, *, stream_name: str, **kwargs: Any 3014 ) -> QueryProperties: 3015 if isinstance(model.property_list, list): 3016 property_list = model.property_list 3017 else: 3018 property_list = self._create_component_from_model( 3019 model=model.property_list, config=config, **kwargs 3020 ) 3021 3022 property_chunking = ( 3023 self._create_component_from_model( 3024 model=model.property_chunking, config=config, **kwargs 3025 ) 3026 if model.property_chunking 3027 else None 3028 ) 3029 3030 property_selector = ( 3031 self._create_component_from_model( 3032 model=model.property_selector, config=config, stream_name=stream_name, **kwargs 3033 ) 3034 if model.property_selector 3035 else None 3036 ) 3037 3038 return QueryProperties( 3039 property_list=property_list, 3040 always_include_properties=model.always_include_properties, 3041 property_chunking=property_chunking, 3042 property_selector=property_selector, 3043 config=config, 3044 parameters=model.parameters or {}, 3045 )
3047 def create_json_schema_property_selector( 3048 self, 3049 model: JsonSchemaPropertySelectorModel, 3050 config: Config, 3051 *, 3052 stream_name: str, 3053 **kwargs: Any, 3054 ) -> JsonSchemaPropertySelector: 3055 configured_stream = self._stream_name_to_configured_stream.get(stream_name) 3056 3057 transformations = [] 3058 if model.transformations: 3059 for transformation_model in model.transformations: 3060 transformations.append( 3061 self._create_component_from_model(model=transformation_model, config=config) 3062 ) 3063 3064 return JsonSchemaPropertySelector( 3065 configured_stream=configured_stream, 3066 properties_transformations=transformations, 3067 config=config, 3068 parameters=model.parameters or {}, 3069 )
3083 @staticmethod 3084 def create_request_option( 3085 model: RequestOptionModel, config: Config, **kwargs: Any 3086 ) -> RequestOption: 3087 inject_into = RequestOptionType(model.inject_into.value) 3088 field_path: Optional[List[Union[InterpolatedString, str]]] = ( 3089 [ 3090 InterpolatedString.create(segment, parameters=kwargs.get("parameters", {})) 3091 for segment in model.field_path 3092 ] 3093 if model.field_path 3094 else None 3095 ) 3096 field_name = ( 3097 InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {})) 3098 if model.field_name 3099 else None 3100 ) 3101 return RequestOption( 3102 field_name=field_name, 3103 field_path=field_path, 3104 inject_into=inject_into, 3105 parameters=kwargs.get("parameters", {}), 3106 )
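Two hypothetical RequestOption shapes illustrating the field_name and field_path branches above: a flat query-parameter injection and a nested injection into the JSON request body (names and paths are illustrative).

flat_request_option = {
    "type": "RequestOption",
    "inject_into": "request_parameter",
    "field_name": "page_size",
}

nested_request_option = {
    "type": "RequestOption",
    "inject_into": "body_json",
    "field_path": ["query", "pagination", "page_size"],  # injected as a nested object in the body
}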
3108 def create_record_selector( 3109 self, 3110 model: RecordSelectorModel, 3111 config: Config, 3112 *, 3113 name: str, 3114 transformations: List[RecordTransformation] | None = None, 3115 decoder: Decoder | None = None, 3116 client_side_incremental_sync_cursor: Optional[Cursor] = None, 3117 file_uploader: Optional[DefaultFileUploader] = None, 3118 **kwargs: Any, 3119 ) -> RecordSelector: 3120 extractor = self._create_component_from_model( 3121 model=model.extractor, decoder=decoder, config=config 3122 ) 3123 record_filter = ( 3124 self._create_component_from_model(model.record_filter, config=config) 3125 if model.record_filter 3126 else None 3127 ) 3128 3129 transform_before_filtering = ( 3130 False if model.transform_before_filtering is None else model.transform_before_filtering 3131 ) 3132 if client_side_incremental_sync_cursor: 3133 record_filter = ClientSideIncrementalRecordFilterDecorator( 3134 config=config, 3135 parameters=model.parameters, 3136 condition=model.record_filter.condition 3137 if (model.record_filter and hasattr(model.record_filter, "condition")) 3138 else None, 3139 cursor=client_side_incremental_sync_cursor, 3140 ) 3141 transform_before_filtering = ( 3142 True 3143 if model.transform_before_filtering is None 3144 else model.transform_before_filtering 3145 ) 3146 3147 if model.schema_normalization is None: 3148 # default to no schema normalization if not set 3149 model.schema_normalization = SchemaNormalizationModel.None_ 3150 3151 schema_normalization = ( 3152 TypeTransformer(SCHEMA_TRANSFORMER_TYPE_MAPPING[model.schema_normalization]) 3153 if isinstance(model.schema_normalization, SchemaNormalizationModel) 3154 else self._create_component_from_model(model.schema_normalization, config=config) # type: ignore[arg-type] # custom normalization model expected here 3155 ) 3156 3157 return RecordSelector( 3158 extractor=extractor, 3159 name=name, 3160 config=config, 3161 record_filter=record_filter, 3162 transformations=transformations or [], 3163 file_uploader=file_uploader, 3164 schema_normalization=schema_normalization, 3165 parameters=model.parameters or {}, 3166 transform_before_filtering=transform_before_filtering, 3167 )
3177 def create_selective_authenticator( 3178 self, model: SelectiveAuthenticatorModel, config: Config, **kwargs: Any 3179 ) -> DeclarativeAuthenticator: 3180 authenticators = { 3181 name: self._create_component_from_model(model=auth, config=config) 3182 for name, auth in model.authenticators.items() 3183 } 3184 # SelectiveAuthenticator will return instance of DeclarativeAuthenticator or raise ValueError error 3185 return SelectiveAuthenticator( # type: ignore[abstract] 3186 config=config, 3187 authenticators=authenticators, 3188 authenticator_selection_path=model.authenticator_selection_path, 3189 **kwargs, 3190 )
3192 @staticmethod 3193 def create_legacy_session_token_authenticator( 3194 model: LegacySessionTokenAuthenticatorModel, config: Config, *, url_base: str, **kwargs: Any 3195 ) -> LegacySessionTokenAuthenticator: 3196 return LegacySessionTokenAuthenticator( 3197 api_url=url_base, 3198 header=model.header, 3199 login_url=model.login_url, 3200 password=model.password or "", 3201 session_token=model.session_token or "", 3202 session_token_response_key=model.session_token_response_key or "", 3203 username=model.username or "", 3204 validate_session_url=model.validate_session_url, 3205 config=config, 3206 parameters=model.parameters or {}, 3207 )
3209 def create_simple_retriever( 3210 self, 3211 model: SimpleRetrieverModel, 3212 config: Config, 3213 *, 3214 name: str, 3215 primary_key: Optional[Union[str, List[str], List[List[str]]]], 3216 request_options_provider: Optional[RequestOptionsProvider] = None, 3217 cursor: Optional[Cursor] = None, 3218 has_stop_condition_cursor: bool = False, 3219 is_client_side_incremental_sync: bool = False, 3220 transformations: List[RecordTransformation], 3221 file_uploader: Optional[DefaultFileUploader] = None, 3222 incremental_sync: Optional[ 3223 Union[IncrementingCountCursorModel, DatetimeBasedCursorModel] 3224 ] = None, 3225 use_cache: Optional[bool] = None, 3226 log_formatter: Optional[Callable[[Response], Any]] = None, 3227 partition_router: Optional[PartitionRouter] = None, 3228 **kwargs: Any, 3229 ) -> SimpleRetriever: 3230 def _get_url(req: Requester) -> str: 3231 """ 3232 Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever. 3233 This is needed because the URL is not set until the requester is created. 3234 """ 3235 3236 _url: str = ( 3237 model.requester.url 3238 if hasattr(model.requester, "url") and model.requester.url is not None 3239 else req.get_url(stream_state=None, stream_slice=None, next_page_token=None) 3240 ) 3241 _url_base: str = ( 3242 model.requester.url_base 3243 if hasattr(model.requester, "url_base") and model.requester.url_base is not None 3244 else req.get_url_base(stream_state=None, stream_slice=None, next_page_token=None) 3245 ) 3246 3247 return _url or _url_base 3248 3249 if cursor is None: 3250 cursor = FinalStateCursor(name, None, self._message_repository) 3251 3252 decoder = ( 3253 self._create_component_from_model(model=model.decoder, config=config) 3254 if model.decoder 3255 else JsonDecoder(parameters={}) 3256 ) 3257 record_selector = self._create_component_from_model( 3258 model=model.record_selector, 3259 name=name, 3260 config=config, 3261 decoder=decoder, 3262 transformations=transformations, 3263 client_side_incremental_sync_cursor=cursor if is_client_side_incremental_sync else None, 3264 file_uploader=file_uploader, 3265 ) 3266 3267 query_properties: Optional[QueryProperties] = None 3268 query_properties_key: Optional[str] = None 3269 self._ensure_query_properties_to_model(model.requester) 3270 if self._has_query_properties_in_request_parameters(model.requester): 3271 # It is better to be explicit about an error if PropertiesFromEndpoint is defined in multiple 3272 # places instead of default to request_parameters which isn't clearly documented 3273 if ( 3274 hasattr(model.requester, "fetch_properties_from_endpoint") 3275 and model.requester.fetch_properties_from_endpoint 3276 ): 3277 raise ValueError( 3278 f"PropertiesFromEndpoint should only be specified once per stream, but found in {model.requester.type}.fetch_properties_from_endpoint and {model.requester.type}.request_parameters" 3279 ) 3280 3281 query_properties_definitions = [] 3282 for key, request_parameter in model.requester.request_parameters.items(): # type: ignore # request_parameters is already validated to be a Mapping using _has_query_properties_in_request_parameters() 3283 if isinstance(request_parameter, QueryPropertiesModel): 3284 query_properties_key = key 3285 query_properties_definitions.append(request_parameter) 3286 3287 if len(query_properties_definitions) > 1: 3288 raise ValueError( 3289 f"request_parameters only supports defining one QueryProperties field, but found {len(query_properties_definitions)} usages" 3290 ) 3291 3292 if 
len(query_properties_definitions) == 1: 3293 query_properties = self._create_component_from_model( 3294 model=query_properties_definitions[0], stream_name=name, config=config 3295 ) 3296 3297 # Removes QueryProperties components from the interpolated mappings because it has been designed 3298 # to be used by the SimpleRetriever and will be resolved from the provider from the slice directly 3299 # instead of through jinja interpolation 3300 if hasattr(model.requester, "request_parameters") and isinstance( 3301 model.requester.request_parameters, Mapping 3302 ): 3303 model.requester.request_parameters = self._remove_query_properties( 3304 model.requester.request_parameters 3305 ) 3306 elif ( 3307 hasattr(model.requester, "fetch_properties_from_endpoint") 3308 and model.requester.fetch_properties_from_endpoint 3309 ): 3310 # todo: Deprecate this condition once dependent connectors migrate to query_properties 3311 query_properties_definition = QueryPropertiesModel( 3312 type="QueryProperties", 3313 property_list=model.requester.fetch_properties_from_endpoint, 3314 always_include_properties=None, 3315 property_chunking=None, 3316 ) # type: ignore # $parameters has a default value 3317 3318 query_properties = self.create_query_properties( 3319 model=query_properties_definition, 3320 stream_name=name, 3321 config=config, 3322 ) 3323 elif hasattr(model.requester, "query_properties") and model.requester.query_properties: 3324 query_properties = self.create_query_properties( 3325 model=model.requester.query_properties, 3326 stream_name=name, 3327 config=config, 3328 ) 3329 3330 requester = self._create_component_from_model( 3331 model=model.requester, 3332 decoder=decoder, 3333 name=name, 3334 query_properties_key=query_properties_key, 3335 use_cache=use_cache, 3336 config=config, 3337 ) 3338 3339 if not request_options_provider: 3340 request_options_provider = DefaultRequestOptionsProvider(parameters={}) 3341 if isinstance(request_options_provider, DefaultRequestOptionsProvider) and isinstance( 3342 partition_router, PartitionRouter 3343 ): 3344 request_options_provider = partition_router 3345 3346 paginator = ( 3347 self._create_component_from_model( 3348 model=model.paginator, 3349 config=config, 3350 url_base=_get_url(requester), 3351 extractor_model=model.record_selector.extractor, 3352 decoder=decoder, 3353 cursor_used_for_stop_condition=cursor if has_stop_condition_cursor else None, 3354 ) 3355 if model.paginator 3356 else NoPagination(parameters={}) 3357 ) 3358 3359 ignore_stream_slicer_parameters_on_paginated_requests = ( 3360 model.ignore_stream_slicer_parameters_on_paginated_requests or False 3361 ) 3362 3363 if ( 3364 model.partition_router 3365 and isinstance(model.partition_router, SubstreamPartitionRouterModel) 3366 and not bool(self._connector_state_manager.get_stream_state(name, None)) 3367 and any( 3368 parent_stream_config.lazy_read_pointer 3369 for parent_stream_config in model.partition_router.parent_stream_configs 3370 ) 3371 ): 3372 if incremental_sync: 3373 if incremental_sync.type != "DatetimeBasedCursor": 3374 raise ValueError( 3375 f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." 3376 ) 3377 3378 elif incremental_sync.step or incremental_sync.cursor_granularity: 3379 raise ValueError( 3380 f"Found more that one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." 
3381 ) 3382 3383 if model.decoder and model.decoder.type != "JsonDecoder": 3384 raise ValueError( 3385 f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}." 3386 ) 3387 3388 return LazySimpleRetriever( 3389 name=name, 3390 paginator=paginator, 3391 primary_key=primary_key, 3392 requester=requester, 3393 record_selector=record_selector, 3394 stream_slicer=_NO_STREAM_SLICING, 3395 request_option_provider=request_options_provider, 3396 cursor=None, 3397 config=config, 3398 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3399 parameters=model.parameters or {}, 3400 ) 3401 3402 if ( 3403 model.record_selector.record_filter 3404 and model.pagination_reset 3405 and model.pagination_reset.limits 3406 ): 3407 raise ValueError("PaginationResetLimits are not supported while having record filter.") 3408 3409 return SimpleRetriever( 3410 name=name, 3411 paginator=paginator, 3412 primary_key=primary_key, 3413 requester=requester, 3414 record_selector=record_selector, 3415 stream_slicer=_NO_STREAM_SLICING, 3416 request_option_provider=request_options_provider, 3417 cursor=None, 3418 config=config, 3419 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, 3420 additional_query_properties=query_properties, 3421 log_formatter=self._get_log_formatter(log_formatter, name), 3422 pagination_tracker_factory=self._create_pagination_tracker_factory( 3423 model.pagination_reset, cursor 3424 ), 3425 parameters=model.parameters or {}, 3426 )
3504 def create_state_delegating_stream( 3505 self, 3506 model: StateDelegatingStreamModel, 3507 config: Config, 3508 has_parent_state: Optional[bool] = None, 3509 **kwargs: Any, 3510 ) -> DeclarativeStream: 3511 if ( 3512 model.full_refresh_stream.name != model.name 3513 or model.name != model.incremental_stream.name 3514 ): 3515 raise ValueError( 3516 f"state_delegating_stream, full_refresh_stream and incremental_stream must have equal names. Instead got {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}." 3517 ) 3518 3519 stream_model = self._get_state_delegating_stream_model( 3520 False if has_parent_state is None else has_parent_state, model 3521 ) 3522 3523 return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is always DeclarativeStreamModel
3564 def create_async_retriever( 3565 self, 3566 model: AsyncRetrieverModel, 3567 config: Config, 3568 *, 3569 name: str, 3570 primary_key: Optional[ 3571 Union[str, List[str], List[List[str]]] 3572 ], # this seems to be needed to match create_simple_retriever 3573 stream_slicer: Optional[StreamSlicer], 3574 client_side_incremental_sync: Optional[Dict[str, Any]] = None, 3575 transformations: List[RecordTransformation], 3576 **kwargs: Any, 3577 ) -> AsyncRetriever: 3578 if model.download_target_requester and not model.download_target_extractor: 3579 raise ValueError( 3580 f"`download_target_extractor` required if using a `download_target_requester`" 3581 ) 3582 3583 def _get_download_retriever( 3584 requester: Requester, extractor: RecordExtractor, _decoder: Decoder 3585 ) -> SimpleRetriever: 3586 # We create a record selector for the download retriever 3587 # with no schema normalization and no transformations, neither record filter 3588 # as all this occurs in the record_selector of the AsyncRetriever 3589 record_selector = RecordSelector( 3590 extractor=extractor, 3591 name=name, 3592 record_filter=None, 3593 transformations=[], 3594 schema_normalization=TypeTransformer(TransformConfig.NoTransform), 3595 config=config, 3596 parameters={}, 3597 ) 3598 paginator = ( 3599 self._create_component_from_model( 3600 model=model.download_paginator, 3601 decoder=_decoder, 3602 config=config, 3603 url_base="", 3604 ) 3605 if model.download_paginator 3606 else NoPagination(parameters={}) 3607 ) 3608 3609 return SimpleRetriever( 3610 requester=requester, 3611 record_selector=record_selector, 3612 primary_key=None, 3613 name=name, 3614 paginator=paginator, 3615 config=config, 3616 parameters={}, 3617 log_formatter=self._get_log_formatter(None, name), 3618 ) 3619 3620 def _get_job_timeout() -> datetime.timedelta: 3621 user_defined_timeout: Optional[int] = ( 3622 int( 3623 InterpolatedString.create( 3624 str(model.polling_job_timeout), 3625 parameters={}, 3626 ).eval(config) 3627 ) 3628 if model.polling_job_timeout 3629 else None 3630 ) 3631 3632 # check for user defined timeout during the test read or 15 minutes 3633 test_read_timeout = datetime.timedelta(minutes=user_defined_timeout or 15) 3634 # default value for non-connector builder is 60 minutes. 
3635 default_sync_timeout = datetime.timedelta(minutes=user_defined_timeout or 60) 3636 3637 return ( 3638 test_read_timeout if self._emit_connector_builder_messages else default_sync_timeout 3639 ) 3640 3641 decoder = ( 3642 self._create_component_from_model(model=model.decoder, config=config) 3643 if model.decoder 3644 else JsonDecoder(parameters={}) 3645 ) 3646 record_selector = self._create_component_from_model( 3647 model=model.record_selector, 3648 config=config, 3649 decoder=decoder, 3650 name=name, 3651 transformations=transformations, 3652 client_side_incremental_sync=client_side_incremental_sync, 3653 ) 3654 3655 stream_slicer = stream_slicer or SinglePartitionRouter(parameters={}) 3656 if self._should_limit_slices_fetched(): 3657 stream_slicer = cast( 3658 StreamSlicer, 3659 StreamSlicerTestReadDecorator( 3660 wrapped_slicer=stream_slicer, 3661 maximum_number_of_slices=self._limit_slices_fetched or 5, 3662 ), 3663 ) 3664 3665 creation_requester = self._create_component_from_model( 3666 model=model.creation_requester, 3667 decoder=decoder, 3668 config=config, 3669 name=f"job creation - {name}", 3670 ) 3671 polling_requester = self._create_component_from_model( 3672 model=model.polling_requester, 3673 decoder=decoder, 3674 config=config, 3675 name=f"job polling - {name}", 3676 ) 3677 job_download_components_name = f"job download - {name}" 3678 download_decoder = ( 3679 self._create_component_from_model(model=model.download_decoder, config=config) 3680 if model.download_decoder 3681 else JsonDecoder(parameters={}) 3682 ) 3683 download_extractor = ( 3684 self._create_component_from_model( 3685 model=model.download_extractor, 3686 config=config, 3687 decoder=download_decoder, 3688 parameters=model.parameters, 3689 ) 3690 if model.download_extractor 3691 else DpathExtractor( 3692 [], 3693 config=config, 3694 decoder=download_decoder, 3695 parameters=model.parameters or {}, 3696 ) 3697 ) 3698 download_requester = self._create_component_from_model( 3699 model=model.download_requester, 3700 decoder=download_decoder, 3701 config=config, 3702 name=job_download_components_name, 3703 ) 3704 download_retriever = _get_download_retriever( 3705 download_requester, download_extractor, download_decoder 3706 ) 3707 abort_requester = ( 3708 self._create_component_from_model( 3709 model=model.abort_requester, 3710 decoder=decoder, 3711 config=config, 3712 name=f"job abort - {name}", 3713 ) 3714 if model.abort_requester 3715 else None 3716 ) 3717 delete_requester = ( 3718 self._create_component_from_model( 3719 model=model.delete_requester, 3720 decoder=decoder, 3721 config=config, 3722 name=f"job delete - {name}", 3723 ) 3724 if model.delete_requester 3725 else None 3726 ) 3727 download_target_requester = ( 3728 self._create_component_from_model( 3729 model=model.download_target_requester, 3730 decoder=decoder, 3731 config=config, 3732 name=f"job extract_url - {name}", 3733 ) 3734 if model.download_target_requester 3735 else None 3736 ) 3737 status_extractor = self._create_component_from_model( 3738 model=model.status_extractor, decoder=decoder, config=config, name=name 3739 ) 3740 download_target_extractor = ( 3741 self._create_component_from_model( 3742 model=model.download_target_extractor, 3743 decoder=decoder, 3744 config=config, 3745 name=name, 3746 ) 3747 if model.download_target_extractor 3748 else None 3749 ) 3750 3751 job_repository: AsyncJobRepository = AsyncHttpJobRepository( 3752 creation_requester=creation_requester, 3753 polling_requester=polling_requester, 3754 
download_retriever=download_retriever, 3755 download_target_requester=download_target_requester, 3756 abort_requester=abort_requester, 3757 delete_requester=delete_requester, 3758 status_extractor=status_extractor, 3759 status_mapping=self._create_async_job_status_mapping(model.status_mapping, config), 3760 download_target_extractor=download_target_extractor, 3761 job_timeout=_get_job_timeout(), 3762 ) 3763 3764 async_job_partition_router = AsyncJobPartitionRouter( 3765 job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator( 3766 job_repository, 3767 stream_slices, 3768 self._job_tracker, 3769 self._message_repository, 3770 # FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk 3771 has_bulk_parent=False, 3772 # set the `job_max_retry` to 1 for the `Connector Builder` use-case. 3773 # `None` means the default of 3 retry attempts is used under the hood. 3774 job_max_retry=1 if self._emit_connector_builder_messages else None, 3775 ), 3776 stream_slicer=stream_slicer, 3777 config=config, 3778 parameters=model.parameters or {}, 3779 ) 3780 3781 return AsyncRetriever( 3782 record_selector=record_selector, 3783 stream_slicer=async_job_partition_router, 3784 config=config, 3785 parameters=model.parameters or {}, 3786 )
3788 def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec: 3789 config_migrations = [ 3790 self._create_component_from_model(migration, config) 3791 for migration in ( 3792 model.config_normalization_rules.config_migrations 3793 if ( 3794 model.config_normalization_rules 3795 and model.config_normalization_rules.config_migrations 3796 ) 3797 else [] 3798 ) 3799 ] 3800 config_transformations = [ 3801 self._create_component_from_model(transformation, config) 3802 for transformation in ( 3803 model.config_normalization_rules.transformations 3804 if ( 3805 model.config_normalization_rules 3806 and model.config_normalization_rules.transformations 3807 ) 3808 else [] 3809 ) 3810 ] 3811 config_validations = [ 3812 self._create_component_from_model(validation, config) 3813 for validation in ( 3814 model.config_normalization_rules.validations 3815 if ( 3816 model.config_normalization_rules 3817 and model.config_normalization_rules.validations 3818 ) 3819 else [] 3820 ) 3821 ] 3822 3823 return Spec( 3824 connection_specification=model.connection_specification, 3825 documentation_url=model.documentation_url, 3826 advanced_auth=model.advanced_auth, 3827 parameters={}, 3828 config_migrations=config_migrations, 3829 config_transformations=config_transformations, 3830 config_validations=config_validations, 3831 )
3833 def create_substream_partition_router( 3834 self, 3835 model: SubstreamPartitionRouterModel, 3836 config: Config, 3837 *, 3838 stream_name: str, 3839 **kwargs: Any, 3840 ) -> SubstreamPartitionRouter: 3841 parent_stream_configs = [] 3842 if model.parent_stream_configs: 3843 parent_stream_configs.extend( 3844 [ 3845 self.create_parent_stream_config_with_substream_wrapper( 3846 model=parent_stream_config, config=config, stream_name=stream_name, **kwargs 3847 ) 3848 for parent_stream_config in model.parent_stream_configs 3849 ] 3850 ) 3851 3852 return SubstreamPartitionRouter( 3853 parent_stream_configs=parent_stream_configs, 3854 parameters=model.parameters or {}, 3855 config=config, 3856 )
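A hypothetical SubstreamPartitionRouter fragment of the shape this method consumes; each entry in parent_stream_configs is built via create_parent_stream_config_with_substream_wrapper below (the stream reference and field names are illustrative).

substream_partition_router_definition = {
    "type": "SubstreamPartitionRouter",
    "parent_stream_configs": [
        {
            "type": "ParentStreamConfig",
            "stream": {"$ref": "#/definitions/streams/projects"},  # hypothetical parent stream reference
            "parent_key": "id",  # field read from each parent record
            "partition_field": "project_id",  # key under which the value is exposed to the child stream
            "incremental_dependency": True,
        }
    ],
}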
3858 def create_parent_stream_config_with_substream_wrapper( 3859 self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any 3860 ) -> Any: 3861 # getting the parent state 3862 child_state = self._connector_state_manager.get_stream_state(stream_name, None) 3863 3864 # This flag will be used exclusively for StateDelegatingStream when a parent stream is created 3865 has_parent_state = bool( 3866 self._connector_state_manager.get_stream_state(stream_name, None) 3867 if model.incremental_dependency 3868 else False 3869 ) 3870 connector_state_manager = self._instantiate_parent_stream_state_manager( 3871 child_state, config, model, has_parent_state 3872 ) 3873 3874 substream_factory = ModelToComponentFactory( 3875 connector_state_manager=connector_state_manager, 3876 limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice, 3877 limit_slices_fetched=self._limit_slices_fetched, 3878 emit_connector_builder_messages=self._emit_connector_builder_messages, 3879 disable_retries=self._disable_retries, 3880 disable_cache=self._disable_cache, 3881 message_repository=StateFilteringMessageRepository( 3882 LogAppenderMessageRepositoryDecorator( 3883 { 3884 "airbyte_cdk": {"stream": {"is_substream": True}}, 3885 "http": {"is_auxiliary": True}, 3886 }, 3887 self._message_repository, 3888 self._evaluate_log_level(self._emit_connector_builder_messages), 3889 ), 3890 ), 3891 ) 3892 3893 return substream_factory.create_parent_stream_config( 3894 model=model, config=config, stream_name=stream_name, **kwargs 3895 )
    @staticmethod
    def create_wait_time_from_header(
        model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any
    ) -> WaitTimeFromHeaderBackoffStrategy:
        return WaitTimeFromHeaderBackoffStrategy(
            header=model.header,
            parameters=model.parameters or {},
            config=config,
            regex=model.regex,
            max_waiting_time_in_seconds=model.max_waiting_time_in_seconds,
        )
    @staticmethod
    def create_wait_until_time_from_header(
        model: WaitUntilTimeFromHeaderModel, config: Config, **kwargs: Any
    ) -> WaitUntilTimeFromHeaderBackoffStrategy:
        return WaitUntilTimeFromHeaderBackoffStrategy(
            header=model.header,
            parameters=model.parameters or {},
            config=config,
            min_wait=model.min_wait,
            regex=model.regex,
        )
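    # Illustrative sketch (not part of this module): possible parsed-manifest shapes for the
    # two header-driven backoff strategies above. The header names, regex, and durations are
    # placeholders; the keys mirror the model attributes read by the two factories.
    #
    #   wait_time_definition = {
    #       "type": "WaitTimeFromHeader",
    #       "header": "Retry-After",            # hypothetical header name
    #       "max_waiting_time_in_seconds": 3600,
    #   }
    #   wait_until_definition = {
    #       "type": "WaitUntilTimeFromHeader",
    #       "header": "X-RateLimit-Reset",      # hypothetical header name
    #       "min_wait": 10,
    #       "regex": "(\\d+)",
    #   }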
    @staticmethod
    def create_components_mapping_definition(
        model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any
    ) -> ComponentMappingDefinition:
        interpolated_value = InterpolatedString.create(
            model.value, parameters=model.parameters or {}
        )
        field_path = [
            InterpolatedString.create(path, parameters=model.parameters or {})
            for path in model.field_path
        ]
        return ComponentMappingDefinition(
            field_path=field_path,  # type: ignore[arg-type] # field_path can be str and InterpolatedString
            value=interpolated_value,
            value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
            create_or_update=model.create_or_update,
            condition=model.condition,
            parameters=model.parameters or {},
        )
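    # Illustrative sketch (not part of this module): a possible parsed-manifest shape for a
    # components mapping entry. The field path and interpolated value are placeholders; the
    # keys mirror the ComponentMappingDefinitionModel attributes read above.
    #
    #   mapping_definition = {
    #       "type": "ComponentMappingDefinition",
    #       "field_path": ["name"],
    #       "value": "{{ components_values['name'] }}",  # hypothetical interpolation
    #       "value_type": "string",
    #   }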
    def create_http_components_resolver(
        self, model: HttpComponentsResolverModel, config: Config, stream_name: Optional[str] = None
    ) -> Any:
        retriever = self._create_component_from_model(
            model=model.retriever,
            config=config,
            name=f"{stream_name if stream_name else '__http_components_resolver'}",
            primary_key=None,
            stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config),
            transformations=[],
        )

        components_mapping = []
        for component_mapping_definition_model in model.components_mapping:
            if component_mapping_definition_model.condition:
                raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
            components_mapping.append(
                self._create_component_from_model(
                    model=component_mapping_definition_model,
                    value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                        component_mapping_definition_model.value_type
                    ),
                    config=config,
                )
            )

        return HttpComponentsResolver(
            retriever=retriever,
            stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config),
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )
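    # Illustrative sketch (not part of this module): a possible parsed-manifest shape for an
    # HTTP components resolver. The retriever definition is elided and the mapping value is a
    # placeholder; note that `condition` must be omitted on each mapping entry, as enforced above.
    #
    #   http_resolver_definition = {
    #       "type": "HttpComponentsResolver",
    #       "retriever": {...},  # a retriever definition (e.g. SimpleRetriever), elided
    #       "components_mapping": [
    #           {
    #               "type": "ComponentMappingDefinition",
    #               "field_path": ["name"],
    #               "value": "{{ components_values['name'] }}",  # hypothetical interpolation
    #           }
    #       ],
    #   }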
    @staticmethod
    def create_stream_config(
        model: StreamConfigModel, config: Config, **kwargs: Any
    ) -> StreamConfig:
        model_configs_pointer: List[Union[InterpolatedString, str]] = (
            [x for x in model.configs_pointer] if model.configs_pointer else []
        )

        return StreamConfig(
            configs_pointer=model_configs_pointer,
            default_values=model.default_values,
            parameters=model.parameters or {},
        )
    def create_config_components_resolver(
        self,
        model: ConfigComponentsResolverModel,
        config: Config,
    ) -> Any:
        model_stream_configs = (
            model.stream_config if isinstance(model.stream_config, list) else [model.stream_config]
        )

        stream_configs = [
            self._create_component_from_model(
                stream_config, config=config, parameters=model.parameters or {}
            )
            for stream_config in model_stream_configs
        ]

        components_mapping = [
            self._create_component_from_model(
                model=components_mapping_definition_model,
                value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                    components_mapping_definition_model.value_type
                ),
                config=config,
                parameters=model.parameters,
            )
            for components_mapping_definition_model in model.components_mapping
        ]

        return ConfigComponentsResolver(
            stream_configs=stream_configs,
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )
    def create_parametrized_components_resolver(
        self,
        model: ParametrizedComponentsResolverModel,
        config: Config,
    ) -> ParametrizedComponentsResolver:
        stream_parameters = StreamParametersDefinition(
            list_of_parameters_for_stream=model.stream_parameters.list_of_parameters_for_stream
        )

        components_mapping = []
        for components_mapping_definition_model in model.components_mapping:
            if components_mapping_definition_model.condition:
                raise ValueError("`condition` is only supported for `ConfigComponentsResolver`")
            components_mapping.append(
                self._create_component_from_model(
                    model=components_mapping_definition_model,
                    value_type=ModelToComponentFactory._json_schema_type_name_to_type(
                        components_mapping_definition_model.value_type
                    ),
                    config=config,
                )
            )
        return ParametrizedComponentsResolver(
            stream_parameters=stream_parameters,
            config=config,
            components_mapping=components_mapping,
            parameters=model.parameters or {},
        )
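    # Illustrative sketch (not part of this module): a possible parsed-manifest shape for a
    # config-driven components resolver, built from the StreamConfig and resolver factories
    # above. The pointer and mapping value are placeholders. A ParametrizedComponentsResolver
    # is analogous, but supplies `stream_parameters.list_of_parameters_for_stream` instead of
    # `stream_config`.
    #
    #   config_resolver_definition = {
    #       "type": "ConfigComponentsResolver",
    #       "stream_config": {
    #           "type": "StreamConfig",
    #           "configs_pointer": ["custom_streams"],  # hypothetical config key
    #           "default_values": [],
    #       },
    #       "components_mapping": [
    #           {
    #               "type": "ComponentMappingDefinition",
    #               "field_path": ["name"],
    #               "value": "{{ components_values['name'] }}",  # hypothetical interpolation
    #           }
    #       ],
    #   }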
    def create_http_api_budget(
        self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any
    ) -> HttpAPIBudget:
        policies = [
            self._create_component_from_model(model=policy, config=config)
            for policy in model.policies
        ]

        return HttpAPIBudget(
            policies=policies,
            ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset",
            ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining",
            status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or [429],
        )
    def create_fixed_window_call_rate_policy(
        self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> FixedWindowCallRatePolicy:
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]

        # Set the initial reset timestamp to 10 days from now.
        # This value will be updated by the first request.
        return FixedWindowCallRatePolicy(
            next_reset_ts=datetime.datetime.now() + datetime.timedelta(days=10),
            period=parse_duration(model.period),
            call_limit=model.call_limit,
            matchers=matchers,
        )
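    # Illustrative sketch (not part of this module): a possible parsed-manifest shape for an
    # API budget with a fixed-window policy. The period, limit, and path pattern are
    # placeholders; the keys mirror the attributes read by the two factories above, the
    # `period` is an ISO 8601 duration parsed with isodate, and the matcher shape follows
    # HttpRequestRegexMatcher further below.
    #
    #   api_budget_definition = {
    #       "type": "HTTPAPIBudget",
    #       "ratelimit_reset_header": "ratelimit-reset",
    #       "status_codes_for_ratelimit_hit": [429],
    #       "policies": [
    #           {
    #               "type": "FixedWindowCallRatePolicy",
    #               "period": "PT1H",
    #               "call_limit": 1000,
    #               "matchers": [
    #                   {"type": "HttpRequestRegexMatcher", "url_path_pattern": "/items"}
    #               ],
    #           }
    #       ],
    #   }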
    def create_file_uploader(
        self, model: FileUploaderModel, config: Config, **kwargs: Any
    ) -> FileUploader:
        name = "File Uploader"
        requester = self._create_component_from_model(
            model=model.requester,
            config=config,
            name=name,
            **kwargs,
        )
        download_target_extractor = self._create_component_from_model(
            model=model.download_target_extractor,
            config=config,
            name=name,
            **kwargs,
        )
        emit_connector_builder_messages = self._emit_connector_builder_messages
        file_uploader = DefaultFileUploader(
            requester=requester,
            download_target_extractor=download_target_extractor,
            config=config,
            file_writer=NoopFileWriter()
            if emit_connector_builder_messages
            else LocalFileSystemFileWriter(),
            parameters=model.parameters or {},
            filename_extractor=model.filename_extractor if model.filename_extractor else None,
        )

        return (
            ConnectorBuilderFileUploader(file_uploader)
            if emit_connector_builder_messages
            else file_uploader
        )
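    # Illustrative sketch (not part of this module): a possible parsed-manifest shape for a
    # file uploader. The requester and extractor definitions are elided placeholders, and the
    # filename_extractor interpolation is hypothetical; the keys mirror the FileUploaderModel
    # attributes read above.
    #
    #   file_uploader_definition = {
    #       "type": "FileUploader",
    #       "requester": {...},                   # an HTTP requester definition, elided
    #       "download_target_extractor": {...},   # a record extractor definition, elided
    #       "filename_extractor": "{{ record.id }}",  # hypothetical interpolation
    #   }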
    def create_moving_window_call_rate_policy(
        self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> MovingWindowCallRatePolicy:
        rates = [
            self._create_component_from_model(model=rate, config=config) for rate in model.rates
        ]
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]
        return MovingWindowCallRatePolicy(
            rates=rates,
            matchers=matchers,
        )
    def create_unlimited_call_rate_policy(
        self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any
    ) -> UnlimitedCallRatePolicy:
        matchers = [
            self._create_component_from_model(model=matcher, config=config)
            for matcher in model.matchers
        ]

        return UnlimitedCallRatePolicy(
            matchers=matchers,
        )
    def create_http_request_matcher(
        self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any
    ) -> HttpRequestRegexMatcher:
        return HttpRequestRegexMatcher(
            method=model.method,
            url_base=model.url_base,
            url_path_pattern=model.url_path_pattern,
            params=model.params,
            headers=model.headers,
        )
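    # Illustrative sketch (not part of this module): a possible parsed-manifest shape for a
    # moving-window policy with a request matcher. The rate values and URL pattern are
    # placeholders, and the nested `Rate` shape (`limit`, `interval`) is an assumption since
    # that model is not constructed in this excerpt; the other keys mirror the attributes read
    # by the factories above.
    #
    #   moving_window_policy_definition = {
    #       "type": "MovingWindowCallRatePolicy",
    #       "rates": [{"type": "Rate", "limit": 100, "interval": "PT1M"}],
    #       "matchers": [
    #           {
    #               "type": "HttpRequestRegexMatcher",
    #               "method": "GET",
    #               "url_path_pattern": "/v1/items",  # hypothetical path pattern
    #           }
    #       ],
    #   }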
    def create_grouping_partition_router(
        self,
        model: GroupingPartitionRouterModel,
        config: Config,
        *,
        stream_name: str,
        **kwargs: Any,
    ) -> GroupingPartitionRouter:
        underlying_router = self._create_component_from_model(
            model=model.underlying_partition_router,
            config=config,
            stream_name=stream_name,
            **kwargs,
        )
        if model.group_size < 1:
            raise ValueError(f"Group size must be greater than 0, got {model.group_size}")

        # Request options in underlying partition routers are not supported for GroupingPartitionRouter
        # because they are specific to individual partitions and cannot be aggregated or handled
        # when grouping, potentially leading to incorrect API calls. Any request customization
        # should be managed at the stream level through the requester's configuration.
        if isinstance(underlying_router, SubstreamPartitionRouter):
            if any(
                parent_config.request_option
                for parent_config in underlying_router.parent_stream_configs
            ):
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        if isinstance(underlying_router, ListPartitionRouter):
            if underlying_router.request_option:
                raise ValueError("Request options are not supported for GroupingPartitionRouter.")

        return GroupingPartitionRouter(
            group_size=model.group_size,
            underlying_partition_router=underlying_router,
            deduplicate=model.deduplicate if model.deduplicate is not None else True,
            config=config,
        )
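    # Illustrative sketch (not part of this module): a possible parsed-manifest shape for a
    # grouping router. The group size is a placeholder and the underlying router definition is
    # elided; note that neither an underlying ListPartitionRouter nor any ParentStreamConfig
    # may declare a request_option, as enforced above.
    #
    #   grouping_router_definition = {
    #       "type": "GroupingPartitionRouter",
    #       "group_size": 10,
    #       "deduplicate": True,
    #       "underlying_partition_router": {...},  # e.g. a ListPartitionRouter, elided
    #   }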